llm-quant/tests/test_rss_item_industry_keywords.py
2025-10-05 14:48:41 +08:00

84 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Test industry and keyword extraction in RSS processing."""
from datetime import datetime, timezone
from app.ingest.rss import RssItem
def test_industry_extraction():
"""Test industry keyword extraction."""
# 创建测试新闻并跳过数据库初始化
class TestRssItem(RssItem):
_skip_db_init = True
item = TestRssItem(
id="test_news",
title="某半导体公司推出新一代芯片",
link="http://example.com",
published=datetime.now(timezone.utc),
summary="该公司在集成电路领域取得重大突破新产品将用于5G通信",
source="test"
)
# 提取行业关键词
item.extract_industries()
# 验证结果
assert "半导体" in item.industries
assert len(item.industries) >= 1
def test_important_keyword_extraction():
"""Test important keyword extraction."""
# 创建测试新闻(包含积极、消极和事件关键词)并跳过数据库初始化
class TestRssItem(RssItem):
_skip_db_init = True
item = TestRssItem(
id="test_news",
title="某公司业绩超预期,同时宣布重大收购计划",
link="http://example.com",
published=datetime.now(timezone.utc),
summary="营收增长显著,但部分业务亏损,将通过并购扩张",
source="test"
)
# 提取重要关键词
item.extract_important_keywords()
# 验证结果:应该包含积极、消极和事件关键词
keywords = set(item.important_keywords)
# 检查是否包含至少一个积极关键词(前缀为+
assert any(k.startswith('+') for k in keywords)
# 检查是否包含至少一个消极关键词(前缀为-
assert any(k.startswith('-') for k in keywords)
# 检查是否包含至少一个事件关键词(前缀为#
assert any(k.startswith('#') for k in keywords)
def test_rss_item_full_extraction():
"""Test full entity, industry and keyword extraction."""
# 创建一个包含多种信息的测试新闻并跳过数据库初始化
class TestRssItem(RssItem):
_skip_db_init = True
item = TestRssItem(
id="test_news",
title="半导体行业利好:某公司重大突破",
link="http://example.com",
published=datetime.now(timezone.utc),
summary="集成电路领域取得重大进展,业绩超预期,宣布增持计划",
source="test"
)
# 提取所有信息
item.extract_entities() # 这将自动调用行业和关键词提取
# 验证结果的完整性
assert item.industries # 应该至少识别出半导体行业
assert item.important_keywords # 应该找到关键词
# 验证关键词类型的完整性
keywords = set(item.important_keywords)
assert any(k.startswith('+') for k in keywords) # 积极关键词
assert any(k.startswith('#') for k in keywords) # 事件关键词