# llm-quant/app/ingest/news.py
#
# 120 lines
# 3.1 KiB
# Python

"""Unified news ingestion orchestration with GDELT as the primary source."""
from __future__ import annotations

from datetime import date, datetime, timedelta, timezone
from typing import Set, Tuple

from app.data.schema import initialize_database
from app.utils.logging import get_logger

from .gdelt import ingest_configured_gdelt

# Module-level logger obtained from the project's logging helper.
LOGGER = get_logger(__name__)
# Passed as ``extra=`` on every log call in this module so records are
# tagged with the ingestion stage.
LOG_EXTRA = {"stage": "news_ingest"}
# In-memory record of windows already handled by prepare_news_for_factors():
# each entry is (trade date formatted as YYYYMMDD, effective lookback days).
# It lives only as long as the module stays loaded.
_PREPARED_WINDOWS: Set[Tuple[str, int]] = set()
def _normalize_date(value: date | datetime) -> datetime:
    """Coerce a ``date`` or ``datetime`` into a ``datetime``.

    A ``datetime`` argument is handed back untouched (same object, keeping
    whatever time of day and tzinfo it carries).  A plain ``date`` becomes a
    naive ``datetime`` at midnight of that day.
    """
    if not isinstance(value, datetime):
        midnight = datetime.min.time()
        return datetime.combine(value, midnight)
    return value
def ingest_latest_news(
    *,
    days_back: int = 1,
    force: bool = False,
) -> int:
    """Fetch the most recent news via GDELT over a whole-day window.

    The window ends at the last microsecond of the current UTC day and
    starts at midnight ``days_back - 1`` days earlier, so ``days_back=1``
    covers today only.

    Args:
        days_back: Number of calendar days to cover, counting today.
            Values below 1 are treated as 1.
        force: When true, the GDELT ingest is invoked with
            ``incremental=False``; otherwise with ``incremental=True``.

    Returns:
        Whatever ``ingest_configured_gdelt`` returns for the window
        (presumably the number of inserted rows -- confirm against that
        helper).
    """
    initialize_database()

    # datetime.utcnow() is deprecated since Python 3.12; asking for an aware
    # "now" in UTC and taking its date yields the same calendar day.
    today = datetime.now(timezone.utc).date()
    days = max(days_back, 1)
    start_day = today - timedelta(days=days - 1)

    # Naive datetimes spanning the full first and last day of the window.
    start_dt = datetime.combine(start_day, datetime.min.time())
    end_dt = datetime.combine(today, datetime.max.time())

    LOGGER.info(
        "触发 GDELT 新闻拉取 days=%s force=%s",
        days,
        force,
        extra=LOG_EXTRA,
    )
    inserted = ingest_configured_gdelt(
        start=start_dt,
        end=end_dt,
        incremental=not force,
    )
    LOGGER.info("新闻拉取完成 inserted=%s", inserted, extra=LOG_EXTRA)
    return inserted
def ensure_news_range(
    start: date | datetime,
    end: date | datetime,
    *,
    force: bool = False,
) -> int:
    """Ensure the news store covers the requested window.

    Both bounds are reduced to calendar days: the window passed to the
    GDELT ingest runs from midnight of the earlier day to the last
    microsecond of the later day.  The bounds may be given in either order.

    Args:
        start: One edge of the window (date or datetime).
        end: The other edge of the window (date or datetime).
        force: When true, the GDELT ingest is invoked with
            ``incremental=False``; otherwise with ``incremental=True``.

    Returns:
        Whatever ``ingest_configured_gdelt`` returns for the window
        (presumably the number of inserted rows -- confirm against that
        helper).
    """
    initialize_database()

    # Only the calendar days survive the truncation below, so order the
    # bounds by date.  Comparing the full datetimes instead would raise
    # TypeError when one bound is timezone-aware and the other naive, and
    # could swap two aware bounds into an inverted day window.
    first_day, last_day = sorted(
        (_normalize_date(start).date(), _normalize_date(end).date())
    )
    start_dt = datetime.combine(first_day, datetime.min.time())
    end_dt = datetime.combine(last_day, datetime.max.time())

    LOGGER.info(
        "同步 GDELT 新闻数据 start=%s end=%s force=%s",
        start_dt.isoformat(),
        end_dt.isoformat(),
        force,
        extra=LOG_EXTRA,
    )
    inserted = ingest_configured_gdelt(
        start=start_dt,
        end=end_dt,
        incremental=not force,
    )
    LOGGER.info(
        "新闻窗口同步完成 inserted=%s start=%s end=%s",
        inserted,
        start_dt.isoformat(),
        end_dt.isoformat(),
        extra=LOG_EXTRA,
    )
    return inserted
def prepare_news_for_factors(
    trade_date: date,
    *,
    lookback_days: int = 3,
    force: bool = False,
) -> int:
    """Make sure news is loaded before sentiment factors are computed.

    The window ends on ``trade_date`` and reaches back so that it spans
    ``lookback_days`` calendar days in total (at least one).  Windows that
    were already prepared without ``force`` are remembered in
    ``_PREPARED_WINDOWS`` and skipped on later non-forced calls.

    Args:
        trade_date: Last day of the window.
        lookback_days: Number of days the window should span; values
            below 1 behave like 1.
        force: Bypass the prepared-window check and forward ``force=True``
            to ``ensure_news_range``.  A forced run is not recorded as
            prepared.

    Returns:
        ``0`` when the window was skipped, otherwise the value returned by
        ``ensure_news_range``.
    """
    window_days = max(lookback_days, 1)
    cache_key = (trade_date.strftime("%Y%m%d"), window_days)

    if not force and cache_key in _PREPARED_WINDOWS:
        day_label, span = cache_key
        LOGGER.debug(
            "新闻窗口已准备完成 trade_date=%s lookback=%s",
            day_label,
            span,
            extra=LOG_EXTRA,
        )
        return 0

    # A span of N days means stepping back N - 1 days from the trade date.
    window_start = trade_date - timedelta(days=window_days - 1)
    inserted = ensure_news_range(window_start, trade_date, force=force)

    if force:
        return inserted
    _PREPARED_WINDOWS.add(cache_key)
    return inserted
# Public API of this module; _normalize_date and the module-level state
# above are deliberately left out.
__all__ = [
    "ensure_news_range",
    "ingest_latest_news",
    "prepare_news_for_factors",
]