"""GDELT Doc API ingestion utilities built on top of gdeltdoc.""" from __future__ import annotations import hashlib import sqlite3 from dataclasses import dataclass, field, replace from datetime import date, datetime, timedelta, timezone from typing import Dict, Iterable, List, Optional, Sequence, Union try: # pragma: no cover - optional dependency from gdeltdoc import GdeltDoc, Filters # type: ignore[import-not-found] except ImportError: # pragma: no cover - optional dependency GdeltDoc = None # type: ignore[assignment] Filters = None # type: ignore[assignment] from app.utils.config import get_config from app.utils.db import db_session from app.utils.logging import get_logger from . import rss as rss_ingest LOGGER = get_logger(__name__) LOG_EXTRA = {"stage": "gdelt_ingest"} DateLike = Union[date, datetime] @dataclass class GdeltSourceConfig: """Configuration describing a single GDELT filter set.""" key: str label: str filters: Dict[str, object] = field(default_factory=dict) ts_codes: Sequence[str] = field(default_factory=tuple) keywords: Sequence[str] = field(default_factory=tuple) num_records: int = 50 def resolve_gdelt_sources() -> List[GdeltSourceConfig]: """Resolve configured GDELT filter groups.""" cfg = get_config() raw = getattr(cfg, "gdelt_sources", None) or {} sources: List[GdeltSourceConfig] = [] if isinstance(raw, dict): for key, data in raw.items(): if not isinstance(data, dict): continue if not data.get("enabled", True): continue label = str(data.get("label") or key) filters = data.get("filters") if isinstance(data.get("filters"), dict) else {} ts_codes = [ str(code).strip().upper() for code in data.get("ts_codes", []) if isinstance(code, str) and code.strip() ] keywords = [ str(token).strip() for token in data.get("keywords", []) if isinstance(token, str) and token.strip() ] num_records = data.get("num_records") if not isinstance(num_records, int) or num_records <= 0: num_records = 50 sources.append( GdeltSourceConfig( key=str(key), label=label, filters=dict(filters), ts_codes=tuple(ts_codes), keywords=tuple(keywords), num_records=num_records, ) ) return sources def _ensure_datetime(value: DateLike, *, start_of_day: bool = True) -> datetime: if isinstance(value, datetime): return _normalize_timestamp(value) if start_of_day: return datetime.combine(value, datetime.min.time()) return datetime.combine(value, datetime.max.time()) def _normalize_timestamp(value: datetime) -> datetime: if value.tzinfo is not None: return value.astimezone(timezone.utc).replace(tzinfo=None) return value def _load_last_published(source_key: str) -> Optional[datetime]: try: with db_session(read_only=True) as conn: row = conn.execute( "SELECT last_published FROM ingest_state WHERE source = ?", (source_key,), ).fetchone() except sqlite3.OperationalError: return None if not row: return None raw = row["last_published"] if not raw: return None try: return _normalize_timestamp(datetime.fromisoformat(raw)) except ValueError: LOGGER.debug("无法解析 GDELT 状态时间 source=%s value=%s", source_key, raw, extra=LOG_EXTRA) return None def _save_last_published(source_key: str, published: datetime) -> None: timestamp = _normalize_timestamp(published).isoformat() try: with db_session() as conn: conn.execute( """ INSERT INTO ingest_state (source, last_published) VALUES (?, ?) 


def _parse_gdelt_datetime(raw: object) -> datetime:
    if isinstance(raw, datetime):
        return _normalize_timestamp(raw)
    if raw is None:
        return _normalize_timestamp(datetime.utcnow())
    text = str(raw).strip()
    if not text:
        return _normalize_timestamp(datetime.utcnow())
    # Common GDELT formats: YYYYMMDDHHMMSS or ISO8601
    try:
        if text.isdigit() and len(text) == 14:
            return _normalize_timestamp(datetime.strptime(text, "%Y%m%d%H%M%S"))
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        return _normalize_timestamp(datetime.fromisoformat(text))
    except ValueError:
        pass
    try:
        return _normalize_timestamp(datetime.strptime(text, "%Y-%m-%d %H:%M:%S"))
    except ValueError:
        LOGGER.debug("Unable to parse GDELT date: %s", text, extra=LOG_EXTRA)
        return _normalize_timestamp(datetime.utcnow())


def _build_rss_item(record: Dict[str, object], config: GdeltSourceConfig) -> Optional[rss_ingest.RssItem]:
    url = record.get("url") or record.get("url_mobile")
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()
    title = record.get("title") or record.get("seendate")
    if not isinstance(title, str) or not title.strip():
        title = url
    title = title.strip()
    published_raw = (
        record.get("seendate")
        or record.get("publishDate")
        or record.get("date")
        or record.get("firstseendate")
    )
    published = _parse_gdelt_datetime(published_raw)
    summary_candidates: Iterable[object] = (
        record.get("summary"),
        record.get("snippet"),
        record.get("excerpt"),
        record.get("altText"),
        record.get("domain"),
    )
    summary = ""
    for candidate in summary_candidates:
        if isinstance(candidate, str) and candidate.strip():
            summary = candidate.strip()
            break
    if not summary:
        # Fall back to "country / language" metadata, then to the title.
        source_country = record.get("sourcecountry")
        language = record.get("language")
        details = [
            str(value).strip()
            for value in (source_country, language)
            if isinstance(value, str) and value.strip()
        ]
        summary = " / ".join(details) if details else title
    source = record.get("sourcecommonname") or record.get("domain")
    if not isinstance(source, str) or not source.strip():
        source = config.label or "GDELT"
    source = source.strip()
    # Deterministic id: hash the URL plus the normalized publish timestamp.
    fingerprint = f"{url}|{published.isoformat()}"
    article_id = hashlib.blake2s(fingerprint.encode("utf-8"), digest_size=16).hexdigest()
    return rss_ingest.RssItem(
        id=article_id,
        title=title,
        link=url,
        published=published,
        summary=summary,
        source=source,
        metadata={
            "source_key": config.key,
            "source_label": config.label,
        },
    )
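

# For reference, a single row from gdeltdoc's ``article_search`` DataFrame
# typically carries the fields read above. Illustrative values only; the exact
# columns depend on the gdeltdoc/API version:
#
#   {
#       "url": "https://example.com/markets/story",
#       "url_mobile": "",
#       "title": "Example headline",
#       "seendate": "20240105143000",
#       "domain": "example.com",
#       "language": "English",
#       "sourcecountry": "United States",
#   }
#
# With these inputs the fingerprint is "<url>|2024-01-05T14:30:00" and the
# article id is its 16-byte blake2s digest rendered as 32 hex characters.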


def fetch_gdelt_articles(
    config: GdeltSourceConfig,
    *,
    start: Optional[datetime] = None,
    end: Optional[datetime] = None,
) -> List[rss_ingest.RssItem]:
    """Fetch article list from GDELT based on the supplied configuration."""

    if GdeltDoc is None or Filters is None:
        LOGGER.warning("gdeltdoc is not installed; skipping GDELT fetch", extra=LOG_EXTRA)
        return []
    filters_kwargs = dict(config.filters)
    filters_kwargs.setdefault("num_records", config.num_records)
    # Explicit date bounds take precedence over any configured relative timespan.
    if start:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["start_date"] = start
    if end:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["end_date"] = end
    try:
        filter_obj = Filters(**filters_kwargs)
    except Exception as exc:  # noqa: BLE001 - guard misconfigured filters
        LOGGER.error("Failed to build GDELT filters key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []
    client = GdeltDoc()
    try:
        df = client.article_search(filter_obj)
    except Exception as exc:  # noqa: BLE001 - network/service issues
        LOGGER.warning("GDELT request failed key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []
    if df is None or df.empty:
        LOGGER.info("GDELT returned no matches key=%s", config.key, extra=LOG_EXTRA)
        return []
    items: List[rss_ingest.RssItem] = []
    for record in df.to_dict(orient="records"):
        item = _build_rss_item(record, config)
        if not item:
            continue
        assigned_codes = rss_ingest._assign_ts_codes(item, config.ts_codes, config.keywords)  # type: ignore[attr-defined]
        items.append(replace(item, ts_codes=tuple(assigned_codes)))
    return items


def ingest_configured_gdelt(
    start: Optional[DateLike] = None,
    end: Optional[DateLike] = None,
    *,
    incremental: bool = True,
) -> int:
    """Ingest all configured GDELT sources into the news store."""

    sources = resolve_gdelt_sources()
    if not sources:
        LOGGER.info("No GDELT sources configured; skipping news fetch", extra=LOG_EXTRA)
        return 0
    start_dt = _ensure_datetime(start) if start else None
    end_dt = _ensure_datetime(end, start_of_day=False) if end else None
    aggregated: List[rss_ingest.RssItem] = []
    latest_by_source: Dict[str, datetime] = {}
    fetched = 0
    for config in sources:
        source_start = start_dt
        if incremental:
            # Resume one second past the newest stored article to avoid refetching it.
            last_seen = _load_last_published(config.key)
            if last_seen:
                candidate = last_seen + timedelta(seconds=1)
                if source_start is None or candidate > source_start:
                    source_start = candidate
        LOGGER.info(
            "Fetching GDELT source %s start=%s end=%s incremental=%s",
            config.label,
            source_start.isoformat() if source_start else None,
            end_dt.isoformat() if end_dt else None,
            incremental,
            extra=LOG_EXTRA,
        )
        items = fetch_gdelt_articles(config, start=source_start, end=end_dt)
        if not items:
            continue
        aggregated.extend(items)
        fetched += len(items)
        LOGGER.info("GDELT source %s returned %s records", config.label, len(items), extra=LOG_EXTRA)
    if not aggregated:
        return 0
    deduped = rss_ingest.deduplicate_items(aggregated)
    if not deduped:
        LOGGER.info("All GDELT items were duplicates; skipping persistence", extra=LOG_EXTRA)
        return 0
    inserted = rss_ingest.save_news_items(deduped)
    if inserted:
        # Record the newest published timestamp per source so the next
        # incremental run can resume from it.
        latest_by_source.clear()
        for item in deduped:
            source_key = str(item.metadata.get("source_key", "") if item.metadata else "")
            if not source_key:
                continue
            current = latest_by_source.get(source_key)
            candidate = item.published
            if current is None or candidate > current:
                latest_by_source[source_key] = candidate
        for source_key, timestamp in latest_by_source.items():
            _save_last_published(source_key, timestamp)
    LOGGER.info(
        "GDELT news ingest finished fetched=%s deduped=%s inserted=%s",
        fetched,
        len(deduped),
        inserted,
        extra=LOG_EXTRA,
    )
    return inserted


__all__ = [
    "GdeltSourceConfig",
    "resolve_gdelt_sources",
    "fetch_gdelt_articles",
    "ingest_configured_gdelt",
]
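

# Typical wiring, as a sketch. The config layout below is an assumption
# inferred from ``resolve_gdelt_sources`` (keys: enabled, label, filters,
# ts_codes, keywords, num_records); the concrete values are illustrative:
#
#   cfg.gdelt_sources = {
#       "ashare_policy": {
#           "enabled": True,
#           "label": "A-share policy news",
#           "filters": {"keyword": "china stock market", "timespan": "24h"},
#           "ts_codes": ["600000.SH"],
#           "keywords": ["policy"],
#           "num_records": 75,
#       },
#   }
#
# With that in place, a one-off backfill followed by incremental runs might be:
#
#   from datetime import date
#   inserted = ingest_configured_gdelt(start=date(2024, 1, 1), incremental=False)
#   inserted = ingest_configured_gdelt()  # later: resume from ingest_state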