llm-quant/app/ingest/gdelt.py

"""GDELT Doc API ingestion utilities built on top of gdeltdoc."""
from __future__ import annotations
import hashlib
import sqlite3
from dataclasses import dataclass, field, replace
from datetime import date, datetime, timedelta, timezone
from typing import Dict, Iterable, List, Optional, Sequence, Union
try:  # pragma: no cover - optional dependency
    from gdeltdoc import GdeltDoc, Filters  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover - optional dependency
    GdeltDoc = None  # type: ignore[assignment]
    Filters = None  # type: ignore[assignment]
from app.utils.config import get_config
from app.utils.db import db_session
from app.utils.logging import get_logger
from . import rss as rss_ingest
LOGGER = get_logger(__name__)
LOG_EXTRA = {"stage": "gdelt_ingest"}
DateLike = Union[date, datetime]


@dataclass
class GdeltSourceConfig:
    """Configuration describing a single GDELT filter set."""

    key: str
    label: str
    filters: Dict[str, object] = field(default_factory=dict)
    ts_codes: Sequence[str] = field(default_factory=tuple)
    keywords: Sequence[str] = field(default_factory=tuple)
    num_records: int = 50


def resolve_gdelt_sources() -> List[GdeltSourceConfig]:
    """Resolve configured GDELT filter groups."""
    cfg = get_config()
    raw = getattr(cfg, "gdelt_sources", None) or {}
    sources: List[GdeltSourceConfig] = []
    if isinstance(raw, dict):
        for key, data in raw.items():
            if not isinstance(data, dict):
                continue
            if not data.get("enabled", True):
                continue
            label = str(data.get("label") or key)
            filters = data.get("filters") if isinstance(data.get("filters"), dict) else {}
            ts_codes = [
                str(code).strip().upper()
                for code in data.get("ts_codes", [])
                if isinstance(code, str) and code.strip()
            ]
            keywords = [
                str(token).strip()
                for token in data.get("keywords", [])
                if isinstance(token, str) and token.strip()
            ]
            num_records = data.get("num_records")
            if not isinstance(num_records, int) or num_records <= 0:
                num_records = 50
            sources.append(
                GdeltSourceConfig(
                    key=str(key),
                    label=label,
                    filters=dict(filters),
                    ts_codes=tuple(ts_codes),
                    keywords=tuple(keywords),
                    num_records=num_records,
                )
            )
    return sources
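
# Illustrative shape of the `gdelt_sources` config block consumed above. The
# key names mirror the lookups in resolve_gdelt_sources(), but the concrete
# values (and the YAML framing) are hypothetical:
#
#   gdelt_sources:
#     cn_semis:
#       enabled: true
#       label: "China semiconductors"
#       filters: {keyword: "semiconductor", timespan: "24h"}
#       ts_codes: ["000001.SZ"]
#       keywords: ["chip", "foundry"]
#       num_records: 50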


def _ensure_datetime(value: DateLike, *, start_of_day: bool = True) -> datetime:
    if isinstance(value, datetime):
        return _normalize_timestamp(value)
    if start_of_day:
        return datetime.combine(value, datetime.min.time())
    return datetime.combine(value, datetime.max.time())


def _normalize_timestamp(value: datetime) -> datetime:
    if value.tzinfo is not None:
        return value.astimezone(timezone.utc).replace(tzinfo=None)
    return value


def _load_last_published(source_key: str) -> Optional[datetime]:
    try:
        with db_session(read_only=True) as conn:
            row = conn.execute(
                "SELECT last_published FROM ingest_state WHERE source = ?",
                (source_key,),
            ).fetchone()
    except sqlite3.OperationalError:
        return None
    if not row:
        return None
    raw = row["last_published"]
    if not raw:
        return None
    try:
        return _normalize_timestamp(datetime.fromisoformat(raw))
    except ValueError:
        LOGGER.debug("Failed to parse GDELT state timestamp source=%s value=%s", source_key, raw, extra=LOG_EXTRA)
        return None


def _save_last_published(source_key: str, published: datetime) -> None:
    timestamp = _normalize_timestamp(published).isoformat()
    try:
        with db_session() as conn:
            conn.execute(
                """
                INSERT INTO ingest_state (source, last_published)
                VALUES (?, ?)
                ON CONFLICT(source) DO UPDATE SET last_published = excluded.last_published
                """,
                (source_key, timestamp),
            )
    except sqlite3.OperationalError:
        LOGGER.debug("Failed to write ingest_state; the table may not exist yet", extra=LOG_EXTRA)


def _parse_gdelt_datetime(raw: object) -> datetime:
    if isinstance(raw, datetime):
        return _normalize_timestamp(raw)
    if raw is None:
        return _normalize_timestamp(datetime.utcnow())
    text = str(raw).strip()
    if not text:
        return _normalize_timestamp(datetime.utcnow())
    # Common GDELT formats: YYYYMMDDHHMMSS or ISO 8601.
    try:
        if text.isdigit() and len(text) == 14:
            return _normalize_timestamp(datetime.strptime(text, "%Y%m%d%H%M%S"))
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        return _normalize_timestamp(datetime.fromisoformat(text))
    except ValueError:
        pass
    try:
        return _normalize_timestamp(datetime.strptime(text, "%Y-%m-%d %H:%M:%S"))
    except ValueError:
        LOGGER.debug("Failed to parse GDELT date: %s", text, extra=LOG_EXTRA)
        return _normalize_timestamp(datetime.utcnow())
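
# Inputs _parse_gdelt_datetime() accepts, all normalised to naive UTC:
#   "20240105123000"        -> datetime(2024, 1, 5, 12, 30)  # compact GDELT form
#   "2024-01-05T12:30:00Z"  -> datetime(2024, 1, 5, 12, 30)  # ISO 8601, Z suffix
#   "2024-01-05 12:30:00"   -> datetime(2024, 1, 5, 12, 30)  # space-separated fallback
# Anything unparseable falls back to the current UTC time.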


def _build_rss_item(record: Dict[str, object], config: GdeltSourceConfig) -> Optional[rss_ingest.RssItem]:
    url = record.get("url") or record.get("url_mobile")
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()
    title = record.get("title") or record.get("seendate")
    if not isinstance(title, str) or not title.strip():
        title = url
    title = title.strip()
    published_raw = (
        record.get("seendate")
        or record.get("publishDate")
        or record.get("date")
        or record.get("firstseendate")
    )
    published = _parse_gdelt_datetime(published_raw)
    summary_candidates: Iterable[object] = (
        record.get("summary"),
        record.get("snippet"),
        record.get("excerpt"),
        record.get("altText"),
        record.get("domain"),
    )
    summary = ""
    for candidate in summary_candidates:
        if isinstance(candidate, str) and candidate.strip():
            summary = candidate.strip()
            break
    if not summary:
        source_country = record.get("sourcecountry")
        language = record.get("language")
        details = [
            str(value).strip()
            for value in (source_country, language)
            if isinstance(value, str) and value.strip()
        ]
        summary = " / ".join(details) if details else title
    source = record.get("sourcecommonname") or record.get("domain")
    if not isinstance(source, str) or not source.strip():
        source = config.label or "GDELT"
    source = source.strip()
    fingerprint = f"{url}|{published.isoformat()}"
    article_id = hashlib.blake2s(fingerprint.encode("utf-8"), digest_size=16).hexdigest()
    return rss_ingest.RssItem(
        id=article_id,
        title=title,
        link=url,
        published=published,
        summary=summary,
        source=source,
        metadata={
            "source_key": config.key,
            "source_label": config.label,
        },
    )
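
# The deterministic id scheme above in isolation (values are illustrative):
#
#   hashlib.blake2s(
#       b"https://example.com/story|2024-01-05T12:30:00",
#       digest_size=16,
#   ).hexdigest()  # -> stable 32-hex-char id
#
# Hashing url|published means a re-fetched article maps to the same id, which
# is what lets downstream deduplication work.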


def fetch_gdelt_articles(
    config: GdeltSourceConfig,
    *,
    start: Optional[datetime] = None,
    end: Optional[datetime] = None,
) -> List[rss_ingest.RssItem]:
    """Fetch an article list from GDELT based on the supplied configuration."""
    if GdeltDoc is None or Filters is None:
        LOGGER.warning("gdeltdoc is not installed; skipping GDELT fetch", extra=LOG_EXTRA)
        return []
    filters_kwargs = dict(config.filters)
    filters_kwargs.setdefault("num_records", config.num_records)
    if start:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["start_date"] = start
    if end:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["end_date"] = end
    try:
        filter_obj = Filters(**filters_kwargs)
    except Exception as exc:  # noqa: BLE001 - guard misconfigured filters
        LOGGER.error("Failed to build GDELT filters key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []
    client = GdeltDoc()
    try:
        df = client.article_search(filter_obj)
    except Exception as exc:  # noqa: BLE001 - network/service issues
        LOGGER.warning("GDELT request failed key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []
    if df is None or df.empty:
        LOGGER.info("GDELT returned no matches key=%s", config.key, extra=LOG_EXTRA)
        return []
    items: List[rss_ingest.RssItem] = []
    for record in df.to_dict(orient="records"):
        item = _build_rss_item(record, config)
        if not item:
            continue
        assigned_codes = rss_ingest._assign_ts_codes(item, config.ts_codes, config.keywords)  # type: ignore[attr-defined]
        items.append(replace(item, ts_codes=tuple(assigned_codes)))
    return items
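
# Minimal usage sketch for fetch_gdelt_articles(); the filter values are
# hypothetical and assume the optional gdeltdoc dependency is installed:
#
#   config = GdeltSourceConfig(
#       key="demo",
#       label="Demo",
#       filters={"keyword": "lithium", "timespan": "24h"},
#   )
#   for item in fetch_gdelt_articles(config)[:5]:
#       print(item.published, item.source, item.title)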


def ingest_configured_gdelt(
    start: Optional[DateLike] = None,
    end: Optional[DateLike] = None,
    *,
    incremental: bool = True,
) -> int:
    """Ingest all configured GDELT sources into the news store."""
    sources = resolve_gdelt_sources()
    if not sources:
        LOGGER.info("No GDELT sources configured; skipping news fetch", extra=LOG_EXTRA)
        return 0
    start_dt = _ensure_datetime(start) if start else None
    end_dt = _ensure_datetime(end, start_of_day=False) if end else None
    aggregated: List[rss_ingest.RssItem] = []
    latest_by_source: Dict[str, datetime] = {}
    fetched = 0
    for config in sources:
        source_start = start_dt
        if incremental:
            last_seen = _load_last_published(config.key)
            if last_seen:
                candidate = last_seen + timedelta(seconds=1)
                if source_start is None or candidate > source_start:
                    source_start = candidate
        LOGGER.info(
            "Starting GDELT fetch: %s start=%s end=%s incremental=%s",
            config.label,
            source_start.isoformat() if source_start else None,
            end_dt.isoformat() if end_dt else None,
            incremental,
            extra=LOG_EXTRA,
        )
        items = fetch_gdelt_articles(config, start=source_start, end=end_dt)
        if not items:
            continue
        aggregated.extend(items)
        fetched += len(items)
        LOGGER.info("GDELT source %s returned %s records", config.label, len(items), extra=LOG_EXTRA)
    if not aggregated:
        return 0
    deduped = rss_ingest.deduplicate_items(aggregated)
    if not deduped:
        LOGGER.info("All GDELT items were duplicates; skipping persistence", extra=LOG_EXTRA)
        return 0
    inserted = rss_ingest.save_news_items(deduped)
    if inserted:
        latest_by_source.clear()
        for item in deduped:
            source_key = str(item.metadata.get("source_key", "") if item.metadata else "")
            if not source_key:
                continue
            current = latest_by_source.get(source_key)
            candidate = item.published
            if current is None or candidate > current:
                latest_by_source[source_key] = candidate
        for source_key, timestamp in latest_by_source.items():
            _save_last_published(source_key, timestamp)
    LOGGER.info(
        "GDELT news persistence complete fetched=%s deduped=%s inserted=%s",
        fetched,
        len(deduped),
        inserted,
        extra=LOG_EXTRA,
    )
    return inserted


__all__ = [
    "GdeltSourceConfig",
    "resolve_gdelt_sources",
    "fetch_gdelt_articles",
    "ingest_configured_gdelt",
]
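
# Hedged smoke test, assuming app config and the news database are already
# initialised; the date below is illustrative:
#
#   if __name__ == "__main__":
#       inserted = ingest_configured_gdelt(start=date(2024, 1, 1), incremental=False)
#       LOGGER.info("smoke run inserted=%s", inserted, extra=LOG_EXTRA)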