"""GDELT Doc API ingestion utilities built on top of gdeltdoc."""
|
||
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import sqlite3
|
||
from dataclasses import dataclass, field, replace
|
||
from datetime import date, datetime, timedelta, timezone
|
||
from typing import Dict, Iterable, List, Optional, Sequence, Union
|
||
|
||
try: # pragma: no cover - optional dependency
|
||
from gdeltdoc import GdeltDoc, Filters # type: ignore[import-not-found]
|
||
except ImportError: # pragma: no cover - optional dependency
|
||
GdeltDoc = None # type: ignore[assignment]
|
||
Filters = None # type: ignore[assignment]
|
||
|
||
from app.utils.config import get_config
|
||
from app.utils.db import db_session
|
||
from app.utils.logging import get_logger
|
||
|
||
from . import rss as rss_ingest
|
||
|
||
LOGGER = get_logger(__name__)
|
||
LOG_EXTRA = {"stage": "gdelt_ingest"}
|
||
DateLike = Union[date, datetime]
@dataclass
class GdeltSourceConfig:
    """Configuration describing a single GDELT filter set."""

    key: str
    label: str
    filters: Dict[str, object] = field(default_factory=dict)
    ts_codes: Sequence[str] = field(default_factory=tuple)
    keywords: Sequence[str] = field(default_factory=tuple)
    num_records: int = 50


def resolve_gdelt_sources() -> List[GdeltSourceConfig]:
    """Resolve configured GDELT filter groups."""

    cfg = get_config()
    raw = getattr(cfg, "gdelt_sources", None) or {}

    sources: List[GdeltSourceConfig] = []
    if isinstance(raw, dict):
        for key, data in raw.items():
            if not isinstance(data, dict):
                continue
            if not data.get("enabled", True):
                continue
            label = str(data.get("label") or key)
            filters = data.get("filters") if isinstance(data.get("filters"), dict) else {}
            ts_codes = [
                str(code).strip().upper()
                for code in data.get("ts_codes", [])
                if isinstance(code, str) and code.strip()
            ]
            keywords = [
                str(token).strip()
                for token in data.get("keywords", [])
                if isinstance(token, str) and token.strip()
            ]
            num_records = data.get("num_records")
            if not isinstance(num_records, int) or num_records <= 0:
                num_records = 50
            sources.append(
                GdeltSourceConfig(
                    key=str(key),
                    label=label,
                    filters=dict(filters),
                    ts_codes=tuple(ts_codes),
                    keywords=tuple(keywords),
                    num_records=num_records,
                )
            )
    return sources

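
# A hedged sketch of the configuration shape resolve_gdelt_sources() expects
# under get_config().gdelt_sources. The key names mirror the lookups above;
# the concrete values are illustrative, not taken from a real deployment:
#
#   gdelt_sources:
#     energy_eu:
#       enabled: true
#       label: "EU energy press"
#       filters: {keyword: "electricity grid", timespan: "24h"}
#       ts_codes: ["eu-power"]        # normalized to upper case -> "EU-POWER"
#       keywords: ["grid", "outage"]
#       num_records: 75               # non-int or <= 0 falls back to 50

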
def _ensure_datetime(value: DateLike, *, start_of_day: bool = True) -> datetime:
    if isinstance(value, datetime):
        return _normalize_timestamp(value)
    if start_of_day:
        return datetime.combine(value, datetime.min.time())
    return datetime.combine(value, datetime.max.time())


def _normalize_timestamp(value: datetime) -> datetime:
    if value.tzinfo is not None:
        return value.astimezone(timezone.utc).replace(tzinfo=None)
    return value

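
# Examples of the two helpers above: aware datetimes are collapsed to naive
# UTC so comparisons with DB-loaded values stay consistent, and bare dates are
# expanded to a day boundary.
#
#   _normalize_timestamp(datetime(2024, 1, 1, 9, 30, tzinfo=timezone.utc))
#   -> datetime(2024, 1, 1, 9, 30)               # tzinfo stripped, already UTC
#   _ensure_datetime(date(2024, 1, 1))
#   -> datetime(2024, 1, 1, 0, 0)                # start of day
#   _ensure_datetime(date(2024, 1, 1), start_of_day=False)
#   -> datetime(2024, 1, 1, 23, 59, 59, 999999)  # end of day

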
def _load_last_published(source_key: str) -> Optional[datetime]:
    try:
        with db_session(read_only=True) as conn:
            row = conn.execute(
                "SELECT last_published FROM ingest_state WHERE source = ?",
                (source_key,),
            ).fetchone()
    except sqlite3.OperationalError:
        return None
    if not row:
        return None
    raw = row["last_published"]
    if not raw:
        return None
    try:
        return _normalize_timestamp(datetime.fromisoformat(raw))
    except ValueError:
        LOGGER.debug("Failed to parse GDELT state timestamp source=%s value=%s", source_key, raw, extra=LOG_EXTRA)
        return None


def _save_last_published(source_key: str, published: datetime) -> None:
    timestamp = _normalize_timestamp(published).isoformat()
    try:
        with db_session() as conn:
            conn.execute(
                """
                INSERT INTO ingest_state (source, last_published)
                VALUES (?, ?)
                ON CONFLICT(source) DO UPDATE SET last_published = excluded.last_published
                """,
                (source_key, timestamp),
            )
    except sqlite3.OperationalError:
        LOGGER.debug("Failed to write ingest_state; table may not exist", extra=LOG_EXTRA)

def _parse_gdelt_datetime(raw: object) -> datetime:
    if isinstance(raw, datetime):
        return _normalize_timestamp(raw)
    if raw is None:
        return _normalize_timestamp(datetime.utcnow())
    text = str(raw).strip()
    if not text:
        return _normalize_timestamp(datetime.utcnow())
    # Common GDELT formats: YYYYMMDDHHMMSS or ISO 8601.
    try:
        if text.isdigit() and len(text) == 14:
            return _normalize_timestamp(datetime.strptime(text, "%Y%m%d%H%M%S"))
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        return _normalize_timestamp(datetime.fromisoformat(text))
    except ValueError:
        pass
    try:
        return _normalize_timestamp(datetime.strptime(text, "%Y-%m-%d %H:%M:%S"))
    except ValueError:
        LOGGER.debug("Failed to parse GDELT date: %s", text, extra=LOG_EXTRA)
        return _normalize_timestamp(datetime.utcnow())

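
# Examples of the accepted timestamp shapes (digits are illustrative):
#
#   _parse_gdelt_datetime("20240101123000")        -> datetime(2024, 1, 1, 12, 30)
#   _parse_gdelt_datetime("2024-01-01T12:30:00Z")  -> datetime(2024, 1, 1, 12, 30)
#   _parse_gdelt_datetime("2024-01-01 12:30:00")   -> datetime(2024, 1, 1, 12, 30)
#   _parse_gdelt_datetime("not-a-date")            -> current UTC time (fallback)

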
def _build_rss_item(record: Dict[str, object], config: GdeltSourceConfig) -> Optional[rss_ingest.RssItem]:
    url = record.get("url") or record.get("url_mobile")
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()

    title = record.get("title") or record.get("seendate")
    if not isinstance(title, str) or not title.strip():
        title = url
    title = title.strip()

    published_raw = (
        record.get("seendate")
        or record.get("publishDate")
        or record.get("date")
        or record.get("firstseendate")
    )
    published = _parse_gdelt_datetime(published_raw)

    summary_candidates: Iterable[object] = (
        record.get("summary"),
        record.get("snippet"),
        record.get("excerpt"),
        record.get("altText"),
        record.get("domain"),
    )
    summary = ""
    for candidate in summary_candidates:
        if isinstance(candidate, str) and candidate.strip():
            summary = candidate.strip()
            break
    if not summary:
        source_country = record.get("sourcecountry")
        language = record.get("language")
        details = [
            str(value).strip()
            for value in (source_country, language)
            if isinstance(value, str) and value.strip()
        ]
        summary = " / ".join(details) if details else title

    source = record.get("sourcecommonname") or record.get("domain")
    if not isinstance(source, str) or not source.strip():
        source = config.label or "GDELT"
    source = source.strip()

    fingerprint = f"{url}|{published.isoformat()}"
    article_id = hashlib.blake2s(fingerprint.encode("utf-8"), digest_size=16).hexdigest()

    return rss_ingest.RssItem(
        id=article_id,
        title=title,
        link=url,
        published=published,
        summary=summary,
        source=source,
        metadata={
            "source_key": config.key,
            "source_label": config.label,
        },
    )

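
# A hedged example of the mapping above, using field names seen in GDELT Doc
# API article records (all values invented for illustration):
#
#   record = {
#       "url": "https://example.com/story",
#       "title": "Grid operator reports outage",
#       "seendate": "2024-01-01T12:30:00Z",
#       "domain": "example.com",
#       "sourcecountry": "Germany",
#       "language": "English",
#   }
#   item = _build_rss_item(record, config)
#   # item.id is a 32-char blake2s hex digest of "url|published-isoformat";
#   # item.summary is "example.com" here, since "domain" is the first
#   # non-empty summary candidate and no snippet fields are present.

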
def fetch_gdelt_articles(
    config: GdeltSourceConfig,
    *,
    start: Optional[datetime] = None,
    end: Optional[datetime] = None,
) -> List[rss_ingest.RssItem]:
    """Fetch article list from GDELT based on the supplied configuration."""

    if GdeltDoc is None or Filters is None:
        LOGGER.warning("gdeltdoc is not installed; skipping GDELT fetch", extra=LOG_EXTRA)
        return []

    filters_kwargs = dict(config.filters)
    filters_kwargs.setdefault("num_records", config.num_records)
    if start:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["start_date"] = start
    if end:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["end_date"] = end

    try:
        filter_obj = Filters(**filters_kwargs)
    except Exception as exc:  # noqa: BLE001 - guard misconfigured filters
        LOGGER.error("Failed to build GDELT filters key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []

    client = GdeltDoc()
    try:
        df = client.article_search(filter_obj)
    except Exception as exc:  # noqa: BLE001 - network/service issues
        LOGGER.warning("GDELT request failed key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []

    if df is None or df.empty:
        LOGGER.info("GDELT returned no matches key=%s", config.key, extra=LOG_EXTRA)
        return []

    items: List[rss_ingest.RssItem] = []
    for record in df.to_dict(orient="records"):
        item = _build_rss_item(record, config)
        if not item:
            continue
        assigned_codes = rss_ingest._assign_ts_codes(item, config.ts_codes, config.keywords)  # type: ignore[attr-defined]
        items.append(replace(item, ts_codes=tuple(assigned_codes)))
    return items

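
# A minimal usage sketch, assuming gdeltdoc is installed and that the filter
# keys ("keyword", "timespan") are valid gdeltdoc.Filters arguments:
#
#   config = GdeltSourceConfig(
#       key="energy_eu",
#       label="EU energy press",
#       filters={"keyword": "electricity grid", "timespan": "24h"},
#       ts_codes=("EU-POWER",),
#   )
#   items = fetch_gdelt_articles(config)          # honours the timespan filter
#   window = fetch_gdelt_articles(
#       config,
#       start=datetime(2024, 1, 1),
#       end=datetime(2024, 1, 7),
#   )                                             # drops timespan, uses range

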
def ingest_configured_gdelt(
    start: Optional[DateLike] = None,
    end: Optional[DateLike] = None,
    *,
    incremental: bool = True,
) -> int:
    """Ingest all configured GDELT sources into the news store."""

    sources = resolve_gdelt_sources()
    if not sources:
        LOGGER.info("No GDELT sources configured; skipping news fetch", extra=LOG_EXTRA)
        return 0

    start_dt = _ensure_datetime(start) if start else None
    end_dt = _ensure_datetime(end, start_of_day=False) if end else None

    aggregated: List[rss_ingest.RssItem] = []
    fetched = 0
    for config in sources:
        source_start = start_dt
        if incremental:
            last_seen = _load_last_published(config.key)
            if last_seen:
                candidate = last_seen + timedelta(seconds=1)
                if source_start is None or candidate > source_start:
                    source_start = candidate
        LOGGER.info(
            "Fetching GDELT: %s start=%s end=%s incremental=%s",
            config.label,
            source_start.isoformat() if source_start else None,
            end_dt.isoformat() if end_dt else None,
            incremental,
            extra=LOG_EXTRA,
        )
        items = fetch_gdelt_articles(config, start=source_start, end=end_dt)
        if not items:
            continue
        aggregated.extend(items)
        fetched += len(items)
        LOGGER.info("GDELT source %s returned %s records", config.label, len(items), extra=LOG_EXTRA)

    if not aggregated:
        return 0

    deduped = rss_ingest.deduplicate_items(aggregated)
    if not deduped:
        LOGGER.info("All GDELT items were duplicates; skipping persistence", extra=LOG_EXTRA)
        return 0

    inserted = rss_ingest.save_news_items(deduped)
    if inserted:
        latest_by_source: Dict[str, datetime] = {}
        for item in deduped:
            source_key = str(item.metadata.get("source_key", "") if item.metadata else "")
            if not source_key:
                continue
            current = latest_by_source.get(source_key)
            candidate = item.published
            if current is None or candidate > current:
                latest_by_source[source_key] = candidate
        for source_key, timestamp in latest_by_source.items():
            _save_last_published(source_key, timestamp)
    LOGGER.info(
        "GDELT news persisted fetched=%s deduped=%s inserted=%s",
        fetched,
        len(deduped),
        inserted,
        extra=LOG_EXTRA,
    )
    return inserted

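
# Typical entry points, as a sketch (scheduler/CLI wiring is assumed to live
# elsewhere in the app):
#
#   ingest_configured_gdelt()                     # incremental, from saved state
#   ingest_configured_gdelt(
#       date(2024, 1, 1),
#       date(2024, 1, 7),
#       incremental=False,
#   )                                             # explicit backfill window

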
__all__ = [
    "GdeltSourceConfig",
    "resolve_gdelt_sources",
    "fetch_gdelt_articles",
    "ingest_configured_gdelt",
]