add GDELT news ingestion with metadata and ingest state tracking
parent d6292e2b2f
commit 2147dc3244
@@ -374,6 +374,12 @@ SCHEMA_STATEMENTS: Iterable[str] = (
    CREATE INDEX IF NOT EXISTS idx_news_code ON news(ts_code, pub_time DESC);
    """,
    """
    CREATE TABLE IF NOT EXISTS ingest_state (
        source TEXT PRIMARY KEY,
        last_published TEXT
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS heat_daily (
        scope TEXT,
        key TEXT,
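For orientation, a minimal sketch of how the new `ingest_state` table supports incremental fetches: one row per source, upserted with the newest publish time seen (the same `ON CONFLICT` upsert appears in `app/ingest/gdelt.py` below). The database path here is a placeholder; the project resolves the real connection through `app.utils.db`.

```python
import sqlite3

# Placeholder path; the project opens its database via app.utils.db, not directly.
conn = sqlite3.connect("data/app.db")

# Record the newest publish time seen for a source (mirrors _save_last_published).
conn.execute(
    """
    INSERT INTO ingest_state (source, last_published)
    VALUES (?, ?)
    ON CONFLICT(source) DO UPDATE SET last_published = excluded.last_published
    """,
    ("global_market_watch", "2024-05-10T08:30:00"),
)
conn.commit()

# Read it back to decide where the next incremental window should start.
row = conn.execute(
    "SELECT last_published FROM ingest_state WHERE source = ?",
    ("global_market_watch",),
).fetchone()
print(row[0])  # -> 2024-05-10T08:30:00
```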
@@ -44,6 +44,7 @@ from .api_client import (
    fetch_us_daily,
    save_records,
)
from .gdelt import ingest_configured_gdelt

LOGGER = get_logger(__name__)

@@ -172,6 +173,7 @@ def ensure_data_coverage(
    ts_codes: Optional[Sequence[str]] = None,
    include_limits: bool = True,
    include_extended: bool = True,
    include_news: bool = True,
    force: bool = False,
    progress_hook: Callable[[str, float], None] | None = None,
) -> None:
@@ -193,6 +195,9 @@ def ensure_data_coverage(
        extra_steps += 1
    if include_extended:
        extra_steps += 4
    news_enabled = include_news and not _is_disabled("news")
    if news_enabled:
        extra_steps += 1
    total_steps = 5 + extra_steps
    current_step = 0

@@ -358,6 +363,17 @@ def ensure_data_coverage(
        _save_with_codes("hk_daily", fetch_hk_daily, targets=HK_CODES)
        _save_with_codes("us_daily", fetch_us_daily, targets=US_CODES)

    if news_enabled:
        advance("拉取 GDELT 新闻数据")
        try:
            ingest_configured_gdelt(
                start=start,
                end=end,
                incremental=not force,
            )
        except Exception as exc:  # noqa: BLE001
            LOGGER.warning("GDELT 新闻拉取失败:%s", exc, extra=LOG_EXTRA)

    if progress_hook:
        progress_hook("数据覆盖检查完成", 1.0)
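For context, a hedged sketch of how this entry point is typically invoked with the new flag. Only keyword arguments visible in this hunk are shown; any required positional arguments (for example a date window) are not visible here and are omitted.

```python
from app.ingest.coverage import ensure_data_coverage

# Sketch only: arguments not shown in this hunk (e.g. the date range) are omitted.
ensure_data_coverage(
    include_news=True,     # run the GDELT step unless the "news" table is disabled
    include_extended=True,
    include_limits=True,
    force=False,           # force=True makes the GDELT fetch non-incremental
    progress_hook=lambda label, fraction: print(f"{fraction:>4.0%} {label}"),
)
```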
app/ingest/gdelt.py (new file, 344 lines)
@@ -0,0 +1,344 @@
"""GDELT Doc API ingestion utilities built on top of gdeltdoc."""
from __future__ import annotations

import hashlib
import sqlite3
from dataclasses import dataclass, field, replace
from datetime import date, datetime, timedelta, timezone
from typing import Dict, Iterable, List, Optional, Sequence, Union

try:  # pragma: no cover - optional dependency
    from gdeltdoc import GdeltDoc, Filters  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover - optional dependency
    GdeltDoc = None  # type: ignore[assignment]
    Filters = None  # type: ignore[assignment]

from app.utils.config import get_config
from app.utils.db import db_session
from app.utils.logging import get_logger

from . import rss as rss_ingest

LOGGER = get_logger(__name__)
LOG_EXTRA = {"stage": "gdelt_ingest"}
DateLike = Union[date, datetime]


@dataclass
class GdeltSourceConfig:
    """Configuration describing a single GDELT filter set."""

    key: str
    label: str
    filters: Dict[str, object] = field(default_factory=dict)
    ts_codes: Sequence[str] = field(default_factory=tuple)
    keywords: Sequence[str] = field(default_factory=tuple)
    num_records: int = 50


def resolve_gdelt_sources() -> List[GdeltSourceConfig]:
    """Resolve configured GDELT filter groups."""

    cfg = get_config()
    raw = getattr(cfg, "gdelt_sources", None) or {}

    sources: List[GdeltSourceConfig] = []
    if isinstance(raw, dict):
        for key, data in raw.items():
            if not isinstance(data, dict):
                continue
            if not data.get("enabled", True):
                continue
            label = str(data.get("label") or key)
            filters = data.get("filters") if isinstance(data.get("filters"), dict) else {}
            ts_codes = [
                str(code).strip().upper()
                for code in data.get("ts_codes", [])
                if isinstance(code, str) and code.strip()
            ]
            keywords = [
                str(token).strip()
                for token in data.get("keywords", [])
                if isinstance(token, str) and token.strip()
            ]
            num_records = data.get("num_records")
            if not isinstance(num_records, int) or num_records <= 0:
                num_records = 50
            sources.append(
                GdeltSourceConfig(
                    key=str(key),
                    label=label,
                    filters=dict(filters),
                    ts_codes=tuple(ts_codes),
                    keywords=tuple(keywords),
                    num_records=num_records,
                )
            )
    return sources


def _ensure_datetime(value: DateLike, *, start_of_day: bool = True) -> datetime:
    if isinstance(value, datetime):
        return _normalize_timestamp(value)
    if start_of_day:
        return datetime.combine(value, datetime.min.time())
    return datetime.combine(value, datetime.max.time())


def _normalize_timestamp(value: datetime) -> datetime:
    if value.tzinfo is not None:
        return value.astimezone(timezone.utc).replace(tzinfo=None)
    return value


def _load_last_published(source_key: str) -> Optional[datetime]:
    try:
        with db_session(read_only=True) as conn:
            row = conn.execute(
                "SELECT last_published FROM ingest_state WHERE source = ?",
                (source_key,),
            ).fetchone()
    except sqlite3.OperationalError:
        return None
    if not row:
        return None
    raw = row["last_published"]
    if not raw:
        return None
    try:
        return _normalize_timestamp(datetime.fromisoformat(raw))
    except ValueError:
        LOGGER.debug("无法解析 GDELT 状态时间 source=%s value=%s", source_key, raw, extra=LOG_EXTRA)
        return None


def _save_last_published(source_key: str, published: datetime) -> None:
    timestamp = _normalize_timestamp(published).isoformat()
    try:
        with db_session() as conn:
            conn.execute(
                """
                INSERT INTO ingest_state (source, last_published)
                VALUES (?, ?)
                ON CONFLICT(source) DO UPDATE SET last_published = excluded.last_published
                """,
                (source_key, timestamp),
            )
    except sqlite3.OperationalError:
        LOGGER.debug("写入 ingest_state 失败,表可能不存在", extra=LOG_EXTRA)


def _parse_gdelt_datetime(raw: object) -> datetime:
    if isinstance(raw, datetime):
        return _normalize_timestamp(raw)
    if raw is None:
        return _normalize_timestamp(datetime.utcnow())
    text = str(raw).strip()
    if not text:
        return _normalize_timestamp(datetime.utcnow())
    # Common GDELT formats: YYYYMMDDHHMMSS or ISO8601
    try:
        if text.isdigit() and len(text) == 14:
            return _normalize_timestamp(datetime.strptime(text, "%Y%m%d%H%M%S"))
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        return _normalize_timestamp(datetime.fromisoformat(text))
    except ValueError:
        pass
    try:
        return _normalize_timestamp(datetime.strptime(text, "%Y-%m-%d %H:%M:%S"))
    except ValueError:
        LOGGER.debug("无法解析 GDELT 日期:%s", text, extra=LOG_EXTRA)
        return _normalize_timestamp(datetime.utcnow())


def _build_rss_item(record: Dict[str, object], config: GdeltSourceConfig) -> Optional[rss_ingest.RssItem]:
    url = record.get("url") or record.get("url_mobile")
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()

    title = record.get("title") or record.get("seendate")
    if not isinstance(title, str) or not title.strip():
        title = url
    title = title.strip()

    published_raw = (
        record.get("seendate")
        or record.get("publishDate")
        or record.get("date")
        or record.get("firstseendate")
    )
    published = _parse_gdelt_datetime(published_raw)

    summary_candidates: Iterable[object] = (
        record.get("summary"),
        record.get("snippet"),
        record.get("excerpt"),
        record.get("altText"),
        record.get("domain"),
    )
    summary = ""
    for candidate in summary_candidates:
        if isinstance(candidate, str) and candidate.strip():
            summary = candidate.strip()
            break
    if not summary:
        source_country = record.get("sourcecountry")
        language = record.get("language")
        details = [
            str(value).strip()
            for value in (source_country, language)
            if isinstance(value, str) and value.strip()
        ]
        summary = " / ".join(details) if details else title

    source = record.get("sourcecommonname") or record.get("domain")
    if not isinstance(source, str) or not source.strip():
        source = config.label or "GDELT"
    source = source.strip()

    fingerprint = f"{url}|{published.isoformat()}"
    article_id = hashlib.blake2s(fingerprint.encode("utf-8"), digest_size=16).hexdigest()

    return rss_ingest.RssItem(
        id=article_id,
        title=title,
        link=url,
        published=published,
        summary=summary,
        source=source,
        metadata={
            "source_key": config.key,
            "source_label": config.label,
        },
    )


def fetch_gdelt_articles(
    config: GdeltSourceConfig,
    *,
    start: Optional[datetime] = None,
    end: Optional[datetime] = None,
) -> List[rss_ingest.RssItem]:
    """Fetch article list from GDELT based on the supplied configuration."""

    if GdeltDoc is None or Filters is None:
        LOGGER.warning("未安装 gdeltdoc,跳过 GDELT 拉取", extra=LOG_EXTRA)
        return []

    filters_kwargs = dict(config.filters)
    filters_kwargs.setdefault("num_records", config.num_records)
    if start:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["start_date"] = start
    if end:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["end_date"] = end

    try:
        filter_obj = Filters(**filters_kwargs)
    except Exception as exc:  # noqa: BLE001 - guard misconfigured filters
        LOGGER.error("GDELT 过滤器解析失败 key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []

    client = GdeltDoc()
    try:
        df = client.article_search(filter_obj)
    except Exception as exc:  # noqa: BLE001 - network/service issues
        LOGGER.warning("GDELT 请求失败 key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []

    if df is None or df.empty:
        LOGGER.info("GDELT 无匹配结果 key=%s", config.key, extra=LOG_EXTRA)
        return []

    items: List[rss_ingest.RssItem] = []
    for record in df.to_dict(orient="records"):
        item = _build_rss_item(record, config)
        if not item:
            continue
        assigned_codes = rss_ingest._assign_ts_codes(item, config.ts_codes, config.keywords)  # type: ignore[attr-defined]
        items.append(replace(item, ts_codes=tuple(assigned_codes)))
    return items


def ingest_configured_gdelt(
    start: Optional[DateLike] = None,
    end: Optional[DateLike] = None,
    *,
    incremental: bool = True,
) -> int:
    """Ingest all configured GDELT sources into the news store."""

    sources = resolve_gdelt_sources()
    if not sources:
        LOGGER.info("未配置 GDELT 来源,跳过新闻拉取", extra=LOG_EXTRA)
        return 0

    start_dt = _ensure_datetime(start) if start else None
    end_dt = _ensure_datetime(end, start_of_day=False) if end else None

    aggregated: List[rss_ingest.RssItem] = []
    latest_by_source: Dict[str, datetime] = {}
    fetched = 0
    for config in sources:
        source_start = start_dt
        if incremental:
            last_seen = _load_last_published(config.key)
            if last_seen:
                candidate = last_seen + timedelta(seconds=1)
                if source_start is None or candidate > source_start:
                    source_start = candidate
        LOGGER.info(
            "开始拉取 GDELT:%s start=%s end=%s incremental=%s",
            config.label,
            source_start.isoformat() if source_start else None,
            end_dt.isoformat() if end_dt else None,
            incremental,
            extra=LOG_EXTRA,
        )
        items = fetch_gdelt_articles(config, start=source_start, end=end_dt)
        if not items:
            continue
        aggregated.extend(items)
        fetched += len(items)
        LOGGER.info("GDELT 来源 %s 返回 %s 条记录", config.label, len(items), extra=LOG_EXTRA)

    if not aggregated:
        return 0

    deduped = rss_ingest.deduplicate_items(aggregated)
    if not deduped:
        LOGGER.info("GDELT 数据全部为重复项,跳过落库", extra=LOG_EXTRA)
        return 0

    inserted = rss_ingest.save_news_items(deduped)
    if inserted:
        latest_by_source.clear()
        for item in deduped:
            source_key = str(item.metadata.get("source_key", "") if item.metadata else "")
            if not source_key:
                continue
            current = latest_by_source.get(source_key)
            candidate = item.published
            if current is None or candidate > current:
                latest_by_source[source_key] = candidate
        for source_key, timestamp in latest_by_source.items():
            _save_last_published(source_key, timestamp)
    LOGGER.info(
        "GDELT 新闻落库完成 fetched=%s deduped=%s inserted=%s",
        fetched,
        len(deduped),
        inserted,
        extra=LOG_EXTRA,
    )
    return inserted


__all__ = [
    "GdeltSourceConfig",
    "resolve_gdelt_sources",
    "fetch_gdelt_articles",
    "ingest_configured_gdelt",
]
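A small usage sketch of the module above, run outside the coverage workflow. The source key and filters below are hypothetical; real sources come from the `gdelt_sources` section of the app config.

```python
from datetime import date

from app.ingest.gdelt import (
    GdeltSourceConfig,
    fetch_gdelt_articles,
    ingest_configured_gdelt,
)

# One-off fetch with an ad-hoc config (bypasses the persisted configuration).
config = GdeltSourceConfig(
    key="demo",                        # hypothetical key
    label="Demo Watchlist",
    filters={"timespan": "24h", "keyword": "semiconductors"},
    num_records=25,
)
items = fetch_gdelt_articles(config)
print(len(items), "articles fetched")

# Normal path: ingest every enabled source from config, incrementally.
inserted = ingest_configured_gdelt(start=date(2024, 5, 1), end=date(2024, 5, 7), incremental=True)
print("rows inserted:", inserted)
```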
@@ -120,6 +120,7 @@ class RssItem:
    stock_mentions: List[StockMention] = field(default_factory=list)
    industries: List[str] = field(default_factory=list)
    important_keywords: List[str] = field(default_factory=list)
    metadata: Dict[str, object] = field(default_factory=dict)

    def __post_init__(self):
        """Initialize company mapper if not already initialized."""
@@ -481,14 +482,20 @@ def deduplicate_items(items: Iterable[RssItem]) -> List[RssItem]:
        if key in seen:
            continue
        seen.add(key)

        preassigned_codes = list(item.ts_codes or [])
        # 提取实体和相关信息
        item.extract_entities()

        # 如果找到了相关股票,则保留这条新闻
        if item.stock_mentions:
            unique.append(item)
            continue

        # 否则如果配置了预设股票代码,则保留这些代码
        if preassigned_codes:
            if not item.ts_codes:
                item.ts_codes = preassigned_codes
            unique.append(item)
    return unique


@@ -514,16 +521,16 @@ def save_news_items(items: Iterable[RssItem]) -> int:
            industry_count=len(item.industries)
        )
        # 构建包含更多信息的entities对象
        entities = json.dumps(
            {
                "ts_codes": list(base_codes),
                "source_url": item.link,
                "industries": item.industries,  # 添加行业信息
                "important_keywords": item.important_keywords,  # 添加重要关键词
                "text_length": len(text_payload),  # 添加文本长度信息
            },
            ensure_ascii=False,
        )
        entity_payload = {
            "ts_codes": list(base_codes),
            "source_url": item.link,
            "industries": item.industries,  # 添加行业信息
            "important_keywords": item.important_keywords,  # 添加重要关键词
            "text_length": len(text_payload),  # 添加文本长度信息
        }
        if item.metadata:
            entity_payload["metadata"] = dict(item.metadata)
        entities = json.dumps(entity_payload, ensure_ascii=False)
        resolved_codes = base_codes or (None,)
        for ts_code in resolved_codes:
            row_id = item.id if ts_code is None else f"{item.id}::{ts_code}"
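The hunk above replaces the inline `json.dumps` call with an `entity_payload` dict so that `item.metadata` can be folded into the persisted `entities` JSON. As a sketch, a GDELT-sourced row would now carry a payload along these lines (all values are illustrative):

```python
import json

# Illustrative payload; real values come from the RssItem being saved.
entity_payload = {
    "ts_codes": ["000001.SZ"],
    "source_url": "https://example.com/article",
    "industries": ["banking"],
    "important_keywords": ["earnings"],
    "text_length": 1284,
    # Present only when the item has metadata, e.g. items built by app/ingest/gdelt.py:
    "metadata": {"source_key": "global_market_watch", "source_label": "Global Market Watch"},
}
entities = json.dumps(entity_payload, ensure_ascii=False)
print(entities)
```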
@@ -58,6 +58,7 @@ def run_ingestion(
    *,
    include_limits: bool = True,
    include_extended: bool = True,
    include_news: bool = True,
    post_tasks: Optional[Iterable[PostTask]] = None,
) -> None:
    """Execute a TuShare ingestion job with optional post processing hooks."""
@@ -71,6 +72,7 @@ def run_ingestion(
        ts_codes=job.ts_codes,
        include_limits=include_limits,
        include_extended=include_extended,
        include_news=include_news,
        force=True,
    )
    logger.update_metadata(job.as_dict())
@@ -103,4 +105,3 @@ __all__ = [
    "ensure_data_coverage",
    "run_ingestion",
]
@@ -548,6 +548,23 @@ def _default_rss_sources() -> Dict[str, object]:
    }


def _default_gdelt_sources() -> Dict[str, object]:
    return {
        "global_market_watch": {
            "enabled": False,
            "label": "Global Market Watch",
            "filters": {
                "timespan": "24h",
                "keyword": "\"stock market\" OR 股票",
                "language": ["en", "zh"],
                "num_records": 75,
            },
            "ts_codes": [],
            "keywords": [],
        }
    }


@dataclass
class AppConfig:
    """User configurable settings persisted in a simple structure."""
@@ -567,6 +584,7 @@ class AppConfig:
    departments: Dict[str, DepartmentSettings] = field(default_factory=_default_departments)
    portfolio: PortfolioSettings = field(default_factory=PortfolioSettings)
    alert_channels: Dict[str, AlertChannelSettings] = field(default_factory=dict)
    gdelt_sources: Dict[str, object] = field(default_factory=_default_gdelt_sources)
    disabled_ingest_tables: Set[str] = field(default_factory=set)

    def resolve_llm(self, route: Optional[str] = None) -> LLMConfig:
@@ -651,6 +669,20 @@ def _load_from_file(cfg: AppConfig) -> None:
                default_rss[str(key)] = value
        cfg.rss_sources = default_rss

    gdelt_payload = payload.get("gdelt_sources")
    default_gdelt = _default_gdelt_sources()
    if isinstance(gdelt_payload, dict):
        sanitized: Dict[str, object] = {}
        for key, value in gdelt_payload.items():
            if isinstance(value, dict):
                sanitized[str(key)] = value
        if sanitized:
            cfg.gdelt_sources = sanitized
        else:
            cfg.gdelt_sources = default_gdelt
    else:
        cfg.gdelt_sources = default_gdelt

    weights_payload = payload.get("agent_weights")
    if isinstance(weights_payload, dict):
        cfg.agent_weights.update_from_dict(weights_payload)
@@ -953,6 +985,7 @@ def save_config(cfg: AppConfig | None = None) -> None:
        "force_refresh": cfg.force_refresh,
        "auto_update_data": cfg.auto_update_data,
        "decision_method": cfg.decision_method,
        "gdelt_sources": cfg.gdelt_sources,
        "disabled_ingest_tables": sorted(cfg.disabled_ingest_tables),
        "rss_sources": cfg.rss_sources,
        "agent_weights": cfg.agent_weights.as_dict(),
docs/GDELT_README.md (new file, 139 lines)
@@ -0,0 +1,139 @@
# GDELT 2.0 Doc API Client

A Python client to fetch data from the [GDELT 2.0 Doc API](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/).

This allows for simpler, small-scale analysis of news coverage without having to deal with the complexities of downloading and managing the raw files from S3, or working with the BigQuery export.

## Installation
`gdeltdoc` is on PyPI and is installed through pip:

```bash
pip install gdeltdoc
```

## Use

The `ArtList` and `Timeline*` query modes are supported.

```python
from gdeltdoc import GdeltDoc, Filters

f = Filters(
    keyword = "climate change",
    start_date = "2020-05-10",
    end_date = "2020-05-11"
)

gd = GdeltDoc()

# Search for articles matching the filters
articles = gd.article_search(f)

# Get a timeline of the number of articles matching the filters
timeline = gd.timeline_search("timelinevol", f)
```
## Integration in `llm_quant`

This repository wires `gdeltdoc` into the TuShare ingestion workflow so GDELT headlines arrive alongside the usual market data.

- Configuration lives under `gdelt_sources` in `app/data/config.json` (managed via `AppConfig.gdelt_sources`).
- `app/ingest/gdelt.py` wraps the Doc API, materialising results as `RssItem` objects so they share the same dedupe/heat scoring pipeline as RSS feeds.
- `app/ingest/coverage.ensure_data_coverage` now calls `ingest_configured_gdelt(...)` after the core TuShare tables, supporting incremental fetches via `ingest_state`.

Enable a source by flipping `enabled: true` in the config, optionally providing `start_date`/`end_date` windows or a rolling `timespan`. Subsequent runs only request data beyond the last persisted publish time.
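As a sketch, an enabled entry in `app/data/config.json` could look like the following (shown here as the equivalent Python dict; the source key, codes, and keywords are placeholders, and the field names mirror `_default_gdelt_sources()`):

```python
gdelt_sources = {
    "cn_semiconductors": {             # placeholder source key
        "enabled": True,
        "label": "CN Semiconductors",
        "filters": {
            "timespan": "48h",         # or use "start_date"/"end_date" instead
            "keyword": "semiconductor OR 芯片",
            "language": ["en", "zh"],
            "num_records": 100,
        },
        "ts_codes": ["688981.SH"],     # pre-assigned codes for matched articles
        "keywords": ["SMIC"],          # extra tokens used when assigning codes
    }
}
```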
### Article List

The article list mode of the API generates a list of news articles that match the filters. The client returns this as a pandas DataFrame with columns `url`, `url_mobile`, `title`, `seendate`, `socialimage`, `domain`, `language`, `sourcecountry`.
### Timeline Search

There are 5 available modes when making a timeline search (a short usage sketch follows the list):

- `timelinevol` - a timeline of the volume of news coverage matching the filters, represented as a percentage of the total news articles monitored by GDELT.
- `timelinevolraw` - similar to `timelinevol`, but has the actual number of articles and a total rather than a percentage.
- `timelinelang` - similar to `timelinevol` but breaks the total articles down by published language. Each language is returned as a separate column in the DataFrame.
- `timelinesourcecountry` - similar to `timelinevol` but breaks the total articles down by the country they were published in. Each country is returned as a separate column in the DataFrame.
- `timelinetone` - a timeline of the average tone of the news coverage matching the filters. See [GDELT's documentation](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/) for more information about the tone metric.
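Reusing `gd` and `f` from the example in the Use section, each mode is simply the first argument to `timeline_search`, and each call returns a pandas DataFrame:

```python
# Reusing `gd` and `f` from the "Use" example above.
volume = gd.timeline_search("timelinevol", f)      # % of monitored coverage
raw = gd.timeline_search("timelinevolraw", f)      # absolute article counts
tone = gd.timeline_search("timelinetone", f)       # average tone over time

print(volume.head())
```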
### Filters

The search query passed to the API is constructed from a `gdeltdoc.Filters` object.

```python
from gdeltdoc import Filters, near, repeat

f = Filters(
    start_date = "2020-05-01",
    end_date = "2020-05-02",
    num_records = 250,
    keyword = "climate change",
    domain = ["bbc.co.uk", "nytimes.com"],
    country = ["UK", "US"],
    theme = "GENERAL_HEALTH",
    near = near(10, "airline", "carbon"),
    repeat = repeat(5, "planet")
)
```

Filters for `keyword`, `domain`, `domain_exact`, `country`, `language` and `theme` can be passed either as a single string or as a list of strings. If a list is passed, the values in the list are wrapped in a boolean OR.

You must pass either `start_date` and `end_date`, or `timespan` (a combined example follows the list).

- `start_date` - The start date for the filter in YYYY-MM-DD format or as a datetime object in UTC time.
  Passing a datetime allows you to specify a time down to seconds granularity. The API officially only supports the most recent 3 months of articles. Making a request for an earlier date range may still return data, but it's not guaranteed.
- `end_date` - The end date for the filter in YYYY-MM-DD format or as a datetime object in UTC time.
- `timespan` - A timespan to search for, relative to the time of the request. Must match one of the API's timespan formats - https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/
- `num_records` - The number of records to return. Only used in article list mode and can be up to 250.
- `keyword` - Return articles containing the exact phrase `keyword` within the article text.
- `domain` - Return articles from the specified domain. Does not require an exact match so passing "cnn.com" will match articles from `cnn.com`, `subdomain.cnn.com` and `notactuallycnn.com`.
- `domain_exact` - Similar to `domain`, but requires an exact match.
- `country` - Return articles published in a country or list of countries, formatted as the FIPS 2 letter country code.
- `language` - Return articles published in the given language, formatted as the ISO 639 language code.
- `theme` - Return articles that cover one of GDELT's GKG Themes. A full list of themes can be found [here](http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT).
- `near` - Return articles containing words close to each other in the text. Use `near()` to construct, e.g. `near = near(5, "airline", "climate")`, or `multi_near()` if you want to use multiple restrictions, e.g. `multi_near([(5, "airline", "crisis"), (10, "airline", "climate", "change")], method="AND")` finds "airline" and "crisis" within 5 words, and "airline", "climate", and "change" within 10 words.
- `repeat` - Return articles containing a single word repeated at least a number of times. Use `repeat()` to construct, e.g. `repeat = repeat(3, "environment")`, or `multi_repeat()` if you want to use multiple restrictions, e.g. `repeat = multi_repeat([(2, "airline"), (3, "airport")], "AND")`.
- `tone` - Return articles above or below a particular tone score (i.e. more positive or more negative than a certain threshold). To use, specify either a greater than or less than sign and a positive or negative number (either an integer or floating point number). To find fairly positive articles, use `tone=">5"`, or to search for fairly negative articles, use `tone="<-5"`.
- `tone_absolute` - The same as `tone`, but ignores the positive/negative sign and lets you search for high-emotion or low-emotion articles, regardless of whether they were happy or sad in tone.
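Putting several of these together, a sketch of a rolling-window query combining `timespan`, `tone`, and proximity/repeat restrictions (values are illustrative, and it is assumed `multi_repeat` is importable from the package root like `near` and `repeat`):

```python
from gdeltdoc import GdeltDoc, Filters, multi_repeat, near

f = Filters(
    timespan = "7d",                    # rolling window instead of start/end dates
    keyword = "interest rates",
    language = ["en"],
    tone = "<-5",                       # fairly negative coverage only
    near = near(10, "bank", "regulator"),
    repeat = multi_repeat([(2, "inflation"), (2, "recession")], "AND"),
    num_records = 100,
)

articles = GdeltDoc().article_search(f)
print(articles[["title", "domain", "seendate"]].head())
```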
## Developing gdelt-doc-api

PRs & issues are very welcome!

### Setup

It's recommended to use a virtual environment for development. Set one up with:

```bash
python -m venv venv
```

and activate it (on Mac or Linux):

```bash
source venv/bin/activate
```

Then install the requirements:

```bash
pip install -r requirements.txt
```

Tests for this package use `unittest`. Run them with:

```bash
python -m unittest
```

If your PR adds a new feature or helper, please also add some tests.

### Publishing

There's a bit of automation set up to help publish a new version of the package to PyPI:

1. Make sure the version string has been updated since the last release. This package follows semantic versioning.
2. Create a new release in the GitHub UI, using the new version as the release name.
3. Watch as the `publish.yml` GitHub action builds the package and pushes it to PyPI.
@@ -10,3 +10,4 @@ feedparser>=6.0
arch>=6.1.0
scipy>=1.11.0
torch>=2.3.0
gdeltdoc>=0.1.7