diff --git a/app/data/schema.py b/app/data/schema.py index 6d4ebbc..dc1854c 100644 --- a/app/data/schema.py +++ b/app/data/schema.py @@ -374,6 +374,12 @@ SCHEMA_STATEMENTS: Iterable[str] = ( CREATE INDEX IF NOT EXISTS idx_news_code ON news(ts_code, pub_time DESC); """, """ + CREATE TABLE IF NOT EXISTS ingest_state ( + source TEXT PRIMARY KEY, + last_published TEXT + ); + """, + """ CREATE TABLE IF NOT EXISTS heat_daily ( scope TEXT, key TEXT, diff --git a/app/ingest/coverage.py b/app/ingest/coverage.py index cdbb983..8810126 100644 --- a/app/ingest/coverage.py +++ b/app/ingest/coverage.py @@ -44,6 +44,7 @@ from .api_client import ( fetch_us_daily, save_records, ) +from .gdelt import ingest_configured_gdelt LOGGER = get_logger(__name__) @@ -172,6 +173,7 @@ def ensure_data_coverage( ts_codes: Optional[Sequence[str]] = None, include_limits: bool = True, include_extended: bool = True, + include_news: bool = True, force: bool = False, progress_hook: Callable[[str, float], None] | None = None, ) -> None: @@ -193,6 +195,9 @@ def ensure_data_coverage( extra_steps += 1 if include_extended: extra_steps += 4 + news_enabled = include_news and not _is_disabled("news") + if news_enabled: + extra_steps += 1 total_steps = 5 + extra_steps current_step = 0 @@ -358,6 +363,17 @@ def ensure_data_coverage( _save_with_codes("hk_daily", fetch_hk_daily, targets=HK_CODES) _save_with_codes("us_daily", fetch_us_daily, targets=US_CODES) + if news_enabled: + advance("拉取 GDELT 新闻数据") + try: + ingest_configured_gdelt( + start=start, + end=end, + incremental=not force, + ) + except Exception as exc: # noqa: BLE001 + LOGGER.warning("GDELT 新闻拉取失败:%s", exc, extra=LOG_EXTRA) + if progress_hook: progress_hook("数据覆盖检查完成", 1.0) diff --git a/app/ingest/gdelt.py b/app/ingest/gdelt.py new file mode 100644 index 0000000..f0aa6a8 --- /dev/null +++ b/app/ingest/gdelt.py @@ -0,0 +1,344 @@ +"""GDELT Doc API ingestion utilities built on top of gdeltdoc.""" +from __future__ import annotations + +import hashlib +import sqlite3 +from dataclasses import dataclass, field, replace +from datetime import date, datetime, timedelta, timezone +from typing import Dict, Iterable, List, Optional, Sequence, Union + +try: # pragma: no cover - optional dependency + from gdeltdoc import GdeltDoc, Filters # type: ignore[import-not-found] +except ImportError: # pragma: no cover - optional dependency + GdeltDoc = None # type: ignore[assignment] + Filters = None # type: ignore[assignment] + +from app.utils.config import get_config +from app.utils.db import db_session +from app.utils.logging import get_logger + +from . 
import rss as rss_ingest + +LOGGER = get_logger(__name__) +LOG_EXTRA = {"stage": "gdelt_ingest"} +DateLike = Union[date, datetime] + + +@dataclass +class GdeltSourceConfig: + """Configuration describing a single GDELT filter set.""" + + key: str + label: str + filters: Dict[str, object] = field(default_factory=dict) + ts_codes: Sequence[str] = field(default_factory=tuple) + keywords: Sequence[str] = field(default_factory=tuple) + num_records: int = 50 + + +def resolve_gdelt_sources() -> List[GdeltSourceConfig]: + """Resolve configured GDELT filter groups.""" + + cfg = get_config() + raw = getattr(cfg, "gdelt_sources", None) or {} + + sources: List[GdeltSourceConfig] = [] + if isinstance(raw, dict): + for key, data in raw.items(): + if not isinstance(data, dict): + continue + if not data.get("enabled", True): + continue + label = str(data.get("label") or key) + filters = data.get("filters") if isinstance(data.get("filters"), dict) else {} + ts_codes = [ + str(code).strip().upper() + for code in data.get("ts_codes", []) + if isinstance(code, str) and code.strip() + ] + keywords = [ + str(token).strip() + for token in data.get("keywords", []) + if isinstance(token, str) and token.strip() + ] + num_records = data.get("num_records") + if not isinstance(num_records, int) or num_records <= 0: + num_records = 50 + sources.append( + GdeltSourceConfig( + key=str(key), + label=label, + filters=dict(filters), + ts_codes=tuple(ts_codes), + keywords=tuple(keywords), + num_records=num_records, + ) + ) + return sources + + +def _ensure_datetime(value: DateLike, *, start_of_day: bool = True) -> datetime: + if isinstance(value, datetime): + return _normalize_timestamp(value) + if start_of_day: + return datetime.combine(value, datetime.min.time()) + return datetime.combine(value, datetime.max.time()) + + +def _normalize_timestamp(value: datetime) -> datetime: + if value.tzinfo is not None: + return value.astimezone(timezone.utc).replace(tzinfo=None) + return value + + +def _load_last_published(source_key: str) -> Optional[datetime]: + try: + with db_session(read_only=True) as conn: + row = conn.execute( + "SELECT last_published FROM ingest_state WHERE source = ?", + (source_key,), + ).fetchone() + except sqlite3.OperationalError: + return None + if not row: + return None + raw = row["last_published"] + if not raw: + return None + try: + return _normalize_timestamp(datetime.fromisoformat(raw)) + except ValueError: + LOGGER.debug("无法解析 GDELT 状态时间 source=%s value=%s", source_key, raw, extra=LOG_EXTRA) + return None + + +def _save_last_published(source_key: str, published: datetime) -> None: + timestamp = _normalize_timestamp(published).isoformat() + try: + with db_session() as conn: + conn.execute( + """ + INSERT INTO ingest_state (source, last_published) + VALUES (?, ?) 
+ ON CONFLICT(source) DO UPDATE SET last_published = excluded.last_published + """, + (source_key, timestamp), + ) + except sqlite3.OperationalError: + LOGGER.debug("写入 ingest_state 失败,表可能不存在", extra=LOG_EXTRA) + + +def _parse_gdelt_datetime(raw: object) -> datetime: + if isinstance(raw, datetime): + return _normalize_timestamp(raw) + if raw is None: + return _normalize_timestamp(datetime.utcnow()) + text = str(raw).strip() + if not text: + return _normalize_timestamp(datetime.utcnow()) + # Common GDELT formats: YYYYMMDDHHMMSS or ISO8601 + try: + if text.isdigit() and len(text) == 14: + return _normalize_timestamp(datetime.strptime(text, "%Y%m%d%H%M%S")) + if text.endswith("Z"): + text = text[:-1] + "+00:00" + return _normalize_timestamp(datetime.fromisoformat(text)) + except ValueError: + pass + try: + return _normalize_timestamp(datetime.strptime(text, "%Y-%m-%d %H:%M:%S")) + except ValueError: + LOGGER.debug("无法解析 GDELT 日期:%s", text, extra=LOG_EXTRA) + return _normalize_timestamp(datetime.utcnow()) + + +def _build_rss_item(record: Dict[str, object], config: GdeltSourceConfig) -> Optional[rss_ingest.RssItem]: + url = record.get("url") or record.get("url_mobile") + if not isinstance(url, str) or not url.strip(): + return None + url = url.strip() + + title = record.get("title") or record.get("seendate") + if not isinstance(title, str) or not title.strip(): + title = url + title = title.strip() + + published_raw = ( + record.get("seendate") + or record.get("publishDate") + or record.get("date") + or record.get("firstseendate") + ) + published = _parse_gdelt_datetime(published_raw) + + summary_candidates: Iterable[object] = ( + record.get("summary"), + record.get("snippet"), + record.get("excerpt"), + record.get("altText"), + record.get("domain"), + ) + summary = "" + for candidate in summary_candidates: + if isinstance(candidate, str) and candidate.strip(): + summary = candidate.strip() + break + if not summary: + source_country = record.get("sourcecountry") + language = record.get("language") + details = [ + str(value).strip() + for value in (source_country, language) + if isinstance(value, str) and value.strip() + ] + summary = " / ".join(details) if details else title + + source = record.get("sourcecommonname") or record.get("domain") + if not isinstance(source, str) or not source.strip(): + source = config.label or "GDELT" + source = source.strip() + + fingerprint = f"{url}|{published.isoformat()}" + article_id = hashlib.blake2s(fingerprint.encode("utf-8"), digest_size=16).hexdigest() + + return rss_ingest.RssItem( + id=article_id, + title=title, + link=url, + published=published, + summary=summary, + source=source, + metadata={ + "source_key": config.key, + "source_label": config.label, + }, + ) + + +def fetch_gdelt_articles( + config: GdeltSourceConfig, + *, + start: Optional[datetime] = None, + end: Optional[datetime] = None, +) -> List[rss_ingest.RssItem]: + """Fetch article list from GDELT based on the supplied configuration.""" + + if GdeltDoc is None or Filters is None: + LOGGER.warning("未安装 gdeltdoc,跳过 GDELT 拉取", extra=LOG_EXTRA) + return [] + + filters_kwargs = dict(config.filters) + filters_kwargs.setdefault("num_records", config.num_records) + if start: + filters_kwargs.pop("timespan", None) + filters_kwargs["start_date"] = start + if end: + filters_kwargs.pop("timespan", None) + filters_kwargs["end_date"] = end + + try: + filter_obj = Filters(**filters_kwargs) + except Exception as exc: # noqa: BLE001 - guard misconfigured filters + LOGGER.error("GDELT 过滤器解析失败 key=%s 
err=%s", config.key, exc, extra=LOG_EXTRA) + return [] + + client = GdeltDoc() + try: + df = client.article_search(filter_obj) + except Exception as exc: # noqa: BLE001 - network/service issues + LOGGER.warning("GDELT 请求失败 key=%s err=%s", config.key, exc, extra=LOG_EXTRA) + return [] + + if df is None or df.empty: + LOGGER.info("GDELT 无匹配结果 key=%s", config.key, extra=LOG_EXTRA) + return [] + + items: List[rss_ingest.RssItem] = [] + for record in df.to_dict(orient="records"): + item = _build_rss_item(record, config) + if not item: + continue + assigned_codes = rss_ingest._assign_ts_codes(item, config.ts_codes, config.keywords) # type: ignore[attr-defined] + items.append(replace(item, ts_codes=tuple(assigned_codes))) + return items + + +def ingest_configured_gdelt( + start: Optional[DateLike] = None, + end: Optional[DateLike] = None, + *, + incremental: bool = True, +) -> int: + """Ingest all configured GDELT sources into the news store.""" + + sources = resolve_gdelt_sources() + if not sources: + LOGGER.info("未配置 GDELT 来源,跳过新闻拉取", extra=LOG_EXTRA) + return 0 + + start_dt = _ensure_datetime(start) if start else None + end_dt = _ensure_datetime(end, start_of_day=False) if end else None + + aggregated: List[rss_ingest.RssItem] = [] + latest_by_source: Dict[str, datetime] = {} + fetched = 0 + for config in sources: + source_start = start_dt + if incremental: + last_seen = _load_last_published(config.key) + if last_seen: + candidate = last_seen + timedelta(seconds=1) + if source_start is None or candidate > source_start: + source_start = candidate + LOGGER.info( + "开始拉取 GDELT:%s start=%s end=%s incremental=%s", + config.label, + source_start.isoformat() if source_start else None, + end_dt.isoformat() if end_dt else None, + incremental, + extra=LOG_EXTRA, + ) + items = fetch_gdelt_articles(config, start=source_start, end=end_dt) + if not items: + continue + aggregated.extend(items) + fetched += len(items) + LOGGER.info("GDELT 来源 %s 返回 %s 条记录", config.label, len(items), extra=LOG_EXTRA) + + if not aggregated: + return 0 + + deduped = rss_ingest.deduplicate_items(aggregated) + if not deduped: + LOGGER.info("GDELT 数据全部为重复项,跳过落库", extra=LOG_EXTRA) + return 0 + + inserted = rss_ingest.save_news_items(deduped) + if inserted: + latest_by_source.clear() + for item in deduped: + source_key = str(item.metadata.get("source_key", "") if item.metadata else "") + if not source_key: + continue + current = latest_by_source.get(source_key) + candidate = item.published + if current is None or candidate > current: + latest_by_source[source_key] = candidate + for source_key, timestamp in latest_by_source.items(): + _save_last_published(source_key, timestamp) + LOGGER.info( + "GDELT 新闻落库完成 fetched=%s deduped=%s inserted=%s", + fetched, + len(deduped), + inserted, + extra=LOG_EXTRA, + ) + return inserted + + +__all__ = [ + "GdeltSourceConfig", + "resolve_gdelt_sources", + "fetch_gdelt_articles", + "ingest_configured_gdelt", +] diff --git a/app/ingest/rss.py b/app/ingest/rss.py index fdf9756..0dd43da 100644 --- a/app/ingest/rss.py +++ b/app/ingest/rss.py @@ -120,6 +120,7 @@ class RssItem: stock_mentions: List[StockMention] = field(default_factory=list) industries: List[str] = field(default_factory=list) important_keywords: List[str] = field(default_factory=list) + metadata: Dict[str, object] = field(default_factory=dict) def __post_init__(self): """Initialize company mapper if not already initialized.""" @@ -481,14 +482,20 @@ def deduplicate_items(items: Iterable[RssItem]) -> List[RssItem]: if key in seen: continue 
seen.add(key) - + preassigned_codes = list(item.ts_codes or []) # 提取实体和相关信息 item.extract_entities() # 如果找到了相关股票,则保留这条新闻 if item.stock_mentions: unique.append(item) - + continue + + # 否则如果配置了预设股票代码,则保留这些代码 + if preassigned_codes: + if not item.ts_codes: + item.ts_codes = preassigned_codes + unique.append(item) return unique @@ -514,16 +521,16 @@ def save_news_items(items: Iterable[RssItem]) -> int: industry_count=len(item.industries) ) # 构建包含更多信息的entities对象 - entities = json.dumps( - { - "ts_codes": list(base_codes), - "source_url": item.link, - "industries": item.industries, # 添加行业信息 - "important_keywords": item.important_keywords, # 添加重要关键词 - "text_length": len(text_payload), # 添加文本长度信息 - }, - ensure_ascii=False, - ) + entity_payload = { + "ts_codes": list(base_codes), + "source_url": item.link, + "industries": item.industries, # 添加行业信息 + "important_keywords": item.important_keywords, # 添加重要关键词 + "text_length": len(text_payload), # 添加文本长度信息 + } + if item.metadata: + entity_payload["metadata"] = dict(item.metadata) + entities = json.dumps(entity_payload, ensure_ascii=False) resolved_codes = base_codes or (None,) for ts_code in resolved_codes: row_id = item.id if ts_code is None else f"{item.id}::{ts_code}" diff --git a/app/ingest/tushare.py b/app/ingest/tushare.py index de4b5ae..231b0f0 100644 --- a/app/ingest/tushare.py +++ b/app/ingest/tushare.py @@ -58,6 +58,7 @@ def run_ingestion( *, include_limits: bool = True, include_extended: bool = True, + include_news: bool = True, post_tasks: Optional[Iterable[PostTask]] = None, ) -> None: """Execute a TuShare ingestion job with optional post processing hooks.""" @@ -71,6 +72,7 @@ def run_ingestion( ts_codes=job.ts_codes, include_limits=include_limits, include_extended=include_extended, + include_news=include_news, force=True, ) logger.update_metadata(job.as_dict()) @@ -103,4 +105,3 @@ __all__ = [ "ensure_data_coverage", "run_ingestion", ] - diff --git a/app/utils/config.py b/app/utils/config.py index b567a91..c0d6367 100644 --- a/app/utils/config.py +++ b/app/utils/config.py @@ -548,6 +548,23 @@ def _default_rss_sources() -> Dict[str, object]: } +def _default_gdelt_sources() -> Dict[str, object]: + return { + "global_market_watch": { + "enabled": False, + "label": "Global Market Watch", + "filters": { + "timespan": "24h", + "keyword": "\"stock market\" OR 股票", + "language": ["en", "zh"], + "num_records": 75, + }, + "ts_codes": [], + "keywords": [], + } + } + + @dataclass class AppConfig: """User configurable settings persisted in a simple structure.""" @@ -567,6 +584,7 @@ class AppConfig: departments: Dict[str, DepartmentSettings] = field(default_factory=_default_departments) portfolio: PortfolioSettings = field(default_factory=PortfolioSettings) alert_channels: Dict[str, AlertChannelSettings] = field(default_factory=dict) + gdelt_sources: Dict[str, object] = field(default_factory=_default_gdelt_sources) disabled_ingest_tables: Set[str] = field(default_factory=set) def resolve_llm(self, route: Optional[str] = None) -> LLMConfig: @@ -651,6 +669,20 @@ def _load_from_file(cfg: AppConfig) -> None: default_rss[str(key)] = value cfg.rss_sources = default_rss + gdelt_payload = payload.get("gdelt_sources") + default_gdelt = _default_gdelt_sources() + if isinstance(gdelt_payload, dict): + sanitized: Dict[str, object] = {} + for key, value in gdelt_payload.items(): + if isinstance(value, dict): + sanitized[str(key)] = value + if sanitized: + cfg.gdelt_sources = sanitized + else: + cfg.gdelt_sources = default_gdelt + else: + cfg.gdelt_sources = 
default_gdelt + weights_payload = payload.get("agent_weights") if isinstance(weights_payload, dict): cfg.agent_weights.update_from_dict(weights_payload) @@ -953,6 +985,7 @@ def save_config(cfg: AppConfig | None = None) -> None: "force_refresh": cfg.force_refresh, "auto_update_data": cfg.auto_update_data, "decision_method": cfg.decision_method, + "gdelt_sources": cfg.gdelt_sources, "disabled_ingest_tables": sorted(cfg.disabled_ingest_tables), "rss_sources": cfg.rss_sources, "agent_weights": cfg.agent_weights.as_dict(), diff --git a/docs/GDELT_README.md b/docs/GDELT_README.md new file mode 100644 index 0000000..911ce3b --- /dev/null +++ b/docs/GDELT_README.md @@ -0,0 +1,139 @@ +# GDELT 2.0 Doc API Client + +A Python client to fetch data from the [GDELT 2.0 Doc API](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/). + +This allows for simpler, small-scale analysis of news coverage without having to deal with the complexities of downloading and managing the raw files from S3, or working with the BigQuery export. + +## Installation + +`gdeltdoc` is on PyPi and is installed through pip: + +```bash +pip install gdeltdoc +``` + +## Use + +The `ArtList` and `Timeline*` query modes are supported. + +```python +from gdeltdoc import GdeltDoc, Filters + +f = Filters( + keyword = "climate change", + start_date = "2020-05-10", + end_date = "2020-05-11" +) + +gd = GdeltDoc() + +# Search for articles matching the filters +articles = gd.article_search(f) + +# Get a timeline of the number of articles matching the filters +timeline = gd.timeline_search("timelinevol", f) +``` + +## Integration in `llm_quant` + +This repository wires `gdeltdoc` into the TuShare ingestion workflow so GDELT headlines arrive alongside the usual market data. + +- Configuration lives under `gdelt_sources` in `app/data/config.json` (managed via `AppConfig.gdelt_sources`). +- `app/ingest/gdelt.py` wraps the Doc API, materialising results as `RssItem` objects so they share the same dedupe/heat scoring pipeline as RSS feeds. +- `app/ingest/coverage.ensure_data_coverage` now calls `ingest_configured_gdelt(...)` after the core TuShare tables, supporting incremental fetches via `ingest_state`. + +Enable a source by flipping `enabled: true` in the config, optionally providing `start_date`/`end_date` windows or a rolling `timespan`. Subsequent runs only request data beyond the last persisted publish time. + +### Article List + +The article list mode of the API generates a list of news articles that match the filters. The client returns this as a pandas DataFrame with columns `url`, `url_mobile`, `title`, `seendate`, `socialimage`, `domain`, `language`, `sourcecountry`. + +### Timeline Search + +There are 5 available modes when making a timeline search: + +- `timelinevol` - a timeline of the volume of news coverage matching the filters, represented as a percentage of the total news articles monitored by GDELT. +- `timelinevolraw` - similar to `timelinevol`, but has the actual number of articles and a total rather than a percentage +- `timelinelang` - similar to `timelinevol` but breaks the total articles down by published language. Each language is returned as a separate column in the DataFrame. +- `timelinesourcecountry` - similar to `timelinevol` but breaks the total articles down by the country they were published in. Each country is returned as a separate column in the DataFrame. +- `timelinetone` - a timeline of the average tone of the news coverage matching the filters. 
See [GDELT's documentation](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/) for more information about the tone metric.

### Filters

The search query passed to the API is constructed from a `gdeltdoc.Filters` object.

```python
from gdeltdoc import Filters, near, repeat

f = Filters(
    start_date = "2020-05-01",
    end_date = "2020-05-02",
    num_records = 250,
    keyword = "climate change",
    domain = ["bbc.co.uk", "nytimes.com"],
    country = ["UK", "US"],
    theme = "GENERAL_HEALTH",
    near = near(10, "airline", "carbon"),
    repeat = repeat(5, "planet")
)
```

Filters for `keyword`, `domain`, `domain_exact`, `country`, `language` and `theme` can be passed either as a single string or as a list of strings. If a list is passed, the values in the list are wrapped in a boolean OR.

You must pass either `start_date` and `end_date`, or `timespan`.

- `start_date` - The start date for the filter in YYYY-MM-DD format or as a datetime object in UTC time. Passing a datetime allows you to specify a time down to seconds granularity. The API officially only supports the most recent 3 months of articles; making a request for an earlier date range may still return data, but it's not guaranteed.
- `end_date` - The end date for the filter in YYYY-MM-DD format or as a datetime object in UTC time.
- `timespan` - A timespan to search for, relative to the time of the request. Must match one of the API's timespan formats - https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/
- `num_records` - The number of records to return. Only used in article list mode and can be up to 250.
- `keyword` - Return articles containing the exact phrase `keyword` within the article text.
- `domain` - Return articles from the specified domain. Does not require an exact match, so passing "cnn.com" will match articles from `cnn.com`, `subdomain.cnn.com` and `notactuallycnn.com`.
- `domain_exact` - Similar to `domain`, but requires an exact match.
- `country` - Return articles published in a country or list of countries, formatted as the FIPS 2 letter country code.
- `language` - Return articles published in the given language, formatted as the ISO 639 language code.
- `theme` - Return articles that cover one of GDELT's GKG Themes. A full list of themes can be found [here](http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT).
- `near` - Return articles containing words close to each other in the text. Use `near()` to construct, e.g. `near = near(5, "airline", "climate")`, or `multi_near()` to combine multiple restrictions, e.g. `multi_near([(5, "airline", "crisis"), (10, "airline", "climate", "change")], method="AND")` finds "airline" and "crisis" within 5 words, and "airline", "climate", and "change" within 10 words.
- `repeat` - Return articles containing a single word repeated at least a number of times. Use `repeat()` to construct, e.g. `repeat = repeat(3, "environment")`, or `multi_repeat()` to combine multiple restrictions, e.g. `repeat = multi_repeat([(2, "airline"), (3, "airport")], "AND")`.
- `tone` - Return articles above or below a particular tone score (i.e. more positive or more negative than a certain threshold). To use, specify either a greater-than or less-than sign and a positive or negative number (integer or float). To find fairly positive articles, use `tone=">5"`; to search for fairly negative articles, use `tone="<-5"`.
- `tone_absolute` - The same as `tone` but ignores the positive/negative sign, letting you search for high-emotion or low-emotion articles regardless of whether they were happy or sad in tone.
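### Calling the integration directly

As a minimal sketch of how the `llm_quant` integration described above drives these filters, the snippet below calls `ingest_configured_gdelt` and `resolve_gdelt_sources` from `app/ingest/gdelt.py` by hand. It assumes at least one entry under `gdelt_sources` in `app/data/config.json` has `enabled: true`; the two-day date window and the print statements are illustrative only, not part of the package.

```python
from datetime import date, timedelta

from app.ingest.gdelt import ingest_configured_gdelt, resolve_gdelt_sources

# Inspect the filter groups currently enabled in the app config.
for source in resolve_gdelt_sources():
    print(source.key, source.label, source.filters)

# Pull roughly the last two days of headlines for every enabled source.
# With incremental=True, only articles newer than the last persisted
# publish time (tracked in the ingest_state table) are requested.
inserted = ingest_configured_gdelt(
    start=date.today() - timedelta(days=2),
    end=date.today(),
    incremental=True,
)
print(f"inserted {inserted} news rows")
```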
## Developing gdelt-doc-api

PRs & issues are very welcome!

### Setup

It's recommended to use a virtual environment for development. Set one up with:

```bash
python -m venv venv
```

and activate it (on Mac or Linux):

```bash
source venv/bin/activate
```

Then install the requirements:

```bash
pip install -r requirements.txt
```

Tests for this package use `unittest`. Run them with:

```bash
python -m unittest
```

If your PR adds a new feature or helper, please also add some tests.

### Publishing

There's a bit of automation set up to help publish a new version of the package to PyPI:

1. Make sure the version string has been updated since the last release. This package follows semantic versioning.
2. Create a new release in the GitHub UI, using the new version as the release name.
3. Watch as the `publish.yml` GitHub Action builds the package and pushes it to PyPI.
diff --git a/requirements.txt b/requirements.txt
index e486c6a..0b1f8fc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ feedparser>=6.0
 arch>=6.1.0
 scipy>=1.11.0
 torch>=2.3.0
+gdeltdoc>=0.1.7