add GDELT news ingestion with metadata and ingest state tracking
parent d6292e2b2f
commit 2147dc3244
@@ -374,6 +374,12 @@ SCHEMA_STATEMENTS: Iterable[str] = (
    CREATE INDEX IF NOT EXISTS idx_news_code ON news(ts_code, pub_time DESC);
    """,
    """
    CREATE TABLE IF NOT EXISTS ingest_state (
        source TEXT PRIMARY KEY,
        last_published TEXT
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS heat_daily (
        scope TEXT,
        key TEXT,
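For orientation, a minimal sketch of how the new `ingest_state` table supports incremental fetches: one row per source, upserted with the newest publish time seen (the same `ON CONFLICT` upsert appears in `app/ingest/gdelt.py` below). The database path here is a placeholder; the project resolves the real connection through `app.utils.db`.

```python
import sqlite3

# Placeholder path; the project opens its database via app.utils.db, not directly.
conn = sqlite3.connect("data/app.db")

# Record the newest publish time seen for a source (mirrors _save_last_published).
conn.execute(
    """
    INSERT INTO ingest_state (source, last_published)
    VALUES (?, ?)
    ON CONFLICT(source) DO UPDATE SET last_published = excluded.last_published
    """,
    ("global_market_watch", "2024-05-10T08:30:00"),
)
conn.commit()

# Read it back to decide where the next incremental window should start.
row = conn.execute(
    "SELECT last_published FROM ingest_state WHERE source = ?",
    ("global_market_watch",),
).fetchone()
print(row[0])  # -> 2024-05-10T08:30:00
```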
@@ -44,6 +44,7 @@ from .api_client import (
    fetch_us_daily,
    save_records,
)
from .gdelt import ingest_configured_gdelt

LOGGER = get_logger(__name__)

@@ -172,6 +173,7 @@ def ensure_data_coverage(
    ts_codes: Optional[Sequence[str]] = None,
    include_limits: bool = True,
    include_extended: bool = True,
    include_news: bool = True,
    force: bool = False,
    progress_hook: Callable[[str, float], None] | None = None,
) -> None:
@@ -193,6 +195,9 @@ def ensure_data_coverage(
        extra_steps += 1
    if include_extended:
        extra_steps += 4
    news_enabled = include_news and not _is_disabled("news")
    if news_enabled:
        extra_steps += 1
    total_steps = 5 + extra_steps
    current_step = 0

@@ -358,6 +363,17 @@ def ensure_data_coverage(
        _save_with_codes("hk_daily", fetch_hk_daily, targets=HK_CODES)
        _save_with_codes("us_daily", fetch_us_daily, targets=US_CODES)

    if news_enabled:
        advance("拉取 GDELT 新闻数据")
        try:
            ingest_configured_gdelt(
                start=start,
                end=end,
                incremental=not force,
            )
        except Exception as exc:  # noqa: BLE001
            LOGGER.warning("GDELT 新闻拉取失败:%s", exc, extra=LOG_EXTRA)

    if progress_hook:
        progress_hook("数据覆盖检查完成", 1.0)
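For context, a hedged sketch of how this entry point is typically invoked with the new flag. Only keyword arguments visible in this hunk are shown; any required positional arguments (for example a date window) are not visible here and are omitted.

```python
from app.ingest.coverage import ensure_data_coverage

# Sketch only: arguments not shown in this hunk (e.g. the date range) are omitted.
ensure_data_coverage(
    include_news=True,     # run the GDELT step unless the "news" table is disabled
    include_extended=True,
    include_limits=True,
    force=False,           # force=True makes the GDELT fetch non-incremental
    progress_hook=lambda label, fraction: print(f"{fraction:>4.0%} {label}"),
)
```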
app/ingest/gdelt.py (new file, 344 lines)
@@ -0,0 +1,344 @@
"""GDELT Doc API ingestion utilities built on top of gdeltdoc."""
from __future__ import annotations

import hashlib
import sqlite3
from dataclasses import dataclass, field, replace
from datetime import date, datetime, timedelta, timezone
from typing import Dict, Iterable, List, Optional, Sequence, Union

try:  # pragma: no cover - optional dependency
    from gdeltdoc import GdeltDoc, Filters  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover - optional dependency
    GdeltDoc = None  # type: ignore[assignment]
    Filters = None  # type: ignore[assignment]

from app.utils.config import get_config
from app.utils.db import db_session
from app.utils.logging import get_logger

from . import rss as rss_ingest

LOGGER = get_logger(__name__)
LOG_EXTRA = {"stage": "gdelt_ingest"}
DateLike = Union[date, datetime]


@dataclass
class GdeltSourceConfig:
    """Configuration describing a single GDELT filter set."""

    key: str
    label: str
    filters: Dict[str, object] = field(default_factory=dict)
    ts_codes: Sequence[str] = field(default_factory=tuple)
    keywords: Sequence[str] = field(default_factory=tuple)
    num_records: int = 50


def resolve_gdelt_sources() -> List[GdeltSourceConfig]:
    """Resolve configured GDELT filter groups."""

    cfg = get_config()
    raw = getattr(cfg, "gdelt_sources", None) or {}

    sources: List[GdeltSourceConfig] = []
    if isinstance(raw, dict):
        for key, data in raw.items():
            if not isinstance(data, dict):
                continue
            if not data.get("enabled", True):
                continue
            label = str(data.get("label") or key)
            filters = data.get("filters") if isinstance(data.get("filters"), dict) else {}
            ts_codes = [
                str(code).strip().upper()
                for code in data.get("ts_codes", [])
                if isinstance(code, str) and code.strip()
            ]
            keywords = [
                str(token).strip()
                for token in data.get("keywords", [])
                if isinstance(token, str) and token.strip()
            ]
            num_records = data.get("num_records")
            if not isinstance(num_records, int) or num_records <= 0:
                num_records = 50
            sources.append(
                GdeltSourceConfig(
                    key=str(key),
                    label=label,
                    filters=dict(filters),
                    ts_codes=tuple(ts_codes),
                    keywords=tuple(keywords),
                    num_records=num_records,
                )
            )
    return sources


def _ensure_datetime(value: DateLike, *, start_of_day: bool = True) -> datetime:
    if isinstance(value, datetime):
        return _normalize_timestamp(value)
    if start_of_day:
        return datetime.combine(value, datetime.min.time())
    return datetime.combine(value, datetime.max.time())


def _normalize_timestamp(value: datetime) -> datetime:
    if value.tzinfo is not None:
        return value.astimezone(timezone.utc).replace(tzinfo=None)
    return value


def _load_last_published(source_key: str) -> Optional[datetime]:
    try:
        with db_session(read_only=True) as conn:
            row = conn.execute(
                "SELECT last_published FROM ingest_state WHERE source = ?",
                (source_key,),
            ).fetchone()
    except sqlite3.OperationalError:
        return None
    if not row:
        return None
    raw = row["last_published"]
    if not raw:
        return None
    try:
        return _normalize_timestamp(datetime.fromisoformat(raw))
    except ValueError:
        LOGGER.debug("无法解析 GDELT 状态时间 source=%s value=%s", source_key, raw, extra=LOG_EXTRA)
        return None


def _save_last_published(source_key: str, published: datetime) -> None:
    timestamp = _normalize_timestamp(published).isoformat()
    try:
        with db_session() as conn:
            conn.execute(
                """
                INSERT INTO ingest_state (source, last_published)
                VALUES (?, ?)
                ON CONFLICT(source) DO UPDATE SET last_published = excluded.last_published
                """,
                (source_key, timestamp),
            )
    except sqlite3.OperationalError:
        LOGGER.debug("写入 ingest_state 失败,表可能不存在", extra=LOG_EXTRA)


def _parse_gdelt_datetime(raw: object) -> datetime:
    if isinstance(raw, datetime):
        return _normalize_timestamp(raw)
    if raw is None:
        return _normalize_timestamp(datetime.utcnow())
    text = str(raw).strip()
    if not text:
        return _normalize_timestamp(datetime.utcnow())
    # Common GDELT formats: YYYYMMDDHHMMSS or ISO8601
    try:
        if text.isdigit() and len(text) == 14:
            return _normalize_timestamp(datetime.strptime(text, "%Y%m%d%H%M%S"))
        if text.endswith("Z"):
            text = text[:-1] + "+00:00"
        return _normalize_timestamp(datetime.fromisoformat(text))
    except ValueError:
        pass
    try:
        return _normalize_timestamp(datetime.strptime(text, "%Y-%m-%d %H:%M:%S"))
    except ValueError:
        LOGGER.debug("无法解析 GDELT 日期:%s", text, extra=LOG_EXTRA)
        return _normalize_timestamp(datetime.utcnow())


def _build_rss_item(record: Dict[str, object], config: GdeltSourceConfig) -> Optional[rss_ingest.RssItem]:
    url = record.get("url") or record.get("url_mobile")
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()

    title = record.get("title") or record.get("seendate")
    if not isinstance(title, str) or not title.strip():
        title = url
    title = title.strip()

    published_raw = (
        record.get("seendate")
        or record.get("publishDate")
        or record.get("date")
        or record.get("firstseendate")
    )
    published = _parse_gdelt_datetime(published_raw)

    summary_candidates: Iterable[object] = (
        record.get("summary"),
        record.get("snippet"),
        record.get("excerpt"),
        record.get("altText"),
        record.get("domain"),
    )
    summary = ""
    for candidate in summary_candidates:
        if isinstance(candidate, str) and candidate.strip():
            summary = candidate.strip()
            break
    if not summary:
        source_country = record.get("sourcecountry")
        language = record.get("language")
        details = [
            str(value).strip()
            for value in (source_country, language)
            if isinstance(value, str) and value.strip()
        ]
        summary = " / ".join(details) if details else title

    source = record.get("sourcecommonname") or record.get("domain")
    if not isinstance(source, str) or not source.strip():
        source = config.label or "GDELT"
    source = source.strip()

    fingerprint = f"{url}|{published.isoformat()}"
    article_id = hashlib.blake2s(fingerprint.encode("utf-8"), digest_size=16).hexdigest()

    return rss_ingest.RssItem(
        id=article_id,
        title=title,
        link=url,
        published=published,
        summary=summary,
        source=source,
        metadata={
            "source_key": config.key,
            "source_label": config.label,
        },
    )


def fetch_gdelt_articles(
    config: GdeltSourceConfig,
    *,
    start: Optional[datetime] = None,
    end: Optional[datetime] = None,
) -> List[rss_ingest.RssItem]:
    """Fetch article list from GDELT based on the supplied configuration."""

    if GdeltDoc is None or Filters is None:
        LOGGER.warning("未安装 gdeltdoc,跳过 GDELT 拉取", extra=LOG_EXTRA)
        return []

    filters_kwargs = dict(config.filters)
    filters_kwargs.setdefault("num_records", config.num_records)
    if start:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["start_date"] = start
    if end:
        filters_kwargs.pop("timespan", None)
        filters_kwargs["end_date"] = end

    try:
        filter_obj = Filters(**filters_kwargs)
    except Exception as exc:  # noqa: BLE001 - guard misconfigured filters
        LOGGER.error("GDELT 过滤器解析失败 key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []

    client = GdeltDoc()
    try:
        df = client.article_search(filter_obj)
    except Exception as exc:  # noqa: BLE001 - network/service issues
        LOGGER.warning("GDELT 请求失败 key=%s err=%s", config.key, exc, extra=LOG_EXTRA)
        return []

    if df is None or df.empty:
        LOGGER.info("GDELT 无匹配结果 key=%s", config.key, extra=LOG_EXTRA)
        return []

    items: List[rss_ingest.RssItem] = []
    for record in df.to_dict(orient="records"):
        item = _build_rss_item(record, config)
        if not item:
            continue
        assigned_codes = rss_ingest._assign_ts_codes(item, config.ts_codes, config.keywords)  # type: ignore[attr-defined]
        items.append(replace(item, ts_codes=tuple(assigned_codes)))
    return items


def ingest_configured_gdelt(
    start: Optional[DateLike] = None,
    end: Optional[DateLike] = None,
    *,
    incremental: bool = True,
) -> int:
    """Ingest all configured GDELT sources into the news store."""

    sources = resolve_gdelt_sources()
    if not sources:
        LOGGER.info("未配置 GDELT 来源,跳过新闻拉取", extra=LOG_EXTRA)
        return 0

    start_dt = _ensure_datetime(start) if start else None
    end_dt = _ensure_datetime(end, start_of_day=False) if end else None

    aggregated: List[rss_ingest.RssItem] = []
    latest_by_source: Dict[str, datetime] = {}
    fetched = 0
    for config in sources:
        source_start = start_dt
        if incremental:
            last_seen = _load_last_published(config.key)
            if last_seen:
                candidate = last_seen + timedelta(seconds=1)
                if source_start is None or candidate > source_start:
                    source_start = candidate
        LOGGER.info(
            "开始拉取 GDELT:%s start=%s end=%s incremental=%s",
            config.label,
            source_start.isoformat() if source_start else None,
            end_dt.isoformat() if end_dt else None,
            incremental,
            extra=LOG_EXTRA,
        )
        items = fetch_gdelt_articles(config, start=source_start, end=end_dt)
        if not items:
            continue
        aggregated.extend(items)
        fetched += len(items)
        LOGGER.info("GDELT 来源 %s 返回 %s 条记录", config.label, len(items), extra=LOG_EXTRA)

    if not aggregated:
        return 0

    deduped = rss_ingest.deduplicate_items(aggregated)
    if not deduped:
        LOGGER.info("GDELT 数据全部为重复项,跳过落库", extra=LOG_EXTRA)
        return 0

    inserted = rss_ingest.save_news_items(deduped)
    if inserted:
        latest_by_source.clear()
        for item in deduped:
            source_key = str(item.metadata.get("source_key", "") if item.metadata else "")
            if not source_key:
                continue
            current = latest_by_source.get(source_key)
            candidate = item.published
            if current is None or candidate > current:
                latest_by_source[source_key] = candidate
        for source_key, timestamp in latest_by_source.items():
            _save_last_published(source_key, timestamp)
    LOGGER.info(
        "GDELT 新闻落库完成 fetched=%s deduped=%s inserted=%s",
        fetched,
        len(deduped),
        inserted,
        extra=LOG_EXTRA,
    )
    return inserted


__all__ = [
    "GdeltSourceConfig",
    "resolve_gdelt_sources",
    "fetch_gdelt_articles",
    "ingest_configured_gdelt",
]
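A small usage sketch of the module above, run outside the coverage workflow. The source key and filters below are hypothetical; real sources come from the `gdelt_sources` section of the app config.

```python
from datetime import date

from app.ingest.gdelt import (
    GdeltSourceConfig,
    fetch_gdelt_articles,
    ingest_configured_gdelt,
)

# One-off fetch with an ad-hoc config (bypasses the persisted configuration).
config = GdeltSourceConfig(
    key="demo",                        # hypothetical key
    label="Demo Watchlist",
    filters={"timespan": "24h", "keyword": "semiconductors"},
    num_records=25,
)
items = fetch_gdelt_articles(config)
print(len(items), "articles fetched")

# Normal path: ingest every enabled source from config, incrementally.
inserted = ingest_configured_gdelt(start=date(2024, 5, 1), end=date(2024, 5, 7), incremental=True)
print("rows inserted:", inserted)
```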
@@ -120,6 +120,7 @@ class RssItem:
    stock_mentions: List[StockMention] = field(default_factory=list)
    industries: List[str] = field(default_factory=list)
    important_keywords: List[str] = field(default_factory=list)
    metadata: Dict[str, object] = field(default_factory=dict)

    def __post_init__(self):
        """Initialize company mapper if not already initialized."""
@@ -481,14 +482,20 @@ def deduplicate_items(items: Iterable[RssItem]) -> List[RssItem]:
        if key in seen:
            continue
        seen.add(key)

        preassigned_codes = list(item.ts_codes or [])
        # 提取实体和相关信息
        item.extract_entities()

        # 如果找到了相关股票,则保留这条新闻
        if item.stock_mentions:
            unique.append(item)
            continue

        # 否则如果配置了预设股票代码,则保留这些代码
        if preassigned_codes:
            if not item.ts_codes:
                item.ts_codes = preassigned_codes
            unique.append(item)
    return unique


@@ -514,16 +521,16 @@ def save_news_items(items: Iterable[RssItem]) -> int:
            industry_count=len(item.industries)
        )
        # 构建包含更多信息的entities对象
        entities = json.dumps(
            {
                "ts_codes": list(base_codes),
                "source_url": item.link,
                "industries": item.industries,  # 添加行业信息
                "important_keywords": item.important_keywords,  # 添加重要关键词
                "text_length": len(text_payload),  # 添加文本长度信息
            },
            ensure_ascii=False,
        )
        entity_payload = {
            "ts_codes": list(base_codes),
            "source_url": item.link,
            "industries": item.industries,  # 添加行业信息
            "important_keywords": item.important_keywords,  # 添加重要关键词
            "text_length": len(text_payload),  # 添加文本长度信息
        }
        if item.metadata:
            entity_payload["metadata"] = dict(item.metadata)
        entities = json.dumps(entity_payload, ensure_ascii=False)
        resolved_codes = base_codes or (None,)
        for ts_code in resolved_codes:
            row_id = item.id if ts_code is None else f"{item.id}::{ts_code}"
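The hunk above replaces the inline `json.dumps` call with an `entity_payload` dict so that `item.metadata` can be folded into the persisted `entities` JSON. As a sketch, a GDELT-sourced row would now carry a payload along these lines (all values are illustrative):

```python
import json

# Illustrative payload; real values come from the RssItem being saved.
entity_payload = {
    "ts_codes": ["000001.SZ"],
    "source_url": "https://example.com/article",
    "industries": ["banking"],
    "important_keywords": ["earnings"],
    "text_length": 1284,
    # Present only when the item has metadata, e.g. items built by app/ingest/gdelt.py:
    "metadata": {"source_key": "global_market_watch", "source_label": "Global Market Watch"},
}
entities = json.dumps(entity_payload, ensure_ascii=False)
print(entities)
```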
@@ -58,6 +58,7 @@ def run_ingestion(
    *,
    include_limits: bool = True,
    include_extended: bool = True,
    include_news: bool = True,
    post_tasks: Optional[Iterable[PostTask]] = None,
) -> None:
    """Execute a TuShare ingestion job with optional post processing hooks."""
@@ -71,6 +72,7 @@ def run_ingestion(
        ts_codes=job.ts_codes,
        include_limits=include_limits,
        include_extended=include_extended,
        include_news=include_news,
        force=True,
    )
    logger.update_metadata(job.as_dict())
@@ -103,4 +105,3 @@ __all__ = [
    "ensure_data_coverage",
    "run_ingestion",
]
@@ -548,6 +548,23 @@ def _default_rss_sources() -> Dict[str, object]:
    }


def _default_gdelt_sources() -> Dict[str, object]:
    return {
        "global_market_watch": {
            "enabled": False,
            "label": "Global Market Watch",
            "filters": {
                "timespan": "24h",
                "keyword": "\"stock market\" OR 股票",
                "language": ["en", "zh"],
                "num_records": 75,
            },
            "ts_codes": [],
            "keywords": [],
        }
    }


@dataclass
class AppConfig:
    """User configurable settings persisted in a simple structure."""
@@ -567,6 +584,7 @@ class AppConfig:
    departments: Dict[str, DepartmentSettings] = field(default_factory=_default_departments)
    portfolio: PortfolioSettings = field(default_factory=PortfolioSettings)
    alert_channels: Dict[str, AlertChannelSettings] = field(default_factory=dict)
    gdelt_sources: Dict[str, object] = field(default_factory=_default_gdelt_sources)
    disabled_ingest_tables: Set[str] = field(default_factory=set)

    def resolve_llm(self, route: Optional[str] = None) -> LLMConfig:
@@ -651,6 +669,20 @@ def _load_from_file(cfg: AppConfig) -> None:
                default_rss[str(key)] = value
        cfg.rss_sources = default_rss

    gdelt_payload = payload.get("gdelt_sources")
    default_gdelt = _default_gdelt_sources()
    if isinstance(gdelt_payload, dict):
        sanitized: Dict[str, object] = {}
        for key, value in gdelt_payload.items():
            if isinstance(value, dict):
                sanitized[str(key)] = value
        if sanitized:
            cfg.gdelt_sources = sanitized
        else:
            cfg.gdelt_sources = default_gdelt
    else:
        cfg.gdelt_sources = default_gdelt

    weights_payload = payload.get("agent_weights")
    if isinstance(weights_payload, dict):
        cfg.agent_weights.update_from_dict(weights_payload)
@@ -953,6 +985,7 @@ def save_config(cfg: AppConfig | None = None) -> None:
        "force_refresh": cfg.force_refresh,
        "auto_update_data": cfg.auto_update_data,
        "decision_method": cfg.decision_method,
        "gdelt_sources": cfg.gdelt_sources,
        "disabled_ingest_tables": sorted(cfg.disabled_ingest_tables),
        "rss_sources": cfg.rss_sources,
        "agent_weights": cfg.agent_weights.as_dict(),
docs/GDELT_README.md (new file, 139 lines)
@@ -0,0 +1,139 @@
# GDELT 2.0 Doc API Client

A Python client to fetch data from the [GDELT 2.0 Doc API](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/).

This allows for simpler, small-scale analysis of news coverage without having to deal with the complexities of downloading and managing the raw files from S3, or working with the BigQuery export.

## Installation
`gdeltdoc` is on PyPI and is installed through pip:

```bash
pip install gdeltdoc
```

## Use

The `ArtList` and `Timeline*` query modes are supported.

```python
from gdeltdoc import GdeltDoc, Filters

f = Filters(
    keyword = "climate change",
    start_date = "2020-05-10",
    end_date = "2020-05-11"
)

gd = GdeltDoc()

# Search for articles matching the filters
articles = gd.article_search(f)

# Get a timeline of the number of articles matching the filters
timeline = gd.timeline_search("timelinevol", f)
```
## Integration in `llm_quant`

This repository wires `gdeltdoc` into the TuShare ingestion workflow so GDELT headlines arrive alongside the usual market data.

- Configuration lives under `gdelt_sources` in `app/data/config.json` (managed via `AppConfig.gdelt_sources`).
- `app/ingest/gdelt.py` wraps the Doc API, materialising results as `RssItem` objects so they share the same dedupe/heat scoring pipeline as RSS feeds.
- `app/ingest/coverage.ensure_data_coverage` now calls `ingest_configured_gdelt(...)` after the core TuShare tables, supporting incremental fetches via `ingest_state`.

Enable a source by flipping `enabled: true` in the config, optionally providing `start_date`/`end_date` windows or a rolling `timespan`. Subsequent runs only request data beyond the last persisted publish time.
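As a sketch, an enabled entry in `app/data/config.json` could look like the following (shown here as the equivalent Python dict; the source key, codes, and keywords are placeholders, and the field names mirror `_default_gdelt_sources()`):

```python
gdelt_sources = {
    "cn_semiconductors": {             # placeholder source key
        "enabled": True,
        "label": "CN Semiconductors",
        "filters": {
            "timespan": "48h",         # or use "start_date"/"end_date" instead
            "keyword": "semiconductor OR 芯片",
            "language": ["en", "zh"],
            "num_records": 100,
        },
        "ts_codes": ["688981.SH"],     # pre-assigned codes for matched articles
        "keywords": ["SMIC"],          # extra tokens used when assigning codes
    }
}
```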
### Article List

The article list mode of the API generates a list of news articles that match the filters. The client returns this as a pandas DataFrame with columns `url`, `url_mobile`, `title`, `seendate`, `socialimage`, `domain`, `language`, `sourcecountry`.
### Timeline Search

There are 5 available modes when making a timeline search (a short usage sketch follows the list):

- `timelinevol` - a timeline of the volume of news coverage matching the filters, represented as a percentage of the total news articles monitored by GDELT.
- `timelinevolraw` - similar to `timelinevol`, but has the actual number of articles and a total rather than a percentage.
- `timelinelang` - similar to `timelinevol` but breaks the total articles down by published language. Each language is returned as a separate column in the DataFrame.
- `timelinesourcecountry` - similar to `timelinevol` but breaks the total articles down by the country they were published in. Each country is returned as a separate column in the DataFrame.
- `timelinetone` - a timeline of the average tone of the news coverage matching the filters. See [GDELT's documentation](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/) for more information about the tone metric.
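Reusing `gd` and `f` from the example in the Use section, each mode is simply the first argument to `timeline_search`, and each call returns a pandas DataFrame:

```python
# Reusing `gd` and `f` from the "Use" example above.
volume = gd.timeline_search("timelinevol", f)      # % of monitored coverage
raw = gd.timeline_search("timelinevolraw", f)      # absolute article counts
tone = gd.timeline_search("timelinetone", f)       # average tone over time

print(volume.head())
```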
### Filters

The search query passed to the API is constructed from a `gdeltdoc.Filters` object.

```python
from gdeltdoc import Filters, near, repeat

f = Filters(
    start_date = "2020-05-01",
    end_date = "2020-05-02",
    num_records = 250,
    keyword = "climate change",
    domain = ["bbc.co.uk", "nytimes.com"],
    country = ["UK", "US"],
    theme = "GENERAL_HEALTH",
    near = near(10, "airline", "carbon"),
    repeat = repeat(5, "planet")
)
```

Filters for `keyword`, `domain`, `domain_exact`, `country`, `language` and `theme` can be passed either as a single string or as a list of strings. If a list is passed, the values in the list are wrapped in a boolean OR.

You must pass either `start_date` and `end_date`, or `timespan` (a combined example follows the list).

- `start_date` - The start date for the filter in YYYY-MM-DD format or as a datetime object in UTC time.
  Passing a datetime allows you to specify a time down to seconds granularity. The API officially only supports the most recent 3 months of articles. Making a request for an earlier date range may still return data, but it's not guaranteed.
- `end_date` - The end date for the filter in YYYY-MM-DD format or as a datetime object in UTC time.
- `timespan` - A timespan to search for, relative to the time of the request. Must match one of the API's timespan formats - https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/
- `num_records` - The number of records to return. Only used in article list mode and can be up to 250.
- `keyword` - Return articles containing the exact phrase `keyword` within the article text.
- `domain` - Return articles from the specified domain. Does not require an exact match so passing "cnn.com" will match articles from `cnn.com`, `subdomain.cnn.com` and `notactuallycnn.com`.
- `domain_exact` - Similar to `domain`, but requires an exact match.
- `country` - Return articles published in a country or list of countries, formatted as the FIPS 2 letter country code.
- `language` - Return articles published in the given language, formatted as the ISO 639 language code.
- `theme` - Return articles that cover one of GDELT's GKG Themes. A full list of themes can be found [here](http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT).
- `near` - Return articles containing words close to each other in the text. Use `near()` to construct, e.g. `near = near(5, "airline", "climate")`, or `multi_near()` if you want to use multiple restrictions, e.g. `multi_near([(5, "airline", "crisis"), (10, "airline", "climate", "change")], method="AND")` finds "airline" and "crisis" within 5 words, and "airline", "climate", and "change" within 10 words.
- `repeat` - Return articles containing a single word repeated at least a number of times. Use `repeat()` to construct, e.g. `repeat = repeat(3, "environment")`, or `multi_repeat()` if you want to use multiple restrictions, e.g. `repeat = multi_repeat([(2, "airline"), (3, "airport")], "AND")`.
- `tone` - Return articles above or below a particular tone score (i.e. more positive or more negative than a certain threshold). To use, specify either a greater than or less than sign and a positive or negative number (either an integer or floating point number). To find fairly positive articles, use `tone=">5"`, or to search for fairly negative articles, use `tone="<-5"`.
- `tone_absolute` - The same as `tone`, but ignores the positive/negative sign and lets you search for high-emotion or low-emotion articles, regardless of whether they were happy or sad in tone.
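Putting several of these together, a sketch of a rolling-window query combining `timespan`, `tone`, and proximity/repeat restrictions (values are illustrative, and it is assumed `multi_repeat` is importable from the package root like `near` and `repeat`):

```python
from gdeltdoc import GdeltDoc, Filters, multi_repeat, near

f = Filters(
    timespan = "7d",                    # rolling window instead of start/end dates
    keyword = "interest rates",
    language = ["en"],
    tone = "<-5",                       # fairly negative coverage only
    near = near(10, "bank", "regulator"),
    repeat = multi_repeat([(2, "inflation"), (2, "recession")], "AND"),
    num_records = 100,
)

articles = GdeltDoc().article_search(f)
print(articles[["title", "domain", "seendate"]].head())
```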
## Developing gdelt-doc-api

PRs & issues are very welcome!

### Setup

It's recommended to use a virtual environment for development. Set one up with:

```bash
python -m venv venv
```

and activate it (on Mac or Linux):

```bash
source venv/bin/activate
```

Then install the requirements:

```bash
pip install -r requirements.txt
```

Tests for this package use `unittest`. Run them with:

```bash
python -m unittest
```

If your PR adds a new feature or helper, please also add some tests.

### Publishing

There's a bit of automation set up to help publish a new version of the package to PyPI:

1. Make sure the version string has been updated since the last release. This package follows semantic versioning.
2. Create a new release in the GitHub UI, using the new version as the release name.
3. Watch as the `publish.yml` GitHub action builds the package and pushes it to PyPI.
@@ -10,3 +10,4 @@ feedparser>=6.0
arch>=6.1.0
scipy>=1.11.0
torch>=2.3.0
gdeltdoc>=0.1.7