llm-quant/app/ingest/rss.py
"""RSS ingestion utilities for news sentiment and heat scoring."""
from __future__ import annotations
import json
import re
import sqlite3
from dataclasses import dataclass, replace
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from urllib.parse import urlparse, urljoin
from xml.etree import ElementTree as ET
import requests
from requests import RequestException
import hashlib
import random
import time
try:  # pragma: no cover - optional dependency at runtime
    import feedparser  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover - graceful fallback
    feedparser = None  # type: ignore[assignment]
from app.data.schema import initialize_database
from app.utils import alerts
from app.utils.config import get_config
from app.utils.db import db_session
from app.utils.logging import get_logger
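# Typical usage (sketch; the feed URL below is illustrative only):
#     items = fetch_rss_feed("https://example.com/feed.xml", hours_back=24)
#     save_news_items(deduplicate_items(items))
# or simply call ingest_configured_rss() to pull every configured source.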
LOGGER = get_logger(__name__)
LOG_EXTRA = {"stage": "rss_ingest"}
DEFAULT_TIMEOUT = 10.0
MAX_SUMMARY_LENGTH = 1500
POSITIVE_KEYWORDS: Tuple[str, ...] = (
    "利好",
    "增长",
    "超预期",
    "创新高",
    "增持",
    "回购",
    "盈利",
    "strong",
    "beat",
    "upgrade",
)
NEGATIVE_KEYWORDS: Tuple[str, ...] = (
    "利空",
    "下跌",
    "亏损",
    "裁员",
    "违约",
    "处罚",
    "暴跌",
    "减持",
    "downgrade",
    "miss",
)
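# A_SH_CODE_PATTERN matches six-digit A-share codes with an optional .SH/.SZ suffix
# (e.g. "600519" or "000001.SZ"); HK_CODE_PATTERN matches four-digit codes like "0700.HK".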
A_SH_CODE_PATTERN = re.compile(r"\b(\d{6})(?:\.(SH|SZ))?\b", re.IGNORECASE)
HK_CODE_PATTERN = re.compile(r"\b(\d{4})\.HK\b", re.IGNORECASE)
@dataclass
class RssFeedConfig:
    """Configuration describing a single RSS source."""

    url: str
    source: str
    ts_codes: Tuple[str, ...] = ()
    keywords: Tuple[str, ...] = ()
    hours_back: int = 48
    max_items: int = 50


@dataclass
class RssItem:
    """Structured representation of an RSS entry."""

    id: str
    title: str
    link: str
    published: datetime
    summary: str
    source: str
    ts_codes: Tuple[str, ...] = ()

DEFAULT_RSS_SOURCES: Tuple[RssFeedConfig, ...] = ()
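# DEFAULT_RSS_SOURCES is intentionally empty: feeds come from persisted settings via
# resolve_rss_sources(). A hypothetical entry would look like
# RssFeedConfig(url="https://example.com/feed.xml", source="example", ts_codes=("600519.SH",)).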
def fetch_rss_feed(
    url: str,
    *,
    source: Optional[str] = None,
    hours_back: int = 48,
    max_items: int = 50,
    timeout: float = DEFAULT_TIMEOUT,
    max_retries: int = 5,
    retry_backoff: float = 1.5,
    retry_jitter: float = 0.3,
) -> List[RssItem]:
    """Download and parse an RSS feed into structured items."""
    return _fetch_feed_items(
        url,
        source=source,
        hours_back=hours_back,
        max_items=max_items,
        timeout=timeout,
        max_retries=max_retries,
        retry_backoff=retry_backoff,
        retry_jitter=retry_jitter,
        allow_html_redirect=True,
    )

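# When allow_html_redirect is set and the response body is an HTML page rather than XML,
# _fetch_feed_items extracts the advertised feed URLs and fetches each of them recursively
# (with allow_html_redirect=False so expansion happens only one level deep).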
def _fetch_feed_items(
    url: str,
    *,
    source: Optional[str],
    hours_back: int,
    max_items: int,
    timeout: float,
    max_retries: int,
    retry_backoff: float,
    retry_jitter: float,
    allow_html_redirect: bool,
) -> List[RssItem]:
    content = _download_feed(
        url,
        timeout,
        max_retries=max_retries,
        retry_backoff=retry_backoff,
        retry_jitter=retry_jitter,
    )
    if content is None:
        return []
    if allow_html_redirect:
        feed_links = _extract_html_feed_links(content, url)
        if feed_links:
            LOGGER.info(
                "RSS 页面包含子订阅 %s 个,自动展开",
                len(feed_links),
                extra=LOG_EXTRA,
            )
            aggregated: List[RssItem] = []
            for feed_url in feed_links:
                sub_items = _fetch_feed_items(
                    feed_url,
                    source=source,
                    hours_back=hours_back,
                    max_items=max_items,
                    timeout=timeout,
                    max_retries=max_retries,
                    retry_backoff=retry_backoff,
                    retry_jitter=retry_jitter,
                    allow_html_redirect=False,
                )
                aggregated.extend(sub_items)
                if max_items > 0 and len(aggregated) >= max_items:
                    return aggregated[:max_items]
            if aggregated:
                alerts.clear_warnings(_rss_source_key(url))
            else:
                alerts.add_warning(
                    _rss_source_key(url),
                    "聚合页未返回内容",
                )
            return aggregated
    parsed_entries = _parse_feed_content(content)
    total_entries = len(parsed_entries)
    LOGGER.info(
        "RSS 源获取完成 url=%s raw_entries=%s",
        url,
        total_entries,
        extra=LOG_EXTRA,
    )
    if not parsed_entries:
        LOGGER.warning(
            "RSS 无可解析条目 url=%s snippet=%s",
            url,
            _safe_snippet(content),
            extra=LOG_EXTRA,
        )
        return []
    cutoff = datetime.utcnow() - timedelta(hours=max(1, hours_back))
    source_name = source or _source_from_url(url)
    items: List[RssItem] = []
    seen_ids: set[str] = set()
    for entry in parsed_entries:
        published = entry.get("published") or datetime.utcnow()
        if published < cutoff:
            continue
        title = _clean_text(entry.get("title", ""))
        summary = _clean_text(entry.get("summary", ""))
        link = entry.get("link", "")
        raw_id = entry.get("id") or link
        item_id = _normalise_item_id(raw_id, link, title, published)
        if item_id in seen_ids:
            continue
        seen_ids.add(item_id)
        items.append(
            RssItem(
                id=item_id,
                title=title,
                link=link,
                published=published,
                summary=_truncate(summary, MAX_SUMMARY_LENGTH),
                source=source_name,
            )
        )
        if len(items) >= max_items > 0:
            break
    LOGGER.info(
        "RSS 过滤结果 url=%s within_window=%s unique=%s",
        url,
        sum(1 for entry in parsed_entries if (entry.get("published") or datetime.utcnow()) >= cutoff),
        len(items),
        extra=LOG_EXTRA,
    )
    if items:
        alerts.clear_warnings(_rss_source_key(url))
    return items

def deduplicate_items(items: Iterable[RssItem]) -> List[RssItem]:
    """Drop duplicate stories by link/id fingerprint."""
    seen = set()
    unique: List[RssItem] = []
    for item in items:
        key = item.id or item.link
        if key in seen:
            continue
        seen.add(key)
        unique.append(item)
    return unique

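# save_news_items fans each story out to one row per matched ts_code (row id suffixed
# with "::<ts_code>"); stories with no matched code are stored once with a NULL ts_code.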
def save_news_items(items: Iterable[RssItem]) -> int:
    """Persist RSS items into the `news` table."""
    initialize_database()
    now = datetime.utcnow()
    rows: List[Tuple[object, ...]] = []
    processed = 0
    for item in items:
        text_payload = f"{item.title}\n{item.summary}"
        sentiment = _estimate_sentiment(text_payload)
        base_codes = tuple(code for code in item.ts_codes if code)
        heat = _estimate_heat(item.published, now, len(base_codes), sentiment)
        entities = json.dumps(
            {
                "ts_codes": list(base_codes),
                "source_url": item.link,
            },
            ensure_ascii=False,
        )
        resolved_codes = base_codes or (None,)
        for ts_code in resolved_codes:
            row_id = item.id if ts_code is None else f"{item.id}::{ts_code}"
            rows.append(
                (
                    row_id,
                    ts_code,
                    item.published.replace(tzinfo=timezone.utc).isoformat(),
                    item.source,
                    item.title,
                    item.summary,
                    item.link,
                    entities,
                    sentiment,
                    heat,
                )
            )
        processed += 1
    if not rows:
        return 0
    inserted = 0
    try:
        with db_session() as conn:
            conn.executemany(
                """
                INSERT OR IGNORE INTO news
                    (id, ts_code, pub_time, source, title, summary, url, entities, sentiment, heat)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                rows,
            )
            inserted = conn.total_changes
    except sqlite3.OperationalError:
        LOGGER.exception("写入新闻数据失败,表结构可能未初始化", extra=LOG_EXTRA)
        return 0
    except Exception:  # pragma: no cover - guard unexpected sqlite errors
        LOGGER.exception("写入新闻数据异常", extra=LOG_EXTRA)
        return 0
    LOGGER.info(
        "RSS 新闻落库完成 processed=%s inserted=%s",
        processed,
        inserted,
        extra=LOG_EXTRA,
    )
    return inserted

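# End-to-end ingest order: resolve_rss_sources() -> fetch_rss_feed() per feed ->
# _assign_ts_codes() enrichment -> deduplicate_items() -> save_news_items().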
def ingest_configured_rss(
    *,
    hours_back: Optional[int] = None,
    max_items_per_feed: Optional[int] = None,
    max_retries: int = 5,
    retry_backoff: float = 2.0,
    retry_jitter: float = 0.5,
) -> int:
    """Ingest all configured RSS feeds into the news store."""
    configs = resolve_rss_sources()
    if not configs:
        LOGGER.info("未配置 RSS 来源,跳过新闻拉取", extra=LOG_EXTRA)
        return 0
    aggregated: List[RssItem] = []
    fetched_count = 0
    for index, cfg in enumerate(configs, start=1):
        window = hours_back or cfg.hours_back
        limit = max_items_per_feed or cfg.max_items
        LOGGER.info(
            "开始拉取 RSS:%s (window=%sh, limit=%s)",
            cfg.url,
            window,
            limit,
            extra=LOG_EXTRA,
        )
        items = fetch_rss_feed(
            cfg.url,
            source=cfg.source,
            hours_back=window,
            max_items=limit,
            max_retries=max_retries,
            retry_backoff=retry_backoff,
            retry_jitter=retry_jitter,
        )
        if not items:
            LOGGER.info("RSS 来源无新内容:%s", cfg.url, extra=LOG_EXTRA)
            continue
        enriched: List[RssItem] = []
        for item in items:
            codes = _assign_ts_codes(item, cfg.ts_codes, cfg.keywords)
            enriched.append(replace(item, ts_codes=tuple(codes)))
        aggregated.extend(enriched)
        fetched_count += len(enriched)
        if fetched_count and index < len(configs):
            time.sleep(2.0)
    if not aggregated:
        LOGGER.info("RSS 来源未产生有效新闻", extra=LOG_EXTRA)
        alerts.add_warning("RSS", "未获取到任何 RSS 新闻")
        return 0
    deduped = deduplicate_items(aggregated)
    LOGGER.info(
        "RSS 聚合完成 total_fetched=%s unique=%s",
        fetched_count,
        len(deduped),
        extra=LOG_EXTRA,
    )
    return save_news_items(deduped)

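# resolve_rss_sources() understands two settings shapes (illustrative values only):
#   rss_sources = {"eastmoney": {"url": "https://example.com/rss.xml", "ts_codes": ["600519.SH"], "keywords": ["白酒"], "hours_back": 24}}
#   rss_sources = {"600519.SH|https://example.com/rss.xml": True}  # "CODE|url" shorthand; codes split on ":" or ","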
def resolve_rss_sources() -> List[RssFeedConfig]:
    """Resolve RSS feed configuration from persisted settings."""
    cfg = get_config()
    raw = getattr(cfg, "rss_sources", None) or {}
    feeds: Dict[str, RssFeedConfig] = {}

    def _add_feed(url: str, **kwargs: object) -> None:
        clean_url = url.strip()
        if not clean_url:
            return
        key = clean_url.lower()
        if key in feeds:
            return
        source_name = kwargs.get("source") or _source_from_url(clean_url)
        feeds[key] = RssFeedConfig(
            url=clean_url,
            source=str(source_name),
            ts_codes=tuple(kwargs.get("ts_codes", ()) or ()),
            keywords=tuple(kwargs.get("keywords", ()) or ()),
            hours_back=int(kwargs.get("hours_back", 48) or 48),
            max_items=int(kwargs.get("max_items", 50) or 50),
        )

    if isinstance(raw, dict):
        for key, value in raw.items():
            if isinstance(value, dict):
                if not value.get("enabled", True):
                    continue
                url = str(value.get("url") or key)
                ts_codes = [
                    str(code).strip().upper()
                    for code in value.get("ts_codes", [])
                    if str(code).strip()
                ]
                keywords = [
                    str(token).strip()
                    for token in value.get("keywords", [])
                    if str(token).strip()
                ]
                _add_feed(
                    url,
                    ts_codes=ts_codes,
                    keywords=keywords,
                    hours_back=value.get("hours_back", 48),
                    max_items=value.get("max_items", 50),
                    source=value.get("source") or value.get("label"),
                )
                continue
            if not value:
                continue
            url = key
            ts_codes: List[str] = []
            if "|" in key:
                prefix, url = key.split("|", 1)
                ts_codes = [
                    token.strip().upper()
                    for token in prefix.replace(",", ":").split(":")
                    if token.strip()
                ]
            _add_feed(url, ts_codes=ts_codes)
    if feeds:
        return list(feeds.values())
    return list(DEFAULT_RSS_SOURCES)

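# _download_feed retries transient failures with exponential backoff plus jitter.
# With retry_backoff=2.0 the wait grows roughly 2s -> 4s -> 8s (+ up to retry_jitter
# seconds of random noise); a Retry-After header on HTTP 429/503 overrides the computed delay.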
def _download_feed(
    url: str,
    timeout: float,
    *,
    max_retries: int,
    retry_backoff: float,
    retry_jitter: float,
) -> Optional[bytes]:
    headers = {
        "User-Agent": "llm-quant/0.1 (+https://github.com/qiang/llm_quant)",
        "Accept": "application/rss+xml, application/atom+xml, application/xml;q=0.9, */*;q=0.8",
    }
    attempt = 0
    delay = max(0.5, retry_backoff)
    while attempt <= max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException as exc:
            attempt += 1
            if attempt > max_retries:
                message = f"源请求失败:{url}"
                LOGGER.warning("RSS 请求失败:%s err=%s", url, exc, extra=LOG_EXTRA)
                alerts.add_warning(_rss_source_key(url), message, str(exc))
                return None
            wait = delay + random.uniform(0, retry_jitter)
            LOGGER.info(
                "RSS 请求异常,%.2f 秒后重试 url=%s attempt=%s/%s",
                wait,
                url,
                attempt,
                max_retries,
                extra=LOG_EXTRA,
            )
            time.sleep(max(wait, 0.1))
            delay *= max(1.1, retry_backoff)
            continue
        status = response.status_code
        if 200 <= status < 300:
            return response.content
        if status in {429, 503}:
            attempt += 1
            if attempt > max_retries:
                LOGGER.warning(
                    "RSS 请求失败:%s status=%s 已达到最大重试次数",
                    url,
                    status,
                    extra=LOG_EXTRA,
                )
                alerts.add_warning(
                    _rss_source_key(url),
                    "源限流",
                    f"HTTP {status}",
                )
                return None
            retry_after = response.headers.get("Retry-After")
            if retry_after:
                try:
                    wait = float(retry_after)
                except ValueError:
                    wait = delay
            else:
                wait = delay
            wait += random.uniform(0, retry_jitter)
            LOGGER.info(
                "RSS 命中限流 status=%s,%.2f 秒后重试 url=%s attempt=%s/%s",
                status,
                wait,
                url,
                attempt,
                max_retries,
                extra=LOG_EXTRA,
            )
            time.sleep(max(wait, 0.1))
            delay *= max(1.1, retry_backoff)
            continue
        LOGGER.warning(
            "RSS 请求失败:%s status=%s",
            url,
            status,
            extra=LOG_EXTRA,
        )
        alerts.add_warning(
            _rss_source_key(url),
            "源响应异常",
            f"HTTP {status}",
        )
        return None
    LOGGER.warning("RSS 请求失败:%s 未获取内容", url, extra=LOG_EXTRA)
    alerts.add_warning(_rss_source_key(url), "未获取内容")
    return None

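# _extract_html_feed_links handles feeds whose URL points at an HTML index page: it looks
# for <link rel="alternate" type="application/rss+xml|atom+xml"> declarations and, failing
# that, any href ending in ".xml", resolving each candidate against the base URL.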
def _extract_html_feed_links(content: bytes, base_url: str) -> List[str]:
    sample = content[:1024].lower()
    if b"<rss" in sample or b"<feed" in sample:
        return []
    for encoding in ("utf-8", "gb18030", "gb2312"):
        try:
            text = content.decode(encoding)
            break
        except UnicodeDecodeError:
            continue
    else:
        text = content.decode("utf-8", errors="ignore")
    if "<link" not in text and ".xml" not in text:
        return []
    feed_urls: List[str] = []
    alternates = re.compile(
        r"<link[^>]+rel=[\"']alternate[\"'][^>]+type=[\"']application/(?:rss|atom)\+xml[\"'][^>]*href=[\"']([^\"']+)[\"']",
        re.IGNORECASE,
    )
    for match in alternates.finditer(text):
        href = match.group(1).strip()
        if href:
            feed_urls.append(urljoin(base_url, href))
    if not feed_urls:
        anchors = re.compile(r"href=[\"']([^\"']+\.xml)[\"']", re.IGNORECASE)
        for match in anchors.finditer(text):
            href = match.group(1).strip()
            if href:
                feed_urls.append(urljoin(base_url, href))
    unique_urls: List[str] = []
    seen = set()
    for href in feed_urls:
        if href not in seen and href != base_url:
            seen.add(href)
            unique_urls.append(href)
    return unique_urls

def _safe_snippet(content: bytes, limit: int = 160) -> str:
    try:
        text = content.decode("utf-8")
    except UnicodeDecodeError:
        try:
            text = content.decode("gb18030", errors="ignore")
        except UnicodeDecodeError:
            text = content.decode("latin-1", errors="ignore")
    cleaned = re.sub(r"\s+", " ", text)
    if len(cleaned) > limit:
        return cleaned[: limit - 3] + "..."
    return cleaned

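# _parse_feed_content prefers the feedparser library when it is importable and falls back
# to the built-in ElementTree parser (and finally the regex-based lenient parser) when it
# is missing or yields no entries.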
def _parse_feed_content(content: bytes) -> List[Dict[str, object]]:
    if feedparser is not None:
        parsed = feedparser.parse(content)
        entries = []
        for entry in getattr(parsed, "entries", []) or []:
            entries.append(
                {
                    "id": getattr(entry, "id", None) or getattr(entry, "guid", None),
                    "title": getattr(entry, "title", ""),
                    "link": getattr(entry, "link", ""),
                    "summary": getattr(entry, "summary", "") or getattr(entry, "description", ""),
                    "published": _parse_datetime(
                        getattr(entry, "published", None)
                        or getattr(entry, "updated", None)
                        or getattr(entry, "issued", None)
                    ),
                }
            )
        if entries:
            return entries
    else:  # pragma: no cover - log helpful info when dependency missing
        LOGGER.warning(
            "feedparser 未安装,使用简易 XML 解析器回退处理 RSS",
            extra=LOG_EXTRA,
        )
    return _parse_feed_xml(content)

def _parse_feed_xml(content: bytes) -> List[Dict[str, object]]:
    try:
        xml_text = content.decode("utf-8")
    except UnicodeDecodeError:
        xml_text = content.decode("utf-8", errors="ignore")
    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError as exc:  # pragma: no cover - depends on remote feed
        LOGGER.warning("RSS XML 解析失败 err=%s", exc, extra=LOG_EXTRA)
        return _lenient_parse_items(xml_text)
    tag = _local_name(root.tag)
    if tag == "rss":
        candidates = root.findall(".//item")
    elif tag == "feed":
        candidates = root.findall(".//{*}entry")
    else:  # fallback
        candidates = root.findall(".//item") or root.findall(".//{*}entry")
    entries: List[Dict[str, object]] = []
    for node in candidates:
        entries.append(
            {
                "id": _child_text(node, {"id", "guid"}),
                "title": _child_text(node, {"title"}) or "",
                "link": _child_text(node, {"link"}) or "",
                "summary": _child_text(node, {"summary", "description"}) or "",
                "published": _parse_datetime(
                    _child_text(node, {"pubDate", "published", "updated"})
                ),
            }
        )
    if not entries and "<item" in xml_text.lower():
        return _lenient_parse_items(xml_text)
    return entries

def _lenient_parse_items(xml_text: str) -> List[Dict[str, object]]:
    """Fallback parser that tolerates malformed RSS by using regular expressions."""
    items: List[Dict[str, object]] = []
    pattern = re.compile(r"<(item|entry)[^>]*>(.+?)</\1>", re.IGNORECASE | re.DOTALL)
    for match in pattern.finditer(xml_text):
        block = match.group(0)
        title = _extract_tag_text(block, ["title"]) or ""
        link = _extract_link(block)
        summary = _extract_tag_text(block, ["summary", "description"]) or ""
        published_text = _extract_tag_text(block, ["pubDate", "published", "updated"])
        items.append(
            {
                "id": _extract_tag_text(block, ["id", "guid"]) or link,
                "title": title,
                "link": link,
                "summary": summary,
                "published": _parse_datetime(published_text),
            }
        )
    if items:
        LOGGER.info("RSS 采用宽松解析提取 %s 条记录", len(items), extra=LOG_EXTRA)
    return items

def _extract_tag_text(block: str, names: Sequence[str]) -> Optional[str]:
    for name in names:
        pattern = re.compile(rf"<{name}[^>]*>(.*?)</{name}>", re.IGNORECASE | re.DOTALL)
        match = pattern.search(block)
        if match:
            text = re.sub(r"<[^>]+>", " ", match.group(1))
            return _clean_text(text)
    return None


def _extract_link(block: str) -> str:
    href_pattern = re.compile(r"<link[^>]*href=\"([^\"]+)\"[^>]*>", re.IGNORECASE)
    match = href_pattern.search(block)
    if match:
        return match.group(1).strip()
    inline_pattern = re.compile(r"<link[^>]*>(.*?)</link>", re.IGNORECASE | re.DOTALL)
    match = inline_pattern.search(block)
    if match:
        return match.group(1).strip()
    return ""

def _assign_ts_codes(
    item: RssItem,
    base_codes: Sequence[str],
    keywords: Sequence[str],
) -> List[str]:
    matches: set[str] = set()
    text = f"{item.title} {item.summary}".lower()
    if keywords:
        for keyword in keywords:
            token = keyword.lower().strip()
            if token and token in text:
                matches.update(code.strip().upper() for code in base_codes if code)
                break
    else:
        matches.update(code.strip().upper() for code in base_codes if code)
    detected = _detect_ts_codes(text)
    matches.update(detected)
    return [code for code in matches if code]

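# _detect_ts_codes infers the exchange for bare six-digit A-share codes: prefixes 5/6/9 map
# to Shanghai (.SH), everything else to Shenzhen (.SZ). Hong Kong codes must already carry
# four digits (e.g. "0700.HK"); shorter forms such as "700.HK" are not matched.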
def _detect_ts_codes(text: str) -> List[str]:
    codes: set[str] = set()
    for match in A_SH_CODE_PATTERN.finditer(text):
        digits, suffix = match.groups()
        if suffix:
            codes.add(f"{digits}.{suffix.upper()}")
        else:
            exchange = "SH" if digits.startswith(tuple("569")) else "SZ"
            codes.add(f"{digits}.{exchange}")
    for match in HK_CODE_PATTERN.finditer(text):
        digits = match.group(1)
        codes.add(f"{digits.zfill(4)}.HK")
    return sorted(codes)

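# _estimate_sentiment counts keyword hits: each positive keyword adds 1, each negative
# keyword subtracts 1, and the total is divided by 3 and clamped to [-1.0, 1.0].
# Example: one hit on "增长" and one on "下跌" cancel out to 0.0; two positive hits give roughly 0.67.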
def _estimate_sentiment(text: str) -> float:
    normalized = text.lower()
    score = 0
    for keyword in POSITIVE_KEYWORDS:
        if keyword.lower() in normalized:
            score += 1
    for keyword in NEGATIVE_KEYWORDS:
        if keyword.lower() in normalized:
            score -= 1
    if score == 0:
        return 0.0
    return max(-1.0, min(1.0, score / 3.0))

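# _estimate_heat blends recency (linear decay to zero over 72 hours), coverage
# (+0.05 per matched code, capped at 3 codes) and sentiment strength (+0.1 * |sentiment|,
# capped at 0.2), then clamps to [0, 1]. Example: a story published 36h ago with two codes
# and sentiment 0.67 scores 0.5 + 0.10 + 0.067 ≈ 0.667.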
def _estimate_heat(
    published: datetime,
    now: datetime,
    code_count: int,
    sentiment: float,
) -> float:
    delta_hours = max(0.0, (now - published).total_seconds() / 3600.0)
    recency = max(0.0, 1.0 - min(delta_hours, 72.0) / 72.0)
    coverage_bonus = min(code_count, 3) * 0.05
    sentiment_bonus = min(abs(sentiment) * 0.1, 0.2)
    heat = recency + coverage_bonus + sentiment_bonus
    return max(0.0, min(1.0, round(heat, 4)))

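# _parse_datetime accepts RFC 2822 dates (via email.utils) and bare ISO-like strings such
# as "2025-09-30T08:00:00"; timezone-aware values are converted to UTC and returned naive,
# and unparseable values fall back to "now" upstream.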
def _parse_datetime(value: Optional[str]) -> Optional[datetime]:
    if not value:
        return None
    try:
        dt = parsedate_to_datetime(value)
        if dt.tzinfo is not None:
            dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
        return dt
    except (TypeError, ValueError):
        pass
    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(value[:19], fmt)
        except ValueError:
            continue
    return None

def _clean_text(value: Optional[str]) -> str:
    if not value:
        return ""
    text = re.sub(r"<[^>]+>", " ", value)
    return re.sub(r"\s+", " ", text).strip()


def _truncate(value: str, length: int) -> str:
    if len(value) <= length:
        return value
    return value[: length - 3].rstrip() + "..."

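# _normalise_item_id prefers the feed-provided guid, then the link, then the title; when all
# are empty it hashes "<title>|<published ISO timestamp>" with blake2s so the item still gets
# a stable identifier for deduplication.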
def _normalise_item_id(
    raw_id: Optional[str], link: str, title: str, published: datetime
) -> str:
    candidate = (raw_id or link or title).strip()
    if candidate:
        return candidate
    fingerprint = f"{title}|{published.isoformat()}"
    return hashlib.blake2s(fingerprint.encode("utf-8"), digest_size=16).hexdigest()

def _source_from_url(url: str) -> str:
    try:
        parsed = urlparse(url)
    except ValueError:
        return url
    host = parsed.netloc or url
    return host.lower()


def _local_name(tag: str) -> str:
    if "}" in tag:
        return tag.rsplit("}", 1)[-1]
    return tag

def _child_text(node: ET.Element, candidates: set[str]) -> Optional[str]:
    for child in node:
        name = _local_name(child.tag)
        if name in candidates and child.text:
            return child.text.strip()
        if name == "link" and "link" in candidates:
            href = child.attrib.get("href")
            if href:
                return href.strip()
    return None

__all__ = [
    "RssFeedConfig",
    "RssItem",
    "fetch_rss_feed",
    "deduplicate_items",
    "save_news_items",
    "ingest_configured_rss",
    "resolve_rss_sources",
]


def _rss_source_key(url: str) -> str:
    return f"RSS|{url}".strip()