"""Utility helpers for performing lightweight data quality checks."""
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass, field
|
||
from datetime import date, datetime, timedelta
|
||
from typing import Dict, Iterable, List, Optional, Sequence
|
||
|
||
from app.utils.db import db_session
|
||
from app.utils.logging import get_logger
|
||
|
||
LOGGER = get_logger(__name__)
|
||
LOG_EXTRA = {"stage": "data_quality"}
|
||
|
||
Severity = str # Literal["ERROR", "WARN", "INFO"] (avoid importing Literal for py<3.8)
|
||
|
||
|
||
@dataclass
class DataQualityResult:
    check: str
    severity: Severity
    detail: str
    extras: Optional[Dict[str, object]] = None


@dataclass
class DataQualitySummary:
    window_days: int
    score: float
    total_checks: int
    severity_counts: Dict[Severity, int] = field(default_factory=dict)
    blocking: List[DataQualityResult] = field(default_factory=list)
    warnings: List[DataQualityResult] = field(default_factory=list)
    informational: List[DataQualityResult] = field(default_factory=list)

    @property
    def has_blockers(self) -> bool:
        return bool(self.blocking)

    def as_dict(self) -> Dict[str, object]:
        return {
            "window_days": self.window_days,
            "score": self.score,
            "total_checks": self.total_checks,
            "severity_counts": dict(self.severity_counts),
            "has_blockers": self.has_blockers,
        }


def _parse_date(value: object) -> Optional[date]:
    """Best-effort parse for trade_date columns stored as str/int."""
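    # Illustrative examples (added for clarity; not part of the original module):
    #   _parse_date("2024-01-05") -> date(2024, 1, 5)
    #   _parse_date(20240105)     -> date(2024, 1, 5)
    #   _parse_date("not-a-date") -> None (the failure is logged at DEBUG level)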
    if value is None:
        return None
    if isinstance(value, date):
        return value
    text = str(value).strip()
    if not text:
        return None
    for fmt in ("%Y-%m-%d", "%Y%m%d"):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        LOGGER.debug("Unable to parse date field value=%s", value, extra=LOG_EXTRA)
    return None


def run_data_quality_checks(*, window_days: int = 7) -> List[DataQualityResult]:
    """Execute a suite of lightweight data quality checks."""
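    # The suite currently covers four areas:
    #   1. candidate pool availability (investment_pool)
    #   2. strategy evaluation coverage (agent_utils)
    #   3. open positions vs. portfolio snapshots
    #   4. news data freshness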
    results: List[DataQualityResult] = []
    today = date.today()
    window_start = today - timedelta(days=window_days)

    try:
        with db_session(read_only=True) as conn:
            # 1. Latest candidate pool data
            try:
                row = conn.execute(
                    """
                    SELECT trade_date, COUNT(DISTINCT ts_code) AS cnt
                    FROM investment_pool
                    ORDER BY trade_date DESC
                    LIMIT 1
                    """
                ).fetchone()
            except Exception:  # noqa: BLE001
                LOGGER.exception("Failed to query investment_pool", extra=LOG_EXTRA)
                results.append(
                    DataQualityResult(
                        "Candidate pool availability",
                        "ERROR",
                        "Failed to read candidate pool data; check the investment_pool table schema and permissions.",
                    )
                )
                row = None

            latest_candidate_date = _parse_date(row["trade_date"]) if row else None
            candidate_count = int(row["cnt"]) if row and row["cnt"] is not None else 0
            if row is None:
                pass
            elif latest_candidate_date is None:
                results.append(
                    DataQualityResult(
                        "Candidate pool availability",
                        "ERROR",
                        "Could not parse the candidate pool date; check the trade_date column format.",
                        {"raw_value": row["trade_date"]},
                    )
                )
            else:
                age = (today - latest_candidate_date).days
                extras = {
                    "trade_date": latest_candidate_date.isoformat(),
                    "candidate_count": candidate_count,
                    "age_days": age,
                }
                if candidate_count == 0:
                    results.append(
                        DataQualityResult(
                            "Candidate pool availability",
                            "ERROR",
                            f"The candidate pool for {latest_candidate_date} is empty.",
                            extras,
                        )
                    )
                elif age > window_days:
                    results.append(
                        DataQualityResult(
                            "Candidate pool availability",
                            "WARN",
                            f"The candidate pool is stuck at {latest_candidate_date}; it has not been updated for more than {window_days} days.",
                            extras,
                        )
                    )
                else:
                    results.append(
                        DataQualityResult(
                            "Candidate pool availability",
                            "INFO",
                            f"The latest candidate pool ({latest_candidate_date}) contains {candidate_count} symbols.",
                            extras,
                        )
                    )

            # 2. agent_utils coverage
            try:
                row_agent = conn.execute(
                    """
                    SELECT trade_date, COUNT(DISTINCT ts_code) AS cnt
                    FROM agent_utils
                    ORDER BY trade_date DESC
                    LIMIT 1
                    """
                ).fetchone()
            except Exception:  # noqa: BLE001
                LOGGER.exception("Failed to query agent_utils", extra=LOG_EXTRA)
                results.append(
                    DataQualityResult(
                        "Strategy evaluation data",
                        "ERROR",
                        "Failed to read agent_utils; cannot determine whether department/agent data is available.",
                    )
                )
                row_agent = None

            latest_agent_date = _parse_date(row_agent["trade_date"]) if row_agent else None
            agent_count = int(row_agent["cnt"]) if row_agent and row_agent["cnt"] is not None else 0
            if row_agent is None:
                pass
            elif latest_agent_date is None:
                results.append(
                    DataQualityResult(
                        "Strategy evaluation data",
                        "ERROR",
                        "Could not parse the agent_utils date; check the trade_date column format.",
                        {"raw_value": row_agent["trade_date"]},
                    )
                )
            else:
                extras = {
                    "trade_date": latest_agent_date.isoformat(),
                    "decision_count": agent_count,
                }
                if agent_count == 0:
                    results.append(
                        DataQualityResult(
                            "Strategy evaluation data",
                            "WARN",
                            f"No symbol records found in agent_utils for {latest_agent_date}.",
                            extras,
                        )
                    )
                else:
                    results.append(
                        DataQualityResult(
                            "Strategy evaluation data",
                            "INFO",
                            f"Strategy evaluation completed for {agent_count} symbols on {latest_agent_date}.",
                            extras,
                        )
                    )

            # Guard against latest_agent_date being None; that case is already reported above.
            if latest_candidate_date and latest_agent_date and latest_candidate_date != latest_agent_date:
                results.append(
                    DataQualityResult(
                        "Candidate/strategy sync",
                        "WARN",
                        "The candidate pool and strategy evaluation dates do not match; re-trigger the evaluation or the data sync.",
                        {
                            "candidate_date": latest_candidate_date.isoformat(),
                            "agent_date": latest_agent_date.isoformat(),
                        },
                    )
                )

            # 3. Open positions vs. snapshots
            try:
                open_positions = conn.execute(
                    """
                    SELECT COUNT(*) AS cnt
                    FROM portfolio_positions
                    WHERE status = 'open'
                    """
                ).fetchone()
                open_position_count = int(open_positions["cnt"]) if open_positions else 0
            except Exception:  # noqa: BLE001
                LOGGER.exception("Failed to query portfolio_positions", extra=LOG_EXTRA)
                open_position_count = 0
                results.append(
                    DataQualityResult(
                        "Position data",
                        "WARN",
                        "Unable to read current positions; check whether the portfolio_positions table exists.",
                    )
                )

            latest_snapshot_date = None
            snapshot_date_column = None
            try:
                snapshot_info = conn.execute("PRAGMA table_info(portfolio_snapshots)").fetchall()
            except Exception:  # noqa: BLE001
                LOGGER.exception("Failed to read the portfolio_snapshots schema", extra=LOG_EXTRA)
                snapshot_info = []

            preferred_snapshot_columns: Iterable[str] = (
                "as_of",
                "snapshot_date",
                "trade_date",
                "date",
                "created_at",
                "timestamp",
            )
            available_snapshot_columns: List[str] = []
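            # PRAGMA table_info yields one row per column (the column name sits at index 1);
            # take the first preferred date-like column as the snapshot date column.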
            for row in snapshot_info:
                name = row["name"] if "name" in row.keys() else row[1]
                available_snapshot_columns.append(name)
                if name in preferred_snapshot_columns and snapshot_date_column is None:
                    snapshot_date_column = name

            if snapshot_date_column:
                try:
                    snap_row = conn.execute(
                        f"""
                        SELECT MAX({snapshot_date_column}) AS latest_snapshot
                        FROM portfolio_snapshots
                        """
                    ).fetchone()
                    if snap_row and snap_row["latest_snapshot"]:
                        latest_snapshot_date = _parse_date(snap_row["latest_snapshot"])
                except Exception:  # noqa: BLE001
                    LOGGER.exception("Failed to query portfolio_snapshots", extra=LOG_EXTRA)
            elif available_snapshot_columns:
                results.append(
                    DataQualityResult(
                        "Position snapshots",
                        "WARN",
                        "No standard snapshot date column found (e.g. as_of/snapshot_date); check the table schema.",
                        {"columns": available_snapshot_columns},
                    )
                )
            else:
                results.append(
                    DataQualityResult(
                        "Position snapshots",
                        "WARN",
                        "The portfolio_snapshots table was not found; cannot validate position snapshots.",
                    )
                )

            if open_position_count > 0:
                if latest_snapshot_date is None:
                    results.append(
                        DataQualityResult(
                            "Position snapshots",
                            "WARN",
                            "There are open positions, but no position snapshot records were found.",
                            {"open_positions": open_position_count},
                        )
                    )
                elif latest_snapshot_date < window_start:
                    results.append(
                        DataQualityResult(
                            "Position snapshots",
                            "WARN",
                            f"The latest position snapshot is from {latest_snapshot_date}, older than the {window_days}-day window.",
                            {
                                "latest_snapshot": latest_snapshot_date.isoformat(),
                                "open_positions": open_position_count,
                            },
                        )
                    )
                else:
                    results.append(
                        DataQualityResult(
                            "Position snapshots",
                            "INFO",
                            f"Latest position snapshot date: {latest_snapshot_date}.",
                            {
                                "latest_snapshot": latest_snapshot_date.isoformat(),
                                "open_positions": open_position_count,
                            },
                        )
                    )

            # 4. News data freshness
            latest_news_date = None
            try:
                news_row = conn.execute(
                    """
                    SELECT MAX(pub_time) AS latest_pub
                    FROM news
                    """
                ).fetchone()
                if news_row and news_row["latest_pub"]:
                    try:
                        latest_news_date = datetime.fromisoformat(
                            str(news_row["latest_pub"])
                        )
                    except ValueError:
                        LOGGER.debug(
                            "Unable to parse news timestamp value=%s",
                            news_row["latest_pub"],
                            extra=LOG_EXTRA,
                        )
            except Exception:  # noqa: BLE001
                LOGGER.exception("Failed to query news", extra=LOG_EXTRA)

            if latest_news_date:
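                # Use a "now" that matches the stored timestamp's awareness:
                # subtracting a naive datetime from an aware one raises TypeError.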
                if latest_news_date.tzinfo is not None:
                    now_ts = datetime.now(tz=latest_news_date.tzinfo)
                else:
                    now_ts = datetime.now()
                delta_days = (now_ts - latest_news_date).days
                if delta_days > window_days:
                    results.append(
                        DataQualityResult(
                            "News data freshness",
                            "WARN",
                            f"The latest news item is from {latest_news_date}; no updates for more than {window_days} days.",
                            {"latest_pub_time": str(latest_news_date)},
                        )
                    )
                else:
                    results.append(
                        DataQualityResult(
                            "News data freshness",
                            "INFO",
                            f"Latest news timestamp: {latest_news_date}",
                            {"latest_pub_time": str(latest_news_date)},
                        )
                    )
            else:
                results.append(
                    DataQualityResult(
                        "News data freshness",
                        "WARN",
                        "No recent news records found; check the RSS feed or news data ingestion.",
                    )
                )
    except Exception:  # noqa: BLE001
        LOGGER.exception("Data quality checks failed", extra=LOG_EXTRA)
        results.append(
            DataQualityResult(
                "Run status",
                "ERROR",
                "An exception occurred while running the data quality checks; see the logs.",
            )
        )

    return results


def summarize_data_quality(
    results: Sequence[DataQualityResult],
    *,
    window_days: int,
    top_issues: int = 5,
) -> DataQualitySummary:
    """Aggregate quality checks into a normalized score and severity summary."""

    severity_buckets: Dict[str, List[DataQualityResult]] = {}
    for result in results:
        severity = (result.severity or "INFO").upper()
        severity_buckets.setdefault(severity, []).append(result)

    counts = {severity: len(items) for severity, items in severity_buckets.items()}
    if not results:
        return DataQualitySummary(
            window_days=window_days,
            score=100.0,
            total_checks=0,
            severity_counts=counts,
        )

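    # Each result adds a severity-weighted penalty (unknown severities fall back to
    # the WARN weight). The score is normalized against the worst case in which every
    # check carries the heaviest weight, so it always lands in [0, 100]:
    #   score = 100 * (1 - penalty / max_penalty)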
    weights = {"ERROR": 5.0, "WARN": 2.0, "INFO": 0.0}
    penalty = 0.0
    for result in results:
        severity = (result.severity or "INFO").upper()
        penalty += weights.get(severity, 2.0)
    max_weight = max(weights.values(), default=1.0)
    max_penalty = max(1.0, len(results) * max_weight)
    score = max(0.0, 100.0 - (penalty / max_penalty) * 100.0)

    return DataQualitySummary(
        window_days=window_days,
        score=round(score, 2),
        total_checks=len(results),
        severity_counts=counts,
        blocking=severity_buckets.get("ERROR", [])[:top_issues],
        warnings=severity_buckets.get("WARN", [])[:top_issues],
        informational=severity_buckets.get("INFO", [])[:top_issues],
    )


def evaluate_data_quality(
    *,
    window_days: int = 7,
    top_issues: int = 5,
) -> DataQualitySummary:
    """Run quality checks and return a scored summary."""

    results = run_data_quality_checks(window_days=window_days)
    return summarize_data_quality(
        results,
        window_days=window_days,
        top_issues=top_issues,
    )
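

# Minimal usage sketch (illustrative only; assumes the database behind
# app.utils.db.db_session is configured and populated):
if __name__ == "__main__":
    summary = evaluate_data_quality(window_days=7)
    print(summary.as_dict())
    for issue in summary.blocking + summary.warnings:
        print(f"[{issue.severity}] {issue.check}: {issue.detail}")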