"""Factor validation and quality control utilities.""" from __future__ import annotations from typing import Dict, Optional, Sequence import numpy as np from app.utils.logging import get_logger LOGGER = get_logger(__name__) LOG_EXTRA = {"stage": "factor_validation"} # 因子值范围限制配置 FACTOR_LIMITS = { # 动量类因子限制在 ±100% "mom_": (-1.0, 1.0), # 波动率类因子限制在 0-50% "volat_": (0, 0.5), # 换手率类因子限制在 0-500% (实际换手率可能超过100%) "turn_": (0, 5.0), # 估值评分类因子限制在 -1到1 "val_": (-1.0, 1.0), # 量价类因子 "volume_": (0, 10.0), # 市场状态类因子 "market_": (-1.0, 1.0), # 技术指标类因子 "tech_": (-1.0, 1.0), # 趋势类因子 "trend_": (-1.0, 1.0), # 微观结构类因子 "micro_": (-1.0, 1.0), # 情绪类因子 "sent_": (-1.0, 1.0), } def validate_factor_value( name: str, value: float, ts_code: str, trade_date: str ) -> Optional[float]: """验证单个因子值是否在合理范围内。 Args: name: 因子名称 value: 因子值 ts_code: 股票代码 trade_date: 交易日期 Returns: 如果因子值有效则返回原值,否则返回 None """ if value is None: return None # 检查是否为有限数值 if not np.isfinite(value): LOGGER.warning( "因子值非有限数值 factor=%s value=%f ts_code=%s date=%s", name, value, ts_code, trade_date, extra=LOG_EXTRA ) return None # 根据因子类型应用不同的限制 for prefix, (min_val, max_val) in FACTOR_LIMITS.items(): if name.startswith(prefix): if value < min_val or value > max_val: LOGGER.warning( "因子值超出范围 factor=%s value=%f range=[%f,%f] ts_code=%s date=%s", name, value, min_val, max_val, ts_code, trade_date, extra=LOG_EXTRA ) return None break return value def detect_outliers( values: Dict[str, float], ts_code: str, trade_date: str ) -> Dict[str, float]: """检测和处理因子值中的异常值。 Args: values: 因子值字典 ts_code: 股票代码 trade_date: 交易日期 Returns: 处理后的因子值字典 """ result = {} for name, value in values.items(): validated = validate_factor_value(name, value, ts_code, trade_date) if validated is not None: result[name] = validated return result def check_data_sufficiency( ts_code: str, trade_date: str, min_days: int = 60 ) -> bool: """验证因子计算所需数据是否充分。 Args: ts_code: 股票代码 trade_date: 交易日期 min_days: 最少需要的历史数据天数 Returns: 数据是否充分 """ from app.utils.data_access import DataBroker broker = DataBroker() # 检查历史收盘价数据 close_series = broker.fetch_series("daily", "close", ts_code, trade_date, min_days) # 计算有效值的数量 valid_values = [val for _, val in close_series if val is not None and isinstance(val, (int, float))] if len(valid_values) < min_days: LOGGER.warning( "历史数据不足 ts_code=%s date=%s min_days=%d actual=%d", ts_code, trade_date, min_days, len(valid_values), extra=LOG_EXTRA ) return False # 检查日期点数据完整性 latest_fields = broker.fetch_latest( ts_code, trade_date, ["daily.close", "daily_basic.turnover_rate", "daily_basic.pe", "daily_basic.pb"] ) required_fields = {"daily.close", "daily_basic.turnover_rate"} for field in required_fields: if latest_fields.get(field) is None: LOGGER.warning( "缺少必需字段 field=%s ts_code=%s date=%s", field, ts_code, trade_date, extra=LOG_EXTRA ) return False return True def check_series_sufficiency( data: Sequence, required_length: int, field_name: str, ts_code: str, trade_date: str ) -> bool: """检查数据序列是否满足计算要求。 Args: data: 数据序列 required_length: 所需最小长度 field_name: 字段名称 ts_code: 股票代码 trade_date: 交易日期 Returns: 数据是否足够 """ if len(data) < required_length: LOGGER.warning( "数据长度不足 field=%s required=%d actual=%d ts_code=%s date=%s", field_name, required_length, len(data), ts_code, trade_date, extra=LOG_EXTRA ) return False # 检查数据有效性 valid_count = sum(1 for x in data if x is not None and np.isfinite(x)) if valid_count < required_length: LOGGER.warning( "有效数据不足 field=%s required=%d valid=%d ts_code=%s date=%s", field_name, required_length, valid_count, ts_code, trade_date, extra=LOG_EXTRA ) return False return True