#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Concept anomaly detector - hybrid edition.

Combines the strengths of two approaches:
1. Rule-based scoring: interpretable, stable, covers known patterns.
2. LSTM Autoencoder: discovers previously unknown anomaly patterns.

Fusion strategy:
┌─────────────────────────────────────────────────────────┐
│                     Input features                      │
│  (alpha, alpha_delta, amt_ratio, amt_delta, rank_pct,   │
│   limit_up_ratio)                                       │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  ┌───────────────┐          ┌──────────────────┐        │
│  │ Rule scoring  │          │ LSTM Autoencoder │        │
│  │ (0-100)       │          │ (recon. error)   │        │
│  └───────┬───────┘          └────────┬─────────┘        │
│          │                           │                  │
│          ▼                           ▼                  │
│  rule_score (0-100)     ml_score (normalised to 0-100)  │
│                                                         │
├─────────────────────────────────────────────────────────┤
│                    Fusion strategy                      │
│                                                         │
│  final_score = w1 * rule_score + w2 * ml_score          │
│                                                         │
│  Anomaly decision:                                      │
│  - rule_score >= 60  → direct trigger (strong rule)     │
│  - ml_score >= 80    → direct trigger (strong ML)       │
│  - final_score >= 50 → fusion trigger                   │
│                                                         │
└─────────────────────────────────────────────────────────┘

Advantages:
- The rule system guarantees the detection rate for known patterns.
- The ML model catches anomalies the rules do not cover.
- The two cross-validate each other, reducing false positives.
"""

import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass

import numpy as np
import torch

# Try to import the model module (it may not exist); HAS_MODEL records
# whether the import succeeded so the ML scorer can degrade gracefully.
try:
    from model import LSTMAutoencoder, create_model
    HAS_MODEL = True
except ImportError:
    HAS_MODEL = False


@dataclass
class AnomalyResult:
    """Outcome of a single anomaly-detection call.

    Field order is part of the interface: instances may be built
    positionally.
    """

    is_anomaly: bool     # True when any trigger condition fired
    final_score: float   # fused score (0-100)
    rule_score: float    # rule-based score (0-100)
    ml_score: float      # ML score (0-100)
    trigger_reason: str  # human-readable reason for the trigger
    rule_details: Dict   # triggered rules: name -> {'score', 'description'}
    anomaly_type: str    # surge_up / surge_down / volume_spike / unknown


class RuleBasedScorer:
    """
    Rule-based scoring system.

    Design principles:
    - every rule is scored independently;
    - scores are additive (the sum is clamped to 0-100);
    - thresholds are configurable by passing a custom rule table.

    A rule table maps a rule name to a dict with three keys:
    ``condition`` (callable taking the feature dict, returning truthy/falsy),
    ``score`` (points added when the condition holds) and ``description``.
    """

    # Default rule configuration.
    DEFAULT_RULES = {
        # Alpha (excess return)
        'alpha_strong': {
            'condition': lambda r: abs(r.get('alpha', 0)) >= 3.0,
            'score': 35,
            'description': 'Alpha强信号(|α|≥3%)',
        },
        'alpha_medium': {
            'condition': lambda r: 2.0 <= abs(r.get('alpha', 0)) < 3.0,
            'score': 25,
            'description': 'Alpha中等(2%≤|α|<3%)',
        },
        'alpha_weak': {
            'condition': lambda r: 1.5 <= abs(r.get('alpha', 0)) < 2.0,
            'score': 15,
            'description': 'Alpha轻微(1.5%≤|α|<2%)',
        },

        # Alpha rate of change (acceleration)
        'alpha_delta_strong': {
            'condition': lambda r: abs(r.get('alpha_delta', 0)) >= 1.0,
            'score': 30,
            'description': 'Alpha加速强(|Δα|≥1%)',
        },
        'alpha_delta_medium': {
            'condition': lambda r: 0.5 <= abs(r.get('alpha_delta', 0)) < 1.0,
            'score': 20,
            'description': 'Alpha加速中(0.5%≤|Δα|<1%)',
        },

        # Turnover ratio (volume expansion)
        'volume_spike_strong': {
            'condition': lambda r: r.get('amt_ratio', 1) >= 5.0,
            'score': 30,
            'description': '极度放量(≥5倍)',
        },
        'volume_spike_medium': {
            'condition': lambda r: 3.0 <= r.get('amt_ratio', 1) < 5.0,
            'score': 20,
            'description': '显著放量(3-5倍)',
        },
        'volume_spike_weak': {
            'condition': lambda r: 2.0 <= r.get('amt_ratio', 1) < 3.0,
            'score': 10,
            'description': '轻微放量(2-3倍)',
        },

        # Turnover rate of change
        'amt_delta_strong': {
            'condition': lambda r: abs(r.get('amt_delta', 0)) >= 1.0,
            'score': 15,
            'description': '成交额急变(|Δamt|≥100%)',
        },

        # Rank jumps
        'rank_top': {
            'condition': lambda r: r.get('rank_pct', 0.5) >= 0.95,
            'score': 25,
            'description': '排名前5%',
        },
        'rank_bottom': {
            'condition': lambda r: r.get('rank_pct', 0.5) <= 0.05,
            'score': 25,
            'description': '排名后5%',
        },
        'rank_high': {
            'condition': lambda r: 0.9 <= r.get('rank_pct', 0.5) < 0.95,
            'score': 15,
            'description': '排名前10%',
        },

        # Limit-up ratio
        'limit_up_high': {
            'condition': lambda r: r.get('limit_up_ratio', 0) >= 0.2,
            'score': 25,
            'description': '涨停比例≥20%',
        },
        'limit_up_medium': {
            'condition': lambda r: 0.1 <= r.get('limit_up_ratio', 0) < 0.2,
            'score': 15,
            'description': '涨停比例10-20%',
        },

        # Combined conditions (more reliable signals); these are bonus points
        # on top of the single-feature rules above.
        'alpha_with_volume': {
            'condition': lambda r: (
                abs(r.get('alpha', 0)) >= 1.5 and r.get('amt_ratio', 1) >= 2.0
            ),
            'score': 20,
            'description': 'Alpha+放量组合',
        },
        'acceleration_with_rank': {
            'condition': lambda r: (
                abs(r.get('alpha_delta', 0)) >= 0.5
                and (r.get('rank_pct', 0.5) >= 0.9 or r.get('rank_pct', 0.5) <= 0.1)
            ),
            'score': 15,
            'description': '加速+排名异常组合',
        },
    }

    def __init__(self, rules: Dict = None):
        """
        Initialise the rule scorer.

        Args:
            rules: custom rule table in the same format as DEFAULT_RULES.
                A falsy value (None or an empty dict) selects the defaults.
        """
        # Shallow-copy the defaults so adding/removing rules on one instance
        # cannot leak into the class-level table shared by every other scorer.
        self.rules = rules if rules else dict(self.DEFAULT_RULES)

    def score(self, features: Dict) -> Tuple[float, Dict]:
        """
        Compute the rule score.

        Args:
            features: feature dict with keys such as alpha, alpha_delta,
                amt_ratio, amt_delta, rank_pct, limit_up_ratio. Missing keys
                fall back to the neutral defaults baked into each rule.

        Returns:
            (score, details): the summed score clamped to 0-100, and a dict
            mapping each triggered rule name to its score and description.
        """
        total_score = 0
        triggered_rules = {}

        for rule_name, rule_config in self.rules.items():
            try:
                if rule_config['condition'](features):
                    total_score += rule_config['score']
                    triggered_rules[rule_name] = {
                        'score': rule_config['score'],
                        'description': rule_config['description'],
                    }
            except Exception as exc:
                # Deliberate best-effort: a rule that cannot be evaluated
                # (e.g. a None/non-numeric feature value) is skipped so the
                # remaining rules still contribute. Leave a debug trace so the
                # skip is discoverable instead of completely silent.
                import logging
                logging.getLogger(__name__).debug(
                    "rule %r skipped: %s", rule_name, exc
                )

        # Clamp to 0-100.
        total_score = min(100, max(0, total_score))

        return total_score, triggered_rules

    def get_anomaly_type(self, features: Dict) -> str:
        """Classify the anomaly.

        Alpha direction takes precedence over volume: returns 'surge_up' for
        alpha >= 1.5, 'surge_down' for alpha <= -1.5, 'volume_spike' for
        amt_ratio >= 3.0, otherwise 'unknown'.
        """
        alpha = features.get('alpha', 0)
        amt_ratio = features.get('amt_ratio', 1)

        # A key that is present but None would make the comparisons below
        # raise TypeError; treat it like a missing key, consistent with
        # score() tolerating such values.
        if alpha is None:
            alpha = 0
        if amt_ratio is None:
            amt_ratio = 1

        if alpha >= 1.5:
            return 'surge_up'
        if alpha <= -1.5:
            return 'surge_down'
        if amt_ratio >= 3.0:
            return 'volume_spike'
        return 'unknown'


class MLScorer:
    """
    Scorer based on an LSTM Autoencoder.

    Converts the model's reconstruction error into a 0-100 score. If the
    model module, the checkpoint file, or loading itself is unavailable the
    scorer stays "not ready" and score() returns 0.0.
    """

    def __init__(
        self,
        checkpoint_dir: str = 'ml/checkpoints',
        device: str = 'auto'
    ):
        """
        Args:
            checkpoint_dir: directory holding best_model.pt and, optionally,
                thresholds.json and config.json.
            device: torch device string, or 'auto' to pick CUDA when
                available and CPU otherwise.
        """
        self.checkpoint_dir = Path(checkpoint_dir)

        # Device selection.
        if device == 'auto':
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device)

        self.model = None
        self.thresholds = None
        self.config = None

        # Try to load the model; failures leave self.model as None.
        self._load_model()

    def _load_model(self):
        """Load the model, its config and the error thresholds (best effort)."""
        if not HAS_MODEL:
            print("警告: 无法导入模型模块")
            return

        model_path = self.checkpoint_dir / 'best_model.pt'
        thresholds_path = self.checkpoint_dir / 'thresholds.json'
        config_path = self.checkpoint_dir / 'config.json'

        if not model_path.exists():
            print(f"警告: 模型文件不存在 {model_path}")
            return

        try:
            # Load the configuration (optional).
            if config_path.exists():
                with open(config_path, 'r', encoding='utf-8') as f:
                    self.config = json.load(f)

            # Load the model.
            # NOTE(security): torch.load unpickles the file. Only load
            # checkpoints from trusted sources (consider weights_only=True
            # where the installed torch version supports it).
            checkpoint = torch.load(model_path, map_location=self.device)

            model_config = self.config.get('model', {}) if self.config else {}
            self.model = create_model(model_config)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.model.to(self.device)
            self.model.eval()

            # Load the thresholds (optional).
            if thresholds_path.exists():
                with open(thresholds_path, 'r', encoding='utf-8') as f:
                    self.thresholds = json.load(f)

            print(f"MLScorer 加载成功 (设备: {self.device})")

        except Exception as e:
            # Any failure (bad JSON, missing state-dict key, shape mismatch...)
            # disables the ML path instead of crashing the detector.
            print(f"警告: 模型加载失败 - {e}")
            self.model = None

    def is_ready(self) -> bool:
        """Return True when a model has been loaded successfully."""
        return self.model is not None

    @staticmethod
    def _error_to_score(error, p95, p99):
        """Map reconstruction errors to 0-100 scores.

        Piecewise-linear through (0, 0), (p95, 50), (p99, 80) and
        (1.5 * p99, 100); errors beyond the last anchor saturate at 100.
        Degenerate thresholds fall back to simpler mappings rather than
        dividing by zero.
        """
        if p95 > 0 and p99 > p95:
            return np.interp(
                error,
                [0.0, p95, p99, p99 * 1.5],
                [0.0, 50.0, 80.0, 100.0],
            )
        if p95 > 0:
            # p99 is unusable: single slope anchored at p95 -> 50.
            return np.clip(error / p95 * 50.0, 0.0, 100.0)
        # No usable scale at all: any positive error saturates.
        return np.where(error > 0, 100.0, 0.0)

    @torch.no_grad()
    def score(self, sequence: np.ndarray) -> float:
        """
        Compute the ML score.

        Args:
            sequence: (seq_len, n_features) or (batch, seq_len, n_features)

        Returns:
            A score in 0-100, higher meaning more anomalous. A single
            sequence yields a float; a batch whose size is not 1 yields a
            list of floats. Returns 0.0 when the model is not ready.
        """
        if not self.is_ready():
            return 0.0

        # float32 + contiguous so the array converts cleanly to a tensor.
        sequence = np.ascontiguousarray(sequence, dtype=np.float32)

        # Ensure a 3-D (batch, seq_len, n_features) input.
        if sequence.ndim == 2:
            sequence = sequence[np.newaxis, ...]

        x = torch.from_numpy(sequence).to(self.device)

        # Reconstruction error; the model is called as returning a pair whose
        # first element is the reconstruction (second element is ignored).
        output, _ = self.model(x)
        mse = ((output - x) ** 2).mean(dim=-1)  # (batch, seq_len)

        # Use the error at the last time step.
        error = mse[:, -1].cpu().numpy()

        # Reference thresholds, with fallbacks when none were loaded.
        if self.thresholds:
            p95 = self.thresholds.get('p95', 0.1)
            p99 = self.thresholds.get('p99', 0.2)
        else:
            p95, p99 = 0.1, 0.2

        # p95 -> 50, p99 -> 80, error >= 1.5 * p99 -> 100, error = 0 -> 0.
        score = self._error_to_score(error, p95, p99)

        return float(score[0]) if len(score) == 1 else score.tolist()


class HybridAnomalyDetector:
    """
    Hybrid anomaly detector.

    Combines the rule-based scorer with the ML (LSTM Autoencoder) scorer and
    fuses both scores into a single decision.
    """

    # Default configuration.
    DEFAULT_CONFIG = {
        # Fusion weights
        'rule_weight': 0.6,    # weight of the rule score
        'ml_weight': 0.4,      # weight of the ML score

        # Trigger thresholds
        'rule_trigger': 60,    # rule score that triggers on its own
        'ml_trigger': 80,      # ML score that triggers on its own
        'fusion_trigger': 50,  # fused score that triggers

        # Feature list (not read by this class itself)
        'features': [
            'alpha', 'alpha_delta', 'amt_ratio',
            'amt_delta', 'rank_pct', 'limit_up_ratio'
        ],

        # Sequence length required by the ML model (not read by this class)
        'seq_len': 30,
    }

    def __init__(
        self,
        config: Dict = None,
        checkpoint_dir: str = 'ml/checkpoints',
        device: str = 'auto'
    ):
        """
        Args:
            config: overrides merged on top of DEFAULT_CONFIG.
            checkpoint_dir: directory passed to the ML scorer.
            device: device string passed to the ML scorer.
        """
        self.config = {**self.DEFAULT_CONFIG, **(config or {})}

        # Initialise both scorers.
        self.rule_scorer = RuleBasedScorer()
        self.ml_scorer = MLScorer(checkpoint_dir, device)

        print("HybridAnomalyDetector 初始化完成")
        print(f" 规则权重: {self.config['rule_weight']}")
        print(f" ML权重: {self.config['ml_weight']}")
        print(f" ML模型: {'就绪' if self.ml_scorer.is_ready() else '未加载'}")

    def detect(
        self,
        features: Dict,
        sequence: np.ndarray = None
    ) -> AnomalyResult:
        """
        Detect an anomaly for one observation.

        Args:
            features: feature dict for the current time step.
            sequence: history of shape (seq_len, n_features), needed by the
                ML model. When omitted, only the rules are used.

        Returns:
            AnomalyResult: the detection result. ``anomaly_type`` is an empty
            string when no anomaly is flagged.
        """
        # 1. Rule score.
        rule_score, rule_details = self.rule_scorer.score(features)

        # 2. ML score, only when both a sequence and a loaded model exist.
        ml_used = sequence is not None and self.ml_scorer.is_ready()

        # 3. Fusion weights. Whenever no ML score was actually computed
        #    (model not loaded OR no sequence supplied) the whole weight goes
        #    to the rules; otherwise the rule score would be scaled down by
        #    rule_weight against an ML score of 0 and the fused score would
        #    be artificially deflated.
        if ml_used:
            ml_score = self.ml_scorer.score(sequence)
            w1 = self.config['rule_weight']
            w2 = self.config['ml_weight']
        else:
            ml_score = 0.0
            w1, w2 = 1.0, 0.0

        final_score = w1 * rule_score + w2 * ml_score

        # 4. Decision: strong rule signal, then strong ML signal, then fusion.
        is_anomaly = False
        trigger_reason = ''

        if rule_score >= self.config['rule_trigger']:
            is_anomaly = True
            trigger_reason = f'规则强信号({rule_score:.0f}分)'
        elif ml_score >= self.config['ml_trigger']:
            is_anomaly = True
            trigger_reason = f'ML强信号({ml_score:.0f}分)'
        elif final_score >= self.config['fusion_trigger']:
            is_anomaly = True
            trigger_reason = f'融合触发({final_score:.0f}分)'

        # 5. Anomaly type (only meaningful when an anomaly was flagged).
        anomaly_type = self.rule_scorer.get_anomaly_type(features) if is_anomaly else ''

        return AnomalyResult(
            is_anomaly=is_anomaly,
            final_score=final_score,
            rule_score=rule_score,
            ml_score=ml_score,
            trigger_reason=trigger_reason,
            rule_details=rule_details,
            anomaly_type=anomaly_type
        )

    def detect_batch(
        self,
        features_list: List[Dict],
        sequences: np.ndarray = None
    ) -> List[AnomalyResult]:
        """
        Detect anomalies for a batch of observations.

        Args:
            features_list: list of feature dicts.
            sequences: (batch, seq_len, n_features); indexed in step with
                ``features_list``. When omitted, only the rules are used.

        Returns:
            List[AnomalyResult], one per entry of ``features_list``.
        """
        return [
            self.detect(features, sequences[i] if sequences is not None else None)
            for i, features in enumerate(features_list)
        ]


# ==================== Convenience functions ====================

def create_detector(
    checkpoint_dir: str = 'ml/checkpoints',
    config: Dict = None
) -> HybridAnomalyDetector:
    """Create a hybrid detector.

    Args:
        checkpoint_dir: directory with the ML model checkpoint files.
        config: optional overrides for the detector's default configuration.

    Returns:
        A HybridAnomalyDetector using the default device selection.
    """
    return HybridAnomalyDetector(config=config, checkpoint_dir=checkpoint_dir)


def quick_detect(features: Dict, threshold: float = 50) -> bool:
    """
    Quick detection using the rules only (no ML model required).

    Suitable for:
    - real-time detection;
    - the period before the ML model has finished training.

    Args:
        features: feature dict for the current time step.
        threshold: minimum rule score (0-100) that counts as an anomaly.
            Defaults to 50.

    Returns:
        True when the rule score reaches ``threshold``.
    """
    rule_score, _ = RuleBasedScorer().score(features)
    return rule_score >= threshold


# ==================== Demo / smoke test ====================

if __name__ == "__main__":
    print("=" * 60)
    print("融合异动检测器测试")
    print("=" * 60)

    # Build the detector (falls back to rules only if no model is available).
    detector = create_detector()

    # Hand-written scenarios; no history sequence is supplied, so only the
    # rule path is exercised here.
    test_cases = [
        {
            'name': '正常情况',
            'features': {
                'alpha': 0.5,
                'alpha_delta': 0.1,
                'amt_ratio': 1.2,
                'amt_delta': 0.1,
                'rank_pct': 0.5,
                'limit_up_ratio': 0.02
            }
        },
        {
            'name': 'Alpha异动',
            'features': {
                'alpha': 3.5,
                'alpha_delta': 0.8,
                'amt_ratio': 2.5,
                'amt_delta': 0.5,
                'rank_pct': 0.92,
                'limit_up_ratio': 0.05
            }
        },
        {
            'name': '放量异动',
            'features': {
                'alpha': 1.2,
                'alpha_delta': 0.3,
                'amt_ratio': 6.0,
                'amt_delta': 1.5,
                'rank_pct': 0.85,
                'limit_up_ratio': 0.08
            }
        },
        {
            'name': '涨停潮',
            'features': {
                'alpha': 2.5,
                'alpha_delta': 0.6,
                'amt_ratio': 3.5,
                'amt_delta': 0.8,
                'rank_pct': 0.98,
                'limit_up_ratio': 0.25
            }
        },
    ]

    print("\n测试结果:")
    print("-" * 60)

    for case in test_cases:
        result = detector.detect(case['features'])

        print(f"\n{case['name']}:")
        print(f" 异动: {'是' if result.is_anomaly else '否'}")
        print(f" 最终得分: {result.final_score:.1f}")
        print(f" 规则得分: {result.rule_score:.1f}")
        print(f" ML得分: {result.ml_score:.1f}")
        if result.is_anomaly:
            print(f" 触发原因: {result.trigger_reason}")
            print(f" 异动类型: {result.anomaly_type}")
            print(f" 触发规则: {list(result.rule_details.keys())}")

    print("\n" + "=" * 60)
    print("测试完成!")