update app_vx

This commit is contained in:
2025-11-13 07:40:46 +08:00
parent 926ffa1b8f
commit d64349b606
7 changed files with 13787 additions and 112 deletions


@@ -0,0 +1,540 @@
"""
ClickHouse 查询优化方案 - 针对 /api/event/<int:event_id>/related-stocks-detail
问题分析:
1. N+1 查询问题:每只股票执行 3 次独立查询(共 30+ 次)
2. 重复扫描first_price 和 last_price 需要扫描表两次
3. 缺少批量查询优化
优化方案对比:
┌─────────────┬──────────────┬──────────────┬────────────┐
│ 方案 │ 查询次数 │ 性能提升 │ 实现难度 │
├─────────────┼──────────────┼──────────────┼────────────┤
│ 当前代码 │ N * 3 │ 基准 │ - │
│ 方案1 批量 │ 1 │ 80-90% │ 中等 │
│ 方案2 并行 │ N * 3 (并行)│ 40-60% │ 简单 │
│ 方案3 缓存 │ 减少重复 │ 20-40% │ 简单 │
└─────────────┴──────────────┴──────────────┴────────────┘
"""
# ============================================================================
# Option 1: Batch query (recommended) - merge all per-stock queries into one
# ============================================================================
def get_batch_stock_prices_optimized(client, stock_codes, start_datetime, end_datetime):
"""
批量获取多只股票的价格数据(一次查询)
性能对比:
- 旧方案10 只股票 = 20 次查询first + last
- 新方案10 只股票 = 1 次查询
- 性能提升:约 20 倍
Args:
client: ClickHouse 客户端
stock_codes: 股票代码列表 ['600519.SH', '601088.SH', ...]
start_datetime: 开始时间
end_datetime: 结束时间
Returns:
dict: {
'600519.SH': {
'first_price': 1850.0,
'last_price': 1860.0,
'change_pct': 0.54,
'open': 1850.0,
'high': 1865.0,
'low': 1848.0,
'volume': 1234567,
'amount': 2345678900.0
},
...
}
"""
if not stock_codes:
return {}
    # Build the batch SQL (one IN clause covers all codes)
    query = """
    SELECT
        code,
        -- first price in the window (at event time)
        anyIf(close, rownum_asc = 1) as first_price,
        -- latest price in the window
        anyIf(close, rownum_desc = 1) as last_price,
        -- percentage change
        (last_price - first_price) / first_price * 100 as change_pct,
        -- absolute change
        last_price - first_price as change_amount,
        -- remaining fields, taken from the latest row
        anyIf(open, rownum_desc = 1) as open_price,
        anyIf(high, rownum_desc = 1) as high_price,
        anyIf(low, rownum_desc = 1) as low_price,
        anyIf(volume, rownum_desc = 1) as volume,
        anyIf(amt, rownum_desc = 1) as amount
    FROM (
        SELECT
            code,
            timestamp,
            close,
            open,
            high,
            low,
            volume,
            amt,
            -- ascending rank, used to pick the first row
            ROW_NUMBER() OVER (PARTITION BY code ORDER BY timestamp ASC) as rownum_asc,
            -- descending rank, used to pick the last row
            ROW_NUMBER() OVER (PARTITION BY code ORDER BY timestamp DESC) as rownum_desc
        FROM stock_minute
        WHERE code IN %(codes)s
          AND timestamp >= %(start)s
          AND timestamp <= %(end)s
    )
    GROUP BY code
    """
    try:
        # Run the single batch query
        data = client.execute(query, {
            'codes': tuple(stock_codes),  # ClickHouse IN expects a tuple
            'start': start_datetime,
            'end': end_datetime
        })
        # Shape the rows into a dict keyed by stock code;
        # "is not None" keeps legitimate zero values instead of dropping them
        result = {}
        for row in data:
            code = row[0]
            result[code] = {
                'first_price': float(row[1]) if row[1] is not None else None,
                'last_price': float(row[2]) if row[2] is not None else None,
                'change_pct': float(row[3]) if row[3] is not None else None,
                'change_amount': float(row[4]) if row[4] is not None else None,
                'open_price': float(row[5]) if row[5] is not None else None,
                'high_price': float(row[6]) if row[6] is not None else None,
                'low_price': float(row[7]) if row[7] is not None else None,
                'volume': int(row[8]) if row[8] is not None else None,
                'amount': float(row[9]) if row[9] is not None else None,
            }
        print(f"Batch query done: fetched data for {len(result)} stocks")
        return result
    except Exception as e:
        print(f"Batch query failed: {e}")
        return {}
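# A minimal usage sketch for the batch helper above. The client construction is
# an assumption (clickhouse-driver); host/credentials are placeholders.
def example_batch_usage():
    from datetime import datetime
    from clickhouse_driver import Client

    client = Client(host='localhost')  # hypothetical connection parameters
    prices = get_batch_stock_prices_optimized(
        client,
        ['600519.SH', '601088.SH'],
        datetime(2025, 11, 13, 9, 30),
        datetime(2025, 11, 13, 15, 0),
    )
    for code, info in prices.items():
        print(code, info['last_price'], info['change_pct'])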
def get_batch_minute_chart_data(client, stock_codes, start_datetime, end_datetime):
"""
批量获取多只股票的分时图数据
Args:
client: ClickHouse 客户端
stock_codes: 股票代码列表
start_datetime: 开始时间
end_datetime: 结束时间
Returns:
dict: {
'600519.SH': [
{'time': '09:30', 'close': 1850.0, 'volume': 12345, ...},
{'time': '09:31', 'close': 1851.0, 'volume': 12346, ...},
...
],
...
}
"""
if not stock_codes:
return {}
query = """
SELECT
code,
timestamp,
open,
high,
low,
close,
volume,
amt
FROM stock_minute
WHERE code IN %(codes)s
AND timestamp >= %(start)s
AND timestamp <= %(end)s
ORDER BY code, timestamp
"""
    try:
        data = client.execute(query, {
            'codes': tuple(stock_codes),
            'start': start_datetime,
            'end': end_datetime
        })
        # Group rows by stock code
        result = {}
        for row in data:
            code = row[0]
            if code not in result:
                result[code] = []
            result[code].append({
                'time': row[1].strftime('%H:%M'),
                'open': float(row[2]) if row[2] is not None else None,
                'high': float(row[3]) if row[3] is not None else None,
                'low': float(row[4]) if row[4] is not None else None,
                'close': float(row[5]) if row[5] is not None else None,
                'volume': float(row[6]) if row[6] is not None else None,
                'amount': float(row[7]) if row[7] is not None else None
            })
        print(f"Batch minute-chart query done: fetched data for {len(result)} stocks")
        return result
    except Exception as e:
        print(f"Batch minute-chart query failed: {e}")
        return {}
# ============================================================================
# Usage example: replaces the original per-stock for loop
# ============================================================================
def api_event_related_stocks_optimized(event_id):
"""优化后的端点实现"""
try:
from datetime import datetime
event = Event.query.get_or_404(event_id)
related_stocks = event.related_stocks.order_by(RelatedStock.correlation.desc()).all()
if not related_stocks:
return jsonify({'code': 200, 'data': {'related_stocks': []}})
# 获取 ClickHouse 客户端
client = get_clickhouse_client()
# 计算时间范围(省略交易日计算逻辑,与原代码相同)
event_time = event.start_time if event.start_time else event.created_at
trading_day, start_time, end_time = get_trading_day_and_times(event_time)
start_datetime = datetime.combine(trading_day, start_time)
end_datetime = datetime.combine(trading_day, end_time)
# ✅ 批量查询所有股票的价格数据(只查询 1 次)
stock_codes = [stock.stock_code for stock in related_stocks]
prices_data = get_batch_stock_prices_optimized(
client, stock_codes, start_datetime, end_datetime
)
# ✅ 批量查询所有股票的分时图数据(只查询 1 次)
minute_data = get_batch_minute_chart_data(
client, stock_codes, start_datetime, end_datetime
)
        # Assemble the response
        stocks_data = []
        for stock in related_stocks:
            # Pull everything from the batch results; no further ClickHouse queries
            price_info = prices_data.get(stock.stock_code, {})
            chart_data = minute_data.get(stock.stock_code, [])

            # Look up basic stock info (a batching candidate too; see the sketch after this function)
            stock_info = StockBasicInfo.query.filter_by(SECCODE=stock.stock_code).first()
            if not stock_info:
                base_code = stock.stock_code.split('.')[0]
                stock_info = StockBasicInfo.query.filter_by(SECCODE=base_code).first()

            stock_data = {
                'id': stock.id,
                'stock_code': stock.stock_code,
                'stock_name': stock.stock_name,
                'sector': stock.sector,
                'relation_desc': stock.relation_desc,
                'correlation': stock.correlation,
                'momentum': stock.momentum,
                'listing_date': stock_info.F006D.isoformat() if stock_info and stock_info.F006D else None,
                'market': stock_info.F005V if stock_info else None,
                # Trade data (from the batch results)
                'trade_data': {
                    'latest_price': price_info.get('last_price'),
                    'first_price': price_info.get('first_price'),
                    'open_price': price_info.get('open_price'),
                    'high_price': price_info.get('high_price'),
                    'low_price': price_info.get('low_price'),
                    # "is not None" so a legitimate 0.0 change is not dropped
                    'change_amount': round(price_info['change_amount'], 2) if price_info.get('change_amount') is not None else None,
                    'change_pct': round(price_info['change_pct'], 2) if price_info.get('change_pct') is not None else None,
                    'volume': price_info.get('volume'),
                    'amount': price_info.get('amount'),
                    'trade_date': trading_day.isoformat(),
                },
                # Minute chart data
                'minute_chart': chart_data
            }
            stocks_data.append(stock_data)
        return jsonify({
            'code': 200,
            'message': 'success',
            'data': {
                'event_id': event_id,
                'event_title': event.title,
                'related_stocks': stocks_data,
                'total_count': len(stocks_data)
            }
        })
    except Exception as e:
        print(f"Error in api_event_related_stocks_optimized: {e}")
        return jsonify({'code': 500, 'message': str(e)}), 500
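# The endpoint above still issues one StockBasicInfo lookup per stock. A sketch
# of batching it with SQLAlchemy's in_() (assumes the StockBasicInfo model used
# above; also covers the suffix-stripped fallback in the same round trip):
def get_stock_basic_info_batch(stock_codes):
    """Return {requested_code: StockBasicInfo or None} with a single query."""
    base_codes = [code.split('.')[0] for code in stock_codes]
    rows = StockBasicInfo.query.filter(
        StockBasicInfo.SECCODE.in_(set(stock_codes) | set(base_codes))
    ).all()
    by_code = {row.SECCODE: row for row in rows}
    return {
        code: by_code.get(code) or by_code.get(code.split('.')[0])
        for code in stock_codes
    }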
# ============================================================================
# Option 2: Parallel queries (for cases where a single batch query isn't possible)
# ============================================================================
from concurrent.futures import ThreadPoolExecutor

def get_stock_price_async(client, stock_code, start_datetime, end_datetime):
    """Per-stock query worker.

    Note: clickhouse-driver's Client is not thread-safe, so workers should not
    share a single client; see the per-connection sketch after
    get_all_stocks_parallel below.
    """
    # Same query logic as the original code
    try:
        data = client.execute("""
            WITH first_price AS (
                SELECT close FROM stock_minute WHERE code = %(code)s ...
            )
            ...
        """, {'code': stock_code, 'start': start_datetime, 'end': end_datetime})
        return stock_code, data
    except Exception:
        return stock_code, None
def get_all_stocks_parallel(client, stock_codes, start_datetime, end_datetime):
    """
    Query multiple stocks concurrently with a thread pool.

    Performance:
    - Serial: 10 stocks x 0.1 s = 1 s
    - Parallel: max(0.1 s) = 0.1 s (about 10x faster)
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit one query task per stock
        futures = [
            executor.submit(get_stock_price_async, client, code, start_datetime, end_datetime)
            for code in stock_codes
        ]
        # Wait for all tasks to complete
        results = {}
        for future in futures:
            stock_code, data = future.result()
            results[stock_code] = data
        return results
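# A thread-safe variant sketch: clickhouse-driver clients are not safe to share
# across threads, so each worker opens (and closes) its own connection. The
# Client(host=...) parameters are assumptions for illustration.
from clickhouse_driver import Client

def get_all_stocks_parallel_safe(stock_codes, start_datetime, end_datetime):
    def worker(code):
        local_client = Client(host='localhost')  # hypothetical connection parameters
        try:
            _, data = get_stock_price_async(local_client, code, start_datetime, end_datetime)
            return code, data
        finally:
            local_client.disconnect()

    with ThreadPoolExecutor(max_workers=10) as executor:
        return dict(executor.map(worker, stock_codes))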
# ============================================================================
# Option 3: Add a cache layer (Redis)
# ============================================================================
import redis
import json
redis_client = redis.Redis(host='localhost', port=6379, db=0)
def get_stock_price_with_cache(client, stock_code, start_datetime, end_datetime):
    """
    Cached query (best suited to historical data).

    Cache policy:
    - historical (not today's) data: cache for 24 hours
    - today's data: cache for 1 minute

    See the MGET-based batch variant after this function.
    """
    from datetime import datetime

    # Build the cache key
    cache_key = f"stock_price:{stock_code}:{start_datetime.date()}:{end_datetime.date()}"

    # Try the cache first
    cached_data = redis_client.get(cache_key)
    if cached_data:
        print(f"Cache hit for {stock_code}")
        return json.loads(cached_data)

    # Cache miss: query ClickHouse
    print(f"Querying ClickHouse for {stock_code}")
    data = client.execute("""...""", {
        'code': stock_code,
        'start': start_datetime,
        'end': end_datetime
    })

    # Shape the row into a dict
    result = {
        'first_price': float(data[0][2]) if data else None,
        'last_price': float(data[0][0]) if data else None,
        # ...
    }

    # Write to the cache
    is_today = start_datetime.date() == datetime.now().date()
    ttl = 60 if is_today else 86400  # today's data: 1 minute; history: 24 hours
    redis_client.setex(cache_key, ttl, json.dumps(result))
    return result
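# A sketch combining options 1 and 3: batched cache lookups via MGET, so N
# stocks cost one Redis round trip plus a single ClickHouse query for the
# misses. Reuses the cache_key scheme and TTL policy from above.
def get_batch_prices_with_cache(client, stock_codes, start_datetime, end_datetime):
    from datetime import datetime

    def make_key(code):
        return f"stock_price:{code}:{start_datetime.date()}:{end_datetime.date()}"

    cached = redis_client.mget([make_key(code) for code in stock_codes])
    result, misses = {}, []
    for code, raw in zip(stock_codes, cached):
        if raw:
            result[code] = json.loads(raw)
        else:
            misses.append(code)

    if misses:
        fresh = get_batch_stock_prices_optimized(client, misses, start_datetime, end_datetime)
        is_today = start_datetime.date() == datetime.now().date()
        ttl = 60 if is_today else 86400
        for code, info in fresh.items():
            redis_client.setex(make_key(code), ttl, json.dumps(info))
        result.update(fresh)
    return result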
# ============================================================================
# Option 4: ClickHouse query tuning (index hints)
# ============================================================================
def get_stock_price_with_hints(client, stock_code, start_datetime, end_datetime):
    """
    Use ClickHouse-specific features to speed up the query.

    Tuning points:
    1. PREWHERE clause (filters early, scanning less data)
    2. FINAL modifier (if the table is a ReplacingMergeTree)
    3. Partition pruning (if the table is partitioned by date)

    Note: any()/anyLast() depend on read order; a deterministic variant
    follows this function.
    """
    query = """
    SELECT
        code,
        anyLast(close) as last_price,
        any(close) as first_price,
        (last_price - first_price) / first_price * 100 as change_pct
    FROM stock_minute
    PREWHERE code = %(code)s  -- PREWHERE filters before reading the remaining columns
    WHERE timestamp >= %(start)s
      AND timestamp <= %(end)s
    GROUP BY code
    SETTINGS max_threads = 2  -- cap threads to limit resource contention
    """
    data = client.execute(query, {
        'code': stock_code,
        'start': start_datetime,
        'end': end_datetime
    })
    return data
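# Caveat: any()/anyLast() return an arbitrary matching row unless reads happen
# to be ordered, so first/last prices are not guaranteed. A deterministic
# variant using argMin/argMax keyed on timestamp (same table and parameters):
def get_stock_price_deterministic(client, stock_code, start_datetime, end_datetime):
    query = """
    SELECT
        code,
        argMin(close, timestamp) as first_price,  -- close at the earliest timestamp
        argMax(close, timestamp) as last_price,   -- close at the latest timestamp
        (last_price - first_price) / first_price * 100 as change_pct
    FROM stock_minute
    PREWHERE code = %(code)s
    WHERE timestamp >= %(start)s
      AND timestamp <= %(end)s
    GROUP BY code
    """
    return client.execute(query, {
        'code': stock_code,
        'start': start_datetime,
        'end': end_datetime
    })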
# ============================================================================
# Database-level optimization recommendations
# ============================================================================
"""
1. 确保 stock_minute 表有以下索引:
- PRIMARY KEY (code, timestamp) -- 主键索引
- INDEX idx_timestamp timestamp TYPE minmax GRANULARITY 3 -- 时间索引
2. 表分区策略(如果数据量大):
CREATE TABLE stock_minute (
code String,
timestamp DateTime,
...
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp) -- 按月分区
ORDER BY (code, timestamp)
SETTINGS index_granularity = 8192;
3. 使用物化视图预计算(适用于固定查询模式):
CREATE MATERIALIZED VIEW stock_minute_summary
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMMDD(timestamp)
ORDER BY (code, timestamp)
AS SELECT
code,
toStartOfMinute(timestamp) as minute,
anyLast(close) as last_close,
any(close) as first_close,
...
FROM stock_minute
GROUP BY code, minute;
4. 检查表统计信息:
SELECT
table,
partition,
rows,
bytes_on_disk
FROM system.parts
WHERE table = 'stock_minute';
"""
# ============================================================================
# Performance benchmark
# ============================================================================
def benchmark_query_methods():
    """
    Compare the query strategies.

    Scenario: fetch price data for 10 stocks.
    Expected:
    - original (serial N+1): ~1000 ms
    - option 1 (batch query): ~50 ms (about 20x faster)
    - option 2 (parallel queries): ~200 ms (about 5x faster)
    - option 3 (with cache): ~10 ms on the second request (about 100x faster)

    `client`, `start_dt`, `end_dt` and `get_stock_price_original` are assumed
    to be provided by the surrounding environment.
    """
    import time
    stock_codes = ['600519.SH', '601088.SH', '600276.SH', '000001.SZ', ...]

    # Option 1: batch query
    start = time.time()
    result1 = get_batch_stock_prices_optimized(client, stock_codes, start_dt, end_dt)
    print(f"Batch query took: {(time.time() - start) * 1000:.2f}ms")

    # Option 2: parallel queries
    start = time.time()
    result2 = get_all_stocks_parallel(client, stock_codes, start_dt, end_dt)
    print(f"Parallel queries took: {(time.time() - start) * 1000:.2f}ms")

    # Original: serial queries
    start = time.time()
    result3 = {}
    for code in stock_codes:
        result3[code] = get_stock_price_original(client, code, start_dt, end_dt)
    print(f"Serial queries took: {(time.time() - start) * 1000:.2f}ms")
# ============================================================================
# Summary and recommendations
# ============================================================================
"""
推荐实施顺序:
第一步(立即实施):方案 1 - 批量查询
- 实现难度:中等
- 性能提升80-90%
- 风险:低
- 时间1-2 小时
第二步(可选):方案 3 - 添加缓存
- 实现难度:简单
- 性能提升:额外 20-40%
- 风险:低
- 时间30 分钟
第三步(长期):方案 4 - 数据库优化
- 实现难度:中等
- 性能提升20-30%
- 风险:中(需要测试)
- 时间2-4 小时
监控指标:
- 查询时间:目标 < 200ms当前 > 1000ms
- ClickHouse 查询次数:目标 1-2 次(当前 30+ 次)
- 缓存命中率:目标 > 80%(如果使用缓存)
"""