"""
ClickHouse 查询优化方案 - 针对 /api/event/<int:event_id>/related-stocks-detail
问题分析:
1. N+1 查询问题:每只股票执行 3 次独立查询(共 30+ 次)
2. 重复扫描first_price 和 last_price 需要扫描表两次
3. 缺少批量查询优化
优化方案对比:
┌─────────────┬──────────────┬──────────────┬────────────┐
│ 方案 │ 查询次数 │ 性能提升 │ 实现难度 │
├─────────────┼──────────────┼──────────────┼────────────┤
│ 当前代码 │ N * 3 │ 基准 │ - │
│ 方案1 批量 │ 1 │ 80-90% │ 中等 │
│ 方案2 并行 │ N * 3 (并行)│ 40-60% │ 简单 │
│ 方案3 缓存 │ 减少重复 │ 20-40% │ 简单 │
└─────────────┴──────────────┴──────────────┴────────────┘
"""
# ============================================================================
# Option 1: Batch query (recommended) - merge all per-stock queries into one
# ============================================================================
def get_batch_stock_prices_optimized(client, stock_codes, start_datetime, end_datetime):
"""
批量获取多只股票的价格数据(一次查询)
性能对比:
- 旧方案10 只股票 = 20 次查询first + last
- 新方案10 只股票 = 1 次查询
- 性能提升:约 20 倍
Args:
client: ClickHouse 客户端
stock_codes: 股票代码列表 ['600519.SH', '601088.SH', ...]
start_datetime: 开始时间
end_datetime: 结束时间
Returns:
dict: {
'600519.SH': {
'first_price': 1850.0,
'last_price': 1860.0,
'change_pct': 0.54,
'open': 1850.0,
'high': 1865.0,
'low': 1848.0,
'volume': 1234567,
'amount': 2345678900.0
},
...
}
"""
if not stock_codes:
return {}
    # Build the batch query (filter with a single IN clause)
    query = """
        SELECT
            code,
            -- first price in the window (at event time)
            anyIf(close, rownum_asc = 1) as first_price,
            -- last price in the window (most recent)
            anyIf(close, rownum_desc = 1) as last_price,
            -- percentage change (ClickHouse allows reusing SELECT aliases)
            (last_price - first_price) / first_price * 100 as change_pct,
            -- absolute change
            last_price - first_price as change_amount,
            -- other fields, taken from the latest row
            anyIf(open, rownum_desc = 1) as open_price,
            anyIf(high, rownum_desc = 1) as high_price,
            anyIf(low, rownum_desc = 1) as low_price,
            anyIf(volume, rownum_desc = 1) as volume,
            anyIf(amt, rownum_desc = 1) as amount
        FROM (
            SELECT
                code,
                timestamp,
                close,
                open,
                high,
                low,
                volume,
                amt,
                -- ascending rank (used to pick the first price)
                ROW_NUMBER() OVER (PARTITION BY code ORDER BY timestamp ASC) as rownum_asc,
                -- descending rank (used to pick the last price)
                ROW_NUMBER() OVER (PARTITION BY code ORDER BY timestamp DESC) as rownum_desc
            FROM stock_minute
            WHERE code IN %(codes)s
              AND timestamp >= %(start)s
              AND timestamp <= %(end)s
        )
        GROUP BY code
    """
    try:
        # Execute the batch query
        data = client.execute(query, {
            'codes': tuple(stock_codes),  # clickhouse-driver expects a tuple for IN
            'start': start_datetime,
            'end': end_datetime
        })
        # Shape the rows into a dict keyed by stock code
        # (compare against None explicitly so a legitimate 0 is not dropped)
        result = {}
        for row in data:
            code = row[0]
            result[code] = {
                'first_price': float(row[1]) if row[1] is not None else None,
                'last_price': float(row[2]) if row[2] is not None else None,
                'change_pct': float(row[3]) if row[3] is not None else None,
                'change_amount': float(row[4]) if row[4] is not None else None,
                'open_price': float(row[5]) if row[5] is not None else None,
                'high_price': float(row[6]) if row[6] is not None else None,
                'low_price': float(row[7]) if row[7] is not None else None,
                'volume': int(row[8]) if row[8] is not None else None,
                'amount': float(row[9]) if row[9] is not None else None,
            }
        print(f"Batch query done, fetched data for {len(result)} stocks")
        return result
    except Exception as e:
        print(f"Batch query failed: {e}")
        return {}
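
# A minimal usage sketch for the batch helper above, assuming the
# clickhouse-driver package and a placeholder host; any client object with a
# compatible execute() works. Not part of the original endpoint.
def _example_batch_price_lookup():
    from datetime import datetime
    from clickhouse_driver import Client  # assumed driver

    client = Client(host='localhost')  # placeholder connection settings
    prices = get_batch_stock_prices_optimized(
        client,
        ['600519.SH', '601088.SH'],
        datetime(2025, 11, 13, 9, 30),
        datetime(2025, 11, 13, 15, 0),
    )
    for code, info in prices.items():
        print(code, info['last_price'], info['change_pct'])
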
def get_batch_minute_chart_data(client, stock_codes, start_datetime, end_datetime):
"""
批量获取多只股票的分时图数据
Args:
client: ClickHouse 客户端
stock_codes: 股票代码列表
start_datetime: 开始时间
end_datetime: 结束时间
Returns:
dict: {
'600519.SH': [
{'time': '09:30', 'close': 1850.0, 'volume': 12345, ...},
{'time': '09:31', 'close': 1851.0, 'volume': 12346, ...},
...
],
...
}
"""
if not stock_codes:
return {}
query = """
SELECT
code,
timestamp,
open,
high,
low,
close,
volume,
amt
FROM stock_minute
WHERE code IN %(codes)s
AND timestamp >= %(start)s
AND timestamp <= %(end)s
ORDER BY code, timestamp
"""
    try:
        data = client.execute(query, {
            'codes': tuple(stock_codes),
            'start': start_datetime,
            'end': end_datetime
        })
        # Group rows by stock code
        result = {}
        for row in data:
            code = row[0]
            if code not in result:
                result[code] = []
            result[code].append({
                'time': row[1].strftime('%H:%M'),
                'open': float(row[2]) if row[2] is not None else None,
                'high': float(row[3]) if row[3] is not None else None,
                'low': float(row[4]) if row[4] is not None else None,
                'close': float(row[5]) if row[5] is not None else None,
                'volume': float(row[6]) if row[6] is not None else None,
                'amount': float(row[7]) if row[7] is not None else None
            })
        print(f"Batch minute-chart query done, fetched data for {len(result)} stocks")
        return result
    except Exception as e:
        print(f"Batch minute-chart query failed: {e}")
        return {}
# ============================================================================
# Usage example: replaces the original per-stock for loop
# ============================================================================
def api_event_related_stocks_optimized(event_id):
    """Optimized implementation of the endpoint."""
    try:
        from datetime import datetime
        event = Event.query.get_or_404(event_id)
        related_stocks = event.related_stocks.order_by(RelatedStock.correlation.desc()).all()
        if not related_stocks:
            return jsonify({'code': 200, 'data': {'related_stocks': []}})
        # Get a ClickHouse client
        client = get_clickhouse_client()
        # Compute the time range (trading-day logic omitted; identical to the original code)
        event_time = event.start_time if event.start_time else event.created_at
        trading_day, start_time, end_time = get_trading_day_and_times(event_time)
        start_datetime = datetime.combine(trading_day, start_time)
        end_datetime = datetime.combine(trading_day, end_time)
        # ✅ Batch-fetch price data for all stocks (1 query total)
        stock_codes = [stock.stock_code for stock in related_stocks]
        prices_data = get_batch_stock_prices_optimized(
            client, stock_codes, start_datetime, end_datetime
        )
        # ✅ Batch-fetch minute-chart data for all stocks (1 query total)
        minute_data = get_batch_minute_chart_data(
            client, stock_codes, start_datetime, end_datetime
        )
        # Assemble the response
        stocks_data = []
        for stock in related_stocks:
            # Read from the batch results (no further ClickHouse queries)
            price_info = prices_data.get(stock.stock_code, {})
            chart_data = minute_data.get(stock.stock_code, [])
            # Per-stock basic-info lookup (could also be batched; see the
            # sketch after this function)
            stock_info = StockBasicInfo.query.filter_by(SECCODE=stock.stock_code).first()
            if not stock_info:
                base_code = stock.stock_code.split('.')[0]
                stock_info = StockBasicInfo.query.filter_by(SECCODE=base_code).first()
            stock_data = {
                'id': stock.id,
                'stock_code': stock.stock_code,
                'stock_name': stock.stock_name,
                'sector': stock.sector,
                'relation_desc': stock.relation_desc,
                'correlation': stock.correlation,
                'momentum': stock.momentum,
                'listing_date': stock_info.F006D.isoformat() if stock_info and stock_info.F006D else None,
                'market': stock_info.F005V if stock_info else None,
                # Trade data from the batch result (compare against None so a
                # zero change survives as 0.0 instead of collapsing to None)
                'trade_data': {
                    'latest_price': price_info.get('last_price'),
                    'first_price': price_info.get('first_price'),
                    'open_price': price_info.get('open_price'),
                    'high_price': price_info.get('high_price'),
                    'low_price': price_info.get('low_price'),
                    'change_amount': round(price_info['change_amount'], 2) if price_info.get('change_amount') is not None else None,
                    'change_pct': round(price_info['change_pct'], 2) if price_info.get('change_pct') is not None else None,
                    'volume': price_info.get('volume'),
                    'amount': price_info.get('amount'),
                    'trade_date': trading_day.isoformat(),
                },
                # Intraday chart data
                'minute_chart': chart_data
            }
            stocks_data.append(stock_data)
stocks_data.append(stock_data)
return jsonify({
'code': 200,
'message': 'success',
'data': {
'event_id': event_id,
'event_title': event.title,
'related_stocks': stocks_data,
'total_count': len(stocks_data)
}
})
except Exception as e:
print(f"Error in api_event_related_stocks_optimized: {e}")
return jsonify({'code': 500, 'message': str(e)}), 500
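
# The per-stock StockBasicInfo lookups above are still an N+1 against the
# relational database. A minimal sketch of batching them into one IN query,
# assuming the SQLAlchemy StockBasicInfo model used above; the fallback to the
# bare code (without the exchange suffix) is preserved.
def get_basic_info_map(stock_codes):
    """Return {queried_code: StockBasicInfo or None} with a single query."""
    base_codes = [code.split('.')[0] for code in stock_codes]
    rows = StockBasicInfo.query.filter(
        StockBasicInfo.SECCODE.in_(stock_codes + base_codes)
    ).all()
    by_seccode = {row.SECCODE: row for row in rows}
    # Prefer the exact code; fall back to the code without the suffix
    return {
        code: by_seccode.get(code) or by_seccode.get(code.split('.')[0])
        for code in stock_codes
    }
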
# ============================================================================
# Option 2: Parallel queries (for cases where a single batch query won't fit)
# ============================================================================
from concurrent.futures import ThreadPoolExecutor
def get_stock_price_async(client, stock_code, start_datetime, end_datetime):
    """Per-stock query function, run inside worker threads."""
    # Same query logic as the original code
try:
data = client.execute("""
WITH first_price AS (
SELECT close FROM stock_minute WHERE code = %(code)s ...
)
...
""", {'code': stock_code, 'start': start_datetime, 'end': end_datetime})
return stock_code, data
except Exception as e:
return stock_code, None
def get_all_stocks_parallel(client, stock_codes, start_datetime, end_datetime):
    """
    Query multiple stocks in parallel using a thread pool.

    Performance comparison:
    - Serial:   10 stocks * 0.1 s = 1 s
    - Parallel: max(0.1 s) = 0.1 s (roughly 10x)

    Caveat: a single clickhouse-driver Client is not safe to share across
    threads; give each worker its own connection (see the sketch after this
    function).
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all query tasks
        futures = [
            executor.submit(get_stock_price_async, client, code, start_datetime, end_datetime)
            for code in stock_codes
        ]
        # Wait for every task to finish
        results = {}
        for future in futures:
            stock_code, data = future.result()
            results[stock_code] = data
        return results
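
# A thread-safe variant sketch, assuming the clickhouse-driver package: each
# worker opens its own Client instead of sharing one connection across
# threads. The host argument is a placeholder.
def get_all_stocks_parallel_safe(stock_codes, start_datetime, end_datetime,
                                 host='localhost', max_workers=10):
    from clickhouse_driver import Client  # assumed driver

    def worker(code):
        # One connection per task; cheap relative to the query itself
        client = Client(host=host)
        try:
            _, data = get_stock_price_async(client, code, start_datetime, end_datetime)
            return code, data
        finally:
            client.disconnect()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return dict(executor.map(worker, stock_codes))
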
# ============================================================================
# Option 3: Add a cache layer (Redis)
# ============================================================================
import redis
import json
redis_client = redis.Redis(host='localhost', port=6379, db=0)
def get_stock_price_with_cache(client, stock_code, start_datetime, end_datetime):
    """
    Cached single-stock query (best suited to historical data).

    Cache policy:
    - Historical data (not today): cache for 24 hours
    - Today's data: cache for 1 minute
    """
    from datetime import datetime
    # Build the cache key
    cache_key = f"stock_price:{stock_code}:{start_datetime.date()}:{end_datetime.date()}"
    # Try the cache first
    cached_data = redis_client.get(cache_key)
    if cached_data:
        print(f"Cache hit for {stock_code}")
        return json.loads(cached_data)
    # Cache miss: query ClickHouse
    print(f"Querying ClickHouse for {stock_code}")
    data = client.execute("""...""", {
        'code': stock_code,
        'start': start_datetime,
        'end': end_datetime
    })
    # Shape the result
    result = {
        'first_price': float(data[0][2]) if data else None,
        'last_price': float(data[0][0]) if data else None,
        # ...
    }
    # Write to the cache
    is_today = start_datetime.date() == datetime.now().date()
    ttl = 60 if is_today else 86400  # 1 minute for today's data, 24 hours for history
    redis_client.setex(cache_key, ttl, json.dumps(result))
    return result
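
# Options 1 and 3 compose: the entire batch result can be cached under one
# key. A minimal sketch reusing redis_client and the batch helper from Option
# 1; the key hashes the sorted code list so the same basket of stocks hits
# the same cache entry.
import hashlib
def get_batch_prices_cached(client, stock_codes, start_datetime, end_datetime):
    codes_digest = hashlib.md5(','.join(sorted(stock_codes)).encode()).hexdigest()
    cache_key = f"batch_prices:{codes_digest}:{start_datetime.date()}:{end_datetime.date()}"
    cached = redis_client.get(cache_key)
    if cached:
        return json.loads(cached)
    result = get_batch_stock_prices_optimized(client, stock_codes, start_datetime, end_datetime)
    if result:
        from datetime import datetime
        ttl = 60 if start_datetime.date() == datetime.now().date() else 86400
        redis_client.setex(cache_key, ttl, json.dumps(result))
    return result
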
# ============================================================================
# Option 4: ClickHouse query tuning (engine hints)
# ============================================================================
def get_stock_price_with_hints(client, stock_code, start_datetime, end_datetime):
    """
    Use ClickHouse-specific features to speed up the query.

    Optimizations:
    1. PREWHERE clause (filters early, before the remaining columns are read)
    2. FINAL modifier (only if the table uses ReplacingMergeTree)
    3. Partition pruning (if the table is partitioned by date)
    """
    query = """
        SELECT
            code,
            -- caveat: any()/anyLast() pick values in read order, not strictly
            -- by timestamp
            anyLast(close) as last_price,
            any(close) as first_price,
            (last_price - first_price) / first_price * 100 as change_pct
        FROM stock_minute
        PREWHERE code = %(code)s  -- PREWHERE filters before WHERE, scanning less data
        WHERE timestamp >= %(start)s
          AND timestamp <= %(end)s
        GROUP BY code
        SETTINGS max_threads = 2  -- cap threads to limit resource contention
    """
data = client.execute(query, {
'code': stock_code,
'start': start_datetime,
'end': end_datetime
})
return data
# ============================================================================
# Database-level optimization suggestions
# ============================================================================
"""
1. Make sure stock_minute has the following indexes:
   - PRIMARY KEY (code, timestamp)                            -- primary index
   - INDEX idx_timestamp timestamp TYPE minmax GRANULARITY 3  -- time index

2. Partitioning strategy (for large tables):
   CREATE TABLE stock_minute (
       code String,
       timestamp DateTime,
       ...
   ) ENGINE = MergeTree()
   PARTITION BY toYYYYMM(timestamp)  -- monthly partitions
   ORDER BY (code, timestamp)
   SETTINGS index_granularity = 8192;

3. Pre-aggregate with a materialized view (for fixed query patterns):
   CREATE MATERIALIZED VIEW stock_minute_summary
   ENGINE = AggregatingMergeTree()
   PARTITION BY toYYYYMMDD(timestamp)
   ORDER BY (code, timestamp)
   AS SELECT
       code,
       toStartOfMinute(timestamp) as minute,
       anyLast(close) as last_close,
       any(close) as first_close,
       ...
   FROM stock_minute
   GROUP BY code, minute;

4. Inspect table statistics:
   SELECT
       table,
       partition,
       rows,
       bytes_on_disk
   FROM system.parts
   WHERE table = 'stock_minute';
"""
# ============================================================================
# Performance comparison test
# ============================================================================
def benchmark_query_methods(client, start_dt, end_dt):
    """
    Benchmark the query strategies.

    Scenario: fetch price data for 10 stocks.

    Expected results:
    - Original (serial N+1):  ~1000ms
    - Option 1 (batch query): ~50ms  (about 20x)
    - Option 2 (parallel):    ~200ms (about 5x)
    - Option 3 (cached):      ~10ms  (about 100x, on a repeat request)
    """
    import time
    stock_codes = ['600519.SH', '601088.SH', '600276.SH', '000001.SZ', ...]
    # Option 1: batch query
    start = time.time()
    result1 = get_batch_stock_prices_optimized(client, stock_codes, start_dt, end_dt)
    print(f"Batch query took: {(time.time() - start) * 1000:.2f}ms")
    # Option 2: parallel queries
    start = time.time()
    result2 = get_all_stocks_parallel(client, stock_codes, start_dt, end_dt)
    print(f"Parallel queries took: {(time.time() - start) * 1000:.2f}ms")
    # Original approach (serial; get_stock_price_original is the per-stock
    # helper from the original implementation)
    start = time.time()
    result3 = {}
    for code in stock_codes:
        result3[code] = get_stock_price_original(client, code, start_dt, end_dt)
    print(f"Serial queries took: {(time.time() - start) * 1000:.2f}ms")
# ============================================================================
# Summary and recommendations
# ============================================================================
"""
Recommended rollout order:

Step 1 (do now): Option 1 - batch query
- Difficulty: medium
- Speed-up: 80-90%
- Risk: low
- Effort: 1-2 hours

Step 2 (optional): Option 3 - add caching
- Difficulty: easy
- Speed-up: a further 20-40%
- Risk: low
- Effort: 30 minutes

Step 3 (long term): Option 4 - database tuning
- Difficulty: medium
- Speed-up: 20-30%
- Risk: medium (needs testing)
- Effort: 2-4 hours

Metrics to monitor:
- Query latency: target < 200ms (currently > 1000ms)
- ClickHouse queries per request: target 1-2 (currently 30+)
- Cache hit rate: target > 80% (if caching is used)
"""