|
|
|
|
|
"""
|
|
|
|
|
|
数据处理工具模块
|
|
|
|
|
|
"""
|
|
|
|
|
|
from typing import List, Dict, Tuple, Optional
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def deduplicate_dataframe(
    df: pd.DataFrame,
    existing_df: pd.DataFrame,
    core_columns: Optional[List[str]] = None,
    tolerance: float = 0.0001
) -> pd.DataFrame:
    """
    Filter out rows of ``df`` that are already present in ``existing_df``.

    A row is dropped only when its index label exists in ``existing_df``
    and every compared column differs by strictly less than ``tolerance``.
    Rows with a new index label, or with at least one compared column
    outside the tolerance, are returned.

    Args:
        df: Newly fetched data.
        existing_df: Data that is already stored.
        core_columns: Columns to compare; defaults to
            ``["open", "high", "low", "close", "volume"]``. Columns missing
            from either row are skipped, so a pair of rows sharing none of
            these columns is treated as identical.
        tolerance: Absolute difference below which two values count as equal.

    Returns:
        The rows of ``df`` that are new or changed, in their original order.
        ``df`` itself is returned when ``existing_df`` is empty.
    """
    if core_columns is None:
        core_columns = ["open", "high", "low", "close", "volume"]

    if existing_df.empty:
        return df

    # Index labels that are already stored.
    existing_indices = set(existing_df.index)

    # Rows are tracked by position, not by label: selecting by label would
    # multiply rows whenever ``df`` carries repeated index labels.
    keep_positions = []
    for position, (idx, row) in enumerate(df.iterrows()):
        if idx not in existing_indices:
            keep_positions.append(position)
            continue

        existing_row = existing_df.loc[idx]
        if isinstance(existing_row, pd.DataFrame):
            # Repeated labels in the stored data: compare against the most
            # recent entry instead of failing on an ambiguous comparison.
            existing_row = existing_row.iloc[-1]

        # Compare the core fields shared by both rows.
        is_same = all(
            abs(row[col] - existing_row[col]) < tolerance
            for col in core_columns
            if col in row and col in existing_row
        )
        if not is_same:
            keep_positions.append(position)

    return df.iloc[keep_positions]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compare_kline_data(
    new_data: Dict,
    existing_data: Dict,
    tolerance: float = 0.0001
) -> bool:
    """
    Report whether two K-line records match on their core fields.

    The fields ``open``, ``high``, ``low``, ``close`` and ``volume`` are
    checked in that order.

    Args:
        new_data: The incoming record.
        existing_data: The stored record.
        tolerance: Absolute difference at or above which two values are
            treated as different.

    Returns:
        ``False`` as soon as a core field is missing from either record or
        differs by ``tolerance`` or more; ``True`` otherwise.
    """
    for name in ("open", "high", "low", "close", "volume"):
        # A field absent from either side means the records cannot match.
        if not (name in new_data and name in existing_data):
            return False

        gap = abs(new_data[name] - existing_data[name])
        if gap >= tolerance:
            return False

    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def dataframe_to_dict_list(df: pd.DataFrame) -> List[Dict]:
    """
    Convert a DataFrame into a list of per-row dictionaries.

    The index is moved into a regular column first, and every column name
    is converted to a lower-case string. Missing scalar values become
    ``None`` and numpy integer/floating scalars become built-in ``int`` /
    ``float``. Non-scalar cell values (for example lists) are left as-is.

    Args:
        df: The frame to convert.

    Returns:
        One dictionary per row, or an empty list when ``df`` is empty.
    """
    if df.empty:
        return []

    # Reset the index so that it is included in each record.
    df_reset = df.reset_index()

    # Normalize the column names.
    df_reset.columns = [str(col).lower() for col in df_reset.columns]

    records = df_reset.to_dict("records")

    # Normalize value types. Only existing keys are reassigned, so updating
    # the dictionary while iterating over it is safe.
    for record in records:
        for key, value in record.items():
            if not pd.api.types.is_scalar(value):
                # pd.isna() on a list-like returns an array whose truth
                # value is ambiguous; such values are passed through.
                continue
            if pd.isna(value):
                # Checked before the float conversion so that a numpy NaN
                # also ends up as None.
                record[key] = None
            elif isinstance(value, np.floating):
                record[key] = float(value)
            elif isinstance(value, np.integer):
                record[key] = int(value)

    return records
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_kline_data(
    cached_data: pd.DataFrame,
    sdk_data: pd.DataFrame
) -> pd.DataFrame:
    """
    Merge cached and SDK K-line frames, preferring SDK rows on conflict.

    Args:
        cached_data: Rows loaded from the cache.
        sdk_data: Rows fetched from the SDK.

    Returns:
        The other frame unchanged when one input is empty; otherwise the
        combined rows with one entry per index label, sorted by index.
    """
    if cached_data.empty:
        return sdk_data
    if sdk_data.empty:
        return cached_data

    # SDK rows are appended last, so keeping the last occurrence of each
    # index label lets them override the cached rows.
    stacked = pd.concat([cached_data, sdk_data])
    superseded = stacked.index.duplicated(keep="last")
    return stacked[~superseded].sort_index()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_data_completeness(
    actual_count: int,
    expected_count: int
) -> float:
    """
    Return the ratio of actual to expected record counts.

    Args:
        actual_count: Number of records that are present.
        expected_count: Number of records that should be present.

    Returns:
        ``actual_count / expected_count``, or ``1.0`` when nothing is
        expected (which also avoids dividing by zero).
    """
    return actual_count / expected_count if expected_count != 0 else 1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_missing_periods(
    existing_dates: List,
    expected_dates: List
) -> List[Tuple]:
    """
    Find the expected entries that are absent from the existing ones.

    Args:
        existing_dates: Entries that are already available.
        expected_dates: Entries that should be available.

    Returns:
        The distinct entries of ``expected_dates`` not found in
        ``existing_dates``, in ascending order. Entries are returned
        individually; they are not grouped into ranges.
    """
    available = set(existing_dates)
    absent = set(expected_dates).difference(available)
    return sorted(absent)
|