You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

142 lines
3.4 KiB

"""
数据处理工具模块
"""
from typing import List, Dict, Tuple, Optional
import pandas as pd
import numpy as np
def deduplicate_dataframe(
df: pd.DataFrame,
existing_df: pd.DataFrame,
core_columns: List[str] = None,
tolerance: float = 0.0001
) -> pd.DataFrame:
"""
去重DataFrame
Args:
df: 新数据
existing_df: 已存在的数据
core_columns: 核心比较列
tolerance: 差异容忍度
Returns:
需要去重的数据
"""
if core_columns is None:
core_columns = ["open", "high", "low", "close", "volume"]
if existing_df.empty:
return df
# 获取已存在的索引
existing_indices = set(existing_df.index)
to_keep = []
for idx, row in df.iterrows():
if idx not in existing_indices:
to_keep.append(idx)
else:
# 比较核心字段
existing_row = existing_df.loc[idx]
is_same = all(
abs(row[col] - existing_row[col]) < tolerance
for col in core_columns
if col in row and col in existing_row
)
if not is_same:
to_keep.append(idx)
return df.loc[to_keep]
def compare_kline_data(
new_data: Dict,
existing_data: Dict,
tolerance: float = 0.0001
) -> bool:
"""
比较K线数据是否相同
Args:
new_data: 新数据
existing_data: 已存在数据
tolerance: 差异容忍度
Returns:
是否相同
"""
core_fields = ["open", "high", "low", "close", "volume"]
for field in core_fields:
if field not in new_data or field not in existing_data:
return False
if abs(new_data[field] - existing_data[field]) >= tolerance:
return False
return True
def dataframe_to_dict_list(df: pd.DataFrame) -> List[Dict]:
"""将DataFrame转换为字典列表"""
if df.empty:
return []
# 重置索引以便包含日期信息
df_reset = df.reset_index()
# 转换列名
df_reset.columns = [str(col).lower() for col in df_reset.columns]
# 转换为字典列表
records = df_reset.to_dict("records")
# 处理数值类型
for record in records:
for key, value in record.items():
if isinstance(value, (np.integer, np.floating)):
record[key] = float(value) if isinstance(value, np.floating) else int(value)
elif pd.isna(value):
record[key] = None
return records
def merge_kline_data(
cached_data: pd.DataFrame,
sdk_data: pd.DataFrame
) -> pd.DataFrame:
"""合并K线数据SDK数据优先"""
if cached_data.empty:
return sdk_data
if sdk_data.empty:
return cached_data
# 合并数据SDK数据优先
combined = pd.concat([cached_data, sdk_data])
combined = combined[~combined.index.duplicated(keep="last")]
combined = combined.sort_index()
return combined
def calculate_data_completeness(
actual_count: int,
expected_count: int
) -> float:
"""计算数据完整度"""
if expected_count == 0:
return 1.0
return actual_count / expected_count
def detect_missing_periods(
existing_dates: List,
expected_dates: List
) -> List[Tuple]:
"""检测缺失的时间段"""
missing = set(expected_dates) - set(existing_dates)
return sorted(list(missing))