""" 数据处理工具模块 """ from typing import List, Dict, Tuple, Optional import pandas as pd import numpy as np def deduplicate_dataframe( df: pd.DataFrame, existing_df: pd.DataFrame, core_columns: List[str] = None, tolerance: float = 0.0001 ) -> pd.DataFrame: """ 去重DataFrame Args: df: 新数据 existing_df: 已存在的数据 core_columns: 核心比较列 tolerance: 差异容忍度 Returns: 需要去重的数据 """ if core_columns is None: core_columns = ["open", "high", "low", "close", "volume"] if existing_df.empty: return df # 获取已存在的索引 existing_indices = set(existing_df.index) to_keep = [] for idx, row in df.iterrows(): if idx not in existing_indices: to_keep.append(idx) else: # 比较核心字段 existing_row = existing_df.loc[idx] is_same = all( abs(row[col] - existing_row[col]) < tolerance for col in core_columns if col in row and col in existing_row ) if not is_same: to_keep.append(idx) return df.loc[to_keep] def compare_kline_data( new_data: Dict, existing_data: Dict, tolerance: float = 0.0001 ) -> bool: """ 比较K线数据是否相同 Args: new_data: 新数据 existing_data: 已存在数据 tolerance: 差异容忍度 Returns: 是否相同 """ core_fields = ["open", "high", "low", "close", "volume"] for field in core_fields: if field not in new_data or field not in existing_data: return False if abs(new_data[field] - existing_data[field]) >= tolerance: return False return True def dataframe_to_dict_list(df: pd.DataFrame) -> List[Dict]: """将DataFrame转换为字典列表""" if df.empty: return [] # 重置索引以便包含日期信息 df_reset = df.reset_index() # 转换列名 df_reset.columns = [str(col).lower() for col in df_reset.columns] # 转换为字典列表 records = df_reset.to_dict("records") # 处理数值类型 for record in records: for key, value in record.items(): if isinstance(value, (np.integer, np.floating)): record[key] = float(value) if isinstance(value, np.floating) else int(value) elif pd.isna(value): record[key] = None return records def merge_kline_data( cached_data: pd.DataFrame, sdk_data: pd.DataFrame ) -> pd.DataFrame: """合并K线数据(SDK数据优先)""" if cached_data.empty: return sdk_data if sdk_data.empty: return cached_data # 合并数据,SDK数据优先 combined = pd.concat([cached_data, sdk_data]) combined = combined[~combined.index.duplicated(keep="last")] combined = combined.sort_index() return combined def calculate_data_completeness( actual_count: int, expected_count: int ) -> float: """计算数据完整度""" if expected_count == 0: return 1.0 return actual_count / expected_count def detect_missing_periods( existing_dates: List, expected_dates: List ) -> List[Tuple]: """检测缺失的时间段""" missing = set(expected_dates) - set(existing_dates) return sorted(list(missing))