0.9.33 新增时序因子预处理

waditu · Oct 20, 2023 · d2196e1 · d2196e1
1 parent 76aeb77
commit d2196e1
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 0 deletions.
diff --git a/czsc/__init__.py b/czsc/__init__.py
@@ -99,6 +99,7 @@
 
 from czsc.utils.features import (
     normalize_feature,
+    normalize_ts_feature,
 )
 
 __version__ = "0.9.33"

diff --git a/czsc/utils/features.py b/czsc/utils/features.py
@@ -5,6 +5,7 @@
 create_dt: 2023/10/06 15:01
 describe: 因子（特征）处理
 """
+import pandas as pd
 from loguru import logger
 from sklearn.preprocessing import scale
 
@@ -26,3 +27,39 @@ def normalize_feature(df, x_col, **kwargs):
     q = kwargs.get("q", 0.05)           # 缩尾比例
     df[x_col] = df.groupby("dt")[x_col].transform(lambda x: scale(x.clip(lower=x.quantile(q), upper=x.quantile(1 - q))))
     return df
+
+
+def normalize_ts_feature(df, x_col, n=10, **kwargs):
+    """Normalize a time-series factor: expanding z-score plus expanding quantile layer, in row order.
+
+    :param df: factor data; only ``x_col`` is read here (a ``dt`` column is expected by convention)
+    :param x_col: name of the factor column
+    :param n: number of quantile layers, default 10
+    :param kwargs:
+
+        - min_periods: minimum sample count for the expanding window, default 300
+
+    :return: df itself, mutated in place: ``{x_col}_norm`` / ``{x_col}_qcut`` are added when absent, ``{x_col}分层`` with the latter
+    """
+    assert df[x_col].nunique() > n, "因子值的取值数量必须大于分层数量"
+    min_periods = kwargs.get("min_periods", 300)
+    if df[x_col].isna().sum() > 0:
+        logger.warning(f"因子列 {x_col} 存在缺失值，请注意！建议先对因子缺失值进行填充")
+
+    if f"{x_col}_norm" not in df.columns:
+        df[f"{x_col}_norm"] = df[x_col].expanding(min_periods=min_periods).apply(
+            lambda x: (x.iloc[-1] - x.mean()) / x.std(), raw=False)
+        # expanding() leaves the first min_periods - 1 rows NaN; fill them with a z-score over those warm-up rows.
+        na_x = df[df[f"{x_col}_norm"].isna()][x_col].values
+        df.loc[df[f"{x_col}_norm"].isna(), f"{x_col}_norm"] = (na_x - na_x.mean()) / na_x.std()
+
+    if f"{x_col}_qcut" not in df.columns:
+        # Layer index of the latest value among all values seen so far (quantile bins are recomputed per row).
+        df[f'{x_col}_qcut'] = df[x_col].expanding(min_periods=min_periods).apply(
+            lambda x: pd.qcut(x, q=n, labels=False, duplicates='drop', retbins=False).values[-1], raw=False)
+        # Warm-up rows (still NaN at this point) are binned together by a single qcut over those rows only.
+        na_x = df[df[f"{x_col}_qcut"].isna()][x_col].values
+        df.loc[df[f"{x_col}_qcut"].isna(), f"{x_col}_qcut"] = pd.qcut(na_x, q=n, labels=False, duplicates='drop', retbins=False)
+        df[f'{x_col}分层'] = df[f'{x_col}_qcut'].apply(lambda x: f'第{str(int(x+1)).zfill(2)}层')
+
+    return df