Skip to content

Commit

Permalink
0.9.33 新增时序因子预处理
Browse files Browse the repository at this point in the history
  • Loading branch information
zengbin93 committed Oct 20, 2023
1 parent 76aeb77 commit d2196e1
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
1 change: 1 addition & 0 deletions czsc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@

from czsc.utils.features import (
normalize_feature,
normalize_ts_feature,
)

__version__ = "0.9.33"
Expand Down
37 changes: 37 additions & 0 deletions czsc/utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
create_dt: 2023/10/06 15:01
describe: 因子(特征)处理
"""
import pandas as pd
from loguru import logger
from sklearn.preprocessing import scale

Expand All @@ -26,3 +27,39 @@ def normalize_feature(df, x_col, **kwargs):
q = kwargs.get("q", 0.05) # 缩尾比例
df[x_col] = df.groupby("dt")[x_col].transform(lambda x: scale(x.clip(lower=x.quantile(q), upper=x.quantile(1 - q))))
return df


def normalize_ts_feature(df, x_col, n=10, **kwargs):
    """Normalize a time-series factor using expanding-window statistics.

    Three columns are added to ``df`` in place (each group is only computed
    when its leading column is not already present, so repeated calls are
    cheap):

    - ``{x_col}_norm``: expanding z-score of the latest value against all
      values seen so far.
    - ``{x_col}_qcut``: 0-based quantile bucket of the latest value within
      all values seen so far.
    - ``{x_col}分层``: text label of the bucket, e.g. ``第01层`` for bucket 0.

    Rows for which the expanding computation yields NaN (typically the first
    ``min_periods - 1`` warm-up rows) are back-filled by normalizing /
    bucketing those rows among themselves.

    :param df: factor data; must contain the ``x_col`` column
    :param x_col: name of the factor column
    :param n: number of quantile buckets, default 10
    :param kwargs:
        - min_periods: minimum number of samples for the expanding window,
          default 300
    :return: the same ``df`` object with the columns described above added
    :raises AssertionError: if ``x_col`` has no more than ``n`` distinct values
    """
    assert df[x_col].nunique() > n, "因子值的取值数量必须大于分层数量"
    min_periods = kwargs.get("min_periods", 300)
    if df[x_col].isna().sum() > 0:
        logger.warning(f"因子列 {x_col} 存在缺失值,请注意!建议先对因子缺失值进行填充")

    norm_col = f"{x_col}_norm"
    qcut_col = f"{x_col}_qcut"
    layer_col = f"{x_col}分层"

    if norm_col not in df.columns:
        df[norm_col] = df[x_col].expanding(min_periods=min_periods).apply(
            lambda x: (x.iloc[-1] - x.mean()) / x.std(), raw=False)

        # Back-fill rows the expanding window could not score with a z-score
        # computed over those rows only.
        norm_missing = df[norm_col].isna()
        if norm_missing.any():
            na_x = df.loc[norm_missing, x_col].values
            # The parentheses matter: without them the expression evaluates
            # to x - (mean / std), which is not a standardized value.
            # NOTE: ndarray.std() defaults to ddof=0 whereas Series.std() in
            # the expanding branch defaults to ddof=1.
            df.loc[norm_missing, norm_col] = (na_x - na_x.mean()) / na_x.std()

    if qcut_col not in df.columns:
        df[qcut_col] = df[x_col].expanding(min_periods=min_periods).apply(
            lambda x: pd.qcut(x, q=n, labels=False, duplicates='drop', retbins=False).values[-1], raw=False)

        # Back-fill rows without an expanding bucket by bucketing those rows
        # among themselves.
        qcut_missing = df[qcut_col].isna()
        if qcut_missing.any():
            na_x = df.loc[qcut_missing, x_col].values
            df.loc[qcut_missing, qcut_col] = pd.qcut(
                na_x, q=n, labels=False, duplicates='drop', retbins=False)

        # Buckets are 0-based; labels are 1-based and zero-padded to 2 digits.
        df[layer_col] = df[qcut_col].apply(lambda x: f'第{str(int(x+1)).zfill(2)}层')

    return df

0 comments on commit d2196e1

Please sign in to comment.