From ff0eef08fea0837a7cf1ecd65d15d8f67786314a Mon Sep 17 00:00:00 2001 From: zengbin93 Date: Thu, 28 Mar 2024 20:20:22 +0800 Subject: [PATCH] =?UTF-8?q?V0.9.46=20=E6=9B=B4=E6=96=B0=E4=B8=80=E6=89=B9?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=20(#190)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 0.9.46 start coding * 0.9.46 PSI 测试开发 * 0.9.46 update * 0.9.46 新增 optuna 超参分析 * 0.9.46 disk cache 增加默认path * 0.9.46 新增 rolling_tanh 函数 * 0.9.46 新增CCF因子函数 * 0.9.46 新增最大回撤分析组件 * 0.9.46 新增期货调仓函数 --- .github/workflows/pythonpackage.yml | 2 +- README.md | 2 +- czsc/__init__.py | 14 ++++- czsc/connectors/research.py | 3 +- czsc/connectors/tq_connector.py | 67 +++++++++++++++++++- czsc/features/__init__.py | 4 ++ czsc/features/tas.py | 51 ++++++++++++++++ czsc/features/utils.py | 94 +++++++++++++++++++++++++++++ czsc/utils/__init__.py | 5 +- czsc/utils/bar_generator.py | 24 ++++++++ czsc/utils/cache.py | 2 +- czsc/utils/corr.py | 14 ++--- czsc/utils/optuna.py | 50 +++++++++++++++ czsc/utils/st_components.py | 76 +++++++++++++++++++++-- czsc/utils/stats.py | 39 ++++++++++++ docs/requirements.txt | 3 +- examples/develop/psi.py | 61 +++++++++++++++++++ examples/qmt_realtime.py | 3 - examples/signals_dev/fenlei.py | 7 +++ requirements.txt | 3 +- test/test_features.py | 25 ++++++++ 21 files changed, 522 insertions(+), 27 deletions(-) create mode 100644 czsc/features/tas.py create mode 100644 czsc/utils/optuna.py create mode 100644 examples/develop/psi.py create mode 100644 examples/signals_dev/fenlei.py diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 4e706dcac..f6ca04765 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -5,7 +5,7 @@ name: Python package on: push: - branches: [ master, V0.9.45 ] + branches: [ master, V0.9.46 ] pull_request: branches: [ master ] diff --git a/README.md b/README.md index a7644f9a6..97712fce3 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ pip install git@github.com:waditu/czsc.git -U 直接从github指定分支安装最新版: ``` -pip install git+https://github.com/waditu/czsc.git@V0.9.41 -U +pip install git+https://github.com/waditu/czsc.git@V0.9.46 -U ``` 从`pypi`安装: diff --git a/czsc/__init__.py b/czsc/__init__.py index 2a8e00e76..436f9c744 100644 --- a/czsc/__init__.py +++ b/czsc/__init__.py @@ -45,6 +45,8 @@ ExitsOptimize, ) from czsc.utils import ( + format_standard_kline, + KlineChart, WordWriter, BarGenerator, @@ -81,6 +83,7 @@ holds_performance, net_value_stats, subtract_fee, + top_drawdowns, home_path, DiskCache, @@ -94,6 +97,9 @@ DataClient, set_url_token, get_url_token, + + optuna_study, + optuna_good_params, ) # 交易日历工具 @@ -121,6 +127,8 @@ show_stoploss_by_direction, show_cointegration, show_out_in_compare, + show_optuna_study, + show_drawdowns, ) from czsc.utils.bi_info import ( @@ -144,12 +152,14 @@ rolling_compare, rolling_scale, rolling_slope, + rolling_tanh, + feature_adjust, ) -__version__ = "0.9.45" +__version__ = "0.9.46" __author__ = "zengbin93" __email__ = "zeng_bin8888@163.com" -__date__ = "20240308" +__date__ = "20240318" def welcome(): diff --git a/czsc/connectors/research.py b/czsc/connectors/research.py index 55215c3ab..622170881 100644 --- a/czsc/connectors/research.py +++ b/czsc/connectors/research.py @@ -45,6 +45,7 @@ def get_raw_bars(symbol, freq, sdt, edt, fq='前复权', **kwargs): :param kwargs: :return: """ + raw_bars = kwargs.get('raw_bars', True) kwargs['fq'] = fq file = glob.glob(os.path.join(cache_path, "*", f"{symbol}.parquet"))[0] freq = czsc.Freq(freq) @@ -54,5 +55,5 @@ def get_raw_bars(symbol, freq, sdt, edt, fq='前复权', **kwargs): kline = kline[(kline['dt'] >= pd.to_datetime(sdt)) & (kline['dt'] <= pd.to_datetime(edt))] if kline.empty: return [] - _bars = czsc.resample_bars(kline, freq, raw_bars=True, base_freq='1分钟') + _bars = czsc.resample_bars(kline, freq, raw_bars=raw_bars, base_freq='1分钟') return _bars diff --git a/czsc/connectors/tq_connector.py b/czsc/connectors/tq_connector.py index 11cb3d42d..c415abe01 100644 --- a/czsc/connectors/tq_connector.py +++ b/czsc/connectors/tq_connector.py @@ -204,7 +204,14 @@ def is_trade_time(trade_time: Optional[str] = None): def get_daily_backup(api: TqApi, **kwargs): - """获取每日账户中需要备份的信息""" + """获取每日账户中需要备份的信息 + + https://doc.shinnytech.com/tqsdk/latest/reference/tqsdk.objs.html?highlight=account#tqsdk.objs.Order + https://doc.shinnytech.com/tqsdk/latest/reference/tqsdk.objs.html?highlight=account#tqsdk.objs.Position + https://doc.shinnytech.com/tqsdk/latest/reference/tqsdk.objs.html?highlight=account#tqsdk.objs.Account + + :param api: TqApi, 天勤API实例 + """ orders = api.get_order() trades = api.get_trade() position = api.get_position() @@ -229,3 +236,61 @@ def get_daily_backup(api: TqApi, **kwargs): "account": account, } return backup + + +def adjust_portfolio(api: TqApi, portfolio, account=None, **kwargs): + """调整账户组合 + + **注意:** 此函数会阻塞,直到调仓完成;使用前请仔细阅读 TargetPosTask 的源码和文档,确保了解其工作原理 + + :param api: TqApi, 天勤API实例 + :param account: str, 天勤账户 + :param portfolio: dict, 组合配置,key 为合约代码,value 为配置信息; 样例数据: + + { + "KQ.m@CFFEX.T": {"target_volume": 10, "price": "PASSIVE", "offset_priority": "今昨,开"}, + "KQ.m@CFFEX.TS": {"target_volume": 0, "price": "ACTIVE", "offset_priority": "今昨,开"}, + "KQ.m@CFFEX.TF": {"target_volume": 30, "price": "PASSIVE", "offset_priority": "今昨,开"} + } + + :param kwargs: dict, 其他参数 + """ + symbol_infos = {} + for symbol, conf in portfolio.items(): + quote = api.get_quote(symbol) + + lots = conf.get("target_volume", 0) + price = conf.get("price", "PASSIVE") + offset_priority = conf.get("offset_priority", "今昨,开") + + # 踩坑记录:TargetPosTask 的 symbol 必须是合约代码 + contract = quote.underlying_symbol if "@" in symbol else symbol + target_pos = TargetPosTask(api, contract, price=price, offset_priority=offset_priority, account=account) + target_pos.set_target_volume(int(lots)) + symbol_infos[symbol] = {"quote": quote, "target_pos": target_pos, "lots": lots} + + while True: + api.wait_update() + + completed = [] + for symbol, info in symbol_infos.items(): + quote = info["quote"] + target_pos: TargetPosTask = info["target_pos"] + lots = info["lots"] + contract = quote.underlying_symbol if "@" in symbol else symbol + + logger.info(f"调整仓位:{quote.datetime} - {contract}; 目标持仓:{lots}手; 当前持仓:{target_pos._pos.pos}手") + + if target_pos._pos.pos == lots: + completed.append(True) + logger.info(f"调仓完成:{quote.datetime} - {contract}; {lots}手") + else: + completed.append(False) + + if all(completed): + break + + if kwargs.get("close_api", True): + api.close() + + return api diff --git a/czsc/features/__init__.py b/czsc/features/__init__.py index 08bc3518e..9e34e3c68 100644 --- a/czsc/features/__init__.py +++ b/czsc/features/__init__.py @@ -24,4 +24,8 @@ VPF002, VPF003, VPF004, +) + +from .tas import ( + CCF ) \ No newline at end of file diff --git a/czsc/features/tas.py b/czsc/features/tas.py new file mode 100644 index 000000000..6ba8a14a2 --- /dev/null +++ b/czsc/features/tas.py @@ -0,0 +1,51 @@ +""" +技术指标因子 +""" +import inspect +import hashlib +import pandas as pd + + +def CCF(df, **kwargs): + """使用 CZSC 库中的 factor 识别因子,主要用于识别缠论/形态因子 + + :param df: 标准K线数据,DataFrame结构 + :param kwargs: 其他参数 + + - czsc_factor: dict, 缠论因子配置,样例: + + { + "signals_all": ["日线_D1_表里关系V230101_向上_任意_任意_0"], + "signals_any": [], + "signals_not": ["日线_D1_涨跌停V230331_涨停_任意_任意_0"], + } + + - freq: str, default '日线',K线级别 + - tag: str, default None,标签,用于区分不同的因子 + + :return: pd.DataFrame + """ + from czsc.objects import Factor + from czsc.utils import format_standard_kline + from czsc.traders.base import generate_czsc_signals + from czsc.traders.sig_parse import get_signals_config + + czsc_factor = kwargs.get('czsc_factor', None) + freq = kwargs.get('freq', '日线') + assert czsc_factor is not None and isinstance(czsc_factor, dict), "factor 参数必须指定" + tag = kwargs.get('tag', hashlib.sha256(f"{czsc_factor}_{freq}".encode()).hexdigest().upper()[:6]) + + factor_name = inspect.stack()[0][3] + factor_col = f'F#{factor_name}#{tag}' + + czsc_factor = Factor.load(czsc_factor) + signals_seq = czsc_factor.signals_all + czsc_factor.signals_any + czsc_factor.signals_not + signals_config = get_signals_config([x.signal for x in signals_seq]) + + bars = format_standard_kline(df, freq=freq) + dfs = generate_czsc_signals(bars, signals_config, init_n=300, sdt=bars[0].dt, df=True) + dfs[factor_col] = dfs.apply(czsc_factor.is_match, axis=1).astype(int) + + df = pd.merge(df, dfs[['dt', factor_col]], on='dt', how='left') + df[factor_col] = df[factor_col].fillna(0) + return df diff --git a/czsc/features/utils.py b/czsc/features/utils.py index 7ead08350..1d69826cb 100644 --- a/czsc/features/utils.py +++ b/czsc/features/utils.py @@ -187,6 +187,26 @@ def rolling_scale(df: pd.DataFrame, col: str, window=300, min_periods=100, new_c return df +def rolling_tanh(df: pd.DataFrame, col: str, window=300, min_periods=100, new_col=None, **kwargs): + """对序列进行滚动 tanh 变换 + + 双曲正切函数:https://baike.baidu.com/item/%E5%8F%8C%E6%9B%B2%E6%AD%A3%E5%88%87%E5%87%BD%E6%95%B0/15469414 + + :param df: pd.DataFrame, 待计算的数据 + :param col: str, 待计算的列 + :param window: int, 滚动窗口大小, 默认为300 + :param min_periods: int, 最小计算周期, 默认为100 + :param new_col: str, 新列名,默认为 None, 表示使用 f'{col}_scale' 作为新列名 + """ + if kwargs.get("copy", False): + df = df.copy() + new_col = new_col if new_col else f'{col}_tanh' + df = df.sort_values("dt", ascending=True).reset_index(drop=True) + df[new_col] = df[col].rolling(window=window, min_periods=min_periods).apply(lambda x: np.tanh(scale(x))[-1]) # type: ignore + df[new_col] = df[new_col].fillna(0) + return df + + def rolling_slope(df: pd.DataFrame, col: str, window=300, min_periods=100, new_col=None, **kwargs): """计算序列的滚动斜率 @@ -234,3 +254,77 @@ def __lr_slope(x): df[new_col] = df[new_col].fillna(0) return df + + +def feature_adjust_V230101(df: pd.DataFrame, fcol, **kwargs): + """特征调整函数:对特征进行调整,使其符合持仓权重的定义 + + 方法说明:对因子进行滚动相关系数计算,然后对因子值用 maxabs_scale 进行归一化,最后乘以滚动相关系数的符号 + + :param df: pd.DataFrame, 必须包含 dt、symbol、price 列,以及因子列 + :param fcol: str 因子列名 + :param kwargs: dict + """ + window = kwargs.get("window", 1000) + min_periods = kwargs.get("min_periods", 200) + + df = df.copy().sort_values("dt", ascending=True).reset_index(drop=True) + df['n1b'] = df['price'].shift(-1) / df['price'] - 1 + df['corr'] = df[fcol].rolling(window=window, min_periods=min_periods).corr(df['n1b']) + df['corr'] = df['corr'].shift(5).fillna(0) + + df = rolling_scale(df, col=fcol, window=window, min_periods=min_periods, + new_col='weight', method='maxabs_scale', copy=True) + df['weight'] = df['weight'] * np.sign(df['corr']) + + df.drop(['n1b', 'corr'], axis=1, inplace=True) + return df + + +def feature_adjust_V240323(df: pd.DataFrame, fcol, **kwargs): + """特征调整函数:对特征进行调整,使其符合持仓权重的定义 + + 方法说明:对因子进行滚动相关系数计算,然后对因子值用 scale + tanh 进行归一化,最后乘以滚动相关系数的符号 + + :param df: pd.DataFrame, 必须包含 dt、symbol、price 列,以及因子列 + :param fcol: str 因子列名 + :param kwargs: dict + """ + window = kwargs.get("window", 1000) + min_periods = kwargs.get("min_periods", 200) + + df = df.copy().sort_values("dt", ascending=True).reset_index(drop=True) + df['n1b'] = df['price'].shift(-1) / df['price'] - 1 + df['corr'] = df[fcol].rolling(window=window, min_periods=min_periods).corr(df['n1b']) + df['corr'] = df['corr'].shift(5).fillna(0) + + df = rolling_tanh(df, col=fcol, window=window, min_periods=min_periods, new_col='weight') + df['weight'] = df['weight'] * np.sign(df['corr']) + + df.drop(['n1b', 'corr'], axis=1, inplace=True) + return df + + +def feature_adjust(df: pd.DataFrame, fcol, method, **kwargs): + """特征调整函数:对特征进行调整,使其符合持仓权重的定义 + + :param df: pd.DataFrame, 待调整的数据 + :param fcol: str, 因子列名 + :param method: str, 调整方法 + + - V230101: 对因子进行滚动相关系数计算,然后对因子值用 maxabs_scale 进行归一化,最后乘以滚动相关系数的符号 + - V240323: 对因子进行滚动相关系数计算,然后对因子值用 scale + tanh 进行归一化,最后乘以滚动相关系数的符号 + + :param kwargs: dict + + - window: int, 滚动窗口大小 + - min_periods: int, 最小计算周期 + + :return: pd.DataFrame, 新增 weight 列 + """ + if method == "V230101": + return feature_adjust_V230101(df, fcol, **kwargs) + elif method == "V240323": + return feature_adjust_V240323(df, fcol, **kwargs) + else: + raise ValueError(f"Unknown method: {method}") diff --git a/czsc/utils/__init__.py b/czsc/utils/__init__.py index 1425c04e6..91e3fa7ec 100644 --- a/czsc/utils/__init__.py +++ b/czsc/utils/__init__.py @@ -10,7 +10,7 @@ from .echarts_plot import kline_pro, heat_map from .word_writer import WordWriter from .corr import nmi_matrix, single_linear, cross_sectional_ic -from .bar_generator import BarGenerator, freq_end_time, resample_bars +from .bar_generator import BarGenerator, freq_end_time, resample_bars, format_standard_kline from .bar_generator import is_trading_time, get_intraday_times, check_freq_and_market from .io import dill_dump, dill_load, read_json, save_json from .sig import check_pressure_support, check_gap_info, is_bis_down, is_bis_up, get_sub_elements, is_symmetry_zs @@ -18,12 +18,13 @@ from .plotly_plot import KlineChart from .trade import cal_trade_price, update_nbars, update_bbars, update_tbars, risk_free_returns, resample_to_daily from .cross import CrossSectionalPerformance, cross_sectional_ranker -from .stats import daily_performance, net_value_stats, subtract_fee, weekly_performance, holds_performance +from .stats import daily_performance, net_value_stats, subtract_fee, weekly_performance, holds_performance, top_drawdowns from .signal_analyzer import SignalAnalyzer, SignalPerformance from .cache import home_path, get_dir_size, empty_cache_path, DiskCache, disk_cache from .index_composition import index_composition from .data_client import DataClient, set_url_token, get_url_token from .oss import AliyunOSS +from .optuna import optuna_study, optuna_good_params sorted_freqs = ['Tick', '1分钟', '2分钟', '3分钟', '4分钟', '5分钟', '6分钟', '10分钟', '12分钟', diff --git a/czsc/utils/bar_generator.py b/czsc/utils/bar_generator.py index e484529fa..bf6248260 100644 --- a/czsc/utils/bar_generator.py +++ b/czsc/utils/bar_generator.py @@ -39,6 +39,30 @@ def get_intraday_times(freq='1分钟', market="A股"): return freq_market_times[f"{freq}_{market}"] +def format_standard_kline(df: pd.DataFrame, freq: str): + """格式化标准K线数据为 CZSC 标准数据结构 RawBar 列表 + + :param df: 标准K线数据,DataFrame结构 + + =================== ========= ====== ======= ====== ===== =========== =========== + dt symbol open close high low vol amount + =================== ========= ====== ======= ====== ===== =========== =========== + 2023-11-17 00:00:00 689009.SH 33.52 33.41 33.69 33.38 1.97575e+06 6.61661e+07 + 2023-11-20 00:00:00 689009.SH 33.4 32.91 33.45 32.25 5.15016e+06 1.68867e+08 + =================== ========= ====== ======= ====== ===== =========== =========== + + :param freq: K线级别 + :return: list of RawBar + """ + # from czsc.objects import RawBar, Freq + bars = [] + for i, row in df.iterrows(): + bar = RawBar(id=i, symbol=row['symbol'], dt=row['dt'], open=row['open'], close=row['close'], + high=row['high'], low=row['low'], vol=row['vol'], amount=row['amount'], freq=Freq(freq)) + bars.append(bar) + return bars + + def check_freq_and_market(time_seq: List[AnyStr], freq: Optional[AnyStr] = None): """检查时间序列是否为同一周期,是否为同一市场 diff --git a/czsc/utils/cache.py b/czsc/utils/cache.py index 28244cf92..fe940a811 100644 --- a/czsc/utils/cache.py +++ b/czsc/utils/cache.py @@ -159,7 +159,7 @@ def remove(self, k: str, suffix: str = "pkl"): Path.unlink(file) if Path.exists(file) else None -def disk_cache(path: str, suffix: str = "pkl", ttl: int = -1): +def disk_cache(path: str = home_path, suffix: str = "pkl", ttl: int = -1): """缓存装饰器,支持多种数据格式 :param path: 缓存文件夹路径 diff --git a/czsc/utils/corr.py b/czsc/utils/corr.py index 8c53212ac..da35337fb 100644 --- a/czsc/utils/corr.py +++ b/czsc/utils/corr.py @@ -11,17 +11,10 @@ """ import numpy as np import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt -from sklearn import metrics from tqdm import tqdm from typing import Union -plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 -plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 - - def nmi_matrix(df: pd.DataFrame, heatmap=False) -> pd.DataFrame: """计算高维标准化互信息并以矩阵形式输出 @@ -29,6 +22,13 @@ def nmi_matrix(df: pd.DataFrame, heatmap=False) -> pd.DataFrame: :param heatmap: 是否绘制热力图 :return: """ + import seaborn as sns + import matplotlib.pyplot as plt + from sklearn import metrics + + plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 + plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 + cols = df.columns.to_list() m_dict = {} diff --git a/czsc/utils/optuna.py b/czsc/utils/optuna.py new file mode 100644 index 000000000..938e24e21 --- /dev/null +++ b/czsc/utils/optuna.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +""" +author: zengbin93 +email: zeng_bin8888@163.com +create_dt: 2024/3/21 13:56 +describe: optuna 工具函数 +""" +import hashlib +import optuna +import inspect +import pandas as pd + + +def optuna_study(objective, direction="maximize", n_trials=100, **kwargs): + """使用optuna进行参数优化""" + objective_code = inspect.getsource(objective) + study_name = hashlib.md5(f"{objective_code}_{direction}".encode("utf-8")).hexdigest().upper()[:12] + study = optuna.create_study(direction=direction, study_name=study_name) + + timeout = kwargs.pop("timeout", None) + n_jobs = kwargs.pop("n_jobs", 1) + study.optimize(objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs, **kwargs) + return study + + +def optuna_good_params(study: optuna.Study, keep=0.2) -> pd.DataFrame: + """获取optuna优化结果中的最优参数 + + :param study: optuna.study.Study + :param keep: float, 保留最优参数的比例, 默认0.2 + 如果keep小于0,则按比例保留;如果keep大于0,则保留keep个参数组 + :return: pd.DataFrame, 最优参数组列表 + """ + assert keep > 0, "keep必须大于0" + params = [] + for trail in study.trials: + if trail.state != optuna.trial.TrialState.COMPLETE: + continue + if trail.value is None: + continue + + p = {"params": trail.params, "objective": trail.value} + params.append(p) + + n = int(len(params) * keep) if keep < 1 else int(keep) + reverse = study.direction == 2 + params = sorted(params, key=lambda x: x['objective'], reverse=reverse) + dfp = pd.DataFrame(params[:n]) + dfp = dfp.drop_duplicates(subset=['params'], keep='first').reset_index(drop=True) + return dfp diff --git a/czsc/utils/st_components.py b/czsc/utils/st_components.py index 924b75006..ff53a0a4a 100644 --- a/czsc/utils/st_components.py +++ b/czsc/utils/st_components.py @@ -1,5 +1,6 @@ import czsc import hashlib +import optuna import numpy as np import pandas as pd import streamlit as st @@ -15,7 +16,7 @@ def show_daily_return(df, **kwargs): :param df: pd.DataFrame,数据源 :param kwargs: - - title: str,标题 + - sub_title: str,标题 - stat_hold_days: bool,是否展示持有日绩效指标,默认为 True - legend_only_cols: list,仅在图例中展示的列名 - use_st_table: bool,是否使用 st.table 展示绩效指标,默认为 False @@ -73,10 +74,9 @@ def _stats(df_, type_='持有日'): use_st_table = kwargs.get("use_st_table", False) with st.container(): - title = kwargs.get("title", "") - if title: - st.subheader(title) - st.divider() + sub_title = kwargs.get("sub_title", "") + if sub_title: + st.subheader(sub_title, divider="rainbow") with st.expander("交易日绩效指标", expanded=True): if use_st_table: @@ -641,7 +641,7 @@ def show_ts_self_corr(df, col, **kwargs): st.subheader(sub_title, divider="rainbow", anchor=hashlib.md5(sub_title.encode('utf-8')).hexdigest()[:8]) c1, c2, c3, c4 = st.columns(4) min_periods = int(c1.number_input('最小滑动窗口长度', value=20, min_value=0, step=1)) - window = int(c2.number_input('滑动窗口长度', value=0, step=1, help='0 表示按 expanding 方式滑动')) + window = int(c2.number_input('滑动窗口长度', value=200, step=1)) corr_method = c3.selectbox('相关系数计算方法', ['pearson', 'kendall', 'spearman']) n = int(c4.number_input('自相关滞后阶数', value=1, min_value=1, step=1)) @@ -808,3 +808,67 @@ def show_out_in_compare(df, ret_col, mid_dt, **kwargs): } ) st.dataframe(df_stats, use_container_width=True) + + +def show_optuna_study(study: optuna.Study, **kwargs): + # https://optuna.readthedocs.io/en/stable/reference/visualization/index.html + # https://zh-cn.optuna.org/reference/visualization.html + from czsc.utils.optuna import optuna_good_params + + sub_title = kwargs.pop("sub_title", "Optuna Study Visualization") + if sub_title: + anchor = hashlib.md5(sub_title.encode("utf-8")).hexdigest().upper()[:6] + st.subheader(sub_title, divider="rainbow", anchor=anchor) + + fig = optuna.visualization.plot_contour(study) + st.plotly_chart(fig, use_container_width=True) + + fig = optuna.visualization.plot_slice(study) + st.plotly_chart(fig, use_container_width=True) + + with st.expander("最佳参数列表", expanded=False): + params = optuna_good_params(study, keep=kwargs.pop("keep", 0.2)) + st.dataframe(params, use_container_width=True) + return study + + +def show_drawdowns(df, ret_col, **kwargs): + """展示最大回撤分析 + + :param df: pd.DataFrame, columns: cells, index: dates + :param ret_col: str, 回报率列名称 + :param kwargs: + + - sub_title: str, optional, 子标题 + - top: int, optional, 默认10, 返回最大回撤的数量 + + """ + assert isinstance(df, pd.DataFrame), "df 必须是 pd.DataFrame 类型" + if not df.index.dtype == 'datetime64[ns]': + df['dt'] = pd.to_datetime(df['dt']) + df.set_index('dt', inplace=True) + assert df.index.dtype == 'datetime64[ns]', "index必须是datetime64[ns]类型, 请先使用 pd.to_datetime 进行转换" + df = df[[ret_col]].copy().fillna(0) + df.sort_index(inplace=True, ascending=True) + df['cum_ret'] = df[ret_col].cumsum() + df['cum_max'] = df['cum_ret'].cummax() + df['drawdown'] = df['cum_ret'] - df['cum_max'] + + sub_title = kwargs.get('sub_title', "最大回撤分析") + if sub_title: + st.subheader(sub_title, divider="rainbow") + + top = kwargs.get('top', 10) + if top is not None: + with st.expander(f"TOP{top} 最大回撤详情", expanded=False): + dft = czsc.top_drawdowns(df[ret_col].copy(), top=10) + dft = dft.style.background_gradient(cmap='RdYlGn_r', subset=['净值回撤']) + dft = dft.background_gradient(cmap='RdYlGn', subset=['回撤天数', '恢复天数']) + dft = dft.format({'净值回撤': '{:.2%}', '回撤天数': '{:.0f}', '恢复天数': '{:.0f}'}) + st.dataframe(dft, use_container_width=True) + + drawdown = go.Scatter(x=df.index, y=df["drawdown"], fillcolor="red", fill='tozeroy', mode="lines", name="回测曲线") + fig = go.Figure(drawdown) + fig.update_layout(margin=dict(l=0, r=0, t=0, b=0)) + fig.update_layout(title="", xaxis_title="", yaxis_title="净值回撤", legend_title="回撤曲线") + st.plotly_chart(fig, use_container_width=True) diff --git a/czsc/utils/stats.py b/czsc/utils/stats.py index fad156a64..e683af713 100644 --- a/czsc/utils/stats.py +++ b/czsc/utils/stats.py @@ -363,3 +363,42 @@ def holds_performance(df, **kwargs): dfr['cost'] = dfr['change'] * fee / 10000 # 换手成本 dfr['edge_post_fee'] = dfr['edge_pre_fee'] - dfr['cost'] # 净收益 return dfr + + +def top_drawdowns(returns: pd.Series, top: int = 10) -> pd.DataFrame: + """分析最大回撤,返回最大回撤的波峰、波谷、恢复日期、回撤天数、恢复天数 + + :param returns: pd.Series, 日收益率序列,index为日期 + :param top: int, optional, 返回最大回撤的数量,默认10 + :return: pd.DataFrame + """ + returns = returns.copy() + df_cum = returns.cumsum() + underwater = df_cum - df_cum.cummax() + + drawdowns = [] + for _ in range(top): + valley = underwater.idxmin() # end of the period + peak = underwater[:valley][underwater[:valley] == 0].index[-1] + try: + recovery = underwater[valley:][underwater[valley:] == 0].index[0] + except IndexError: + recovery = np.nan # drawdown not recovered + + # Slice out draw-down period + if not pd.isnull(recovery): + underwater.drop(underwater[peak:recovery].index[1:-1], inplace=True) + else: + # drawdown has not ended yet + underwater = underwater.loc[:peak] + + drawdown = df_cum.loc[valley] - df_cum.loc[peak] + + drawdowns.append((peak, valley, recovery, drawdown)) + if (len(returns) == 0) or (len(underwater) == 0) or (np.min(underwater) == 0): + break + + df_drawdowns = pd.DataFrame(drawdowns, columns=["回撤开始", "回撤结束", "回撤修复", "净值回撤"]) + df_drawdowns['回撤天数'] = (df_drawdowns['回撤结束'] - df_drawdowns['回撤开始']).dt.days + df_drawdowns['恢复天数'] = (df_drawdowns['回撤修复'] - df_drawdowns['回撤结束']).dt.days + return df_drawdowns diff --git a/docs/requirements.txt b/docs/requirements.txt index 88d3f6e1d..5717ce801 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -29,4 +29,5 @@ lightgbm>=4.0.0 streamlit redis oss2 -statsmodels \ No newline at end of file +statsmodels +optuna \ No newline at end of file diff --git a/examples/develop/psi.py b/examples/develop/psi.py new file mode 100644 index 000000000..90082c7e2 --- /dev/null +++ b/examples/develop/psi.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +""" +author: Napoleon +create_dt: 2024/03/13 12:14 +describe: psi模型稳定性评估 + + +""" +import numpy as np +import pandas as pd + + +def psi(df: pd.DataFrame, col, n=10, **kwargs): + """PSI 群体稳定性指标,反映数据在不同分箱中的分布变化 + + PSI = ∑(实际占比 - 基准占比) * ln(实际占比 / 基准占比) + + 参考:https://zhuanlan.zhihu.com/p/79682292 风控模型—群体稳定性指标(PSI)深入理解应用 + + :param df: 数据, 必须包含 dt 和 col 列 + :param col: 要计算的列 + :param n: 分箱数 + :param kwargs: + + - scale: 是否进行标准化 + - window: 滚动窗口 + - min_periods: 最小观测数 + - dt_pattern: 时间分组格式,默认 '%Y' 表示按年分组; '%Y-%m' 表示按月分组; 按季度分组 '%Y-%q' + + :return: pd.DataFrame + """ + assert 'dt' in df.columns, '时间列必须为 dt' + assert col in df.columns, f'数据中没有 {col} 列' + df['dt'] = pd.to_datetime(df['dt']) + dt_pattern = kwargs.get('dt_pattern', '%Y') + df['key'] = df['dt'].dt.strftime(dt_pattern) + + if kwargs.get('scale', False): + window = kwargs.get('window', 2000) + min_periods = kwargs.get('min_periods', 100) + + df[col] = df[col].rolling(window=window, min_periods=min_periods).apply( + lambda x: ((x - x.mean()) / x.std())[-1], raw=True).fillna(0) + + df['bin'] = pd.qcut(df[col], n) + dfg = df.groupby(['bin', 'key'], observed=False).size().unstack().fillna(0).apply(lambda x: x / x.sum(), axis=0) + dfg['PSI'] = dfg.diff(axis=1).abs().mean(axis=1) + + # base_col = dfg.columns[0] + # for rate_col in dfg.columns[1:]: + # dfg[f"{col}_PSI"] = (dfg[rate_col] - dfg[base_col]) * np.log((dfg[rate_col] / dfg[base_col])) + # psi_cols = [x for x in dfg.columns if x.endswith('_PSI')] + # dfg['PSI'] = dfg[psi_cols].sum(axis=1) + return dfg + + +if __name__ == '__main__': + from czsc.connectors import research + df = research.get_raw_bars('000001.SH', '日线', '20170101', '20230101', fq='前复权', raw_bars=False) + + dfs = psi(df, 'close', 10, dt_pattern='%Y', scale=True) diff --git a/examples/qmt_realtime.py b/examples/qmt_realtime.py index d3df7a74b..6ba3b160d 100644 --- a/examples/qmt_realtime.py +++ b/examples/qmt_realtime.py @@ -49,6 +49,3 @@ def get_index_members(index_code='000852.SH', trade_date='20230131'): if __name__ == '__main__': manager = qmc.QmtTradeManager(**gjm) manager.run() - - - diff --git a/examples/signals_dev/fenlei.py b/examples/signals_dev/fenlei.py new file mode 100644 index 000000000..f4fbacb13 --- /dev/null +++ b/examples/signals_dev/fenlei.py @@ -0,0 +1,7 @@ +import czsc +from czsc.connectors import research + +bars = research.get_raw_bars("000001.SH", '15分钟', '20101101', '20210101', fq='前复权') + +signals_config = [{'name': "czsc.signals.tas_macd_first_bs_V221201", 'freq': "60分钟"}] +czsc.check_signals_acc(bars, signals_config=signals_config, height='780px', delta_days=5) # type: ignore diff --git a/requirements.txt b/requirements.txt index 10fbcd5c6..f9e604cf1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ lightgbm>=4.0.0 streamlit redis oss2 -statsmodels \ No newline at end of file +statsmodels +optuna \ No newline at end of file diff --git a/test/test_features.py b/test/test_features.py index e5393e073..c7aed09b6 100644 --- a/test/test_features.py +++ b/test/test_features.py @@ -1,4 +1,5 @@ import pytest +import numpy as np import pandas as pd @@ -12,3 +13,27 @@ def test_is_event_feature(): # 测试非事件类因子 df2 = pd.DataFrame({'factor': [0, 1, 2, 3, 4, 5]}) assert is_event_feature(df2, 'factor') is False + + +def test_rolling_tanh(): + from czsc.features.utils import rolling_tanh + + # Create a dummy dataframe + df = pd.DataFrame({ + 'dt': pd.date_range(start='1/1/2021', periods=500), + 'col1': np.random.rand(500) + }) + + # Apply the rolling_tanh function + result_df = rolling_tanh(df, 'col1') + assert 'col1_tanh' in result_df.columns + assert result_df['col1_tanh'].between(-1, 1).all() + + # Apply the rolling_tanh function + result_df = rolling_tanh(df, 'col1', new_col='col1_tanh2') + assert 'col1_tanh2' in result_df.columns + assert result_df['col1_tanh2'].between(-1, 1).all() + + result_df = rolling_tanh(df, 'col1', new_col='col1_tanh3', window=100, min_periods=50) + assert 'col1_tanh3' in result_df.columns + assert result_df['col1_tanh3'].between(-1, 1).all()