-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathmarket_preprocess.py
132 lines (108 loc) · 5.62 KB
/
market_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import numpy as np
import pandas as pd
from polygon_ds import get_dates_df
from polygon_df import get_symbol_details_df
from market_beta_resids import colwise_linreg_residuals
# Path to the cached per-symbol details table (feather format); read by
# symbol_details_filter() and merge_symbol_stats() below.
symbol_details_path = 'data/sym_details.feather'
# Columns loaded from the symbol-details feather file.
details_cols = [
    'symbol', 'name', 'type', 'sector', 'industry', 'hq_country', 'exchangeSymbol', 'exchange',
    'description', 'tags', 'url', 'listdate', 'cik', 'sic'
]
def all_dates_filer(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only symbols that have a bar for every trading day seen in df.

    The symbol with the most rows defines how many trading days the window
    contains; any symbol with fewer bars (i.e. missing days) is dropped.
    """
    n_trading_days = df['symbol'].value_counts().max()
    bars_per_sym = df.groupby('symbol')[['open']].count()
    full_coverage = bars_per_sym.loc[bars_per_sym['open'] >= n_trading_days].index
    kept = df.loc[df.symbol.isin(full_coverage)]
    return kept.reset_index(drop=True)
def liquidity_filter(df: pd.DataFrame, abs_dollar_cut: float, quantile_dollar_cut: float=None) -> pd.DataFrame:
    """Drop symbols whose average daily dollar volume is at or below a cutoff.

    Args:
        df: daily bars with at least 'symbol' and 'dollar_total' columns.
        abs_dollar_cut: absolute minimum average daily dollar volume.
        quantile_dollar_cut: if given, use this quantile of 'dollar_total'
            across all rows as the cutoff instead of abs_dollar_cut.

    Returns:
        Filtered copy of df with the index reset.
    """
    sym_dollar_avg = df.groupby('symbol')[['dollar_total']].mean()
    # Bug fix: original referenced an undefined name `qcut` (NameError) and
    # treated a quantile of 0.0 as "not provided" via truthiness.
    if quantile_dollar_cut is not None:
        min_dollar = df['dollar_total'].quantile(q=quantile_dollar_cut)
    else:
        min_dollar = abs_dollar_cut
    passed_sym = sym_dollar_avg.loc[sym_dollar_avg['dollar_total'] > min_dollar].index
    df_filtered = df.loc[df.symbol.isin(passed_sym)]
    return df_filtered.reset_index(drop=True)
def add_range(df: pd.DataFrame) -> pd.DataFrame:
    """Attach two columns in place: 'range' (high - low) and
    'range_value_pct' (range as a fraction of vwap). Returns the same df."""
    spread = df['high'] - df['low']
    df.loc[:, 'range'] = spread
    df.loc[:, 'range_value_pct'] = spread / df['vwap']
    return df
def range_value_filter(df: pd.DataFrame, low_cut: float, high_cut: float) -> pd.DataFrame:
    """Keep symbols whose median range_value_pct lies in [low_cut, high_cut]."""
    med_range = df.groupby('symbol')[['range_value_pct']].median()
    in_band = med_range['range_value_pct'].between(low_cut, high_cut)
    kept = df.loc[df.symbol.isin(med_range.loc[in_band].index)]
    return kept.reset_index(drop=True)
def min_value_filter(df: pd.DataFrame, min_dollar_value: float) -> pd.DataFrame:
    """Keep symbols whose median close price is strictly above min_dollar_value."""
    med_close = df.groupby('symbol')[['close']].median()
    expensive_enough = med_close.loc[med_close['close'] > min_dollar_value].index
    kept = df.loc[df.symbol.isin(expensive_enough)]
    return kept.reset_index(drop=True)
def symbol_details_filter(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only symbols listed in the details file as common stock ('CS' type)
    with a non-empty sector."""
    details = pd.read_feather(path=symbol_details_path, columns=details_cols)
    has_sector = details.sector != ''
    is_common_stock = details.type.str.upper() == 'CS'
    details = details[has_sector & is_common_stock].set_index('symbol')
    kept = df.loc[df.symbol.isin(details.index), :]
    return kept.reset_index(drop=True)
def outlier_squeeze(x, t: int=4):
    """Soft-clip a standard-normal-scaled value: identity inside [-t, t],
    while the excess beyond the threshold is compressed through tanh, so the
    output magnitude never exceeds t + 1."""
    clipped = np.clip(x, -t, t)
    overflow = x - clipped
    return clipped + np.tanh(overflow)
def filter_market(df):
    """Apply the universe-selection pipeline, printing row/symbol deltas per stage.

    Stages, in order: full date coverage, minimum average dollar volume,
    median range-to-value band, minimum median price, and symbol-details
    (common stock with sector) membership.

    Returns the filtered DataFrame.
    """
    nrows_all = df.shape[0]
    print(nrows_all, 'Initial rows', len(df.symbol.unique()), 'symbols')
    df = all_dates_filer(df)
    nrows_1 = df.shape[0]
    print((nrows_1 - nrows_all), 'all dates filter', len(df.symbol.unique()), 'symbols')
    df = liquidity_filter(df, abs_dollar_cut=500_000)
    nrows_2 = df.shape[0]
    print((nrows_2 - nrows_1), 'liquidity filter', len(df.symbol.unique()), 'symbols')
    df = range_value_filter(df, low_cut=0.005, high_cut=0.5)
    nrows_3 = df.shape[0]
    # Bug fix: label typo 'volitility' -> 'volatility'.
    print((nrows_3 - nrows_2), 'volatility filter', len(df.symbol.unique()), 'symbols')
    df = min_value_filter(df, min_dollar_value=1.0)
    nrows_4 = df.shape[0]
    print((nrows_4 - nrows_3), 'min $value filter', len(df.symbol.unique()), 'symbols')
    df = symbol_details_filter(df)
    nrows_5 = df.shape[0]
    print((nrows_5 - nrows_4), 'symbol details filter', len(df.symbol.unique()), 'symbols')
    # Bug fix: round AFTER scaling to a percentage; round(x, 3) * 100 printed
    # float noise like 12.300000000000001.
    print(df.shape[0], 'Final rows', round(df.shape[0] / nrows_all * 100, 1), '% remaining')
    return df
def merge_symbol_stats(df):
    """Per-symbol medians of range_value_pct and dollar_total, joined with
    the symbol-details table. The 'right' join keeps exactly the symbols
    present in df."""
    stats = df.groupby('symbol')[['range_value_pct', 'dollar_total']].median()
    details = pd.read_feather(path=symbol_details_path, columns=details_cols).set_index('symbol')
    sym_meta = details.join(other=stats, how='right')
    # sym_meta.pivot_table(index='industry', columns='sector', values='dollar_total', aggfunc=len)
    return sym_meta
def transform_prices(df: pd.DataFrame) -> dict:
    """Pivot long daily bars to wide close prices and derive return transforms.

    Returns a dict keyed: 'close' (date_time x symbol), 'returns' (first
    differences of close — price changes, not percentage returns),
    'log_returns', 'log_returns_zs' (z-scored log returns, population std),
    and 'log_returns_zs_g' (z-scores with outliers squeezed).
    """
    wide_close = df.set_index('date_time', drop=True).pivot(columns='symbol', values='close')
    log_ret = pd.DataFrame(np.log(wide_close)).diff().dropna()
    zscored = (log_ret - log_ret.mean()) / log_ret.std(ddof=0)
    return {
        'close': wide_close,
        'returns': wide_close.diff().dropna(),
        'log_returns': log_ret,
        'log_returns_zs': zscored,
        'log_returns_zs_g': outlier_squeeze(zscored),
        # 'log_returns_mad' variants were commented out in the original.
    }
def prepare_data(start_date: str, end_date: str, beta_symbol: str=None) -> dict:
    """Load daily market bars, filter the universe, and build the result dict.

    Args:
        start_date, end_date: date range handed to get_dates_df.
        beta_symbol: optional benchmark symbol; when given, per-symbol
            regression residuals against its squeezed z-scored log returns
            are added under 'log_returns_resid_zs_g'.

    Returns:
        dict with 'daily_price', 'symbol_meta', the transform_prices keys,
        and optionally 'log_returns_resid_zs_g'.
    """
    raw = get_dates_df(tick_type='daily', symbol='market', start_date=start_date, end_date=end_date)
    raw = add_range(raw)
    filtered = filter_market(raw)
    results = {
        'daily_price': filtered.drop(columns=['date', 'midprice', 'range']),
        'symbol_meta': merge_symbol_stats(filtered),
    }
    pivots = transform_prices(filtered)
    results.update(pivots)
    if beta_symbol:
        # Benchmark transforms come from the unfiltered frame, so the beta
        # symbol is available even if the universe filters removed it.
        bench = transform_prices(raw[raw.symbol == beta_symbol])
        results['log_returns_resid_zs_g'] = colwise_linreg_residuals(
            df=pivots['log_returns_zs_g'],
            beta_series=bench['log_returns_zs_g'][beta_symbol]
        )
    return results