-
Notifications
You must be signed in to change notification settings - Fork 1
/
data.py
165 lines (137 loc) · 5.69 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import statsmodels.api as sm
import streamlit as st
import pandas as pd
import numpy as np
import zipfile
import shutil
import os
from typing import Dict
from urllib.request import urlopen
from sklearn.metrics import (
mean_squared_error,
mean_absolute_error,
mean_absolute_percentage_error
)
import constant
@st.cache
def download_data():
    """Download the monthly CSV and split it into observations and metadata.

    Returns:
        A ``(data, transform_mapping)`` tuple where ``data`` holds every row
        after the first, indexed by the parsed ``sasdate`` column (named
        ``datetime``), and ``transform_mapping`` is the first row of the raw
        file kept as a one-row DataFrame (presumably the per-series
        transformation codes -- confirm against the upstream file format).
    """
    raw = pd.read_csv(constant.MONTHLY_DATA_URL)
    transform_mapping = raw.iloc[[0]]

    # Take an explicit copy: assigning a new column onto a slice of ``raw``
    # triggers pandas' SettingWithCopyWarning.
    data = raw.iloc[1:].copy()
    data['datetime'] = pd.to_datetime(data['sasdate'])
    data = data.set_index('datetime').drop(columns=['sasdate'])

    # Drop the last row when it is entirely empty; guard against a file that
    # contains no observation rows at all.
    if not data.empty and data.iloc[-1].isnull().all():
        data = data.iloc[:-1]
    return (data, transform_mapping)
@st.cache
def download_appendix():
    """Download the appendix zip archive and load its CSV as a DataFrame.

    The archive is saved to the current working directory, the appendix CSV
    is extracted next to it (under the "FRED-MD Appendix" folder), and the
    archive is deleted again -- even when the download or extraction fails.

    Returns:
        The appendix table, with the ``fred`` value "IPB51222s" normalised to
        "IPB51222S".
    """
    folder_name = "FRED-MD Appendix"
    zip_path = f"{folder_name}.zip"
    csv_member = f"{folder_name}/{constant.MONTHLY_APPENDIX_CSV}"

    try:
        with urlopen(constant.MONTHLY_APPENDIX_URL) as response, \
                open(zip_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        # Extract only the required file from the archive.
        with zipfile.ZipFile(zip_path) as zf:
            zf.extract(csv_member)
    finally:
        # Never leave a (possibly partial) archive behind. The existence
        # check keeps an earlier failure from being masked by a
        # FileNotFoundError raised here.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    data = pd.read_csv(csv_member, encoding='cp1252')
    data['fred'] = data['fred'].replace("IPB51222s", "IPB51222S")
    return data
def transformation_table():
    """Return the lookup table from transformation id (1-7) to its name."""
    descriptions = [
        "No transformation",
        "First difference",
        "Second difference",
        "Natural Log",
        "% Change",
        "First difference of % Change",
        "Exact % Change",
    ]
    # Ids are consecutive and start at 1, in the same order as the names.
    ids = range(1, len(descriptions) + 1)
    return pd.DataFrame(
        data={"Transformation ID": ids, "Description": descriptions}
    )
def grouping_table():
    """Return the lookup table from group id (1-8) to group description."""
    group_names = [
        "Output and Income",
        "Labor Market",
        "Housing",
        "Consumption, orders, and inventories",
        "Money and credit",
        "Interest and exchange rates",
        "Prices",
        "Stock Market",
    ]
    # Pair each description with its 1-based group id.
    rows = list(enumerate(group_names, start=1))
    return pd.DataFrame(rows, columns=['group', 'group description'])
def get_transform_mapping(appendix):
    """Build the per-series transformation table from the appendix.

    Keeps the ``tcode``, ``fred``, ``description`` and ``group`` columns of
    ``appendix``, overrides the transformation code of a few series, and
    joins in the human-readable transformation and group descriptions.
    Both joins are inner joins, so rows whose ``tcode`` or ``group`` has no
    match in the lookup tables are dropped.
    """
    # The recommended transformations are used, with a few changes: these
    # series are not stationary with the original transformations.
    overridden_series = ['HOUSTMW', 'HOUSTS', 'PERMITNE', 'PERMITMW']

    mapping = appendix[['tcode', 'fred', 'description', 'group']].copy()
    is_overridden = mapping['fred'].isin(overridden_series)
    mapping.loc[is_overridden, 'tcode'] = 5.0

    with_transform_names = pd.merge(
        mapping, transformation_table(),
        how="inner", left_on="tcode", right_on="Transformation ID")
    with_group_names = pd.merge(
        with_transform_names, grouping_table(),
        how="inner", left_on="group", right_on="group")

    column_renames = {
        "description": "fred description",
        "Description": "Transformation Description",
    }
    return (
        with_group_names
        .drop(columns="Transformation ID")
        .rename(columns=column_renames)
    )
def transform_data(data_df, tf):
    """Transform data to be stationary.

    Args:
        data_df: DataFrame with one series per column. It is not modified.
        tf: Either a single transformation code applied to every column, or
            a DataFrame with ``fred`` (column name) and ``tcode`` columns
            giving one code per series.

    Returns:
        A transformed copy of ``data_df``. Codes: 1 = unchanged,
        2 = first difference, 3 = second difference, 4 = natural log,
        5 = log difference * 100, 6 = second log difference * 100,
        7 = exact percentage change.

    Raises:
        ValueError: If a transformation code is not one of 1-7.
    """
    transformed_data_df = data_df.copy()
    per_series_codes = isinstance(tf, pd.DataFrame)

    for col_name in transformed_data_df.columns:
        if per_series_codes:
            try:
                tf_idx = tf[tf['fred'] == col_name].iloc[0]['tcode']
            except IndexError as error:
                st.error(f"{error} from column name {col_name}")
                # No code is known for this column. Leave it untouched rather
                # than reusing the previous column's code (or hitting an
                # unbound name on the first column).
                continue
        else:
            tf_idx = tf

        series = transformed_data_df[col_name]
        if tf_idx == 1:
            continue
        elif tf_idx == 2:
            transformed_data_df[col_name] = series.diff()
        elif tf_idx == 3:
            transformed_data_df[col_name] = series.diff().diff()
        elif tf_idx == 4:
            transformed_data_df[col_name] = np.log(series)
        elif tf_idx == 5:
            transformed_data_df[col_name] = np.log(series).diff() * 100
        elif tf_idx == 6:
            transformed_data_df[col_name] = np.log(series).diff().diff() * 100
        elif tf_idx == 7:
            transformed_data_df[col_name] = ((series / series.shift(1)) - 1.0) * 100
        else:
            raise ValueError(
                f"Unknown transformation code {tf_idx!r} for column {col_name}"
            )
    return transformed_data_df
def remove_outliers(dta):
    """Replace extreme observations with NaN, column by column.

    An entry is treated as an outlier when it lies more than 10 times the
    column's interquartile range away from the column mean.

    Args:
        dta: DataFrame with one series per column. It is not modified.

    Returns:
        A copy of ``dta`` in which outliers are NaN (denoting a missing
        entry).
    """
    # Compute the mean and interquartile range of every column.
    mean = dta.mean()
    quartiles = dta.quantile([0.25, 0.75])
    iqr = quartiles.loc[0.75] - quartiles.loc[0.25]

    # Measure the distance *from the mean*. Comparing |x| against
    # mean + 10 * iqr is only equivalent for zero-mean columns, and blanks an
    # entire column whenever mean + 10 * iqr is negative.
    mask = (dta - mean).abs() > 10 * iqr
    treated = dta.copy()
    treated[mask] = np.nan
    return treated
def is_stationary(df, alpha=0.05):
    """
    Run the Augmented Dickey-Fuller test (regression='ct') on ``df``.

    Returns a 3-tuple:
        stationary: True if the p-value is below ``alpha``
        adf_ts: ADF test statistic
        pvalue: MacKinnon's approximate p-value
    """
    adf_result = sm.tsa.adfuller(df, regression='ct')
    # Only the first two entries of the result are needed here.
    adf_ts = adf_result[0]
    pvalue = adf_result[1]
    stationary = pvalue < alpha
    return (stationary, adf_ts, pvalue)
def get_pred_metrics(
    actual: pd.Series, preds: Dict[str, pd.Series]
) -> pd.DataFrame:
    """Score each prediction series against the actual values.

    Missing values in ``actual`` are forward-filled before scoring.

    Args:
        actual: Observed values.
        preds: Mapping from a model/column label to its predicted series.

    Returns:
        A DataFrame with one row per metric ("Mean Squared Error",
        "Mean Absolute Error", "Mean Absolute Percentage Error") and one
        column per key of ``preds``.
    """
    cols = list(preds.keys())

    # Forward-fill once up front. ``fillna(method="ffill")`` is deprecated in
    # recent pandas releases; ``ffill()`` is the equivalent call.
    filled_actual = actual.ffill()

    scorers = {
        "Mean Squared Error": mean_squared_error,
        "Mean Absolute Error": mean_absolute_error,
        "Mean Absolute Percentage Error": mean_absolute_percentage_error,
    }
    metrics = {
        label: [scorer(filled_actual, preds[col]) for col in cols]
        for label, scorer in scorers.items()
    }
    return pd.DataFrame.from_dict(metrics, orient="index", columns=cols)