-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathprediction.py
122 lines (97 loc) · 5.14 KB
/
prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
import numpy as np
import argparse, os, sys
from pmdarima.arima import auto_arima
from statsmodels.tsa.vector_ar.vecm import VECM
from sklearn import preprocessing
from sklearn.exceptions import ConvergenceWarning
sys.path.append ("src")
from tools.csv_helper import *
#from selection.gfsm_fselection import gfsm_feature_selection
from selection.pehar_fselection import pehar_feature_selection
from pre_selection.granger_causality import causality_matrix
'''import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)'''
def make_dir(dir):
    """Create the directory ``dir`` if nothing exists at that path yet.

    Missing parent directories are created as well. If the path already
    exists (as a directory or as anything else) the function does nothing.

    Args:
        dir: path of the directory to create. The name shadows the builtin
            ``dir`` but is kept so that keyword callers keep working.
    """
    if not os.path.exists(dir):
        # exist_ok covers the window between the check above and the actual
        # creation, in case something else creates the directory meanwhile.
        os.makedirs(dir, exist_ok=True)
def predict_target(data, prediction_model, method, nb_predictors, causality_graph):
    """Forecast the next five values of the current target variable.

    NOTE: besides its parameters, this function reads two module-level
    names: ``target`` (name of the column of ``data`` to forecast) and
    ``lag`` (passed to VECM as ``k_ar_diff``). Both are assigned by the
    ``__main__`` script of this file before the function is called.

    Args:
        data: table whose columns are accessed by name (``.loc``) and by
            position (``.iloc``); it must contain a column named ``target``.
        prediction_model: model name. Any value containing "arima"
            (case-insensitive, so both "Arima" and "ARIMA" match) fits
            ``auto_arima`` on the target column alone; every other value
            fits a VECM on the target plus the selected predictors.
        method: feature-selection method name; must contain "PEHAR" or
            "GFSM". Only used for the VECM model.
        nb_predictors: number of predictors to keep (converted with
            ``int`` when not ``None``). Only used for the VECM model.
        causality_graph: passed unchanged to the feature-selection function.
            Only used for the VECM model.

    Returns:
        The five forecasts for the target variable, or a list of five NaN
        values when fitting or forecasting fails.

    Raises:
        ValueError: if ``target`` is not a column of ``data`` or if
            ``method`` names no known feature-selection method.
    """
    n_steps = 5
    target_index = list(data.columns).index(target)

    if "arima" in prediction_model.lower():
        series = data.loc[:, target].values
        try:
            model = auto_arima(series, start_p=1, start_q=1)
            predictions = model.fit_predict(series, n_periods=n_steps)
        except Exception as err:
            # Best effort: a failed fit yields NaN forecasts, not a crash.
            print("ARIMA failed for this variable (%s)" % err)
            predictions = [float("nan")] * n_steps
        return predictions

    # The CLI delivers this value as text unless it was declared with a type.
    if nb_predictors is not None:
        nb_predictors = int(nb_predictors)

    if "PEHAR" in method:
        selected = pehar_feature_selection(causality_graph, target_index)[0:nb_predictors]
    elif "GFSM" in method:
        # The module-level import of this function is commented out in this
        # file, so it is imported lazily from the same path here.
        from selection.gfsm_fselection import gfsm_feature_selection
        selected = gfsm_feature_selection(causality_graph, target_index, nb_predictors)
    else:
        raise ValueError("Unknown feature selection method: %r" % (method,))

    # The target variable goes in the first column so that column 0 of the
    # VECM forecast is the target's forecast.
    predictors_index = [target_index, *selected]
    X_train = data.iloc[:, predictors_index].values

    try:
        model = VECM(X_train, k_ar_diff=lag, deterministic="ci").fit()
        predictions = model.predict(steps=n_steps)[:, 0]
    except Exception as err:
        # Best effort: a failed fit yields NaN forecasts, not a crash.
        print("VECM failed for this variable (%s)" % err)
        predictions = [float("nan")] * n_steps
    return predictions
#=======================================================================================================================
if __name__ == '__main__':
    # Command-line entry point: read a dataset and its metadata, compute the
    # causality graphs, forecast every target variable listed in the metadata
    # and write the forecasts plus the model settings under Processed/<name>/.
    #
    # NOTE: this code intentionally stays at module level (not inside a
    # main() function): predict_target reads the globals `target` and `lag`
    # that are assigned below.
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", "-d", required=True, help="file of data to process")
    parser.add_argument("--mode", "-m", required=True, help="prediction mode", choices=["eval", "direct"])
    parser.add_argument("--pred_model", "-pr", help="prediction model", choices=["VECM", "ARIMA"])
    parser.add_argument("--fs_method", "-fsm", help="feature selection method", choices=["PEHAR_te", "PEHAR_gc"])
    parser.add_argument("--nb_predictors", "-nbp", type=int, help="number of variables to select")
    parser.add_argument("--graph_type", "-g", help="Causality graph to use, ganger causality (gc), or transfer entropy (te)", choices=["gc", "te"])
    args = parser.parse_args()

    # In direct mode the model settings come from the command line, so fail
    # early with a usage message instead of crashing later on a None value.
    if args.mode == "direct":
        if args.pred_model is None:
            parser.error("--pred_model is required in direct mode")
        if args.pred_model == "VECM" and None in (args.fs_method, args.nb_predictors, args.graph_type):
            parser.error("--fs_method, --nb_predictors and --graph_type are required with the VECM model")

    # Read data and get information from metadata
    data = read_csv_and_metadata_2(args.data)
    data_name = args.data.split("/")[-1].split('.')[0]
    lag = int(data._metadata["lag_parameter"][0])
    targets_ts = data._metadata["predict"]

    # Create output directory if it does not exist
    make_dir("Processed")
    out_dir = "Processed/%s" % data_name
    make_dir(out_dir)

    # Compute causality graphs
    gc_causality_graph = causality_matrix(data.values, lag)
    # NOTE(review): the "te" graph is computed with the same function as the
    # "gc" graph; no transfer-entropy routine is imported in this file.
    # Confirm whether a dedicated implementation should be used here.
    te_causality_graph = causality_matrix(data.values, lag)
    causality_graphs = {"gc": gc_causality_graph, "te": te_causality_graph}
    pd.DataFrame(gc_causality_graph, index=data.columns, columns=data.columns).to_csv("%s/causality_graph_gc.csv" % out_dir, sep=';')

    if args.mode == "eval":
        # Per-target settings are read from this sheet: rows "predict",
        # "method" and "group", one column per target variable.
        evaluation_results = pd.read_excel("results/evaluation/%s/%s_best.xlsx" % (data_name, data_name), index_col=0)

    all_predictions = {}
    info_models = []

    for target in targets_ts:
        if args.mode == "eval":
            pred_model = evaluation_results.loc["predict", target]
            # The "method" cell starts with the number of predictors,
            # followed by ':'.
            nb_predictors = int(evaluation_results.loc["method", target].split(':')[0])
            method = evaluation_results.loc["group", target]
            # The graph type is the last '_'-separated token of the group.
            graph_type = evaluation_results.loc["group", target].split('_')[-1]
        elif args.mode == "direct":
            pred_model = args.pred_model
            method = args.fs_method
            graph_type = args.graph_type
            nb_predictors = args.nb_predictors

        info_models.append([target, pred_model, nb_predictors, method, graph_type])

        # Pick the causality graph for this target. An unknown graph type
        # gives None, so a graph left over from a previous target is never
        # reused by accident.
        causality_graph = causality_graphs.get(graph_type)

        predictions = predict_target(data, pred_model, method, nb_predictors, causality_graph)

        # Collect the predictions of all target variables
        all_predictions[target] = predictions

    pd.DataFrame(all_predictions).to_csv("%s/predictions.csv" % out_dir, sep=";", index=False)
    pd.DataFrame(info_models, columns=["Variables", "Prediction_model", "Nb_predictors", "Reduction_method", "Graph_type"]).to_csv("%s/info_models.csv" % out_dir, sep=";", index=False)