-
Notifications
You must be signed in to change notification settings - Fork 0
/
soccer_prediction2.py
128 lines (101 loc) · 4.96 KB
/
soccer_prediction2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import pickle
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
from sklearn.model_selection import train_test_split
# Importing or Loading dataset
data = "C:/Users/Gabriel/Desktop/full_dataset2.csv"
df = pd.read_csv(data, delimiter=',')
df = df[(df.league_country=='England') | (df.league_country=='Spain') | (df.league_country=='Germany')
| (df.league_country=='France') | (df.league_country=='Turkey') | (df.league_country=='Belgium') | (df.league_country=='Netherlands')
| (df.league_country=='Italy') | (df.league_country=='Portugal') | (df.league_country=='USA') | (df.league_country=='South-Africa')]
def data_transform():
# feature selection
new_df = df.drop(df.iloc[:, 0:23], axis=1,
inplace=False) # Drop a number of categorical features after performing feature correlation
new_df = new_df.drop(['type'], axis=1, inplace=False) # Drop the type column
new_df = new_df.dropna(axis=0, inplace=False) # Drop any row with missing values
# Obtain target class from the score column
new_df['outcome'] = new_df['score_fulltime'].apply(lambda i: 1 if i[0] > i[-1] else (2 if i[0] < i[-1] else 0))
new_df = new_df.drop(['score_fulltime'], axis=1, inplace=False) # Drop the score_fulltime column
# Removing % sign
columns_to_check = new_df.columns
new_df[columns_to_check] = new_df[columns_to_check].replace({r'\%': ''}, regex=True)
# Converting the feature values with % to floating values
new_df.iloc[:, 0:21] = new_df.iloc[:, 0:21].astype('float') / 100.0
# Remove more rows with missing values
new_df = new_df.dropna(axis=0, inplace=False)
return new_df
X = data_transform().drop('outcome',axis=1)
y = data_transform()['outcome'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scale = MinMaxScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
print('Cross Validation')
# Cross validation of 10 folds and 5 runs
cv_method = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)
# Support Vector Machines
SVM_clasf = SVC(probability=True)
# Create a dictionary of SVM hyperparameters
# Parameter space for rbf kernels
params_SVC = {'kernel':['rbf'],'C':np.linspace(0.1,1.0),
'gamma':['auto']}
# Using Random Search to explore the best parameter for the a SVM model
SVC_Grid = RandomizedSearchCV(SVM_clasf,params_SVC,scoring='accuracy',cv=cv_method)
# Fitting the parameterized model
SVC_Grid.fit(X_train,y_train)
# Print the best parameter values
print('SVC Best Parameter Values:', SVC_Grid.best_params_)
SVC = SVC_Grid.best_estimator_
print('\n')
# GradientClassifier
GBC = GradientBoostingClassifier()
# Create a dictionary of SVM hyperparameters
# Parameter space for rbf kernels
params_GBC = {'n_estimators':np.arange(100,350,50),'learning_rate':np.linspace(0.001,0.1),
'max_depth': np.arange(2,21)}
# Using Random Search to explore the best parameter for the a SVM model
GBC_Grid = RandomizedSearchCV(GBC,params_GBC,scoring='accuracy',cv=cv_method, n_jobs=-1)
# Fitting the parameterized model
GBC_Grid.fit(X_train,y_train)
# Print the best parameter values
print('GB Best Parameter Values:', GBC_Grid.best_params_)
GB = GBC_Grid.best_estimator_
print('\n')
# XGBClassifier
XGBC = XGBClassifier(n_jobs=-1, booster='gbtree')
# Create a dictionary of SVM hyperparameters
# Parameter space for rbf kernels
params_XGBC = {'n_estimators':np.arange(100,350,50),'eta':[0.0001, 0.001, 0.01, 0.1, 1.0],
'max_depth': np.arange(2,21)}
# Using Random Search to explore the best parameter for the a SVM model
XGBC_Grid = RandomizedSearchCV(XGBC,params_XGBC,scoring='accuracy',cv=cv_method, n_jobs=-1)
# Fitting the parameterized model
XGBC_Grid.fit(X_train,y_train)
# Print the best parameter values
print('XGB Best Parameter Values:', XGBC_Grid.best_params_)
XGB = XGBC_Grid.best_estimator_
print('\n')
# LGBMClassifier
LGBMC = LGBMClassifier(max_depth=-1, n_jobs=-1)
# Create a dictionary of SVM hyperparameters
# Parameter space for rbf kernels
params_LGBMC = {'n_estimators': np.range(100,350,50),'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1.0],
'boosting_type':['gbdt', 'rf']}
# Using Random Search to explore the best parameter for the a SVM model
LGBMC_Grid = RandomizedSearchCV(LGBMC,params_LGBMC,scoring='accuracy',cv=cv_method, n_jobs=-1)
# Fitting the parameterized model
LGBMC_Grid.fit(X_train,y_train)
# Print the best parameter values
print('LGBM Best Parameter Values:', LGBMC_Grid.best_params_)
LGBM = LGBMC_Grid.best_estimator_
print('\n')