Skip to content

Commit

Permalink
Merge pull request #88 from TeoMeWhy/feat/dota
Browse the repository at this point in the history
POST TRAIN
  • Loading branch information
TeoCalvo authored Nov 26, 2023
2 parents 6b51903 + 4289b43 commit 1d5a00e
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 17 deletions.
6 changes: 5 additions & 1 deletion src/03.silver/dota/models/pre_match/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@

# COMMAND ----------

radiant_prob, dire_prob = model.predict_proba(df_predict[df_predict.columns[3:]])[0]*100
dire_prob, radiant_prob = model.predict_proba(df_predict[model.feature_names_in_])[0]*100

df_dashboard = spark.createDataFrame(
pd.DataFrame(
Expand All @@ -104,3 +104,7 @@
)

df_dashboard.display()

# COMMAND ----------

df_predict[model.feature_names_in_]
2 changes: 1 addition & 1 deletion src/03.silver/dota/models/pre_match/target.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
FROM silver.dota.matches

WHERE dtMatchDay >= '2018-01-01'
AND dtMatchDay < '2023-08-24'
AND dtMatchDay < '2023-08-29'
AND idDireTeam IS NOT NULL
AND idRadiantTeam IS NOT NULL
46 changes: 31 additions & 15 deletions src/03.silver/dota/models/pre_match/training.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Databricks notebook source
# DBTITLE 1,Imports

from databricks import feature_store

import sys
Expand All @@ -9,15 +10,21 @@
import dbtools

import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 1000)

from sklearn import model_selection
from sklearn import ensemble
from sklearn import pipeline
from sklearn import tree
from sklearn import metrics
from sklearn import preprocessing

from feature_engine import encoding
from feature_engine import imputation
from feature_engine import creation
from feature_engine import selection

import lightgbm as lgb

Expand All @@ -29,7 +36,6 @@
# COMMAND ----------

# DBTITLE 1,Lookups e Target

query = dbtools.import_query("target.sql")
df = spark.sql(query)

Expand Down Expand Up @@ -63,16 +69,14 @@
)

training_df = (training_set.load_df()
.filter('nrFrequency180Radiant > 10 and nrFrequency180Dire > 10')
.filter('avgFrequency180Radiant > 10 and avgFrequency180Dire > 10')
.filter('minFrequency30Radiant > 0 and minFrequency30Dire > 0')
.toPandas())

# COMMAND ----------

# DBTITLE 1,Modelagem
to_remove = set(['descTeamNameRadiant', 'descTeamTagRadiant',
'descTeamTagDire','descTeamNameDire'])

features = list(set(training_df.columns[4:-1]) - to_remove)
features = training_df.columns[4:-1]
target = 'flRadiantWin'

X_train, X_test, y_train, y_test = model_selection.train_test_split(training_df[features],
Expand All @@ -82,8 +86,8 @@

# COMMAND ----------

print("Tamanho base de treino:", X_train.shape[0])
print("Tamanho base de teste:", X_test.shape[0])
print("Tamanho base de treino:", X_train.shape[0], "| Taxa resposta:", y_train.mean())
print("Tamanho base de teste:", X_test.shape[0], "| Taxa resposta:", y_test.mean())

# COMMAND ----------

Expand All @@ -94,17 +98,25 @@
mlflow.sklearn.autolog()

missing_0 = imputation.ArbitraryNumberImputer(arbitrary_number=0,
variables=X_test.columns.tolist())
variables=X_train.columns.tolist())

min_max = preprocessing.MinMaxScaler(feature_range=(1,2)).set_output(transform="pandas")

model = lgb.LGBMClassifier(n_jobs=-1, random_state=42)

params = {"min_child_samples":[900,1000],
"learning_rate":[0.01],
"n_estimators":[1000],
"subsample":[0.9],
"max_depth":[15]}
params = {
"learning_rate":[0.1, 0.01],
"n_estimators":[500,1000],
"min_child_samples":[250,400,800],
"num_leaves": [10,20,30,50,100,200,500]
}

grid = model_selection.GridSearchCV(model, cv=3, param_grid=params, scoring='roc_auc', verbose=3)
grid = model_selection.GridSearchCV(model,
cv=3,
param_grid=params,
scoring='roc_auc',
verbose=3,
n_jobs=1)

model_pipe = pipeline.Pipeline(
[('imputer', missing_0),
Expand Down Expand Up @@ -152,3 +164,7 @@

scikitplot.metrics.plot_lift_curve(y_true=y_test, y_probas=proba_test)
plt.show()

# COMMAND ----------


0 comments on commit 1d5a00e

Please sign in to comment.