Commit

Merge pull request #85 from TeoMeWhy/feat/dota
Training and predict with ML model
TeoCalvo authored Aug 24, 2023
2 parents a90b0dc + 98d4071 commit 1ae56bb
Showing 3 changed files with 272 additions and 0 deletions.
106 changes: 106 additions & 0 deletions src/03.silver/dota/models/pre_match/predict.py
@@ -0,0 +1,106 @@
# Databricks notebook source
import datetime
import pandas as pd
import mlflow
from databricks import feature_store

from pyspark.sql import functions as F

import sys

sys.path.insert(0, '../../../../lib/')

import dbtools

model = mlflow.sklearn.load_model("models:/dota_pre_match/production")

# COMMAND ----------

dbutils.widgets.text(label="Radiant", name="Radiant", defaultValue="")
dbutils.widgets.text(label="Dire", name="Dire", defaultValue="")

radiant_name = dbutils.widgets.get("Radiant")
dire_name = dbutils.widgets.get("Dire")

print(radiant_name)
print(dire_name)

# COMMAND ----------

df_teams = (spark.table("silver.dota.team_last_seen")
.select("idTeam", "descTeamName")
.toPandas())

radiant_id = ""
try:
    radiant_id = df_teams[df_teams['descTeamName'] == radiant_name]['idTeam'].iloc[0]
except IndexError:
    print("Check the Radiant team name")

dire_id = ""
try:
    dire_id = df_teams[df_teams['descTeamName'] == dire_name]['idTeam'].iloc[0]
except IndexError:
    print("Check the Dire team name")

text = f"Radiant: {radiant_name}({radiant_id}) x {dire_name}({dire_id}) :Dire"

print(text)

# COMMAND ----------

dt_reference = datetime.datetime.now().strftime("%Y-%m-%d")

df = spark.createDataFrame(
pd.DataFrame({
"dtReference": [dt_reference],
"idTeamDire":[dire_id],
"idTeamRadiant":[radiant_id],
})
)

df_teams_fs_dt = (spark.table('feature_store.dota_teams_0')
.filter(f"dtReference = '{dt_reference}'")
.drop(F.col("idTeamRadiant"),
F.col("descTeamNameRadiant"),
F.col("descTeamTagRadiant"),
F.col("idTeamDire"),
F.col("descTeamNameDire"),
F.col("descTeamTagDire")))

df_radiant_fs = (df_teams_fs_dt.pandas_api()
.rename(columns= {i:f"{i}Radiant" for i in df_teams_fs_dt.columns} )
.to_spark())

df_dire_fs = (df_teams_fs_dt.pandas_api()
.rename(columns= {i:f"{i}Dire" for i in df_teams_fs_dt.columns} )
.to_spark())


df_predict = (df.join( df_radiant_fs.alias("radiant"),
df.idTeamRadiant==df_radiant_fs.idTeamRadiant,
"left")
.join(df_dire_fs.alias("dire"),
df.idTeamDire==df_dire_fs.idTeamDire,
"left")
.drop(F.col("radiant.dtReferenceRadiant"),F.col("radiant.idTeamRadiant"))
.drop(F.col("dire.dtReferenceDire"),F.col("dire.idTeamDire"))
.toPandas()
)

# COMMAND ----------

# predict_proba columns follow model.classes_ ([0, 1]), so index 1 is P(flRadiantWin = 1), i.e. the Radiant win probability
dire_prob, radiant_prob = model.predict_proba(df_predict[df_predict.columns[3:]])[0] * 100

df_dashboard = spark.createDataFrame(
pd.DataFrame(
{
"descRadiantTeam": [radiant_name],
"probRadiant": [radiant_prob],
"descDireTeam": [dire_name],
"probDire": [dire_prob],
}
)
)

df_dashboard.display()
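The notebook ends by displaying the result. If a dashboard needs to read these predictions from a table instead, a persistence step along the following lines could be appended; the target table name and the extra reference-date column are assumptions, not part of this commit.

# Hypothetical follow-up cell: persist the prediction so a SQL dashboard can query it.
# The table name "silver.dota.pre_match_predictions" is an assumption.
(df_dashboard
    .withColumn("dtReference", F.lit(dt_reference))
    .write
    .mode("append")
    .saveAsTable("silver.dota.pre_match_predictions"))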
12 changes: 12 additions & 0 deletions src/03.silver/dota/models/pre_match/target.sql
@@ -0,0 +1,12 @@
SELECT idMatch,
flRadiantWin,
string(dtMatchDay) AS dtReference,
idDireTeam AS idTeamDire,
idRadiantTeam AS idTeamRadiant

FROM silver.dota.matches

WHERE dtMatchDay >= '2018-01-01'
AND dtMatchDay < '2023-08-24'
AND idDireTeam IS NOT NULL
AND idRadiantTeam IS NOT NULL
154 changes: 154 additions & 0 deletions src/03.silver/dota/models/pre_match/training.py
@@ -0,0 +1,154 @@
# Databricks notebook source
# DBTITLE 1,Imports
from databricks import feature_store

import sys

sys.path.insert(0, '../../../../lib/')

import dbtools

import pandas as pd

from sklearn import model_selection
from sklearn import ensemble
from sklearn import pipeline
from sklearn import tree
from sklearn import metrics

from feature_engine import encoding
from feature_engine import imputation

import lightgbm as lgb

import mlflow

import scikitplot
import matplotlib.pyplot as plt

# COMMAND ----------

# DBTITLE 1,Lookups and Target

query = dbtools.import_query("target.sql")
df = spark.sql(query)

features_lookup = spark.table('feature_store.dota_teams_0').columns[8:]

lookups = [
feature_store.FeatureLookup(
table_name = 'feature_store.dota_teams_0',
feature_names = features_lookup,
lookup_key = ['dtReference', 'idTeamRadiant'],
rename_outputs = {i:f'{i}Radiant' for i in features_lookup}
),
feature_store.FeatureLookup(
table_name = 'feature_store.dota_teams_0',
feature_names = features_lookup,
lookup_key = ['dtReference', 'idTeamDire'],
rename_outputs = {i:f'{i}Dire' for i in features_lookup}
)
]

# COMMAND ----------

# DBTITLE 1,ABT

fs_client = feature_store.FeatureStoreClient()
training_set = fs_client.create_training_set(
df=df,
feature_lookups=lookups,
label="flRadiantWin",
exclude_columns=['descTeamNameRadiant', 'descTeamTagRadiant', 'descTeamTagDire', 'descTeamNameDire']
)

training_df = (training_set.load_df()
.filter('nrFrequency180Radiant > 10 and nrFrequency180Dire > 10')
.toPandas())

# COMMAND ----------

# DBTITLE 1,Modeling
to_remove = set(['descTeamNameRadiant', 'descTeamTagRadiant',
'descTeamTagDire','descTeamNameDire'])

features = list(set(training_df.columns[4:-1]) - to_remove)
target = 'flRadiantWin'

X_train, X_test, y_train, y_test = model_selection.train_test_split(training_df[features],
training_df[target],
test_size=0.2,
random_state=42)

# COMMAND ----------

print("Tamanho base de treino:", X_train.shape[0])
print("Tamanho base de teste:", X_test.shape[0])

# COMMAND ----------

mlflow.set_experiment("/Users/[email protected]/dota_pre_match")

with mlflow.start_run():

    mlflow.sklearn.autolog()

    missing_0 = imputation.ArbitraryNumberImputer(arbitrary_number=0,
                                                  variables=X_test.columns.tolist())

    model = lgb.LGBMClassifier(n_jobs=-1, random_state=42)

    params = {"min_child_samples": [900, 1000],
              "learning_rate": [0.01],
              "n_estimators": [1000],
              "subsample": [0.9],
              "max_depth": [15]}

    grid = model_selection.GridSearchCV(model, cv=3, param_grid=params, scoring='roc_auc', verbose=3)

    model_pipe = pipeline.Pipeline(
        [('imputer', missing_0),
         ('model', grid)]
    )

    model_pipe.fit(X_train, y_train)

    pred_test = model_pipe.predict(X_test)
    proba_test = model_pipe.predict_proba(X_test)[:, 1]

    acc_test = metrics.accuracy_score(y_test, pred_test)
    auc_test = metrics.roc_auc_score(y_test, proba_test)

    mlflow.log_metrics({"test_roc_auc": auc_test,
                        "test_accuracy_score": acc_test})

# COMMAND ----------

pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score')
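Beyond the full cv_results_ table above, the fitted grid also exposes the winning configuration directly; a small sketch, not part of the original notebook:

# Sketch (not in the original commit): best hyper-parameter combination and its cross-validated ROC AUC
print("Best params:", grid.best_params_)
print("Best CV ROC AUC:", round(grid.best_score_, 4))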

# COMMAND ----------

# DBTITLE 1,Predict Test
pred_train = model_pipe.predict(X_train)
proba_train = model_pipe.predict_proba(X_train)

pred_test = model_pipe.predict(X_test)
proba_test = model_pipe.predict_proba(X_test)

# COMMAND ----------

scikitplot.metrics.plot_roc(y_true=y_test,
y_probas=proba_test,
plot_micro=False,
plot_macro=False )
plt.show()

# COMMAND ----------

scikitplot.metrics.plot_ks_statistic(y_true=y_test, y_probas=proba_test)
plt.show()

# COMMAND ----------

scikitplot.metrics.plot_lift_curve(y_true=y_test, y_probas=proba_test)
plt.show()
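predict.py loads models:/dota_pre_match/production, so the pipeline trained here has to be registered under that name and promoted to the Production stage at some point. This commit does not include that step; a minimal sketch of how it could look, assuming autolog stored the fitted pipeline under the default "model" artifact path:

# Hypothetical registration cell (not in this commit): push the autologged pipeline
# to the Model Registry under the name predict.py expects, then promote it.
from mlflow.tracking import MlflowClient

run_id = mlflow.last_active_run().info.run_id  # run created in the training cell above
version = mlflow.register_model(f"runs:/{run_id}/model", "dota_pre_match")

MlflowClient().transition_model_version_stage(
    name="dota_pre_match",
    version=version.version,
    stage="Production",  # stage read by mlflow.sklearn.load_model in predict.py
)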
