# main.py

import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from preprocessor import data_today
from sklearn.svm import SVC
from datetime import datetime
from win10toast import ToastNotifier
import os
import re
import click
import inspect
import sys

# output settings
# Separator lines used when a training report is appended to the log file.
asterisk = 30 * '*'
minus = 30 * '-'
# Timestamp suffix (_day_month_year__hour_minute) computed once at import
# time and embedded in the names of the pickled artefacts.
# NOTE: this rebinds the `data_today` name imported from `preprocessor`.
data_today = format(datetime.now(), "_%d_%m_%Y__%H_%M")

# Working directory at import time; output paths are built relative to it.
current_directory = os.getcwd()

# Empty module-level frame; `command_line` builds its own local `dataset`.
dataset = pd.DataFrame()


@click.command()
@click.option("--input_path", "-i", required=True, prompt="Insert input directory", help='Input directory where are stored csv files', type=click.Path(exists=True))
@click.option("--model", "-m", required=True, help='Select which model use for train your data',
              type=click.Choice(['svc', 'lr', 'rf', 'all'], case_sensitive=False), multiple=True)
@click.option("--svc_c", help='Select the value of the parameter C for SVC', type=int)
@click.option("--lr_iter", help='Select the value of the parameter max-max_iter for Logistic Regression', type=int)
@click.option("--rf_estimators", help='Select the value of the parameter n_estimators for Random Forest', type=int)
def command_line(input_path, model, svc_c, lr_iter, rf_estimators):
    """Enter the path in which the data is present and select which ML model to use.
       You can also personalize the list of parameters to test."""
    # NOTE: the docstring above is kept verbatim because click uses it as the
    # command's --help text.
    #
    # Flow: validate the option combination, then load the CSV files, build
    # the (name, classifier, grid) triples and train them.
    known_models = ("svc", "lr", "rf")
    custom_values = {"svc": svc_c, "lr": lr_iter, "rf": rf_estimators}

    # The choice is declared case-insensitive; lower-case defensively in case
    # click hands back the user's original casing.
    requested = [choice.lower() for choice in model]
    if "all" in requested:
        # "all" is expanded here so that it behaves exactly like selecting
        # every model explicitly.
        selected = list(known_models)
    else:
        # Deduplicate while keeping a deterministic, canonical order.
        selected = [key for key in known_models if key in requested]

    # A custom parameter only makes sense for a selected model. Compare with
    # None (not truthiness) so that an explicit 0 is not mistaken for "unset".
    orphaned = [key for key, value in custom_values.items()
                if value is not None and key not in selected]
    if orphaned:
        click.echo("It's necessary to select the model before personalize its testing parameters!")
        return

    # Each value is wrapped in a list, the shape setup_model receives.
    dict_params = {key: [value] for key, value in custom_values.items()}

    # Load the data only after validation succeeded to avoid wasted I/O.
    dataset = load_df(os.path.abspath(input_path))
    names, classifiers, parameter = setup_model(selected, dict_params)
    train_data(zip(names, classifiers, parameter), dataset)


def load_df(path):
    """Load every matching CSV file found in *path* into a single DataFrame.

    A file matches when its name contains ``df_last`` followed by optional
    digits and ends with ``.csv``. Files are read in sorted order so the
    resulting row order is reproducible, and each row gets a ``filename``
    column holding the path of the file it came from.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        The concatenation of all matching files, with a fresh integer index.

    Raises:
        SystemExit: With a non-zero status when no file matches.
    """
    # The dot is escaped and the pattern anchored at the end, so names such
    # as "df_last1xcsv" or "df_last.csv.bak" are no longer picked up.
    pattern = r'df_last\d*\.csv$'
    matcher = re.compile(pattern)
    list_csv = sorted(
        os.path.join(path, file)
        for file in os.listdir(path)
        if matcher.search(file)
    )
    if not list_csv:
        print("There is no file that respects the following pattern: {}".format(pattern))
        # Exit with a failure status so calling scripts can detect the error.
        sys.exit(1)

    print("I've found {} files...".format(len(list_csv)))
    return pd.concat(
        (pd.read_csv(file).assign(filename=file) for file in list_csv),
        ignore_index=True,
    )


def _grid_values(custom, default):
    """Return the candidate values for one tuned hyper-parameter.

    ``custom`` may be ``None``, a single integer, or a sequence that can
    contain ``None`` placeholders for options that were not supplied.
    ``default`` is returned when no usable custom value is present.
    """
    if custom is None:
        return default
    if isinstance(custom, (int, np.integer)):
        # A grid entry must be a sequence, so wrap the scalar.
        return [custom]
    supplied = [value for value in custom if value is not None]
    return supplied if supplied else default


def setup_model(model_list, dict_params):
    """Build the display names, estimators and parameter grids to train.

    Args:
        model_list: Model keys to set up (``"svc"``, ``"lr"``, ``"rf"``).
            If ``"all"`` is present, every model is set up.
        dict_params: Mapping from model key to the custom value(s) for that
            model's tuned parameter (``C`` for SVC, ``max_iter`` for
            Logistic Regression, ``n_estimators`` for Random Forest). A
            value may be an int, a list of ints, or ``None`` / ``[None]``
            to fall back to the built-in default.

    Returns:
        A ``(names, classifiers, parameters)`` tuple of three parallel
        lists; each entry of ``parameters`` is a grid whose keys carry the
        ``clf__`` prefix and whose values are always sequences.

    Raises:
        KeyError: If ``model_list`` contains an unknown key.
    """
    # key -> (display name, estimator, tuned parameter, default values,
    #         additional fixed grid entries)
    specs = {
        "svc": ("SVC", SVC(probability=True), "clf__C", [10], {}),
        "lr": ("Logistic Regression", LogisticRegression(), "clf__max_iter", [1000],
               {"clf__solver": ['newton-cg', 'saga']}),
        "rf": ("Random Forest", RandomForestClassifier(), "clf__n_estimators",
               np.arange(400, 601, 100),
               {"clf__max_depth": np.arange(6, 18, 2)}),
    }
    custom_params = dict_params or {}
    selected = list(specs) if "all" in model_list else model_list

    names = []
    classifier = []
    parameters = []
    for model in selected:
        display_name, estimator, tuned_key, default, fixed = specs[model]
        # Tuned parameter first, then the fixed entries.
        grid = {tuned_key: _grid_values(custom_params.get(model), default)}
        grid.update(fixed)
        names.append(display_name)
        classifier.append(estimator)
        parameters.append(grid)
    return (names, classifier, parameters)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, creating the parent directory if needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def train_data(model_collection, dataset):
    """Grid-search each model on TF-IDF features and persist the results.

    The ``FRASE`` column of *dataset* is vectorised with TF-IDF and the
    ``CLASSE`` column (cast to int) is the target. For every model the
    fitted grid search is pickled under ``models/``, a report is printed
    and appended to ``log/estimators_log.txt``, and a toast notification is
    shown. The fitted vectorizer is pickled under ``transformer/``. All
    output paths are relative to ``current_directory``.

    Args:
        model_collection: Iterable of ``(name, classifier, params)`` triples,
            where ``params`` is a grid whose keys use the ``clf__`` prefix.
        dataset: DataFrame with at least the ``FRASE`` and ``CLASSE`` columns.
    """
    tf = TfidfVectorizer(max_features=1600, ngram_range=(1, 1))

    # Split BEFORE oversampling and before fitting the vectorizer. Doing it
    # afterwards puts copies of the same row in both partitions and lets the
    # test vocabulary influence the features, which inflates the report.
    labels = dataset['CLASSE'].astype(int)
    train_df, test_df = train_test_split(dataset, test_size=0.20, stratify=labels)

    # OVERSAMPLING (training rows only): rows of class 2 end up twice in the
    # training set and rows of class 3 three times.
    oversampling = train_df[(train_df['CLASSE'] == 2) | (train_df['CLASSE'] == 3)]
    classe_3 = train_df[train_df['CLASSE'] == 3]
    train_df = pd.concat([train_df, oversampling, classe_3])

    X_train = tf.fit_transform(train_df["FRASE"].values.astype('U').tolist())
    X_test = tf.transform(test_df["FRASE"].values.astype('U').tolist())
    y_train = train_df['CLASSE'].astype(int)
    y_test = test_df['CLASSE'].astype(int)

    # save TF
    _dump_pickle(tf, os.path.join(current_directory, 'transformer', 'tfidf_{}.pkl'.format(data_today)))

    log_file_name = os.path.join(current_directory, 'log', 'estimators_log.txt')
    os.makedirs(os.path.dirname(log_file_name), exist_ok=True)
    toaster = ToastNotifier()

    for name, classifier, params in model_collection:
        clf_pipe = Pipeline([
            ('clf', classifier)
        ])
        gs_clf = GridSearchCV(clf_pipe, param_grid=params, verbose=1, cv=5, n_jobs=-1)
        clf = gs_clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        model_file_name = os.path.join(current_directory, 'models', '{}{}.pkl'.format(name, data_today))

        best_estimators = clf.best_estimator_
        score_table = classification_report(y_test, y_pred)

        print("REPORT {} --> \n{}\nTESTING PARAMETERS -->\n{}\nBEST PARAMETERS FOUND --> \n{}".format(name, score_table, params, best_estimators))
        _dump_pickle(clf, model_file_name)
        toaster.show_toast("Modello salvato!", "{} è stato salvato con successo!".format(name), duration=10)

        text_to_write = ('\n{}\n{}\n{}\n{}\n{}\n{}'.format(asterisk, os.path.basename(model_file_name), asterisk, best_estimators, minus, score_table))
        # Append so the log keeps the history of previous runs.
        with open(log_file_name, 'a') as file_object:
            file_object.write(text_to_write)


# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    command_line()