diff --git a/examples/example_data/mappings_equipment.csv b/examples/example_data/mappings_equipment.csv new file mode 100644 index 0000000..caed027 --- /dev/null +++ b/examples/example_data/mappings_equipment.csv @@ -0,0 +1,25 @@ +in,out_ +combiner,combiner +comb,combiner +cb,combiner +battery,battery +bess,battery +inverter,inverter +invert,inverter +inv,inverter +met,met +meter,meter +module,module +mod,module +recloser,recloser +reclose,recloser +relay,relay +substation,substation +switchgear,switchgear +switch,switchgear +tracker,tracker +transformer,transformer +xfmr,transformer +wiring,wiring +wire,wiring +wires,wiring \ No newline at end of file diff --git a/examples/example_data/mappings_pv_terms.csv b/examples/example_data/mappings_pv_terms.csv new file mode 100644 index 0000000..79acb06 --- /dev/null +++ b/examples/example_data/mappings_pv_terms.csv @@ -0,0 +1,20 @@ +in,out_ +comm,communication +energy,energy +kwh,energy +mwh,energy +grid,grid +curtailment,grid +curtail,grid +poi,grid +offline,outage +solar,solar +pv,solar +photovoltaic,solar +system,system +site,system +farm,system +project,system +sma,make_model +cm,corrective_maintenance +pm,preventative_maintenance \ No newline at end of file diff --git a/examples/tutorial_text_classify_regex_example.ipynb b/examples/tutorial_text_classify_regex_example.ipynb new file mode 100644 index 0000000..d7c9a67 --- /dev/null +++ b/examples/tutorial_text_classify_regex_example.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Adding keyword labels to O&M data\n", + "This notebook demonstrates the use of the `pvops.text.classify.get_attributes_from_keywords` function for adding asset labels based on O&M notes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "from pvops.text import utils, preprocess\n", + "from pvops.text.classify import get_attributes_from_keywords\n", + "from pvops.text.visualize import visualize_classification_confusion_matrix" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 0: Get sample data, remap assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pull in sample data and remap assets for ease of comparison\n", + "\n", + "om_df = pd.read_csv('example_data/example_ML_ticket_data.csv')\n", + "col_dict = {\n", + " \"data\" : \"CompletionDesc\",\n", + " \"eventstart\" : \"Date_EventStart\",\n", + " \"save_data_column\" : \"processed_data\",\n", + " \"save_date_column\" : \"processed_date\",\n", + " \"attribute_col\" : \"Asset\",\n", + " \"predicted_col\" : \"Keyword_Asset\",\n", + " \"remapping_col_from\": \"in\",\n", + " \"remapping_col_to\": \"out_\"\n", + "}\n", + "\n", + "# remap assets\n", + "remapping_df = pd.read_csv('example_data/remappings_asset.csv')\n", + "remapping_df['out_'] = remapping_df['out_'].replace({'met station': 'met',\n", + " 'energy storage': 'battery',\n", + " 'energy meter': 'meter'})\n", + "om_df = utils.remap_attributes(om_df, remapping_df, col_dict, allow_missing_mappings=True)\n", + "om_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 1: Text preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# preprocessing steps\n", + "om_df[col_dict['attribute_col']] = om_df.apply(lambda row: row[col_dict['attribute_col']].lower(), axis=1)\n", + "om_df = preprocess.preprocessor(om_df, lst_stopwords=[], col_dict=col_dict, 
print_info=False, extract_dates_only=False)\n", + "\n", + "DATA_COL = col_dict['data']\n", + "om_df[DATA_COL] = om_df['processed_data']\n", + "\n", + "# replace terms\n", + "equipment_df = pd.read_csv('~/pvOps/examples/example_data/mappings_equipment.csv')\n", + "pv_terms_df = pd.read_csv('~/pvOps/examples/example_data/mappings_pv_terms.csv')\n", + "pv_reference_df = pd.concat([equipment_df, pv_terms_df])\n", + "om_df = utils.remap_words_in_text(om_df=om_df, remapping_df=pv_reference_df, remapping_col_dict=col_dict)\n", + "\n", + "om_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 2: Search for keywords to use as labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add asset labels from keyword reference dict\n", + "om_df = get_attributes_from_keywords(om_df=om_df,\n", + " col_dict=col_dict,\n", + " reference_df=equipment_df)\n", + "om_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 3: Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get accuracy measures and count metrics\n", + "PREDICT_COL = col_dict['predicted_col']\n", + "LABEL_COL = col_dict['attribute_col']\n", + "\n", + "# entries with some keyword over interest, over all entries\n", + "label_count = om_df[PREDICT_COL].count() / len(om_df)\n", + "\n", + "# replace 'Other' values with 'Unknown'\n", + "om_df[LABEL_COL] = om_df[LABEL_COL].replace('other', 'unknown')\n", + "# replace NaN values to use accuracy score\n", + "om_df[[LABEL_COL, PREDICT_COL]] = om_df[[LABEL_COL, PREDICT_COL]].fillna('unknown')\n", + "acc_score = accuracy_score(y_true=om_df[LABEL_COL], y_pred=om_df[PREDICT_COL])\n", + "\n", + "msg = f'{label_count:.2%} of entries had a keyword of interest, with {acc_score:.2%} accuracy.'\n", + "print(msg)" + ] + }, + { + 
def get_attributes_from_keywords(om_df, col_dict, reference_df, reference_col_dict):
    """Label each record with the keyword categories found in its tokenized text.

    Every entry of the ``data`` column is passed to
    ``get_keywords_of_interest`` together with the reference dataframe. The
    categories found are stored in the ``predicted_col`` column, and the
    frame is then exploded so that each category sits in its own row.

    Parameters
    ----------
    om_df : pd.DataFrame
        Dataframe to search for keywords of interest. Must contain the column
        named by ``col_dict['data']``. It is not modified; a copy is labelled
        and returned.
    col_dict : dict of {str : str}
        A dictionary that contains the column names needed:

        - data : string, name of the column in om_df which stores the
          tokenized text logs
        - predicted_col : string, name of the column that is created to hold
          the keyword search labels
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for
        keywords of interest.
        Note: only single words are handled, there is no n-gram functionality.
    reference_col_dict : dict of {str : str}
        A dictionary that contains the column names that describe how
        referencing is going to be done

        - reference_col_from : string, name of the column in reference_df
          holding the possible input reference values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, name of the column in reference_df
          holding the output reference values of interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    pd.DataFrame
        Copy of om_df with the ``predicted_col`` column added. A record in
        which several categories were found is repeated once per category
        (the repeated rows share the original index label); a record in which
        nothing was found has a missing value in ``predicted_col``.
    """
    data_col = col_dict['data']
    predicted_col = col_dict['predicted_col']

    # Label a copy: assigning into om_df directly would add the column to the
    # caller's dataframe as a hidden side effect.
    labelled_df = om_df.copy()
    labelled_df[predicted_col] = labelled_df[data_col].apply(
        get_keywords_of_interest,
        reference_df=reference_df,
        reference_col_dict=reference_col_dict,
    )

    # Remember which records matched more than one category *before* the
    # explode below spreads them over several rows.
    multiple_keywords_df = labelled_df[labelled_df[predicted_col].str.len() > 1]

    # each multi-category now in its own row, some logs have multiple equipment issues
    labelled_df = labelled_df.explode(predicted_col)

    msg = f'{len(multiple_keywords_df)} entries had multiple keywords of interest. Reference: {multiple_keywords_df.index} in original dataframe.'
    print(msg)

    return labelled_df
def get_keywords_of_interest(document_tok, reference_df, reference_col_dict):
    """Find keywords of interest in a tokenized document.

    Each token that appears in the ``reference_col_from`` column of
    reference_df is translated to the matching ``reference_col_to`` value.
    For example, if the token 'inv' maps to 'inverter' and 'inv' is in the
    document, the result is ['inverter'].

    Parameters
    ----------
    document_tok : list of str
        Tokenized text, functionally a list of string values. A plain string
        is split on whitespace first; None or NaN is treated as an empty
        document.
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for
        keywords of interest.
        Note: only single words are handled, there is no n-gram functionality.
    reference_col_dict : dict of {str : str}
        A dictionary that contains the column names that describe how
        referencing is going to be done

        - reference_col_from : string, name of the column in reference_df
          holding the possible input reference values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, name of the column in reference_df
          holding the output reference values of interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    included_keywords : list of str
        Distinct output reference values found in the document, in sorted
        order. Empty when nothing matched.
    """
    REFERENCE_COL_FROM = reference_col_dict["reference_col_from"]
    REFERENCE_COL_TO = reference_col_dict["reference_col_to"]

    reference_dict = dict(
        zip(reference_df[REFERENCE_COL_FROM], reference_df[REFERENCE_COL_TO])
    )

    # A missing cell (None, or NaN which is the only float unequal to itself)
    # carries no tokens; without this guard Series.apply would raise on it.
    if document_tok is None or (
        isinstance(document_tok, float) and document_tok != document_tok
    ):
        return []

    # Iterating a bare string yields single characters, which would silently
    # match nothing; split it into words instead.
    if isinstance(document_tok, str):
        document_tok = document_tok.split()

    # keywords of interest
    included_keywords = {
        reference_dict[token] for token in document_tok if token in reference_dict
    }

    # Sort so the result (and the row order after a later explode) does not
    # depend on set iteration order; key=str keeps odd label types sortable.
    return sorted(included_keywords, key=str)
def remap_words_in_text(om_df, remapping_df, remapping_col_dict):
    """A utility function which remaps words in a text column of om_df using
    columns within remapping_df.

    The text and both mapping columns are lower-cased, so matching is
    case-insensitive and the returned text is lower case. Only whole words
    are replaced: a term is not rewritten when it is directly preceded or
    followed by another letter, so 'inv' -> 'inverter' leaves an existing
    'inverter' alone. A neighbouring digit or punctuation mark does not block
    the replacement ('cb1' becomes 'combiner1').

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M note data. It is not modified.
    remapping_df : DataFrame
        Holds columns that define the remappings. It is not modified.
    remapping_col_dict : dict of {str : str}
        A dictionary that contains the column names that describe how
        remapping is going to be done

        - data : string, should be assigned to associated
          column name in om_df which holds the text to remap
        - remapping_col_from : string, should be assigned
          to associated column name in remapping_df that holds the
          terms to look for in the text
        - remapping_col_to : string, should be assigned to
          associated column name in remapping_df that contains the
          final mapped entries

    Returns
    -------
    DataFrame
        Copy of om_df whose text column is lower-cased and remapped. Missing
        text entries stay missing.
    """
    # Function-scope import: the module may not import `re` at file level.
    import re

    df = om_df.copy()
    TEXT_COL = remapping_col_dict["data"]
    REMAPPING_COL_FROM = remapping_col_dict["remapping_col_from"]
    REMAPPING_COL_TO = remapping_col_dict["remapping_col_to"]

    # case-insensitive: everything is compared in lower case. New Series are
    # derived here instead of assigning into a filtered slice of remapping_df,
    # which would raise SettingWithCopyWarning and could touch caller data.
    mapping = remapping_df[[REMAPPING_COL_FROM, REMAPPING_COL_TO]].dropna()
    terms_from = mapping[REMAPPING_COL_FROM].str.lower()
    terms_to = mapping[REMAPPING_COL_TO].str.lower()
    df[TEXT_COL] = df[TEXT_COL].str.lower()

    # drop any values where input value is equal to output value
    renamer = {
        src: dst for src, dst in zip(terms_from, terms_to) if src and src != dst
    }
    if not renamer:
        # Nothing to remap; an empty alternation would match the empty string.
        return df

    # One escaped alternation, longest term first, so 'invert' is tried
    # before 'inv' and regex metacharacters in a term are taken literally.
    alternatives = "|".join(
        re.escape(term) for term in sorted(renamer, key=len, reverse=True)
    )
    # [^\W\d_] is "any letter"; the lookarounds reject matches inside a word.
    letter = r"[^\W\d_]"
    pattern = rf"(?<!{letter})(?:{alternatives})(?!{letter})"

    df[TEXT_COL] = df[TEXT_COL].str.replace(
        pattern, lambda match: renamer[match.group(0)], regex=True
    )

    return df
def visualize_classification_confusion_matrix(om_df, col_dict, title=''):
    """Visualize confusion matrix comparing known categorical values, and predicted categorical values.

    Rows whose predicted label never occurs among the actual labels are
    dropped before plotting, for a cleaner visual; the dropped labels are
    listed in a caption on the figure. The matrix is normalized over the
    actual (true) labels.

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains columns specified in col_dict
    col_dict : dict of {str : str}
        A dictionary that contains the column names needed:

        - attribute_col : string, name of the column in om_df holding the actual labels
        - predicted_col : string, name of the column in om_df holding the predicted labels

    title : str
        Optional, title of plot

    Returns
    -------
    sklearn.metrics.ConfusionMatrixDisplay
        The display object that was drawn. The Matplotlib figure and axes are
        available as its ``figure_`` and ``ax_`` attributes.
    """
    act_col = col_dict['attribute_col']
    pred_col = col_dict['predicted_col']

    # drop any predicted labels with no actual labels in the data, for a cleaner visual
    # (the actual labels are collected once instead of once per predicted label)
    actual_labels = om_df[act_col].unique()
    no_real_values = [cat for cat in om_df[pred_col].unique() if cat not in actual_labels]
    no_real_values_mask = om_df[pred_col].isin(no_real_values)
    om_df = om_df[~no_real_values_mask]
    caption_txt = f'NOTE: Predicted values{no_real_values} had no actual values in the dataset.'

    plt.rcParams.update({'font.size': 8})
    # from_predictions() already draws the matrix. Calling .plot() on the
    # result again would open a second figure and leave this one unstyled.
    cm_display = ConfusionMatrixDisplay.from_predictions(y_true=om_df[act_col],
                                                         y_pred=om_df[pred_col],
                                                         normalize='true',
                                                         )
    # Style through the display's own handles so the result does not depend on
    # which axes pyplot considers current after the colorbar was added.
    fig = cm_display.figure_
    ax = cm_display.ax_
    ax.tick_params(axis='x', labelrotation=90)
    fig.tight_layout()
    if no_real_values:
        # Only annotate when something was actually dropped.
        fig.text(0.00, 0.01, caption_txt, wrap=True, fontsize=7)
    ax.set_title(title)
    return cm_display