From 4fa536380900755e82916c5538a460b1d15972b6 Mon Sep 17 00:00:00 2001
From: Erica Greene
Date: Mon, 5 Feb 2018 14:03:43 -0500
Subject: [PATCH] Refactor data preprocessing + Add ability to save model (#78)

Add the ability to save the TensorFlow model as a SavedModel.
---
 conversation_classification/kaggle/.gitignore |   1 +
 conversation_classification/kaggle/model.py   | 235 +++++++++++-------
 .../kaggle/requirements.txt                   |  10 +-
 3 files changed, 150 insertions(+), 96 deletions(-)
 create mode 100644 conversation_classification/kaggle/.gitignore

diff --git a/conversation_classification/kaggle/.gitignore b/conversation_classification/kaggle/.gitignore
new file mode 100644
index 00000000..afed0735
--- /dev/null
+++ b/conversation_classification/kaggle/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/conversation_classification/kaggle/model.py b/conversation_classification/kaggle/model.py
index d25ab62b..79faf3ba 100644
--- a/conversation_classification/kaggle/model.py
+++ b/conversation_classification/kaggle/model.py
@@ -5,19 +5,17 @@
 To Run:
 
-python model.py --train_data=train.csv --predict_data=test.csv --y_class=toxic
+python model.py --train_data=train.csv --y_class=toxic
-
-Output:
-  * writes predictions on heldout test data to TEST_OUT_PATH
-  * writes predictions on unlabled predict data to PREDICT_OUT_PATH
 """
 
 import argparse
+import os
 import sys
+import shutil
 
 import pandas as pd
 import tensorflow as tf
 import numpy as np
-import sklearn as sk
+from sklearn import metrics
 from sklearn.model_selection import train_test_split
 
 FLAGS = None
@@ -36,38 +34,87 @@
 
 # Training Params
 TRAIN_SEED = 9812  # Random seed used to initialize training
-TRAIN_STEPS = 1000  # Number of steps to take while training
 LEARNING_RATE = 0.01
 BATCH_SIZE = 120
 
-# Output Params
-TEST_OUT_PATH = 'test_out.csv'  # Where to write results on heldout data
-PREDICT_OUT_PATH = 'predict_out.csv'  # Where to write results on unlabled data
-
 
 class WikiData:
 
-  def __init__(self, path):
-    self.data = self._load_data(path)
-    self.data['comment_text'] = self.data['comment_text'].astype(str)
+  def __init__(self, data_path, y_class, vocab_processor_path=None,
+               test_mode=False, seed=None, train_percent=None):
+    """
+    Args:
+      * data_path (string): path to file containing train or test data
+      * y_class (string): the class we're training or testing on
+      * vocab_processor_path (string): if provided, the comment_text data will
+        be processed with the vocab processor at that location. If not, a new
+        vocab_processor will be created from the data.
+      * test_mode (boolean): true if loading data just to test on, not to
+        train a model
+      * seed (integer): a random seed to use for data splitting
+      * train_percent (float): the fraction of the data to use for training
+
+    Note: the vocab_processor_path should only be provided if test_mode is true.
+    """
+    data = self._load_data(data_path)
+
+    self.x_train, self.x_train_text = None, None
+    self.x_test, self.x_test_text = None, None
+    self.y_train = None
+    self.y_test = None
+    self.vocab_processor = None
+
+    # If test_mode is True, then put all the data in x_test and y_test
+    if test_mode:
+      train_percent = 0
+
+    # Split the data into test / train sets
+    self.x_train_text, self.x_test_text, self.y_train, self.y_test \
+      = self._split(data, train_percent, 'comment_text', y_class, seed)
+
+    # Either load a VocabularyProcessor or compute one from the training data
+    if test_mode:
+
+      # If test_mode is True and no vocab_processor_path is specified, then
+      # raise an error. We shouldn't train a VocabularyProcessor at test time.
+      if vocab_processor_path is None:
+        tf.logging.error("Loading data in test_mode with no vocab_processor_path")
+        raise ValueError
+
+      self.vocab_processor = self._load_vocab_processor(vocab_processor_path)
+
+    else:
+      self.vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
+        MAX_DOCUMENT_LENGTH)
+      self.x_train = np.array(list(self.vocab_processor.fit_transform(
+        self.x_train_text)))
+
+    # Apply the VocabularyProcessor to the test data
+    self.x_test = np.array(list(self.vocab_processor.transform(
+      self.x_test_text)))
+
+  def _load_vocab_processor(self, path):
+    """Load a VocabularyProcessor from the provided path"""
+    return tf.contrib.learn.preprocessing.VocabularyProcessor.restore(path)
 
   def _load_data(self, path):
     df = pd.read_csv(path)
     return df
 
-  def split(self, train_percent, y_class, seed):
+  def _split(self, data, train_percent, x_field, y_class, seed=None):
     """
     Split divides the Wikipedia data into test and train subsets.
 
     Args:
+      * data (dataframe): a dataframe with data for 'comment_text' and y_class
       * train_percent (float): the fraction of data to use for training
-      * y_class (string): the attribute of the wiki data to predict, e.g. 'toxic'
+      * x_field (string): attribute of the wiki data to use to train, e.g.
+        'comment_text'
+      * y_class (string): attribute of the wiki data to predict, e.g. 'toxic'
       * seed (integer): a seed to use to split the data in a reproducible way
 
     Returns:
-      x_train (dataframe): the comment_text for the training data
+      x_train (dataframe): the features for the training data
       y_train (dataframe): the 0 or 1 labels for the training data
-      x_test (dataframe): the comment_text for the test data
+      x_test (dataframe): the features for the test data
       y_test (dataframe): the 0 or 1 labels for the test data
     """
@@ -76,20 +123,17 @@ def split(self, train_percent, y_class, seed):
         .format(y_class, Y_CLASSES))
       raise ValueError
 
-    if train_percent >= 1 or train_percent <= 0:
+    if train_percent > 1 or train_percent < 0:
      tf.logging.error('Specified train_percent {0} is not between 0 and 1'\
        .format(train_percent))
      raise ValueError
 
-    tf.logging.info("Training on class: '{}'".format(y_class))
-    tf.logging.info("Training data split: {}".format(train_percent))
-
-    X = self.data['comment_text']
-    y = self.data[y_class]
+    X = data[x_field]
+    y = data[y_class]
 
     x_train, x_test, y_train, y_test = train_test_split(
       X, y, test_size=1-train_percent, random_state=seed)
 
-    return x_train, x_test, y_train, y_test
+    return x_train, x_test, np.array(y_train), np.array(y_test)
 
 
 def estimator_spec_for_softmax_classification(logits, labels, mode):
   """
 
   Returns EstimatorSpec instance for softmax classification.
   """
   predicted_classes = tf.argmax(logits, axis=1)
+  predicted_probs = tf.nn.softmax(logits, name='softmax_tensor')
   predictions = {
     'classes': predicted_classes,
     # Add softmax_tensor to the graph. It is used for PREDICT.
-    'probs': tf.nn.softmax(logits, name='softmax_tensor')
+    'probs': predicted_probs
+  }
+
+  # Represents an output of a model that can be served.
+  export_outputs = {
+    'output': tf.estimator.export.ClassificationOutput(scores=predicted_probs)
   }
 
   # PREDICT Mode
   if mode == tf.estimator.ModeKeys.PREDICT:
-    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
+    return tf.estimator.EstimatorSpec(
+      mode=mode,
+      predictions=predictions,
+      export_outputs=export_outputs
+    )
 
   # Calculate loss for both TRAIN and EVAL modes
   loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
@@ -127,15 +181,32 @@ def estimator_spec_for_softmax_classification(logits, labels, mode):
       loss=loss,
       train_op=train_op,
       training_hooks=[logging_hook],
-      predictions={'loss': loss}
+      predictions={'loss': loss},
+      export_outputs=export_outputs
     )
 
   # EVAL Mode
   eval_metric_ops = {
-    'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
+    'accuracy': tf.metrics.accuracy(
+      labels=labels, predictions=predicted_classes),
+    'auc': tf.metrics.auc(labels=labels, predictions=predicted_classes),
+    'true_negatives': tf.metrics.true_negatives(
+      labels=labels, predictions=predicted_classes),
+    'false_negatives': tf.metrics.false_negatives(
+      labels=labels, predictions=predicted_classes),
+    'true_positives': tf.metrics.true_positives(
+      labels=labels, predictions=predicted_classes),
+    'false_positives': tf.metrics.false_positives(
+      labels=labels, predictions=predicted_classes),
   }
+
   return tf.estimator.EstimatorSpec(
-    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
+    mode=mode,
+    loss=loss,
+    predictions=predictions,
+    eval_metric_ops=eval_metric_ops,
+    export_outputs=export_outputs
+  )
 
 
 def bag_of_words_model(features, labels, mode):
   """
@@ -171,23 +242,16 @@ def main():
     tf.logging.info('Running in verbose mode')
     tf.logging.set_verbosity(tf.logging.DEBUG)
 
-  # Load and split data
-  tf.logging.debug('Loading data {}'.format(FLAGS.train_data))
-  data = WikiData(FLAGS.train_data)
-
-  x_train_text, x_test_text, y_train, y_test \
-    = data.split(TRAIN_PERCENT, FLAGS.y_class, DATA_SEED)
+  if os.path.isdir(FLAGS.model_dir):
+    tf.logging.info("Removing model data from '{0}'".format(FLAGS.model_dir))
+    shutil.rmtree(FLAGS.model_dir)
 
-  # Process data
-  vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
-    MAX_DOCUMENT_LENGTH)
-
-  x_train = np.array(list(vocab_processor.fit_transform(x_train_text)))
-  x_test = np.array(list(vocab_processor.transform(x_test_text)))
-  y_train = np.array(y_train)
-  y_test = np.array(y_test)
+  # Load and split data
+  tf.logging.info('Loading data from {0}'.format(FLAGS.train_data))
+  data = WikiData(
+    FLAGS.train_data, FLAGS.y_class, seed=DATA_SEED, train_percent=TRAIN_PERCENT)
 
-  n_words = len(vocab_processor.vocabulary_)
+  n_words = len(data.vocab_processor.vocabulary_)
   tf.logging.info('Total words: %d' % n_words)
 
   # Build model
@@ -197,8 +261,8 @@ def main():
     # Subtract 1 because VocabularyProcessor outputs a word-id matrix where word
     # ids start from 1 and 0 means 'no word'. But categorical_column_with_identity
    # assumes 0-based count and uses -1 for missing word.
-    x_train -= 1
-    x_test -= 1
+    data.x_train = data.x_train - 1
+    data.x_test = data.x_test - 1
   else:
     tf.logging.error("Unknown specified model '{}', must be one of {}"
      .format(FLAGS.model, MODEL_LIST))
@@ -209,78 +273,63 @@ def main():
     config=tf.contrib.learn.RunConfig(
       tf_random_seed=TRAIN_SEED,
     ),
-    model_dir=None)
+    model_dir=FLAGS.model_dir)
 
   # Train model
   train_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={WORDS_FEATURE: x_train},
-    y=y_train,
+    x={WORDS_FEATURE: data.x_train},
+    y=data.y_train,
     batch_size=BATCH_SIZE,
     num_epochs=None,  # Note: For training, set this to None, so the input_fn
                       # keeps returning data until the required number of train
                      # steps is reached.
     shuffle=True)
-
-  classifier.train(input_fn=train_input_fn, steps=TRAIN_STEPS)
+  classifier.train(input_fn=train_input_fn, steps=FLAGS.train_steps)
 
   # Predict on held-out test data
   test_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={WORDS_FEATURE: x_test},
-    y=y_test,
+    x={WORDS_FEATURE: data.x_test},
+    y=data.y_test,
     num_epochs=1,  # Note: For evaluation and prediction set this to 1,
                    # so the input_fn will iterate over the data once and
                   # then raise OutOfRangeError
     shuffle=False)
-
   predicted_test = classifier.predict(input_fn=test_input_fn)
   test_out = pd.DataFrame(
     [(p['classes'], p['probs'][1]) for p in predicted_test],
    columns=['y_predicted', 'prob']
   )
-  test_out['comment_text'] = x_train_text
-  test_out['y_true'] = y_test
-
-  # Write out predictions and probabilities for test data
-  tf.logging.info("Writing test predictions to {}".format(TEST_OUT_PATH))
-  test_out.to_csv(TEST_OUT_PATH)
 
-  # Score with sklearn and TensorFlow (hopefully they're the same!)
-  sklearn_score = sk.metrics.accuracy_score(y_test, test_out['y_predicted'])
+  # Score with sklearn and TensorFlow
+  sklearn_score = metrics.accuracy_score(data.y_test, test_out['y_predicted'])
   tf_scores = classifier.evaluate(input_fn=test_input_fn)
 
-  tf.logging.info('')
-  tf.logging.info('----------Evaluation on Held-Out Data---------')
-  tf.logging.info('Accuracy (sklearn)\t: {0:f}'.format(sklearn_score))
-  tf.logging.info('Accuracy (tensorflow)\t: {0:f}'.format(tf_scores['accuracy']))
-  tf.logging.info('')
+  train_size = len(data.x_train)
+  test_size = len(data.x_test)
 
-  # If specified, predict on unlabeled data
-  if FLAGS.predict_data is None:
-    return
+  baseline = float(len(data.y_train[data.y_train == 0])) / len(data.y_train)
+  if baseline < .5:
+    baseline = 1 - baseline
 
-  data_unlabeled = WikiData(FLAGS.predict_data).data
-
-  tf.logging.info('Generating predictions for {0} unlabeled examples in {1}'
-    .format(len(data_unlabeled), FLAGS.predict_data))
+  tf.logging.info('')
+  tf.logging.info('----------Evaluation on Held-Out Data---------')
+  tf.logging.info('Train Size: {0} Test Size: {1}'.format(train_size, test_size))
+  tf.logging.info('Baseline (class distribution): {0:f}'.format(baseline))
+  tf.logging.info('Accuracy (sklearn): {0:f}'.format(sklearn_score))
 
-  x_unlabeled = np.array(list(
-    vocab_processor.transform(data_unlabeled['comment_text'])))
+  for key in sorted(tf_scores):
+    tf.logging.info("%s: %s" % (key, tf_scores[key]))
 
-  unlabled_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={WORDS_FEATURE: x_unlabeled},
-    num_epochs=1,
-    shuffle=False)
+  # Export the model
+  feature_spec = {
+    WORDS_FEATURE: tf.FixedLenFeature(
+      dtype=tf.int64, shape=[1, MAX_DOCUMENT_LENGTH])
+  }
+  serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
+    feature_spec)
+  dir_path = 'saved_model'
 
-  predicted_unlabeled = classifier.predict(input_fn=unlabled_input_fn)
-  unlabeled_out = pd.DataFrame(
-    [(p['classes'], p['probs'][1]) for p in predicted_unlabeled],
-    columns=['y_pred', 'prob']
-  )
-  unlabeled_out['comment_text'] = data_unlabeled['comment_text']
+  classifier.export_savedmodel(dir_path, serving_input_fn)
 
-  # Write out predictions and probabilities for unlabled "predict" data
-  tf.logging.info("Writing predictions to {}".format(PREDICT_OUT_PATH))
-  unlabeled_out.to_csv(PREDICT_OUT_PATH)
 
 
 if __name__ == '__main__':
@@ -288,15 +337,17 @@ def main():
   parser.add_argument(
     '--verbose', help='Run in verbose mode.', action='store_true')
   parser.add_argument(
-    "--train_data", type=str, default="", help="Path to the training data.")
+    "--train_data", type=str, default="", help="Path to the training data.")
   parser.add_argument(
-    "--predict_data", type=str, default="", help="Path to the prediction data.")
+    "--model_dir", type=str, default="model", help="Place to save model files")
   parser.add_argument(
     "--y_class", type=str, default="toxic",
     help="Class to train model against, one of {}".format(Y_CLASSES))
   parser.add_argument(
     "--model", type=str, default="bag_of_words",
     help="The model to train, one of {}".format(MODEL_LIST))
+  parser.add_argument(
+    "--train_steps", type=int, default=100, help="The number of steps to train the model.")
 
   FLAGS, unparsed = parser.parse_known_args()
 
diff --git a/conversation_classification/kaggle/requirements.txt b/conversation_classification/kaggle/requirements.txt
index a8539c3d..035bd93c 100644
--- a/conversation_classification/kaggle/requirements.txt
+++ b/conversation_classification/kaggle/requirements.txt
@@ -1,8 +1,10 @@
+absl-py==0.1.9
 bleach==1.5.0
 enum34==1.1.6
+futures==3.1.1
 html5lib==0.9999999
-Markdown==2.6.10
-numpy==1.13.3
+Markdown==2.6.11
+numpy==1.14.0
 pandas==0.22.0
 protobuf==3.5.1
 python-dateutil==2.6.1
@@ -11,6 +13,6 @@ scikit-learn==0.19.1
 scipy==1.0.0
 six==1.11.0
 sklearn==0.0
-tensorflow==1.4.1
-tensorflow-tensorboard==0.4.0rc3
+tensorflow==1.5.0
+tensorflow-tensorboard==1.5.0
 Werkzeug==0.14.1
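
A minimal sketch of the VocabularyProcessor save/restore round trip that the refactored WikiData depends on. The patch itself never calls save(), so the 'vocab_processor' file path below is a hypothetical illustration, and MAX_DOCUMENT_LENGTH = 500 is a placeholder for the constant model.py defines elsewhere:

import numpy as np
import tensorflow as tf

MAX_DOCUMENT_LENGTH = 500  # placeholder; model.py defines its own value

# Fit a processor on training text and persist it to disk.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
    MAX_DOCUMENT_LENGTH)
x_train = np.array(list(vocab_processor.fit_transform(['an example comment'])))
vocab_processor.save('vocab_processor')  # hypothetical path

# Later, with WikiData(..., test_mode=True, vocab_processor_path=...), the
# processor is restored the same way _load_vocab_processor does:
restored = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
    'vocab_processor')
x_test = np.array(list(restored.transform(['an example comment'])))
assert (x_train == x_test).all()  # identical word ids for identical text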
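And a sketch of consuming the exported model: export_savedmodel() writes a timestamped subdirectory under 'saved_model', and ClassificationOutput exposes the standard classify signature, whose 'inputs' and 'scores' keys come from TensorFlow's signature constants. The directory name, the 'words' feature key (the assumed value of WORDS_FEATURE), and the length 500 are assumptions for illustration:

import tensorflow as tf

export_dir = 'saved_model/1517862223'  # hypothetical timestamped export

# build_parsing_serving_input_receiver_fn expects serialized tf.Example
# protos whose features match feature_spec in model.py.
example = tf.train.Example(features=tf.train.Features(feature={
    'words': tf.train.Feature(int64_list=tf.train.Int64List(value=[0] * 500))
}))

with tf.Session(graph=tf.Graph()) as sess:
  meta_graph = tf.saved_model.loader.load(
      sess, [tf.saved_model.tag_constants.SERVING], export_dir)
  signature = meta_graph.signature_def[
      tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
  scores = sess.run(
      signature.outputs['scores'].name,
      {signature.inputs['inputs'].name: [example.SerializeToString()]})
  print(scores)  # per-class softmax probabilities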