diff --git a/ML_Experiments.ipynb b/ML_Experiments.ipynb new file mode 100644 index 0000000..7e3ae2c --- /dev/null +++ b/ML_Experiments.ipynb @@ -0,0 +1,39 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyOZnVW7CAWe+5PLnWXxx9iF", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NTIEyVwmLTBP" + }, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/summary_be/archive/utils/lstm_bi25/fingerprint.pb b/summary_be/archive/utils/lstm_bi25/fingerprint.pb new file mode 100644 index 0000000..70f8498 --- /dev/null +++ b/summary_be/archive/utils/lstm_bi25/fingerprint.pb @@ -0,0 +1 @@ +ȺƃXӍѹŰ ꋨW(؂猰(2 \ No newline at end of file diff --git a/summary_be/archive/utils/lstm_bi25/keras_metadata.pb b/summary_be/archive/utils/lstm_bi25/keras_metadata.pb new file mode 100644 index 0000000..e7d0857 --- /dev/null +++ b/summary_be/archive/utils/lstm_bi25/keras_metadata.pb @@ -0,0 +1,11 @@ + +-root"_tf_keras_sequential*-{"name": "sequential", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": false, "class_name": "Sequential", "config": {"name": "sequential", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "bidirectional_input"}}, {"class_name": "Bidirectional", "config": {"name": "bidirectional", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", 
"items": [null, null, 768]}, "layer": {"class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 25, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 1}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "shared_object_id": 2}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 3}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}}, "merge_mode": "concat"}}, {"class_name": "TimeDistributed", "config": {"name": "time_distributed", "trainable": true, "dtype": "float32", "layer": {"class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}}}]}, "shared_object_id": 11, "input_spec": [{"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}}], "build_input_shape": {"class_name": "TensorShape", "items": [null, null, 768]}, "is_graph_network": true, "full_save_spec": {"class_name": "__tuple__", "items": [[{"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", 
"items": [null, null, 768]}, "float32", "bidirectional_input"]}], {}]}, "save_spec": {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, null, 768]}, "float32", "bidirectional_input"]}, "keras_version": "2.12.0", "backend": "tensorflow", "model_config": {"class_name": "Sequential", "config": {"name": "sequential", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "bidirectional_input"}, "shared_object_id": 0}, {"class_name": "Bidirectional", "config": {"name": "bidirectional", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "layer": {"class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 25, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 1}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "shared_object_id": 2}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 3}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "shared_object_id": 5}, "merge_mode": "concat"}, "shared_object_id": 6}, {"class_name": "TimeDistributed", "config": {"name": "time_distributed", "trainable": true, "dtype": "float32", "layer": {"class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", 
"units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 7}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 8}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "shared_object_id": 9}}, "shared_object_id": 10}]}}, "training_config": {"loss": "binary_crossentropy", "metrics": [[{"class_name": "SensitivityAtSpecificity", "config": {"name": "sensitivity_at_specificity", "dtype": "float32", "class_id": null, "num_thresholds": 1, "specificity": 0.5}, "shared_object_id": 13}]], "weighted_metrics": null, "loss_weights": null, "optimizer_config": {"class_name": "Custom>Adam", "config": {"name": "Adam", "weight_decay": null, "clipnorm": null, "global_clipnorm": null, "clipvalue": null, "use_ema": false, "ema_momentum": 0.99, "ema_overwrite_frequency": null, "jit_compile": true, "is_legacy_optimizer": false, "learning_rate": 0.0010000000474974513, "beta_1": 0.9, "beta_2": 0.999, "epsilon": 1e-07, "amsgrad": false}}}}2 + root.layer_with_weights-0"_tf_keras_layer* {"name": "bidirectional", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Bidirectional", "config": {"name": "bidirectional", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "layer": {"class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 25, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": 
{"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 1}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "shared_object_id": 2}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 3}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "shared_object_id": 5}, "merge_mode": "concat"}, "shared_object_id": 6, "input_spec": [{"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}, "shared_object_id": 14}], "build_input_shape": {"class_name": "TensorShape", "items": [null, null, 768]}}2 + root.layer_with_weights-1"_tf_keras_layer* {"name": "time_distributed", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "TimeDistributed", "config": {"name": "time_distributed", "trainable": true, "dtype": "float32", "layer": {"class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 7}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 8}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "shared_object_id": 9}}, "shared_object_id": 10, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, null, 50]}, "ndim": 3, "max_ndim": null, "min_ndim": null, 
"axes": {}}, "shared_object_id": 15}, "build_input_shape": {"class_name": "TensorShape", "items": [null, null, 50]}}2 + 'root.layer_with_weights-0.forward_layer"_tf_keras_rnn_layer* {"name": "forward_lstm", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "LSTM", "config": {"name": "forward_lstm", "trainable": true, "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "zero_output_for_mask": true, "units": 25, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 16}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "shared_object_id": 17}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 18}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "shared_object_id": 20, "input_spec": [{"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}, "shared_object_id": 21}], "build_input_shape": {"class_name": "TensorShape", "items": [null, null, 768]}}2 + (root.layer_with_weights-0.backward_layer"_tf_keras_rnn_layer* {"name": "backward_lstm", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": 
"LSTM", "config": {"name": "backward_lstm", "trainable": true, "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": true, "stateful": false, "unroll": false, "time_major": false, "zero_output_for_mask": true, "units": 25, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 22}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "shared_object_id": 23}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 24}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "shared_object_id": 26, "input_spec": [{"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, null, 768]}, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}, "shared_object_id": 27}], "build_input_shape": {"class_name": "TensorShape", "items": [null, null, 768]}}2 +root.layer_with_weights-1.layer"_tf_keras_layer*{"name": "dense", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 7}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 8}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, 
"shared_object_id": 9, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": null, "max_ndim": null, "min_ndim": 2, "axes": {"-1": 50}}, "shared_object_id": 28}}2 + L,root.layer_with_weights-0.forward_layer.cell"_tf_keras_layer*{"name": "lstm_cell_1", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "LSTMCell", "config": {"name": "lstm_cell_1", "trainable": true, "dtype": "float32", "units": 25, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 16}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "shared_object_id": 17}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 18}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "shared_object_id": 19, "build_input_shape": {"class_name": "__tuple__", "items": [null, 768]}}2 + U-root.layer_with_weights-0.backward_layer.cell"_tf_keras_layer*{"name": "lstm_cell_2", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "LSTMCell", "config": {"name": "lstm_cell_2", "trainable": true, "dtype": "float32", "units": 25, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}, "shared_object_id": 22}, "recurrent_initializer": {"class_name": "Orthogonal", 
"config": {"gain": 1.0, "seed": null}, "shared_object_id": 23}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 24}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "shared_object_id": 25, "build_input_shape": {"class_name": "__tuple__", "items": [null, 768]}}2 +froot.keras_api.metrics.0"_tf_keras_metric*{"class_name": "Mean", "name": "loss", "dtype": "float32", "config": {"name": "loss", "dtype": "float32"}, "shared_object_id": 29}2 +groot.keras_api.metrics.1"_tf_keras_metric*{"class_name": "SensitivityAtSpecificity", "name": "sensitivity_at_specificity", "dtype": "float32", "config": {"name": "sensitivity_at_specificity", "dtype": "float32", "class_id": null, "num_thresholds": 1, "specificity": 0.5}, "shared_object_id": 13}2 \ No newline at end of file diff --git a/summary_be/archive/utils/lstm_bi25/saved_model.pb b/summary_be/archive/utils/lstm_bi25/saved_model.pb new file mode 100644 index 0000000..18a6483 Binary files /dev/null and b/summary_be/archive/utils/lstm_bi25/saved_model.pb differ diff --git a/summary_be/archive/utils/lstm_bi25/variables/variables.data-00000-of-00001 b/summary_be/archive/utils/lstm_bi25/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..6b33d28 Binary files /dev/null and b/summary_be/archive/utils/lstm_bi25/variables/variables.data-00000-of-00001 differ diff --git a/summary_be/archive/utils/lstm_bi25/variables/variables.index b/summary_be/archive/utils/lstm_bi25/variables/variables.index new file mode 100644 index 0000000..6d795c1 Binary files /dev/null and b/summary_be/archive/utils/lstm_bi25/variables/variables.index differ diff --git a/summary_be/archive/utils/summarization.py b/summary_be/archive/utils/summarization.py index ad87934..0cfd021 100644 --- 
a/summary_be/archive/utils/summarization.py +++ b/summary_be/archive/utils/summarization.py @@ -1,248 +1,50 @@ -# Actual summarization logic - -import nltk -import re -import math -from nltk.stem import WordNetLemmatizer -from nltk.tokenize import sent_tokenize, word_tokenize -from nltk.corpus import stopwords - -nltk.data.path.append("/opt/python/nltk_data") - -INVALID = r"[^a-zA-Z\s]" -STOP_WORDS = set(stopwords.words("english")) -lemmatizer = WordNetLemmatizer() - -""" - Function to clean and tokenize a chunk of text into words. - - Parameters - ---------- - text : string - - block of text to process - - Returns - ------- - lemmatized_words : [] - - array of valid words -""" - - -def preprocess(text): - # Remove invalid characters and digits - text = re.sub(INVALID, "", text) - - # Remove stop words + convert to lower case - words = word_tokenize(text) - words = [word.lower() for word in words] - stop_words_removed = [word for word in words if word not in STOP_WORDS] - - # Remove one character words - valid_words = [word for word in stop_words_removed if len(word) > 1] - - # Lemmatize words, aka "builds" becomes "build" etc. - lemmatized_words = [lemmatizer.lemmatize(word) for word in valid_words] - - return lemmatized_words - - -""" - Function to get frequency of words. - - Parameters - ---------- - words : [] - - array of words - - Returns - ------- - word_freq : {} - - dictionary of word frequencies - - key is a string - - val is an int -""" - - -def get_word_freq(words): - dict = {} - unique_words = [] - for word in words: - if word not in unique_words: - unique_words.append(word) - for word in unique_words: - dict[word] = words.count(word) - return dict - - -""" - Function to return sum of tf-idf score of words in a sentence. 
- - Parameters - ---------- - sent : string - - sentence containing word - sentences : [] - - array of sentences in text - - Returns - ------- - score : float - - tf-idf score of sentence -""" - - -def tf_idf_score(sent, sentences): - sent = re.sub(INVALID, "", sent) - sent_len = len(word_tokenize(sent)) - words_in_given_sent = preprocess(sent) - word_freq = get_word_freq(words_in_given_sent) - - score = 0 - for word in words_in_given_sent: - tf = tf_score(word_freq[word], sent_len) - idf = idf_score(word, sentences) - tf_idf = tf * idf - score += tf_idf - - return score - - -""" - Function to return term frequency score of a word. - - Parameters - ---------- - freq : int - - frequency of word in given sentence - sent_len : int - - total number of words in sentence - - Returns - ------- - tf_score : float - - tf score of word -""" - - -def tf_score(freq, sent_len): - return freq / sent_len - - -""" - Function to return inverse document frequency of a word. - - Parameters - ---------- - word : string - - word to compute score - setences : [] - - array of sentences in text - - Returns - ------- - score : float - - idf score of word -""" - - -def idf_score(word, sentences): - num_sent_containing_word = 0 - for sent in sentences: - words = preprocess(sent) - if word in words: - num_sent_containing_word += 1 - return math.log10(len(sentences) / num_sent_containing_word) - - -""" - Function to return sentence weights based on tf-idf score - - Parameters - ---------- - sentences : [] - - array of sentences in text - - Returns - ------- - sentence_weight : {} - - dictionary of sentence weights - - key is index of sentence in sentences - - val is tf-idf score -""" - - -def get_sentence_weights(sentences): - sentence_weight = {} - for x in range(len(sentences)): - sentence = sentences[x] - importance = tf_idf_score(sentence, sentences) - sentence_weight[x] = importance - sentence_weight = dict( - sorted(sentence_weight.items(), key=lambda item: item[1], reverse=True) - ) - 
return sentence_weight - - -""" - Function to return indices of top scoring sentences - - Parameters - ---------- - sentence_weight: {} - - dictionary of sentence weights - num_sent: int - - number of top scoring sentences to return - - Returns - ------- - sentence_idx : [] - - indices of top scoring sentences -""" - - -def get_top_scoring_sent(sentence_weight, num_sent): - curr_num_sentences = 0 - sentence_idx = [] - for key in sentence_weight: - if key == 0: - continue - if curr_num_sentences < num_sent - 1: - sentence_idx.append(key) - curr_num_sentences += 1 - else: - break - sentence_idx.sort() - return sentence_idx - - -""" - Function to summarize a given text. - - Parameters - ---------- - text : string - - text to summarize - num_sentences : int - - number of sentences to include in summary - - Returns - ------- - summary : string - - extractive summary of text -""" - - -def summarize(text, num_sentences): - if num_sentences < 1: - raise Exception("Summary must have at least one sentence.") - sentences = sent_tokenize(text) - sentence_weight = get_sentence_weights(sentences) - sentence_idx = get_top_scoring_sent(sentence_weight, num_sentences) - - summary = sentences[0] - for x in range(len(sentences)): - if x in sentence_idx: - summary += " " - summary += sentences[x] - - return summary +import pandas as pd +import numpy as np +import spacy +from spacy.language import Language +import tensorflow as tf +from tensorflow import keras +from sentence_transformers import SentenceTransformer + +model = tf.keras.models.load_model('/lstm_bi25') + +def preprocess(text, + nlp = spacy.load("en_core_web_sm"), + embedder = SentenceTransformer('distilbert-base-nli-mean-tokens'), + min_len = 2): + @Language.component("custom_sentencizer") + def custom_sentencizer(doc): + for i, token in enumerate(doc[:-2]): + # Define sentence start if it occurs after "\n\n" or "\n" + if token.text == "\n\n" or token.text == "\n": + doc[i + 1].is_sent_start = True + return doc + nlp = 
spacy.load("en_core_web_sm") + nlp.add_pipe("custom_sentencizer", before="parser") + text = nlp(text) + sents = list(text.sents) + + # remove sentences with length below threshold + sents_clean = [sent.text for sent in sents if len(sent) > min_len] + sents_clean = [sent for sent in sents_clean if len(sent)!=0] + + # calculate sentence embeddings + sents_embedding= np.array(embedder.encode(sents_clean, convert_to_tensor=True)) + return sents_clean, sents_embedding + +def summarize(text): + sentences, embeddings = preprocess(text) + + reshape_func = lambda x: x.reshape(1, x.shape[0], x.shape[1]) + x = reshape_func(embeddings) + + y_pred = model.predict(x, verbose=0) + idx = np.argsort(y_pred.flatten())[-3:] + idx = sorted(idx) + pred_summary = "" + for i in idx: + pred_summary += ' ' + pred_summary += sentences[i] + pred_summary.strip() + + return pred_summary \ No newline at end of file diff --git a/summary_be/archive/utils/summarization_old.py b/summary_be/archive/utils/summarization_old.py new file mode 100644 index 0000000..ad87934 --- /dev/null +++ b/summary_be/archive/utils/summarization_old.py @@ -0,0 +1,248 @@ +# Actual summarization logic + +import nltk +import re +import math +from nltk.stem import WordNetLemmatizer +from nltk.tokenize import sent_tokenize, word_tokenize +from nltk.corpus import stopwords + +nltk.data.path.append("/opt/python/nltk_data") + +INVALID = r"[^a-zA-Z\s]" +STOP_WORDS = set(stopwords.words("english")) +lemmatizer = WordNetLemmatizer() + +""" + Function to clean and tokenize a chunk of text into words. 
+ + Parameters + ---------- + text : string + - block of text to process + + Returns + ------- + lemmatized_words : [] + - array of valid words +""" + + +def preprocess(text): + # Remove invalid characters and digits + text = re.sub(INVALID, "", text) + + # Remove stop words + convert to lower case + words = word_tokenize(text) + words = [word.lower() for word in words] + stop_words_removed = [word for word in words if word not in STOP_WORDS] + + # Remove one character words + valid_words = [word for word in stop_words_removed if len(word) > 1] + + # Lemmatize words, aka "builds" becomes "build" etc. + lemmatized_words = [lemmatizer.lemmatize(word) for word in valid_words] + + return lemmatized_words + + +""" + Function to get frequency of words. + + Parameters + ---------- + words : [] + - array of words + + Returns + ------- + word_freq : {} + - dictionary of word frequencies + - key is a string + - val is an int +""" + + +def get_word_freq(words): + dict = {} + unique_words = [] + for word in words: + if word not in unique_words: + unique_words.append(word) + for word in unique_words: + dict[word] = words.count(word) + return dict + + +""" + Function to return sum of tf-idf score of words in a sentence. + + Parameters + ---------- + sent : string + - sentence containing word + sentences : [] + - array of sentences in text + + Returns + ------- + score : float + - tf-idf score of sentence +""" + + +def tf_idf_score(sent, sentences): + sent = re.sub(INVALID, "", sent) + sent_len = len(word_tokenize(sent)) + words_in_given_sent = preprocess(sent) + word_freq = get_word_freq(words_in_given_sent) + + score = 0 + for word in words_in_given_sent: + tf = tf_score(word_freq[word], sent_len) + idf = idf_score(word, sentences) + tf_idf = tf * idf + score += tf_idf + + return score + + +""" + Function to return term frequency score of a word. 
+ + Parameters + ---------- + freq : int + - frequency of word in given sentence + sent_len : int + - total number of words in sentence + + Returns + ------- + tf_score : float + - tf score of word +""" + + +def tf_score(freq, sent_len): + return freq / sent_len + + +""" + Function to return inverse document frequency of a word. + + Parameters + ---------- + word : string + - word to compute score + setences : [] + - array of sentences in text + + Returns + ------- + score : float + - idf score of word +""" + + +def idf_score(word, sentences): + num_sent_containing_word = 0 + for sent in sentences: + words = preprocess(sent) + if word in words: + num_sent_containing_word += 1 + return math.log10(len(sentences) / num_sent_containing_word) + + +""" + Function to return sentence weights based on tf-idf score + + Parameters + ---------- + sentences : [] + - array of sentences in text + + Returns + ------- + sentence_weight : {} + - dictionary of sentence weights + - key is index of sentence in sentences + - val is tf-idf score +""" + + +def get_sentence_weights(sentences): + sentence_weight = {} + for x in range(len(sentences)): + sentence = sentences[x] + importance = tf_idf_score(sentence, sentences) + sentence_weight[x] = importance + sentence_weight = dict( + sorted(sentence_weight.items(), key=lambda item: item[1], reverse=True) + ) + return sentence_weight + + +""" + Function to return indices of top scoring sentences + + Parameters + ---------- + sentence_weight: {} + - dictionary of sentence weights + num_sent: int + - number of top scoring sentences to return + + Returns + ------- + sentence_idx : [] + - indices of top scoring sentences +""" + + +def get_top_scoring_sent(sentence_weight, num_sent): + curr_num_sentences = 0 + sentence_idx = [] + for key in sentence_weight: + if key == 0: + continue + if curr_num_sentences < num_sent - 1: + sentence_idx.append(key) + curr_num_sentences += 1 + else: + break + sentence_idx.sort() + return sentence_idx + + +""" + 
Function to summarize a given text. + + Parameters + ---------- + text : string + - text to summarize + num_sentences : int + - number of sentences to include in summary + + Returns + ------- + summary : string + - extractive summary of text +""" + + +def summarize(text, num_sentences): + if num_sentences < 1: + raise Exception("Summary must have at least one sentence.") + sentences = sent_tokenize(text) + sentence_weight = get_sentence_weights(sentences) + sentence_idx = get_top_scoring_sent(sentence_weight, num_sentences) + + summary = sentences[0] + for x in range(len(sentences)): + if x in sentence_idx: + summary += " " + summary += sentences[x] + + return summary diff --git a/summary_be/lambda_handlers/summary.py b/summary_be/lambda_handlers/summary.py index 93a79ab..a6c99d9 100644 --- a/summary_be/lambda_handlers/summary.py +++ b/summary_be/lambda_handlers/summary.py @@ -16,7 +16,7 @@ def get_summary(event, context): } try: - result = summarize(text, num_sentences) + result = summarize(text) num_words = len(result.split(" ")) return { "statusCode": 200, diff --git a/summary_be/ml_notebook/dynamic_chunking.ipynb b/summary_be/ml_notebook/dynamic_chunking.ipynb deleted file mode 100644 index 7f3d88f..0000000 --- a/summary_be/ml_notebook/dynamic_chunking.ipynb +++ /dev/null @@ -1,634 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyNORzkxfdxKKcNI5yqF1enI", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Dynamic Chunking\n", - "This notebook implements dynamic chunking of text into semantically distinct paragraphs.\n", - "\n", - "* [SBERT](https://www.sbert.net/) is used to extract 
semantic feature vectors in sentences\n", - "* Vectors can be compared to find sentences of similar meaning\n", - "\n", - "Dynamic chunking solves the following problems:\n", - "\n", - "\n", - "* Summarization of long texts\n", - "* Preservation of detail in the overall summary\n", - "\n", - "**TODO**\n", - "* This notebook is not complete, missing examples and explanations\n", - "* Compare time for each method\n" - ], - "metadata": { - "id": "dTHsgGOLa8Vr" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install sentence_transformers" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QewF-0LpSq1L", - "outputId": "6a7b403a-e3ad-45e5-8d7b-7cbb71e0f7fe" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: sentence_transformers in /usr/local/lib/python3.7/dist-packages (2.2.2)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (4.64.1)\n", - "Requirement already satisfied: torchvision in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (0.13.1+cu113)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (1.7.3)\n", - "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (1.0.2)\n", - "Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (3.7)\n", - "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (0.1.97)\n", - "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (4.24.0)\n", - "Requirement already satisfied: huggingface-hub>=0.4.0 
in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (0.10.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (1.21.6)\n", - "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from sentence_transformers) (1.12.1+cu113)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.13.0)\n", - "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (21.3)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.8.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.1.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.28.1)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.9->huggingface-hub>=0.4.0->sentence_transformers) (3.0.9)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2022.6.2)\n", - "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.7/dist-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.13.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->huggingface-hub>=0.4.0->sentence_transformers) (3.10.0)\n", - "Requirement 
already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from nltk->sentence_transformers) (1.2.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from nltk->sentence_transformers) (7.1.2)\n", - "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2.1.1)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2022.9.24)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (1.24.3)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sentence_transformers) (3.1.0)\n", - "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision->sentence_transformers) (7.1.2)\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import math\n", - "import nltk\n", - "nltk.download('punkt')\n", - "from nltk import sent_tokenize\n", - "from tqdm import tnrange\n", - "from sentence_transformers import SentenceTransformer\n", - "from sklearn.metrics.pairwise import cosine_similarity\n", - "import seaborn as sns\n", - "from scipy.signal import argrelextrema\n", - "import matplotlib.pyplot as plt" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "q0VfRJ_sTNQ6", - "outputId": "f4b03fce-2903-42b6-94f3-96a7d6d388fc" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[nltk_data] 
Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "MODEL = 'all-MiniLM-L6-v2'\n", - "model = SentenceTransformer(MODEL)" - ], - "metadata": { - "id": "ouSiySJaTOnJ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "TEXT = \"\"\"Elon Musk sold $3.95 billion worth of Tesla stock since completing his purchase of Twitter late last month. \n", - " Musk’s Tesla stock sales, totaling 19.5 million shares, have been widely anticipated ever since the Tesla CEO\n", - " reached a deal to buy Twitter for $44 billion. Musk had sold blocks of Tesla shares worth a total of $15.4 billion\n", - " earlier this year since his deal to buy Twitter was announced. Twitter confirmed Musk bought the social media company\n", - " October 27, but he waited until November 4 to start selling additional Tesla shares. He also sold blocks of Tesla \n", - " stock on Monday and Tuesday this week, according to filings to the Securities and Exchange Commission late Tuesday night. \n", - " It’s not clear if the money Musk raised went toward the Twitter purchase, or to support losses at Twitter since he \n", - " took over. Musk disclosed last week that Twitter has seen a “massive drop in revenue,” as a growing number of advertisers \n", - " pause spending on the platform in the wake of his takeover of the company. He blamed “activist groups” pressuring \n", - " advertisers for the loss of ad dollars. He has announced plans to charge users $8 a month to have verified accounts, \n", - " and also announced deep staff cuts. This is not the best time to be selling Tesla shares, which have lost 46% of their \n", - " value so far this year on disappointing sales caused by supply chain problems. 
Musk received an average price of $202.52 \n", - " for the Tesla shares he sold since the Twitter deal closed, which is down 10% just since he closed on his deal to buy \n", - " Twitter. Shares of Tesla fell 0.7% in after-hours trading Tuesday. The company is facing growing competition in the \n", - " electric vehicle market from established automakers such as Volkswagen, Ford and General Motors. And some investors \n", - " have expressed concerns that Musk will be too distracted by his purchase of Twitter to give enough attention to \n", - " addressing Tesla’s problems.\"\"\"" - ], - "metadata": { - "id": "F8ko-i59hK24" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "sns.set(rc={\"figure.dpi\":100, 'savefig.dpi':100})\n", - "sns.set_context('notebook')\n", - "sns.set_style(\"ticks\")" - ], - "metadata": { - "id": "l89kojjIX6MI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Cosine-similarity matrix\n", - "Sentence embeddings are compared using cosine-similarity to find similar sentences.\n", - "\n", - "The closer the value is to 1, the smaller the angle between the vectors and the more similar the sentences are to each other." 
def embed_sentences(text):
    """Split ``text`` into sentences and embed every sentence.

    Args:
        text: Raw document text. It is split with NLTK's ``sent_tokenize``.

    Returns:
        A list holding one embedding per sentence, in document order.
        The list is empty when the tokenizer finds no sentences.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to encode; keep the "empty list in, empty list out" contract.
        return []
    # Encode the whole batch in one call instead of one call per sentence.
    # list(...) keeps the return type a plain list of per-sentence vectors.
    return list(model.encode(sentences))


def create_similarity_matrix(sent_embeddings):
    """Return the pairwise cosine-similarity matrix of the given embeddings.

    Args:
        sent_embeddings: Sequence of sentence embeddings (one row per sentence).

    Returns:
        Whatever ``cosine_similarity`` returns for these rows: entry ``[i][j]``
        is the similarity between sentence ``i`` and sentence ``j``.
    """
    return cosine_similarity(sent_embeddings)
def rev_sigmoid(x):
    """Reversed sigmoid ``1 / (1 + e^(0.5 * x))``: near 1 for very negative x, near 0 for large x."""
    return (1 / (1 + math.exp(0.5 * x)))


def find_splitting_points(similarity_matrix, size=14):
    """Score every sentence by its weighted similarity to the sentences after it.

    For sentence ``j`` the score is ``sum_k w[k] * similarity_matrix[j, j + k]``,
    where ``w`` is ``rev_sigmoid`` sampled on ``linspace(-10, 10, window)``, so
    nearby sentences count almost fully and distant ones barely at all.

    Args:
        similarity_matrix: Square ``(n, n)`` similarity matrix.
        size: Maximum number of look-ahead offsets that get a non-zero weight.
            Must be positive. It is clamped to ``n`` so texts shorter than
            ``size`` sentences no longer crash in ``np.pad``.

    Returns:
        1-D array of length ``n`` with one weighted score per sentence.
    """
    similarity_matrix = np.asarray(similarity_matrix)
    n = similarity_matrix.shape[0]
    if n == 0:
        # np.stack cannot stack an empty list; an empty score vector is the natural result.
        return np.zeros(0)

    # Never use more weights than there are diagonals, otherwise the pad width goes negative.
    window = min(size, n)
    weights = np.array([rev_sigmoid(v) for v in np.linspace(-10, 10, window)])
    # Offsets beyond the window get weight 0.
    activation_weights = np.pad(weights, (0, n - window))

    # Row k holds the k-th upper diagonal, right-padded with zeros to length n.
    diagonals = []
    for offset in range(n):
        diagonal = similarity_matrix.diagonal(offset)
        diagonals.append(np.pad(diagonal, (0, n - len(diagonal))))
    diagonals = np.stack(diagonals)

    weighted = diagonals * activation_weights.reshape(-1, 1)
    return np.sum(weighted, axis=0)
{ - "colab": { - "base_uri": "https://localhost:8080/", - "height": 389 - }, - "id": "lraBzbhFZpu-", - "outputId": "9913530d-d403-4464-bea3-12cd1ba7f968" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
def get_sentence_embeddings(sentence):
    """Embed a single sentence with the notebook-level ``model``.

    Args:
        sentence: One sentence as a string.

    Returns:
        The first (and only) row of ``model.encode([sentence])``.
    """
    embedding = model.encode([sentence])
    return embedding[0]


def cluster(NUM_CLUSTERS = 3):
    """Embed every sentence in the global ``data`` frame and assign k-means clusters.

    The global ``data`` frame is updated in place: it gains (or refreshes) the
    ``embeddings`` and ``cluster`` columns.

    Args:
        NUM_CLUSTERS: Number of clusters passed to NLTK's ``KMeansClusterer``.

    Returns:
        The same global ``data`` frame, now with ``sentence``, ``embeddings``
        and ``cluster`` columns.
    """
    # Imported here so the function does not depend on a mid-notebook import cell.
    from nltk.cluster import KMeansClusterer

    # Name the single raw column only on the first call. After one call the frame
    # has three columns and an unconditional rename would raise ValueError.
    if 'sentence' not in data.columns:
        data.columns = ['sentence']
    data['embeddings'] = data['sentence'].apply(get_sentence_embeddings)

    iterations = 25
    X = np.array(data['embeddings'].tolist())
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        repeats=iterations, avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
    data['cluster'] = assigned_clusters
    return data
"colab": { - "base_uri": "https://localhost:8080/", - "height": 488 - }, - "id": "VksPKXrEzxSp", - "outputId": "9b8ee58f-79ad-4884-cf58-3a1e356ee83b" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " sentence \\\n", - "0 Elon Musk sold $3.95 billion worth of Tesla st... \n", - "1 Musk’s Tesla stock sales, totaling 19.5 millio... \n", - "2 Musk had sold blocks of Tesla shares worth a t... \n", - "3 Twitter confirmed Musk bought the social media... \n", - "4 He also sold blocks of Tesla \\n stock... \n", - "5 It’s not clear if the money Musk raised went t... \n", - "6 Musk disclosed last week that Twitter has seen... \n", - "7 He blamed “activist groups” pressuring \\n ... \n", - "8 He has announced plans to charge users $8 a mo... \n", - "9 This is not the best time to be selling Tesla ... \n", - "10 Musk received an average price of $202.52 \\n ... \n", - "11 Shares of Tesla fell 0.7% in after-hours tradi... \n", - "12 The company is facing growing competition in t... \n", - "13 And some investors \\n have expressed ... \n", - "\n", - " embeddings cluster \n", - "0 [0.015738262, 0.049773764, 0.039228562, -0.002... 1 \n", - "1 [0.0046068756, 0.010958798, 0.06972741, 0.0108... 1 \n", - "2 [-0.014365158, 0.05811444, 0.053490557, 0.0085... 1 \n", - "3 [0.0058419057, -0.014379907, 0.07404005, 0.016... 1 \n", - "4 [-0.054390516, 0.033562854, -0.004229972, 0.01... 0 \n", - "5 [0.007103605, 0.03970247, 0.091158584, -0.0287... 1 \n", - "6 [0.017648496, 0.047859456, 0.077083685, -0.027... 2 \n", - "7 [0.0009142382, 0.03492736, -0.017508835, 0.071... 2 \n", - "8 [-0.04066035, -0.04515377, 0.045038592, -0.019... 2 \n", - "9 [-0.02830206, 0.01951779, 0.061164685, 0.02635... 0 \n", - "10 [0.035062186, 0.0321538, 0.091064826, 0.036340... 1 \n", - "11 [-0.007510837, 0.034629587, 0.07932933, 0.1395... 0 \n", - "12 [0.009982399, -0.004768099, 0.020714289, 0.011... 
0 \n", - "13 [0.05617005, 0.018335959, 0.07072651, 0.013934... 1 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sentenceembeddingscluster
0Elon Musk sold $3.95 billion worth of Tesla st...[0.015738262, 0.049773764, 0.039228562, -0.002...1
1Musk’s Tesla stock sales, totaling 19.5 millio...[0.0046068756, 0.010958798, 0.06972741, 0.0108...1
2Musk had sold blocks of Tesla shares worth a t...[-0.014365158, 0.05811444, 0.053490557, 0.0085...1
3Twitter confirmed Musk bought the social media...[0.0058419057, -0.014379907, 0.07404005, 0.016...1
4He also sold blocks of Tesla \\n stock...[-0.054390516, 0.033562854, -0.004229972, 0.01...0
5It’s not clear if the money Musk raised went t...[0.007103605, 0.03970247, 0.091158584, -0.0287...1
6Musk disclosed last week that Twitter has seen...[0.017648496, 0.047859456, 0.077083685, -0.027...2
7He blamed “activist groups” pressuring \\n ...[0.0009142382, 0.03492736, -0.017508835, 0.071...2
8He has announced plans to charge users $8 a mo...[-0.04066035, -0.04515377, 0.045038592, -0.019...2
9This is not the best time to be selling Tesla ...[-0.02830206, 0.01951779, 0.061164685, 0.02635...0
10Musk received an average price of $202.52 \\n ...[0.035062186, 0.0321538, 0.091064826, 0.036340...1
11Shares of Tesla fell 0.7% in after-hours tradi...[-0.007510837, 0.034629587, 0.07932933, 0.1395...0
12The company is facing growing competition in t...[0.009982399, -0.004768099, 0.020714289, 0.011...0
13And some investors \\n have expressed ...[0.05617005, 0.018335959, 0.07072651, 0.013934...1
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 185 - } - ] - } - ] -} \ No newline at end of file diff --git a/summary_be/ml_notebook/fastBART.ipynb b/summary_be/ml_notebook/fastBART.ipynb deleted file mode 100644 index cb25e73..0000000 --- a/summary_be/ml_notebook/fastBART.ipynb +++ /dev/null @@ -1,1022 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9it0AqXuPR3Y" - }, - "source": [ - "# fastBART\n", - "Adaptation of [fastT5](https://github.com/Ki6an/fastT5) for BART text summarization.\n", - "\n", - "* BART is converted to ONNX format and quantized\n", - "* Model is \"flattened\" into a directed graph with nodes being operators\n", - "* Quantization truncates floating point model weights to 8-bit integers\n", - "* We should expect a significant decrease in model size and inference time\n", - "\n", - "**Work in progress**\n", - "* The ONNX model does not have the same outputs as the PyTorch model -> bug\n", - "* ONNX did not increase inference speed as much as expected -> possibly related to bug above\n", - "* CoLab keeps crashing during quantization" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Transformers architecture\n", - "![picture](https://drive.google.com/uc?export=view?&id=1pEr3mTnWSdLAzCfnDYC0G3lg2jZPZiBU)" - ], - "metadata": { - "id": "C8xyZZU4qQZD" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Decoder inefficiencies\n", - "\n", - "* Note in **Figure 1** that the encoder output is fed as the input to the decoder. 
This output is called a hidden state, and the same hidden state is used for each subsequent computation in the decoder.\n", - "* The encoder output can be computed once, saved, and reused for each subsequent step.\n", - "* The generic Transformers class in PyTorch does not automatically save the encoder output, so the encoder recomputes hidden states during each decoder timestep.\n" - ], - "metadata": { - "id": "2Er0jnSWOfPc" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Token embedding inefficiencies\n", - "\n", - "\n", - "* **Figure 2** illustrates how the embedding of a decoded token depends only on the previous token decoded before it.\n", - "* Caching can be applied to prevent unnecessary computation, shown in **Figure 3**.\n", - "* Separating the encoder and decoder allows us to control the input/output, ensuring cached values are being passed and only computations related to updating the last token are computed.\n", - "\n", - "\"drawing\"" - ], - "metadata": { - "id": "phu_5Nc6mcl4" - } - }, - { - "cell_type": "markdown", - "source": [ - "### HuggingFace optimizations\n", - "Understanding these optimization strategies is important, because HuggingFace applies them to our PyTorch BART model. We need to ensure that these optimizations are carried over correctly to the ONNX model!"
class BartEncoder(torch.nn.Module):
    """Encoder wrapper whose forward pass returns only the last hidden state.

    The last hidden state is the output of the encoder's final layer, which is
    the only encoder output the decoder consumes.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, *args, **kwargs):
        """Run the wrapped encoder and return element 0 of its output."""
        encoder_outputs = self.encoder(*args, **kwargs)
        return encoder_outputs[0]


class DecoderWithLMhead(torch.nn.Module):
    """Decoder plus language-modelling head, fed with a flattened key/value cache.

    ONNX needs flat tensor inputs, so the cache arrives as trailing positional
    tensors and is regrouped into per-layer 4-tuples before the decoder runs.
    """

    def __init__(self, decoder, lm_head, final_logits_bias, config):
        super().__init__()
        self.decoder = decoder
        self.lm_head = lm_head
        # Unlike T5, BART adds a bias to the LM-head logits.
        self.final_logits_bias = final_logits_bias
        self.config = config

    def forward(self, *inputs):
        """Decode one step using cached keys/values.

        Args:
            *inputs: ``input_ids``, ``attention_mask``, ``encoder_hidden_states``
                followed by the flattened cache tensors, four per decoder layer.

        Returns:
            ``(logits, new_past_key_values)`` where the logits are the LM-head
            output plus ``final_logits_bias`` and the cache is element 1 of the
            decoder output.
        """
        input_ids, attention_mask, encoder_hidden_states = inputs[:3]

        # Regroup the flat cache into one 4-tuple per layer.
        flat_cache = inputs[3:]
        layer_starts = range(0, len(flat_cache), 4)
        past_key_values = tuple(flat_cache[start : start + 4] for start in layer_starts)

        decoder_output = self.decoder(
            input_ids=input_ids,  # these are the decoder input ids
            encoder_attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
        )

        hidden_states = decoder_output[0]
        new_cache = decoder_output[1]
        logits = self.lm_head(hidden_states) + self.final_logits_bias
        return logits, new_cache
class DecoderWithLMheadInitial(torch.nn.Module):
    """Decoder plus language-modelling head for the first decoding step.

    No cache is passed in on this step; the decoder output supplies the cache
    that later steps reuse.
    """

    def __init__(self, decoder, lm_head, final_logits_bias, config):
        super().__init__()
        self.decoder = decoder
        self.lm_head = lm_head
        self.final_logits_bias = final_logits_bias
        self.config = config

    def forward(self, input_ids, attention_mask, encoder_hidden_states):
        """Run the decoder without a cache.

        Returns:
            ``(logits, past_key_values)``: LM-head output plus
            ``final_logits_bias``, and element 1 of the decoder output.
        """
        decoder_output = self.decoder(
            input_ids=input_ids,
            encoder_attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
        )
        hidden_states = decoder_output[0]
        produced_cache = decoder_output[1]

        logits = self.lm_head(hidden_states) + self.final_logits_bias
        return (logits, produced_cache)
def get_model_paths(pretrained_model, model_path, quantized):
    """Build, and create the folders for, the three ONNX file paths of a model.

    Args:
        pretrained_model: Model name or path. Only ``Path(...).stem`` is used,
            so anything after the last dot of the final component is dropped.
        model_path: Root ``Path`` under which the model folder is created.
        quantized: When truthy, the component folders get a ``-quantized`` suffix.

    Returns:
        ``(encoder_path, decoder_path, init_decoder_path)``, each ending in
        ``model.onnx``. The parent folders exist afterwards; the files are not created.
    """
    model_path.mkdir(parents=True, exist_ok=True)

    # Keep only the last path component, without its suffix.
    model_dir = model_path / Path(pretrained_model).stem
    folder_suffix = "-quantized" if quantized else ""

    onnx_files = tuple(
        model_dir / (component + folder_suffix) / "model.onnx"
        for component in ("encoder", "decoder", "init-decoder")
    )
    for onnx_file in onnx_files:
        onnx_file.parent.mkdir(parents=True, exist_ok=True)

    return onnx_files


def turn_model_into_encoder_decoder(model):
    """Split a BART model into the three wrappers that get exported to ONNX.

    Args:
        model: Object providing ``get_encoder()``, ``get_decoder()``,
            ``get_output_embeddings()``, ``final_logits_bias`` and ``config``.

    Returns:
        ``(simplified_encoder, decoder_with_lm_head, decoder_with_lm_head_init)``:
        a ``BartEncoder``, a ``DecoderWithLMhead`` and a
        ``DecoderWithLMheadInitial``. Both decoder wrappers share the same
        decoder, LM head, logits bias and config.
    """
    encoder = model.get_encoder()
    # Everything both decoder wrappers are constructed from, in constructor order.
    decoder_parts = (
        model.get_decoder(),
        model.get_output_embeddings(),
        model.final_logits_bias,
        model.config,
    )

    return (
        BartEncoder(encoder),
        DecoderWithLMhead(*decoder_parts),
        DecoderWithLMheadInitial(*decoder_parts),
    )
ONNX is done by \"tracing\" the graph. In other words, it keeps track of operations the PyTorch model uses to process a dummy input. Each operation is then converted to ONNX format.\n", - "* The dummy input here is a CNN news excerpt because it is most similar to what our extension will be processing\n", - "\n", - "* [ONNX documentation](https://pytorch.org/docs/stable/onnx.html#functions)" - ], - "metadata": { - "id": "1NvpHAAWKzSq" - } - }, - { - "cell_type": "code", - "source": [ - "TEXT = \"\"\"Elon Musk sold $3.95 billion worth of Tesla stock since completing his purchase of Twitter late last month. \n", - " Musk’s Tesla stock sales, totaling 19.5 million shares, have been widely anticipated ever since the Tesla CEO\n", - " reached a deal to buy Twitter for $44 billion. Musk had sold blocks of Tesla shares worth a total of $15.4 billion\n", - " earlier this year since his deal to buy Twitter was announced. Twitter confirmed Musk bought the social media company\n", - " October 27, but he waited until November 4 to start selling additional Tesla shares. He also sold blocks of Tesla \n", - " stock on Monday and Tuesday this week, according to filings to the Securities and Exchange Commission late Tuesday night. \n", - " It’s not clear if the money Musk raised went toward the Twitter purchase, or to support losses at Twitter since he \n", - " took over. Musk disclosed last week that Twitter has seen a “massive drop in revenue,” as a growing number of advertisers \n", - " pause spending on the platform in the wake of his takeover of the company. 
def generate_onnx_representation(model):
    """Export a BART model to ONNX as three graphs.

    The three graphs are the cached decoder, the initial (cache-less) decoder
    and the encoder, exported in that order. Tracing inputs are built from the
    notebook-level ``TEXT`` sample, and files are written below the
    notebook-level ``saved_models_path``.

    Args:
        model: BART model whose ``config`` provides ``_name_or_path``,
            ``decoder_attention_heads``, ``d_model`` and ``decoder_layers``.

    Returns:
        ``(encoder_path, decoder_path, init_decoder_path)`` of the written files.
    """
    (
        simplified_encoder,
        decoder_with_lm_head,
        decoder_with_lm_head_init,
    ) = turn_model_into_encoder_decoder(model)

    model_config = model.config

    encoder_path, decoder_path, init_decoder_path = get_model_paths(
        model_config._name_or_path, saved_models_path, quantized=False
    )

    # Dummy encoder inputs: tokenize the sample article.
    tokenizer = AutoTokenizer.from_pretrained(model_config._name_or_path)
    sample_input = TEXT
    model_inputs = tokenizer(sample_input, return_tensors="pt")
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    batch_size = 1
    n_heads = model_config.decoder_attention_heads
    # input_ids is (batch, sequence). The first dimension doubles as the dummy
    # length of the self-attention cache; the second is the encoder length.
    seq_length_a, seq_length_b = input_ids.shape
    d_kv = model_config.d_model // n_heads

    # Dummy decoder inputs.
    input_ids_dec = torch.ones((batch_size, 1), dtype=torch.int64)
    attention_mask_dec = torch.ones((batch_size, seq_length_b), dtype=torch.int64)
    enc_out = torch.ones(
        (batch_size, seq_length_b, model_config.d_model), dtype=torch.float32
    )
    sa = torch.ones(
        (batch_size, n_heads, seq_length_a, d_kv), dtype=torch.float32
    )
    ca = torch.ones(
        (batch_size, n_heads, seq_length_b, d_kv), dtype=torch.float32
    )
    # (self attention keys, self attention values, cross attention keys, cross attention values)
    attention_block = (sa, sa, ca, ca)
    past_key_values = (attention_block,) * model_config.decoder_layers
    # ONNX takes flat tensor inputs, so the per-layer 4-tuples are flattened in order.
    flat_past_key_values = [
        tensor for layer_block in past_key_values for tensor in layer_block
    ]

    decoder_all_inputs = tuple(
        [input_ids_dec, attention_mask_dec, enc_out] + flat_past_key_values
    )

    # Exports to ONNX
    with torch.no_grad():

        decoder_inputs = [
            "input_ids",
            "encoder_attention_mask",
            "encoder_hidden_states",
        ]
        pkv_input_names = ["pkv_{}".format(i) for i in range(len(flat_past_key_values))]
        decoder_input_names = decoder_inputs + pkv_input_names
        # NOTE(review): only two output names are given although the decoder returns
        # many cache tensors; confirm how torch names the remaining outputs.
        decoder_output_names = ["logits", "output_past_key_values"]

        dyn_axis_general = {0: "batch", 1: "sequence"}
        # Cache tensors are (batch, heads, sequence, d_kv): the sequence axis is 2.
        dyn_axis_pkv = {0: "batch", 2: "seq_length"}

        dyn_axis = {
            "input_ids": dyn_axis_general,
            "encoder_attention_mask": dyn_axis_general,
            "encoder_hidden_states": dyn_axis_general,
            "logits": dyn_axis_general,
            # The cache output is 4-D like the cache inputs, so it uses the same axis map.
            "output_past_key_values": dyn_axis_pkv,
        }

        dyn_pkv = {name: dyn_axis_pkv for name in pkv_input_names}

        dyn_axis_params = {**dyn_axis, **dyn_pkv}

        # export decoder to use past key values
        torch.onnx.export(
            decoder_with_lm_head,
            decoder_all_inputs,
            decoder_path.as_posix(),
            export_params=True,
            do_constant_folding=False,
            opset_version=12,
            input_names=decoder_input_names,
            output_names=decoder_output_names,
            dynamic_axes=dyn_axis_params,
        )

        # export initial decoder to produce past key values
        torch.onnx.export(
            decoder_with_lm_head_init,
            (input_ids_dec, attention_mask_dec, enc_out),
            init_decoder_path.as_posix(),
            export_params=True,
            do_constant_folding=False,
            opset_version=12,
            input_names=decoder_inputs,
            output_names=decoder_output_names,
            # Reuse the cached-decoder axis map. The old key "past_key_values"
            # was not one of the output names, so it matched nothing.
            dynamic_axes=dyn_axis,
        )

        # export encoder
        encoder_axes = {0: "batch", 1: "seq_length"}
        torch.onnx.export(
            simplified_encoder,
            args=(input_ids, attention_mask),
            f=encoder_path.as_posix(),
            export_params=True,
            opset_version=12,
            do_constant_folding=True,
            input_names=["input_ids", "attention_mask"],
            output_names=["hidden_states"],
            dynamic_axes={
                "input_ids": dict(encoder_axes),
                "attention_mask": dict(encoder_axes),
                "hidden_states": dict(encoder_axes),
            },
        )

    return encoder_path, decoder_path, init_decoder_path
Try re-reunning the cell or refreshing the notebook, restarting your runtime and \"Run All\".\n", - "# Look in the file paths specified below to check they are generated.\n", - "onnx_model_paths = generate_onnx_representation(model)\n", - "onnx_model_paths" - ], - "metadata": { - "id": "_HiQsj-WrKrt" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import onnx\n", - "#Validating ONNX models, may crash due to RAM restrictions\n", - "try:\n", - " onnx.checker.check_model(\"/content/models/bart-large-cnn/encoder/model.onnx\")\n", - "except onnx.checker.ValidationError as e:\n", - " print(f\"Encoder model is invalid: {e}\")\n", - "else:\n", - " print(\"Encoder model is valid!\")\n", - "\n", - "try:\n", - " onnx.checker.check_model(\"/content/models/bart-large-cnn/decoder/model.onnx\")\n", - "except onnx.checker.ValidationError as e:\n", - " print(f\"Decoder model is invalid: {e}\")\n", - "else:\n", - " print(\"Decoder model is valid!\")\n", - "\n", - "try:\n", - " onnx.checker.check_model(\"/content/models/bart-large-cnn/init-decoder/model.onnx\")\n", - "except onnx.checker.ValidationError as e:\n", - " print(f\"Initial decoder model is invalid: {e}\")\n", - "else:\n", - " print(\"Initial decoder model is valid!\")" - ], - "metadata": { - "id": "7oecfdoqpKi5" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!du -h models/" - ], - "metadata": { - "id": "V8-7x2OA5oz2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import os, psutil\n", - "\n", - "os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n", - "os.environ[\"OMP_WAIT_POLICY\"] = \"ACTIVE\"\n", - "\n", - "from onnxruntime import (\n", - " GraphOptimizationLevel,\n", - " InferenceSession,\n", - " SessionOptions,\n", - " ExecutionMode,\n", - ")\n", - "\n", - "def get_onnx_runtime_sessions(\n", - " model_paths,\n", - " default: bool = True,\n", - " 
opt_level: int = 99,\n", - " parallel_exe_mode: bool = True,\n", - " n_threads: int = 4,\n", - " provider=[\n", - " \"CPUExecutionProvider\",\n", - " ],\n", - ") -> InferenceSession:\n", - " \"\"\"\n", - " Optimizes the model\n", - " Args:\n", - " path_to_encoder (str) : the path of input onnx encoder model.\n", - " path_to_decoder (str) : the path of input onnx decoder model.\n", - " path_to_initial_decoder (str) : the path of input initial onnx decoder model.\n", - " opt_level (int) : sess_options.GraphOptimizationLevel param if set 1 uses 'ORT_ENABLE_BASIC',\n", - " 2 for 'ORT_ENABLE_EXTENDED' and 99 for 'ORT_ENABLE_ALL',\n", - " default value is set to 99.\n", - " parallel_exe_mode (bool) : Sets the execution mode. Default is parallel.\n", - " n_threads (int) : Sets the number of threads used to parallelize the execution within nodes. Default is 0 to let onnxruntime choose\n", - " provider : execution providers list.\n", - " default : set this to true, or it will choose the best settings for your hardware.\n", - " (you can test out different settings for better results.)\n", - " Returns:\n", - " encoder_session : encoder onnx InferenceSession\n", - " decoder_session : decoder onnx InferenceSession\n", - " decoder_sess_init : initial decoder onnx InferenceSession\n", - " \"\"\"\n", - " path_to_encoder, path_to_decoder, path_to_initial_decoder = model_paths\n", - "\n", - " if default:\n", - "\n", - " encoder_sess = InferenceSession(str(path_to_encoder))\n", - "\n", - " decoder_sess = InferenceSession(str(path_to_decoder))\n", - "\n", - " decoder_sess_init = InferenceSession(str(path_to_initial_decoder))\n", - "\n", - " else:\n", - "\n", - " # Few properties that might have an impact on performances\n", - " options = SessionOptions()\n", - "\n", - " if opt_level == 1:\n", - " options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC\n", - " elif opt_level == 2:\n", - " options.graph_optimization_level = (\n", - " 
GraphOptimizationLevel.ORT_ENABLE_EXTENDED\n", - " )\n", - " else:\n", - " assert opt_level == 99\n", - " options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL\n", - "\n", - " # set this true for better performance\n", - " if parallel_exe_mode == True:\n", - " options.execution_mode = ExecutionMode.ORT_PARALLEL\n", - " else:\n", - " options.execution_mode = ExecutionMode.ORT_SEQUENTIAL\n", - "\n", - " options.intra_op_num_threads = n_threads\n", - "\n", - " encoder_sess = InferenceSession(\n", - " str(path_to_encoder), options, providers=provider\n", - " )\n", - "\n", - " decoder_sess = InferenceSession(\n", - " str(path_to_decoder), options, providers=provider\n", - " )\n", - "\n", - " decoder_sess_init = InferenceSession(\n", - " str(path_to_initial_decoder), options, providers=provider\n", - " )\n", - "\n", - " return encoder_sess, decoder_sess, decoder_sess_init" - ], - "metadata": { - "id": "zgISrAZc5rjk" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from transformers.modeling_outputs import (\n", - " BaseModelOutputWithPast,\n", - " Seq2SeqLMOutput,\n", - " BaseModelOutput,\n", - ")\n", - "\n", - "class OnnxBartEncoder(torch.nn.Module):\n", - " def __init__(self, encoder_sess):\n", - " super().__init__()\n", - " self.encoder = encoder_sess\n", - "\n", - " def forward(\n", - " self,\n", - " input_ids,\n", - " attention_mask,\n", - " inputs_embeds=None,\n", - " head_mask=None,\n", - " output_attentions=None,\n", - " output_hidden_states=None,\n", - " return_dict=None,\n", - " ):\n", - " \n", - " encoder_hidden_state = torch.from_numpy(\n", - " self.encoder.run(\n", - " None,\n", - " {\n", - " \"input_ids\": input_ids.cpu().numpy(),\n", - " \"attention_mask\": attention_mask.cpu().numpy(),\n", - " },\n", - " )[0]\n", - " )\n", - "\n", - " return BaseModelOutput(encoder_hidden_state)\n", - "\n", - "\n", - "class OnnxBartDecoderInit(torch.nn.Module):\n", - " def __init__(self, decoder_sess):\n", 
- " super().__init__()\n", - " self.decoder = decoder_sess\n", - "\n", - " def forward(self, input_ids, encoder_attention_mask, encoder_hidden_states):\n", - "\n", - " decoder_outputs = self.decoder.run(\n", - " None,\n", - " {\n", - " \"input_ids\": input_ids.cpu().numpy(),\n", - " \"encoder_attention_mask\": encoder_attention_mask.cpu().numpy(),\n", - " \"encoder_hidden_states\": encoder_hidden_states.cpu().numpy(),\n", - " },\n", - " )\n", - "\n", - " list_pkv = tuple(torch.from_numpy(x) for x in decoder_outputs[1:])\n", - " out_past_key_values = tuple(\n", - " list_pkv[i : i + 4] for i in range(0, len(list_pkv), 4)\n", - " )\n", - "\n", - " return torch.from_numpy(decoder_outputs[0]), out_past_key_values\n", - "\n", - "\n", - "class OnnxBartDecoder(torch.nn.Module):\n", - " def __init__(self, decoder_sess):\n", - " super().__init__()\n", - " self.decoder = decoder_sess\n", - "\n", - " def forward(self, input_ids, attention_mask, encoder_hidden_states, past_key_values):\n", - "\n", - " decoder_inputs = {\n", - " \"input_ids\": input_ids.cpu().numpy(),\n", - " \"encoder_attention_mask\": attention_mask.cpu().numpy(),\n", - " #\"encoder_hidden_states\": encoder_hidden_states.cpu().numpy(),\n", - " }\n", - "\n", - " flat_past_key_values = functools.reduce(operator.iconcat, past_key_values, [])\n", - " \n", - " input_names = [x.name for x in self.decoder.get_inputs()]\n", - " inputs = [\n", - " input_ids.cpu().numpy(),\n", - " attention_mask.cpu().numpy(),\n", - " ] + [\n", - " tensor.cpu().numpy() for tensor in flat_past_key_values\n", - " ]\n", - "\n", - " decoder_inputs = dict(zip(input_names, inputs))\n", - " decoder_outputs = self.decoder.run(None, decoder_inputs)\n", - " \n", - " list_pkv = tuple(torch.from_numpy(x) for x in decoder_outputs[1:])\n", - " out_past_key_values = tuple(\n", - " list_pkv[i : i + 4] for i in range(0, len(list_pkv), 4)\n", - " )\n", - "\n", - " return torch.from_numpy(decoder_outputs[0]), out_past_key_values\n", - "\n", - "class 
OnnxBart(BartForConditionalGeneration):\n", - " \"\"\" creates a BART model using onnx sessions (encode, decoder & init_decoder)\"\"\"\n", - "\n", - " def __init__(self, config, onnx_model_sessions):\n", - " \n", - " # we need to call init of BartPreTrainedModel to not create self.model as \n", - " # BartForConditionalGeneration.__init__ would do!\n", - " super(BartForConditionalGeneration, self).__init__(config)\n", - " \n", - " assert len(onnx_model_sessions) == 3, \"all three models should be given\"\n", - "\n", - " encoder_sess, decoder_sess, decoder_sess_init = onnx_model_sessions\n", - "\n", - " self.encoder = OnnxBartEncoder(encoder_sess)\n", - " self.decoder = OnnxBartDecoder(decoder_sess)\n", - " self.decoder_init = OnnxBartDecoderInit(decoder_sess_init)\n", - " \n", - " @property\n", - " def device(self):\n", - " return \"cpu\"\n", - "\n", - " def get_encoder(self):\n", - " return self.encoder\n", - "\n", - " def get_decoder(self):\n", - " return self.decoder\n", - "\n", - " def get_output_embeddings(self):\n", - " return None\n", - " \n", - " def forward(\n", - " self,\n", - " input_ids=None,\n", - " attention_mask=None,\n", - " decoder_input_ids=None,\n", - " decoder_attention_mask=None,\n", - " head_mask=None,\n", - " decoder_head_mask=None,\n", - " encoder_outputs=None,\n", - " past_key_values=None,\n", - " inputs_embeds=None,\n", - " decoder_inputs_embeds=None,\n", - " labels=None,\n", - " use_cache=None,\n", - " output_attentions=None,\n", - " output_hidden_states=None,\n", - " return_dict=None,\n", - " ):\n", - "\n", - " if encoder_outputs is None:\n", - " # Convert encoder inputs in embeddings if needed\n", - " # (when using generate, we already get encoder_outputs generated\n", - " # by _prepare_encoder_decoder_kwargs_for_generation)\n", - " encoder_outputs = self.encoder(\n", - " input_ids=input_ids, attention_mask=attention_mask\n", - " )\n", - "\n", - " encoder_hidden_states = encoder_outputs[0]\n", - "\n", - " if past_key_values is None:\n", 
- " # runs only for the first time:\n", - " init_onnx_outputs = self.decoder_init(\n", - " decoder_input_ids, attention_mask, encoder_hidden_states\n", - " )\n", - " logits, past_key_values = init_onnx_outputs\n", - "\n", - " else:\n", - " if decoder_input_ids is not None:\n", - " decoder_input_ids = decoder_input_ids[:, -1:]\n", - "\n", - " onnx_outputs = self.decoder(\n", - " decoder_input_ids,\n", - " attention_mask,\n", - " encoder_hidden_states,\n", - " past_key_values,\n", - " )\n", - "\n", - " logits, past_key_values = onnx_outputs\n", - "\n", - " return Seq2SeqLMOutput(logits=logits, past_key_values=past_key_values)" - ], - "metadata": { - "id": "56OS2sJd5t97" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "encoder_path, decoder_path, init_decoder_path = get_model_paths(\n", - " MODEL_PATH, saved_models_path, quantized=False\n", - ")\n", - "onnx_model_paths = encoder_path, decoder_path, init_decoder_path\n", - "onnx_model_paths" - ], - "metadata": { - "id": "OtC8lBIR5v3g" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Warning! 
May crash due to CoLab RAM restrictions\n", - "onnx_model_sessions = get_onnx_runtime_sessions(onnx_model_paths, default=True)\n", - "onnx_model_sessions" - ], - "metadata": { - "id": "MOYswi3N51J7" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "for path, session in zip(onnx_model_paths, onnx_model_sessions):\n", - " print(\"---\")\n", - " print(\"path:\", os.path.join(*path.parts[-3:]))\n", - " inputs = list(map(lambda x: x.name, session.get_inputs()))\n", - " print(f\"inputs({len(inputs)}):\", inputs)\n", - " outputs = list(map(lambda x: x.name, session.get_outputs()))\n", - " print(f\"outputs({len(outputs)}):\", outputs)" - ], - "metadata": { - "id": "tKZ5rvW26baW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "config = AutoConfig.from_pretrained(MODEL_PATH)\n", - "onnx_model = OnnxBart(config, onnx_model_sessions)" - ], - "metadata": { - "id": "rFzo-kd36c3E" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "def summarize(model, t_input):\n", - " inputs = tokenizer.encode(t_input, \n", - " return_tensors='pt', \n", - " #max_length=tokenizer.model_max_length, \n", - " #truncation=True, \n", - " #padding=True,\n", - " )\n", - " summary_ids = model.generate(inputs, \n", - " #min_length=0, \n", - " #max_length=100, \n", - " #length_penalty=15, \n", - " #repetition_penalty=1, \n", - " #early_stopping=True, \n", - " num_beams=3,\n", - " )\n", - " output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)\n", - " return output" - ], - "metadata": { - "id": "Xd66qvZT6xa2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import time\n", - "t0 = time.time()\n", - "print(summarize(model, TEXT))\n", - "t1 = time.time()\n", - "print(t1 - t0)" - ], - "metadata": { - "id": "XXiRFLatFJvy" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - 
"source": [ - "t0 = time.time()\n", - "print(summarize(onnx_model, TEXT))\n", - "t1 = time.time()\n", - "print(t1 - t0)" - ], - "metadata": { - "id": "kVeY9xN3Ff35" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## **TODO**\n", - "\n", - "* REVIEW CODE: As you can see in the cells above, the run time did not improve much. Also, the outputs from both models are different, so there is likely a bug somewhere in the ONNX conversion.\n", - "* Apply quantization, CoLab RAM pls don't let me down 🙏.\n", - "* Look into graph optimizations of the converted model.\n", - "\n", - "### On a side note...\n", - "\n", - "This is a very small excerpt of the converted decoder, visualized using [netron](https://github.com/lutzroeder/netron)!\n", - "\n", - "It's really neat to see the whole model \"flattened\" on a graph.\n", - "\n", - "The actual image is huge, and couldn't be put on this notebook." - ], - "metadata": { - "id": "hxrOJkwqwmu3" - } - }, - { - "cell_type": "markdown", - "source": [ - "![decoder_excerpt.PNG]()" - ], - "metadata": { - "id": "4U-C2bCvwNdY" - } - } - ], - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyPig5weK8vmR2rgUGsiV5O2", - "include_colab_link": true - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/summary_be/requirements.txt b/summary_be/requirements.txt index 526b238..e6dc824 100644 --- a/summary_be/requirements.txt +++ b/summary_be/requirements.txt @@ -1,21 +1,49 @@ -absl-py==1.3.0 +black==22.12.0 +blis==0.7.9 +catalogue==2.0.8 +certifi==2022.12.7 +charset-normalizer==3.1.0 click==8.1.3 colorama==0.4.6 +confection==0.0.4 +coverage==6.5.0 +cymem==2.0.7 Flask==2.2.2 Flask-Cors==3.0.10 +idna==3.4 importlib-metadata==5.0.0 itsdangerous==2.1.2 Jinja2==3.1.2 joblib==1.2.0 +langcodes==3.3.0 
MarkupSafe==2.1.1 +murmurhash==1.0.9 +mypy-extensions==1.0.0 nltk==3.7 numpy==1.23.5 +packaging==23.1 +pathspec==0.11.1 +pathy==0.10.1 +platformdirs==3.2.0 +preshed==3.0.8 +pydantic==1.10.7 python-dotenv==0.21.0 regex==2022.10.31 +requests==2.28.2 rouge-score==0.1.2 six==1.16.0 +smart-open==6.3.0 +spacy==3.5.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.4 +srsly==2.4.6 +thinc==8.1.9 +tomli==2.0.1 tqdm==4.64.1 +typer==0.7.0 +typing_extensions==4.5.0 +urllib3==1.26.15 +wasabi==1.1.1 Werkzeug==2.2.2 zipp==3.10.0 -coverage==6.5.0 -black==22.12.0 +tensorflow==2.8.0 \ No newline at end of file