Replaced models with new ensemble & BERT-based options

GateNLP · Sep 14, 2020 · 4cab6a8 · 4cab6a8
1 parent 270730f
commit 4cab6a8
Show file tree

Hide file tree

Showing 12 changed files with 74 additions and 59 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
 *.pyc
 resources/
 resources.tar.gz
+models/
+models.tar.gz
 __pycache__/
 training/models/
 *.pyc
diff --git a/README.md b/README.md
@@ -17,7 +17,9 @@ This is a re-implementation of Aker et al. (2017) ["Simple Open Stance Classific
 
 2) Clone this repository
 
-3) Download the [resources](http://staffwww.dcs.shef.ac.uk/people/C.Scarton/resources.tar.gz) required for feature extraction and extract it inside the main folder (`StanceClassifer`)
+3) Download the [resources](https://github.com/GateNLP/StanceClassifier/releases/download/v0.1/resources.tar.gz) required for feature extraction and extract it inside the main folder (`StanceClassifier`)
+
+4) Download the [trained models](https://github.com/GateNLP/StanceClassifier/releases/download/v0.1/models.tar.gz) archive and extract it inside the main folder (`StanceClassifier`)
 
 ## Usage
 
@@ -27,10 +29,8 @@ python -m StanceClassifier -l <LANGUAGE> -s <ORIGINAL_JSON> -o <REPLY_JSON> -c <
 ```
     Supported languages: en
     Supported models: 
-        rf (Random Forest)
-        mlp (Multi-layer perceptron)
-        lr (Logistic Regression)
-        svm (Support Vector Machines)
+        ens (Feature-based ensemble model built with Logistic Regression, Random Forest and Multi-Layer Perceptron classifiers)
+        bert-tm (BERT-based model, i.e. fine-tuning of BERT for the rumour stance classification task, using threshold moving for imbalanced data treatment)
 
     The output is a class:
         0.0 = support
@@ -74,4 +74,4 @@ python Test_Run_HTTP_StanceClassifier_Server.py
 To train new models, you can edit `train_model.py` (more support will be given in the future). To run:
 ```
 python train_model.py
-```
+```
diff --git a/StanceClassifier/__main__.py b/StanceClassifier/__main__.py
@@ -12,7 +12,7 @@
 parser.add_argument('-l', help='language', choices=['en'])
 parser.add_argument('-s', help='stance file to be classified (json file)')
 parser.add_argument('-o', help='original stance file (json file)')
-parser.add_argument('-c', help='classifier', choices=['lr', 'rf', 'svm', 'mlp'], required=True)
+parser.add_argument('-c', help='classifier', choices=['ens','bert-tm'], required=True)
 
 
 args = parser.parse_args()

diff --git a/StanceClassifier/features/extract_features.py b/StanceClassifier/features/extract_features.py
@@ -27,26 +27,26 @@
 #from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from StanceClassifier.util import Util, path_from_root
 
-
 class Features():
 
-    def __init__(self, resources): 
-        self.emb_file = resources["embeddings_file"]
-        self.emb_size = int(resources["embeddings_size"])
-        self.emoticons = self.read_emoticon(resources["emoticon"])
-        self.emoticons_cat = np.zeros(42)
-        self.acronyms = self.read_acronyms(resources["acronyms"])
-        self.vulgarWords = self.read_vulgar_words (resources["vulgar_words"])
-        self.googleBadWords = self.read_google_bad_words(resources["google_bad_words"])
-        self.surpriseWords = self.read_surprise_words(resources["affect_surprise"])
-        self.doubtWords = self.read_doubt_words(resources["doubt_words"])
-        self.noDoubtWords = self.read_no_doubt_words(resources["no_doubt_words"])
+    def __init__(self, resources, only_text = False):
+        if only_text == False:
+            self.emb_file = resources["embeddings_file"]
+            self.emb_size = int(resources["embeddings_size"])
+            self.emoticons = self.read_emoticon(resources["emoticon"])
+            self.emoticons_cat = np.zeros(42)
+            self.acronyms = self.read_acronyms(resources["acronyms"])
+            self.vulgarWords = self.read_vulgar_words (resources["vulgar_words"])
+            self.googleBadWords = self.read_google_bad_words(resources["google_bad_words"])
+            self.surpriseWords = self.read_surprise_words(resources["affect_surprise"])
+            self.doubtWords = self.read_doubt_words(resources["doubt_words"])
+            self.noDoubtWords = self.read_no_doubt_words(resources["no_doubt_words"])
 
-        self.scaler = load(path_from_root(resources["scaler"]))
+            #self.scaler = load(resources["scaler"])
 
-        self.glove = self.loadGloveModel(self.emb_file)
+            self.glove = self.loadGloveModel(self.emb_file)
 
-        self.support_terms = ["support", "join", "confirm", "aid", "help"]
+            self.support_terms = ["support", "join", "confirm", "aid", "help"]
 
 
     def read_doubt_words(self, f):
@@ -156,7 +156,6 @@ def features(self, source, reply):
         hc = []
         c = 0
 
-
         if "text" not in source.keys():
             s_text_raw = source["full_text"].lower()
         else:
@@ -167,8 +166,8 @@ def features(self, source, reply):
         else:
             r_text_raw = reply["text"].lower()
 
-        s_text = tokenizer.tokenize(s_text_raw)
-        r_text = tokenizer.tokenize(r_text_raw)
+        s_text = tokenizer.tokenize(s_text_raw).lower()
+        r_text = tokenizer.tokenize(r_text_raw).lower()
 
         s_id = source["id"]
         r_id = reply["id"]
@@ -456,8 +455,24 @@ def features(self, source, reply):
         aux_vector = np.append(s_vector, r_vector)
         final_vector = np.append(aux_vector, hc_vector)
 
-        final_vector_scaler = self.scaler.transform(final_vector.reshape(1, -1))
+        #final_vector_scaler = self.scaler.transform(final_vector.reshape(1, -1))
 
-        return final_vector_scaler
+        return final_vector
 
+    def extract_text(self, source, reply):
+        final_text = []
+        tokenizer = PreprocessTwitter()
+        if "text" not in source.keys():
+            s_text_raw = source["full_text"]
+        else:
+            s_text_raw = source["text"]
+
+        if "text" not in reply.keys():
+            r_text_raw = reply["full_text"]
+        else:
+            r_text_raw = reply["text"]
 
+        s_text = tokenizer.tokenize(s_text_raw)
+        r_text = tokenizer.tokenize(r_text_raw)
+        final_text.append((s_text, r_text))
+        return final_text
diff --git a/StanceClassifier/features/preprocesstwitter.py b/StanceClassifier/features/preprocesstwitter.py
@@ -74,6 +74,6 @@ def re_sub(pattern, repl):
 
         text = " ".join(tknzr.tokenize(text))
 
-        return text.lower()
+        return text
 
 
diff --git a/StanceClassifier/stance_classifier.py b/StanceClassifier/stance_classifier.py
@@ -3,10 +3,10 @@
 import sys
 from joblib import load
 import numpy as np
-
+import ktrain
 from .features.extract_features import Features
 from .util import Util, path_from_root
-
+from .testing import test
 RESOURCES_PATH = path_from_root("resources.txt")
 
 class StanceClassifier():
@@ -17,29 +17,29 @@ def __init__(self, model):
         #Load resources:
         print("Loading resources")
         util = Util()
-
+        
         self.resources = util.loadResources(RESOURCES_PATH)
         print("Done. %d resources added!" % len(self.resources.keys()))
-        self.feature_extractor = Features(self.resources)
-
-    def classify(self, source, reply):
-        """
-        :param source: JSON dict object source
-        :param reply: JSON dict object reply
-        :return: stance_class, stance_prob
-            stance_class float
-                0.0 = support
-                1.0 = deny
-                2.0 = query
-                3.0 = comment
-            stance_prob [support_prob, deny_prob, query_prob, comment_prob]
-        """
-        #Load resources:
-        clf = load(path_from_root(self.resources["model_" + self.model]))
-        tweet_features = np.array(self.feature_extractor.features(source, reply))
-        stance_class = clf.predict(tweet_features.reshape(1, -1))[0]
-        stance_prob = clf.predict_proba(tweet_features.reshape(1, -1))[0]
-        return stance_class, stance_prob
-
-
-
+		
+        if model == "bert-tm":
+            self.feature_extractor = Features(self.resources, only_text = True)
+        else:
+            self.feature_extractor = Features(self.resources, only_text = False)
+
+    def classify(self, source, reply): 
+
+        if self.model == "ens":
+            tweet_features = np.array(self.feature_extractor.features(source, reply)).reshape(1, -1)
+            scaler_list = load(path_from_root(self.resources["scaler_ensemble"]))
+            clf_list = load(path_from_root(self.resources["model_ensemble"]))
+            stance_class, stance_prob = test.predict_ensemble(clf_list, scaler_list, tweet_features)
+			
+        if self.model == "bert-tm":
+            tweet_pair = self.feature_extractor.extract_text(source, reply)
+            clf = ktrain.load_predictor(path_from_root(self.resources["model_bert"]))
+            stance_class, stance_prob = test.predict_bert(clf, tweet_pair)
+
+			        
+        #stance_class = clf.predict(tweet_features.reshape(1, -1))[0]
+        #stance_prob = clf.predict_proba(tweet_features.reshape(1, -1))[0]
+        return stance_class, stance_prob
diff --git a/models/lr.model b/models/lr.model
diff --git a/models/mlp.model b/models/mlp.model
diff --git a/models/rf.model b/models/rf.model
diff --git a/models/scaler b/models/scaler
diff --git a/models/svm.model b/models/svm.model
diff --git a/resources.txt b/resources.txt
@@ -7,8 +7,6 @@ doubt_words|||resources/linguisticFeatures/doubtWords.txt
 google_bad_words|||resources/linguisticFeatures/googleOfficialBadWords.txt
 no_doubt_words|||resources/linguisticFeatures/noDoubtWords.txt
 vulgar_words|||resources/linguisticFeatures/vulgarWordsDic.txt
-scaler|||models/scaler
-model_lr|||models/lr.model
-model_rf|||models/rf.model
-model_svm|||models/svm.model
-model_mlp|||models/mlp.model
+scaler_ensemble|||models/scaler_ens
+model_ensemble|||models/model_ensemble
+model_bert|||models/BERT_TM