Skip to content

Commit

Permalink
Replaced models with new ensemble & BERT-based options
Browse files Browse the repository at this point in the history
  • Loading branch information
Ian Roberts committed Sep 14, 2020
1 parent 270730f commit 4cab6a8
Show file tree
Hide file tree
Showing 12 changed files with 74 additions and 59 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
*.pyc
resources/
resources.tar.gz
models/
models.tar.gz
__pycache__/
training/models/
*.pyc
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ This is a re-implementation of Aker et al. (2017) ["Simple Open Stance Classific

2) Clone this repository

3) Download the [resources](http://staffwww.dcs.shef.ac.uk/people/C.Scarton/resources.tar.gz) required for feature extraction and extract it inside the main folder (`StanceClassifer`)
3) Download the [resources](https://github.com/GateNLP/StanceClassifier/releases/download/v0.1/resources.tar.gz) archive required for feature extraction and extract it inside the main folder (`StanceClassifier`)

4) Download the [trained models](https://github.com/GateNLP/StanceClassifier/releases/download/v0.1/models.tar.gz) archive and extract it inside the main folder (`StanceClassifier`)

## Usage

Expand All @@ -27,10 +29,8 @@ python -m StanceClassifier -l <LANGUAGE> -s <ORIGINAL_JSON> -o <REPLY_JSON> -c <
```
Supported languages: en
Supported models:
rf (Random Forest)
mlp (Multi-layer perceptron)
lr (Logistic Regression)
svm (Support Vector Machines)
ens (Feature-based ensemble model built with Logistic Regression, Random Forest and Multi-Layer Perceptron classifiers)
bert-tm (BERT-based model, i.e. fine-tuning of BERT for the rumour stance classification task, using threshold moving for imbalanced data treatment)

The output is a class:
0.0 = support
Expand Down Expand Up @@ -74,4 +74,4 @@ python Test_Run_HTTP_StanceClassifier_Server.py
To train new models, you can edit `train_model.py` (more support will be given in the future). To run:
```
python train_model.py
```
```
2 changes: 1 addition & 1 deletion StanceClassifier/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
parser.add_argument('-l', help='language', choices=['en'])
parser.add_argument('-s', help='stance file to be classified (json file)')
parser.add_argument('-o', help='original stance file (json file)')
parser.add_argument('-c', help='classifier', choices=['lr', 'rf', 'svm', 'mlp'], required=True)
parser.add_argument('-c', help='classifier', choices=['ens','bert-tm'], required=True)


args = parser.parse_args()
Expand Down
55 changes: 35 additions & 20 deletions StanceClassifier/features/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,26 @@
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from StanceClassifier.util import Util, path_from_root


class Features():

def __init__(self, resources):
self.emb_file = resources["embeddings_file"]
self.emb_size = int(resources["embeddings_size"])
self.emoticons = self.read_emoticon(resources["emoticon"])
self.emoticons_cat = np.zeros(42)
self.acronyms = self.read_acronyms(resources["acronyms"])
self.vulgarWords = self.read_vulgar_words (resources["vulgar_words"])
self.googleBadWords = self.read_google_bad_words(resources["google_bad_words"])
self.surpriseWords = self.read_surprise_words(resources["affect_surprise"])
self.doubtWords = self.read_doubt_words(resources["doubt_words"])
self.noDoubtWords = self.read_no_doubt_words(resources["no_doubt_words"])
def __init__(self, resources, only_text = False):
if only_text == False:
self.emb_file = resources["embeddings_file"]
self.emb_size = int(resources["embeddings_size"])
self.emoticons = self.read_emoticon(resources["emoticon"])
self.emoticons_cat = np.zeros(42)
self.acronyms = self.read_acronyms(resources["acronyms"])
self.vulgarWords = self.read_vulgar_words (resources["vulgar_words"])
self.googleBadWords = self.read_google_bad_words(resources["google_bad_words"])
self.surpriseWords = self.read_surprise_words(resources["affect_surprise"])
self.doubtWords = self.read_doubt_words(resources["doubt_words"])
self.noDoubtWords = self.read_no_doubt_words(resources["no_doubt_words"])

self.scaler = load(path_from_root(resources["scaler"]))
#self.scaler = load(resources["scaler"])

self.glove = self.loadGloveModel(self.emb_file)
self.glove = self.loadGloveModel(self.emb_file)

self.support_terms = ["support", "join", "confirm", "aid", "help"]
self.support_terms = ["support", "join", "confirm", "aid", "help"]


def read_doubt_words(self, f):
Expand Down Expand Up @@ -156,7 +156,6 @@ def features(self, source, reply):
hc = []
c = 0


if "text" not in source.keys():
s_text_raw = source["full_text"].lower()
else:
Expand All @@ -167,8 +166,8 @@ def features(self, source, reply):
else:
r_text_raw = reply["text"].lower()

s_text = tokenizer.tokenize(s_text_raw)
r_text = tokenizer.tokenize(r_text_raw)
s_text = tokenizer.tokenize(s_text_raw).lower()
r_text = tokenizer.tokenize(r_text_raw).lower()

s_id = source["id"]
r_id = reply["id"]
Expand Down Expand Up @@ -456,8 +455,24 @@ def features(self, source, reply):
aux_vector = np.append(s_vector, r_vector)
final_vector = np.append(aux_vector, hc_vector)

final_vector_scaler = self.scaler.transform(final_vector.reshape(1, -1))
#final_vector_scaler = self.scaler.transform(final_vector.reshape(1, -1))

return final_vector_scaler
return final_vector

def extract_text(self, source, reply):
    """Return the tokenized source/reply texts as a one-element list of pairs.

    For each tweet the ``"text"`` field is used when present; otherwise the
    ``"full_text"`` field is read. Both raw strings are run through
    ``PreprocessTwitter.tokenize``.

    :param source: JSON dict object of the source tweet
    :param reply: JSON dict object of the reply tweet
    :return: ``[(source_text, reply_text)]``
    """
    preprocessor = PreprocessTwitter()

    # Prefer "text", fall back to "full_text" when it is missing.
    source_raw = source["text"] if "text" in source else source["full_text"]
    reply_raw = reply["text"] if "text" in reply else reply["full_text"]

    source_tokens = preprocessor.tokenize(source_raw)
    reply_tokens = preprocessor.tokenize(reply_raw)
    return [(source_tokens, reply_tokens)]
2 changes: 1 addition & 1 deletion StanceClassifier/features/preprocesstwitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,6 @@ def re_sub(pattern, repl):

text = " ".join(tknzr.tokenize(text))

return text.lower()
return text


52 changes: 26 additions & 26 deletions StanceClassifier/stance_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import sys
from joblib import load
import numpy as np

import ktrain
from .features.extract_features import Features
from .util import Util, path_from_root

from .testing import test
RESOURCES_PATH = path_from_root("resources.txt")

class StanceClassifier():
Expand All @@ -17,29 +17,29 @@ def __init__(self, model):
#Load resources:
print("Loading resources")
util = Util()

self.resources = util.loadResources(RESOURCES_PATH)
print("Done. %d resources added!" % len(self.resources.keys()))
self.feature_extractor = Features(self.resources)

def classify(self, source, reply):
"""
:param source: JSON dict object source
:param reply: JSON dict object reply
:return: stance_class, stance_prob
stance_class float
0.0 = support
1.0 = deny
2.0 = query
3.0 = comment
stance_prob [support_prob, deny_prob, query_prob, comment_prob]
"""
#Load resources:
clf = load(path_from_root(self.resources["model_" + self.model]))
tweet_features = np.array(self.feature_extractor.features(source, reply))
stance_class = clf.predict(tweet_features.reshape(1, -1))[0]
stance_prob = clf.predict_proba(tweet_features.reshape(1, -1))[0]
return stance_class, stance_prob



if model == "bert-tm":
self.feature_extractor = Features(self.resources, only_text = True)
else:
self.feature_extractor = Features(self.resources, only_text = False)

def classify(self, source, reply):
    """Classify the stance of ``reply`` towards ``source``.

    :param source: JSON dict object source
    :param reply: JSON dict object reply
    :return: stance_class, stance_prob -- whatever the selected predictor
        (``test.predict_ensemble`` or ``test.predict_bert``) returns.
        The previous implementation documented stance_class as a float
        (0.0 = support, 1.0 = deny, 2.0 = query, 3.0 = comment) and
        stance_prob as [support_prob, deny_prob, query_prob, comment_prob];
        NOTE(review): confirm the new predictors keep this encoding.
    :raises ValueError: if ``self.model`` is neither ``"ens"`` nor
        ``"bert-tm"``.
    """
    # NOTE: scalers/models are loaded from disk on every call.
    if self.model == "ens":
        tweet_features = np.array(
            self.feature_extractor.features(source, reply)
        ).reshape(1, -1)
        scaler_list = load(path_from_root(self.resources["scaler_ensemble"]))
        clf_list = load(path_from_root(self.resources["model_ensemble"]))
        stance_class, stance_prob = test.predict_ensemble(
            clf_list, scaler_list, tweet_features
        )
    elif self.model == "bert-tm":
        tweet_pair = self.feature_extractor.extract_text(source, reply)
        clf = ktrain.load_predictor(path_from_root(self.resources["model_bert"]))
        stance_class, stance_prob = test.predict_bert(clf, tweet_pair)
    else:
        # Previously this fell through to the return and failed with an
        # opaque UnboundLocalError; report the actual problem instead.
        raise ValueError(
            "Unsupported model %r; expected 'ens' or 'bert-tm'" % (self.model,)
        )

    return stance_class, stance_prob
Binary file removed models/lr.model
Binary file not shown.
Binary file removed models/mlp.model
Binary file not shown.
Binary file removed models/rf.model
Binary file not shown.
Binary file removed models/scaler
Binary file not shown.
Binary file removed models/svm.model
Binary file not shown.
8 changes: 3 additions & 5 deletions resources.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ doubt_words|||resources/linguisticFeatures/doubtWords.txt
google_bad_words|||resources/linguisticFeatures/googleOfficialBadWords.txt
no_doubt_words|||resources/linguisticFeatures/noDoubtWords.txt
vulgar_words|||resources/linguisticFeatures/vulgarWordsDic.txt
scaler|||models/scaler
model_lr|||models/lr.model
model_rf|||models/rf.model
model_svm|||models/svm.model
model_mlp|||models/mlp.model
scaler_ensemble|||models/scaler_ens
model_ensemble|||models/model_ensemble
model_bert|||models/BERT_TM

0 comments on commit 4cab6a8

Please sign in to comment.