Merge pull request #28 from ebanalyse/confidence

compute confidence scores
ebanalyse · Aug 23, 2021 · 7624e45
2 parents 2cea1ac + 2cbe351
commit 7624e45
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 7 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# NERDA 0.9.7
+
+* return confidence scores for predictions of all tokens, e.g. model.predict(x, return_confidence=True).
+
 # NERDA 0.9.6
 
 * compute Precision, Recall and Accuracy (optional) with evaluate_performance().

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="NERDA", 
-    version="0.9.6",
+    version="0.9.7",
     author="Lars Kjeldgaard, Lukas Christian Nielsen",
     author_email="[email protected]",
     description="A Framework for Finetuning Transformers for Named-Entity Recognition",

diff --git a/src/NERDA/models.py b/src/NERDA/models.py
@@ -266,7 +266,9 @@ def half(self):
         self.network.half()
         self.halved = True
 
-    def predict(self, sentences: List[List[str]], **kwargs) -> List[List[str]]:
+    def predict(self, sentences: List[List[str]],
+                return_confidence: bool = False,
+                **kwargs) -> List[List[str]]:
         """Predict Named Entities in Word-Tokenized Sentences
 
         Predicts word-tokenized sentences with trained model.
@@ -275,6 +277,9 @@ def predict(self, sentences: List[List[str]], **kwargs) -> List[List[str]]:
             sentences (List[List[str]]): word-tokenized sentences.
             kwargs: arbitrary keyword arguments. For instance
                 'batch_size' and 'num_workers'.
+            return_confidence (bool, optional): if True, return
+                confidence scores for all predicted tokens. Defaults
+                to False.
 
         Returns:
             List[List[str]]: Predicted tags for sentences - one
@@ -288,15 +293,20 @@ def predict(self, sentences: List[List[str]], **kwargs) -> List[List[str]]:
                        device = self.device,
                        tag_encoder = self.tag_encoder,
                        tag_outside = self.tag_outside,
+                       return_confidence = return_confidence,
                        **kwargs)
 
-    def predict_text(self, text: str, **kwargs) -> list:
+    def predict_text(self, text: str, 
+                     return_confidence:bool = False, **kwargs) -> list:
         """Predict Named Entities in a Text
 
         Args:
             text (str): text to predict entities in.
             kwargs: arbitrary keyword arguments. For instance
                 'batch_size' and 'num_workers'.
+            return_confidence (bool, optional): if True, return
+                confidence scores for all predicted tokens. Defaults
+                to False.
 
         Returns:
             tuple: word-tokenized sentences and predicted 
@@ -310,6 +320,7 @@ def predict_text(self, text: str, **kwargs) -> list:
                             device = self.device,
                             tag_encoder = self.tag_encoder,
                             tag_outside = self.tag_outside,
+                            return_confidence=return_confidence,
                             **kwargs)
 
     def evaluate_performance(self, dataset: dict, 

diff --git a/src/NERDA/predictions.py b/src/NERDA/predictions.py
@@ -12,6 +12,10 @@
 import transformers
 import sklearn.preprocessing
 
+def sigmoid_transform(x):
+    prob = 1/(1 + np.exp(-x))
+    return prob
+
 def predict(network: torch.nn.Module, 
             sentences: List[List[str]],
             transformer_tokenizer: transformers.PreTrainedTokenizer,
@@ -23,6 +27,7 @@ def predict(network: torch.nn.Module,
             batch_size: int = 8,
             num_workers: int = 1,
             return_tensors: bool = False,
+            return_confidence: bool = False,
             pad_sequences: bool = True) -> List[List[str]]:
     """Compute predictions.
 
@@ -48,6 +53,9 @@ def predict(network: torch.nn.Module,
         num_workers (int, optional): Number of workers. Defaults
             to 1.
         return_tensors (bool, optional): if True, return tensors.
+        return_confidence (bool, optional): if True, return
+            confidence scores for all predicted tokens. Defaults
+            to False.
         pad_sequences (bool, optional): if True, pad sequences. 
             Defaults to True.
 
@@ -79,6 +87,7 @@ def predict(network: torch.nn.Module,
                            pad_sequences = pad_sequences)
 
     predictions = []
+    probabilities = []
     tensors = []
 
     with torch.no_grad():
@@ -90,27 +99,38 @@ def predict(network: torch.nn.Module,
             for i in range(outputs.shape[0]):
 
                 # extract prediction and transform.
-                preds = tag_encoder.inverse_transform(
-                    outputs[i].argmax(-1).cpu().numpy()
-                )
+
+                # find max by row.
+                values, indices = outputs[i].max(dim=1)
+
+                preds = tag_encoder.inverse_transform(indices.cpu().numpy())
+                probs = values.cpu().numpy()
 
                 if return_tensors:
                     tensors.append(outputs)    
 
                 # subset predictions for original word tokens.
                 preds = [prediction for prediction, offset in zip(preds.tolist(), dl.get('offsets')[i]) if offset]
+                if return_confidence:
+                    probs = [prob for prob, offset in zip(probs.tolist(), dl.get('offsets')[i]) if offset]
 
                 # Remove special tokens ('CLS' + 'SEP').
                 preds = preds[1:-1]
+                if return_confidence:
+                    probs = probs[1:-1]
 
                 # make sure resulting predictions have same length as
                 # original sentence.
 
                 # TODO: Move assert statement to unit tests. Does not work 
                 # in boundary.
                 # assert len(preds) == len(sentences[i])            
-
                 predictions.append(preds)
+                if return_confidence:
+                    probabilities.append(probs)
+
+            if return_confidence:
+                return predictions, probabilities
 
             if return_tensors:
                 return tensors
@@ -128,6 +148,7 @@ def predict_text(network: torch.nn.Module,
                  batch_size: int = 8,
                  num_workers: int = 1,
                  pad_sequences: bool = True,
+                 return_confidence: bool = False,
                  sent_tokenize: Callable = sent_tokenize,
                  word_tokenize: Callable = word_tokenize) -> tuple:
     """Compute Predictions for Text.
@@ -154,6 +175,9 @@ def predict_text(network: torch.nn.Module,
             to 1.
         pad_sequences (bool, optional): if True, pad sequences. 
             Defaults to True.
+        return_confidence (bool, optional): if True, return 
+            confidence scores for predicted tokens. Defaults
+            to False.
 
     Returns:
         tuple: sentence- and word-tokenized text with corresponding
@@ -170,6 +194,7 @@ def predict_text(network: torch.nn.Module,
                           transformer_config = transformer_config,
                           max_len = max_len,
                           device = device,
+                          return_confidence = return_confidence,
                           batch_size = batch_size,
                           num_workers = num_workers,
                           pad_sequences = pad_sequences,

diff --git a/tests/unit_tests/test_predictions.py b/tests/unit_tests/test_predictions.py
@@ -40,6 +40,12 @@ def test_predict_maxlen_exceed():
     sentences = [nltk.word_tokenize(text)]
     model.predict(sentences)
 
+# test confidence scores
+words, preds = model.predict_text(text_single, return_confidence=True)
+
+def test_confs_len():
+    assert len(preds[0])==len(preds[1])
+
 predictions_text_single = model.predict_text(text_single)
 
 def test_predict_text_format():