Update to include support for gte embedding
wellner committed Aug 23, 2024
1 parent fa9e8a2 commit 9224052
Showing 4 changed files with 34 additions and 36 deletions.
7 changes: 4 additions & 3 deletions examples/f_seqbow_20news.py
@@ -42,7 +42,7 @@
 npmi_matrix = npmi_calc.get_full_vocab_npmi_matrix(X, vectorizer)


-supervised = False # True
+supervised = True # False # True
 use_logging = True

 # %%
@@ -68,7 +68,8 @@

 # %%
 # We'll use distilbert here as it's more compute efficient than BERT
-tf_llm_name = 'distilbert-base-uncased'
+#tf_llm_name, use_pooling = 'distilbert-base-uncased', False
+tf_llm_name, use_pooling = 'Alibaba-NLP/gte-base-en-v1.5', False

 if supervised:
     train_ds = list(zip(train_y_s, train_data))
@@ -99,7 +100,7 @@

 estimator = SeqBowEstimator(llm_model_name = tf_llm_name,
                             latent_distribution = latent_distribution,
-                            n_labels = num_classes,
+                            n_labels = num_classes, pool_encoder=use_pooling,
                             vocabulary = vectorizer.get_vocab(),
                             batch_size=batch_size, device=device, log_interval=1,
                             log_method=log_method, gamma=100.0,
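
The two hunks above work together: the example now chooses the encoder checkpoint and a pooling flag as a pair, and the flag is passed into the estimator as pool_encoder. A condensed sketch of that wiring, reusing only names that appear in the example (the rest of examples/f_seqbow_20news.py, e.g. vectorizer, latent_distribution, num_classes, and the batch/logging settings, is assumed to be in scope):

    # Sketch only: condenses the example's new encoder selection.
    tf_llm_name, use_pooling = 'Alibaba-NLP/gte-base-en-v1.5', False
    # tf_llm_name, use_pooling = 'distilbert-base-uncased', False   # previous choice, kept as a comment

    estimator = SeqBowEstimator(llm_model_name=tf_llm_name,
                                latent_distribution=latent_distribution,
                                n_labels=num_classes,
                                pool_encoder=use_pooling,
                                vocabulary=vectorizer.get_vocab(),
                                batch_size=batch_size, device=device, log_interval=1,
                                log_method=log_method, gamma=100.0)
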
9 changes: 5 additions & 4 deletions tmnt/data_loading.py
@@ -41,21 +41,22 @@
     'allenai/scibert_scivocab_uncased': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'johngiorgi/declutr-sci-base': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'BAAI/bge-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
-    'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
-    ## add more model options here if desired
+    'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
+    'Alibaba-NLP/gte-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
+    ## add more model options here ...
     }

 def get_llm(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
-    return tok_fn(model_name), model_fn(model_name)
+    return tok_fn(model_name), model_fn(model_name, trust_remote_code=True)

 def get_llm_tokenizer(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
     return tok_fn(model_name)

 def get_llm_model(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
-    return model_fn(model_name)
+    return model_fn(model_name, trust_remote_code=True)

 def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, shuffle=False, device='cpu'):
     label_pipeline = lambda x: label_map.get(x, 0)
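
A minimal usage sketch for the catalog change above, assuming tmnt is installed and the checkpoint can be downloaded; get_llm now loads the model with trust_remote_code=True, which the gte checkpoints need because they ship custom modeling code:

    # Sketch: load the newly catalogued encoder through tmnt.data_loading.get_llm.
    import torch
    from tmnt.data_loading import get_llm

    tokenizer, model = get_llm('Alibaba-NLP/gte-base-en-v1.5')   # returns (tokenizer, model)

    batch = tokenizer(["a short test document"], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        out = model(**batch)
    cls_vec = out.last_hidden_state[:, 0]   # one embedding per document, taken at the [CLS] position
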
35 changes: 6 additions & 29 deletions tmnt/estimator.py
@@ -285,16 +285,11 @@ def from_config(cls, config: Union[str, dict], vocabulary: Union[str, torchtext.
                 logging.error("File {} does not appear to be a valid vocabulary file".format(vocabulary))
                 raise Exception("Invalid Json Configuration File")
             vocabulary = torchtext.vocab.vocab(voc_js)
-        #if vocabulary['embedding'] is not None:
-        if False:
-            raise Exception("Pre-trained embeddings not yet (re-)supported")
-            #emb_size = vocabulary['embedding'].idx_to_vec[0].size
-        else:
-            emb_size = config['embedding'].get('size')
-            if not emb_size:
-                emb_size = config['derived_info'].get('embedding_size')
-                if not emb_size:
-                    raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
+        emb_size = config['embedding'].get('size')
+        if not emb_size:
+            emb_size = config['derived_info'].get('embedding_size')
+            if not emb_size:
+                raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
         gamma = config.get('gamma', 1.0)
         multilabel = config.get('multilabel', False)
         lr = config['lr']
@@ -781,12 +776,6 @@ def from_config(cls, *args, **kwargs):
     def _get_model(self, bow_size=-1):
         if self.embedding_source != 'random':
             e_type, e_name = tuple(self.embedding_source.split(':'))
-            #pt_embedding = nlp.embedding.create(e_type, source=e_name)
-            #self.vocabulary.set_embedding(pt_embedding)
-            #emb_size = len(self.vocabulary.embedding.idx_to_vec[0])
-            #for word in self.vocabulary.embedding._idx_to_token:
-            #    if (self.vocabulary.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
-            #        self.vocabulary.embedding[word] = mx.nd.random.normal(0, 0.1, emb_size)
         else:
             emb_size = self.embedding_size
         model = \
@@ -1030,7 +1019,6 @@ def _get_model_bias_initialize(self, train_data):
         tr_bow_counts = self._get_bow_wd_counts(train_data)
         model.initialize_bias_terms(tr_bow_counts)
         if self.npmi_matrix is not None:
-            print("****** INITIALIZING NPMI LOSS FUNCTION *******")
             model.initialize_npmi_loss(self.npmi_matrix)
         return model

@@ -1057,7 +1045,6 @@ def _get_config(self):
         else:
             config['latent_distribution'] = {'dist_type':'gaussian'}
         config['epochs'] = self.epochs
-        #config['embedding_source'] = self.embedding_source
         config['gamma'] = self.gamma
         config['warmup_ratio'] = self.warmup_ratio
         config['llm_model_name'] = self.llm_model_name
@@ -1091,9 +1078,6 @@ def log_train(self, batch_id, batch_num, step_loss, rec_loss, red_loss, class_lo
                   log_interval, epoch_id, learning_rate):
         """Generate and print out the log message for training. """
         if self.has_classifier:
-            #metric_nm, metric_val = self.metric.compute()
-            #if not isinstance(metric_nm, list):
-            #    metric_nm, metric_val = [metric_nm], [metric_val]
             metric_nm = "AUPRC"
             try:
                 metric_val = self.metric.compute()
@@ -1126,7 +1110,6 @@ def _get_bow_matrix(self, dataloader, cache=False):
         rows = 0
         for i, data in enumerate(dataloader):
             seqs, = data
-            #bow_batch = list(seqs[3].squeeze(axis=1))
             bow_batch = list(seqs[3])
             rows += len(bow_batch)
             if i >= max_rows:
@@ -1170,10 +1153,7 @@ def _get_losses(self, model, batch_data):
             label_ls = label_ls.mean()
             total_ls = (self.gamma * label_ls) + elbo_ls.mean()
             if not self.multilabel:
-                #label_ind = label.argmax(dim=0)
-                #self.metric.update([out], [label_ind])
                 self.metric.update(torch.tensor(out), torch.tensor(label))
-                #self.metric.update(torch.Tensor([out]), torch.Tensor([label_ind]))
             else:
                 self.metric.update([out], [label])
         else:
@@ -1214,7 +1194,6 @@ def fit_with_validation(self,
         joint_loader = PairedDataLoader(train_data, aux_data)
         num_train_steps = len(joint_loader) * self.epochs

-        ## The following from HuggingFace trainer.py lines 1047 to 1063
         decay_parameters = get_parameter_names(model.llm, ALL_LAYERNORM_LAYERS)
         decay_parameters = [name for name in decay_parameters if "bias" not in name]
         non_llm_parameters = [name for name,_ in model.named_parameters() if not name.startswith("llm")]
@@ -1288,10 +1267,8 @@ def update_loss_details(total_ls, elbo_ls, red_ls, class_ls):
                 if aux_batch is not None:
                     update_loss_details(total_ls_2, elbo_ls_2, red_ls_2, None)

-                #debug
-
                 if not accumulate or (batch_id + 1) % accumulate == 0:
-                    #torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
                     torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
                     optimizer.step()
                     dec_optimizer.step()
                     lr_scheduler.step()
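
The optimizer setup shown in fit_with_validation uses get_parameter_names with ALL_LAYERNORM_LAYERS, the standard Hugging Face pattern of excluding LayerNorm parameters and biases from weight decay. An illustrative sketch of that pattern in isolation (not TMNT's exact optimizer code; the model choice and hyperparameters here are placeholders):

    # Sketch of HF-style weight-decay grouping; values are illustrative.
    import torch
    from transformers import AutoModel
    from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
    from transformers.trainer_pt_utils import get_parameter_names

    llm = AutoModel.from_pretrained('distilbert-base-uncased')

    # Parameters that should receive weight decay: everything except
    # parameters inside LayerNorm modules and any bias terms.
    decay_parameters = get_parameter_names(llm, ALL_LAYERNORM_LAYERS)
    decay_parameters = [name for name in decay_parameters if "bias" not in name]

    grouped = [
        {"params": [p for n, p in llm.named_parameters() if n in decay_parameters],
         "weight_decay": 0.01},
        {"params": [p for n, p in llm.named_parameters() if n not in decay_parameters],
         "weight_decay": 0.0},
    ]
    optimizer = torch.optim.AdamW(grouped, lr=2e-5)
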
19 changes: 19 additions & 0 deletions tmnt/eval_npmi.py
@@ -115,6 +115,25 @@ def get_full_vocab_npmi_matrix(self, mat):
                 npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
                 npmi_matrix[w1, w2] = npmi
         return npmi_matrix
+
+class EvaluateNPMIUmass(object):
+
+    def __init__(self, npmi_matrix: np.array, vectorizer: TMNTVectorizer):
+        self.vectorizer = vectorizer
+        self.npmi_matrix = npmi_matrix # by convention this will be lower-triangular
+        dim = npmi_matrix.shape[0]
+        for mc in range(self.npmi_matrix.shape[0]):
+            for i in range(mc+1,dim):
+                self.npmi_matrix[mc,i] = self.npmi_matrix[i,mc]
+
+    def evaluate_topics(self, topic_ids):
+        npmi_score = 0.0
+        total_size = len(topic_ids) * len(topic_ids[0])
+        for topic in topic_ids:
+            for (w1, w2) in combinations(topic, 2):
+                npmi_score += self.npmi_matrix[w1, w2]
+        return npmi_score / total_size
+


 class FullNPMI(object):
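
A minimal usage sketch for the new EvaluateNPMIUmass class, assuming numpy and itertools.combinations are imported in tmnt.eval_npmi; the NPMI values and topic index lists below are toy data for illustration only:

    # Sketch: score topic coherence with a toy lower-triangular NPMI matrix.
    import numpy as np
    from tmnt.eval_npmi import EvaluateNPMIUmass

    npmi = np.zeros((5, 5))                        # 5-word toy vocabulary
    npmi[1, 0], npmi[2, 0], npmi[2, 1] = 0.4, 0.1, 0.3
    npmi[3, 2], npmi[4, 3] = 0.2, -0.1

    # The constructor stores the vectorizer and symmetrizes the matrix; a fitted
    # TMNTVectorizer (or None for a quick check) can be passed as the second argument.
    evaluator = EvaluateNPMIUmass(npmi, None)

    topics = [[0, 1, 2], [2, 3, 4]]                # each topic: vocabulary indices of its top words
    print(evaluator.evaluate_topics(topics))       # normalized sum of pairwise NPMI within each topic
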
