Update to include support for gte embedding
wellner committed Aug 23, 2024
1 parent fa9e8a2 commit 9224052
Showing 4 changed files with 34 additions and 36 deletions.
7 changes: 4 additions & 3 deletions examples/f_seqbow_20news.py
@@ -42,7 +42,7 @@
 npmi_matrix = npmi_calc.get_full_vocab_npmi_matrix(X, vectorizer)


-supervised = False # True
+supervised = True # False # True
 use_logging = True

 # %%
@@ -68,7 +68,8 @@

 # %%
 # We'll use distilbert here as it's more compute efficient than BERT
-tf_llm_name = 'distilbert-base-uncased'
+#tf_llm_name, use_pooling = 'distilbert-base-uncased', False
+tf_llm_name, use_pooling = 'Alibaba-NLP/gte-base-en-v1.5', False

 if supervised:
     train_ds = list(zip(train_y_s, train_data))
@@ -99,7 +100,7 @@

 estimator = SeqBowEstimator(llm_model_name = tf_llm_name,
                             latent_distribution = latent_distribution,
-                            n_labels = num_classes,
+                            n_labels = num_classes, pool_encoder=use_pooling,
                             vocabulary = vectorizer.get_vocab(),
                             batch_size=batch_size, device=device, log_interval=1,
                             log_method=log_method, gamma=100.0,
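
The two hunks above work together: the example now chooses the encoder checkpoint and a pooling flag as a pair, and the flag is passed into the estimator as pool_encoder. A condensed sketch of that wiring, reusing only names that appear in the example (the rest of examples/f_seqbow_20news.py, e.g. vectorizer, latent_distribution, num_classes, and the batch/logging settings, is assumed to be in scope):

    # Sketch only: condenses the example's new encoder selection.
    tf_llm_name, use_pooling = 'Alibaba-NLP/gte-base-en-v1.5', False
    # tf_llm_name, use_pooling = 'distilbert-base-uncased', False   # previous choice, kept as a comment

    estimator = SeqBowEstimator(llm_model_name=tf_llm_name,
                                latent_distribution=latent_distribution,
                                n_labels=num_classes,
                                pool_encoder=use_pooling,
                                vocabulary=vectorizer.get_vocab(),
                                batch_size=batch_size, device=device, log_interval=1,
                                log_method=log_method, gamma=100.0)
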
9 changes: 5 additions & 4 deletions tmnt/data_loading.py
@@ -41,21 +41,22 @@
     'allenai/scibert_scivocab_uncased': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'johngiorgi/declutr-sci-base': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
     'BAAI/bge-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
-    'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
-    ## add more model options here if desired
+    'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained),
+    'Alibaba-NLP/gte-base-en-v1.5': (AutoTokenizer.from_pretrained, AutoModel.from_pretrained)
+    ## add more model options here ...
     }

 def get_llm(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
-    return tok_fn(model_name), model_fn(model_name)
+    return tok_fn(model_name), model_fn(model_name, trust_remote_code=True)

 def get_llm_tokenizer(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
     return tok_fn(model_name)

 def get_llm_model(model_name):
     tok_fn, model_fn = llm_catalog.get(model_name, ((AutoTokenizer.from_pretrained, AutoModel.from_pretrained)))
-    return model_fn(model_name)
+    return model_fn(model_name, trust_remote_code=True)

 def get_unwrapped_llm_dataloader(data, bow_vectorizer, llm_name, label_map, batch_size, max_len, shuffle=False, device='cpu'):
     label_pipeline = lambda x: label_map.get(x, 0)
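
A minimal usage sketch for the catalog change above, assuming tmnt is installed and the checkpoint can be downloaded; get_llm now loads the model with trust_remote_code=True, which the gte checkpoints need because they ship custom modeling code:

    # Sketch: load the newly catalogued encoder through tmnt.data_loading.get_llm.
    import torch
    from tmnt.data_loading import get_llm

    tokenizer, model = get_llm('Alibaba-NLP/gte-base-en-v1.5')   # returns (tokenizer, model)

    batch = tokenizer(["a short test document"], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        out = model(**batch)
    cls_vec = out.last_hidden_state[:, 0]   # one embedding per document, taken at the [CLS] position
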
35 changes: 6 additions & 29 deletions tmnt/estimator.py
@@ -285,16 +285,11 @@ def from_config(cls, config: Union[str, dict], vocabulary: Union[str, torchtext.
                 logging.error("File {} does not appear to be a valid vocabulary file".format(vocabulary))
                 raise Exception("Invalid Json Configuration File")
             vocabulary = torchtext.vocab.vocab(voc_js)
-        #if vocabulary['embedding'] is not None:
-        if False:
-            raise Exception("Pre-trained embeddings not yet (re-)supported")
-            #emb_size = vocabulary['embedding'].idx_to_vec[0].size
-        else:
-            emb_size = config['embedding'].get('size')
-            if not emb_size:
-                emb_size = config['derived_info'].get('embedding_size')
-                if not emb_size:
-                    raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
+        emb_size = config['embedding'].get('size')
+        if not emb_size:
+            emb_size = config['derived_info'].get('embedding_size')
+            if not emb_size:
+                raise Exception("Embedding size must be provided as the 'size' attribute of 'embedding' or as 'derived_info.embedding_size'")
         gamma = config.get('gamma', 1.0)
         multilabel = config.get('multilabel', False)
         lr = config['lr']
@@ -781,12 +776,6 @@ def from_config(cls, *args, **kwargs):
     def _get_model(self, bow_size=-1):
         if self.embedding_source != 'random':
             e_type, e_name = tuple(self.embedding_source.split(':'))
-            #pt_embedding = nlp.embedding.create(e_type, source=e_name)
-            #self.vocabulary.set_embedding(pt_embedding)
-            #emb_size = len(self.vocabulary.embedding.idx_to_vec[0])
-            #for word in self.vocabulary.embedding._idx_to_token:
-            #    if (self.vocabulary.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
-            #        self.vocabulary.embedding[word] = mx.nd.random.normal(0, 0.1, emb_size)
         else:
             emb_size = self.embedding_size
         model = \
@@ -1030,7 +1019,6 @@ def _get_model_bias_initialize(self, train_data):
         tr_bow_counts = self._get_bow_wd_counts(train_data)
         model.initialize_bias_terms(tr_bow_counts)
         if self.npmi_matrix is not None:
-            print("****** INITIALIZING NPMI LOSS FUNCTION *******")
             model.initialize_npmi_loss(self.npmi_matrix)
         return model

@@ -1057,7 +1045,6 @@ def _get_config(self):
         else:
             config['latent_distribution'] = {'dist_type':'gaussian'}
         config['epochs'] = self.epochs
-        #config['embedding_source'] = self.embedding_source
         config['gamma'] = self.gamma
         config['warmup_ratio'] = self.warmup_ratio
         config['llm_model_name'] = self.llm_model_name
@@ -1091,9 +1078,6 @@ def log_train(self, batch_id, batch_num, step_loss, rec_loss, red_loss, class_lo
                   log_interval, epoch_id, learning_rate):
         """Generate and print out the log message for training. """
         if self.has_classifier:
-            #metric_nm, metric_val = self.metric.compute()
-            #if not isinstance(metric_nm, list):
-            #    metric_nm, metric_val = [metric_nm], [metric_val]
             metric_nm = "AUPRC"
             try:
                 metric_val = self.metric.compute()
@@ -1126,7 +1110,6 @@ def _get_bow_matrix(self, dataloader, cache=False):
         rows = 0
         for i, data in enumerate(dataloader):
             seqs, = data
-            #bow_batch = list(seqs[3].squeeze(axis=1))
             bow_batch = list(seqs[3])
             rows += len(bow_batch)
             if i >= max_rows:
@@ -1170,10 +1153,7 @@ def _get_losses(self, model, batch_data):
             label_ls = label_ls.mean()
             total_ls = (self.gamma * label_ls) + elbo_ls.mean()
             if not self.multilabel:
-                #label_ind = label.argmax(dim=0)
-                #self.metric.update([out], [label_ind])
                 self.metric.update(torch.tensor(out), torch.tensor(label))
-                #self.metric.update(torch.Tensor([out]), torch.Tensor([label_ind]))
             else:
                 self.metric.update([out], [label])
         else:
@@ -1214,7 +1194,6 @@ def fit_with_validation(self,
         joint_loader = PairedDataLoader(train_data, aux_data)
         num_train_steps = len(joint_loader) * self.epochs

-        ## The following from HuggingFace trainer.py lines 1047 to 1063
         decay_parameters = get_parameter_names(model.llm, ALL_LAYERNORM_LAYERS)
         decay_parameters = [name for name in decay_parameters if "bias" not in name]
         non_llm_parameters = [name for name,_ in model.named_parameters() if not name.startswith("llm")]
@@ -1288,10 +1267,8 @@ def update_loss_details(total_ls, elbo_ls, red_ls, class_ls):
                 if aux_batch is not None:
                     update_loss_details(total_ls_2, elbo_ls_2, red_ls_2, None)

-                #debug
-
                 if not accumulate or (batch_id + 1) % accumulate == 0:
-                    #torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
                     torch.nn.utils.clip_grad.clip_grad_value_(model.llm.parameters(), 1.0)
                     optimizer.step()
                     dec_optimizer.step()
                     lr_scheduler.step()
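
The optimizer setup shown in fit_with_validation uses get_parameter_names with ALL_LAYERNORM_LAYERS, the standard Hugging Face pattern of excluding LayerNorm parameters and biases from weight decay. An illustrative sketch of that pattern in isolation (not TMNT's exact optimizer code; the model choice and hyperparameters here are placeholders):

    # Sketch of HF-style weight-decay grouping; values are illustrative.
    import torch
    from transformers import AutoModel
    from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
    from transformers.trainer_pt_utils import get_parameter_names

    llm = AutoModel.from_pretrained('distilbert-base-uncased')

    # Parameters that should receive weight decay: everything except
    # parameters inside LayerNorm modules and any bias terms.
    decay_parameters = get_parameter_names(llm, ALL_LAYERNORM_LAYERS)
    decay_parameters = [name for name in decay_parameters if "bias" not in name]

    grouped = [
        {"params": [p for n, p in llm.named_parameters() if n in decay_parameters],
         "weight_decay": 0.01},
        {"params": [p for n, p in llm.named_parameters() if n not in decay_parameters],
         "weight_decay": 0.0},
    ]
    optimizer = torch.optim.AdamW(grouped, lr=2e-5)
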
19 changes: 19 additions & 0 deletions tmnt/eval_npmi.py
@@ -115,6 +115,25 @@ def get_full_vocab_npmi_matrix(self, mat):
                 npmi = (log10(n_docs) + log10(bigram_cnt) - log10(unigram_1) - log10(unigram_2)) / (log10(n_docs) - log10(bigram_cnt) + 1e-4)
                 npmi_matrix[w1, w2] = npmi
         return npmi_matrix
+
+class EvaluateNPMIUmass(object):
+
+    def __init__(self, npmi_matrix: np.array, vectorizer: TMNTVectorizer):
+        self.vectorizer = vectorizer
+        self.npmi_matrix = npmi_matrix # by convention this will be lower-triangular
+        dim = npmi_matrix.shape[0]
+        for mc in range(self.npmi_matrix.shape[0]):
+            for i in range(mc+1,dim):
+                self.npmi_matrix[mc,i] = self.npmi_matrix[i,mc]
+
+    def evaluate_topics(self, topic_ids):
+        npmi_score = 0.0
+        total_size = len(topic_ids) * len(topic_ids[0])
+        for topic in topic_ids:
+            for (w1, w2) in combinations(topic, 2):
+                npmi_score += self.npmi_matrix[w1, w2]
+        return npmi_score / total_size
+


 class FullNPMI(object):
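
A minimal usage sketch for the new EvaluateNPMIUmass class, assuming numpy and itertools.combinations are imported in tmnt.eval_npmi; the NPMI values and topic index lists below are toy data for illustration only:

    # Sketch: score topic coherence with a toy lower-triangular NPMI matrix.
    import numpy as np
    from tmnt.eval_npmi import EvaluateNPMIUmass

    npmi = np.zeros((5, 5))                        # 5-word toy vocabulary
    npmi[1, 0], npmi[2, 0], npmi[2, 1] = 0.4, 0.1, 0.3
    npmi[3, 2], npmi[4, 3] = 0.2, -0.1

    # The constructor stores the vectorizer and symmetrizes the matrix; a fitted
    # TMNTVectorizer (or None for a quick check) can be passed as the second argument.
    evaluator = EvaluateNPMIUmass(npmi, None)

    topics = [[0, 1, 2], [2, 3, 4]]                # each topic: vocabulary indices of its top words
    print(evaluator.evaluate_topics(topics))       # normalized sum of pairwise NPMI within each topic
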
