Remove RoBERTa
Too heavy and too slow; see #17 for an example.
yusanshi committed Aug 26, 2021
1 parent 4644403 commit 7cf63d3
Showing 14 changed files with 29 additions and 724 deletions.
26 changes: 12 additions & 14 deletions README.md
@@ -15,12 +15,10 @@ The repository currently includes the following models.

**Experimental models**

| Model | Description |
| ----- | ------------------------------------------------------------ |
| Exp1 | NRMS + (Sub)category + Ensemble + Positional embedding |
| Exp2 | RoBERTa (fixed or fine-tuned, word level or sentence level) + (Sub)category + Positional embedding |


| Model | Description |
| ----- | -------------------------------------------------------------------------------------------------- |
| Exp1 | NRMS + (Sub)category + Ensemble + Positional embedding |
| ~~Exp2~~ | ~~RoBERTa (fixed or fine-tuned, word level or sentence level) + (Sub)category + Positional embedding~~ |

## Get started

@@ -91,14 +89,14 @@ tensorboard --logdir=runs/{model_name}
## Results

| Model | AUC | nMRR | nDCG@5 | nDCG@10 | Remark |
| --------- | ---- | ---- | ------ | ------- | ------ |
| NRMS | | | | | |
| NAML | | | | | |
| LSTUR | | | | | |
| DKN | | | | | |
| Hi-Fi Ark | | | | | |
| TANR | | | | | |
| Model | AUC | MRR | nDCG@5 | nDCG@10 | Remark |
| --------- | --- | --- | ------ | ------- | ------ |
| NRMS | | | | | |
| NAML | | | | | |
| LSTUR | | | | | |
| DKN | | | | | |
| Hi-Fi Ark | | | | | |
| TANR | | | | | |

Checkpoints: <https://drive.google.com/open?id=TODO>

1 change: 0 additions & 1 deletion requirements.txt
@@ -6,5 +6,4 @@ tqdm
nltk
scikit-learn
swifter
transformers
ray[tune]
15 changes: 0 additions & 15 deletions run.sh

This file was deleted.

36 changes: 4 additions & 32 deletions src/config.py
@@ -27,10 +27,10 @@ class BaseConfig():
negative_sampling_ratio = 2 # K
dropout_probability = 0.2
# Modify the following by the output of `src/dataprocess.py`
num_words = 1 + 101220
num_categories = 1 + 295
num_entities = 1 + 21842
num_users = 1 + 711222
num_words = 1 + 70975
num_categories = 1 + 274
num_entities = 1 + 12957
num_users = 1 + 50000
word_embedding_dim = 300
category_embedding_dim = 100
# Modify the following only if you use another dataset
@@ -104,31 +104,3 @@ class Exp1Config(BaseConfig):
# For multi-head self-attention
num_attention_heads = 15
ensemble_factor = 1 # Not use ensemble since it's too expensive


class Exp2Config(BaseConfig):
dataset_attributes = {
"news": ['category', 'subcategory', 'title'],
"record": []
}
roberta_level = os.environ[
'ROBERTA_LEVEL'] if 'ROBERTA_LEVEL' in os.environ else 'sentence'
assert roberta_level in ['word', 'sentence']
fine_tune = False
# For multi-head self-attention
num_attention_heads = 15
if fine_tune:
for x in ['title', 'abstract']:
if x in dataset_attributes['news']:
dataset_attributes['news'].remove(x)
dataset_attributes['news'].extend(
[f'{x}_roberta', f'{x}_mask_roberta'])


class Exp3Config(BaseConfig):
dataset_attributes = {
"news": ['category', 'subcategory', 'title'],
"record": []
}
# For multi-head self-attention
num_attention_heads = 15
74 changes: 2 additions & 72 deletions src/data_preprocess.py
@@ -11,8 +11,6 @@
import numpy as np
import csv
import importlib
from transformers import RobertaTokenizer, RobertaModel
import torch

try:
config = getattr(importlib.import_module('config'), f"{model_name}Config")
@@ -83,8 +81,8 @@ def parse_behaviors(source, target, user2int_path):
columns=['user', 'clicked_news', 'candidate_news', 'clicked'])


def parse_news(source, target, roberta_output_dir, category2int_path,
word2int_path, entity2int_path, mode):
def parse_news(source, target, category2int_path, word2int_path,
entity2int_path, mode):
"""
Parse news for training set and test set
Args:
@@ -108,69 +106,6 @@ def parse_news(source, target, roberta_output_dir, category2int_path,
news.abstract_entities.fillna('[]', inplace=True)
news.fillna(' ', inplace=True)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
title_roberta = tokenizer(news.title.tolist(),
padding='max_length',
truncation=True,
max_length=config.num_words_title)
abstract_roberta = tokenizer(news.abstract.tolist(),
padding='max_length',
truncation=True,
max_length=config.num_words_abstract)

roberta_df = pd.DataFrame(data=[
title_roberta['input_ids'], title_roberta['attention_mask'],
abstract_roberta['input_ids'], abstract_roberta['attention_mask']
]).T
roberta_df.columns = [
'title_roberta', 'title_mask_roberta', 'abstract_roberta',
'abstract_mask_roberta'
]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
for x in [title_roberta, abstract_roberta]:
for key in x.keys():
x[key] = torch.tensor(x[key]).to(device)
Path(roberta_output_dir).mkdir(parents=True, exist_ok=True)
roberta = RobertaModel.from_pretrained('roberta-base',
return_dict=True).to(device)
with torch.no_grad():
title_last_hidden_state = []
title_pooler_output = []
abstract_last_hidden_state = []
abstract_pooler_output = []
for count in tqdm(range(math.ceil(len(news) / config.batch_size)),
desc="Calculating news embeddings with RoBERTa"):
title_roberta_minibatch = {
k: v[count * config.batch_size:(1 + count) * config.batch_size]
for k, v in title_roberta.items()
}
title_outputs = roberta(**title_roberta_minibatch)
title_last_hidden_state.append(
title_outputs['last_hidden_state'].cpu().numpy())
title_pooler_output.append(
title_outputs['pooler_output'].cpu().numpy())

abstract_roberta_minibatch = {
k: v[count * config.batch_size:(1 + count) * config.batch_size]
for k, v in abstract_roberta.items()
}
abstract_outputs = roberta(**abstract_roberta_minibatch)
abstract_last_hidden_state.append(
abstract_outputs['last_hidden_state'].cpu().numpy())
abstract_pooler_output.append(
abstract_outputs['pooler_output'].cpu().numpy())

np.save(path.join(roberta_output_dir, 'title_last_hidden_state.npy'),
np.concatenate(title_last_hidden_state, axis=0))
np.save(path.join(roberta_output_dir, 'title_pooler_output.npy'),
np.concatenate(title_pooler_output, axis=0))
np.save(
path.join(roberta_output_dir, 'abstract_last_hidden_state.npy'),
np.concatenate(abstract_last_hidden_state, axis=0))
np.save(path.join(roberta_output_dir, 'abstract_pooler_output.npy'),
np.concatenate(abstract_pooler_output, axis=0))

def parse_row(row):
new_row = [
row.id,
@@ -267,7 +202,6 @@ def parse_row(row):
entity2int[k] = len(entity2int) + 1

parsed_news = news.swifter.apply(parse_row, axis=1)
parsed_news = pd.concat([parsed_news, roberta_df], axis=1)
parsed_news.to_csv(target, sep='\t', index=False)

pd.DataFrame(category2int.items(),
@@ -302,7 +236,6 @@ def parse_row(row):
entity2int = dict(pd.read_table(entity2int_path).values.tolist())

parsed_news = news.swifter.apply(parse_row, axis=1)
parsed_news = pd.concat([parsed_news, roberta_df], axis=1)
parsed_news.to_csv(target, sep='\t', index=False)

else:
@@ -389,7 +322,6 @@ def transform_entity_embedding(source, target, entity2int_path):
print('Parse news')
parse_news(path.join(train_dir, 'news.tsv'),
path.join(train_dir, 'news_parsed.tsv'),
path.join(train_dir, 'roberta'),
path.join(train_dir, 'category2int.tsv'),
path.join(train_dir, 'word2int.tsv'),
path.join(train_dir, 'entity2int.tsv'),
@@ -412,7 +344,6 @@ def transform_entity_embedding(source, target, entity2int_path):
print('Parse news')
parse_news(path.join(val_dir, 'news.tsv'),
path.join(val_dir, 'news_parsed.tsv'),
path.join(val_dir, 'roberta'),
path.join(train_dir, 'category2int.tsv'),
path.join(train_dir, 'word2int.tsv'),
path.join(train_dir, 'entity2int.tsv'),
@@ -423,7 +354,6 @@ def transform_entity_embedding(source, target, entity2int_path):
print('Parse news')
parse_news(path.join(test_dir, 'news.tsv'),
path.join(test_dir, 'news_parsed.tsv'),
path.join(test_dir, 'roberta'),
path.join(train_dir, 'category2int.tsv'),
path.join(train_dir, 'word2int.tsv'),
path.join(train_dir, 'entity2int.tsv'),
58 changes: 6 additions & 52 deletions src/dataset.py
@@ -15,12 +15,11 @@


class BaseDataset(Dataset):
def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
def __init__(self, behaviors_path, news_path):
super(BaseDataset, self).__init__()
assert all(attribute in [
'category', 'subcategory', 'title', 'abstract', 'title_entities',
'abstract_entities', 'title_roberta', 'title_mask_roberta',
'abstract_roberta', 'abstract_mask_roberta'
'abstract_entities'
] for attribute in config.dataset_attributes['news'])
assert all(attribute in ['user', 'clicked_news_length']
for attribute in config.dataset_attributes['record'])
@@ -33,9 +32,7 @@ def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
converters={
attribute: literal_eval
for attribute in set(config.dataset_attributes['news']) & set([
'title', 'abstract', 'title_entities', 'abstract_entities',
'title_roberta', 'title_mask_roberta', 'abstract_roberta',
'abstract_mask_roberta'
'title', 'abstract', 'title_entities', 'abstract_entities'
])
})
self.news_id2int = {x: i for i, x in enumerate(self.news_parsed.index)}
@@ -50,11 +47,7 @@ def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
'title': [0] * config.num_words_title,
'abstract': [0] * config.num_words_abstract,
'title_entities': [0] * config.num_words_title,
'abstract_entities': [0] * config.num_words_abstract,
'title_roberta': [0] * config.num_words_title,
'title_mask_roberta': [0] * config.num_words_title,
'abstract_roberta': [0] * config.num_words_abstract,
'abstract_mask_roberta': [0] * config.num_words_abstract
'abstract_entities': [0] * config.num_words_abstract
}
for key in padding_all.keys():
padding_all[key] = torch.tensor(padding_all[key])
Expand All @@ -65,45 +58,6 @@ def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
if k in config.dataset_attributes['news']
}

if model_name == 'Exp2' and not config.fine_tune:
if config.roberta_level == 'word':
self.roberta_embedding = {
k: torch.from_numpy(
np.load(
path.join(roberta_embedding_dir,
f'{k}_last_hidden_state.npy'))).float()
for k in set(config.dataset_attributes['news'])
& set(['title', 'abstract'])
}
name2length = {
'title': config.num_words_title,
'abstract': config.num_words_abstract
}
for k in set(config.dataset_attributes['news']) & set(
['title', 'abstract']):
self.padding[k] = torch.zeros((name2length[k], 768))

elif config.roberta_level == 'sentence':
self.roberta_embedding = {
k: torch.from_numpy(
np.load(
path.join(roberta_embedding_dir,
f'{k}_pooler_output.npy'))).float()
for k in set(config.dataset_attributes['news'])
& set(['title', 'abstract'])
}
for k in set(config.dataset_attributes['news']) & set(
['title', 'abstract']):
self.padding[k] = torch.zeros(768)

def _news2dict(self, id):
ret = self.news2dict[id]
if model_name == 'Exp2' and not config.fine_tune:
for k in set(config.dataset_attributes['news']) & set(
['title', 'abstract']):
ret[k] = self.roberta_embedding[k][self.news_id2int[id]]
return ret

def __len__(self):
return len(self.behaviors_parsed)

@@ -114,10 +68,10 @@ def __getitem__(self, idx):
item['user'] = row.user
item["clicked"] = list(map(int, row.clicked.split()))
item["candidate_news"] = [
self._news2dict(x) for x in row.candidate_news.split()
self.news2dict[x] for x in row.candidate_news.split()
]
item["clicked_news"] = [
self._news2dict(x)
self.news2dict[x]
for x in row.clicked_news.split()[:config.num_clicked_news_a_user]
]
if 'clicked_news_length' in config.dataset_attributes['record']: