Remove RoBERTa
Too heavy and too slow; see #17 for an example.
yusanshi committed Aug 26, 2021
1 parent 4644403 commit 7cf63d3
Showing 14 changed files with 29 additions and 724 deletions.
26 changes: 12 additions & 14 deletions README.md
@@ -15,12 +15,10 @@ The repository currently includes the following models.

**Experimental models**

| Model | Description |
| ----- | ------------------------------------------------------------ |
| Exp1 | NRMS + (Sub)category + Ensemble + Positional embedding |
| Exp2 | RoBERTa (fixed or fine-tuned, word level or sentence level) + (Sub)category + Positional embedding |


| Model | Description |
| ----- | -------------------------------------------------------------------------------------------------- |
| Exp1 | NRMS + (Sub)category + Ensemble + Positional embedding |
| ~~Exp2~~ | ~~RoBERTa (fixed or fine-tuned, word level or sentence level) + (Sub)category + Positional embedding~~ |

## Get started

@@ -91,14 +89,14 @@ tensorboard --logdir=runs/{model_name}
## Results

| Model | AUC | nMRR | nDCG@5 | nDCG@10 | Remark |
| --------- | ---- | ---- | ------ | ------- | ------ |
| NRMS | | | | | |
| NAML | | | | | |
| LSTUR | | | | | |
| DKN | | | | | |
| Hi-Fi Ark | | | | | |
| TANR | | | | | |
| Model | AUC | MRR | nDCG@5 | nDCG@10 | Remark |
| --------- | --- | --- | ------ | ------- | ------ |
| NRMS | | | | | |
| NAML | | | | | |
| LSTUR | | | | | |
| DKN | | | | | |
| Hi-Fi Ark | | | | | |
| TANR | | | | | |

Checkpoints: <https://drive.google.com/open?id=TODO>

1 change: 0 additions & 1 deletion requirements.txt
@@ -6,5 +6,4 @@ tqdm
nltk
scikit-learn
swifter
transformers
ray[tune]
15 changes: 0 additions & 15 deletions run.sh

This file was deleted.

36 changes: 4 additions & 32 deletions src/config.py
@@ -27,10 +27,10 @@ class BaseConfig():
negative_sampling_ratio = 2 # K
dropout_probability = 0.2
# Modify the following by the output of `src/dataprocess.py`
num_words = 1 + 101220
num_categories = 1 + 295
num_entities = 1 + 21842
num_users = 1 + 711222
num_words = 1 + 70975
num_categories = 1 + 274
num_entities = 1 + 12957
num_users = 1 + 50000
word_embedding_dim = 300
category_embedding_dim = 100
# Modify the following only if you use another dataset
@@ -104,31 +104,3 @@ class Exp1Config(BaseConfig):
# For multi-head self-attention
num_attention_heads = 15
ensemble_factor = 1 # Not use ensemble since it's too expensive


class Exp2Config(BaseConfig):
dataset_attributes = {
"news": ['category', 'subcategory', 'title'],
"record": []
}
roberta_level = os.environ[
'ROBERTA_LEVEL'] if 'ROBERTA_LEVEL' in os.environ else 'sentence'
assert roberta_level in ['word', 'sentence']
fine_tune = False
# For multi-head self-attention
num_attention_heads = 15
if fine_tune:
for x in ['title', 'abstract']:
if x in dataset_attributes['news']:
dataset_attributes['news'].remove(x)
dataset_attributes['news'].extend(
[f'{x}_roberta', f'{x}_mask_roberta'])


class Exp3Config(BaseConfig):
dataset_attributes = {
"news": ['category', 'subcategory', 'title'],
"record": []
}
# For multi-head self-attention
num_attention_heads = 15
74 changes: 2 additions & 72 deletions src/data_preprocess.py
@@ -11,8 +11,6 @@
import numpy as np
import csv
import importlib
from transformers import RobertaTokenizer, RobertaModel
import torch

try:
config = getattr(importlib.import_module('config'), f"{model_name}Config")
@@ -83,8 +81,8 @@ def parse_behaviors(source, target, user2int_path):
columns=['user', 'clicked_news', 'candidate_news', 'clicked'])


def parse_news(source, target, roberta_output_dir, category2int_path,
word2int_path, entity2int_path, mode):
def parse_news(source, target, category2int_path, word2int_path,
entity2int_path, mode):
"""
Parse news for training set and test set
Args:
@@ -108,69 +106,6 @@ def parse_news(source, target, roberta_output_dir, category2int_path,
news.abstract_entities.fillna('[]', inplace=True)
news.fillna(' ', inplace=True)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
title_roberta = tokenizer(news.title.tolist(),
padding='max_length',
truncation=True,
max_length=config.num_words_title)
abstract_roberta = tokenizer(news.abstract.tolist(),
padding='max_length',
truncation=True,
max_length=config.num_words_abstract)

roberta_df = pd.DataFrame(data=[
title_roberta['input_ids'], title_roberta['attention_mask'],
abstract_roberta['input_ids'], abstract_roberta['attention_mask']
]).T
roberta_df.columns = [
'title_roberta', 'title_mask_roberta', 'abstract_roberta',
'abstract_mask_roberta'
]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
for x in [title_roberta, abstract_roberta]:
for key in x.keys():
x[key] = torch.tensor(x[key]).to(device)
Path(roberta_output_dir).mkdir(parents=True, exist_ok=True)
roberta = RobertaModel.from_pretrained('roberta-base',
return_dict=True).to(device)
with torch.no_grad():
title_last_hidden_state = []
title_pooler_output = []
abstract_last_hidden_state = []
abstract_pooler_output = []
for count in tqdm(range(math.ceil(len(news) / config.batch_size)),
desc="Calculating news embeddings with RoBERTa"):
title_roberta_minibatch = {
k: v[count * config.batch_size:(1 + count) * config.batch_size]
for k, v in title_roberta.items()
}
title_outputs = roberta(**title_roberta_minibatch)
title_last_hidden_state.append(
title_outputs['last_hidden_state'].cpu().numpy())
title_pooler_output.append(
title_outputs['pooler_output'].cpu().numpy())

abstract_roberta_minibatch = {
k: v[count * config.batch_size:(1 + count) * config.batch_size]
for k, v in abstract_roberta.items()
}
abstract_outputs = roberta(**abstract_roberta_minibatch)
abstract_last_hidden_state.append(
abstract_outputs['last_hidden_state'].cpu().numpy())
abstract_pooler_output.append(
abstract_outputs['pooler_output'].cpu().numpy())

np.save(path.join(roberta_output_dir, 'title_last_hidden_state.npy'),
np.concatenate(title_last_hidden_state, axis=0))
np.save(path.join(roberta_output_dir, 'title_pooler_output.npy'),
np.concatenate(title_pooler_output, axis=0))
np.save(
path.join(roberta_output_dir, 'abstract_last_hidden_state.npy'),
np.concatenate(abstract_last_hidden_state, axis=0))
np.save(path.join(roberta_output_dir, 'abstract_pooler_output.npy'),
np.concatenate(abstract_pooler_output, axis=0))

def parse_row(row):
new_row = [
row.id,
@@ -267,7 +202,6 @@ def parse_row(row):
entity2int[k] = len(entity2int) + 1

parsed_news = news.swifter.apply(parse_row, axis=1)
parsed_news = pd.concat([parsed_news, roberta_df], axis=1)
parsed_news.to_csv(target, sep='\t', index=False)

pd.DataFrame(category2int.items(),
@@ -302,7 +236,6 @@ def parse_row(row):
entity2int = dict(pd.read_table(entity2int_path).values.tolist())

parsed_news = news.swifter.apply(parse_row, axis=1)
parsed_news = pd.concat([parsed_news, roberta_df], axis=1)
parsed_news.to_csv(target, sep='\t', index=False)

else:
@@ -389,7 +322,6 @@ def transform_entity_embedding(source, target, entity2int_path):
print('Parse news')
parse_news(path.join(train_dir, 'news.tsv'),
path.join(train_dir, 'news_parsed.tsv'),
path.join(train_dir, 'roberta'),
path.join(train_dir, 'category2int.tsv'),
path.join(train_dir, 'word2int.tsv'),
path.join(train_dir, 'entity2int.tsv'),
@@ -412,7 +344,6 @@ def transform_entity_embedding(source, target, entity2int_path):
print('Parse news')
parse_news(path.join(val_dir, 'news.tsv'),
path.join(val_dir, 'news_parsed.tsv'),
path.join(val_dir, 'roberta'),
path.join(train_dir, 'category2int.tsv'),
path.join(train_dir, 'word2int.tsv'),
path.join(train_dir, 'entity2int.tsv'),
@@ -423,7 +354,6 @@ def transform_entity_embedding(source, target, entity2int_path):
print('Parse news')
parse_news(path.join(test_dir, 'news.tsv'),
path.join(test_dir, 'news_parsed.tsv'),
path.join(test_dir, 'roberta'),
path.join(train_dir, 'category2int.tsv'),
path.join(train_dir, 'word2int.tsv'),
path.join(train_dir, 'entity2int.tsv'),
58 changes: 6 additions & 52 deletions src/dataset.py
@@ -15,12 +15,11 @@


class BaseDataset(Dataset):
def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
def __init__(self, behaviors_path, news_path):
super(BaseDataset, self).__init__()
assert all(attribute in [
'category', 'subcategory', 'title', 'abstract', 'title_entities',
'abstract_entities', 'title_roberta', 'title_mask_roberta',
'abstract_roberta', 'abstract_mask_roberta'
'abstract_entities'
] for attribute in config.dataset_attributes['news'])
assert all(attribute in ['user', 'clicked_news_length']
for attribute in config.dataset_attributes['record'])
@@ -33,9 +32,7 @@ def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
converters={
attribute: literal_eval
for attribute in set(config.dataset_attributes['news']) & set([
'title', 'abstract', 'title_entities', 'abstract_entities',
'title_roberta', 'title_mask_roberta', 'abstract_roberta',
'abstract_mask_roberta'
'title', 'abstract', 'title_entities', 'abstract_entities'
])
})
self.news_id2int = {x: i for i, x in enumerate(self.news_parsed.index)}
@@ -50,11 +47,7 @@ def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
'title': [0] * config.num_words_title,
'abstract': [0] * config.num_words_abstract,
'title_entities': [0] * config.num_words_title,
'abstract_entities': [0] * config.num_words_abstract,
'title_roberta': [0] * config.num_words_title,
'title_mask_roberta': [0] * config.num_words_title,
'abstract_roberta': [0] * config.num_words_abstract,
'abstract_mask_roberta': [0] * config.num_words_abstract
'abstract_entities': [0] * config.num_words_abstract
}
for key in padding_all.keys():
padding_all[key] = torch.tensor(padding_all[key])
Expand All @@ -65,45 +58,6 @@ def __init__(self, behaviors_path, news_path, roberta_embedding_dir):
if k in config.dataset_attributes['news']
}

if model_name == 'Exp2' and not config.fine_tune:
if config.roberta_level == 'word':
self.roberta_embedding = {
k: torch.from_numpy(
np.load(
path.join(roberta_embedding_dir,
f'{k}_last_hidden_state.npy'))).float()
for k in set(config.dataset_attributes['news'])
& set(['title', 'abstract'])
}
name2length = {
'title': config.num_words_title,
'abstract': config.num_words_abstract
}
for k in set(config.dataset_attributes['news']) & set(
['title', 'abstract']):
self.padding[k] = torch.zeros((name2length[k], 768))

elif config.roberta_level == 'sentence':
self.roberta_embedding = {
k: torch.from_numpy(
np.load(
path.join(roberta_embedding_dir,
f'{k}_pooler_output.npy'))).float()
for k in set(config.dataset_attributes['news'])
& set(['title', 'abstract'])
}
for k in set(config.dataset_attributes['news']) & set(
['title', 'abstract']):
self.padding[k] = torch.zeros(768)

def _news2dict(self, id):
ret = self.news2dict[id]
if model_name == 'Exp2' and not config.fine_tune:
for k in set(config.dataset_attributes['news']) & set(
['title', 'abstract']):
ret[k] = self.roberta_embedding[k][self.news_id2int[id]]
return ret

def __len__(self):
return len(self.behaviors_parsed)

@@ -114,10 +68,10 @@ def __getitem__(self, idx):
item['user'] = row.user
item["clicked"] = list(map(int, row.clicked.split()))
item["candidate_news"] = [
self._news2dict(x) for x in row.candidate_news.split()
self.news2dict[x] for x in row.candidate_news.split()
]
item["clicked_news"] = [
self._news2dict(x)
self.news2dict[x]
for x in row.clicked_news.split()[:config.num_clicked_news_a_user]
]
if 'clicked_news_length' in config.dataset_attributes['record']: