
Commit

environment variables
gaushh committed Dec 20, 2022
1 parent f6ac2e0 commit 5ca47c3
Showing 12 changed files with 202 additions and 11 deletions.
helper/write_config.py (5 changes: 2 additions & 3 deletions)

@@ -20,7 +20,7 @@
         'special_tokens': ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]
     },
     'bert': {
-        'hidden_size': 144,
+        'hidden_size': 24,
         'model_path': "../../models/exp02",
         'mlm_probability': 0.15,
         'evaluation_strategy': "epoch",
@@ -46,7 +46,6 @@
     }
 }
 
-
-with open("../src/config/exp02_config.yaml", 'w') as yamlfile:
+with open("../src/config/exp06.yaml", 'w') as yamlfile:
     data = yaml.dump(config_info, yamlfile)
     print("Write successful")
setup.sh (1 change: 1 addition & 0 deletions)

@@ -1,3 +1,4 @@
+export CONFIG_FILE="exp02"
 pip install -r requirements.txt
 pip install -r requirements.txt
 wandb login --relogin 8c46e02a8d52f960fb349e009c5b6773c25b6957
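Note: the value exported here ("exp02") is what the Python entry points below read through os.environ.get('CONFIG_FILE') in order to locate src/config/<name>.yaml. Switching to another experiment (exp03 through exp06) therefore only requires exporting a different name, for example export CONFIG_FILE=exp05, before running the scripts; a slightly more defensive variant of that lookup is sketched after the src/data/dataset.py hunk below.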
File renamed without changes.
src/config/exp02_config.yaml → src/config/exp02.yaml (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@ bert:
   auto_find_batch_size: true
   evaluation_strategy: epoch
   gradient_accumulation_steps: 8
-  hidden_size: 128
+  hidden_size: 384
   hub_model_id: gaushh/optimized-bert
   hub_private_repo: true
   hub_strategy: checkpoint
@@ -13,7 +13,7 @@ bert:
   logging_steps: 100
   lr_scheduler_type: linear
   mlm_probability: 0.15
-  model_path: ../../models/exp01
+  model_path: ../../models/exp02
   num_train_epochs: 15
   push_to_hub: false
   save_steps: 5000
src/config/exp03.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 192
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
src/config/exp04.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 96
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
src/config/exp05.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 48
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
src/config/exp06.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 24
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
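The four new configs (exp03.yaml through exp06.yaml) are copies of exp02.yaml that differ only in hidden_size, which halves at each step (384, 192, 96, 48, 24), with model_path still pointing at ../../models/exp02. A minimal sketch of generating such a family from one base dict, in the spirit of helper/write_config.py; the abbreviated base_config below is illustrative and not part of the commit:

import copy
import yaml

# Illustrative sketch (not part of this commit): generate exp03-exp06 from a
# shared base, varying only bert.hidden_size. Only a handful of keys are shown;
# the real base would carry the full exp02 settings listed above.
base_config = {
    "bert": {"hidden_size": 384, "model_path": "../../models/exp02"},
    "dataset": {"data_proportion": 0.05, "test_size": 0.05},
    "seed": 43,
}

hidden_sizes = {"exp03": 192, "exp04": 96, "exp05": 48, "exp06": 24}

for exp_name, hidden_size in hidden_sizes.items():
    cfg = copy.deepcopy(base_config)
    cfg["bert"]["hidden_size"] = hidden_size
    with open("../src/config/{}.yaml".format(exp_name), "w") as yamlfile:
        yaml.dump(cfg, yamlfile)
    print("Wrote", exp_name)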
src/data/dataset.py (4 changes: 3 additions & 1 deletion)

@@ -2,7 +2,9 @@
 import yaml
 import os
 
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
 
 if not os.path.isdir('../../data'):
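One caveat with this lookup, offered as a suggestion rather than something the commit changes: os.environ.get('CONFIG_FILE') returns None when the variable is unset, so the formatted path silently becomes ../config/None.yaml and open() fails with a generic FileNotFoundError. A more defensive variant could look like the sketch below, where the 'exp02' default is an assumption:

import os
import yaml

# Sketch only: fall back to exp02 and fail with a clearer message if the
# resolved config file does not exist.
config_file = os.environ.get("CONFIG_FILE", "exp02")
config_path = "../config/{}.yaml".format(config_file)

if not os.path.isfile(config_path):
    raise FileNotFoundError(
        "CONFIG_FILE={!r} does not resolve to a config: {}".format(config_file, config_path)
    )

with open(config_path, "r") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)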
src/modelling/preparation.py (5 changes: 4 additions & 1 deletion)

@@ -2,8 +2,11 @@
 from datasets import load_from_disk, concatenate_datasets
 from itertools import chain
 import yaml
+import os
 
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
 
 max_length = config["tokenizer"]["max_length"]
src/modelling/train_bert.py (4 changes: 3 additions & 1 deletion)

@@ -9,8 +9,10 @@
 from architecture import BertSelfAttention
 import wandb
 
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
 
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
 
 wandb.init(project="optimized-bert", entity="madridistas")
src/modelling/train_tokenizer.py (6 changes: 3 additions & 3 deletions)

@@ -3,9 +3,9 @@
 import yaml
 from tokenizers import BertWordPieceTokenizer
 
-
-
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
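Because these configs contain only plain scalars, lists, and mappings, yaml.safe_load would also work here and avoids constructing arbitrary Python objects; this is an optional alternative, not what the commit uses:

import os
import yaml

config_path = "../config/{}.yaml".format(os.environ.get("CONFIG_FILE"))
with open(config_path, "r") as yamlfile:
    config = yaml.safe_load(yamlfile)  # equivalent to FullLoader for these plain-YAML files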
