
Commit

environment variables
gaushh committed Dec 20, 2022
1 parent f6ac2e0 commit 5ca47c3
Showing 12 changed files with 202 additions and 11 deletions.
helper/write_config.py (5 changes: 2 additions & 3 deletions)

@@ -20,7 +20,7 @@
         'special_tokens': ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]
     },
     'bert': {
-        'hidden_size': 144,
+        'hidden_size': 24,
         'model_path': "../../models/exp02",
         'mlm_probability': 0.15,
         'evaluation_strategy': "epoch",
@@ -46,7 +46,6 @@
     }
 }
 
-
-with open("../src/config/exp02_config.yaml", 'w') as yamlfile:
+with open("../src/config/exp06.yaml", 'w') as yamlfile:
     data = yaml.dump(config_info, yamlfile)
     print("Write successful")
setup.sh (1 change: 1 addition & 0 deletions)

@@ -1,3 +1,4 @@
+export CONFIG_FILE="exp02"
 pip install -r requirements.txt
 pip install -r requirements.txt
 wandb login --relogin 8c46e02a8d52f960fb349e009c5b6773c25b6957
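Note: the value exported here ("exp02") is what the Python entry points below read through os.environ.get('CONFIG_FILE') in order to locate src/config/<name>.yaml. Switching to another experiment (exp03 through exp06) therefore only requires exporting a different name, for example export CONFIG_FILE=exp05, before running the scripts; a slightly more defensive variant of that lookup is sketched after the src/data/dataset.py hunk below.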
File renamed without changes.
src/config/exp02_config.yaml → src/config/exp02.yaml (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@ bert:
   auto_find_batch_size: true
   evaluation_strategy: epoch
   gradient_accumulation_steps: 8
-  hidden_size: 128
+  hidden_size: 384
   hub_model_id: gaushh/optimized-bert
   hub_private_repo: true
   hub_strategy: checkpoint
@@ -13,7 +13,7 @@ bert:
   logging_steps: 100
   lr_scheduler_type: linear
   mlm_probability: 0.15
-  model_path: ../../models/exp01
+  model_path: ../../models/exp02
   num_train_epochs: 15
   push_to_hub: false
   save_steps: 5000
src/config/exp03.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 192
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
src/config/exp04.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 96
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
src/config/exp05.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 48
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
src/config/exp06.yaml (new file: 46 additions & 0 deletions)

@@ -0,0 +1,46 @@
bert:
  adam_beta1: 0.9
  adam_beta2: 0.999
  auto_find_batch_size: true
  evaluation_strategy: epoch
  gradient_accumulation_steps: 8
  hidden_size: 24
  hub_model_id: gaushh/optimized-bert
  hub_private_repo: true
  hub_strategy: checkpoint
  hub_token: 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
  learning_rate: 0.0001
  logging_steps: 100
  lr_scheduler_type: linear
  mlm_probability: 0.15
  model_path: ../../models/exp02
  num_train_epochs: 15
  push_to_hub: false
  save_steps: 5000
  save_strategy: epoch
  warmup_ratio: 0.01
  weight_decay: 0.01
dataset:
  data_proportion: 0.05
  processed_test_dir: ../../data/processed/processed_test
  processed_train_dir: ../../data/processed/processed_train
  raw_dataset_dir: ../../data/raw/raw_dataset
  raw_test_path: ../../data/raw/test.txt
  raw_train_path: ../../data/raw/train.txt
  test_size: 0.05
seed: 43
tokenizer:
  files:
  - ../../data/raw/test.txt
  max_length: 512
  special_tokens:
  - '[PAD]'
  - '[UNK]'
  - '[CLS]'
  - '[SEP]'
  - '[MASK]'
  - <S>
  - <T>
  tokenizer_path: ../../models/tokenizer
  truncate_longer_samples: false
  vocab_size: 30522
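The four new configs (exp03.yaml through exp06.yaml) are copies of exp02.yaml that differ only in hidden_size, which halves at each step (384, 192, 96, 48, 24), with model_path still pointing at ../../models/exp02. A minimal sketch of generating such a family from one base dict, in the spirit of helper/write_config.py; the abbreviated base_config below is illustrative and not part of the commit:

import copy
import yaml

# Illustrative sketch (not part of this commit): generate exp03-exp06 from a
# shared base, varying only bert.hidden_size. Only a handful of keys are shown;
# the real base would carry the full exp02 settings listed above.
base_config = {
    "bert": {"hidden_size": 384, "model_path": "../../models/exp02"},
    "dataset": {"data_proportion": 0.05, "test_size": 0.05},
    "seed": 43,
}

hidden_sizes = {"exp03": 192, "exp04": 96, "exp05": 48, "exp06": 24}

for exp_name, hidden_size in hidden_sizes.items():
    cfg = copy.deepcopy(base_config)
    cfg["bert"]["hidden_size"] = hidden_size
    with open("../src/config/{}.yaml".format(exp_name), "w") as yamlfile:
        yaml.dump(cfg, yamlfile)
    print("Wrote", exp_name)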
src/data/dataset.py (4 changes: 3 additions & 1 deletion)

@@ -2,7 +2,9 @@
 import yaml
 import os
 
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
 
 if not os.path.isdir('../../data'):
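One caveat with this lookup, offered as a suggestion rather than something the commit changes: os.environ.get('CONFIG_FILE') returns None when the variable is unset, so the formatted path silently becomes ../config/None.yaml and open() fails with a generic FileNotFoundError. A more defensive variant could look like the sketch below, where the 'exp02' default is an assumption:

import os
import yaml

# Sketch only: fall back to exp02 and fail with a clearer message if the
# resolved config file does not exist.
config_file = os.environ.get("CONFIG_FILE", "exp02")
config_path = "../config/{}.yaml".format(config_file)

if not os.path.isfile(config_path):
    raise FileNotFoundError(
        "CONFIG_FILE={!r} does not resolve to a config: {}".format(config_file, config_path)
    )

with open(config_path, "r") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)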
src/modelling/preparation.py (5 changes: 4 additions & 1 deletion)

@@ -2,8 +2,11 @@
 from datasets import load_from_disk, concatenate_datasets
 from itertools import chain
 import yaml
+import os
 
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
 
 max_length = config["tokenizer"]["max_length"]
src/modelling/train_bert.py (4 changes: 3 additions & 1 deletion)

@@ -9,8 +9,10 @@
 from architecture import BertSelfAttention
 import wandb
 
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
 
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
 
 wandb.init(project="optimized-bert", entity="madridistas")
src/modelling/train_tokenizer.py (6 changes: 3 additions & 3 deletions)

@@ -3,9 +3,9 @@
 import yaml
 from tokenizers import BertWordPieceTokenizer
 
-
-
-with open("../config/exp02_config.yaml", "r") as yamlfile:
+config_file = os.environ.get('CONFIG_FILE')
+config_path = "../config/{}.yaml".format(config_file)
+with open(config_path, "r") as yamlfile:
     config = yaml.load(yamlfile, Loader=yaml.FullLoader)
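Because these configs contain only plain scalars, lists, and mappings, yaml.safe_load would also work here and avoids constructing arbitrary Python objects; this is an optional alternative, not what the commit uses:

import os
import yaml

config_path = "../config/{}.yaml".format(os.environ.get("CONFIG_FILE"))
with open(config_path, "r") as yamlfile:
    config = yaml.safe_load(yamlfile)  # equivalent to FullLoader for these plain-YAML files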
