Merge branch 'main' of https://github.com/gaushh/optimized-bert

gaushh · Dec 25, 2022 · f5a3f7e · f5a3f7e
2 parents 59e419b + a87d316
commit f5a3f7e
Show file tree

Hide file tree

Showing 7 changed files with 1,368 additions and 1 deletion.
diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,7 @@ datasets==2.7.1
 transformers==4.24.
 wandb==0.13.5
 accelerate
-tensorboard
+tensorboard
+pytorch_transformers
+tensorboardX
+scikit-learn
diff --git a/run_glue.sh b/run_glue.sh
@@ -0,0 +1,60 @@
+
+# Install dependencies, then load the parse_yaml helper used below.
+pip install -r requirements.txt
+ls
+source ./yaml_parser.sh
+
+# model path should be same location as this shell file
+#python src/glue/download_glue_data.py
+
+# Parse the config BEFORE creating directories: the mkdir calls below rely on
+# the config_taskN_task_name variables that this eval defines.
+eval $(parse_yaml src/config/glue_config.yaml "config_")
+
+export output_dir="output_glue"
+# -p creates missing parents and does not fail if the directory already exists,
+# so the script can be re-run.
+mkdir -p "$output_dir/$config_task1_task_name"
+mkdir -p "$output_dir/$config_task2_task_name"
+mkdir -p "$output_dir/$config_task3_task_name"
+
+echo $config_task1_model_type
+
+# Fine-tune and evaluate on task 1.
+python src/glue/run_glue.py \
+  --model_type $config_task1_model_type \
+  --model_name_or_path $config_task1_model_path \
+  --task_name $config_task1_task_name \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $config_task1_glue_dir/$config_task1_task_name/ \
+  --max_seq_length $config_task1_max_seq_length \
+  --per_gpu_train_batch_size $config_task1_per_gpu_train_batch_size \
+  --learning_rate $config_task1_learning_rate \
+  --num_train_epochs $config_task1_num_train_epochs \
+  --output_dir $output_dir/$config_task1_task_name
+
+# Fine-tune and evaluate on task 2.
+python src/glue/run_glue.py \
+  --model_type $config_task2_model_type \
+  --model_name_or_path $config_task2_model_path \
+  --task_name $config_task2_task_name \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $config_task2_glue_dir/$config_task2_task_name/ \
+  --max_seq_length $config_task2_max_seq_length \
+  --per_gpu_train_batch_size $config_task2_per_gpu_train_batch_size \
+  --learning_rate $config_task2_learning_rate \
+  --num_train_epochs $config_task2_num_train_epochs \
+  --output_dir $output_dir/$config_task2_task_name
+
+# Fine-tune and evaluate on task 3.
+python src/glue/run_glue.py \
+  --model_type $config_task3_model_type \
+  --model_name_or_path $config_task3_model_path \
+  --task_name $config_task3_task_name \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $config_task3_glue_dir/$config_task3_task_name/ \
+  --max_seq_length $config_task3_max_seq_length \
+  --per_gpu_train_batch_size $config_task3_per_gpu_train_batch_size \
+  --learning_rate $config_task3_learning_rate \
+  --num_train_epochs $config_task3_num_train_epochs \
+  --output_dir $output_dir/$config_task3_task_name
+
diff --git a/src/config/glue_config.yaml b/src/config/glue_config.yaml
@@ -0,0 +1,27 @@
+# GLUE fine-tuning settings, one section per task.
+# run_glue.sh reads this file through parse_yaml with the "config_" prefix and
+# refers to the values as config_<section>_<key> (e.g. config_task1_task_name).
+task1:
+  model_type: "bert"
+  model_path: "bert-base-uncased"
+  task_name: "RTE"
+  glue_dir: "data/glue_data"
+  max_seq_length: 128
+  per_gpu_train_batch_size: 32
+  learning_rate: 2e-5
+  num_train_epochs: 3.0
+task2:
+  model_type: "bert"
+  model_path: "bert-base-uncased"
+  task_name: "QQP"
+  glue_dir: "data/glue_data"
+  max_seq_length: 128
+  per_gpu_train_batch_size: 32
+  learning_rate: 2e-5
+  num_train_epochs: 3.0
+task3:
+  model_type: "bert"
+  model_path: "bert-base-uncased"
+  task_name: "CoLA"
+  glue_dir: "data/glue_data"
+  max_seq_length: 128
+  per_gpu_train_batch_size: 32
+  learning_rate: 2e-5
+  num_train_epochs: 3.0
diff --git a/src/glue/download_glue_data.py b/src/glue/download_glue_data.py
@@ -0,0 +1,150 @@
+''' Script for downloading all GLUE data.
+
+Note: for legal reasons, we are unable to host MRPC.
+You can either use the version hosted by the SentEval team, which is already tokenized, 
+or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
+For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
+You should then rename and place specific files in a folder (see below for an example).
+
+mkdir MRPC
+cabextract MSRParaphraseCorpus.msi -d MRPC
+cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
+cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
+rm MRPC/_*
+rm MSRParaphraseCorpus.msi
+
+1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
+2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
+'''
+import io
+import os
+import sys
+import shutil
+import argparse
+import tempfile
+import urllib.request
+import zipfile
+
+# Task names accepted by --tasks, in the order they are processed for "all".
+TASKS = [
+    "CoLA",
+    "SST",
+    "MRPC",
+    "QQP",
+    "STS",
+    "MNLI",
+    "QNLI",
+    "RTE",
+    "WNLI",
+    "diagnostic",
+]
+
+# Download URL per task. Every entry is a zip archive except "diagnostic",
+# which is a single TSV file. "MRPC" has no entry here; its raw files come from
+# the MRPC_TRAIN / MRPC_TEST URLs below.
+TASK2PATH = {
+    "CoLA": "https://dl.fbaipublicfiles.com/glue/data/CoLA.zip",
+    "SST": "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip",
+    "QQP": "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip",
+    "STS": "https://dl.fbaipublicfiles.com/glue/data/STS-B.zip",
+    "MNLI": "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip",
+    "QNLI": "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip",
+    "RTE": "https://dl.fbaipublicfiles.com/glue/data/RTE.zip",
+    "WNLI": "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip",
+    "diagnostic": "https://dl.fbaipublicfiles.com/glue/data/AX.tsv",
+}
+
+# Raw MRPC train/test files hosted alongside the SentEval data.
+MRPC_TRAIN = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt"
+MRPC_TEST = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt"
+
+def download_and_extract(task, data_dir):
+    """Download the zip archive for ``task`` and unpack it into ``data_dir``.
+
+    The archive is saved as ``<task>.zip`` in the current working directory
+    and removed afterwards, including when the download or the extraction
+    fails.
+
+    Args:
+        task: Key of ``TASK2PATH`` naming the archive to fetch.
+        data_dir: Directory the archive contents are extracted into.
+
+    Raises:
+        KeyError: If ``task`` has no entry in ``TASK2PATH``.
+    """
+    print("Downloading and extracting %s..." % task)
+    if task == "MNLI":
+        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
+    data_file = "%s.zip" % task
+    try:
+        urllib.request.urlretrieve(TASK2PATH[task], data_file)
+        with zipfile.ZipFile(data_file) as zip_ref:
+            zip_ref.extractall(data_dir)
+    finally:
+        # Do not leave a partial or corrupt archive behind on failure. The
+        # existence check avoids masking the original error when the download
+        # never created the file.
+        if os.path.exists(data_file):
+            os.remove(data_file)
+    print("\tCompleted!")
+
+def format_mrpc(data_dir, path_to_data):
+    """Build the MRPC ``test.tsv``, ``train.tsv`` and ``dev.tsv`` files.
+
+    The raw ``msr_paraphrase_train.txt`` / ``msr_paraphrase_test.txt`` files
+    are read from ``path_to_data`` when it is non-empty; otherwise they are
+    downloaded into ``<data_dir>/MRPC``. ``test.tsv`` is written first. The
+    train/dev split is written only if the list of development IDs can be
+    downloaded.
+
+    Args:
+        data_dir: Root data directory; output goes to ``<data_dir>/MRPC``.
+        path_to_data: Directory holding the already-extracted raw MRPC files,
+            or an empty string to download them.
+
+    Raises:
+        AssertionError: If either raw MRPC file is missing.
+    """
+    print("Processing MRPC...")
+    mrpc_dir = os.path.join(data_dir, "MRPC")
+    os.makedirs(mrpc_dir, exist_ok=True)
+    if path_to_data:
+        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
+        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
+    else:
+        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
+        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
+        try:
+            urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
+            urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
+        except urllib.error.URLError:
+            # URLError is the base class of HTTPError, so connection failures
+            # are reported the same way as HTTP error statuses.
+            print("Error downloading MRPC")
+            return
+    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
+    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
+
+    test_out = os.path.join(mrpc_dir, "test.tsv")
+    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
+            io.open(test_out, 'w', encoding='utf-8') as test_fh:
+        # Skip the source header; test.tsv gets its own header without a label column.
+        data_fh.readline()
+        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
+        for idx, row in enumerate(data_fh):
+            _label, id1, id2, s1, s2 = row.strip().split('\t')
+            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
+
+    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
+    try:
+        # NOTE: TASK2PATH has no "MRPC" entry in this file, so this lookup
+        # raises KeyError and the function stops after writing test.tsv.
+        urllib.request.urlretrieve(TASK2PATH["MRPC"], dev_ids_file)
+    except (KeyError, urllib.error.URLError):
+        # A tuple is required here: ``except KeyError or X`` evaluates to
+        # ``except KeyError`` and would let download errors propagate.
+        print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
+        return
+
+    # A set of (id1, id2) tuples gives constant-time membership tests per row.
+    with io.open(dev_ids_file, encoding='utf-8') as ids_fh:
+        dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}
+
+    with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
+         io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
+         io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
+        header = data_fh.readline()
+        train_fh.write(header)
+        dev_fh.write(header)
+        for row in data_fh:
+            label, id1, id2, s1, s2 = row.strip().split('\t')
+            # Rows whose ID pair is listed in dev_ids go to dev.tsv, the rest to train.tsv.
+            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
+            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+
+    print("\tCompleted!")
+
+def download_diagnostic(data_dir):
+    """Download the diagnostic TSV to ``<data_dir>/diagnostic/diagnostic.tsv``.
+
+    The ``diagnostic`` sub-directory is created when it does not exist yet.
+    """
+    print("Downloading and extracting diagnostic...")
+    target_dir = os.path.join(data_dir, "diagnostic")
+    if not os.path.isdir(target_dir):
+        os.mkdir(target_dir)
+    target_file = os.path.join(target_dir, "diagnostic.tsv")
+    urllib.request.urlretrieve(TASK2PATH["diagnostic"], target_file)
+    print("\tCompleted!")
+
+def get_tasks(task_names):
+    """Turn a comma-separated string of task names into a list of tasks.
+
+    If any of the names is ``"all"``, the full ``TASKS`` list is returned.
+    Otherwise each name must appear in ``TASKS``; an unknown name fails the
+    assertion with "Task <name> not found!".
+    """
+    requested = task_names.split(',')
+    if "all" in requested:
+        return TASKS
+    selected = []
+    for name in requested:
+        assert name in TASKS, "Task %s not found!" % name
+        selected.append(name)
+    return selected
+
+def main(arguments):
+    """Parse the command line and download/format the requested GLUE tasks.
+
+    Args:
+        arguments: Command-line arguments without the program name. Supported
+            options are ``--data_dir``, ``--tasks`` and ``--path_to_mrpc``.
+
+    Returns:
+        None, which ``sys.exit`` treats as a successful exit status.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='data/glue_data')
+    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
+                        type=str, default='all')
+    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
+                        type=str, default='')
+    args = parser.parse_args(arguments)
+
+    # The default data_dir is nested ("data/glue_data"); os.mkdir would raise
+    # FileNotFoundError when the parent "data" directory does not exist yet.
+    os.makedirs(args.data_dir, exist_ok=True)
+    tasks = get_tasks(args.tasks)
+
+    for task in tasks:
+        if task == 'MRPC':
+            # MRPC is assembled from raw files rather than a ready-made archive.
+            format_mrpc(args.data_dir, args.path_to_mrpc)
+        elif task == 'diagnostic':
+            # The diagnostic set is a single TSV file, not a zip archive.
+            download_diagnostic(args.data_dir)
+        else:
+            download_and_extract(task, args.data_dir)
+
+
+# Script entry point: pass the command-line arguments (without the program
+# name) to main() and use its return value as the exit status.
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))