Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Gaurav Sharma committed Dec 25, 2022
2 parents 59e419b + a87d316 commit f5a3f7e
Show file tree
Hide file tree
Showing 7 changed files with 1,368 additions and 1 deletion.
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@ datasets==2.7.1
transformers==4.24.
wandb==0.13.5
accelerate
tensorboard
tensorboard
pytorch_transformers
tensorboardX
scikit-learn
60 changes: 60 additions & 0 deletions run_glue.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

# Fine-tune and evaluate three GLUE tasks, one after another, using the
# settings in src/config/glue_config.yaml.

# Install Python dependencies before anything else runs.
pip install -r requirements.txt
ls
# include parse_yaml function
source ./yaml_parser.sh

# Load the YAML settings as $config_<section>_<key> shell variables.
# This has to happen BEFORE any $config_* variable is expanded below;
# otherwise the per-task directory names expand to empty strings.
eval "$(parse_yaml src/config/glue_config.yaml "config_")"

# model path should be same location as this shell file
#python src/glue/download_glue_data.py
export output_dir="output_glue"
# -p creates the parent directory as needed and does not fail on re-runs
# when the directories already exist.
mkdir -p "$output_dir/$config_task1_task_name" \
         "$output_dir/$config_task2_task_name" \
         "$output_dir/$config_task3_task_name"


echo "$config_task1_model_type"

# --- task 1 ---
python src/glue/run_glue.py \
  --model_type "$config_task1_model_type" \
  --model_name_or_path "$config_task1_model_path" \
  --task_name "$config_task1_task_name" \
  --do_train \
  --do_eval \
  --do_lower_case \
  --data_dir "$config_task1_glue_dir/$config_task1_task_name/" \
  --max_seq_length "$config_task1_max_seq_length" \
  --per_gpu_train_batch_size "$config_task1_per_gpu_train_batch_size" \
  --learning_rate "$config_task1_learning_rate" \
  --num_train_epochs "$config_task1_num_train_epochs" \
  --output_dir "$output_dir/$config_task1_task_name"

# --- task 2 ---
python src/glue/run_glue.py \
  --model_type "$config_task2_model_type" \
  --model_name_or_path "$config_task2_model_path" \
  --task_name "$config_task2_task_name" \
  --do_train \
  --do_eval \
  --do_lower_case \
  --data_dir "$config_task2_glue_dir/$config_task2_task_name/" \
  --max_seq_length "$config_task2_max_seq_length" \
  --per_gpu_train_batch_size "$config_task2_per_gpu_train_batch_size" \
  --learning_rate "$config_task2_learning_rate" \
  --num_train_epochs "$config_task2_num_train_epochs" \
  --output_dir "$output_dir/$config_task2_task_name"

# --- task 3 ---
python src/glue/run_glue.py \
  --model_type "$config_task3_model_type" \
  --model_name_or_path "$config_task3_model_path" \
  --task_name "$config_task3_task_name" \
  --do_train \
  --do_eval \
  --do_lower_case \
  --data_dir "$config_task3_glue_dir/$config_task3_task_name/" \
  --max_seq_length "$config_task3_max_seq_length" \
  --per_gpu_train_batch_size "$config_task3_per_gpu_train_batch_size" \
  --learning_rate "$config_task3_learning_rate" \
  --num_train_epochs "$config_task3_num_train_epochs" \
  --output_dir "$output_dir/$config_task3_task_name"

27 changes: 27 additions & 0 deletions src/config/glue_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# GLUE fine-tuning settings, one top-level section per task.
# run_glue.sh loads this file through parse_yaml with the "config_" prefix and
# reads each value as $config_<section>_<key> (for example
# $config_task1_task_name), forwarding it to src/glue/run_glue.py.
# NOTE(review): keep comments on their own lines; how parse_yaml treats
# inline comments is not visible here - confirm before adding any.
task1:
  model_type: "bert"
  model_path: "bert-base-uncased"
  task_name: "RTE"
  glue_dir: "data/glue_data"
  max_seq_length: 128
  per_gpu_train_batch_size: 32
  learning_rate: 2e-5
  num_train_epochs: 3.0
task2:
  model_type: "bert"
  model_path: "bert-base-uncased"
  task_name: "QQP"
  glue_dir: "data/glue_data"
  max_seq_length: 128
  per_gpu_train_batch_size: 32
  learning_rate: 2e-5
  num_train_epochs: 3.0
task3:
  model_type: "bert"
  model_path: "bert-base-uncased"
  task_name: "CoLA"
  glue_dir: "data/glue_data"
  max_seq_length: 128
  per_gpu_train_batch_size: 32
  learning_rate: 2e-5
  num_train_epochs: 3.0
150 changes: 150 additions & 0 deletions src/glue/download_glue_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import io
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile

# Every dataset name this script accepts; also the expansion of "all".
TASKS = [
    "CoLA",
    "SST",
    "MRPC",
    "QQP",
    "STS",
    "MNLI",
    "QNLI",
    "RTE",
    "WNLI",
    "diagnostic",
]

# Download URL for each task's data (zip archives, plus one plain TSV for the
# diagnostic set).
# NOTE(review): format_mrpc looks up TASK2PATH["MRPC"] for the dev-ID list,
# but no "MRPC" key is defined here - confirm whether an entry is missing.
TASK2PATH = {
    "CoLA": "https://dl.fbaipublicfiles.com/glue/data/CoLA.zip",
    "SST": "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip",
    "QQP": "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip",
    "STS": "https://dl.fbaipublicfiles.com/glue/data/STS-B.zip",
    "MNLI": "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip",
    "QNLI": "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip",
    "RTE": "https://dl.fbaipublicfiles.com/glue/data/RTE.zip",
    "WNLI": "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip",
    "diagnostic": "https://dl.fbaipublicfiles.com/glue/data/AX.tsv",
}

# Raw MRPC train/test files, used when no local copy is supplied.
MRPC_TRAIN = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt"
MRPC_TEST = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt"

def download_and_extract(task, data_dir):
    """Download the zip archive for ``task`` and unpack it into ``data_dir``.

    The archive is saved as ``<task>.zip`` in the current working directory,
    extracted, and then removed. The temporary archive is removed even when
    the download or the extraction fails.

    Args:
        task: Key into ``TASK2PATH`` naming the dataset to fetch.
        data_dir: Directory the archive contents are extracted into.

    Raises:
        KeyError: If ``task`` has no entry in ``TASK2PATH``.
        Exception: Errors from the download or from reading the zip file
            propagate to the caller after cleanup.
    """
    print("Downloading and extracting %s..." % task)
    if task == "MNLI":
        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
    # Resolve the URL before touching the filesystem so an unknown task can
    # never cause an unrelated, pre-existing "<task>.zip" to be deleted.
    url = TASK2PATH[task]
    data_file = "%s.zip" % task
    try:
        urllib.request.urlretrieve(url, data_file)
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # The download may have failed before the file was created.
        if os.path.exists(data_file):
            os.remove(data_file)
    print("\tCompleted!")

def format_mrpc(data_dir, path_to_data):
    """Build ``test.tsv``, ``train.tsv`` and ``dev.tsv`` for MRPC.

    The raw ``msr_paraphrase_train.txt`` / ``msr_paraphrase_test.txt`` files
    are read from ``path_to_data`` when it is non-empty; otherwise they are
    downloaded into ``<data_dir>/MRPC``. ``test.tsv`` is written first (label
    column dropped, running index added). The list of dev-set ID pairs is then
    fetched and used to split the raw train file into ``train.tsv`` and
    ``dev.tsv``.

    Args:
        data_dir: Existing directory under which the ``MRPC`` folder is created.
        path_to_data: Directory holding the two raw MRPC files, or an empty
            string to download them.

    Returns:
        None. Returns early (after printing a message) when the raw files or
        the dev-ID list cannot be downloaded; in the latter case only
        ``test.tsv`` has been written.

    Raises:
        AssertionError: If a raw MRPC file is missing at the expected path.
    """
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    if not os.path.isdir(mrpc_dir):
        os.mkdir(mrpc_dir)
    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
        try:
            urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
            urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
        except urllib.error.HTTPError:
            print("Error downloading MRPC")
            return
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
        data_fh.readline()  # skip the raw header; a new one is written below
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            # The label is intentionally not written to the test split.
            _label, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))

    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    try:
        urllib.request.urlretrieve(TASK2PATH["MRPC"], dev_ids_file)
    # A tuple is required here: `except KeyError or HTTPError` evaluates the
    # `or` first and therefore only ever catches KeyError.
    except (KeyError, urllib.error.HTTPError):
        print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
        return

    # A set of tuples gives O(1) membership tests per training row.
    with io.open(dev_ids_file, encoding='utf-8') as ids_fh:
        dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}

    with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
            io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

    print("\tCompleted!")

def download_diagnostic(data_dir):
    """Fetch the diagnostic TSV into ``<data_dir>/diagnostic/diagnostic.tsv``.

    The ``diagnostic`` sub-directory is created when it does not exist yet;
    ``data_dir`` itself must already exist.

    Args:
        data_dir: Directory under which the ``diagnostic`` folder lives.
    """
    print("Downloading and extracting diagnostic...")
    target_dir = os.path.join(data_dir, "diagnostic")
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)
    destination = os.path.join(target_dir, "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], destination)
    print("\tCompleted!")

def get_tasks(task_names):
    """Turn a comma-separated task string into a list of task names.

    Whitespace around each name is ignored, so ``"RTE, QQP"`` is accepted.

    Args:
        task_names: Comma-separated task names, or a string containing the
            special name ``all``.

    Returns:
        A new list: every entry of ``TASKS`` when ``all`` is requested,
        otherwise the requested names in the order given. The module-level
        ``TASKS`` list itself is never returned, so callers may mutate the
        result safely.

    Raises:
        AssertionError: If a requested name is not in ``TASKS``.
    """
    requested = [name.strip() for name in task_names.split(',')]
    if "all" in requested:
        return list(TASKS)
    for task_name in requested:
        assert task_name in TASKS, "Task %s not found!" % task_name
    return requested

def main(arguments):
    """Parse command-line arguments and download the requested GLUE tasks.

    Args:
        arguments: Argument strings to parse (typically ``sys.argv[1:]``).

    Returns:
        None, which ``sys.exit`` treats as a successful exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='data/glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # makedirs (not mkdir) so the nested default 'data/glue_data' works even
    # when the parent 'data' directory does not exist yet.
    os.makedirs(args.data_dir, exist_ok=True)
    tasks = get_tasks(args.tasks)

    for task in tasks:
        if task == 'MRPC':
            # MRPC ships as raw text files that need reformatting.
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            # The diagnostic set is a single TSV, not a zip archive.
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
Loading

0 comments on commit f5a3f7e

Please sign in to comment.