From c239ed147cd8c3450a2eafa3f20024fc85ad5986 Mon Sep 17 00:00:00 2001
From: Navneet Verma
Date: Fri, 12 Jan 2024 00:50:57 -0800
Subject: [PATCH] Added fine tuned models

Signed-off-by: Navneet Verma
---
Review note: create_hash.py and trace_model.py were revised in this patch
text, so their "index" blob ids below are carried over from the original
revision and no longer match the new file contents.

 fine-tuned-models/.gitattributes  |  2 +
 fine-tuned-models/amazon_esci.zip |  3 ++
 fine-tuned-models/create_hash.py  | 33 ++++++++++++++++
 fine-tuned-models/dbpedia.tar.gz  |  3 ++
 fine-tuned-models/fiqa.tar.gz     |  3 ++
 fine-tuned-models/nfcorpus.zip    |  3 ++
 fine-tuned-models/quora.tar.gz    |  3 ++
 fine-tuned-models/scidocs.zip     |  3 ++
 fine-tuned-models/trace_model.py  | 66 +++++++++++++++++++++++++++++++
 fine-tuned-models/trec_covid.zip  |  3 ++
 10 files changed, 122 insertions(+)
 create mode 100644 fine-tuned-models/.gitattributes
 create mode 100644 fine-tuned-models/amazon_esci.zip
 create mode 100644 fine-tuned-models/create_hash.py
 create mode 100644 fine-tuned-models/dbpedia.tar.gz
 create mode 100644 fine-tuned-models/fiqa.tar.gz
 create mode 100644 fine-tuned-models/nfcorpus.zip
 create mode 100644 fine-tuned-models/quora.tar.gz
 create mode 100644 fine-tuned-models/scidocs.zip
 create mode 100644 fine-tuned-models/trace_model.py
 create mode 100644 fine-tuned-models/trec_covid.zip

diff --git a/fine-tuned-models/.gitattributes b/fine-tuned-models/.gitattributes
new file mode 100644
index 000000000..8e85e0275
--- /dev/null
+++ b/fine-tuned-models/.gitattributes
@@ -0,0 +1,2 @@
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
diff --git a/fine-tuned-models/amazon_esci.zip b/fine-tuned-models/amazon_esci.zip
new file mode 100644
index 000000000..c1a6603b2
--- /dev/null
+++ b/fine-tuned-models/amazon_esci.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3515d206a7e5eb3b53e5a33f41b7e1bca0604d2335060b3e26f3f3d31f78dae7
+size 244930506
diff --git a/fine-tuned-models/create_hash.py b/fine-tuned-models/create_hash.py
new file mode 100644
index 000000000..b986a3235
--- /dev/null
+++ b/fine-tuned-models/create_hash.py
@@ -0,0 +1,33 @@
+"""Print the SHA-256 checksum of a model archive.
+
+Usage:
+    python create_hash.py MODEL_ARCHIVE_PATH
+
+If no path is given on the command line, the ``model_file_path`` constant
+below is used instead.
+"""
+import hashlib
+import sys
+
+# Example: model_file_path = "/home/ec2-user/dev/norm_comb_tuned/test/trace_models/dbpedia_tuned.zip"
+model_file_path = ""
+
+BUF_SIZE = 65536  # read in 64 KiB chunks so the archive is never fully in memory
+
+
+def compute_sha256(path, buf_size=BUF_SIZE):
+    """Return the hex SHA-256 digest of the file at *path*, read in chunks."""
+    sha256 = hashlib.sha256()
+    with open(path, "rb") as file:
+        # iter() with a sentinel stops at the first empty read (end of file).
+        for chunk in iter(lambda: file.read(buf_size), b""):
+            sha256.update(chunk)
+    return sha256.hexdigest()
+
+
+if __name__ == "__main__":
+    # A command-line argument takes precedence over the hard-coded path.
+    path = sys.argv[1] if len(sys.argv) > 1 else model_file_path
+    if not path:
+        sys.exit("usage: python create_hash.py MODEL_ARCHIVE_PATH")
+    print(compute_sha256(path))
diff --git a/fine-tuned-models/dbpedia.tar.gz b/fine-tuned-models/dbpedia.tar.gz
new file mode 100644
index 000000000..efb599305
--- /dev/null
+++ b/fine-tuned-models/dbpedia.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d11208ecc71810378ac3f86ff7eee7e47506d5149c8758360d9cd511dce142e
+size 244931050
diff --git a/fine-tuned-models/fiqa.tar.gz b/fine-tuned-models/fiqa.tar.gz
new file mode 100644
index 000000000..417c4f3f8
--- /dev/null
+++ b/fine-tuned-models/fiqa.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8503960f15bdf9432a227891b8c60cdb0768c086b600ded4bdc614e3356f42a
+size 244924621
diff --git a/fine-tuned-models/nfcorpus.zip b/fine-tuned-models/nfcorpus.zip
new file mode 100644
index 000000000..9e4ea0a9f
--- /dev/null
+++ b/fine-tuned-models/nfcorpus.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:551e961aff148a6b440608ae3ea14042fc244064ec7b38cab814d8827860a0c2
+size 245136068
diff --git a/fine-tuned-models/quora.tar.gz b/fine-tuned-models/quora.tar.gz
new file mode 100644
index 000000000..649ff0a69
--- /dev/null
+++ b/fine-tuned-models/quora.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27dade8c2b1fea1aa6464d2b594df0bfc5c3c396cc42bc1a475dde4377b55b75
+size 244930726
diff --git a/fine-tuned-models/scidocs.zip b/fine-tuned-models/scidocs.zip
new file mode 100644
index 000000000..ba496ae1e
--- /dev/null
+++ b/fine-tuned-models/scidocs.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa8bba76e8c9826c653c5ee9deb98b8a2722e494d191747cbd81ca10cb3b2c8d
+size 244928147
diff --git a/fine-tuned-models/trace_model.py b/fine-tuned-models/trace_model.py
new file mode 100644
index 000000000..dc48ebe9b
--- /dev/null
+++ b/fine-tuned-models/trace_model.py
@@ -0,0 +1,66 @@
+"""Trace a fine-tuned SentenceTransformer model with TorchScript and zip it.
+
+Saves the model files, the traced ``model_name`` file and a ``zip_file_name``
+archive (traced model plus ``tokenizer.json``) under ``folder_path``.
+Edit the constants below to trace a different model.
+"""
+import os
+from zipfile import ZipFile
+
+import torch
+from sentence_transformers import SentenceTransformer
+
+# Alternative source model: SentenceTransformer("models/dbpedia_custom_small")
+model = SentenceTransformer("covid_tasb_9")
+
+folder_path = "traced_model"
+model_name = "trec_covid.pt"
+zip_file_name = "trec_covid_tuned.zip"
+
+save_json_folder_path = folder_path
+model_output_path = folder_path
+
+model_path = os.path.join(folder_path, model_name)
+
+print("model_path:", model_path)
+
+zip_file_path = os.path.join(model_output_path, zip_file_name)
+
+# Save the model files into save_json_folder_path; tokenizer.json is read from there below.
+model.save(save_json_folder_path)
+
+# Move the model to the CPU and tokenize the sample sentences there as well,
+# so that tracing and saving in .pt format both happen on the CPU.
+device = torch.device("cpu")
+cpu_model = model.to(device)
+
+sentences = ["This is the first example we want to explore", "I'm using these sentences as example but please try to provide longer example which will be helpful for models"]
+
+features = cpu_model.tokenizer(
+    sentences, return_tensors="pt", padding=True, truncation=True
+).to(device)
+
+compiled_model = torch.jit.trace(
+    cpu_model,
+    (
+        {
+            "input_ids": features["input_ids"],
+            "attention_mask": features["attention_mask"],
+        }
+    ),
+    strict=False,
+)
+torch.jit.save(compiled_model, model_path)
+print("model file is saved to ", model_path)
+
+# Bundle the traced model file and tokenizer.json into the output zip.
+with ZipFile(str(zip_file_path), "w") as zipObj:
+    zipObj.write(
+        model_path,
+        arcname=str(model_name),
+    )
+    zipObj.write(
+        os.path.join(save_json_folder_path, "tokenizer.json"),
+        arcname="tokenizer.json",
+    )
+print("zip file is saved to ", zip_file_path, "\n")
diff --git a/fine-tuned-models/trec_covid.zip b/fine-tuned-models/trec_covid.zip
new file mode 100644
index 000000000..631a84382
--- /dev/null
+++ b/fine-tuned-models/trec_covid.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfdc618b8272bafd280b22d2cefdbebfb8dc0be0860a3bcee52fc5429b2e020e
+size 244924935