Added fine-tuned models

Signed-off-by: Navneet Verma <[email protected]>
navneet1v · Jan 12, 2024 · c239ed1 · c239ed1
1 parent 63fe67f
commit c239ed1
Show file tree

Hide file tree

Showing 10 changed files with 99 additions and 0 deletions.
diff --git a/fine-tuned-models/.gitattributes b/fine-tuned-models/.gitattributes
@@ -0,0 +1,2 @@
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
diff --git a/fine-tuned-models/amazon_esci.zip b/fine-tuned-models/amazon_esci.zip
diff --git a/fine-tuned-models/create_hash.py b/fine-tuned-models/create_hash.py
@@ -0,0 +1,16 @@
+import hashlib
+
+# Compute and print the SHA-256 hex digest of a traced model archive.
+# Replace the placeholder below with the archive's full path before running, e.g.
+# model_file_path = "/home/ec2-user/dev/norm_comb_tuned/test/trace_models/dbpedia_tuned.zip"
+model_file_path = "<FULL_PATH_OF_THE_TRACED_MODEL>"
+
+BUF_SIZE = 65536  # bytes per read (64 KiB); the file is hashed piecewise, never loaded whole
+sha256 = hashlib.sha256()
+with open(model_file_path, "rb") as file:
+    # The sentinel form of iter() keeps calling file.read(BUF_SIZE) until it
+    # returns b"", which is what a binary read yields at end of file.
+    for chunk in iter(lambda: file.read(BUF_SIZE), b""):
+        sha256.update(chunk)
+sha256_value = sha256.hexdigest()
+
+print(sha256_value)
diff --git a/fine-tuned-models/dbpedia.tar.gz b/fine-tuned-models/dbpedia.tar.gz
diff --git a/fine-tuned-models/fiqa.tar.gz b/fine-tuned-models/fiqa.tar.gz
diff --git a/fine-tuned-models/nfcorpus.zip b/fine-tuned-models/nfcorpus.zip
diff --git a/fine-tuned-models/quora.tar.gz b/fine-tuned-models/quora.tar.gz
diff --git a/fine-tuned-models/scidocs.zip b/fine-tuned-models/scidocs.zip
diff --git a/fine-tuned-models/trace_model.py b/fine-tuned-models/trace_model.py
@@ -0,0 +1,60 @@
+import os
+from zipfile import ZipFile
+
+import torch
+from sentence_transformers import SentenceTransformer
+
+# Trace a fine-tuned SentenceTransformer checkpoint to TorchScript and bundle
+# the traced model together with tokenizer.json into a single zip archive.
+#
+# Point this at a different checkpoint to trace another model,
+# e.g. SentenceTransformer("models/dbpedia_custom_small").
+model = SentenceTransformer("covid_tasb_9")
+
+# Everything (saved model files, traced .pt file, final zip) lands in one folder.
+folder_path = "traced_model"
+save_json_folder_path = folder_path
+model_output_path = folder_path
+
+model_name = "trec_covid.pt"
+zip_file_name = "trec_covid_tuned.zip"
+
+model_path = os.path.join(folder_path, model_name)
+print("model_path:", model_path)
+
+zip_file_path = os.path.join(model_output_path, zip_file_name)
+
+# Persist the model files to disk; the tokenizer.json that is zipped at the
+# end of this script is read back from this folder.
+model.save(save_json_folder_path)
+
+# Tracing is done on CPU: both the model and the example inputs are moved there.
+device = torch.device("cpu")
+cpu_model = model.to(device)
+
+# Example sentences used only to produce sample inputs for the tracer.
+sentences = [
+    "This is the first example we want to explore",
+    "I'm using these sentences as example but please try to provide longer example which will be helpful for models",
+]
+
+encoded = cpu_model.tokenizer(
+    sentences, return_tensors="pt", padding=True, truncation=True
+)
+features = encoded.to(device)
+
+# The example input is handed to the tracer as a single dict holding just the
+# two tensors the traced model is fed (no tuple wrapper around it).
+example_inputs = {
+    "input_ids": features["input_ids"],
+    "attention_mask": features["attention_mask"],
+}
+compiled_model = torch.jit.trace(cpu_model, example_inputs, strict=False)
+
+torch.jit.save(compiled_model, model_path)
+print("model file is saved to ", model_path)
+
+# Zip the traced model file along with tokenizer.json; both are stored at the
+# archive root under their bare file names.
+archive_members = (
+    (model_path, model_name),
+    (os.path.join(save_json_folder_path, "tokenizer.json"), "tokenizer.json"),
+)
+with ZipFile(zip_file_path, "w") as archive:
+    for source_path, member_name in archive_members:
+        archive.write(source_path, arcname=member_name)
+print("zip file is saved to ", zip_file_path, "\n")
diff --git a/fine-tuned-models/trec_covid.zip b/fine-tuned-models/trec_covid.zip