From 47da9705eef908fa44f4784a30d55623d7f193b6 Mon Sep 17 00:00:00 2001
From: ramakrishna232 <80388711+ramakrishna232@users.noreply.github.com>
Date: Wed, 26 Jun 2024 18:37:13 +0000
Subject: [PATCH] test

---
 .vscode/settings.json     |  3 +++
 ner-submission/dockerfile |  5 ++++
 ner-submission/run.py     | 53 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 .vscode/settings.json
 create mode 100644 ner-submission/dockerfile
 create mode 100644 ner-submission/run.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..b881eff
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.analysis.autoImportCompletions": true
+}
\ No newline at end of file
diff --git a/ner-submission/dockerfile b/ner-submission/dockerfile
new file mode 100644
index 0000000..3ab6f55
--- /dev/null
+++ b/ner-submission/dockerfile
@@ -0,0 +1,5 @@
+FROM fschlatt/natural-language-processing-exercises:0.0.1
+
+ADD run.py /code/run.py
+
+ENTRYPOINT [ "python3", "/code/run.py" ]
\ No newline at end of file
diff --git a/ner-submission/run.py b/ner-submission/run.py
new file mode 100644
index 0000000..ca648db
--- /dev/null
+++ b/ner-submission/run.py
@@ -0,0 +1,53 @@
+from pathlib import Path
+import json
+
+import spacy
+from tira.rest_api_client import Client
+from tira.third_party_integrations import get_output_directory
+
+
+def load_data(file_path):
+    """Read a JSONL file into a list of dicts (one JSON record per line)."""
+    with open(file_path, 'r') as file:
+        data = [json.loads(line) for line in file]
+    return data
+
+
+def predict_labels(sentences, nlp):
+    """Tag each sentence with BIO labels derived from spaCy's NER entities.
+
+    Uses ent.start/ent.end token offsets of the parsed doc directly, so the
+    labels are always aligned with the doc's own tokenization (no fragile
+    re-tokenization of the entity text and no substring search that could
+    label the wrong occurrence of the entity tokens).
+    """
+    predictions = []
+    for sentence in sentences:
+        doc = nlp(sentence['sentence'])
+        labels = ['O'] * len(doc)
+        for ent in doc.ents:
+            # B- tag on the first token of the span, I- on the remainder.
+            labels[ent.start] = f"B-{ent.label_}"
+            for i in range(ent.start + 1, ent.end):
+                labels[i] = f"I-{ent.label_}"
+        predictions.append({"id": sentence['id'], "tags": labels})
+    return predictions
+
+
+if __name__ == "__main__":
+    tira = Client()
+
+    # Loading validation data (automatically replaced by test data when run on TIRA)
+    text_validation = tira.pd.inputs("nlpbuw-fsu-sose-24", "ner-validation-20240612-training")
+    sentences = text_validation.to_dict(orient="records")
+
+    # Load spaCy model
+    nlp = spacy.load("en_core_web_sm")
+
+    # Predicting labels for each sentence
+    predictions = predict_labels(sentences, nlp)
+
+    # Saving the prediction
+    output_directory = get_output_directory(str(Path(__file__).parent))
+    with open(Path(output_directory) / "predictions.jsonl", 'w') as outfile:
+        for prediction in predictions:
+            json.dump(prediction, outfile)
+            outfile.write('\n')