other approach

tira-io · Jul 1, 2024 · f9a4a65 · f9a4a65
1 parent 0daf55d
commit f9a4a65
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 0 deletions.
diff --git a/named-entity-recognition-submission-2/Dockerfile b/named-entity-recognition-submission-2/Dockerfile
@@ -0,0 +1,6 @@
+# docker build -t fschlatt/authorship-verification-trivial:0.0.1 .
+FROM fschlatt/natural-language-processing-exercises:0.0.1
+
+ADD run.py /code/run.py
+
+ENTRYPOINT [ "python3", "/code/run.py" ]
diff --git a/named-entity-recognition-submission-2/run.py b/named-entity-recognition-submission-2/run.py
@@ -0,0 +1,41 @@
+from pathlib import Path
+from tira.rest_api_client import Client
+from tira.third_party_integrations import get_output_directory
+import pandas as pd
+import re
+
+if __name__ == "__main__":
+    tira = Client()
+
+    # loading validation data (automatically replaced by test data when run on tira)
+    text_validation = tira.pd.inputs("nlpbuw-fsu-sose-24", "ner-validation-20240612-training")
+    targets_validation = tira.pd.truths("nlpbuw-fsu-sose-24", "ner-validation-20240612-training")
+
+    # Sample lists of common person names and location names (for demonstration)
+    person_names = {"Alexander", "John", "Mary", "Lukashenko"}
+    location_names = {"Belarus", "U.S.", "Germany"}
+
+
+    def heuristic_ner_predictor(sentence):
+        tokens = sentence.split()
+        tags = ['O'] * len(tokens)
+
+        for i, token in enumerate(tokens):
+            if token in person_names:
+                tags[i] = 'B-per'
+            elif token in location_names:
+                tags[i] = 'B-geo'
+            elif token[0].isupper() and re.match(r'^[A-Z][a-z]+$', token):
+                tags[i] = 'B-geo'  # General rule for capitalized words, assume as geo entities
+
+        return tags
+
+
+    # labeling the data
+    predictions = text_validation.copy()
+    predictions['tags'] = predictions['sentence'].apply(heuristic_ner_predictor)
+    predictions = predictions[['id', 'tags']]
+
+    # saving the prediction
+    output_directory = get_output_directory(str(Path(__file__).parent))
+    predictions.to_json(Path(output_directory) / "predictions.jsonl", orient="records", lines=True)