Skip to content

Commit

Permalink
other approach
Browse files Browse the repository at this point in the history
  • Loading branch information
Mosh-Bit committed Jul 1, 2024
1 parent 0daf55d commit f9a4a65
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 0 deletions.
6 changes: 6 additions & 0 deletions named-entity-recognition-submission-2/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Image that runs /code/run.py on top of the course's exercise base image.
# Build from the directory containing this file, e.g.:
# docker build -t fschlatt/authorship-verification-trivial:0.0.1 .
# NOTE(review): the tag above names an authorship-verification image although
# this directory is a named-entity-recognition submission — confirm the tag.
FROM fschlatt/natural-language-processing-exercises:0.0.1

# COPY is enough for a plain local file; ADD's extra behaviour (remote URLs,
# automatic archive extraction) is not wanted here.
COPY run.py /code/run.py

ENTRYPOINT [ "python3", "/code/run.py" ]
41 changes: 41 additions & 0 deletions named-entity-recognition-submission-2/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from pathlib import Path
from tira.rest_api_client import Client
from tira.third_party_integrations import get_output_directory
import pandas as pd
import re

# Tiny demonstration gazetteers. A token found here takes precedence over the
# generic capitalisation fallback used by the predictor.
person_names = {"Alexander", "John", "Mary", "Lukashenko"}
location_names = {"Belarus", "U.S.", "Germany"}

# One ASCII upper-case letter followed by one or more ASCII lower-case letters.
# Compiled once so it is not looked up again for every token.
_CAPITALIZED_WORD = re.compile(r"[A-Z][a-z]+")


def _entity_type(token):
    """Return the guessed entity type of *token* ('per' or 'geo'), or None."""
    if token in person_names:
        return "per"
    if token in location_names:
        return "geo"
    if _CAPITALIZED_WORD.fullmatch(token):
        # Fallback heuristic: any other capitalised word is assumed to be a
        # geographical entity.
        return "geo"
    return None


def heuristic_ner_predictor(sentence):
    """Tag each whitespace-separated token of *sentence* with a BIO label.

    Args:
        sentence: The text to tag; it is tokenised with ``str.split()``.

    Returns:
        A list with one tag per token: ``'O'`` for tokens that are not
        recognised as entities, ``'B-<type>'`` for the first token of an
        entity mention and ``'I-<type>'`` for each directly following token
        of the same type, where ``<type>`` is ``'per'`` or ``'geo'``. An
        empty or whitespace-only sentence yields an empty list.
    """
    tags = []
    previous_type = None
    for token in sentence.split():
        entity_type = _entity_type(token)
        if entity_type is None:
            tags.append("O")
        elif entity_type == previous_type:
            # Adjacent tokens of the same type continue one mention, so a
            # multi-token name becomes B-x I-x rather than several B-x tags.
            tags.append("I-" + entity_type)
        else:
            tags.append("B-" + entity_type)
        previous_type = entity_type
    return tags


def main():
    """Predict tags for the validation inputs and write them as JSON lines."""
    tira = Client()

    # Loading validation data (automatically replaced by test data when run
    # on tira). Only the inputs are loaded: the gold labels are not used to
    # produce predictions.
    text_validation = tira.pd.inputs(
        "nlpbuw-fsu-sose-24", "ner-validation-20240612-training"
    )

    # Labeling the data.
    predictions = text_validation.copy()
    predictions["tags"] = predictions["sentence"].apply(heuristic_ner_predictor)
    predictions = predictions[["id", "tags"]]

    # Saving the prediction.
    output_directory = get_output_directory(str(Path(__file__).parent))
    predictions.to_json(
        Path(output_directory) / "predictions.jsonl", orient="records", lines=True
    )


if __name__ == "__main__":
    main()

0 comments on commit f9a4a65

Please sign in to comment.