From 47da9705eef908fa44f4784a30d55623d7f193b6 Mon Sep 17 00:00:00 2001
From: ramakrishna232 <80388711+ramakrishna232@users.noreply.github.com>
Date: Wed, 26 Jun 2024 18:37:13 +0000
Subject: [PATCH] test

---
 .vscode/settings.json     |  3 +++
 ner-submission/dockerfile |  5 ++++
 ner-submission/run.py     | 53 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 .vscode/settings.json
 create mode 100644 ner-submission/dockerfile
 create mode 100644 ner-submission/run.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..b881eff
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.analysis.autoImportCompletions": true
+}
\ No newline at end of file
diff --git a/ner-submission/dockerfile b/ner-submission/dockerfile
new file mode 100644
index 0000000..3ab6f55
--- /dev/null
+++ b/ner-submission/dockerfile
@@ -0,0 +1,5 @@
+FROM fschlatt/natural-language-processing-exercises:0.0.1
+
+ADD run.py /code/run.py
+
+ENTRYPOINT [ "python3", "/code/run.py" ]
\ No newline at end of file
diff --git a/ner-submission/run.py b/ner-submission/run.py
new file mode 100644
index 0000000..ca648db
--- /dev/null
+++ b/ner-submission/run.py
@@ -0,0 +1,53 @@
+from pathlib import Path
+import json
+
+import spacy
+from tira.rest_api_client import Client
+from tira.third_party_integrations import get_output_directory
+
+
+def load_data(file_path):
+    """Read a JSONL file into a list of dicts (one JSON record per line)."""
+    with open(file_path, 'r') as file:
+        data = [json.loads(line) for line in file]
+    return data
+
+
+def predict_labels(sentences, nlp):
+    """Tag each sentence with BIO labels derived from spaCy's NER entities.
+
+    Uses ent.start/ent.end token offsets of the parsed doc directly, so the
+    labels are always aligned with the doc's own tokenization (no fragile
+    re-tokenization of the entity text and no substring search that could
+    label the wrong occurrence of the entity tokens).
+    """
+    predictions = []
+    for sentence in sentences:
+        doc = nlp(sentence['sentence'])
+        labels = ['O'] * len(doc)
+        for ent in doc.ents:
+            # B- tag on the first token of the span, I- on the remainder.
+            labels[ent.start] = f"B-{ent.label_}"
+            for i in range(ent.start + 1, ent.end):
+                labels[i] = f"I-{ent.label_}"
+        predictions.append({"id": sentence['id'], "tags": labels})
+    return predictions
+
+
+if __name__ == "__main__":
+    tira = Client()
+
+    # Loading validation data (automatically replaced by test data when run on TIRA)
+    text_validation = tira.pd.inputs("nlpbuw-fsu-sose-24", "ner-validation-20240612-training")
+    sentences = text_validation.to_dict(orient="records")
+
+    # Load spaCy model
+    nlp = spacy.load("en_core_web_sm")
+
+    # Predicting labels for each sentence
+    predictions = predict_labels(sentences, nlp)
+
+    # Saving the prediction
+    output_directory = get_output_directory(str(Path(__file__).parent))
+    with open(Path(output_directory) / "predictions.jsonl", 'w') as outfile:
+        for prediction in predictions:
+            json.dump(prediction, outfile)
+            outfile.write('\n')