From 0edd0ceb51655a91e9d1f85bf0303d143b0d22cf Mon Sep 17 00:00:00 2001
From: Mosh-Bit
Date: Mon, 1 Jul 2024 16:59:00 +0200
Subject: [PATCH] new approach

---
 .../Dockerfile                               |  2 ++
 named-entity-recognition-submission-2/run.py | 34 +++++++++++--------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/named-entity-recognition-submission-2/Dockerfile b/named-entity-recognition-submission-2/Dockerfile
index 713147e..b558a21 100644
--- a/named-entity-recognition-submission-2/Dockerfile
+++ b/named-entity-recognition-submission-2/Dockerfile
@@ -1,6 +1,8 @@
 # docker build -t fschlatt/authorship-verification-trivial:0.0.1 .
 FROM fschlatt/natural-language-processing-exercises:0.0.1
 
+RUN pip install transformers
+
 ADD run.py /code/run.py
 
 ENTRYPOINT [ "python3", "/code/run.py" ]
diff --git a/named-entity-recognition-submission-2/run.py b/named-entity-recognition-submission-2/run.py
index f96c938..18bbc1d 100644
--- a/named-entity-recognition-submission-2/run.py
+++ b/named-entity-recognition-submission-2/run.py
@@ -2,7 +2,8 @@
 from tira.rest_api_client import Client
 from tira.third_party_integrations import get_output_directory
 import pandas as pd
-import re
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+from transformers import pipeline
 
 if __name__ == "__main__":
     tira = Client()
@@ -11,31 +12,34 @@
     text_validation = tira.pd.inputs("nlpbuw-fsu-sose-24", "ner-validation-20240612-training")
     targets_validation = tira.pd.truths("nlpbuw-fsu-sose-24", "ner-validation-20240612-training")
 
-    # Sample lists of common person names and location names (for demonstration)
-    person_names = {"Alexander", "John", "Mary", "Lukashenko"}
-    location_names = {"Belarus", "U.S.", "Germany"}
+    # Load pre-trained NER model and tokenizer
+    model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForTokenClassification.from_pretrained(model_name)
+    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
-    def heuristic_ner_predictor(sentence):
-        tokens = sentence.split()
+    def transform_predictions(sentence):
+        tokens = tokenizer.tokenize(sentence)
+        ner_results = ner_pipeline(sentence)
         tags = ['O'] * len(tokens)
-        for i, token in enumerate(tokens):
-            if token in person_names:
-                tags[i] = 'B-per'
-            elif token in location_names:
-                tags[i] = 'B-geo'
-            elif token[0].isupper() and re.match(r'^[A-Z][a-z]+$', token):
-                tags[i] = 'B-geo' # General rule for capitalized words, assume as geo entities
+        for result in ner_results:
+            word_tokens = tokenizer.tokenize(result['word'])
+            for i, token in enumerate(word_tokens):
+                if i == 0:
+                    tags[result['index'] - 1] = f"B-{result['entity_group']}"
+                else:
+                    tags[result['index'] - 1] = f"I-{result['entity_group']}"
         return tags
 
     # labeling the data
     predictions = text_validation.copy()
-    predictions['tags'] = predictions['sentence'].apply(heuristic_ner_predictor)
+    predictions['tags'] = predictions['sentence'].apply(transform_predictions)
     predictions = predictions[['id', 'tags']]
 
     # saving the prediction
     output_directory = get_output_directory(str(Path(__file__).parent))
-    predictions.to_json(Path(output_directory) / "predictions.jsonl", orient="records", lines=True)
+    predictions.to_json(Path(output_directory) / "predictions.jsonl", orient="records", lines=True)
\ No newline at end of file
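
Note on the Dockerfile change: the unpinned `RUN pip install transformers` pulls
whatever version is current at build time, and the pipeline API also needs a
backend such as torch. If the base image does not already ship one (the course
image may well include it), a pinned variant along these lines is more
reproducible; the version number here is only an example, not taken from the
patch:

    RUN pip install transformers==4.41.2 torch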
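Note on transform_predictions: with aggregation_strategy="simple" the
transformers NER pipeline returns aggregated entities whose keys are
entity_group, score, word, start, and end; there is no index key, so
tags[result['index'] - 1] raises a KeyError. The tag list is also sized against
BERT subword tokens (tokenizer.tokenize), while the replaced heuristic emitted
one tag per whitespace token with lowercase labels such as B-per and B-geo.
Below is a minimal sketch of an alignment that uses the character offsets the
pipeline does return; the GROUP_TO_TAG mapping is an assumption, since only
'per' and 'geo' appear in the old heuristic, and the other entries may need
adjusting to the truth files:

    from transformers import pipeline

    # Assumed mapping from CoNLL-03 entity groups to the dataset's tag names.
    GROUP_TO_TAG = {"PER": "per", "LOC": "geo", "ORG": "org", "MISC": "misc"}

    ner_pipeline = pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple",
    )

    def transform_predictions(sentence):
        # One tag per whitespace token, matching the old heuristic's output shape.
        tokens = sentence.split()
        tags = ["O"] * len(tokens)

        # Character span of each whitespace token.
        spans, pos = [], 0
        for token in tokens:
            start = sentence.index(token, pos)
            spans.append((start, start + len(token)))
            pos = start + len(token)

        # Aggregated results carry character offsets ('start'/'end'), not token indices.
        for result in ner_pipeline(sentence):
            tag = GROUP_TO_TAG.get(result["entity_group"], result["entity_group"].lower())
            first = True
            for i, (start, end) in enumerate(spans):
                # A token belongs to the entity if their character spans overlap;
                # the first such token gets B-, the rest I-.
                if start < result["end"] and end > result["start"]:
                    tags[i] = f"{'B' if first else 'I'}-{tag}"
                    first = False
        return tags

Calling the pipeline once per sentence, as the patch does, is slow; passing the
whole list of sentences in a single call lets transformers batch the forward
passes.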