From 1e71153f6bb573e989558a8ded9654eb1f6b7f0c Mon Sep 17 00:00:00 2001
From: aminfarajian
Date: Wed, 17 Jul 2024 11:21:50 +0000
Subject: [PATCH] Update the metrics to receive the language via their arguments

---
 tower_eval/metrics/errant/metric.py      |  8 ++++----
 tower_eval/metrics/f1_sequence/metric.py | 17 +++++++++++------
 tower_eval/tasks/generate.py             |  9 ++++-----
 tower_eval/utils.py                      | 21 ++++++++++++++-------
 4 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/tower_eval/metrics/errant/metric.py b/tower_eval/metrics/errant/metric.py
index a6486f3..6049a82 100644
--- a/tower_eval/metrics/errant/metric.py
+++ b/tower_eval/metrics/errant/metric.py
@@ -17,15 +17,15 @@ def run(
         self,
         hypothesis_path,
         gold_data_path,
-        references,
-        language: str,
         tokenize_source: bool = False,
         tokenize_hypothesis: bool = False,
         **kwargs
     ) -> dict:
-        hypothesis_m2 = self.preprocess(hypothesis_path, gold_data_path, language)
+        language = kwargs["lp"]["src_lang"]
+        references = kwargs["references_m2"]
+        hypothesis_m2 = self.preprocess(hypothesis_path, gold_data_path, language, tokenize_source, tokenize_hypothesis)
         result = self.evaluate(
-            hypothesis_m2, references, language, tokenize_source, tokenize_hypothesis
+            hypothesis_m2, references
         )
         result.print_result(self.metric_name())
         return result.format_result(self.metric_name())
diff --git a/tower_eval/metrics/f1_sequence/metric.py b/tower_eval/metrics/f1_sequence/metric.py
index 9e6187a..6c3ce33 100644
--- a/tower_eval/metrics/f1_sequence/metric.py
+++ b/tower_eval/metrics/f1_sequence/metric.py
@@ -38,14 +38,17 @@ def run(
         valid_ner_tags: list[str] = None,
         **kwargs,
     ) -> dict:
+        language = kwargs["lp"]["src_lang"]
         hypothesis = self._load_samples(
             hypothesis_path,
+            language=language,
             format=hypothesis_format,
             tokenize=tokenize_hypothesis,
+            default_noent_tag=default_noent_tag
         )
         hypothesis = self.filter_tags(hypothesis, valid_ner_tags, default_noent_tag)
         reference = self._load_samples(
-            gold_data_path, format=reference_format, tokenize=False
+            gold_data_path, language=language, format=reference_format, tokenize=False, default_noent_tag=default_noent_tag
         )
         reference = self.filter_tags(reference, valid_ner_tags, default_noent_tag)
 
@@ -67,7 +70,7 @@ def evaluate(
     ) -> F1SequenceResult:
         # evaluate by tag
         f1s_by_tag = {}
-        for tag in self.valid_ner_tags:
+        for tag in valid_ner_tags:
             filtered_hypothesis = self.filter_tags(hypothesis, tag, default_noent_tag)
             filtered_reference = self.filter_tags(reference, tag, default_noent_tag)
             true_seqs, pred_seqs = align_hyp_ref(
@@ -101,9 +104,11 @@ def process_result(self, result) -> MetricResult:
     def _load_samples(
         self,
         filename: str,
+        language: str,
         format: Literal["text", "tsv", "xml"] = "text",
         separator="|",
         tokenize: bool = False,
+        default_noent_tag: str = "O"
     ) -> List[List[str]]:
         """It reads the labeled file, and returns only the tags.
@@ -119,13 +124,13 @@ def _load_samples(
             if tokenize:
                 input_lines = read_lines(filename)
                 input_lines = tokenize_spacy(
-                    lines=input_lines, language=self.language, keep_xml=True
+                    lines=input_lines, language=language, keep_xml=True
                 )
             else:
                 with open(filename, "r", encoding="utf8") as ifh:
                     input_lines = ifh.readlines()
                     input_lines = [line.strip() for line in input_lines]
-            labels = [self.xml2iob(hyp) for hyp in input_lines]
+            labels = [self.xml2iob(hyp, default_noent_tag) for hyp in input_lines]
         elif format == "tsv":
             separator = "\t"
             with open(filename, "r", encoding="utf8") as infh:
@@ -166,11 +171,11 @@ def list_of_tuples_to_tokens(self, line):
         tokens = [token[1] for token in list_of_tuples]
         return tokens
 
-    def xml2iob(self, xml_string):
+    def xml2iob(self, xml_string, default_noent_tag):
         tokens = xml_string.split()
         annotations = []
         # tag is set to the no-entity tag ("O") by default
-        tag = self.default_noent_tag
+        tag = default_noent_tag
         for token in tokens:
             matching_open_tag = re.search(PATTERN_OPEN_TAG, token)
             matching_close_tag = re.search(PATTERN_CLOSE_TAG, token)
diff --git a/tower_eval/tasks/generate.py b/tower_eval/tasks/generate.py
index b10cb90..5f37c79 100644
--- a/tower_eval/tasks/generate.py
+++ b/tower_eval/tasks/generate.py
@@ -79,12 +79,11 @@ def generate(i: int, config_path: str, config_type: str, available_models: dict=
         f"Running inference for task: {task_name} , subtask: {subtask} with model: {model_type}/{model_name} saving to: {output_dir} "
     )
 
-    if task_name in ["mt", "ape"]:
-        lp = subtask.split(".")[-1]
-        src_lang, tgt_lang = get_langs(lp)
+    lp = subtask.split(".")[-1]
+    src_lang, tgt_lang = get_langs(lp)
 
-        model.source_language = src_lang
-        model.target_language = tgt_lang
+    model.source_language = src_lang
+    model.target_language = tgt_lang
     model.generation_with_resume(
         input_file=input_file,
         output_file=output_file,
diff --git a/tower_eval/utils.py b/tower_eval/utils.py
index 01c1c89..ce7f2e0 100644
--- a/tower_eval/utils.py
+++ b/tower_eval/utils.py
@@ -338,10 +338,9 @@ def get_eval_args_given_task(
     )
     gold_data_path = data_dir / task_name / subtask / "test.jsonl"
     # add the language pair to eval_args, as it is needed by some metrics
-    language = subtask.split(".")[1]
-    if "-" in language:
-        _, language = get_langs(language)
-    eval_args["language"] = language
+    lp = subtask.split(".")[1]
+    src_lang, trg_lang = get_langs(lp)
+    eval_args["lp"] = {"src_lang": src_lang, "trg_lang": trg_lang}
 
     return hypothesis_path, gold_data_path, eval_args
 
@@ -433,9 +432,17 @@ def get_langs(lp):
     lang_pattern = "|".join(valid_langs)
     lp_pattern = rf"^({lang_pattern})-({lang_pattern})$"
     match = re.match(lp_pattern, lp)
-    src_lang = match.group(1)
-    trg_lang = match.group(2)
-    return src_lang, trg_lang
+    if match:
+        # We have a language pair, so both src_lang and trg_lang are extracted from lp
+        src_lang = match.group(1)
+        trg_lang = match.group(2)
+        return src_lang, trg_lang
+    elif lp in valid_langs:
+        # The task is monolingual, so there is only one language: set src_lang and leave trg_lang as None
+        src_lang = lp
+        trg_lang = None
+        return src_lang, trg_lang
+    return None, None
 
 
 def add_average_generation_time(
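
Note for reviewers: below is a minimal, runnable sketch (not part of the patch) of the
get_langs() contract introduced above. The valid_langs list here is a hypothetical
stand-in for whatever tower_eval.utils actually defines; the asserts mirror the three
branches of the patched function.

    import re

    # Hypothetical stand-in for the module-level list that get_langs uses.
    valid_langs = ["en", "de", "pt"]

    def get_langs(lp):
        # A "src-tgt" pair yields both codes, e.g. for a subtask named like "wmt23.en-de".
        lang_pattern = "|".join(valid_langs)
        match = re.match(rf"^({lang_pattern})-({lang_pattern})$", lp)
        if match:
            return match.group(1), match.group(2)
        # A bare language code marks a monolingual subtask, e.g. one named like "gec.en".
        elif lp in valid_langs:
            return lp, None
        # Anything else falls through with no languages set.
        return None, None

    assert get_langs("en-de") == ("en", "de")
    assert get_langs("en") == ("en", None)
    assert get_langs("xx-yy") == (None, None)

Downstream, get_eval_args_given_task() packs the result into eval_args["lp"], which is
why metrics can read kwargs["lp"]["src_lang"] even for monolingual subtasks, where
trg_lang is simply None.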