From 872e4ce80ad9afd2189940b2bfc42ff35aa08966 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sun, 19 Jan 2025 07:09:46 -0800 Subject: [PATCH 1/6] [wip] Update --- models/v0.2.0/scripts/report.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 models/v0.2.0/scripts/report.py diff --git a/models/v0.2.0/scripts/report.py b/models/v0.2.0/scripts/report.py new file mode 100644 index 0000000..03c2dc6 --- /dev/null +++ b/models/v0.2.0/scripts/report.py @@ -0,0 +1,13 @@ +from pathlib import Path + +import typer +from srsly import read_jsonl +from wasabi import msg + + +def report(): + pass + + +if __name__ == "__main__": + typer.run(report) From 87319cdbe2f548704f5c3a9c543104bb62775950 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sun, 19 Jan 2025 07:15:40 -0800 Subject: [PATCH 2/6] wip --- models/v0.2.0/scripts/report.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/models/v0.2.0/scripts/report.py b/models/v0.2.0/scripts/report.py index 03c2dc6..a411727 100644 --- a/models/v0.2.0/scripts/report.py +++ b/models/v0.2.0/scripts/report.py @@ -1,11 +1,21 @@ from pathlib import Path import typer +import pandas as pd from srsly import read_jsonl from wasabi import msg -def report(): +def report( + # fmt: off + indir: Path = typer.Argument(..., help="") + # fmt: on +): + """Return a table of evaluation results + + The input to `indir` must be a directory where the first-level directories are the model names, + with JSON files from `spacy evaluate` in this file format: {task}_{dataset}.json + """ pass From 362facb18fd6bd0998f9af823322bf1701635c1b Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sun, 19 Jan 2025 07:22:57 -0800 Subject: [PATCH 3/6] [wip] Update --- models/v0.2.0/requirements.txt | 4 +++- models/v0.2.0/scripts/report.py | 14 +++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/models/v0.2.0/requirements.txt b/models/v0.2.0/requirements.txt index 07c7b69..00673e5 100644 --- a/models/v0.2.0/requirements.txt +++ b/models/v0.2.0/requirements.txt @@ -7,4 +7,6 @@ spacy-huggingface-hub build pip==22.0.2 sentencepiece -protobuf \ No newline at end of file +protobuf +typer +pandas \ No newline at end of file diff --git a/models/v0.2.0/scripts/report.py b/models/v0.2.0/scripts/report.py index a411727..0982e3b 100644 --- a/models/v0.2.0/scripts/report.py +++ b/models/v0.2.0/scripts/report.py @@ -2,21 +2,25 @@ import typer import pandas as pd -from srsly import read_jsonl +from srsly import read_json from wasabi import msg def report( - # fmt: off - indir: Path = typer.Argument(..., help="") - # fmt: on + indir: Path = typer.Argument(..., help="Path to the evaluations directory.") ): """Return a table of evaluation results The input to `indir` must be a directory where the first-level directories are the model names, with JSON files from `spacy evaluate` in this file format: {task}_{dataset}.json """ - pass + results = [] + for model_dir in indir.iterdir(): + if model_dir.is_dir(): + for json_file in model_dir.glob("*.json"): + data = read_json(json_file) + results.append(data) + breakpoint() if __name__ == "__main__": From 47d16fcf8ccb75f7c9a99f91fde03c3d8f3a582f Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sun, 19 Jan 2025 07:24:55 -0800 Subject: [PATCH 4/6] [wip] Update --- .../tl_calamancy_lg/{ner-tfnerd.json => ner_tfnerd.json} | 0 .../tl_calamancy_md/{ner-tfnerd.json => ner_tfnerd.json} | 0 .../tl_calamancy_trf/{ner-tfnerd.json => ner_tfnerd.json} | 0 models/v0.2.0/scripts/report.py | 4 +++- 4 files changed, 3 insertions(+), 1 deletion(-) rename models/v0.2.0/evals/tl_calamancy_lg/{ner-tfnerd.json => ner_tfnerd.json} (100%) rename models/v0.2.0/evals/tl_calamancy_md/{ner-tfnerd.json => ner_tfnerd.json} (100%) rename models/v0.2.0/evals/tl_calamancy_trf/{ner-tfnerd.json => ner_tfnerd.json} (100%) diff --git a/models/v0.2.0/evals/tl_calamancy_lg/ner-tfnerd.json b/models/v0.2.0/evals/tl_calamancy_lg/ner_tfnerd.json similarity index 100% rename from models/v0.2.0/evals/tl_calamancy_lg/ner-tfnerd.json rename to models/v0.2.0/evals/tl_calamancy_lg/ner_tfnerd.json diff --git a/models/v0.2.0/evals/tl_calamancy_md/ner-tfnerd.json b/models/v0.2.0/evals/tl_calamancy_md/ner_tfnerd.json similarity index 100% rename from models/v0.2.0/evals/tl_calamancy_md/ner-tfnerd.json rename to models/v0.2.0/evals/tl_calamancy_md/ner_tfnerd.json diff --git a/models/v0.2.0/evals/tl_calamancy_trf/ner-tfnerd.json b/models/v0.2.0/evals/tl_calamancy_trf/ner_tfnerd.json similarity index 100% rename from models/v0.2.0/evals/tl_calamancy_trf/ner-tfnerd.json rename to models/v0.2.0/evals/tl_calamancy_trf/ner_tfnerd.json diff --git a/models/v0.2.0/scripts/report.py b/models/v0.2.0/scripts/report.py index 0982e3b..da65dbe 100644 --- a/models/v0.2.0/scripts/report.py +++ b/models/v0.2.0/scripts/report.py @@ -17,9 +17,11 @@ def report( results = [] for model_dir in indir.iterdir(): if model_dir.is_dir(): + model_name = model_dir.name for json_file in model_dir.glob("*.json"): + task_dataset = json_file.stem data = read_json(json_file) - results.append(data) + results.append((model_name, task_dataset, data)) breakpoint() From 5a9954694745cb3e427df085a7346d1d896f2fa7 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sun, 19 Jan 2025 07:32:54 -0800 Subject: [PATCH 5/6] [wip] Update --- models/v0.2.0/scripts/report.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/models/v0.2.0/scripts/report.py b/models/v0.2.0/scripts/report.py index da65dbe..491539e 100644 --- a/models/v0.2.0/scripts/report.py +++ b/models/v0.2.0/scripts/report.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Any import typer import pandas as pd @@ -19,10 +20,29 @@ def report( if model_dir.is_dir(): model_name = model_dir.name for json_file in model_dir.glob("*.json"): - task_dataset = json_file.stem + task, dataset = json_file.stem.split("_") data = read_json(json_file) - results.append((model_name, task_dataset, data)) - breakpoint() + results.append((model_name, task, dataset, data)) + + msg.info(f"Found {len(results)} results in {indir}") + + msg.text("Parsing syntactic annotation results...") + syn_rows = [] + for result in results: + pass + + msg.text("Parsing NER results...") + ner_rows = [] + + +def parse_syntactic_results(results: dict[str, Any]) -> dict[str, float]: + """Get tokenizer, lemmatization, morph, and parsing evals""" + pass + + +def parse_ner_results(results: dict[str, Any]) -> dict[str, float]: + """Get NER evals""" + pass if __name__ == "__main__": From 98d695a4f2a6579fa08387a8bb3d82bf9e671a4a Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sun, 19 Jan 2025 15:48:44 -0800 Subject: [PATCH 6/6] [wip] Update --- models/v0.2.0/requirements.txt | 3 ++- models/v0.2.0/scripts/report.py | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/models/v0.2.0/requirements.txt b/models/v0.2.0/requirements.txt index 00673e5..eb91569 100644 --- a/models/v0.2.0/requirements.txt +++ b/models/v0.2.0/requirements.txt @@ -9,4 +9,5 @@ pip==22.0.2 sentencepiece protobuf typer -pandas \ No newline at end of file +pandas +tabulate \ No newline at end of file diff --git a/models/v0.2.0/scripts/report.py b/models/v0.2.0/scripts/report.py index 491539e..03cc263 100644 --- a/models/v0.2.0/scripts/report.py +++ b/models/v0.2.0/scripts/report.py @@ -28,11 +28,39 @@ def report( msg.text("Parsing syntactic annotation results...") syn_rows = [] - for result in results: - pass + for model_name, task, dataset, data in results: + if task == "dep": + row = { + "model": model_name, + "dataset": dataset, + "token_acc": data.get("tokenizer").get("token_f"), + "lemma_acc": data.get("trainable_lemmatizer").get("lemma_acc"), + "tag_acc": data.get("tagger").get("tag_acc"), + "pos_acc": data.get("morphologizer").get("pos_acc"), + "morph_acc": data.get("morphologizer").get("morph_acc"), + "dep_uas": data.get("parser").get("dep_uas"), + "dep_las": data.get("parser").get("dep_las"), + } + syn_rows.append(row) + + syn_df = pd.DataFrame(syn_rows).sort_values(by="dataset").reset_index(drop=True) + print(syn_df.to_markdown(index=False)) msg.text("Parsing NER results...") ner_rows = [] + for model_name, task, dataset, data in results: + if task == "ner": + row = { + "model": model_name, + "dataset": dataset, + "ents_p": data.get("ner").get("ents_p"), + "ents_r": data.get("ner").get("ents_r"), + "ents_f": data.get("ner").get("ents_f"), + } + ner_rows.append(row) + + ner_df = pd.DataFrame(ner_rows).sort_values(by="dataset").reset_index(drop=True) + print(ner_df.to_markdown(index=False)) def parse_syntactic_results(results: dict[str, Any]) -> dict[str, float]: