From 6bf31ea2cedd1703711abe47931fc028425edefa Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 00:19:53 +0800 Subject: [PATCH 1/9] fix mmbench en and cn --- .../mmbench/_default_template_mmbench.yaml | 12 ++++++++ .../mmbench/_default_template_mmbench_cn.yaml | 27 +++++++++++++++++ .../mmbench/_default_template_mmbench_en.yaml | 14 +++++++++ lmms_eval/tasks/mmbench/en_utils.py | 8 ++--- lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml | 30 ++----------------- lmms_eval/tasks/mmbench/mmbench_cn_test.yaml | 30 ++----------------- lmms_eval/tasks/mmbench/mmbench_en.yaml | 8 ----- lmms_eval/tasks/mmbench/mmbench_en_dev.yaml | 20 ++----------- lmms_eval/tasks/mmbench/mmbench_en_test.yaml | 17 +---------- 9 files changed, 64 insertions(+), 102 deletions(-) create mode 100644 lmms_eval/tasks/mmbench/_default_template_mmbench.yaml create mode 100644 lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml create mode 100644 lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench.yaml new file mode 100644 index 000000000..8520cc2c9 --- /dev/null +++ b/lmms_eval/tasks/mmbench/_default_template_mmbench.yaml @@ -0,0 +1,12 @@ +dataset_path: lmms-lab/MMBench +dataset_kwargs: + token: True +generation_kwargs: + until: + - "ASSISTANT:" + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +doc_to_target: "answer" diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml new file mode 100644 index 000000000..3fd57e2ad --- /dev/null +++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml @@ -0,0 +1,27 @@ +dataset_path: lmms-lab/MMBench +dataset_kwargs: + token: True +dataset_name: "cn" +output_type: generate_until +doc_to_visual: !function cn_utils.mmbench_doc_to_visual +doc_to_text: !function cn_utils.mmbench_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 256 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function cn_utils.mmbench_process_results +metadata: + version: 0.0 + gpt_eval_model_name: "gpt-3.5-turbo" + quick_extract: true +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" +model_specific_generation_kwargs: + llava: + image_aspect_ratio: original +include: _default_template_mmbench.yaml diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml new file mode 100644 index 000000000..b885a6b57 --- /dev/null +++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml @@ -0,0 +1,14 @@ +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer with the option's letter from the given choices directly." +doc_to_visual: !function en_utils.mmbench_doc_to_visual +doc_to_text: !function en_utils.mmbench_doc_to_text +doc_to_target: "answer" +process_results: !function en_utils.mmbench_process_results +model_specific_generation_kwargs: + llava: + image_aspect_ratio: original +output_type: generate_until +include: _default_template_mmbench.yaml +dataset_name: "en" diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py index 1962dc528..4ed2260e4 100644 --- a/lmms_eval/tasks/mmbench/en_utils.py +++ b/lmms_eval/tasks/mmbench/en_utils.py @@ -36,7 +36,7 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None): "answer": doc.get("answer", None), "options": options_prompt, "category": doc["category"], - "L2-category": doc["l2-category"], + "L2-category": doc["L2-category"], "options_dict": options_dict, "index": doc["index"], "hint": doc["hint"], @@ -44,11 +44,11 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None): "split": doc["split"], } - query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) else f"{data['question']} {data['options']}" + query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}" if model_specific_prompt_kwargs: query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}" - + return query_prompt @@ -64,7 +64,7 @@ def mmbench_process_results(doc, results): "source": doc["source"], "split": doc["split"], "category": doc["category"], - "L2-category": doc["l2-category"], + "L2-category": doc["L2-category"], } } option_candidate = ["A", "B", "C", "D", "E"] diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml index 3b2b4fbb1..945e7ca36 100644 --- a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml @@ -1,33 +1,7 @@ -dataset_path: lmms-lab/MMBench_CN -dataset_kwargs: - token: True -group: mmbench_cn task: "mmbench_cn_dev" -dataset_name: "default" test_split: "dev" -output_type: generate_until -doc_to_visual: !function cn_utils.mmbench_doc_to_visual -doc_to_text: !function cn_utils.mmbench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 256 - temperature: 0 - top_p: 0 - num_beams: 1 - do_sample: false -process_results: !function cn_utils.mmbench_process_results metric_list: - metric: submission + higher_is_better: true aggregation: !function cn_utils.mmbench_aggregate_dev_results -metadata: - version: 0.0 - gpt_eval_model_name: "gpt-3.5-turbo" - quick_extract: true - -model_specific_prompt_kwargs: - default: - pre_prompt: "" - post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" -model_specific_generation_kwargs: - llava: - image_aspect_ratio: original \ No newline at end of file +include: _default_template_mmbench_cn.yaml diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml index b17bb761d..3e2bc0b70 100644 --- a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml @@ -1,33 +1,7 @@ -dataset_path: lmms-lab/MMBench_CN -dataset_kwargs: - token: True -task: "mmbench_cn_test" -dataset_name: "default" +task: mmbench_cn_test test_split: test -output_type: generate_until -doc_to_visual: !function cn_utils.mmbench_doc_to_visual -doc_to_text: !function cn_utils.mmbench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 256 - temperature: 0 - top_p: 0 - num_beams: 1 - do_sample: false -process_results: !function cn_utils.mmbench_process_results metric_list: - metric: submission aggregation: !function cn_utils.mmbench_aggregate_test_results higher_is_better: true -metadata: - version: 0.0 - gpt_eval_model_name: "gpt-3.5-turbo" - quick_extract: true - -model_specific_prompt_kwargs: - default: - pre_prompt: "" - post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" -model_specific_generation_kwargs: - llava: - image_aspect_ratio: original +include: _default_template_mmbench_cn.yaml diff --git a/lmms_eval/tasks/mmbench/mmbench_en.yaml b/lmms_eval/tasks/mmbench/mmbench_en.yaml index c518f924e..9fa757cc3 100644 --- a/lmms_eval/tasks/mmbench/mmbench_en.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_en.yaml @@ -5,11 +5,3 @@ task: metadata: version: 0.0 sys_prompt: "There are several options:" - -model_specific_prompt_kwargs: - default: - pre_prompt: "" - post_prompt: "\nAnswer with the option's letter from the given choices directly." -model_specific_generation_kwargs: - llava: - image_aspect_ratio: original \ No newline at end of file diff --git a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml index 89d6ff76d..5873baa86 100644 --- a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml @@ -1,23 +1,7 @@ -dataset_path: lmms-lab/MMBench_EN -dataset_kwargs: - token: True task: "mmbench_en_dev" test_split: dev -output_type: generate_until -doc_to_visual: !function en_utils.mmbench_doc_to_visual -doc_to_text: !function en_utils.mmbench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - until: - - "ASSISTANT:" - max_new_tokens: 1024 - temperature: 0 - top_p: 0 - num_beams: 1 - do_sample: false -process_results: !function en_utils.mmbench_process_results +include: _default_template_mmbench_en.yaml metric_list: - metric: submission aggregation: !function en_utils.mmbench_aggregate_dev_results -metadata: - version: 0.0 \ No newline at end of file + higher_is_better: true diff --git a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml index 92f73ef1f..037315f38 100644 --- a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml @@ -1,22 +1,7 @@ -dataset_path: lmms-lab/MMBench_EN -dataset_kwargs: - token: True task: "mmbench_en_test" test_split: test -output_type: generate_until -doc_to_visual: !function en_utils.mmbench_doc_to_visual -doc_to_text: !function en_utils.mmbench_doc_to_text -doc_to_target: "answer" -generation_kwargs: - max_new_tokens: 256 - temperature: 0 - top_p: 0 - num_beams: 1 - do_sample: false -process_results: !function en_utils.mmbench_process_results +include: _default_template_mmbench_en.yaml metric_list: - metric: submission aggregation: !function en_utils.mmbench_aggregate_test_results higher_is_better: true -metadata: - version: 0.0 \ No newline at end of file From fde7593b606cfdbd6abad2b02ce4f6cfb7520144 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 00:21:49 +0800 Subject: [PATCH 2/9] update system prompt --- lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml | 1 + lmms_eval/tasks/mmbench/mmbench_cn.yaml | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml index 3fd57e2ad..31f644b1f 100644 --- a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml +++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml @@ -17,6 +17,7 @@ metadata: version: 0.0 gpt_eval_model_name: "gpt-3.5-turbo" quick_extract: true + sys_prompt: "有如下几个选项:" model_specific_prompt_kwargs: default: pre_prompt: "" diff --git a/lmms_eval/tasks/mmbench/mmbench_cn.yaml b/lmms_eval/tasks/mmbench/mmbench_cn.yaml index 82fddeb0c..a587a6563 100644 --- a/lmms_eval/tasks/mmbench/mmbench_cn.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_cn.yaml @@ -3,6 +3,3 @@ task: - mmbench_cn_dev - mmbench_cn_test - mmbench_cn_cc -metadata: - version: 0.0 - sys_prompt: "有如下几个选项:" \ No newline at end of file From d492c6fec5e041b0efda203aaea9a65522b068c7 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 00:42:41 +0800 Subject: [PATCH 3/9] fix --- lmms_eval/tasks/__init__.py | 3 ++- .../tasks/mmbench/_default_template_mmbench.yaml | 12 ------------ .../tasks/mmbench/_default_template_mmbench_cn.yaml | 8 +------- .../tasks/mmbench/_default_template_mmbench_en.yaml | 13 ++++++++++++- lmms_eval/tasks/mmbench/mmbench_cn.yaml | 5 +++++ 5 files changed, 20 insertions(+), 21 deletions(-) delete mode 100644 lmms_eval/tasks/mmbench/_default_template_mmbench.yaml diff --git a/lmms_eval/tasks/__init__.py b/lmms_eval/tasks/__init__.py index e6e25d857..dc77ed47f 100644 --- a/lmms_eval/tasks/__init__.py +++ b/lmms_eval/tasks/__init__.py @@ -90,9 +90,10 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: # the user defines the appropriate verbosity. except ModuleNotFoundError as e: eval_logger.debug(f"{yaml_path}: {e}. Config will not be added to registry.") + print(f"{yaml_path}: {e}. Config will not be added to registry.") except Exception as error: import traceback - + eval_logger.debug(f"Failed to load config in {yaml_path}. Config will not be added to registry\n" f"Error: {error}\n" f"Traceback: {traceback.format_exc()}") return 0 diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench.yaml deleted file mode 100644 index 8520cc2c9..000000000 --- a/lmms_eval/tasks/mmbench/_default_template_mmbench.yaml +++ /dev/null @@ -1,12 +0,0 @@ -dataset_path: lmms-lab/MMBench -dataset_kwargs: - token: True -generation_kwargs: - until: - - "ASSISTANT:" - max_new_tokens: 1024 - temperature: 0 - top_p: 0 - num_beams: 1 - do_sample: false -doc_to_target: "answer" diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml index 31f644b1f..81094620b 100644 --- a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml +++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml @@ -1,11 +1,11 @@ dataset_path: lmms-lab/MMBench dataset_kwargs: token: True +doc_to_target: "answer" dataset_name: "cn" output_type: generate_until doc_to_visual: !function cn_utils.mmbench_doc_to_visual doc_to_text: !function cn_utils.mmbench_doc_to_text -doc_to_target: "answer" generation_kwargs: max_new_tokens: 256 temperature: 0 @@ -13,11 +13,6 @@ generation_kwargs: num_beams: 1 do_sample: false process_results: !function cn_utils.mmbench_process_results -metadata: - version: 0.0 - gpt_eval_model_name: "gpt-3.5-turbo" - quick_extract: true - sys_prompt: "有如下几个选项:" model_specific_prompt_kwargs: default: pre_prompt: "" @@ -25,4 +20,3 @@ model_specific_prompt_kwargs: model_specific_generation_kwargs: llava: image_aspect_ratio: original -include: _default_template_mmbench.yaml diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml index b885a6b57..ab2b882c8 100644 --- a/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml +++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml @@ -1,3 +1,7 @@ +dataset_path: lmms-lab/MMBench +dataset_kwargs: + token: True +doc_to_target: "answer" model_specific_prompt_kwargs: default: pre_prompt: "" @@ -10,5 +14,12 @@ model_specific_generation_kwargs: llava: image_aspect_ratio: original output_type: generate_until -include: _default_template_mmbench.yaml dataset_name: "en" +generation_kwargs: + until: + - "ASSISTANT:" + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false diff --git a/lmms_eval/tasks/mmbench/mmbench_cn.yaml b/lmms_eval/tasks/mmbench/mmbench_cn.yaml index a587a6563..6232531c4 100644 --- a/lmms_eval/tasks/mmbench/mmbench_cn.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_cn.yaml @@ -3,3 +3,8 @@ task: - mmbench_cn_dev - mmbench_cn_test - mmbench_cn_cc +metadata: + version: 0.0 + gpt_eval_model_name: "gpt-3.5-turbo" + quick_extract: true + sys_prompt: "有如下几个选项:" From 4503cd7e3b456d5153fd9b14556d9331fdc5c1ce Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 00:43:40 +0800 Subject: [PATCH 4/9] update path name --- ...mplate_mmbench_cn.yaml => _default_template_mmbench_cn_yaml} | 0 ...mplate_mmbench_en.yaml => _default_template_mmbench_en_yaml} | 0 lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml | 2 +- lmms_eval/tasks/mmbench/mmbench_cn_test.yaml | 2 +- lmms_eval/tasks/mmbench/mmbench_en_dev.yaml | 2 +- lmms_eval/tasks/mmbench/mmbench_en_test.yaml | 2 +- 6 files changed, 4 insertions(+), 4 deletions(-) rename lmms_eval/tasks/mmbench/{_default_template_mmbench_cn.yaml => _default_template_mmbench_cn_yaml} (100%) rename lmms_eval/tasks/mmbench/{_default_template_mmbench_en.yaml => _default_template_mmbench_en_yaml} (100%) diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml similarity index 100% rename from lmms_eval/tasks/mmbench/_default_template_mmbench_cn.yaml rename to lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml similarity index 100% rename from lmms_eval/tasks/mmbench/_default_template_mmbench_en.yaml rename to lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml index 945e7ca36..3d7b9d98b 100644 --- a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml @@ -4,4 +4,4 @@ metric_list: - metric: submission higher_is_better: true aggregation: !function cn_utils.mmbench_aggregate_dev_results -include: _default_template_mmbench_cn.yaml +include: _default_template_mmbench_cn_yaml diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml index 3e2bc0b70..b86f092cb 100644 --- a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml @@ -4,4 +4,4 @@ metric_list: - metric: submission aggregation: !function cn_utils.mmbench_aggregate_test_results higher_is_better: true -include: _default_template_mmbench_cn.yaml +include: _default_template_mmbench_cn_yaml diff --git a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml index 5873baa86..b4f4a2e9f 100644 --- a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml @@ -1,6 +1,6 @@ task: "mmbench_en_dev" test_split: dev -include: _default_template_mmbench_en.yaml +include: _default_template_mmbench_en_yaml metric_list: - metric: submission aggregation: !function en_utils.mmbench_aggregate_dev_results diff --git a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml index 037315f38..5acf404af 100644 --- a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml @@ -1,6 +1,6 @@ task: "mmbench_en_test" test_split: test -include: _default_template_mmbench_en.yaml +include: _default_template_mmbench_en_yaml metric_list: - metric: submission aggregation: !function en_utils.mmbench_aggregate_test_results From b1f2be5a7874c196fa0eb5197789ed4b0ff5349a Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 00:57:47 +0800 Subject: [PATCH 5/9] Update dataset path and name in mmbench_cc.yaml --- lmms_eval/tasks/mmbench/mmbench_cc.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lmms_eval/tasks/mmbench/mmbench_cc.yaml b/lmms_eval/tasks/mmbench/mmbench_cc.yaml index 0ec13674f..238aa10c9 100644 --- a/lmms_eval/tasks/mmbench/mmbench_cc.yaml +++ b/lmms_eval/tasks/mmbench/mmbench_cc.yaml @@ -1,9 +1,8 @@ -dataset_path: lmms-lab/MMBench_CN +dataset_path: lmms-lab/MMBench +dataset_name: cc dataset_kwargs: token: True -group: mmbench_cn task: "mmbench_cn_cc" -dataset_name: "chinese_culture" test_split: test output_type: generate_until doc_to_visual: !function cc_utils.mmbench_doc_to_visual From a519fc11e27d944177b0bb1a0ddfa69e3e453401 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 01:03:42 +0800 Subject: [PATCH 6/9] Add generate_submission_file function to cc_utils.py --- lmms_eval/tasks/mmbench/cc_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lmms_eval/tasks/mmbench/cc_utils.py b/lmms_eval/tasks/mmbench/cc_utils.py index c049613f2..7009e012e 100644 --- a/lmms_eval/tasks/mmbench/cc_utils.py +++ b/lmms_eval/tasks/mmbench/cc_utils.py @@ -7,6 +7,7 @@ eval_logger = logging.getLogger("lmms-eval") from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f: raw_data = f.readlines() @@ -66,9 +67,9 @@ def mmbench_cn_cc_process_results(doc, results): return data -def mmbench_cn_cc_aggregate_results(results): +def mmbench_cn_cc_aggregate_results(results, args): df = pd.DataFrame(results) - os.makedirs("./submissions", exist_ok=True) - with pd.ExcelWriter("./submissions/mmbench_cn_cc_results.xlsx") as writer: + file = generate_submission_file("mmbench_cn_cc_results.xlsx", args) + with pd.ExcelWriter(file) as writer: df.to_excel(writer, index=False) - eval_logger.info(f"Saved results to mmbench_cn_cc_results.xlsx") + eval_logger.info(f"Saved results to {file}") From 2406b57a6ded47f43f6b2b223cf53d1042283e32 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 01:06:44 +0800 Subject: [PATCH 7/9] Refactor mmbench_aggregate_test_results function to use a helper function for generating submission file path --- lmms_eval/tasks/mmbench/cn_utils.py | 3 +-- lmms_eval/tasks/mmbench/en_utils.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lmms_eval/tasks/mmbench/cn_utils.py b/lmms_eval/tasks/mmbench/cn_utils.py index 1010eb1b1..812b9aa38 100644 --- a/lmms_eval/tasks/mmbench/cn_utils.py +++ b/lmms_eval/tasks/mmbench/cn_utils.py @@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args): def mmbench_aggregate_test_results(results, args): df = pd.DataFrame(results) - Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True) - excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_cn_test_results.xlsx" + excel_write_path = generate_submission_file("mmbench_cn_test_results.xlsx", args) with pd.ExcelWriter(excel_write_path) as writer: df.to_excel(writer, index=False) eval_logger.info(f"Saved results to {excel_write_path}") diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py index 4ed2260e4..932c35019 100644 --- a/lmms_eval/tasks/mmbench/en_utils.py +++ b/lmms_eval/tasks/mmbench/en_utils.py @@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args): def mmbench_aggregate_test_results(results, args): df = pd.DataFrame(results) - Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True) - excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_en_test_results.xlsx" + excel_write_path = generate_submission_file("mmbench_en_test_results.xlsx", args) with pd.ExcelWriter(excel_write_path) as writer: df.to_excel(writer, index=False) eval_logger.info(f"Saved results to {excel_write_path}") From 966ef9c44a8b45d4968c9f27a3e67f8dd1964b5f Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 12:58:06 +0800 Subject: [PATCH 8/9] fix mmbench --- lmms_eval/tasks/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lmms_eval/tasks/__init__.py b/lmms_eval/tasks/__init__.py index dc77ed47f..d99f7e60e 100644 --- a/lmms_eval/tasks/__init__.py +++ b/lmms_eval/tasks/__init__.py @@ -90,7 +90,6 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: # the user defines the appropriate verbosity. except ModuleNotFoundError as e: eval_logger.debug(f"{yaml_path}: {e}. Config will not be added to registry.") - print(f"{yaml_path}: {e}. Config will not be added to registry.") except Exception as error: import traceback From 077737b64bd893f272afba64f512d38d311804b7 Mon Sep 17 00:00:00 2001 From: Fanyi Pu Date: Sat, 16 Mar 2024 12:58:31 +0800 Subject: [PATCH 9/9] lint --- lmms_eval/tasks/__init__.py | 2 +- lmms_eval/tasks/mmbench/en_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lmms_eval/tasks/__init__.py b/lmms_eval/tasks/__init__.py index d99f7e60e..e6e25d857 100644 --- a/lmms_eval/tasks/__init__.py +++ b/lmms_eval/tasks/__init__.py @@ -92,7 +92,7 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None: eval_logger.debug(f"{yaml_path}: {e}. Config will not be added to registry.") except Exception as error: import traceback - + eval_logger.debug(f"Failed to load config in {yaml_path}. Config will not be added to registry\n" f"Error: {error}\n" f"Traceback: {traceback.format_exc()}") return 0 diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py index 932c35019..26e260006 100644 --- a/lmms_eval/tasks/mmbench/en_utils.py +++ b/lmms_eval/tasks/mmbench/en_utils.py @@ -48,7 +48,7 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None): if model_specific_prompt_kwargs: query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}" - + return query_prompt