From 27ea9c0055a8abf3a8198829b8617018479918e2 Mon Sep 17 00:00:00 2001
From: Dannoopsy
Date: Thu, 27 Jun 2024 17:17:29 +0000
Subject: [PATCH 1/2] add mmbench_ru_dev

---
 .../mmbench/_default_template_mmbench_ru_yaml |  24 ++++
 lmms_eval/tasks/mmbench/mmbench.yaml          |   1 +
 lmms_eval/tasks/mmbench/mmbench_ru_dev.yaml   |  10 ++
 lmms_eval/tasks/mmbench/ru_utils.py           | 128 ++++++++++++++++++
 4 files changed, 163 insertions(+)
 create mode 100644 lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml
 create mode 100644 lmms_eval/tasks/mmbench/mmbench_ru_dev.yaml
 create mode 100644 lmms_eval/tasks/mmbench/ru_utils.py

diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml
new file mode 100644
index 00000000..fad7f927
--- /dev/null
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml
@@ -0,0 +1,24 @@
+dataset_path: deepvk/MMBench-ru
+dataset_kwargs:
+  token: True
+doc_to_target: "answer"
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+doc_to_visual: !function ru_utils.mmbench_doc_to_visual
+doc_to_text: !function ru_utils.mmbench_doc_to_text
+doc_to_target: "answer"
+process_results: !function ru_utils.mmbench_process_results
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
+output_type: generate_until
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
diff --git a/lmms_eval/tasks/mmbench/mmbench.yaml b/lmms_eval/tasks/mmbench/mmbench.yaml
index 821065ee..f2546aa5 100755
--- a/lmms_eval/tasks/mmbench/mmbench.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench.yaml
@@ -5,6 +5,7 @@ task:
   - mmbench_cn_dev
   - mmbench_cn_test
   - mmbench_cn_cc
+  - mmbench_ru_dev
 metadata:
   version: 0.0
   sys_prompt: "There are several options:"
diff --git a/lmms_eval/tasks/mmbench/mmbench_ru_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_ru_dev.yaml
new file mode 100644
index 00000000..46407ae4
--- /dev/null
+++ b/lmms_eval/tasks/mmbench/mmbench_ru_dev.yaml
@@ -0,0 +1,10 @@
+task: "mmbench_ru_dev"
+test_split: dev
+include: _default_template_mmbench_ru_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function ru_utils.mmbench_aggregate_dev_results_eval
+    higher_is_better: true
+  - metric: submission
+    aggregation: !function ru_utils.mmbench_aggregate_dev_results_submission
+    higher_is_better: true
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmbench/ru_utils.py b/lmms_eval/tasks/mmbench/ru_utils.py
new file mode 100644
index 00000000..3ff515c7
--- /dev/null
+++ b/lmms_eval/tasks/mmbench/ru_utils.py
@@ -0,0 +1,128 @@
+import yaml
+import os
+from pathlib import Path
+import pandas as pd
+import json
+
+from loguru import logger as eval_logger
+from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+else:
+    API_URL = "YOUR_API_URL"
+    API_KEY = "YOUR_API_KEY"
+
+
+mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
+
+
+def mmbench_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+
+def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    option_candidate = ["A", "B", "C", "D", "E"]
+    options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
+
+    data = {
+        # "img": doc["image"],
+        "question": doc["question"],
+        "answer": doc.get("answer", None),
+        "options": options_prompt,
+        "category": doc["category"],
+        "L2-category": doc["l2-category"],
+        "options_dict": options_dict,
+        "index": doc["index"],
+        "hint": doc["hint"],
+        "source": doc["source"],
+        "split": doc["split"],
+    }
+
+    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"
+
+    if model_specific_prompt_kwargs:
+        query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
+
+    return query_prompt
+
+
+def mmbench_process_results(doc, results):
+    model_response = results[0].strip()
+    data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["l2-category"],
+        },
+        "submission": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["l2-category"],
+        },
+    }
+    option_candidate = ["A", "B", "C", "D", "E"]
+    for c in option_candidate:
+        data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
+    return data
+
+
+def mmbench_aggregate_dev_results_eval(results, args):
+    print(f"============= MMBench-RU(Dev) Detailed Results =============")
+    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
+    file = generate_submission_file("mmbench_ru_dev_results.json", args)
+    details_info = {
+        "overall_acc": overall_acc,
+        "category_acc": category_acc,
+        "l2_category_acc": l2_category_acc,
+    }
+    with open(file, "w") as f:
+        json.dump(details_info, f)
+    return overall_acc * 100
+
+
+def mmbench_aggregate_dev_results_submission(results, args):
+    df = pd.DataFrame(results)
+    excel_write_path = generate_submission_file("mmbench_ru_dev_results.xlsx", args)
+    with pd.ExcelWriter(excel_write_path) as writer:
+        df.to_excel(writer, index=False)
+    eval_logger.info(f"Saved results to {excel_write_path}")
+
+
+def mmbench_aggregate_test_results(results, args):
+    df = pd.DataFrame(results)
+    excel_write_path = generate_submission_file("mmbench_ru_test_results.xlsx", args)
+    with pd.ExcelWriter(excel_write_path) as writer:
+        df.to_excel(writer, index=False)
+    eval_logger.info(f"Saved results to {excel_write_path}")

From ba7081c0abac840002d320e30733e891298dfa11 Mon Sep 17 00:00:00 2001
From: Dannoopsy <63581325+Dannoopsy@users.noreply.github.com>
Date: Fri, 28 Jun 2024 12:21:05 +0300
Subject: [PATCH 2/2] change prompt to ru

---
 lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml
index fad7f927..993cd52a 100644
--- a/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_ru_yaml
@@ -5,7 +5,7 @@ doc_to_target: "answer"
 model_specific_prompt_kwargs:
   default:
     pre_prompt: ""
-    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+    post_prompt: "\nВыбери правильный вариант ответа буквой."
 doc_to_visual: !function ru_utils.mmbench_doc_to_visual
 doc_to_text: !function ru_utils.mmbench_doc_to_text
 doc_to_target: "answer"
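
Illustration only, not part of either patch: the snippet below sketches the prompt that ru_utils.mmbench_doc_to_text would assemble for a single MMBench-ru record after the second commit swaps in the Russian post_prompt. The sample doc dict is invented, and the inline option formatting is a simplified stand-in for MMBench_Evaluator.create_options_prompt; only the hint/question/options/post_prompt assembly mirrors the code added in ru_utils.py.

# Minimal sketch of the prompt assembly in ru_utils.mmbench_doc_to_text.
# The doc dict is a hypothetical record; the options string below is a
# simplified stand-in for MMBench_Evaluator.create_options_prompt.
import pandas as pd

doc = {
    "hint": "nan",  # MMBench stores missing hints as the literal string "nan"
    "question": "Какой объект изображён на картинке?",
    "A": "Собака",
    "B": "Кошка",
    "C": "Автомобиль",
    "D": "Дерево",
}
post_prompt = "\nВыбери правильный вариант ответа буквой."  # from the updated template

# Stand-in option formatting: list whichever of A-E are present as "A. ...".
options_prompt = "\n".join(f"{letter}. {doc[letter]}" for letter in ["A", "B", "C", "D", "E"] if letter in doc)

# Same hint handling as mmbench_doc_to_text: drop the hint when it is NaN
# or the literal string "nan", otherwise prepend it to the question.
if pd.notna(doc["hint"]) and doc["hint"] != "nan":
    query_prompt = f"{doc['hint']} {doc['question']} {options_prompt}"
else:
    query_prompt = f"{doc['question']} {options_prompt}"

print(query_prompt + post_prompt)

The generation settings and GPT-based scoring path are shared with the existing MMBench dev splits, so the second commit only changes the language of the instruction appended to the query.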