
Merge pull request #13 from EvolvingLMMs-Lab/pufanyi/mmbench_fix
[Tasks] Fix MMBench
Luodian authored Mar 16, 2024
2 parents 3386830 + 077737b commit 92dc8e8
Showing 12 changed files with 69 additions and 113 deletions.
22 changes: 22 additions & 0 deletions lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
@@ -0,0 +1,22 @@
dataset_path: lmms-lab/MMBench
dataset_kwargs:
  token: True
doc_to_target: "answer"
dataset_name: "cn"
output_type: generate_until
doc_to_visual: !function cn_utils.mmbench_doc_to_visual
doc_to_text: !function cn_utils.mmbench_doc_to_text
generation_kwargs:
  max_new_tokens: 256
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function cn_utils.mmbench_process_results
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
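Note that the `!function cn_utils.mmbench_doc_to_visual` values above are not standard YAML: the harness resolves them with a custom tag constructor that imports the named helper when the config is loaded. A minimal PyYAML sketch of that technique follows; the constructor name and the hard-coded package prefix are illustrative assumptions, not lmms-eval's actual loader code.

import importlib

import yaml


def make_function_constructor(package):
    """Return a PyYAML constructor that maps '!function mod.attr' to a callable."""

    def construct_function(loader, node):
        value = loader.construct_scalar(node)  # e.g. "cn_utils.mmbench_doc_to_text"
        module_name, attr = value.rsplit(".", 1)
        module = importlib.import_module(f"{package}.{module_name}")
        return getattr(module, attr)

    return construct_function


# Illustrative registration; the real loader may resolve module paths differently.
yaml.add_constructor("!function", make_function_constructor("lmms_eval.tasks.mmbench"))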
25 changes: 25 additions & 0 deletions lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
@@ -0,0 +1,25 @@
dataset_path: lmms-lab/MMBench
dataset_kwargs:
  token: True
doc_to_target: "answer"
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
doc_to_visual: !function en_utils.mmbench_doc_to_visual
doc_to_text: !function en_utils.mmbench_doc_to_text
doc_to_target: "answer"
process_results: !function en_utils.mmbench_process_results
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
output_type: generate_until
dataset_name: "en"
generation_kwargs:
  until:
    - "ASSISTANT:"
  max_new_tokens: 1024
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
9 changes: 5 additions & 4 deletions lmms_eval/tasks/mmbench/cc_utils.py
@@ -7,6 +7,7 @@

eval_logger = logging.getLogger("lmms-eval")
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
    raw_data = f.readlines()
@@ -66,9 +67,9 @@ def mmbench_cn_cc_process_results(doc, results):
    return data


def mmbench_cn_cc_aggregate_results(results):
def mmbench_cn_cc_aggregate_results(results, args):
    df = pd.DataFrame(results)
    os.makedirs("./submissions", exist_ok=True)
    with pd.ExcelWriter("./submissions/mmbench_cn_cc_results.xlsx") as writer:
    file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
    with pd.ExcelWriter(file) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to mmbench_cn_cc_results.xlsx")
    eval_logger.info(f"Saved results to {file}")
3 changes: 1 addition & 2 deletions lmms_eval/tasks/mmbench/cn_utils.py
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):

def mmbench_aggregate_test_results(results, args):
    df = pd.DataFrame(results)
    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_cn_test_results.xlsx"
    excel_write_path = generate_submission_file("mmbench_cn_test_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")
9 changes: 4 additions & 5 deletions lmms_eval/tasks/mmbench/en_utils.py
@@ -36,15 +36,15 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
"answer": doc.get("answer", None),
"options": options_prompt,
"category": doc["category"],
"L2-category": doc["l2-category"],
"L2-category": doc["L2-category"],
"options_dict": options_dict,
"index": doc["index"],
"hint": doc["hint"],
"source": doc["source"],
"split": doc["split"],
}

query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) else f"{data['question']} {data['options']}"
query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"

if model_specific_prompt_kwargs:
query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
@@ -64,7 +64,7 @@ def mmbench_process_results(doc, results):
"source": doc["source"],
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["l2-category"],
"L2-category": doc["L2-category"],
}
}
option_candidate = ["A", "B", "C", "D", "E"]
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):

def mmbench_aggregate_test_results(results, args):
    df = pd.DataFrame(results)
    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_en_test_results.xlsx"
    excel_write_path = generate_submission_file("mmbench_en_test_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")
5 changes: 2 additions & 3 deletions lmms_eval/tasks/mmbench/mmbench_cc.yaml
@@ -1,9 +1,8 @@
dataset_path: lmms-lab/MMBench_CN
dataset_path: lmms-lab/MMBench
dataset_name: cc
dataset_kwargs:
  token: True
group: mmbench_cn
task: "mmbench_cn_cc"
dataset_name: "chinese_culture"
test_split: test
output_type: generate_until
doc_to_visual: !function cc_utils.mmbench_doc_to_visual
4 changes: 3 additions & 1 deletion lmms_eval/tasks/mmbench/mmbench_cn.yaml
@@ -5,4 +5,6 @@ task:
  - mmbench_cn_cc
metadata:
  version: 0.0
  sys_prompt: "有如下几个选项:"
  gpt_eval_model_name: "gpt-3.5-turbo"
  quick_extract: true
  sys_prompt: "有如下几个选项:"
30 changes: 2 additions & 28 deletions lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
@@ -1,33 +1,7 @@
dataset_path: lmms-lab/MMBench_CN
dataset_kwargs:
  token: True
group: mmbench_cn
task: "mmbench_cn_dev"
dataset_name: "default"
test_split: "dev"
output_type: generate_until
doc_to_visual: !function cn_utils.mmbench_doc_to_visual
doc_to_text: !function cn_utils.mmbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 256
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function cn_utils.mmbench_process_results
metric_list:
  - metric: submission
    higher_is_better: true
    aggregation: !function cn_utils.mmbench_aggregate_dev_results
metadata:
  version: 0.0
  gpt_eval_model_name: "gpt-3.5-turbo"
  quick_extract: true

model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
include: _default_template_mmbench_cn_yaml
30 changes: 2 additions & 28 deletions lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
@@ -1,33 +1,7 @@
dataset_path: lmms-lab/MMBench_CN
dataset_kwargs:
  token: True
task: "mmbench_cn_test"
dataset_name: "default"
task: mmbench_cn_test
test_split: test
output_type: generate_until
doc_to_visual: !function cn_utils.mmbench_doc_to_visual
doc_to_text: !function cn_utils.mmbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 256
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function cn_utils.mmbench_process_results
metric_list:
  - metric: submission
    aggregation: !function cn_utils.mmbench_aggregate_test_results
    higher_is_better: true
metadata:
  version: 0.0
  gpt_eval_model_name: "gpt-3.5-turbo"
  quick_extract: true

model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
include: _default_template_mmbench_cn_yaml
8 changes: 0 additions & 8 deletions lmms_eval/tasks/mmbench/mmbench_en.yaml
@@ -5,11 +5,3 @@ task:
metadata:
  version: 0.0
  sys_prompt: "There are several options:"

model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
20 changes: 2 additions & 18 deletions lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
@@ -1,23 +1,7 @@
dataset_path: lmms-lab/MMBench_EN
dataset_kwargs:
  token: True
task: "mmbench_en_dev"
test_split: dev
output_type: generate_until
doc_to_visual: !function en_utils.mmbench_doc_to_visual
doc_to_text: !function en_utils.mmbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  until:
    - "ASSISTANT:"
  max_new_tokens: 1024
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function en_utils.mmbench_process_results
include: _default_template_mmbench_en_yaml
metric_list:
  - metric: submission
    aggregation: !function en_utils.mmbench_aggregate_dev_results
metadata:
  version: 0.0
    higher_is_better: true
17 changes: 1 addition & 16 deletions lmms_eval/tasks/mmbench/mmbench_en_test.yaml
@@ -1,22 +1,7 @@
dataset_path: lmms-lab/MMBench_EN
dataset_kwargs:
  token: True
task: "mmbench_en_test"
test_split: test
output_type: generate_until
doc_to_visual: !function en_utils.mmbench_doc_to_visual
doc_to_text: !function en_utils.mmbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 256
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function en_utils.mmbench_process_results
include: _default_template_mmbench_en_yaml
metric_list:
  - metric: submission
    aggregation: !function en_utils.mmbench_aggregate_test_results
    higher_is_better: true
metadata:
  version: 0.0
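With this commit, each split config reduces to an `include:` of the shared template plus a few overrides. A sketch of the merge semantics this pattern relies on, assuming the including file's keys override the template's and the template sits in the same directory; the actual resolution is done by lmms-eval's config loader, not this code:

from pathlib import Path

import yaml


def load_task_config(path):
    """Load a task YAML, splicing in its include: template (sketch only)."""
    path = Path(path)
    # Note: parsing these particular files also requires the !function tag
    # constructor to be registered (see the earlier sketch).
    config = yaml.full_load(path.read_text())
    template_name = config.pop("include", None)
    if template_name is not None:
        template = yaml.full_load((path.parent / template_name).read_text())
        template.update(config)  # keys in the including file win
        config = template
    return config


# e.g. load_task_config("lmms_eval/tasks/mmbench/mmbench_en_test.yaml")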
