
Commit

update yaml and utils
JvThunder committed Mar 15, 2024
1 parent 1fdfb3c commit a92cc3a
Showing 4 changed files with 35 additions and 39 deletions.
6 changes: 4 additions & 2 deletions lmms_eval/tasks/olympiadbench/olympiadbench.yaml
@@ -1,3 +1,5 @@
-group: flickr30k
+group: olympiadbench
 task:
-- flickr30k_test
+- olympiadbench_test
+metadata:
+- version: 0.0
4 changes: 2 additions & 2 deletions lmms_eval/tasks/olympiadbench/olympiadbench_evals.py
@@ -5,13 +5,13 @@
 import math
 
 # how to use
-# scorer = AutoScoringJudge()
+# scorer = OlympiadBenchEvaluator()
 # exp1 = "10^{10^{10^{10}}}"
 # exp2 = "10^{10}"
 # precision = 1e-4
 # res = scorer.judge(exp1, exp2, precision)
 
-class AutoScoringJudge:
+class OlympiadBenchEvaluator:
     def __init__(self):
         # Map of special symbols to their replacements
         self.special_signal_map = {
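Note: the "how to use" comment above maps to the minimal sketch below. It assumes the import path introduced in this commit's utils.py and that judge() returns a boolean; neither is verified beyond the comment itself.

    from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator

    scorer = OlympiadBenchEvaluator()
    exp1 = "10^{10^{10^{10}}}"
    exp2 = "10^{10}"
    precision = 1e-4
    res = scorer.judge(exp1, exp2, precision)  # assumed to return True/False
    print(res)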
23 changes: 15 additions & 8 deletions lmms_eval/tasks/olympiadbench/olympiadbench_test.yaml
@@ -1,25 +1,32 @@
 dataset_path: lmms-lab/OlympiadBench
 dataset_kwargs:
   token: True
-task : "olympiad_bench"
+task : "olympiadbench_test"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.olympiadbench_doc_to_visual
 doc_to_text: !function utils.olympiadbench_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  max_new_tokens: 64
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
-process_results: !function utils.olympiadbench_process_result
+process_results: !function utils.olympiadbench_process_results
 metric_list:
   - metric: human_eval
     aggregation: !function utils.human_eval
     higher_is_better: True
-  - metric: submission
-    aggregation: !function utils.mathvista_aggregate_results
-    higher_is_better: true
+  - metric: auto_scoring
+    aggregation: !function utils.auto_scoring
+    higher_is_better: True
 metadata:
 - version: 0.0
 
+model_specific_prompt_kwargs:
+  default:
+    shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step"
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
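Note: conceptually, lmms-eval wires these yaml hooks together roughly as sketched below. The loop is illustrative only (the harness's real driver is not part of this commit), model.generate and collected are hypothetical names, and the per-document scores dict is keyed by the metric names declared in metric_list above; the aggregation signature follows the stubs in utils.py.

    # Illustrative sketch, not harness code: how the yaml hooks relate.
    collected = []
    for doc in test_split:
        prediction = model.generate(olympiadbench_doc_to_text(doc))     # uses generation_kwargs
        scores = olympiadbench_process_results(doc, [prediction])       # dict keyed by metric name
        collected.append(scores["auto_scoring"])
    final = auto_scoring(collected, "auto_scoring", args)               # aggregation hook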
41 changes: 14 additions & 27 deletions lmms_eval/tasks/olympiadbench/utils.py
@@ -1,43 +1,30 @@
 import os
 import json
-from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
-from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
-from pycocotools.coco import COCO
-from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
-import datetime
+from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
 import logging
 
 eval_logger = logging.getLogger("lmms-eval")
 
 dir_name = os.path.dirname(os.path.abspath(__file__))
 
+olympiadbench_evaluator = OlympiadBenchEvaluator()
 
 def olympiadbench_doc_to_visual(doc):
-    return [doc["image"].convert("RGB")]
+    return [image.convert("RGB") for image in doc["images"]]
 
 def olympiadbench_doc_to_text(doc):
-    # question = "Please carefully observe the image and come up with a caption for the image"
-    return f"Provide a one-sentence caption for the provided image."
-
-
-def olympiadbench_process_result(doc, result):
-    """
-    Args:
-        doc: a instance of the eval dataset
-        results: [pred]
-    Returns:
-        a dictionary with key: metric name, value: metric value
-    """
-    pred = result[0] if len(result) > 0 else ""
-    image_id = int(doc["img_id"])
-
-    data_dict = {"answer": doc["caption"], "pred": pred, "image_id": image_id}
+    problem = {
+        "question_type": doc["question_type"],
+        "answer_type": doc["answer_type"]
+    }
+    pass
 
-    return {f"flickr_{metric}": data_dict for metric in FLICKR_METRICS}
+def olympiadbench_process_results(doc, result):
+    pass
 
+def olympiadbench_aggregation_results(results, metric, args):
+    pass
 
-def olympiadbench_aggregation_result(results, metric, args):
+def auto_scoring(results, metric, args):
     pass
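Note: the new olympiadbench functions are still stubs in this commit. One plausible completion, reusing the OlympiadBenchEvaluator instance created above: the "answer" field matches doc_to_target in the yaml and the judge() signature matches the usage comment in olympiadbench_evals.py, while the score key and the 1e-4 precision are assumptions.

    # Hypothetical completion of the stubs -- not part of this commit.
    def olympiadbench_process_results(doc, result):
        pred = result[0] if len(result) > 0 else ""
        # judge(expression, gold_answer, precision) per olympiadbench_evals.py
        correct = olympiadbench_evaluator.judge(pred, doc["answer"], 1e-4)
        return {"auto_scoring": 1.0 if correct else 0.0}

    def auto_scoring(results, metric, args):
        # mean of the per-document scores
        return sum(results) / len(results) if results else 0.0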
