diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index b77adb48..b618f8a3 100755 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -3,6 +3,9 @@ import hf_transfer from loguru import logger import sys +import hf_transfer + +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" logger.remove() logger.add(sys.stdout, level="WARNING") @@ -33,9 +36,7 @@ "phi3v": "Phi3v", "tinyllava": "TinyLlava", "llava_hf": "LlavaHf", - "longva": "LongVA", - "llava_onevision": "Llava_OneVision", - "llava_hf": "LlavaHf", + "llava_onevision": "LlavaOneVision", "longva": "LongVA", "vila": "VILA", } @@ -43,9 +44,8 @@ for model_name, model_class in AVAILABLE_MODELS.items(): try: exec(f"from .{model_name} import {model_class}") - except ImportError as e: - # logger.warning(f"Failed to import {model_class} from {model_name}: {e}") - pass + except Exception as e: + logger.debug(f"Failed to import {model_class} from {model_name}: {e}") if os.environ.get("LMMS_EVAL_PLUGINS", None): # Allow specifying other packages to import models from @@ -54,9 +54,5 @@ for model_name, model_class in getattr(m, "AVAILABLE_MODELS").items(): try: exec(f"from {plugin}.models.{model_name} import {model_class}") - except ImportError: - pass - -import hf_transfer - -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + except ImportError as e: + logger.debug(f"Failed to import {model_class} from {model_name}: {e}") diff --git a/lmms_eval/models/internvl2.py b/lmms_eval/models/internvl2.py index 730493d9..5843bdd1 100644 --- a/lmms_eval/models/internvl2.py +++ b/lmms_eval/models/internvl2.py @@ -213,7 +213,7 @@ def generate_until(self, requests) -> List[str]: for k, v in gen_kwargs.items(): if k not in DEFAULT_GEN_KWARGS: pop_keys.append(k) - + for k in pop_keys: gen_kwargs.pop(k) diff --git a/lmms_eval/models/llava_onevision.py b/lmms_eval/models/llava_onevision.py index 3f27c803..c5c9b5b5 100644 --- a/lmms_eval/models/llava_onevision.py +++ b/lmms_eval/models/llava_onevision.py @@ -54,15 +54,6 @@ except ImportError as e: eval_logger.debug(f"LLaVA is not installed. Please install LLaVA to use this model.\nError: {e}") -# Import LLaVA-vid modules -try: - from llavavid.model.language_model.llava_qwen import LlavaQwenConfig - from llavavid.model.language_model.llava_llama import LlavaConfig - - AutoConfig.register("llava_qwen", LlavaQwenConfig) - AutoConfig.register("llava_llama", LlavaConfig) -except ImportError as e: - eval_logger.debug(f"LLaVA-vid is not installed. Error: {e}") # Determine best attention implementation if version.parse(torch.__version__) >= version.parse("2.1.2"): diff --git a/lmms_eval/models/vila.py b/lmms_eval/models/vila.py index 295bf123..5d48af83 100755 --- a/lmms_eval/models/vila.py +++ b/lmms_eval/models/vila.py @@ -33,9 +33,7 @@ from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria from llava.mm_utils import process_images except ImportError as e: - print(e) - # import pdb;pdb.set_trace() - eval_logger.debug("VILA is not installed. Please install VILA to use this model.") + eval_logger.debug(f"VILA is not installed. Please install VILA to use this model. 
Error: {e}") @register_model("vila") @@ -81,7 +79,6 @@ def __init__( self.max_frames_num = max_frames_num # self._config = AutoConfig.from_pretrained(self.pretrained) - # import pdb; pdb.set_trace() self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, self.model_name, device_map=self.device_map, attn_implementation=attn_implementation) self.model.image_processor = self._image_processor @@ -202,7 +199,6 @@ def load_video(self, video_path, max_frames_num): return [Image.fromarray(img) for img in spare_frames] except Exception as e: eval_logger.error(f"Failed to load video {video_path} with error: {e}") - # import pdb;pdb.set_trace() return [Image.new("RGB", (448, 448), (0, 0, 0))] * max_frames_num def tok_decode(self, tokens): @@ -278,31 +274,23 @@ def generate_until(self, requests) -> List[str]: pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: - # if self.task_dict[task][split][doc_id]["duration"] != "short": - # # import pdb;pdb.set_trace() - # res.append("A") - # pbar.update(1) - # continue # encode, pad, and truncate contexts for this batch visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] visuals = self.flatten(visuals) num_video_frames = self.model.config.num_video_frames videos = [] - # import pdb;pdb.set_trace() if self.max_frames_num == 0: images = [Image.new("RGB", (448, 448), (0, 0, 0))] * num_video_frames video = process_images(images, self.model.image_processor, self.model.config).half().cuda() videos.append(video) else: for visual in visuals: - # images, video_loading_succeed = LazySupervisedDataset._load_video(visual, num_video_frames, self.model) - # import pdb;pdb.set_trace() if self.video_decode_backend == "decord": images = self.load_video(visual, num_video_frames) elif self.video_decode_backend == "pyav": images = read_video_pyav(visual, num_frm=num_video_frames) - # import pdb;pdb.set_trace() + video = process_images(images, self.model.image_processor, self.model.config).half().cuda() videos.append(video) @@ -370,7 +358,6 @@ def generate_until(self, requests) -> List[str]: outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() print("Question: ", cur_prompt) print("Answer: ", outputs) - # import pdb;pdb.set_trace() res.append(outputs) pbar.update(1) return res diff --git a/lmms_eval/tasks/llava_interleave_bench/_default_template_interleave_yaml b/lmms_eval/tasks/llava_interleave_bench/_default_template_interleave_yaml index 25b44461..8c6bbe64 100644 --- a/lmms_eval/tasks/llava_interleave_bench/_default_template_interleave_yaml +++ b/lmms_eval/tasks/llava_interleave_bench/_default_template_interleave_yaml @@ -2,7 +2,7 @@ output_type: generate_until generation_kwargs: until: - "ASSISTANT:" - image_aspect_ratio: original + image_aspect_ratio: pad metadata: version: 0.0 api_type : openai diff --git a/lmms_eval/tasks/llava_interleave_bench/utils.py b/lmms_eval/tasks/llava_interleave_bench/utils.py index 77175600..3c871d3c 100644 --- a/lmms_eval/tasks/llava_interleave_bench/utils.py +++ b/lmms_eval/tasks/llava_interleave_bench/utils.py @@ -22,6 +22,16 @@ nlrv2 = ["NLVR2_Mantis"] qbench = ["QBench"] +scan_qa = ["ScanQA"] +alfred = ["ALFRED"] +nuscenes = ["nuscenes"] +scannet_chat = ["ScanNet_chat"] +scannet_task = ["ScanNet_task"] +blink = ["BLINK"] +math_verse = ["MathVerse"] +sci_verse = ["SciVerse"] +mantis = ["Mantis"] + def doc_to_visual(doc): 
max_visual_count = 16 @@ -184,9 +194,19 @@ def overall_score(results): "Puzzle": puzzle, "NLVR2": nlrv2, "QBench": qbench, + "ScanQA": scan_qa, + "ALFRED": alfred, + "nuscenes": nuscenes, + "ScanNet_chat": scannet_chat, + "ScanNet_task": scannet_task, + "BLINK": blink, + "MathVerse": math_verse, + "SciVerse": sci_verse, + "Mantis": mantis, } category_scores = {} + matched_subtasks = set() eval_logger.info(f"Evaluation Sub-Task Results:") for category, subtasks in categories.items(): @@ -196,11 +216,15 @@ def overall_score(results): if result["sub_task"] in subtasks: count += 1 score += result["score"] + matched_subtasks.add(result["sub_task"]) if count > 0: avg_score = score / count category_scores[category] = avg_score eval_logger.info(f"{category}: {avg_score:.3f}") + if not matched_subtasks: + raise ValueError("No subtasks were matched in the results. Check if the subtask names are correct.") + # Calculate overall score total_score = sum(category_scores.values()) num_categories = len(category_scores) diff --git a/lmms_eval/tasks/mathvista/mathvista.yaml b/lmms_eval/tasks/mathvista/mathvista.yaml index b108ca59..24a1b09e 100755 --- a/lmms_eval/tasks/mathvista/mathvista.yaml +++ b/lmms_eval/tasks/mathvista/mathvista.yaml @@ -4,5 +4,5 @@ task: - mathvista_test metadata: version: 0.0 - gpt_eval_model_name: "gpt-4-0613" + gpt_eval_model_name: "gpt-3.5-turbo" quick_extract: false \ No newline at end of file diff --git a/lmms_eval/tasks/mathvista/mathvista_test.yaml b/lmms_eval/tasks/mathvista/mathvista_test.yaml index 31fc2a45..35259bd2 100755 --- a/lmms_eval/tasks/mathvista/mathvista_test.yaml +++ b/lmms_eval/tasks/mathvista/mathvista_test.yaml @@ -8,8 +8,6 @@ doc_to_visual: !function utils.mathvista_doc_to_visual doc_to_text: !function utils.mathvista_doc_to_text doc_to_target: "answer" generation_kwargs: - until: - - "ASSISTANT:" max_new_tokens: 1024 temperature: 0 top_p: 1.0 @@ -23,7 +21,4 @@ metric_list: model_specific_prompt_kwargs: default: - shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" -model_specific_generation_kwargs: - llava: - image_aspect_ratio: original \ No newline at end of file + shot_type: "reason-first" # can be "reason-first", "solution", "step-by-step" \ No newline at end of file diff --git a/lmms_eval/tasks/mathvista/mathvista_testmini.yaml b/lmms_eval/tasks/mathvista/mathvista_testmini.yaml index 3f67431b..289ece3c 100755 --- a/lmms_eval/tasks/mathvista/mathvista_testmini.yaml +++ b/lmms_eval/tasks/mathvista/mathvista_testmini.yaml @@ -1,34 +1,9 @@ -dataset_path: AI4Math/MathVista -dataset_kwargs: - token: True -task: "mathvista_testmini" -test_split: testmini -output_type: generate_until -doc_to_visual: !function utils.mathvista_doc_to_visual -doc_to_text: !function utils.mathvista_doc_to_text -doc_to_target: "answer" -generation_kwargs: - until: - - "ASSISTANT:" - max_new_tokens: 1024 - temperature: 0 - top_p: 1.0 - num_beams: 1 - do_sample: false -process_results: !function utils.mathvista_process_results -metric_list: - - metric: gpt_eval_score - aggregation: !function utils.mathvista_aggregate_results - higher_is_better: true - -model_specific_prompt_kwargs: - default: - shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" - shot: 0 - use_caption: False - use_ocr: False - phi3v: - shot_type: "solution" -model_specific_generation_kwargs: - llava: - image_aspect_ratio: original \ No newline at end of file +group: mathvista_testmini +task: + - mathvista_testmini_cot + - mathvista_testmini_solution + - 
mathvista_testmini_format +metadata: + version: 0.0 + gpt_eval_model_name: "gpt-3.5-turbo" + quick_extract: false \ No newline at end of file diff --git a/lmms_eval/tasks/mathvista/mathvista_testmini_cot.yaml b/lmms_eval/tasks/mathvista/mathvista_testmini_cot.yaml new file mode 100755 index 00000000..a801dc6d --- /dev/null +++ b/lmms_eval/tasks/mathvista/mathvista_testmini_cot.yaml @@ -0,0 +1,29 @@ +dataset_path: AI4Math/MathVista +dataset_kwargs: + token: True +task: "mathvista_testmini_cot" +test_split: testmini +output_type: generate_until +doc_to_visual: !function utils.mathvista_doc_to_visual +doc_to_text: !function utils.mathvista_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 1024 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false +process_results: !function utils.mathvista_process_results +metric_list: + - metric: gpt_eval_score + aggregation: !function utils.mathvista_aggregate_results + higher_is_better: true + +model_specific_prompt_kwargs: + default: + shot_type: "step-by-step" # can be "reason-first", "solution", "step-by-step" + shot: 0 + use_caption: False + use_ocr: False + phi3v: + shot_type: "solution" \ No newline at end of file diff --git a/lmms_eval/tasks/mathvista/mathvista_testmini_format.yaml b/lmms_eval/tasks/mathvista/mathvista_testmini_format.yaml new file mode 100755 index 00000000..8dc81971 --- /dev/null +++ b/lmms_eval/tasks/mathvista/mathvista_testmini_format.yaml @@ -0,0 +1,29 @@ +dataset_path: AI4Math/MathVista +dataset_kwargs: + token: True +task: "mathvista_testmini_format" +test_split: testmini +output_type: generate_until +doc_to_visual: !function utils.mathvista_doc_to_visual +doc_to_text: !function utils.mathvista_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 1024 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false +process_results: !function utils.mathvista_process_results +metric_list: + - metric: gpt_eval_score + aggregation: !function utils.mathvista_aggregate_results + higher_is_better: true + +model_specific_prompt_kwargs: + default: + shot_type: "format-prompt" # can be "reason-first", "solution", "step-by-step" + shot: 0 + use_caption: False + use_ocr: False + phi3v: + shot_type: "solution" \ No newline at end of file diff --git a/lmms_eval/tasks/mathvista/mathvista_testmini_solution.yaml b/lmms_eval/tasks/mathvista/mathvista_testmini_solution.yaml new file mode 100755 index 00000000..8eb77b88 --- /dev/null +++ b/lmms_eval/tasks/mathvista/mathvista_testmini_solution.yaml @@ -0,0 +1,29 @@ +dataset_path: AI4Math/MathVista +dataset_kwargs: + token: True +task: "mathvista_testmini_solution" +test_split: testmini +output_type: generate_until +doc_to_visual: !function utils.mathvista_doc_to_visual +doc_to_text: !function utils.mathvista_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 1024 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false +process_results: !function utils.mathvista_process_results +metric_list: + - metric: gpt_eval_score + aggregation: !function utils.mathvista_aggregate_results + higher_is_better: true + +model_specific_prompt_kwargs: + default: + shot_type: "solution" # can be "reason-first", "solution", "step-by-step" + shot: 0 + use_caption: False + use_ocr: False + phi3v: + shot_type: "solution" \ No newline at end of file diff --git a/lmms_eval/tasks/mlvu/mlvu.yaml b/lmms_eval/tasks/mlvu/mlvu.yaml new file mode 100644 index 00000000..375da595 --- /dev/null +++ b/lmms_eval/tasks/mlvu/mlvu.yaml @@ -0,0 +1,21 
@@ +dataset_path: sy1998/temp +dataset_kwargs: + token: True + cache_dir: mlvu + video: True +task: mlvu +test_split: test +output_type: generate_until +doc_to_visual: !function utils.mlvu_doc_to_visual +doc_to_text: !function utils.mlvu_doc_to_text +doc_to_target: "answer" +# The return value of process_results will be used by metrics +process_results: !function utils.mlvu_process_results +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: mlvu_percetion_score + aggregation: !function utils.mlvu_aggregate_results + higher_is_better: true + + + diff --git a/lmms_eval/tasks/mlvu/utils.py b/lmms_eval/tasks/mlvu/utils.py new file mode 100644 index 00000000..476cf9dc --- /dev/null +++ b/lmms_eval/tasks/mlvu/utils.py @@ -0,0 +1,124 @@ +from collections import defaultdict +import os +import datetime +import json +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file +from pathlib import Path +import yaml +import sys +from typing import List, Dict, Optional, Union +import re +import cv2 +import numpy as np +from loguru import logger as eval_logger + +TASK_TYPES = [ + "TR", + "AR", + "VS", + "NQA", + "ER", + "PQA", + "SSC", + "AO", + "AC" +] + + + +hf_home = os.getenv("HF_HOME", "./~/.cache/huggingface") +base_cache_dir = os.path.expanduser(hf_home) + +with open(Path(__file__).parent / "mlvu.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) +cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"] + + + +def mlvu_doc_to_visual(doc): + + cache_dir = os.path.join(base_cache_dir, cache_name) + video_path = doc["video_name"] + video_path = os.path.join(cache_dir, video_path) + if os.path.exists(video_path): + video_path = video_path + else: + sys.exit(f"video path:{video_path} does not exist, please check") + return [video_path] + + +def mlvu_doc_to_text(doc, model_specific_prompt_kwargs=None): + # option_prompt="Carefully watch this video and pay attention to every detail. Based on your observations, select the best option that accurately addresses the question." 
+ option_prompt="" + question = doc["question"] + "\nOnly give the best option.\n" + full_prompt=option_prompt+"\n"+question+"\n"+"Best option: (" + return full_prompt + + +def extract_characters_regex(s): + s = s.strip() + if ")" in s: + index=s.index(")") + pred=s[index-1:index] + return pred + else: + return s + +def mlvu_process_results(doc, results): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name (in this case videomme score), value: metric value + """ + pred = results[0] + # print("****************",pred) + pred_ans = extract_characters_regex(pred) + + task_type = doc["task_type"] + data_dict = {"question_id": doc["question"], "task_type": task_type, "pred_answer": pred_ans, "answer": doc["answer"]} + + return {f"mlvu_percetion_score": data_dict} + + +def mlvu_aggregate_results(results): + """ + Args: + results: a list of values returned by process_results + Returns: + A score + """ + category2score = {} + for task_type in TASK_TYPES: + category2score[task_type] = {"correct": 0, "answered": 0} + + + for result in results: + task_type = result["task_type"] + category2score[task_type]["answered"] += 1 + category2score[task_type]["correct"] += result["pred_answer"] == result["answer"] + + + for task_cate in TASK_TYPES: + total_correct = 0 + total_answered = 0 + for k, v in category2score.items(): + if task_cate in k: + total_correct += v["correct"] + total_answered += v["answered"] + eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%") + + total_correct = 0 + total_answered = 0 + for k, v in category2score.items(): + total_correct += v["correct"] + total_answered += v["answered"] + eval_logger.info(f"Overall Performance: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%") + + return 100 * total_correct / total_answered if total_answered > 0 else 0 diff --git a/lmms_eval/tasks/muirbench/utils.py b/lmms_eval/tasks/muirbench/utils.py index 2924edac..72712d00 100644 --- a/lmms_eval/tasks/muirbench/utils.py +++ b/lmms_eval/tasks/muirbench/utils.py @@ -114,4 +114,4 @@ def apply(self, resps, docs): # Assuming we need the first response that matches or the original response filtered_resps.append(filtered[0]) - return filtered_resps + return filtered_resps \ No newline at end of file diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py index 9897447f..41ae7491 100644 --- a/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py +++ b/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py @@ -1,6 +1,6 @@ import re import sympy as sp - +import math from loguru import logger as eval_logger @@ -9,7 +9,6 @@ from sympy.parsing.latex import parse_latex except ImportError as e: eval_logger.debug("Please install sympy package by running 'pip install sympy' if you want to use OlympiadBenchEvaluator.") -import math # how to use # scorer = OlympiadBenchEvaluator() diff --git a/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml b/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml index 37ab5e74..cbe474b4 100644 --- a/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml +++ b/lmms_eval/tasks/vcr_wiki/_default_template_vcr_yaml @@ -14,4 +14,5 @@ generation_kwargs: # The return value of process_results will be used by metrics # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results 
 metadata:
-  - version: 0.0.1
\ No newline at end of file
+  version: 0.0.1
+  load_package: False
\ No newline at end of file
diff --git a/lmms_eval/tasks/vcr_wiki/utils.py b/lmms_eval/tasks/vcr_wiki/utils.py
index e16f99b4..5519c078 100644
--- a/lmms_eval/tasks/vcr_wiki/utils.py
+++ b/lmms_eval/tasks/vcr_wiki/utils.py
@@ -1,8 +1,7 @@
 import datetime
+import yaml
 import json
 import os
-from difflib import SequenceMatcher as SM
-from functools import partial
 
 import evaluate
 import numpy as np
@@ -10,27 +9,42 @@
 from nltk.util import ngrams
 from spacy.cli import download
 
-from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
-
-# Download the English and Chinese models
-try:
-    nlp_en = spacy.load("en_core_web_sm")
-except Exception as e:
-    download("en_core_web_sm")
-    nlp_en = spacy.load("en_core_web_sm")
+from pathlib import Path
+from difflib import SequenceMatcher as SM
+from functools import partial
 
-try:
-    nlp_zh = spacy.load("zh_core_web_sm")
-except Exception as e:
-    download("zh_core_web_sm")
-    nlp_zh = spacy.load("zh_core_web_sm")
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+from loguru import logger as eval_logger
 
-nlp = {"en": nlp_en, "zh": nlp_zh}
-rouge = evaluate.load("rouge")
+with open(Path(__file__).parent / "_default_template_vcr_yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
 
-from loguru import logger as eval_logger
+    config = yaml.safe_load("".join(safe_data))
 
-dir_name = os.path.dirname(os.path.abspath(__file__))
+# Download the English and Chinese models
+if config["metadata"]["load_package"]:
+    try:
+        nlp_en = spacy.load("en_core_web_sm")
+        nlp_zh = spacy.load("zh_core_web_sm")
+        nlp = {"en": nlp_en, "zh": nlp_zh}
+        rouge = evaluate.load("rouge")
+    except Exception as e:
+        eval_logger.debug(f"Failed to load spacy models: {e}")
+        download("en_core_web_sm")
+        nlp_en = spacy.load("en_core_web_sm")
+        download("zh_core_web_sm")
+        nlp_zh = spacy.load("zh_core_web_sm")
+        nlp = {"en": nlp_en, "zh": nlp_zh}
+        rouge = evaluate.load("rouge")
+else:
+    eval_logger.debug("Spacy models not loaded because load_package is False. Please set load_package to True in the config file to load them.")
+    nlp = {"en": None, "zh": None}
+    rouge = None
 
 aggregate_results_template = {
     "max_sim_val": 0,
diff --git a/lmms_eval/tasks/vibe_eval/utils.py b/lmms_eval/tasks/vibe_eval/utils.py
index 9e3b3c56..1ba1cdb7 100644
--- a/lmms_eval/tasks/vibe_eval/utils.py
+++ b/lmms_eval/tasks/vibe_eval/utils.py
@@ -2,15 +2,18 @@
 from dataclasses import dataclass
 from typing import Optional, List
 from pathlib import Path
-import yaml
-
-from reka import ChatMessage
-from reka.client import Reka
 
+import yaml
 import re
 import os
 from copy import deepcopy
 
+try:
+    from reka import ChatMessage
+    from reka.client import Reka
+except ImportError:
+    eval_logger.warning("Reka is not installed, please install it by `pip install reka-api`")
+
 REKA_API_KEY = os.getenv("REKA_API_KEY", "YOUR_API_KEY")
 
 with open(Path(__file__).parent / "vibe_eval.yaml", "r") as f:
diff --git a/lmms_eval/tasks/wild_vision_bench/utils.py b/lmms_eval/tasks/wild_vision_bench/utils.py
index b7ac6c61..f106d004 100644
--- a/lmms_eval/tasks/wild_vision_bench/utils.py
+++ b/lmms_eval/tasks/wild_vision_bench/utils.py
@@ -97,6 +97,7 @@ def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            response_data = response.json()
+            print(response_data)
            return response_data["choices"][0]["message"]["content"], GPT_EVAL_MODEL_NAME
        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt+1}: {e}")
diff --git a/pyproject.toml b/pyproject.toml
index 726ad164..e01140ea 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "pytablewriter",
     "sacrebleu>=1.5.0",
     "scikit-learn>=0.24.1",
-    "sqlitedict",
+    "sqlitedict==2.1.0",
     "torch>=2.1.0",  # to enable sdpa mode for running 34B model on one 80GB GPU
     "torchvision>=0.16.0",
     "timm",
@@ -83,9 +83,14 @@ vila = [
 gemini = [
     "google-generativeai",
 ]
+reka = [
+    "httpx==0.23.3",
+    "reka-api",
+]
 all = [
     "vila",
     "gemini",
+    "reka",
 ]
 
 [tool.setuptools.packages.find]
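Note (reviewer sketch, not part of the patch): a minimal, self-contained illustration of the option-letter extraction that the new lmms_eval/tasks/mlvu/utils.py relies on. The sample predictions below are hypothetical; they only demonstrate the "character immediately before the first ')'" rule.

def extract_characters_regex(s):
    # Same logic as the helper added in mlvu/utils.py: if the prediction
    # contains ")", return the single character right before the first ")";
    # otherwise return the stripped prediction unchanged.
    s = s.strip()
    if ")" in s:
        index = s.index(")")
        return s[index - 1:index]
    return s

# Hypothetical model outputs and the option letters they reduce to.
assert extract_characters_regex("Best option: (B) the red car") == "B"
assert extract_characters_regex("C) because the camera pans left") == "C"
assert extract_characters_regex("  A  ") == "A"  # no ")" present, returned as-is

This is also why mlvu_doc_to_text ends the prompt with "Best option: (": the model is nudged to answer with a single letter followed by ")", which mlvu_process_results can then compare against doc["answer"].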