diff --git a/.gitignore b/.gitignore
index 2557ab1b..922b9f0b 100755
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ temp
 __pycache__
 .ipynb_checkpoints
 temp
+.DS_STORE
 # IPython
 profile_default/
 ipython_config.py
diff --git a/README.md b/README.md
index 2bf7a26e..7be1c6c7 100755
--- a/README.md
+++ b/README.md
@@ -12,7 +12,9 @@
 ## Annoucement
 
-- [2024-06] 🎬🎬 The `lmms-eval/v0.2` has been upgraded to support video evaluations for video models like LLaVA-NeXT Video and Gemini 1.5 Pro across tasks such as EgoSchema, PerceptionTest, VideoMME, and more. Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.2/) for more details
+- [2024-07] 👨‍💻👨‍💻 The `lmms-eval/v0.2.1` has been upgraded to support more models, including [LongVA](https://github.com/EvolvingLMMs-Lab/LongVA), [InterVL-2](https://github.com/OpenGVLab/InternVL), [VILA](https://github.com/NVlabs/VILA), and many more evaluation tasks, e.g. [Details Captions](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/136), [MLVU](https://arxiv.org/abs/2406.04264), [WildVision-Bench](https://huggingface.co/datasets/WildVision/wildvision-arena-data), [VITATECS](https://github.com/lscpku/VITATECS) and [LLaVA-Interleave-Bench](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/).
+
+- [2024-06] 🎬🎬 The `lmms-eval/v0.2.0` has been upgraded to support video evaluations for video models like LLaVA-NeXT Video and Gemini 1.5 Pro across tasks such as EgoSchema, PerceptionTest, VideoMME, and more. Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.2/) for more details
 
 - [2024-03] 📝📝 We have released the first version of `lmms-eval`, please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.1/) for more details
diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index 159ac5a6..ef0e2f1c 100755
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -165,7 +165,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
     # reset logger
     eval_logger.remove()
     eval_logger.add(sys.stdout, colorize=True, level=args.verbosity)
-    eval_logger.add(sys.stderr, level=args.verbosity)
     eval_logger.info(f"Verbosity set to {args.verbosity}")
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py
index f77065e8..2cecfe22 100755
--- a/lmms_eval/api/samplers.py
+++ b/lmms_eval/api/samplers.py
@@ -37,7 +37,9 @@ def get_context(self, doc, num_fewshot):
                         + (
                             str(self.doc_to_target(doc)[0])
                             if type(self.doc_to_target(doc)) is list
-                            else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
+                            else self.doc_to_target(doc)
+                            if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str)
+                            else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
                         )
                         for doc in selected_docs
                     ]
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index 041eec31..aec3bcc2 100755
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -327,12 +327,7 @@ def evaluate(
         # hack: remove image columns to speed avoid loading images and speed up postprocessing
         # reason: doc_iterator will actually load image if it's in the doc.
         docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
-        if "d170" not in task_name \
-            and "dc100" not in task_name \
-            and "dc200" not in task_name \
-            and "llava_wilder" not in task_name \
-            and "livebench" not in task_name \
-            and "wildvision" not in task_name:
+        if "d170" not in task_name and "dc100" not in task_name and "dc200" not in task_name and "llava_wilder" not in task_name and "livebench" not in task_name and "wildvision" not in task_name:
             remove_cols = []
             features = docs.features
             # If it is an Image instance or a Sequence of Image instance. Remove it
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
index 2ed909e4..0ca7692c 100755
--- a/lmms_eval/models/__init__.py
+++ b/lmms_eval/models/__init__.py
@@ -4,6 +4,10 @@
 from loguru import logger
 import sys
 
+import hf_transfer
+
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
 logger.remove()
 logger.add(sys.stdout, level="WARNING")
 
@@ -25,6 +29,7 @@
     "llava_sglang": "LlavaSglang",
     "idefics2": "Idefics2",
     "internvl": "InternVLChat",
+    "internvl2": "InternVL2",
     "gemini_api": "GeminiAPI",
     "reka": "Reka",
     "from_log": "FromLog",
@@ -33,14 +38,16 @@
     "tinyllava": "TinyLlava",
     "llava_hf": "LlavaHf",
     "longva": "LongVA",
+    "llava_hf": "LlavaHf",
+    "longva": "LongVA",
+    "vila": "VILA",
 }
 
 for model_name, model_class in AVAILABLE_MODELS.items():
     try:
         exec(f"from .{model_name} import {model_class}")
     except ImportError as e:
-        # logger.warning(f"Failed to import {model_class} from {model_name}: {e}")
-        pass
+        logger.warning(f"Failed to import {model_class} from {model_name}: {e}")
 
 if os.environ.get("LMMS_EVAL_PLUGINS", None):
     # Allow specifying other packages to import models from
@@ -50,8 +57,4 @@
         try:
             exec(f"from {plugin}.models.{model_name} import {model_class}")
         except ImportError:
-            pass
-
-import hf_transfer
-
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+            logger.warning(f"Failed to import {model_class} from {model_name}")
diff --git a/lmms_eval/models/batch_gpt4.py b/lmms_eval/models/batch_gpt4.py
index 8f4c2220..7541b709 100755
--- a/lmms_eval/models/batch_gpt4.py
+++ b/lmms_eval/models/batch_gpt4.py
@@ -59,7 +59,7 @@ def __init__(
         api_key: str = API_KEY,
         api_url: str = API_URL,
         modality: str = "image",
-        max_frames_for_video: int = 10,
+        max_frames_num: int = 10,
         timeout: int = 120,
         **kwargs,
     ) -> None:
@@ -69,7 +69,7 @@ def __init__(
         # Here we just use the same token as llava for convenient
         self.model_version = model_version
         self.modality = modality
-        self.max_frames_for_video = max_frames_for_video
+        self.max_frames_num = max_frames_num
         self.image_token = "<image>"
         self.timeout = timeout
 
@@ -128,7 +128,7 @@ def generate_until(self, requests):
                     img = self.encode_image(visual)
                     imgs.append(img)
                 elif self.modality == "video":
-                    frames = self.encode_video(visual, self.max_frames_for_video)
+                    frames = self.encode_video(visual, self.max_frames_num)
                     imgs.extend(frames)
 
             messages = []
diff --git a/lmms_eval/models/claude.py b/lmms_eval/models/claude.py
index 4c967e88..5829fbed 100644
--- a/lmms_eval/models/claude.py
+++ b/lmms_eval/models/claude.py
@@ -40,6 +40,7 @@ def __init__(
         image_token: str = "<image>",  # Use to separate interleaved image and text
         system_prompt: str = "",  # Whether you want some special system prompt here
         modality: str = "image",
+        max_frames_num: int = 10,
         continual_mode: bool = False,
         response_persistent_folder: str = None,
         **kwargs,
@@ -49,20 +50,24 @@ def __init__(
         self.image_token = image_token
         self.system_prompt = system_prompt
         self.modality = modality
+        self.max_frames_num = max_frames_num
         self.continual_mode = continual_mode
-        if self.continual_mode and response_persistent_folder is None:
-            raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
-        self.response_persistent_folder = response_persistent_folder
-        self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
-
-        if os.path.exists(self.response_persistent_file):
-            with open(self.response_persistent_file, "r") as f:
-                self.response_cache = json.load(f)
-            self.cache_mode = "resume"
-        else:
-            self.response_cache = {}
-            self.cache_mode = "start"
+        if self.continual_mode:
+            if response_persistent_folder is None:
+                raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
+
+            os.makedirs(response_persistent_folder, exist_ok=True)
+            self.response_persistent_folder = response_persistent_folder
+            self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
+
+            if os.path.exists(self.response_persistent_file):
+                with open(self.response_persistent_file, "r") as f:
+                    self.response_cache = json.load(f)
+                self.cache_mode = "resume"
+            else:
+                self.response_cache = {}
+                self.cache_mode = "start"
 
         accelerator = Accelerator()
         if accelerator.num_processes > 1:
@@ -81,7 +86,7 @@ def __init__(
 
     def encode_image(self, image):
         output_buffer = BytesIO()
-        image.save(output_buffer, format="PNG")
+        image.save(output_buffer, format="JPEG")
         byte_data = output_buffer.getvalue()
         base64_str = base64.b64encode(byte_data).decode("utf-8")
         return base64_str
@@ -129,7 +134,7 @@ def shrink_image_to_file_size(self, img: Image, max_file_size=4838990) -> Image:
     def encode_video(self, video_path):
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
-        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, self.max_frames_for_video, dtype=int)
+        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, self.max_frames_num, dtype=int)
         frame_idx = uniform_sampled_frames.tolist()
         frames = vr.get_batch(frame_idx).asnumpy()
 
@@ -137,10 +142,10 @@ def encode_video(self, video_path):
         for frame in frames:
             img = Image.fromarray(frame)
             output_buffer = BytesIO()
-            img.save(output_buffer, format="PNG")
+            img.save(output_buffer, format="JPEG")
             byte_data = output_buffer.getvalue()
             base64_str = base64.b64encode(byte_data).decode("utf-8")
-            base64_frames.append(f"data:image/jpeg;base64,{base64_str}")
+            base64_frames.append(f"{base64_str}")
 
         return base64_frames
 
@@ -154,7 +159,7 @@ def generate_until(self, requests) -> List[str]:
             "type": "image",
             "source": {
                 "type": "base64",
-                "media_type": "image/png",
+                "media_type": "image/jpeg",
             },
         }
         empty_text_block = {"type": "text"}
@@ -218,10 +223,12 @@ def generate_until(self, requests) -> List[str]:
 
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
+            if gen_kwargs["max_new_tokens"] > 4096:
+                gen_kwargs["max_new_tokens"] = 4096
             if "temperature" not in gen_kwargs:
                 gen_kwargs["temperature"] = 0
-            if "top_p" not in gen_kwargs:
-                gen_kwargs["top_p"] = None
+            if "top_p" not in gen_kwargs or gen_kwargs["top_p"] is None:
+                gen_kwargs["top_p"] = 1
             if "num_beams" not in gen_kwargs:
                 gen_kwargs["num_beams"] = 1
 
@@ -238,11 +245,13 @@ def generate_until(self, requests) -> List[str]:
                         pbar.update(1)
                         continue
 
+            response_text = message.content[0].text
             res.append(message.content[0].text)
             pbar.update(1)
 
             ###################### CONTINUAL MODE ######################
             if self.continual_mode is True:  # Cache the response
+                response_text = message.content[0].text
                 doc_uuid = f"{task}___{split}___{doc_id}"
                 self.response_cache[doc_uuid] = response_text
                 with open(self.response_persistent_file, "w") as f:
diff --git a/lmms_eval/models/gemini_api.py b/lmms_eval/models/gemini_api.py
index 4a43c9af..4dbc25bd 100644
--- a/lmms_eval/models/gemini_api.py
+++ b/lmms_eval/models/gemini_api.py
@@ -31,7 +31,7 @@ class GeminiAPI(lmms):
     def __init__(
         self,
-        model_version: str = "gemini-1.5-flash-latest",
+        model_version: str = "gemini-1.5-pro",
         modality: str = "image",
         timeout: int = 120,
         continual_mode: bool = False,
@@ -46,6 +46,8 @@ def __init__(
         if self.continual_mode and response_persistent_folder is None:
             raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
         self.response_persistent_folder = response_persistent_folder
+        if not os.path.exists(self.response_persistent_folder):
+            os.makedirs(self.response_persistent_folder)
         self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
 
         if os.path.exists(self.response_persistent_file):
diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py
index 729e73f7..7d9c5850 100755
--- a/lmms_eval/models/gpt4v.py
+++ b/lmms_eval/models/gpt4v.py
@@ -7,15 +7,13 @@
 from tqdm import tqdm
 import requests as url_requests
 import time
-
+import json
 
 from lmms_eval.api.instance import Instance
 from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
-from lmms_eval import utils
 
-from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs
-from accelerate.state import AcceleratorState
+from accelerate import Accelerator, DistributedType
 
 try:
     from decord import VideoReader, cpu
@@ -50,8 +48,10 @@ def __init__(
         self,
         model_version: str = "gpt-4-vision-preview",
         modality: str = "video",
-        max_frames_for_video: int = 10,
+        max_frames_num: int = 10,
         timeout: int = 120,
+        continual_mode: bool = False,
+        response_persistent_folder: str = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -60,9 +60,25 @@ def __init__(
         # Here we just use the same token as llava for convenient
         self.model_version = model_version
         self.modality = modality
-        self.max_frames_for_video = max_frames_for_video
+        self.max_frames_num = max_frames_num
         self.image_token = "<image>"
         self.timeout = timeout
+        self.continual_mode = continual_mode
+        if self.continual_mode:
+            if response_persistent_folder is None:
+                raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
+
+            os.makedirs(response_persistent_folder, exist_ok=True)
+            self.response_persistent_folder = response_persistent_folder
+            self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
+
+            if os.path.exists(self.response_persistent_file):
+                with open(self.response_persistent_file, "r") as f:
+                    self.response_cache = json.load(f)
+                self.cache_mode = "resume"
+            else:
+                self.response_cache = {}
+                self.cache_mode = "start"
 
         accelerator = Accelerator()
         # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue."
@@ -119,9 +135,16 @@ def generate_until(self, requests) -> List[str]:
         pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
         for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
-            # encode, pad, and truncate contexts for this batch
-            # visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
-            visuals = [doc_to_visual(self.task_dict[task][split][0])]
+            if self.continual_mode is True and self.cache_mode == "resume":
+                doc_uuid = f"{task}___{split}___{doc_id}"
+                if doc_uuid in self.response_cache:
+                    response_text = self.response_cache[doc_uuid]
+                    if response_text:
+                        res.append(response_text)
+                        pbar.update(1)
+                        continue
+
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
             visuals = self.flatten(visuals)
             imgs = []  # multiple images or frames for video
             for visual in visuals:
@@ -129,23 +152,26 @@ def generate_until(self, requests) -> List[str]:
                     img = self.encode_image(visual)
                     imgs.append(img)
                 elif self.modality == "video":
-                    frames = self.encode_video(visual, self.max_frames_for_video)
+                    frames = self.encode_video(visual, self.max_frames_num)
                     imgs.extend(frames)
 
-            payload = {"model": self.model_version, "messages": []}
+            payload = {"messages": []}
+            if API_TYPE == "openai":
+                payload["model"] = self.model_version
+
             response_json = {"role": "user", "content": []}
             # When there is no image token in the context, append the image to the text
             if self.image_token not in contexts:
                 payload["messages"].append(deepcopy(response_json))
                 payload["messages"][0]["content"].append({"type": "text", "text": contexts})
                 for img in imgs:
-                    payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
+                    payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
             else:
                 contexts = contexts.split(self.image_token)
                 for idx, img in enumerate(imgs):
                     payload["messages"].append(deepcopy(response_json))
                     payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]})
-                    payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
+                    payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
 
                 # If n image tokens are in the contexts
                 # contexts will be splitted into n+1 chunks
@@ -155,6 +181,8 @@ def generate_until(self, requests) -> List[str]:
 
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
+            if gen_kwargs["max_new_tokens"] > 4096:
+                gen_kwargs["max_new_tokens"] = 4096
             if "temperature" not in gen_kwargs:
                 gen_kwargs["temperature"] = 0
             if "top_p" not in gen_kwargs:
@@ -170,19 +198,30 @@ def generate_until(self, requests) -> List[str]:
                 response = url_requests.post(API_URL, headers=headers, json=payload, timeout=self.timeout)
                 response_data = response.json()
 
-                content = response_data["choices"][0]["message"]["content"].strip()
+                response_text = response_data["choices"][0]["message"]["content"].strip()
                 break  # If successful, break out of the loop
 
             except Exception as e:
-                eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
-                if attempt < 5 - 1:  # If we have retries left, sleep and then continue to next attempt
+                try:
+                    error_msg = response.json()
+                except:
+                    error_msg = ""
+
+                eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}.\nReponse: {error_msg}")
+                if attempt <= 5:
                     time.sleep(NUM_SECONDS_TO_SLEEP)
-                else:  # If this was the last attempt, log and return empty
-                    eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}")
-                    eval_logger.error(f"Response: {response}")
-                    content = ""
-            res.append(content)
+                else:  # If this was the last attempt, log and return empty string
+                    eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}.\nResponse: {response.json()}")
+                    response_text = ""
+            res.append(response_text)
             pbar.update(1)
+
+            if self.continual_mode is True:  # Cache the response
+                doc_uuid = f"{task}___{split}___{doc_id}"
+                self.response_cache[doc_uuid] = response_text
+                with open(self.response_persistent_file, "w") as f:
+                    json.dump(self.response_cache, f)
+
         pbar.close()
         return res
diff --git a/lmms_eval/models/internvl2.py b/lmms_eval/models/internvl2.py
new file mode 100644
index 00000000..5f4365d0
--- /dev/null
+++ b/lmms_eval/models/internvl2.py
@@ -0,0 +1,238 @@
+from typing import List, Tuple
+from lmms_eval.api.instance import Instance
+from decord import VideoReader, cpu
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+import numpy as np
+from transformers import AutoModel, AutoTokenizer
+from lmms_eval.api.registry import register_model
+from accelerate import Accelerator, DistributedType
+from lmms_eval.api.model import lmms
+from tqdm import tqdm
+import logging
+
+eval_logger = logging.getLogger("eval_logger")
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+DEFAULT_GEN_KWARGS = dict(
+    num_beams=1,
+    max_new_tokens=1024,
+    do_sample=False,
+)
+
+
+def build_transform(input_size):
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose([T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=MEAN, std=STD)])
+    return transform
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num)
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = ((i % (target_width // image_size)) * image_size, (i // (target_width // image_size)) * image_size, ((i % (target_width // image_size)) + 1) * image_size, ((i // (target_width // image_size)) + 1) * image_size)
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+
+def load_image(image, input_size=448, max_num=6):
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
+
+
+def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+    if bound:
+        start, end = bound[0], bound[1]
+    else:
+        start, end = -100000, 100000
+    start_idx = max(first_idx, round(start * fps))
+    end_idx = min(round(end * fps), max_frame)
+    seg_size = float(end_idx - start_idx) / num_segments
+    frame_indices = np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])
+    return frame_indices
+
+
+def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    max_frame = len(vr) - 1
+    fps = float(vr.get_avg_fps())
+
+    pixel_values_list, num_patches_list = [], []
+    transform = build_transform(input_size=input_size)
+    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
+    for frame_index in frame_indices:
+        img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        pixel_values = [transform(tile) for tile in img]
+        pixel_values = torch.stack(pixel_values)
+        num_patches_list.append(pixel_values.shape[0])
+        pixel_values_list.append(pixel_values)
+    pixel_values = torch.cat(pixel_values_list)
+    return pixel_values, num_patches_list
+
+
+from datetime import timedelta
+from accelerate.state import AcceleratorState
+from accelerate.utils import InitProcessGroupKwargs
+
+
+@register_model("internvl2")
+class InternVL2(lmms):
+    def __init__(
+        self,
+        pretrained: str = "OpenGVLab/InternVL2-2B",
+        modality: str = "image",
+        device: str = "cuda:0",
+        device_map: str = "cuda:0",
+        batch_size: str = "1",
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.path = pretrained
+        self.model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True).eval().cuda()
+        self.tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True)
+
+        batch_size = int(batch_size)
+        assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}."
+        self.batch_size_per_gpu = batch_size
+
+        accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
+        accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+        if accelerator.num_processes > 1:
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.device_map = f"cuda:{accelerator.local_process_index}"
+        elif accelerator.num_processes == 1 and device_map == "auto":
+            self._device = torch.device(device)
+            self.device_map = device_map
+        else:
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.device_map = f"cuda:{accelerator.local_process_index}"
+
+        if accelerator.num_processes > 1:
+            assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
+            # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
+            # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works
+            # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work.
+            if accelerator.distributed_type == DistributedType.DEEPSPEED:
+                kwargs = {
+                    "train_micro_batch_size_per_gpu": self.batch_size_per_gpu,
+                    "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes,
+                }
+                AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs)
+                eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0")
+
+            if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED:
+                self._model = accelerator.prepare(self.model)
+            else:
+                self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
+            self.accelerator = accelerator
+            if self.accelerator.is_local_main_process:
+                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        elif accelerator.num_processes == 1 and device_map == "auto":
+            eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism")
+            self._rank = 0
+            self._word_size = 1
+        else:
+            eval_logger.info(f"Using single device: {self._device}")
+            self.model.to(self._device)
+            self._rank = 0
+            self._world_size = 1
+
+        self.device = self._device
+        self.modality = modality
+
+    def flatten(self, input):
+        new_list = []
+        for i in input:
+            for j in i:
+                new_list.append(j)
+        return new_list
+
+    def generate_until(self, requests) -> List[str]:
+        res = []
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
+            if "until" in gen_kwargs:
+                gen_kwargs.pop("until")
+
+            for k, v in DEFAULT_GEN_KWARGS.items():
+                if k not in gen_kwargs:
+                    gen_kwargs[k] = v
+
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            visuals = self.flatten(visuals)
+            if self.modality == "image":
+                visuals = [load_image(visual).to(torch.bfloat16).cuda() for visual in visuals]
+                pixel_values = torch.cat(visuals, dim=0)
+                num_patches_list = [visual.size(0) for visual in visuals]
+                if visuals:
+                    image_tokens = ["<image>"] * len(visuals)
+                    image_tokens = " ".join(image_tokens)
+                    contexts = image_tokens + "\n" + contexts
+                response, history = self.model.chat(self.tokenizer, pixel_values, contexts, gen_kwargs, num_patches_list=num_patches_list, history=None, return_history=True)
+
+            elif self.modality == "video":
+                assert len(visuals) == 1, f"Only one video is supported, but got {len(visuals)} videos."
+                video_path = visuals[0]
+                pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
+                pixel_values = pixel_values.to(torch.bfloat16).cuda()
+                video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
+                question = video_prefix + contexts
+                response, history = self.model.chat(self.tokenizer, pixel_values, question, gen_kwargs, num_patches_list=num_patches_list, history=None, return_history=True)
+            res.append(response)
+            pbar.update(1)
+        pbar.close()
+        return res
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        assert False, "Not implemented yet."
diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py
index 6de4c8f8..7d6420ba 100755
--- a/lmms_eval/models/llava.py
+++ b/lmms_eval/models/llava.py
@@ -58,6 +58,7 @@ def __init__(
         device_map="cuda:0",
         conv_template="vicuna_v1",
         use_cache=True,
+        tie_weights: bool = True,
         truncate_context=False,  # whether to truncate the context in generation, set it False for LLaVA-1.6
         customized_config=None,  # ends in json
         **kwargs,
@@ -97,7 +98,9 @@ def __init__(
             self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, model_name, device_map=self.device_map, **llava_model_args)
         self._config = self._model.config
         self.model.eval()
-        self.model.tie_weights()
+        if tie_weights:
+            self.model.tie_weights()
+
         self.truncation = truncation
         self.batch_size_per_gpu = int(batch_size)
         self.conv_template = conv_template
diff --git a/lmms_eval/models/llava_vid.py b/lmms_eval/models/llava_vid.py
index 6188dd95..14cd7e61 100755
--- a/lmms_eval/models/llava_vid.py
+++ b/lmms_eval/models/llava_vid.py
@@ -59,6 +59,8 @@ def __init__(
         mm_spatial_pool_mode: str = "average",
         overwrite: bool = True,
         video_decode_backend: str = "pyav",
+        delay_load: bool = False,
+        tie_weights: bool = True,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -86,15 +88,19 @@ def __init__(
         self.mm_spatial_pool_out_channels = int(mm_spatial_pool_out_channels)
         self.mm_spatial_pool_mode = mm_spatial_pool_mode
         self.max_frames_num = int(max_frames_num)
+        self.mm_resampler_location = mm_resampler_location
+        self.delay_load = delay_load
        if self.overwrite == True:
             overwrite_config = {}
             overwrite_config["mm_resampler_type"] = self.mm_resampler_type
             overwrite_config["mm_spatial_pool_stride"] = self.mm_spatial_pool_stride
             overwrite_config["mm_spatial_pool_out_channels"] = self.mm_spatial_pool_out_channels
             overwrite_config["mm_spatial_pool_mode"] = self.mm_spatial_pool_mode
-            overwrite_config["mm_resampler_location"] = "before"
-            overwrite_config["patchify_video_feature"] = False
-            overwrite_config["attn_implementation"] = attn_implementation
+            overwrite_config["mm_pooling_position"] = self.mm_resampler_location
+            overwrite_config["mm_newline_position"] = mm_newline_position
+            overwrite_config["add_faster_video"] = False
+            overwrite_config["delay_load"] = self.delay_load
+            # overwrite_config["attn_implementation"] = attn_implementation
 
             cfg_pretrained = AutoConfig.from_pretrained(self.pretrained)
 
@@ -145,7 +151,8 @@ def __init__(
 
         self._config = self._model.config
         self.model.eval()
-        self.model.tie_weights()
+        if tie_weights:
+            self.model.tie_weights()
         self.truncation = truncation
         self.batch_size_per_gpu = int(batch_size)
         self.conv_template = conv_template
diff --git a/lmms_eval/models/longva.py b/lmms_eval/models/longva.py
index 7202c49c..c5bf6861 100644
--- a/lmms_eval/models/longva.py
+++ b/lmms_eval/models/longva.py
@@ -50,7 +50,6 @@
 
 @register_model("longva")
 class LongVA(lmms):
-
     def __init__(
         self,
         pretrained: str = "lmms-lab/LongVA-7B",
@@ -442,7 +441,7 @@ def _collate(x):
             # These steps are not in LLaVA's original code, but are necessary for generation to work
             # TODO: attention to this major generation step...
             if "image_aspect_ratio" in gen_kwargs.keys():
-                gen_kwargs.pop("image_aspect_ratio")
+                gen_kwargs.pop("image_aspect_ratio")
             try:
                 with torch.inference_mode():
                     cont = self.model.generate(input_ids, attention_mask=attention_masks, pad_token_id=pad_token_ids, images=image_tensor, use_cache=self.use_cache, **gen_kwargs)
@@ -459,4 +458,4 @@ def _collate(x):
         res = re_ords.get_original(res)
 
         pbar.close()
-        return res
\ No newline at end of file
+        return res
diff --git a/lmms_eval/models/model_utils/load_video.py b/lmms_eval/models/model_utils/load_video.py
index 789039e7..dbb3cf6f 100644
--- a/lmms_eval/models/model_utils/load_video.py
+++ b/lmms_eval/models/model_utils/load_video.py
@@ -29,7 +29,8 @@ def record_video_length_packet(container):
 
 
 def read_video_pyav(video_path, num_frm=8):
-
+    container = av.open(video_path)
+
     if "webm" not in video_path and "mkv" not in video_path:
         # For mp4, we try loading with stream first
         try:
diff --git a/lmms_eval/models/phi3v.py b/lmms_eval/models/phi3v.py
index c30a7081..ab1e838d 100644
--- a/lmms_eval/models/phi3v.py
+++ b/lmms_eval/models/phi3v.py
@@ -1,6 +1,5 @@
 import torch
-
 from accelerate import Accelerator, DistributedType
 from lmms_eval import utils
 from lmms_eval.api.instance import Instance
diff --git a/lmms_eval/models/reka.py b/lmms_eval/models/reka.py
index bc461cad..ee1b9c67 100644
--- a/lmms_eval/models/reka.py
+++ b/lmms_eval/models/reka.py
@@ -36,7 +36,7 @@ def __init__(
         self,
         model_version: str = "reka-edge",
         modality: str = "image",
-        max_frames_for_video: int = 10,
+        max_frames_num: int = 5,
         timeout: int = 120,
         continual_mode: bool = False,
         response_persistent_folder: str = None,  # We will cache the Gemini API response in this path and use it for future requests
@@ -45,21 +45,24 @@ def __init__(
         super().__init__()
         self.model_version = model_version
         self.modality = modality
-        self.max_frames_for_video = max_frames_for_video
+        self.max_frames_num = max_frames_num
         self.timeout = timeout
         self.continual_mode = continual_mode
-        if self.continual_mode and response_persistent_folder is None:
-            raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
-        self.response_persistent_folder = response_persistent_folder
-        self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
-
-        if os.path.exists(self.response_persistent_file):
-            with open(self.response_persistent_file, "r") as f:
-                self.response_cache = json.load(f)
-            self.cache_mode = "resume"
-        else:
-            self.response_cache = {}
-            self.cache_mode = "start"
+        if self.continual_mode:
+            if response_persistent_folder is None:
+                raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
Please provide a valid path.") + + os.makedirs(response_persistent_folder, exist_ok=True) + self.response_persistent_folder = response_persistent_folder + self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json") + + if os.path.exists(self.response_persistent_file): + with open(self.response_persistent_file, "r") as f: + self.response_cache = json.load(f) + self.cache_mode = "resume" + else: + self.response_cache = {} + self.cache_mode = "start" self.reka = RekaClient(api_key=os.getenv("REKA_API_KEY", "YOUR_API_KEY")) @@ -99,7 +102,7 @@ def encode_image(self, image): def encode_video(self, video_path): vr = VideoReader(video_path, ctx=cpu(0)) total_frame_num = len(vr) - uniform_sampled_frames = np.linspace(0, total_frame_num - 1, self.max_frames_for_video, dtype=int) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, self.max_frames_num, dtype=int) frame_idx = uniform_sampled_frames.tolist() frames = vr.get_batch(frame_idx).asnumpy() @@ -141,7 +144,7 @@ def generate_until(self, requests) -> List[str]: message_content.append({"type": "text", "text": context}) assert len(visual) == 1, "Reka only supports one video per request" media_urls = self.encode_video(visual[0]) - assert len(media_urls) == self.max_frames_for_video, f"Reka only supports {self.max_frames_for_video} frames per request" + assert len(media_urls) == self.max_frames_num, f"Reka only supports {self.max_frames_num} frames per request" for media_url in media_urls: message_content.append({"type": "image_url", "image_url": media_url}) diff --git a/lmms_eval/models/tinyllava.py b/lmms_eval/models/tinyllava.py index e07c47b8..a4335f05 100755 --- a/lmms_eval/models/tinyllava.py +++ b/lmms_eval/models/tinyllava.py @@ -2,7 +2,6 @@ torch.backends.cuda.matmul.allow_tf32 = True - import copy from tqdm import tqdm from datetime import timedelta diff --git a/lmms_eval/models/video_chatgpt/model/video_chatgpt.py b/lmms_eval/models/video_chatgpt/model/video_chatgpt.py index df6fee4f..bded27e7 100644 --- a/lmms_eval/models/video_chatgpt/model/video_chatgpt.py +++ b/lmms_eval/models/video_chatgpt/model/video_chatgpt.py @@ -76,7 +76,6 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) if (input_ids.shape[1] != 1 or self.training) and video_spatio_temporal_features is not None: - video_features = self.mm_projector(video_spatio_temporal_features) dummy_video_features = torch.zeros(video_features.shape[1], 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype) dummy_video_features = self.mm_projector(dummy_video_features) diff --git a/lmms_eval/models/vila.py b/lmms_eval/models/vila.py new file mode 100755 index 00000000..2adff620 --- /dev/null +++ b/lmms_eval/models/vila.py @@ -0,0 +1,376 @@ +import argparse +import torch +import os +import json +from tqdm import tqdm +import logging +from typing import List, Optional, Union, Tuple +from PIL import Image +import math +import numpy as np +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +from datetime import timedelta +from decord import VideoReader, cpu + + +from torchvision.transforms import Resize + +import signal + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +eval_logger = logging.getLogger("lmms-eval") +# import sys;sys.path.append("llava-video") +try: + from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, 
+    from llava.conversation import conv_templates, SeparatorStyle
+    from llava.model.builder import load_pretrained_model
+    from llava.data.dataset import LazySupervisedDataset
+    from llava.utils import disable_torch_init
+    from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+    from llava.mm_utils import process_images
+except ImportError as e:
+    print(e)
+
+    eval_logger.debug("VILA is not installed. Please install VILA to use this model. Error: {e}")
+
+
+@register_model("vila")
+class VILA(lmms):
+    """
+    VILA Model
+    """
+
+    def __init__(
+        self,
+        pretrained: str = "Efficient-Large-Model/VILA1.5-40b",
+        max_frames_num: Optional[int] = 100,
+        truncation: Optional[bool] = True,
+        device: Optional[str] = "cuda:0",
+        batch_size: Optional[Union[int, str]] = 1,
+        attn_implementation=(
+            "sdpa" if torch.__version__ >= "2.1.2" else "eager"
+        ),  # inference implementation for attention, can be "sdpa", "eager", "flash_attention_2". Seems FA2 is not effective during inference: https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5
+        device_map="cuda:0",
+        conv_template="hermes-2",
+        use_cache=True,
+        truncate_context=False,  # whether to truncate the context in generation, set it False for LLaVA-1.6
+        video_decode_backend="decord",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        assert kwargs == {}, f"Unexpected kwargs: {kwargs}"
+
+        accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
+        accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+        if accelerator.num_processes > 1:
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.device_map = f"cuda:{accelerator.local_process_index}"
+        elif accelerator.num_processes == 1 and device_map == "auto":
+            self._device = torch.device(device)
+            self.device_map = device_map
+        else:
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.device_map = f"cuda:{accelerator.local_process_index}"
+
+        self.pretrained = pretrained
+        self.model_name = get_model_name_from_path(pretrained)
+        self.max_frames_num = max_frames_num
+        # self._config = AutoConfig.from_pretrained(self.pretrained)
+
+        # import pdb; pdb.set_trace()
+        self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, self.model_name, device_map=self.device_map, attn_implementation=attn_implementation)
+
+        self.model.image_processor = self._image_processor
+
+        self._config = self._model.config
+
+        if self._tokenizer.pad_token_id is None:
+            if "qwen" in self._tokenizer.name_or_path.lower():
+                print("Setting pad token to bos token for qwen model.")
+                self._tokenizer.pad_token_id = 151643
+
+        self.video_decode_backend = video_decode_backend
+        self.model.eval()
+        # self.model.tie_weights()
+        self.truncation = truncation
+        self.batch_size_per_gpu = int(batch_size)
+        self.conv_template = conv_template
+        self.use_cache = use_cache
+        self.truncate_context = truncate_context
+        # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue."
+        if accelerator.num_processes > 1:
+            assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
+            # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
+            # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works
+            # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work.
+            if accelerator.distributed_type == DistributedType.DEEPSPEED:
+                kwargs = {
+                    "train_micro_batch_size_per_gpu": self.batch_size_per_gpu,
+                    "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes,
+                }
+                AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs)
+                eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0")
+            if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED:
+                self._model = accelerator.prepare(self.model)
+            else:
+                self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
+            self.accelerator = accelerator
+            if self.accelerator.is_local_main_process:
+                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        elif accelerator.num_processes == 1 and device_map == "auto":
+            eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism")
+            self._rank = 0
+            self._word_size = 1
+        else:
+            eval_logger.info(f"Using single device: {self._device}")
+            self.model.to(self._device)
+            self._rank = 0
+            self._world_size = 1
+
+    @property
+    def config(self):
+        # return the associated transformers.AutoConfig for the given pretrained model.
+        return self._config
+
+    @property
+    def tokenizer(self):
+        return self._tokenizer
+
+    @property
+    def model(self):
+        # returns the model, unwrapping it if using Accelerate
+        if hasattr(self, "accelerator"):
+            return self.accelerator.unwrap_model(self._model)
+        else:
+            return self._model
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        return self._max_length
+
+    def pad_sequence(self, input_ids, batch_first, padding_value):
+        if self.tokenizer.padding_side == "left":
+            input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids]
+        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value)
+        if self.tokenizer.padding_side == "left":
+            input_ids = torch.flip(input_ids, [1])
+        return input_ids
+
+    @property
+    def batch_size(self):
+        return self.batch_size_per_gpu
+
+    @property
+    def device(self):
+        return self._device
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
+    def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]:
+        """ """
+        add_special_tokens = False if add_special_tokens is None else add_special_tokens
+        encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
+        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+        if left_truncate_len:
+            encoding = encoding[-left_truncate_len:]
+        return encoding
+
+    def load_video(self, video_path, max_frames_num):
+        try:
+            vr = VideoReader(video_path, ctx=cpu(0))
+            total_frame_num = len(vr)
+            fps = round(vr.get_avg_fps())
+            frame_idx = np.linspace(0, total_frame_num - 2, max_frames_num, dtype=int)
+            spare_frames = vr.get_batch(frame_idx).asnumpy()
+            return [Image.fromarray(img) for img in spare_frames]
+        except Exception as e:
+            eval_logger.error(f"Failed to load video {video_path} with error: {e}")
+
+            return [Image.new("RGB", (448, 448), (0, 0, 0))] * max_frames_num
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens)
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        res = []
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
+            # encode, pad, and truncate contexts for this batch
+            if type(doc_to_target) == str:
+                continuation = doc_to_target
+            else:
+                continuation = doc_to_target(self.task_dict[task][split][doc_id])
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            visuals = self.flatten(visuals)
+            videos = []
+            for visual in visuals:
+                video = self.load_video(visual, self.max_frames_num)
+                video = self._image_processor.preprocess(video, return_tensors="pt")["pixel_values"].half().cuda()
+                videos.append(video)
+
+            qs = contexts
+            if self.model.config.mm_use_im_start_end:
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + qs
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+
+            conv = conv_templates[self.conv_template].copy()
+            conv.append_message(conv.roles[0], qs)
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+
+            contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
+
+            conv = conv_templates[self.conv_template].copy()
+            conv.append_message(conv.roles[0], qs)
+            conv.append_message(conv.roles[1], continuation)
+            prompt = conv.get_prompt()
+
+            input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
+            attention_masks = input_ids.ne(self.tokenizer.pad_token_id).long().cuda()
+
+            labels = input_ids.clone()
+            # Context part no need to calculate for loss
+            labels[0, : contxt_id.shape[1]] = -100
+
+            with torch.inference_mode():
+                outputs = self.model(input_ids=input_ids, labels=labels, images=videos, modalities="video")
+
+            loss = outputs["loss"]
+            # loss = torch.exp(loss)
+            logits = outputs["logits"]
+            greedy_tokens = logits.argmax(dim=-1)
+            cont_toks = input_ids[:, contxt_id.shape[1] :]  # [1, seq]
+            greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]]  # [1, seq]
+            max_equal = (greedy_tokens == cont_toks).all()
+            res.append((float(loss.item()), bool(max_equal)))
+            pbar.update(1)
+        pbar.close()
+        return res
+
+    def flatten(self, input):
+        new_list = []
+        for i in input:
+            for j in i:
+                new_list.append(j)
+        return new_list
+
+    def generate_until(self, requests) -> List[str]:
+        res = []
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
+            # if self.task_dict[task][split][doc_id]["duration"] != "short":
+            #
+            #     res.append("A")
+            #     pbar.update(1)
+            #     continue
+            # encode, pad, and truncate contexts for this batch
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            visuals = self.flatten(visuals)
+
+            num_video_frames = self.model.config.num_video_frames
+            videos = []
+
+            if self.max_frames_num == 0:
+                images = [Image.new("RGB", (448, 448), (0, 0, 0))] * num_video_frames
+                video = process_images(images, self.model.image_processor, self.model.config).half().cuda()
+                videos.append(video)
+            else:
+                for visual in visuals:
+                    # images, video_loading_succeed = LazySupervisedDataset._load_video(visual, num_video_frames, self.model)
+
+                    if self.video_decode_backend == "decord":
+                        images = self.load_video(visual, num_video_frames)
+                    elif self.video_decode_backend == "pyav":
+                        images = read_video_pyav(visual, num_frm=num_video_frames)
+
+                    video = process_images(images, self.model.image_processor, self.model.config).half().cuda()
+                    videos.append(video)
+
+            qs = f"