Merge branch 'main' into internal_main_dev

EvolvingLMMs-Lab · Aug 7, 2024 · 17a3028 · 17a3028
2 parents 858ecf1 + 3d4884a
commit 17a3028
Show file tree

Hide file tree

Showing 45 changed files with 1,790 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -11,7 +11,8 @@
 ---
 
 ## Announcement
-
+- [2024-07] 🎉🎉 We welcome the new tasks [LongVideoBench](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/117) and [MMStar](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/158), and the new model [Mantis](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/162).
+- [2024-07] 🎉🎉 We have released the [technical report](https://arxiv.org/abs/2407.12772) and [LiveBench](https://huggingface.co/spaces/lmms-lab/LiveBench)! 
 - [2024-07] 👨‍💻👨‍💻 The `lmms-eval/v0.2.1` has been upgraded to support more models, including [LongVA](https://github.com/EvolvingLMMs-Lab/LongVA), [InternVL-2](https://github.com/OpenGVLab/InternVL), [VILA](https://github.com/NVlabs/VILA), and many more evaluation tasks, e.g. [Details Captions](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/136), [MLVU](https://arxiv.org/abs/2406.04264), [WildVision-Bench](https://huggingface.co/datasets/WildVision/wildvision-arena-data), [VITATECS](https://github.com/lscpku/VITATECS) and [LLaVA-Interleave-Bench](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/).
 
 - [2024-06] 🎬🎬 The `lmms-eval/v0.2.0` has been upgraded to support video evaluations for video models like LLaVA-NeXT Video and Gemini 1.5 Pro across tasks such as EgoSchema, PerceptionTest, VideoMME, and more. Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.2/) for more details
@@ -165,6 +166,18 @@ python3 -m accelerate.commands.launch \
 python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval --config ./miscs/example_eval.yaml
 ```
 
+**Evaluation of video model (llava-next-video-32B)**
+```bash
+accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \
+    --model llavavid \
+    --model_args pretrained=lmms-lab/LLaVA-NeXT-Video-32B-Qwen,conv_template=qwen_1_5,video_decode_backend=decord,max_frames_num=32,mm_spatial_pool_mode=average,mm_newline_position=grid,mm_resampler_location=after \
+    --tasks videomme \
+    --batch_size 1 \
+    --log_samples \
+    --log_samples_suffix llava_vid_32B \
+    --output_path ./logs/
+```
+
 **Evaluation with naive model sharding for bigger model (llava-next-72b)**
 
 ```bash
@@ -199,7 +212,7 @@ Please check [supported models](lmms_eval/models/__init__.py) for more details.
 
 ### Supported tasks
 
-Please check [supported tasks](lmms_eval/docs/current_tasks.md) for more details.
+Please check [supported tasks](docs/current_tasks.md) for more details.
 
 ## Add Customized Model and Dataset
 

diff --git a/lmms_eval/models/internvl.py b/lmms_eval/models/internvl.py
@@ -449,7 +449,10 @@ def _collate(x):
             split = split[0]
             batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]  # [B, N]
             flattened_visuals = self.flatten(batched_visuals)
-            pixel_values = self.load_image(flattened_visuals, self.image_size).cuda().to(torch.bfloat16)
+            try:
+                pixel_values = self.load_image(flattened_visuals, self.image_size).cuda().to(torch.bfloat16)
+            except IndexError:
+                pixel_values = None
             gen_kwargs = all_gen_kwargs[0]
 
             if "max_new_tokens" not in gen_kwargs:

diff --git a/lmms_eval/models/internvl2.py b/lmms_eval/models/internvl2.py
@@ -145,6 +145,7 @@ def __init__(
 
         accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
         accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+        self.accelerator = accelerator
         if accelerator.num_processes > 1:
             self._device = torch.device(f"cuda:{accelerator.local_process_index}")
             self.device_map = f"cuda:{accelerator.local_process_index}"
@@ -251,13 +252,16 @@ def generate_until(self, requests) -> List[str]:
             visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
             visuals = self.flatten(visuals)
             if self.modality == "image":
-                visuals = [load_image(visual).to(torch.bfloat16).cuda() for visual in visuals]
-                pixel_values = torch.cat(visuals, dim=0)
-                num_patches_list = [visual.size(0) for visual in visuals]
                 if visuals:
+                    visuals = [load_image(visual).to(torch.bfloat16).cuda() for visual in visuals]
+                    pixel_values = torch.cat(visuals, dim=0)
+                    num_patches_list = [visual.size(0) for visual in visuals]
                     image_tokens = ["<image>"] * len(visuals)
                     image_tokens = " ".join(image_tokens)
                     contexts = image_tokens + "\n" + contexts
+                else:
+                    pixel_values = None
+                    num_patches_list = None
                 response, history = self.model.chat(self.tokenizer, pixel_values, contexts, gen_kwargs, num_patches_list=num_patches_list, history=None, return_history=True)
             elif self.modality == "video":
                 assert len(visuals) == 1, f"Only one video is supported, but got {len(visuals)} videos."

diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py
@@ -69,6 +69,7 @@ def __init__(
 
         accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
         accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+        self.accelerator = accelerator
         if accelerator.num_processes > 1:
             self._device = torch.device(f"cuda:{accelerator.local_process_index}")
             self.device_map = f"cuda:{accelerator.local_process_index}"

diff --git a/lmms_eval/models/llava_hf.py b/lmms_eval/models/llava_hf.py
@@ -52,6 +52,7 @@ def __init__(
         device_map: str = "",
         chat_template: Optional[str] = None,
         use_cache: bool = True,
+        specified_eot_token_id: Optional[int] = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -85,6 +86,7 @@ def __init__(
         self.batch_size_per_gpu = int(batch_size)
         self.chat_template = chat_template
         self.use_cache = use_cache
+        self.specified_eot_token_id = specified_eot_token_id
         if accelerator.num_processes > 1 and device_map == "":
             assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
             # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
@@ -316,18 +318,13 @@ def _collate(x):
                     max_new_tokens=gen_kwargs["max_new_tokens"],
                     use_cache=self.use_cache,
                     pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.specified_eot_token_id,
                 )
+                cont = cont[:, inputs["input_ids"].shape[-1]:]
             except Exception as e:
                 eval_logger.error(f"Error {e} in generating")
                 cont = ""
             text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
-            if "1.5" in self.pretrained:
-                text_outputs = text_outputs.split("ASSISTANT:")[-1].strip()
-            elif "mistral" in self.pretrained:
-                text_outputs = text_outputs.split("[/INST]")[-1].strip()
-            else:
-                text_outputs = text_outputs.split("ASSISTANT:")[-1].strip()
-
             if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
                 eval_logger.debug(f"Generated text for doc ID {doc_id[0]}:\n\n{text_outputs}\n")
 

diff --git a/lmms_eval/models/llava_vid.py b/lmms_eval/models/llava_vid.py
@@ -95,14 +95,15 @@ def __init__(
         self.max_frames_num = int(max_frames_num)
         self.mm_resampler_location = mm_pooling_position
         self.delay_load = delay_load
+
         if self.overwrite == True:
             overwrite_config = {}
             overwrite_config["mm_resampler_type"] = self.mm_resampler_type
             overwrite_config["mm_spatial_pool_stride"] = self.mm_spatial_pool_stride
             overwrite_config["mm_spatial_pool_out_channels"] = self.mm_spatial_pool_out_channels
             overwrite_config["mm_spatial_pool_mode"] = self.mm_spatial_pool_mode
             overwrite_config["mm_pooling_position"] = self.mm_resampler_location
-            overwrite_config["mm_newline_position"] = mm_newline_position
+            overwrite_config["mm_newline_position"] = self.mm_newline_position
             overwrite_config["add_faster_video"] = False
             overwrite_config["delay_load"] = self.delay_load
             # overwrite_config["attn_implementation"] = attn_implementation
@@ -128,7 +129,7 @@ def __init__(
                 self._tokenizer = AutoTokenizer.from_pretrained(pretrained, use_fast=False)
                 cfg_pretrained = LlavaConfig.from_pretrained(pretrained)
                 if overwrite_config is not None:
-                    print(f"Overwriting config with {overwrite_config}")
+                    eval_logger.info(f"Overwriting config with {overwrite_config}")
                     for k, v in overwrite_config.items():
                         setattr(cfg_pretrained, k, v)
                 kwargs["torch_dtype"] = torch.float16

diff --git a/lmms_eval/models/longva.py b/lmms_eval/models/longva.py
@@ -76,6 +76,7 @@ def __init__(
 
         accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
         accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+        self.accelerator = accelerator
         if accelerator.num_processes > 1:
             self._device = torch.device(f"cuda:{accelerator.local_process_index}")
             self.device_map = f"cuda:{accelerator.local_process_index}"