Skip to content

Commit

Permalink
Merge branch 'main' into internal_main_dev
Browse files Browse the repository at this point in the history
  • Loading branch information
Luodian authored Aug 7, 2024
2 parents 858ecf1 + 3d4884a commit 17a3028
Show file tree
Hide file tree
Showing 45 changed files with 1,790 additions and 23 deletions.
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
---

## Announcement

- [2024-07] 🎉🎉 We welcome the new tasks [LongVideoBench](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/117), [MMStar](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/158), new models [Mantis](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/162).
- [2024-07] 🎉🎉 We have released the [technical report](https://arxiv.org/abs/2407.12772) and [LiveBench](https://huggingface.co/spaces/lmms-lab/LiveBench)!
- [2024-07] 👨‍💻👨‍💻 The `lmms-eval/v0.2.1` has been upgraded to support more models, including [LongVA](https://github.com/EvolvingLMMs-Lab/LongVA), [InternVL-2](https://github.com/OpenGVLab/InternVL), [VILA](https://github.com/NVlabs/VILA), and many more evaluation tasks, e.g. [Details Captions](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/136), [MLVU](https://arxiv.org/abs/2406.04264), [WildVision-Bench](https://huggingface.co/datasets/WildVision/wildvision-arena-data), [VITATECS](https://github.com/lscpku/VITATECS) and [LLaVA-Interleave-Bench](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/).

- [2024-06] 🎬🎬 The `lmms-eval/v0.2.0` has been upgraded to support video evaluations for video models like LLaVA-NeXT Video and Gemini 1.5 Pro across tasks such as EgoSchema, PerceptionTest, VideoMME, and more. Please refer to the [blog](https://lmms-lab.github.io/posts/lmms-eval-0.2/) for more details.
Expand Down Expand Up @@ -165,6 +166,18 @@ python3 -m accelerate.commands.launch \
python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval --config ./miscs/example_eval.yaml
```

**Evaluation of video model (llava-next-video-32B)**
```bash
accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \
--model llavavid \
--model_args pretrained=lmms-lab/LLaVA-NeXT-Video-32B-Qwen,conv_template=qwen_1_5,video_decode_backend=decord,max_frames_num=32,mm_spatial_pool_mode=average,mm_newline_position=grid,mm_resampler_location=after \
--tasks videomme \
--batch_size 1 \
--log_samples \
--log_samples_suffix llava_vid_32B \
--output_path ./logs/
```

**Evaluation with naive model sharding for bigger model (llava-next-72b)**

```bash
Expand Down Expand Up @@ -199,7 +212,7 @@ Please check [supported models](lmms_eval/models/__init__.py) for more details.

### Supported tasks

Please check [supported tasks](lmms_eval/docs/current_tasks.md) for more details.
Please check [supported tasks](docs/current_tasks.md) for more details.

## Add Customized Model and Dataset

Expand Down
5 changes: 4 additions & 1 deletion lmms_eval/models/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,10 @@ def _collate(x):
split = split[0]
batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N]
flattened_visuals = self.flatten(batched_visuals)
pixel_values = self.load_image(flattened_visuals, self.image_size).cuda().to(torch.bfloat16)
try:
pixel_values = self.load_image(flattened_visuals, self.image_size).cuda().to(torch.bfloat16)
except IndexError:
pixel_values = None
gen_kwargs = all_gen_kwargs[0]

if "max_new_tokens" not in gen_kwargs:
Expand Down
10 changes: 7 additions & 3 deletions lmms_eval/models/internvl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def __init__(

accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
self.accelerator = accelerator
if accelerator.num_processes > 1:
self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.device_map = f"cuda:{accelerator.local_process_index}"
Expand Down Expand Up @@ -251,13 +252,16 @@ def generate_until(self, requests) -> List[str]:
visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
if self.modality == "image":
visuals = [load_image(visual).to(torch.bfloat16).cuda() for visual in visuals]
pixel_values = torch.cat(visuals, dim=0)
num_patches_list = [visual.size(0) for visual in visuals]
if visuals:
visuals = [load_image(visual).to(torch.bfloat16).cuda() for visual in visuals]
pixel_values = torch.cat(visuals, dim=0)
num_patches_list = [visual.size(0) for visual in visuals]
image_tokens = ["<image>"] * len(visuals)
image_tokens = " ".join(image_tokens)
contexts = image_tokens + "\n" + contexts
else:
pixel_values = None
num_patch_list = None
response, history = self.model.chat(self.tokenizer, pixel_values, contexts, gen_kwargs, num_patches_list=num_patches_list, history=None, return_history=True)
elif self.modality == "video":
assert len(visuals) == 1, f"Only one video is supported, but got {len(visuals)} videos."
Expand Down
1 change: 1 addition & 0 deletions lmms_eval/models/llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def __init__(

accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
self.accelerator = accelerator
if accelerator.num_processes > 1:
self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.device_map = f"cuda:{accelerator.local_process_index}"
Expand Down
11 changes: 4 additions & 7 deletions lmms_eval/models/llava_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(
device_map: str = "",
chat_template: Optional[str] = None,
use_cache: bool = True,
specified_eot_token_id: Optional[int] = None,
**kwargs,
) -> None:
super().__init__()
Expand Down Expand Up @@ -85,6 +86,7 @@ def __init__(
self.batch_size_per_gpu = int(batch_size)
self.chat_template = chat_template
self.use_cache = use_cache
self.specified_eot_token_id = specified_eot_token_id
if accelerator.num_processes > 1 and device_map == "":
assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
# If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
Expand Down Expand Up @@ -316,18 +318,13 @@ def _collate(x):
max_new_tokens=gen_kwargs["max_new_tokens"],
use_cache=self.use_cache,
pad_token_id=self.tokenizer.eos_token_id,
eos_token_id=self.specified_eot_token_id,
)
cont = cont[:, inputs["input_ids"].shape[-1]:]
except Exception as e:
eval_logger.error(f"Error {e} in generating")
cont = ""
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
if "1.5" in self.pretrained:
text_outputs = text_outputs.split("ASSISTANT:")[-1].strip()
elif "mistral" in self.pretrained:
text_outputs = text_outputs.split("[/INST]")[-1].strip()
else:
text_outputs = text_outputs.split("ASSISTANT:")[-1].strip()

if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
eval_logger.debug(f"Generated text for doc ID {doc_id[0]}:\n\n{text_outputs}\n")

Expand Down
5 changes: 3 additions & 2 deletions lmms_eval/models/llava_vid.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,15 @@ def __init__(
self.max_frames_num = int(max_frames_num)
self.mm_resampler_location = mm_pooling_position
self.delay_load = delay_load

if self.overwrite == True:
overwrite_config = {}
overwrite_config["mm_resampler_type"] = self.mm_resampler_type
overwrite_config["mm_spatial_pool_stride"] = self.mm_spatial_pool_stride
overwrite_config["mm_spatial_pool_out_channels"] = self.mm_spatial_pool_out_channels
overwrite_config["mm_spatial_pool_mode"] = self.mm_spatial_pool_mode
overwrite_config["mm_pooling_position"] = self.mm_resampler_location
overwrite_config["mm_newline_position"] = mm_newline_position
overwrite_config["mm_newline_position"] = self.mm_newline_position
overwrite_config["add_faster_video"] = False
overwrite_config["delay_load"] = self.delay_load
# overwrite_config["attn_implementation"] = attn_implementation
Expand All @@ -128,7 +129,7 @@ def __init__(
self._tokenizer = AutoTokenizer.from_pretrained(pretrained, use_fast=False)
cfg_pretrained = LlavaConfig.from_pretrained(pretrained)
if overwrite_config is not None:
print(f"Overwriting config with {overwrite_config}")
eval_logger.log(f"Overwriting config with {overwrite_config}")
for k, v in overwrite_config.items():
setattr(cfg_pretrained, k, v)
kwargs["torch_dtype"] = torch.float16
Expand Down
1 change: 1 addition & 0 deletions lmms_eval/models/longva.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def __init__(

accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
self.accelerator = accelerator
if accelerator.num_processes > 1:
self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.device_map = f"cuda:{accelerator.local_process_index}"
Expand Down
Loading

0 comments on commit 17a3028

Please sign in to comment.