
Update Qwen2_VL class to disable custom video loader by default and remove unused image handling code
pufanyi committed Jan 14, 2025
1 parent bb44714 commit 8811469
Showing 1 changed file with 8 additions and 18 deletions.
26 changes: 8 additions & 18 deletions lmms_eval/models/qwen2_vl.py
@@ -43,9 +43,9 @@ def __init__(
max_pixels: int = 1605632,
min_pixels: int = 3136,
max_num_frames: int = 10,
- use_custom_video_loader: Optional[bool] = True,
+ use_custom_video_loader: Optional[bool] = False,
fps: Optional[float] = None, # Only applicable if use_custom_video_loader is True
- max_image_size: Optional[int] = 1024, # Only applicable if use_custom_video_loader is True
+ max_image_size: Optional[int] = None, # Only applicable if use_custom_video_loader is True
continual_mode: bool = False,
response_persistent_folder: str = None, # We will cache the Gemini API response in this path and use it for future requests
**kwargs,
@@ -91,11 +91,6 @@ def __init__(
self.max_pixels = max_pixels
self.min_pixels = min_pixels
self.max_num_frames = max_num_frames
- self.processor = AutoProcessor.from_pretrained(pretrained, max_pixels=max_pixels, min_pixels=min_pixels)
- self.max_pixels = max_pixels
- self.min_pixels = min_pixels
- self.max_num_frames = max_num_frames
- self._tokenizer = AutoTokenizer.from_pretrained(pretrained)

self._config = self.model.config
self.batch_size_per_gpu = int(batch_size)
@@ -248,16 +243,18 @@ def _collate(x):
contexts[i] = contexts[i].replace("<image 1>", "<image>")
if "\\<image 1\\>" in contexts[i]:
contexts[i] = contexts[i].replace("\\<image 1\\>", "<image>")
- print(contexts[i])
+ if "<image>" in contexts[i]:
+ contexts[i] = contexts[i].replace("<image>", "")
+ # print(contexts[i])

messages = []
processed_visuals = []
for i, context in enumerate(contexts):
context += "\nPlease think step by step."

if "<image>" in context:
context = context.split("<image>")
assert len(context) == 2, f"Expected 2 parts in context but got {len(context)}"
# if "<image>" in context:
# context = context.split("<image>")
# assert len(context) == 2, f"Expected 2 parts in context but got {len(context)}"

message = [{"role": "system", "content": "You are a helpful assistant."}]

@@ -267,13 +264,6 @@ def _collate(x):
if self.use_custom_video_loader:
visual = read_video_pyav_base64(visual, num_frm=self.max_num_frames, fps=self.fps, img_format="JPEG", max_image_size=self.max_image_size)
image_contents = list(map(lambda x: f"data:image/jpeg;base64,{x}", visual))
- if len(context) == 2:
- message.append(
- {"role": "user", "content": [{"type": "video", "video": image_contents[:-1]}, {"type": "text", "text": context[0]}, {"type": "image", "image": image_contents[-1]}, {"type": "text", "text": context[1]}]}
- )
- import json
-
- print("message", json.dumps(context, indent=4))
message.append({"role": "user", "content": [{"type": "video", "video": image_contents}, {"type": "text", "text": context}]})
else:
vr = decord.VideoReader(visual)
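Note: after this commit, videos are decoded through the decord-based path in the else branch by default rather than through read_video_pyav_base64, and the custom loader's max_image_size default drops from 1024 to None. Below is a minimal sketch of opting back into the previous behaviour; only the keyword names come from the __init__ signature in the diff, while the checkpoint name and fps value are illustrative assumptions (in practice these would normally be supplied through lmms-eval's model_args rather than by constructing the class directly).

# Sketch: re-enabling the custom PyAV video loader after this commit.
# The checkpoint name and fps value are placeholders, not taken from the diff.
from lmms_eval.models.qwen2_vl import Qwen2_VL

model = Qwen2_VL(
    pretrained="Qwen/Qwen2-VL-7B-Instruct",  # assumed checkpoint
    max_num_frames=10,
    use_custom_video_loader=True,  # new default is False
    fps=1.0,                       # only read by the custom loader
    max_image_size=1024,           # restores the previous default cap
)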
