Skip to content

[idefics3] fix for vLLM #39470

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
size = images_kwargs.get("size", None) or self.size

num_patches = num_rows = num_cols = 1
if do_image_splitting:
height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=4096)
Expand All @@ -891,7 +892,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
num_cols = math.ceil(resized_width / max_width)
num_patches = num_rows * num_cols + 1

return num_patches
return num_patches, num_rows, num_cols


__all__ = ["Idefics3ImageProcessor"]
42 changes: 42 additions & 0 deletions src/transformers/models/idefics3/image_processing_idefics3_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,5 +503,47 @@ def to_dict(self):
encoder_dict.pop("return_row_col_info", None)
return encoder_dict

def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None) -> tuple[int, int, int]:
    """
    A utility that returns the number of image patches, together with the split grid, for a given image size.

    Args:
        height (`int`):
            Height of the input image.
        width (`int`):
            Width of the input image.
        images_kwargs (`dict`, *optional*):
            Any kwargs to override defaults of the image processor. The keys read here are
            `do_image_splitting`, `max_image_size` and `size`.

    Returns:
        `tuple[int, int, int]`: `(num_patches, num_rows, num_cols)`. Without splitting this is `(1, 1, 1)`;
        when the image is split, `num_patches` is `num_rows * num_cols + 1`.
    """
    # `images_kwargs` defaults to None, so it must be normalised before any `.get` call.
    images_kwargs = images_kwargs if images_kwargs is not None else {}

    # A plain `or` fallback would discard an explicit `do_image_splitting=False` override,
    # so only fall back to the processor default when the key is missing or None.
    do_image_splitting = images_kwargs.get("do_image_splitting")
    if do_image_splitting is None:
        do_image_splitting = self.do_image_splitting
    max_image_size = images_kwargs.get("max_image_size") or self.max_image_size
    size = images_kwargs.get("size") or self.size

    num_patches = num_rows = num_cols = 1
    if do_image_splitting:
        height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
        height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)

        # Each side is rounded up to a whole number of tiles; the result is the same for
        # landscape and portrait inputs, so no orientation branch is needed.
        tile_edge = max_image_size["longest_edge"]
        grid_rows = math.ceil(height / tile_edge)
        grid_cols = math.ceil(width / tile_edge)

        # Only split when the rounded-up image is larger than a single tile.
        if grid_rows > 1 or grid_cols > 1:
            num_rows = grid_rows
            num_cols = grid_cols
            # NOTE(review): the extra patch is presumably the global (unsplit) image —
            # confirm against the preprocessing path.
            num_patches = num_rows * num_cols + 1

    return num_patches, num_rows, num_cols


__all__ = ["Idefics3ImageProcessorFast"]
8 changes: 4 additions & 4 deletions src/transformers/models/idefics3/processing_idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
Processor class for Idefics3.
"""

import math
import re
from itertools import accumulate
from typing import TYPE_CHECKING, Optional, Union
Expand Down Expand Up @@ -390,19 +389,20 @@ def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
images_kwargs = Idefics3ProcessorKwargs._defaults.get("images_kwargs", {})
images_kwargs.update(kwargs)

num_image_patches = [
num_image_row_cols = [
self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
for image_size in image_sizes
]

base_image_length = self.image_seq_len + 3
col_length = self.image_seq_len + 2
num_image_tokens = []
num_image_patches = []

for num_patches in num_image_patches:
num_cols = num_rows = int(math.sqrt(num_patches - 1))
for num_patches, num_rows, num_cols in num_image_row_cols:
row_length = col_length * num_cols + 1
num_image_tokens.append(base_image_length + (row_length * num_rows))
num_image_patches.append(num_patches)

vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

Expand Down
3 changes: 2 additions & 1 deletion src/transformers/models/smolvlm/image_processing_smolvlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
size = images_kwargs.get("size", None) or self.size

num_patches = num_rows = num_cols = 1
if do_image_splitting:
height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=4096)
Expand All @@ -888,7 +889,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
num_cols = math.ceil(resized_width / max_width)
num_patches = num_rows * num_cols + 1

return num_patches
return num_patches, num_rows, num_cols


__all__ = ["SmolVLMImageProcessor"]
42 changes: 42 additions & 0 deletions src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,5 +493,47 @@ def to_dict(self):
encoder_dict.pop("return_row_col_info", None)
return encoder_dict

def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None) -> tuple[int, int, int]:
    """
    A utility that returns the number of image patches, together with the split grid, for a given image size.

    Args:
        height (`int`):
            Height of the input image.
        width (`int`):
            Width of the input image.
        images_kwargs (`dict`, *optional*):
            Any kwargs to override defaults of the image processor. The keys read here are
            `do_image_splitting`, `max_image_size` and `size`.

    Returns:
        `tuple[int, int, int]`: `(num_patches, num_rows, num_cols)`. Without splitting this is `(1, 1, 1)`;
        when the image is split, `num_patches` is `num_rows * num_cols + 1`.
    """
    # `images_kwargs` defaults to None, so it must be normalised before any `.get` call.
    if images_kwargs is None:
        images_kwargs = {}

    # Honour an explicit `do_image_splitting=False`: an `or` fallback would replace it with
    # the processor default, so the default is used only when the key is missing or None.
    do_image_splitting = images_kwargs.get("do_image_splitting")
    if do_image_splitting is None:
        do_image_splitting = self.do_image_splitting
    max_image_size = images_kwargs.get("max_image_size") or self.max_image_size
    size = images_kwargs.get("size") or self.size

    num_patches = num_rows = num_cols = 1
    if do_image_splitting:
        height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
        height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)

        # Each side is rounded up to a whole number of tiles; orientation does not affect
        # the result, so landscape and portrait inputs share one code path.
        tile_edge = max_image_size["longest_edge"]
        grid_rows = math.ceil(height / tile_edge)
        grid_cols = math.ceil(width / tile_edge)

        # Only split when the rounded-up image is larger than a single tile.
        if grid_rows > 1 or grid_cols > 1:
            num_rows = grid_rows
            num_cols = grid_cols
            # NOTE(review): the extra patch is presumably the global (unsplit) image —
            # confirm against the preprocessing path.
            num_patches = num_rows * num_cols + 1

    return num_patches, num_rows, num_cols


__all__ = ["SmolVLMImageProcessorFast"]