huggingface · zucchini-nlp · Jul 17, 2025 · Jul 17, 2025
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -870,6 +870,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
         max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
         size = images_kwargs.get("size", None) or self.size
 
+        num_patches = num_rows = num_cols = 1
         if do_image_splitting:
             height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
             height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=4096)
@@ -891,7 +892,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
                 num_cols = math.ceil(resized_width / max_width)
                 num_patches = num_rows * num_cols + 1
 
-        return num_patches
+        return num_patches, num_rows, num_cols
 
 
 __all__ = ["Idefics3ImageProcessor"]
diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
@@ -503,5 +503,47 @@ def to_dict(self):
         encoder_dict.pop("return_row_col_info", None)
         return encoder_dict
 
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns number of image patches for a given image size.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*)
+                Any kwargs to override defaults of the image processor.
+        Returns:
+            `int`: Number of patches per image.
+        """
+        do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
+        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
+        size = images_kwargs.get("size", None) or self.size
+
+        num_patches = num_rows = num_cols = 1
+        if do_image_splitting:
+            height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
+            height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)
+            aspect_ratio = width / height
+
+            if width >= height:
+                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int(width / aspect_ratio)
+                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+            elif height > width:
+                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int(height * aspect_ratio)
+                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+
+            max_height = max_width = max_image_size["longest_edge"]
+            if resized_height > max_height or resized_width > max_width:
+                # Calculate the number of splits
+                num_rows = math.ceil(resized_height / max_height)
+                num_cols = math.ceil(resized_width / max_width)
+                num_patches = num_rows * num_cols + 1
+
+        return num_patches, num_rows, num_cols
+
 
 __all__ = ["Idefics3ImageProcessorFast"]
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
@@ -16,7 +16,6 @@
 Processor class for Idefics3.
 """
 
-import math
 import re
 from itertools import accumulate
 from typing import TYPE_CHECKING, Optional, Union
@@ -390,19 +389,20 @@ def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
             images_kwargs = Idefics3ProcessorKwargs._defaults.get("images_kwargs", {})
             images_kwargs.update(kwargs)
 
-            num_image_patches = [
+            num_image_row_cols = [
                 self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                 for image_size in image_sizes
             ]
 
             base_image_length = self.image_seq_len + 3
             col_length = self.image_seq_len + 2
             num_image_tokens = []
+            num_image_patches = []
 
-            for num_patches in num_image_patches:
-                num_cols = num_rows = int(math.sqrt(num_patches - 1))
+            for num_patches, num_rows, num_cols in num_image_row_cols:
                 row_length = col_length * num_cols + 1
                 num_image_tokens.append(base_image_length + (row_length * num_rows))
+                num_image_patches.append(num_patches)
 
             vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
 

diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py
@@ -867,6 +867,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
         max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
         size = images_kwargs.get("size", None) or self.size
 
+        num_patches = num_rows = num_cols = 1
         if do_image_splitting:
             height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
             height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=4096)
@@ -888,7 +889,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
                 num_cols = math.ceil(resized_width / max_width)
                 num_patches = num_rows * num_cols + 1
 
-        return num_patches
+        return num_patches, num_rows, num_cols
 
 
 __all__ = ["SmolVLMImageProcessor"]
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
@@ -493,5 +493,47 @@ def to_dict(self):
         encoder_dict.pop("return_row_col_info", None)
         return encoder_dict
 
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns number of image patches for a given image size.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*)
+                Any kwargs to override defaults of the image processor.
+        Returns:
+            `int`: Number of patches per image.
+        """
+        do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
+        max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
+        size = images_kwargs.get("size", None) or self.size
+
+        num_patches = num_rows = num_cols = 1
+        if do_image_splitting:
+            height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=size["longest_edge"])
+            height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE)
+            aspect_ratio = width / height
+
+            if width >= height:
+                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int(width / aspect_ratio)
+                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+            elif height > width:
+                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int(height * aspect_ratio)
+                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+
+            max_height = max_width = max_image_size["longest_edge"]
+            if resized_height > max_height or resized_width > max_width:
+                # Calculate the number of splits
+                num_rows = math.ceil(resized_height / max_height)
+                num_cols = math.ceil(resized_width / max_width)
+                num_patches = num_rows * num_cols + 1
+
+        return num_patches, num_rows, num_cols
+
 
 __all__ = ["SmolVLMImageProcessorFast"]