[Model] Support VLMs with transformers backend #20543

Merged
merged 28 commits on Jul 20, 2025
Commits (28), showing changes from 18 commits
aa2ebec
add vision LLMs
zucchini-nlp Jul 7, 2025
75d8ca7
fix param dtype
Isotr0py Jun 16, 2025
5da4358
v0 backward compatibility
Isotr0py Jun 18, 2025
f71449c
Merge branch 'main' into vlm-transformers-v2
Isotr0py Jul 15, 2025
f6458bc
fix out-of-date signature
Isotr0py Jul 15, 2025
cc1b223
revert auto mm preprocessor
Isotr0py Jul 15, 2025
7efbdda
address test.txt
Isotr0py Jul 15, 2025
2918b6b
fix typo and make pre-commiter happy
Isotr0py Jul 15, 2025
e7e6869
add transformers fallback test with image input
Isotr0py Jul 15, 2025
97a1e2b
fix failing test
Isotr0py Jul 16, 2025
637639e
Update vllm/model_executor/model_loader/utils.py
zucchini-nlp Jul 17, 2025
82b088b
update docs
zucchini-nlp Jul 17, 2025
69164c2
fix text-only model fallback
Isotr0py Jul 17, 2025
11043b3
reduce qwen2.5vl test image size and update comment
Isotr0py Jul 17, 2025
162fb1e
Merge remote-tracking branch 'upstream/main' into vlm-transformers-v2
Isotr0py Jul 17, 2025
f25d2cb
address comments
zucchini-nlp Jul 17, 2025
ed7de1e
delete get hf inputs
zucchini-nlp Jul 17, 2025
d5ecec2
style
zucchini-nlp Jul 17, 2025
2a1cc73
Only call `inspect_model_cls` once in `ModelConfig.__post_init__`
hmellor Jul 18, 2025
5c61d3c
Add correct Transformers backend class to `hf_config.architectures`
hmellor Jul 18, 2025
606bc50
Don't automatically add all Transformers backend classes in `_normali…
hmellor Jul 18, 2025
21b8abe
Make condition clearer
hmellor Jul 18, 2025
5e262e8
Clean up `resolve_transformers_arch`
hmellor Jul 18, 2025
091aeb4
Merge branch 'main' into vlm-transformers-v2
Isotr0py Jul 19, 2025
79b4aab
disable oom qwen2_5vl test
Isotr0py Jul 19, 2025
8b556eb
fix nonetype architectures
Isotr0py Jul 19, 2025
205be1d
Merge remote-tracking branch 'upstream/main' into vlm-transformers-v2
Isotr0py Jul 19, 2025
f9dca5c
fix pooling models auto conversion
Isotr0py Jul 20, 2025
9 changes: 6 additions & 3 deletions docs/models/supported_models.md
@@ -18,7 +18,7 @@ These models are what we list in [supported-text-models][supported-text-models]

### Transformers

vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs, and require setting `--disable_mm_preprocessor_cache` when running. Support for video inputs and caching of multi-modal preprocessors will be added in future releases.

To check if the modeling backend is Transformers, you can simply do this:

@@ -28,14 +28,17 @@ llm = LLM(model=..., task="generate") # Name or path of your model
llm.apply_model(lambda model: print(type(model)))
```

If it is `TransformersForCausalLM` then it means it's based on Transformers!
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!

!!! tip
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
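
For example, a minimal offline-inference sketch of the above for a vision-language model might look like this (the model is taken from this PR's tests; the prompt format and image are illustrative and depend on the model's chat template):

```python
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Model taken from this PR's tests; any Transformers-compatible VLM should work similarly.
llm = LLM(
    model="OpenGVLab/InternVL3-1B-hf",
    model_impl="transformers",
    disable_mm_preprocessor_cache=True,  # currently required for VLMs, see above
)

# Prompt format follows the model's chat template; <IMG_CONTEXT> marks the image slot.
prompt = ("<|im_start|>User\n<IMG_CONTEXT>\nDescribe this image.<|im_end|>\n"
          "<|im_start|>Assistant\n")
image = ImageAsset("stop_sign").pil_image

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```

On the server side, the equivalent flags are `--model-impl transformers` and `--disable_mm_preprocessor_cache`.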

!!! note
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.

!!! note
    For vision-language models loaded with `dtype="auto"`, vLLM loads the whole model using the config's `dtype` if it exists. In contrast, native Transformers respects the `dtype` attribute of each backbone in the model. This might cause a slight difference in performance.
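
If you want the load dtype to be explicit rather than inherited from the config, you can pass one directly (a sketch; `"bfloat16"` is just an example value):

```python
from vllm import LLM

# Loads the whole model in the given dtype instead of the config's dtype.
llm = LLM(model="OpenGVLab/InternVL3-1B-hf", model_impl="transformers", dtype="bfloat16")
```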

#### Custom models

If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
@@ -99,7 +102,7 @@ Here is what happens in the background when this model is loaded:

1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.

That's it!
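
For illustration, a rough sketch of those three steps (not the actual vLLM code path; the repo name and the `AutoModel` entry in `auto_map` are hypothetical):

```python
from transformers import AutoConfig
from transformers.dynamic_module_utils import get_class_from_dynamic_module

# 1. The config is loaded (trust_remote_code is needed for custom-code repos).
config = AutoConfig.from_pretrained("my-org/my-model", trust_remote_code=True)

# 2. The model class is resolved from `auto_map` and checked for compatibility.
auto_map = getattr(config, "auto_map", None) or {}
model_cls = get_class_from_dynamic_module(auto_map["AutoModel"], "my-org/my-model")
assert model_cls.is_backend_compatible()

# 3. Attention is routed through vLLM's attention layer.
config._attn_implementation = "vllm"
```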

75 changes: 75 additions & 0 deletions tests/models/multimodal/generation/test_common.py
@@ -35,6 +35,8 @@
REQUIRES_V0_MODELS = [
    # V1 Test: not enough KV cache space in CI.
"fuyu",
# V1 Test: Deadlock issue when processing mm_inputs
"llava-onevision-transformers",
]

# yapf: disable
@@ -170,6 +172,79 @@
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
    #### Transformers fallback models to test
    ## To reduce test burden, we only test batching with arbitrary image sizes
# Dynamic image length and number of patches
"llava-onevision-transformers": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
"disable_mm_preprocessor_cache": True,
"enable_prefix_caching": False,
},
marks=[pytest.mark.core_model],
),
# FIXME(Isotr0py): Enable this test after
    # https://github.com/huggingface/transformers/pull/39470 is released
# "idefics3-transformers": VLMTestInfo(
# models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
# img_idx_to_prompt=lambda idx: "<image>",
# max_model_len=8192,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
# image_size_factors=[(0.25, 0.5, 1.0)],
# vllm_runner_kwargs={
# "model_impl": "transformers",
# "disable_mm_preprocessor_cache": True,
# "enable_prefix_caching": False,
# },
# marks=[pytest.mark.core_model],
# ),
# Pixel values from processor are not 4D or 5D arrays
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={
"model_impl": "transformers",
"disable_mm_preprocessor_cache": True,
"enable_prefix_caching": False,
},
marks=[pytest.mark.core_model],
),
# Check "auto" with fallback to transformers
"internvl-transformers": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
max_model_len=4096,
use_tokenizer_eos=True,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "auto",
"disable_mm_preprocessor_cache": True,
"enable_prefix_caching": False,
},
auto_cls=AutoModelForImageTextToText,
marks=[pytest.mark.core_model],
),
#### Extended model tests
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],
1 change: 1 addition & 0 deletions tests/models/registry.py
@@ -484,6 +484,7 @@ def check_available_online(

_TRANSFORMERS_MODELS = {
"TransformersForCausalLM": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
"TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
}

_EXAMPLE_MODELS = {
5 changes: 5 additions & 0 deletions vllm/config.py
@@ -588,6 +588,11 @@ def __post_init__(self) -> None:
self.truncation_side = "right"

model_info, arch = self.registry.inspect_model_cls(self.architectures)
if (arch == "TransformersForMultimodalLM"
and self.hf_config == self.hf_text_config):
model_info, arch = self.registry.inspect_model_cls(
"TransformersForCausalLM")

self._model_info = model_info
self._architecture = arch

18 changes: 15 additions & 3 deletions vllm/model_executor/model_loader/utils.py
@@ -169,7 +169,7 @@ def device_loading_context(module: torch.nn.Module,
def resolve_transformers_arch(model_config: ModelConfig,
architectures: list[str]):
for i, arch in enumerate(architectures):
if arch == "TransformersForCausalLM":
if arch in ["TransformersForCausalLM", "TransformersForMultimodalLM"]:
continue
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
@@ -205,7 +205,13 @@ def resolve_transformers_arch(model_config: ModelConfig,
raise ValueError(
f"The Transformers implementation of {arch} is not "
"compatible with vLLM.")
architectures[i] = "TransformersForCausalLM"
            # Check if text-config is `self`. If not, it is most likely
            # a composite config, i.e. multimodal
if model_config.hf_config.get_text_config(
) != model_config.hf_config:
architectures[i] = "TransformersForMultimodalLM"
else:
architectures[i] = "TransformersForCausalLM"
if model_config.model_impl == ModelImpl.AUTO:
if not model_module.is_backend_compatible():
raise ValueError(
@@ -216,7 +222,13 @@
"%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and "
"performance may not be optimal.", arch)
architectures[i] = "TransformersForCausalLM"
            # Check if text-config is `self`. If not, it is most likely
            # a composite config, i.e. multimodal
if model_config.hf_config.get_text_config(
) != model_config.hf_config:
architectures[i] = "TransformersForMultimodalLM"
else:
architectures[i] = "TransformersForCausalLM"
return architectures
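
The multimodal-vs-text decision above relies on `PretrainedConfig.get_text_config()`, which returns the config itself for text-only models and a nested text config for composite (multimodal) configs. A small illustration of the same check outside vLLM (the model name is just an example):

```python
from transformers import AutoConfig

def uses_multimodal_backend(model_name: str) -> bool:
    """Mirror of the check above: composite configs map to TransformersForMultimodalLM."""
    config = AutoConfig.from_pretrained(model_name)
    return config.get_text_config() is not config

# A VLM such as "OpenGVLab/InternVL3-1B-hf" has a nested text config -> True;
# a text-only model returns its own config from get_text_config() -> False.
```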


7 changes: 5 additions & 2 deletions vllm/model_executor/models/registry.py
@@ -254,6 +254,7 @@
}

_TRANSFORMERS_MODELS = {
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
}
# yapf: enable
@@ -482,15 +483,17 @@ def _normalize_archs(

        # make sure the Transformers backend is put last as a fallback
if len(normalized_arch) != len(architectures):
normalized_arch.append("TransformersForCausalLM")
            # The order matters. If the CausalLM comes first, then checks for a
            # registered model in the MultiModalRegistry fail
normalized_arch.extend(
["TransformersForMultimodalLM", "TransformersForCausalLM"])
return normalized_arch

def inspect_model_cls(
self,
architectures: Union[str, list[str]],
) -> tuple[_ModelInfo, str]:
architectures = self._normalize_archs(architectures)

for arch in architectures:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None: