
Commit d9eb8e1

Merge branch 'vllm-project:main' into feat/command-tool-parser

2 parents: 13434bc + 5eaf570

22 files changed: +103 additions, -61 deletions

csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp

Lines changed: 3 additions & 3 deletions

@@ -153,7 +153,7 @@ struct ScaledEpilogueBias
       cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
 
   using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -210,7 +210,7 @@ struct ScaledEpilogueBiasAzp
       EVTComputeAzp>;
 
   using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -288,7 +288,7 @@ struct ScaledEpilogueBiasAzpToken
       EVTComputeAcc>;
 
   using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
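These EVT trees fuse the per-token/per-channel scales and a bias add into the GEMM epilogue; swapping cutlass::multiply_add for cutlass::homogeneous_multiply_add makes the visitor perform the multiply-add in one common type rather than with mixed operand types, which matters once the bias dtype differs from the float accumulator. The same substitution is applied to the SM90 (c3x) epilogues below. As a rough reference for what the fused tree computes — a minimal sketch, with per-token scale_a, per-channel scale_b, and a bf16 output assumed; the helper name and shapes are illustrative, not vLLM API:

import torch

def scaled_mm_epilogue_ref(accum: torch.Tensor, scale_a: torch.Tensor,
                           scale_b: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    # Compute0 = scale_b * accum (broadcast over rows), then the
    # multiply-add node computes scale_a * Compute0 + bias and rounds
    # to the output element type (bf16 assumed here for ElementD).
    # accum: [M, N], scale_a: [M, 1], scale_b: [1, N], bias: [N]
    return (scale_a * (scale_b * accum.float()) + bias.float()).to(torch.bfloat16)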

csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp

Lines changed: 4 additions & 4 deletions

@@ -195,7 +195,7 @@ struct ScaledEpilogueBias
       cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
 
   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -238,7 +238,7 @@ struct ScaledEpilogueColumnBias
       cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
 
   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -295,7 +295,7 @@ struct ScaledEpilogueBiasAzp
       cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAzp>;
 
   using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -371,7 +371,7 @@ struct ScaledEpilogueBiasAzpToken
       cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAcc>;
 
   using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:

csrc/quantization/cutlass_w8a8/moe/moe_data.cu

Lines changed: 4 additions & 4 deletions

@@ -7,7 +7,7 @@
 
 constexpr uint64_t THREADS_PER_EXPERT = 512;
 
-__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
                                       int32_t* problem_sizes1,
                                       int32_t* problem_sizes2,
                                       int32_t* atomic_buffer,
@@ -62,7 +62,7 @@ __global__ void compute_expert_blockscale_offsets(
   }
 }
 
-__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
                                   const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,
@@ -103,7 +103,7 @@ void get_cutlass_moe_mm_data_caller(
 
   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
   compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<int32_t*>(problem_sizes1.data_ptr()),
      static_cast<int32_t*>(problem_sizes2.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
@@ -120,7 +120,7 @@ void get_cutlass_moe_mm_data_caller(
         static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
   }
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
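With topk_ids now read as int32, the kernel signatures match the signed int32 index tensors the Python side produces, instead of reinterpreting the same bits as uint32. A minimal caller-side guard in that spirit — a hypothetical helper, not part of this diff:

import torch

def ensure_int32_topk_ids(topk_ids: torch.Tensor) -> torch.Tensor:
    # The CUDA kernels above assume int32 topk_ids; torch.topk returns
    # int64 indices, so cast explicitly rather than reinterpret bits.
    if topk_ids.dtype != torch.int32:
        topk_ids = topk_ids.to(torch.int32)
    return topk_ids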

requirements/tpu.txt

Lines changed: 5 additions & 5 deletions

@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250618
-torchvision==0.23.0.dev20250618
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.9.0.dev20250703
+torchvision==0.24.0.dev20250703
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
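The pins move torch and torchvision to the 2.9.0.dev20250703 / 0.24.0.dev20250703 nightlies, and the torch_xla wheels to the matching 20250703 dev builds (note the torch_xla wheel versioning stays on the 2.8.0 series). A quick post-install sanity check, assuming a TPU VM where these requirements apply:

import torch
import torch_xla

# Expected strings come from the pins above; other nightlies will differ.
print(torch.__version__)      # 2.9.0.dev20250703
print(torch_xla.__version__)  # 2.8.0.dev20250703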

tests/models/language/pooling/mteb_utils.py

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
 MTEB_RERANK_LANGS = ["en"]
-MTEB_RERANK_TOL = 1e-3
+MTEB_RERANK_TOL = 2e-3
 
 
 class VllmMtebEncoder(mteb.Encoder):
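MTEB_RERANK_TOL is loosened from 1e-3 to 2e-3, presumably to absorb small score drift across the dependency bumps in this merge. A sketch of how such a tolerance is typically applied — illustrative names; the real assertion lives in the rerank test helpers:

def assert_score_close(main_score: float, baseline: float,
                       tol: float = 2e-3) -> None:
    # Compare a measured MTEB main score against the model's recorded
    # baseline, allowing MTEB_RERANK_TOL of absolute drift.
    assert abs(main_score - baseline) < tol, (
        f"score {main_score} deviates from baseline {baseline} "
        f"by more than {tol}")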

tests/models/language/pooling/test_baai.py

Lines changed: 0 additions & 1 deletion

@@ -68,7 +68,6 @@
                     enable_test=False),
     RerankModelInfo("BAAI/bge-reranker-v2-m3",
                     architecture="XLMRobertaForSequenceClassification",
-                    dtype="float32",
                     enable_test=False)
 ]
 
tests/models/language/pooling/test_jina.py

Lines changed: 2 additions & 5 deletions

@@ -18,11 +18,8 @@
 ]
 
 RERANK_MODELS = [
-    RerankModelInfo(
-        "jinaai/jina-reranker-v2-base-multilingual",
-        architecture="XLMRobertaForSequenceClassification",
-        dtype="float32",
-    )
+    RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual",
+                    architecture="XLMRobertaForSequenceClassification")
 ]
 
 
tests/models/language/pooling/test_qwen3_reranker.py

Lines changed: 0 additions & 2 deletions

@@ -12,11 +12,9 @@
 RERANK_MODELS = [
     RerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
                     architecture="Qwen3ForSequenceClassification",
-                    dtype="float32",
                     enable_test=True),
     RerankModelInfo("Qwen/Qwen3-Reranker-4B",
                     architecture="Qwen3ForSequenceClassification",
-                    dtype="float32",
                     enable_test=False)
 ]
 
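This file and the two preceding ones (test_baai.py, test_jina.py) drop the explicit dtype="float32" override from their RerankModelInfo entries, so these models fall back to the dataclass default. A sketch of the relevant fields, with the default assumed to be "auto" — the real definition lives in the pooling test utilities:

from dataclasses import dataclass

@dataclass
class RerankModelInfo:
    name: str
    architecture: str = ""
    dtype: str = "auto"        # the entries above now rely on this default
    enable_test: bool = True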

tests/v1/tpu/test_basic.py

Lines changed: 1 addition & 0 deletions

@@ -67,6 +67,7 @@ def test_basic(
     assert "1024" in output or "0, 1" in output
 
 
+@pytest.mark.skip(reason="Temporarily disabled due to timeout")
 @pytest.mark.skipif(not current_platform.is_tpu(),
                     reason="This is a basic test for TPU only")
 @pytest.mark.parametrize("max_tokens", [8])
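pytest.mark.skip is unconditional, so the decorated test is collected but never run, regardless of the skipif and parametrize markers stacked underneath; removing the one marker re-enables it. A minimal illustration:

import pytest

@pytest.mark.skip(reason="Temporarily disabled due to timeout")
def test_example():
    ...  # reported as skipped; the body never executes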

vllm/lora/ops/triton_ops/lora_expand_op.py

Lines changed: 2 additions & 0 deletions

@@ -13,6 +13,7 @@
 
 from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel
 from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr
+from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 
 
@@ -283,6 +284,7 @@ def _lora_expand_fake(
     op_func=_lora_expand,
     mutates_args=["output_tensor"],
     fake_impl=_lora_expand_fake,
+    dispatch_key=current_platform.dispatch_key,
 )
 lora_expand = torch.ops.vllm.lora_expand
 
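Without an explicit dispatch_key, direct_register_custom_op appears to register the op under a default (CUDA) key; passing current_platform.dispatch_key registers it under whatever platform vLLM is running on, so torch.ops.vllm.lora_expand dispatches correctly off-CUDA as well. A toy registration following the same pattern — the no-op op below is illustrative only, not part of vLLM:

import torch

from vllm.platforms import current_platform
from vllm.utils import direct_register_custom_op

def _noop(x: torch.Tensor) -> None:
    pass

def _noop_fake(x: torch.Tensor) -> None:
    pass

direct_register_custom_op(
    op_name="my_noop",
    op_func=_noop,
    mutates_args=[],
    fake_impl=_noop_fake,
    dispatch_key=current_platform.dispatch_key,  # e.g. "CUDA", "CPU", "XPU"
)
# torch.ops.vllm.my_noop(x)  # x must live on the active platform's device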
