
Commit d9eb8e1

Merge branch 'vllm-project:main' into feat/command-tool-parser

2 parents: 13434bc + 5eaf570

22 files changed: +103 additions, -61 deletions

csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp

Lines changed: 3 additions & 3 deletions

@@ -153,7 +153,7 @@ struct ScaledEpilogueBias
       cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
 
   using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -210,7 +210,7 @@ struct ScaledEpilogueBiasAzp
       EVTComputeAzp>;
 
   using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -288,7 +288,7 @@ struct ScaledEpilogueBiasAzpToken
       EVTComputeAcc>;
 
   using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
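These EVT trees fuse the per-token/per-channel scales and a bias add into the GEMM epilogue; swapping cutlass::multiply_add for cutlass::homogeneous_multiply_add makes the visitor perform the multiply-add in one common type rather than with mixed operand types, which matters once the bias dtype differs from the float accumulator. The same substitution is applied to the SM90 (c3x) epilogues below. As a rough reference for what the fused tree computes — a minimal sketch, with per-token scale_a, per-channel scale_b, and a bf16 output assumed; the helper name and shapes are illustrative, not vLLM API:

import torch

def scaled_mm_epilogue_ref(accum: torch.Tensor, scale_a: torch.Tensor,
                           scale_b: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    # Compute0 = scale_b * accum (broadcast over rows), then the
    # multiply-add node computes scale_a * Compute0 + bias and rounds
    # to the output element type (bf16 assumed here for ElementD).
    # accum: [M, N], scale_a: [M, 1], scale_b: [1, N], bias: [N]
    return (scale_a * (scale_b * accum.float()) + bias.float()).to(torch.bfloat16)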

csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp

Lines changed: 4 additions & 4 deletions

@@ -195,7 +195,7 @@ struct ScaledEpilogueBias
       cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
 
   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -238,7 +238,7 @@ struct ScaledEpilogueColumnBias
       cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
 
   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -295,7 +295,7 @@ struct ScaledEpilogueBiasAzp
       cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAzp>;
 
   using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:
@@ -371,7 +371,7 @@ struct ScaledEpilogueBiasAzpToken
       cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAcc>;
 
   using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;
 
  public:

csrc/quantization/cutlass_w8a8/moe/moe_data.cu

Lines changed: 4 additions & 4 deletions

@@ -7,7 +7,7 @@
 
 constexpr uint64_t THREADS_PER_EXPERT = 512;
 
-__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
                                       int32_t* problem_sizes1,
                                       int32_t* problem_sizes2,
                                       int32_t* atomic_buffer,
@@ -62,7 +62,7 @@ __global__ void compute_expert_blockscale_offsets(
   }
 }
 
-__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
                                   const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,
@@ -103,7 +103,7 @@ void get_cutlass_moe_mm_data_caller(
 
   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
   compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<int32_t*>(problem_sizes1.data_ptr()),
      static_cast<int32_t*>(problem_sizes2.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
@@ -120,7 +120,7 @@ void get_cutlass_moe_mm_data_caller(
         static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
   }
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
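With topk_ids now read as int32, the kernel signatures match the signed int32 index tensors the Python side produces, instead of reinterpreting the same bits as uint32. A minimal caller-side guard in that spirit — a hypothetical helper, not part of this diff:

import torch

def ensure_int32_topk_ids(topk_ids: torch.Tensor) -> torch.Tensor:
    # The CUDA kernels above assume int32 topk_ids; torch.topk returns
    # int64 indices, so cast explicitly rather than reinterpret bits.
    if topk_ids.dtype != torch.int32:
        topk_ids = topk_ids.to(torch.int32)
    return topk_ids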

requirements/tpu.txt

Lines changed: 5 additions & 5 deletions

@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250618
-torchvision==0.23.0.dev20250618
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.9.0.dev20250703
+torchvision==0.24.0.dev20250703
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250703-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
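The pins move torch and torchvision to the 2.9.0.dev20250703 / 0.24.0.dev20250703 nightlies, and the torch_xla wheels to the matching 20250703 dev builds (note the torch_xla wheel versioning stays on the 2.8.0 series). A quick post-install sanity check, assuming a TPU VM where these requirements apply:

import torch
import torch_xla

# Expected strings come from the pins above; other nightlies will differ.
print(torch.__version__)      # 2.9.0.dev20250703
print(torch_xla.__version__)  # 2.8.0.dev20250703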

tests/models/language/pooling/mteb_utils.py

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
 MTEB_RERANK_LANGS = ["en"]
-MTEB_RERANK_TOL = 1e-3
+MTEB_RERANK_TOL = 2e-3
 
 
 class VllmMtebEncoder(mteb.Encoder):
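MTEB_RERANK_TOL is loosened from 1e-3 to 2e-3, presumably to absorb small score drift across the dependency bumps in this merge. A sketch of how such a tolerance is typically applied — illustrative names; the real assertion lives in the rerank test helpers:

def assert_score_close(main_score: float, baseline: float,
                       tol: float = 2e-3) -> None:
    # Compare a measured MTEB main score against the model's recorded
    # baseline, allowing MTEB_RERANK_TOL of absolute drift.
    assert abs(main_score - baseline) < tol, (
        f"score {main_score} deviates from baseline {baseline} "
        f"by more than {tol}")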

tests/models/language/pooling/test_baai.py

Lines changed: 0 additions & 1 deletion

@@ -68,7 +68,6 @@
                     enable_test=False),
     RerankModelInfo("BAAI/bge-reranker-v2-m3",
                     architecture="XLMRobertaForSequenceClassification",
-                    dtype="float32",
                     enable_test=False)
 ]
 
tests/models/language/pooling/test_jina.py

Lines changed: 2 additions & 5 deletions

@@ -18,11 +18,8 @@
 ]
 
 RERANK_MODELS = [
-    RerankModelInfo(
-        "jinaai/jina-reranker-v2-base-multilingual",
-        architecture="XLMRobertaForSequenceClassification",
-        dtype="float32",
-    )
+    RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual",
+                    architecture="XLMRobertaForSequenceClassification")
 ]
 
 
tests/models/language/pooling/test_qwen3_reranker.py

Lines changed: 0 additions & 2 deletions

@@ -12,11 +12,9 @@
 RERANK_MODELS = [
     RerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
                     architecture="Qwen3ForSequenceClassification",
-                    dtype="float32",
                     enable_test=True),
     RerankModelInfo("Qwen/Qwen3-Reranker-4B",
                     architecture="Qwen3ForSequenceClassification",
-                    dtype="float32",
                     enable_test=False)
 ]
 
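This file and the two preceding ones (test_baai.py, test_jina.py) drop the explicit dtype="float32" override from their RerankModelInfo entries, so these models fall back to the dataclass default. A sketch of the relevant fields, with the default assumed to be "auto" — the real definition lives in the pooling test utilities:

from dataclasses import dataclass

@dataclass
class RerankModelInfo:
    name: str
    architecture: str = ""
    dtype: str = "auto"        # the entries above now rely on this default
    enable_test: bool = True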

tests/v1/tpu/test_basic.py

Lines changed: 1 addition & 0 deletions

@@ -67,6 +67,7 @@ def test_basic(
     assert "1024" in output or "0, 1" in output
 
 
+@pytest.mark.skip(reason="Temporarily disabled due to timeout")
 @pytest.mark.skipif(not current_platform.is_tpu(),
                     reason="This is a basic test for TPU only")
 @pytest.mark.parametrize("max_tokens", [8])
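pytest.mark.skip is unconditional, so the decorated test is collected but never run, regardless of the skipif and parametrize markers stacked underneath; removing the one marker re-enables it. A minimal illustration:

import pytest

@pytest.mark.skip(reason="Temporarily disabled due to timeout")
def test_example():
    ...  # reported as skipped; the body never executes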

vllm/lora/ops/triton_ops/lora_expand_op.py

Lines changed: 2 additions & 0 deletions

@@ -13,6 +13,7 @@
 
 from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel
 from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr
+from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 
 
@@ -283,6 +284,7 @@ def _lora_expand_fake(
     op_func=_lora_expand,
     mutates_args=["output_tensor"],
     fake_impl=_lora_expand_fake,
+    dispatch_key=current_platform.dispatch_key,
 )
 lora_expand = torch.ops.vllm.lora_expand
 
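Without an explicit dispatch_key, direct_register_custom_op appears to register the op under a default (CUDA) key; passing current_platform.dispatch_key registers it under whatever platform vLLM is running on, so torch.ops.vllm.lora_expand dispatches correctly off-CUDA as well. A toy registration following the same pattern — the no-op op below is illustrative only, not part of vLLM:

import torch

from vllm.platforms import current_platform
from vllm.utils import direct_register_custom_op

def _noop(x: torch.Tensor) -> None:
    pass

def _noop_fake(x: torch.Tensor) -> None:
    pass

direct_register_custom_op(
    op_name="my_noop",
    op_func=_noop,
    mutates_args=[],
    fake_impl=_noop_fake,
    dispatch_key=current_platform.dispatch_key,  # e.g. "CUDA", "CPU", "XPU"
)
# torch.ops.vllm.my_noop(x)  # x must live on the active platform's device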
