Commit 533d77f

Author: Varun Sundar Rabindranath (committed)

fixes

Signed-off-by: Varun Sundar Rabindranath <[email protected]>

1 parent d5f49e8 commit 533d77f

7 files changed (+22 additions, -17 deletions)


vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 4 additions & 2 deletions
@@ -124,11 +124,13 @@ def workspace_shapes(
         if self.allow_deep_gemm:
             assert self.batched_deep_gemm_experts is not None
             return self.batched_deep_gemm_experts.workspace_shapes(
-                a, aq, M, N, K, topk, global_num_experts, local_num_experts)
+                a, aq, M, N, K, topk, global_num_experts, local_num_experts,
+                expert_tokens_metadata)
         else:
             assert self.batched_triton_experts is not None
             return self.batched_triton_experts.workspace_shapes(
-                a, aq, M, N, K, topk, global_num_experts, local_num_experts)
+                a, aq, M, N, K, topk, global_num_experts, local_num_experts,
+                expert_tokens_metadata)

     def apply(self, output: torch.Tensor, hidden_states: torch.Tensor,
               w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor,

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 0 additions & 4 deletions
@@ -141,8 +141,6 @@ def apply(

         a1q = hidden_states
         _, N, K = w1.size()
-        M, _ = output.size()
-        num_topk = topk_ids.size(1)

         local_num_experts = w1.size(0)
         if global_num_experts == -1:
@@ -155,7 +153,6 @@ def apply(
             local_num_experts=local_num_experts,
             alignment=deep_gemm_block_shape()[0],
             expert_tokens_meta=expert_tokens_meta)
-        assert M_sum >= M * num_topk

         a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn),
                                  (M_sum, K))
@@ -189,7 +186,6 @@ def apply(
         m_grouped_fp8_gemm_nt_contiguous((a2q, a2q_scale), (w2, w2_scale),
                                          mm2_out, expert_ids)

-        # TODO (varun) : We could probably reshape mm2_out and pass as output
         if apply_router_weight_on_input:
             topk_weights = torch.ones_like(topk_weights)
vllm/model_executor/layers/fused_moe/deep_gemm_utils.py

Lines changed: 3 additions & 2 deletions
@@ -37,6 +37,7 @@ def round_up_128(x: int) -> int:
 def compute_aligned_M(M: int, num_topk: int, local_num_experts: int,
                       alignment: int,
                       expert_tokens_meta: Optional[mk.ExpertTokensMetadata]):
+
     if ((expert_tokens_meta is not None)
             and (expert_tokens_meta.expert_num_tokens_cpu is not None)):
         return expert_num_tokens_round_up_and_sum(
@@ -336,7 +337,7 @@ def ep_gather(
 def deepgemm_moe_permute(aq: torch.Tensor,
                          aq_scale: torch.Tensor,
                          topk_ids: torch.Tensor,
-                         local_num_experts: torch.Tensor,
+                         local_num_experts: int,
                          expert_map: Optional[torch.Tensor],
                          expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
                          aq_out: Optional[torch.Tensor] = None):
@@ -378,7 +379,7 @@ def deepgemm_moe_permute(aq: torch.Tensor,

     expert_num_tokens = None
     if expert_tokens_meta is not None:
-        expert_num_tokens = expert_tokens_meta.expert_num_tokens_gpu
+        expert_num_tokens = expert_tokens_meta.expert_num_tokens
     else:
         expert_num_tokens = count_expert_num_tokens(topk_ids,
                                                     local_num_experts,
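For context on the first hunk above: compute_aligned_M can size the permuted activation buffer from the exact per-expert token counts whenever expert_tokens_meta carries them on the CPU, instead of falling back to a worst-case estimate. The helper below is a minimal sketch of that rounding, assuming expert_num_tokens_round_up_and_sum rounds each expert's count up to `alignment` and sums the results; it is illustrative only, not the code from this commit.

import torch

def _round_up(x: int, alignment: int) -> int:
    # Smallest multiple of `alignment` that is >= x.
    return ((x + alignment - 1) // alignment) * alignment

def aligned_token_sum(expert_num_tokens_cpu: torch.Tensor,
                      alignment: int) -> int:
    # Round every expert's token count up to the block alignment
    # (e.g. the DeepGEMM block size) and sum, giving the M needed for
    # an expert-contiguous, block-aligned activation layout.
    return sum(_round_up(int(n), alignment)
               for n in expert_num_tokens_cpu.tolist())

# Example: counts [3, 0, 130] with alignment 128 -> 128 + 0 + 256 = 384.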

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 1 addition & 0 deletions
@@ -891,6 +891,7 @@ def workspace_shapes(
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
+        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         assert a.dim() == 2
         num_dp = self.num_dispatchers
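With the signature above, workspace_shapes implementations now receive the optional ExpertTokensMetadata that the callers updated elsewhere in this commit pass through. The sketch below shows one way an implementation might use it; the free-function form and the shape math are assumptions for illustration, not the actual vLLM code.

import torch

def workspace_shapes_sketch(
        a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int,
        topk: int, global_num_experts: int, local_num_experts: int,
        expert_tokens_meta) -> tuple[tuple[int, ...], tuple[int, ...],
                                     tuple[int, ...], torch.dtype]:
    if (expert_tokens_meta is not None
            and expert_tokens_meta.expert_num_tokens_cpu is not None):
        # Exact sizing: total tokens actually routed to this rank's experts.
        M_sum = int(expert_tokens_meta.expert_num_tokens_cpu.sum().item())
    else:
        # Worst case: every token contributes topk entries.
        M_sum = M * topk
    workspace13 = (M_sum, max(N, K))      # illustrative shapes only
    workspace2 = (M_sum, max(N // 2, K))  # illustrative shapes only
    fused_out = (M, topk, K)
    return workspace13, workspace2, fused_out, a.dtype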

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 9 additions & 6 deletions
@@ -480,7 +480,8 @@ def _do_fused_experts(self, fused_out: Optional[torch.Tensor],

         (workspace13_shape, workspace2_shape, fused_out_shape,
          workspace_dtype) = self.fused_experts.workspace_shapes(
-             a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts)
+             a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts,
+             expert_tokens_meta)

         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
@@ -573,10 +574,9 @@ def _maybe_chunk_fused_experts(
         assert num_chunks > 1

         # Construct the entire output that can then be processed in chunks.
-        (_, _, fused_out_shape,
-         _) = self.fused_experts.workspace_shapes(a1, a1q, M, N, K, top_k,
-                                                  global_num_experts,
-                                                  local_num_experts)
+        (_, _, fused_out_shape, _) = self.fused_experts.workspace_shapes(
+            a1, a1q, M, N, K, top_k, global_num_experts, local_num_experts,
+            expert_tokens_meta)
         fused_out = torch.empty(fused_out_shape,
                                 device=a1q.device,
                                 dtype=a1.dtype)
@@ -614,8 +614,11 @@ def slice_expert_tokens_metadata(
         need_expert_num_tokens_cpu = (
             full_expert_tokens_meta.expert_num_tokens_cpu is not None)
         if need_expert_num_tokens_cpu:
+            # This is blocking as some implementations need the count
+            # on the CPU to determine appropriate input/out fused-moe
+            # buffers
             c_expert_num_tokens_cpu = c_expert_num_tokens.to(
-                "cpu", non_blocking=True)
+                "cpu", non_blocking=False)

         return ExpertTokensMetadata(
             expert_num_tokens=c_expert_num_tokens,
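On the last hunk above: a device-to-host copy issued with non_blocking=True may return before the data has actually landed on the CPU, so it requires an explicit synchronization before the values are read; code that immediately uses the per-expert counts to size fused-MoE input/output buffers would otherwise risk reading stale values. A minimal sketch of the blocking pattern the change adopts (the surrounding usage is illustrative, not from this commit):

import torch

def expert_counts_to_cpu(expert_num_tokens: torch.Tensor) -> torch.Tensor:
    # Blocking copy: only returns once the device-to-host transfer has
    # completed, so the counts can be read on the CPU right away.
    return expert_num_tokens.to("cpu", non_blocking=False)

# Illustrative usage: size a per-expert workspace from the synced counts.
# counts_cpu = expert_counts_to_cpu(expert_num_tokens_gpu)
# max_m = int(counts_cpu.max().item())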

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ def prepare(
         # topk_indices_dtype() int32
         #
         if expert_map is not None:
-            logger.warn_once(
+            logger.warning_once(
                 "The PPLX backend does not support expert mapping. "
                 "The provided `expert_map` will be ignored.")
             expert_map = None  #noqa: F841

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 4 additions & 2 deletions
@@ -111,11 +111,13 @@ def workspace_shapes(
                 or is_blackwell_deep_gemm_used()):
             assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.workspace_shapes(
-                a, aq, M, N, K, topk, global_num_experts, local_num_experts)
+                a, aq, M, N, K, topk, global_num_experts, local_num_experts,
+                expert_tokens_meta)
         else:
             return self.triton_expert.workspace_shapes(a, aq, M, N, K, topk,
                                                        global_num_experts,
-                                                       local_num_experts)
+                                                       local_num_experts,
+                                                       expert_tokens_meta)

     def apply(
         self,
