
Commit 5be1d8c

[0.9.1][Perf]Remove NZ of kv_b_proj in Deepseek MLA. (#1872)
This PR removes the NZ transformation of the kv_b_proj weights. We found that this matmul weight is not quantized and therefore falls back to ND computation at runtime (float bmm in the NZ layout is not yet supported in the torchair graph), which introduces two redundant transData operations that convert the weight from NZ back to ND. Removing these two operations saves roughly 40 us per layer. Signed-off-by: whx-sjtu <[email protected]>
1 parent 6ca0eed commit 5be1d8c

File tree

1 file changed (+1, -8)

vllm_ascend/attention/mla_v1.py

Lines changed: 1 addition & 8 deletions
@@ -22,8 +22,7 @@
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, npu_prefetch,
-                               npu_stream_switch, npu_wait_tensor)
+from vllm_ascend.utils import npu_prefetch, npu_stream_switch, npu_wait_tensor
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
@@ -711,12 +710,6 @@ def get_and_maybe_dequant_weights(layer: LinearBase):
         self.W_UV = W_UV.transpose(0, 1).contiguous()
         # Convert from (L, N, P) to (N, P, L)
         self.W_UK_T = W_UK.permute(1, 2, 0).contiguous()
-        if get_ascend_config().enable_weight_nz_layout:
-            # cast quantized weight tensors in NZ layout for higher inference speed
-            self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data,
-                                                       ACL_FORMAT_FRACTAL_NZ)
-            self.W_UK_T.data = torch_npu.npu_format_cast(
-                self.W_UK_T.data, ACL_FORMAT_FRACTAL_NZ)
 
     def _compute_prefill_context(
         self,
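For context, here is a minimal sketch of the layout cast this commit drops, assembled from the deleted lines above. ACL_FORMAT_FRACTAL_NZ and npu_format_cast come straight from the diff; the standalone stand-in tensors, their shapes, and the presence of an Ascend NPU device are illustrative assumptions, not the actual model code:

import torch
import torch_npu
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ

# Stand-ins for the decoupled MLA weights W_UV and W_UK_T; shapes are
# illustrative only (the real tensors are built in mla_v1.py).
w_uv = torch.randn(16, 128, 512, dtype=torch.float16).npu()
w_uk_t = torch.randn(16, 512, 128, dtype=torch.float16).npu()

# The deleted code cast both float weights to the NZ (fractal) layout up front:
w_uv_nz = torch_npu.npu_format_cast(w_uv, ACL_FORMAT_FRACTAL_NZ)
w_uk_t_nz = torch_npu.npu_format_cast(w_uk_t, ACL_FORMAT_FRACTAL_NZ)

# Because kv_b_proj is not quantized and the torchair graph has no float bmm
# that consumes NZ, the runtime inserts a transData op per weight to convert
# it back to ND before the bmm. The two casts above are therefore pure
# overhead (about 40 us per layer), so the commit leaves these weights in the
# default ND layout instead.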
