
Commit 5be1d8c

[0.9.1][Perf]Remove NZ of kv_b_proj in Deepseek MLA. (#1872)
This PR removes the NZ transformation of the kv_b_proj weights. We found that this matmul weight is not quantized and therefore falls back to ND computation at runtime (float bmm in the NZ layout is not yet supported in the torchair graph), which introduces two redundant transData operations that convert the weight from NZ back to ND. Removing these two operations saves roughly 40 us per layer. Signed-off-by: whx-sjtu <[email protected]>
1 parent 6ca0eed commit 5be1d8c

File tree

1 file changed (+1, -8)

vllm_ascend/attention/mla_v1.py

Lines changed: 1 addition & 8 deletions
@@ -22,8 +22,7 @@
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, npu_prefetch,
-                               npu_stream_switch, npu_wait_tensor)
+from vllm_ascend.utils import npu_prefetch, npu_stream_switch, npu_wait_tensor
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
@@ -711,12 +710,6 @@ def get_and_maybe_dequant_weights(layer: LinearBase):
         self.W_UV = W_UV.transpose(0, 1).contiguous()
         # Convert from (L, N, P) to (N, P, L)
         self.W_UK_T = W_UK.permute(1, 2, 0).contiguous()
-        if get_ascend_config().enable_weight_nz_layout:
-            # cast quantized weight tensors in NZ layout for higher inference speed
-            self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data,
-                                                       ACL_FORMAT_FRACTAL_NZ)
-            self.W_UK_T.data = torch_npu.npu_format_cast(
-                self.W_UK_T.data, ACL_FORMAT_FRACTAL_NZ)
 
     def _compute_prefill_context(
         self,
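For context, here is a minimal sketch of the layout cast this commit drops, assembled from the deleted lines above. ACL_FORMAT_FRACTAL_NZ and npu_format_cast come straight from the diff; the standalone stand-in tensors, their shapes, and the presence of an Ascend NPU device are illustrative assumptions, not the actual model code:

import torch
import torch_npu
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ

# Stand-ins for the decoupled MLA weights W_UV and W_UK_T; shapes are
# illustrative only (the real tensors are built in mla_v1.py).
w_uv = torch.randn(16, 128, 512, dtype=torch.float16).npu()
w_uk_t = torch.randn(16, 512, 128, dtype=torch.float16).npu()

# The deleted code cast both float weights to the NZ (fractal) layout up front:
w_uv_nz = torch_npu.npu_format_cast(w_uv, ACL_FORMAT_FRACTAL_NZ)
w_uk_t_nz = torch_npu.npu_format_cast(w_uk_t, ACL_FORMAT_FRACTAL_NZ)

# Because kv_b_proj is not quantized and the torchair graph has no float bmm
# that consumes NZ, the runtime inserts a transData op per weight to convert
# it back to ND before the bmm. The two casts above are therefore pure
# overhead (about 40 us per layer), so the commit leaves these weights in the
# default ND layout instead.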
