[Attention] Clean up iRoPE in V1 #21188

Merged
Changes from 3 commits
7 changes: 7 additions & 0 deletions vllm/attention/layer.py

@@ -109,6 +109,13 @@ def __init__(
         self.num_kv_heads = num_kv_heads
         self.sliding_window = sliding_window
 
+        # For V1 we have backend-agnostic iRoPE (local chunked attention),
+        # so we have to store the flag on the layer so the GPU model runner
+        # can set the KV cache spec appropriately (and pop it so it doesn't
+        # get passed to the backends).
+        if envs.VLLM_USE_V1:
+            self.use_irope = extra_impl_args.pop("use_irope", False)
+
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
         if quant_method is not None and not isinstance(
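For context, here is a minimal sketch of the pattern this change introduces, using a simplified stand-in class rather than vLLM's actual Attention layer (the class and attribute names below are illustrative): under V1 the layer consumes `use_irope` from the extra impl args so backend constructors never receive it, while still exposing it as `layer.use_irope` for the model runner to read when it builds the KV cache spec.

```python
# Minimal sketch, assuming a simplified stand-in for vllm.attention.layer.Attention.
# "AttentionLayerSketch" and "backend_kwargs" are illustrative names, not vLLM APIs.
class AttentionLayerSketch:
    def __init__(self, use_v1: bool, **extra_impl_args):
        if use_v1:
            # Consume the flag so it is never forwarded to a backend impl;
            # the GPU model runner later reads layer.use_irope when choosing
            # the KV cache spec for this layer.
            self.use_irope = extra_impl_args.pop("use_irope", False)
        # Whatever remains is what the backend constructor actually receives.
        self.backend_kwargs = extra_impl_args


layer = AttentionLayerSketch(use_v1=True, use_irope=True)
assert layer.use_irope is True
assert "use_irope" not in layer.backend_kwargs  # backends stay iRoPE-agnostic
```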
5 changes: 0 additions & 5 deletions vllm/v1/attention/backends/cpu_attn.py

@@ -447,7 +447,6 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
             raise NotImplementedError("KV sharing is not supported in V0.")
@@ -457,10 +456,6 @@
         if logits_soft_cap is not None:
             logger.warning_once("Torch SPDA does not support logits soft cap. "
                                 "Outputs may be slightly off.")
-        if use_irope:
-            logger.warning_once(
-                "Using irope in Torch SPDA is not supported yet, it will fall"
-                " back to global attention for long context.")
         self.paged_attn_impl = _get_paged_attn_impl()
         self.num_heads = num_heads
         self.head_size = head_size
2 changes: 0 additions & 2 deletions vllm/v1/attention/backends/flash_attn.py

@@ -353,7 +353,6 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -385,7 +384,6 @@
                 "encoder/decoder cross-attention "
                 "are not implemented for "
                 "FlashAttentionImpl")
-        self.use_irope = use_irope
         self.vllm_flash_attn_version = get_flash_attn_version()
         if is_quantized_kv_cache(self.kv_cache_dtype) \
                 and not flash_attn_supports_fp8():
2 changes: 0 additions & 2 deletions vllm/v1/attention/backends/flashinfer.py

@@ -494,7 +494,6 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
-        use_irope: bool = False,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -510,7 +509,6 @@
         self.kv_cache_dtype = kv_cache_dtype
         self.logits_soft_cap = logits_soft_cap
         self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
-        self.use_irope = use_irope
 
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
5 changes: 0 additions & 5 deletions vllm/v1/attention/backends/pallas.py

@@ -136,12 +136,7 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
-        use_irope: bool = False,
     ) -> None:
-        if use_irope:
-            logger.warning_once(
-                "Using irope in Pallas is not supported yet, it will fall back "
-                "to global attention for long context.")
         if blocksparse_params is not None:
             raise ValueError("Paged attention Pallas kernel does "
                              "not support block-sparse attention.")
2 changes: 0 additions & 2 deletions vllm/v1/attention/backends/rocm_aiter_fa.py

@@ -338,7 +338,6 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
-        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -371,7 +370,6 @@
                 "encoder/decoder cross-attention "
                 "are not implemented for "
                 "FlashAttentionImpl")
-        self.use_irope = use_irope
         if is_quantized_kv_cache(self.kv_cache_dtype):
             raise NotImplementedError(
                 "AiterFlashAttention does not support fp8 kv-cache on this "
6 changes: 0 additions & 6 deletions vllm/v1/attention/backends/triton_attn.py

@@ -72,9 +72,6 @@ def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
             vllm_config.parallel_config)
         self.headdim = model_config.get_head_size()
 
-        self.attention_chunk_size = getattr(vllm_config.scheduler_config,
-                                            'attention_chunk_size', None)
-
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
     ) -> TritonAttentionMetadata:
@@ -209,7 +206,6 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
-        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -232,8 +228,6 @@ def __init__(
         self.logits_soft_cap = logits_soft_cap
         self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
 
-        self.use_irope = use_irope
-
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
         TritonAttentionBackend.validate_head_size(head_size)
6 changes: 3 additions & 3 deletions vllm/v1/worker/gpu_model_runner.py

@@ -2628,7 +2628,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             # TODO: Support other attention modules, e.g., cross-attention
             if attn_module.attn_type == AttentionType.DECODER:
                 use_local_attention = (self.attention_chunk_size is not None
-                                       and attn_module.impl.use_irope)
+                                       and attn_module.use_irope)
                 if attn_module.sliding_window is not None:
                     kv_cache_spec[layer_name] = SlidingWindowSpec(
                         block_size=block_size,
@@ -2638,13 +2638,13 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                         sliding_window=attn_module.sliding_window,
                         use_mla=use_mla)
                 elif use_local_attention:
-                    kv_cache_spec[layer_name] = (ChunkedLocalAttentionSpec(
+                    kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec(
                         block_size=block_size,
                         num_kv_heads=attn_module.num_kv_heads,
                         head_size=attn_module.head_size,
                         dtype=self.kv_cache_dtype,
                         attention_chunk_size=self.attention_chunk_size,
-                        use_mla=use_mla))
+                        use_mla=use_mla)
                 else:
                     kv_cache_spec[layer_name] = FullAttentionSpec(
                         block_size=block_size,
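Reduced to its shape, the spec selection above works as in the sketch below; the dataclass and the returned strings are simplified stand-ins, not vLLM's real KVCacheSpec classes. A decoder layer gets a SlidingWindowSpec if it declares a sliding window, a ChunkedLocalAttentionSpec if the model config has an attention chunk size and the layer is flagged `use_irope`, and a FullAttentionSpec otherwise.

```python
# Simplified sketch of the KV cache spec selection in get_kv_cache_spec().
# LayerInfo and choose_spec are illustrative stand-ins, not vLLM APIs.
from dataclasses import dataclass
from typing import Optional


@dataclass
class LayerInfo:
    sliding_window: Optional[int]
    use_irope: bool


def choose_spec(layer: LayerInfo, attention_chunk_size: Optional[int]) -> str:
    # Branch order mirrors the diff: sliding window first, then local chunked
    # attention (iRoPE), then full attention as the fallback.
    if layer.sliding_window is not None:
        return f"SlidingWindowSpec(sliding_window={layer.sliding_window})"
    if attention_chunk_size is not None and layer.use_irope:
        return f"ChunkedLocalAttentionSpec(chunk={attention_chunk_size})"
    return "FullAttentionSpec()"


print(choose_spec(LayerInfo(sliding_window=None, use_irope=True), 8192))
# -> ChunkedLocalAttentionSpec(chunk=8192)
```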
4 changes: 4 additions & 0 deletions vllm/v1/worker/tpu_model_runner.py

@@ -521,6 +521,10 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                 continue
 
             if attn_module.attn_type == AttentionType.DECODER:
+                if attn_module.use_irope:
+                    logger.warning_once(
+                        "Using irope in Pallas is not supported yet, it "
+                        "will fall back to global attention for long context.")
                 if attn_module.sliding_window is not None:
                     kv_cache_spec[layer_name] = SlidingWindowSpec(
                         block_size=block_size,