
Commit 05bdcbe

fakeYan and yiz-liu authored
support aclgraph (#426)
### What this PR does / why we need it?

This PR enables vllm-ascend to use the piecewise_graph feature provided by the v1 engine.

1. Register unified_ascend_attention_with_output so that piecewise_graph can split the graph at attention ops.
2. Support NPUGraph to accelerate kernel launch.

### Does this PR introduce _any_ user-facing change?

NPUGraph is now enabled by default; users can disable it by setting enforce_eager. This places corresponding requirements on the torch_npu and CANN versions, which must support graph capture.

### How was this patch tested?

The feature is on by default, so it is exercised by the existing CI tests together with the newly added tests/compile suite.

---------

Signed-off-by: Bug Hunter Yan <[email protected]>
Signed-off-by: Yizhou Liu <[email protected]>
Co-authored-by: Yizhou Liu <[email protected]>
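For reference, a minimal sketch of the user-facing toggle described above, using the standard vLLM offline-inference API (the model name and prompt are illustrative only, not part of this change):

```python
from vllm import LLM, SamplingParams

# enforce_eager=True opts out of graph capture (the NPUGraph / aclgraph
# path this PR enables by default) and falls back to eager kernel
# launches; leaving it at the default keeps the graph-mode acceleration.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enforce_eager=True)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```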
1 parent 5c6d05a commit 05bdcbe

15 files changed, +447 −112 lines changed

.github/workflows/vllm_ascend_test.yaml (+8 −6)

@@ -115,24 +115,26 @@ jobs:
       - name: Install vllm-project/vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -e .
+          pip install -v --no-build-isolation -e .

-      - name: Run vllm-project/vllm-ascend test on V0 engine
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
         env:
-          VLLM_USE_V1: 0
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
           if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
             pytest -sv tests/singlecard/test_offline_inference.py
             pytest -sv tests/ops
+            pytest -sv tests/compile
           else
             pytest -sv tests/multicard/test_offline_inference_distributed.py
             pytest -sv tests/ops
+            pytest -sv tests/compile
           fi

-      - name: Run vllm-project/vllm-ascend test for V1 Engine
+      - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          VLLM_USE_V1: 0
         run: |
           if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
             pytest -sv tests/singlecard/test_offline_inference.py

csrc/ops.h (+18 −1)

@@ -21,6 +21,7 @@

 #include <vector>
 #include "kernels/types.h"
+#include "torch_npu/csrc/aten/common/from_blob.h"

 namespace vllm_ascend {
 extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst,
@@ -29,4 +30,20 @@ namespace vllm_ascend {
                                   const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
                                   const int headSize, const int64_t numTokens, const uint32_t loopCnt,
                                   uint32_t aivNum);
-}
+
+    torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
+        if (!tensor.is_privateuseone()) {
+            throw std::runtime_error("Tensor must be on NPU device");
+        }
+        // Get the raw data pointer
+        void* data_ptr = tensor.data_ptr();
+        // Get tensor sizes and strides
+        std::vector<int64_t> sizes = tensor.sizes().vec();
+        std::vector<int64_t> strides = tensor.strides().vec();
+        // Get tensor options (dtype, device)
+        auto options = tensor.options();
+        // Create a new tensor from the raw data pointer
+        auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options);
+        return new_tensor;
+    }
+}

csrc/torch_binding.cpp (+2 −0)

@@ -103,6 +103,8 @@ std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::T
 TORCH_LIBRARY_EXPAND(_C, ops)
 {
     // vLLM-Ascend custom ops
+    ops.def("weak_ref_tensor(Tensor input) -> Tensor");
+    ops.impl("weak_ref_tensor", torch::kPrivateUse1, &vllm_ascend::weak_ref_tensor);

     // Rotary embedding
     // Apply GPT-NeoX style rotary embedding to query and key.
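A rough sketch of how the newly bound op could be exercised from Python, assuming the compiled vllm_ascend extension is importable and that the `TORCH_LIBRARY_EXPAND(_C, ops)` registration exposes it under `torch.ops._C` (both are assumptions about the build, not spelled out in this diff):

```python
import torch
import torch_npu    # noqa: F401  # registers the NPU ("PrivateUse1") backend
import vllm_ascend  # noqa: F401  # assumption: importing loads the compiled _C extension

# weak_ref_tensor wraps the source tensor's raw data pointer, sizes and
# strides in a new tensor via from_blob, so the result aliases the same
# NPU storage without owning it.
src = torch.randn(4, 8).npu()
alias = torch.ops._C.weak_ref_tensor(src)

assert alias.data_ptr() == src.data_ptr()
assert alias.shape == src.shape
```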

pyproject.toml (+2 −2)

@@ -11,8 +11,8 @@ requires = [
     "scipy",
     "setuptools>=64",
     "setuptools-scm>=8",
-    "torch_npu",
-    "torch >= 2.5.1",
+    "torch_npu==2.5.1rc1",
+    "torch>=2.5.1",
     "torchvision<0.21.0",
 ]
 build-backend = "setuptools.build_meta"

requirements-dev.txt (+1 −0)

@@ -1,4 +1,5 @@
 -r requirements-lint.txt
+-r requirements.txt
 modelscope
 pytest >= 6.0
 pytest-asyncio

requirements.txt (+3 −2)

@@ -3,11 +3,12 @@ cmake>=3.26
 decorator
 numpy<2.0.0
 packaging
+pip
 pybind11
 pyyaml
 scipy
 setuptools>=64
 setuptools-scm>=8
-torch_npu
-torch >= 2.5.1
+torch>=2.5.1
 torchvision<0.21.0
+wheel

tests/compile/__init__.py

Whitespace-only changes.

tests/compile/test_simple.py (new file, +118)

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: Apache-2.0
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""

import pytest
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                         set_current_vllm_config)
from vllm.utils import direct_register_custom_op

global_counter = 0

# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT")  # noqa


def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    global global_counter
    global_counter += 1
    print(f"{global_counter=}")
    out.copy_(q)
    out[0] += 1


def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    return


direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    dispatch_key="PrivateUse1",
    target_lib=silly_lib,
)


@support_torch_compile
class SillyModel(nn.Module):

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = "",
                 **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Overall effect:
        x += 1
        x[0] += 2
        global_counter += 2
        """
        x = x + 1
        x = x + 2
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x - 2
        x = x - 1
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x + 1
        return x


@pytest.mark.skipif(True, reason="requires unreleased components")
def test_simple_piecewise_compile():

    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_inductor=False,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_copy_inputs=True,
        cudagraph_capture_sizes=[1, 2],
    ))
    vllm_config.compilation_config.pass_config.enable_fusion = False
    with set_current_vllm_config(vllm_config):
        model = SillyModel(vllm_config=vllm_config, prefix="")

    inputs = torch.randn(100).npu()

    with compilation_counter.expect(
            num_graphs_seen=1,  # one graph for the model
            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
            num_cudagraph_caputured=
            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):

        model(inputs)

        model(torch.randn(2).npu())
        model(torch.randn(1).npu())

        input = torch.zeros(2).npu()
        global global_counter
        global_counter = 0
        output = model(input)
        assert global_counter == 2
        assert torch.allclose(output.cpu(), torch.tensor([3.0, 1.0]))


if __name__ == "__main__":
    test_simple_piecewise_compile()

tests/multicard/test_offline_inference_distributed.py (+1 −0)

@@ -47,6 +47,7 @@ def test_models_distributed(model: str,
             dtype=dtype,
             tensor_parallel_size=4,
             distributed_executor_backend=distributed_executor_backend,
+            enforce_eager=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
tests/singlecard/test_offline_inference.py (+1 −1)

@@ -50,7 +50,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
     with VllmRunner(model,
                     max_model_len=8192,
                     dtype=dtype,
-                    enforce_eager=False,
+                    enforce_eager=True,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
