Commit 434e8bf

refactor: use platform-agnostic device control for DP engine core
Refactor the DP engine core to use the platform-specific `device_control_env_var` attribute instead of hardcoding `CUDA_VISIBLE_DEVICES`. This change improves platform compatibility and code maintainability. The update includes:

1. Moving the `device_id_to_physical_device_id` function to a shared utils file
2. Updating imports in the CUDA and ROCm platform files
3. Replacing the CUDA-specific environment variable setting with a platform-agnostic approach in the `DPEngineCoreProc` class

This refactoring lets the codebase support additional platforms more cleanly.

Signed-off-by: Jade Zheng <[email protected]>
1 parent 93a126f commit 434e8bf
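
For context, a minimal sketch of the pattern this commit relies on: each platform class declares which environment variable controls device visibility, so generic code can read `current_platform.device_control_env_var` instead of hardcoding `CUDA_VISIBLE_DEVICES`. The class bodies and the non-CUDA platform below are illustrative assumptions, not the exact vLLM definitions.

# Illustrative sketch (not the exact vLLM source): platforms advertise
# their device-visibility variable via a class attribute.
class Platform:
    # CUDA-style default; subclasses override as needed.
    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

class CudaPlatform(Platform):
    device_control_env_var = "CUDA_VISIBLE_DEVICES"

class AscendPlatform(Platform):  # hypothetical non-CUDA platform
    device_control_env_var = "ASCEND_RT_VISIBLE_DEVICES"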

4 files changed: +35, -39 lines

vllm/platforms/cuda.py (+5, -22)
@@ -18,6 +18,7 @@
 from vllm.utils import import_pynvml
 
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+from .utils import device_id_to_physical_device_id
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -37,24 +38,6 @@
     torch.backends.cuda.enable_cudnn_sdp(False)
 
 
-def device_id_to_physical_device_id(device_id: int) -> int:
-    if "CUDA_VISIBLE_DEVICES" in os.environ:
-        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
-        if device_ids == [""]:
-            msg = (
-                "CUDA_VISIBLE_DEVICES is set to empty string, which means"
-                " GPU support is disabled. If you are using ray, please unset"
-                " the environment variable `CUDA_VISIBLE_DEVICES` inside the"
-                " worker/actor. "
-                "Check https://github.com/vllm-project/vllm/issues/8402 for"
-                " more information.")
-            raise RuntimeError(msg)
-        physical_device_id = device_ids[device_id]
-        return int(physical_device_id)
-    else:
-        return device_id
-
-
 def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
 
     @wraps(fn)
@@ -328,7 +311,7 @@ def get_device_capability(cls,
                               device_id: int = 0
                               ) -> Optional[DeviceCapability]:
         try:
-            physical_device_id = device_id_to_physical_device_id(device_id)
+            physical_device_id = device_id_to_physical_device_id(device_id, cls.device_control_env_var)
             handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
             major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
             return DeviceCapability(major=major, minor=minor)
@@ -350,20 +333,20 @@ def has_device_capability(
     @classmethod
     @with_nvml_context
     def get_device_name(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = device_id_to_physical_device_id(device_id, cls.device_control_env_var)
         return cls._get_physical_device_name(physical_device_id)
 
     @classmethod
     @with_nvml_context
     def get_device_uuid(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = device_id_to_physical_device_id(device_id, cls.device_control_env_var)
         handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
         return pynvml.nvmlDeviceGetUUID(handle)
 
     @classmethod
     @with_nvml_context
     def get_device_total_memory(cls, device_id: int = 0) -> int:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = device_id_to_physical_device_id(device_id, cls.device_control_env_var)
         handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
         return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
 
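
The effect of the new second argument: when the process is restricted via `CUDA_VISIBLE_DEVICES`, logical device indices are translated back to physical indices before the NVML query. A small sketch of the mapping, assuming a machine where only devices 2 and 3 are exposed:

# Sketch: with CUDA_VISIBLE_DEVICES="2,3", logical device 0 is physical
# device 2, so pynvml.nvmlDeviceGetHandleByIndex receives index 2.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

from vllm.platforms.utils import device_id_to_physical_device_id

assert device_id_to_physical_device_id(0, "CUDA_VISIBLE_DEVICES") == 2
assert device_id_to_physical_device_id(1, "CUDA_VISIBLE_DEVICES") == 3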

vllm/platforms/rocm.py (+2, -10)
@@ -10,6 +10,7 @@
 from vllm.logger import init_logger
 
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+from .utils import device_id_to_physical_device_id
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -89,15 +90,6 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
-def device_id_to_physical_device_id(device_id: int) -> int:
-    if "CUDA_VISIBLE_DEVICES" in os.environ:
-        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
-        physical_device_id = device_ids[device_id]
-        return int(physical_device_id)
-    else:
-        return device_id
-
-
 def on_mi250_mi300() -> bool:
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
     return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942"])
@@ -223,7 +215,7 @@ def is_fully_connected(physical_device_ids: List[int]) -> bool:
     @with_amdsmi_context
     @lru_cache(maxsize=8)
     def get_device_name(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = device_id_to_physical_device_id(device_id, cls.device_control_env_var)
         handle = amdsmi_get_processor_handles()[physical_device_id]
         return amdsmi_get_gpu_asic_info(handle)["market_name"]
 
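
One behavioral note: the ROCm-local copy deleted above had no guard for an empty device list, so by switching to the shared helper ROCm also gains the empty-string check shown in `vllm/platforms/utils.py` below. A quick sketch, assuming ROCm's `device_control_env_var` is `CUDA_VISIBLE_DEVICES` as the deleted code suggests:

# Sketch: the shared helper raises a descriptive RuntimeError where the
# old ROCm-local copy would have attempted int("") and failed with a
# less helpful ValueError.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

from vllm.platforms.utils import device_id_to_physical_device_id

try:
    device_id_to_physical_device_id(0, "CUDA_VISIBLE_DEVICES")
except RuntimeError as exc:
    print(exc)  # points at issue #8402 and how to unset the variable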

vllm/platforms/utils.py (+21, new file)
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+
+def device_id_to_physical_device_id(device_id: int,
+                                    device_control_env_var: str) -> int:
+    if device_control_env_var in os.environ:
+        device_ids = os.environ[device_control_env_var].split(",")
+        if device_ids == [""]:
+            msg = (
+                f"{device_control_env_var} is set to empty string, which means"
+                " current platform support is disabled. If you are using ray,"
+                f" please unset the environment variable `{device_control_env_var}`"
+                " inside the worker/actor. "
+                "Check https://github.com/vllm-project/vllm/issues/8402 for"
+                " more information.")
+            raise RuntimeError(msg)
+        physical_device_id = device_ids[device_id]
+        return int(physical_device_id)
+    else:
+        return device_id
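
Because the helper now takes the variable name as a parameter, the same function serves any platform. A usage sketch with a made-up control variable ("NPU_VISIBLE_DEVICES" is purely illustrative):

# Usage sketch; "NPU_VISIBLE_DEVICES" is a hypothetical variable name.
import os

from vllm.platforms.utils import device_id_to_physical_device_id

os.environ["NPU_VISIBLE_DEVICES"] = "4,5,6,7"
assert device_id_to_physical_device_id(2, "NPU_VISIBLE_DEVICES") == 6

# When the variable is unset, logical and physical ids coincide.
del os.environ["NPU_VISIBLE_DEVICES"]
assert device_id_to_physical_device_id(2, "NPU_VISIBLE_DEVICES") == 2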

vllm/v1/engine/core.py (+7, -7)
@@ -592,13 +592,13 @@ def __init__(
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
         from vllm.platforms import current_platform
-        if current_platform.is_cuda_alike():
-            from vllm.platforms.cuda import device_id_to_physical_device_id
-            tp_size = vllm_config.parallel_config.tensor_parallel_size
-            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
-                str(device_id_to_physical_device_id(i))
-                for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) *
-                               tp_size))
+        from vllm.platforms.utils import device_id_to_physical_device_id
+        device_control_env_var = current_platform.device_control_env_var
+        tp_size = vllm_config.parallel_config.tensor_parallel_size
+        os.environ[device_control_env_var] = ",".join(
+            str(device_id_to_physical_device_id(i, device_control_env_var))
+            for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) *
+                           tp_size))
 
         self.local_dp_rank = local_dp_rank
         self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
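
The generator expression above assigns each data-parallel engine a contiguous block of `tp_size` logical devices, which the helper then remaps to physical ids before re-exporting the control variable. A worked sketch of the arithmetic (values are illustrative):

# Sketch: with tensor_parallel_size=2, local DP rank 1 claims logical
# devices 2 and 3, since range(1 * 2, (1 + 1) * 2) == range(2, 4).
tp_size = 2
local_dp_rank = 1
logical_ids = list(range(local_dp_rank * tp_size, (local_dp_rank + 1) * tp_size))
assert logical_ids == [2, 3]

# If the parent process was itself restricted, e.g. with the control
# variable set to "4,5,6,7", the helper remaps logical 2 and 3 to
# physical "6,7" before the variable is re-exported for this engine.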
