
Commit e6bf33b

ruisearch42 authored and hj-mistral committed

Elastic Expert Parallel Initial Support (vllm-project#20775)

Signed-off-by: Rui Qiao <[email protected]>
Signed-off-by: Himanshu Jaju <[email protected]>

1 parent 949d580 commit e6bf33b

File tree

24 files changed: +1659 -68 lines changed
Lines changed: 57 additions & 0 deletions

#!/bin/bash

MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
HOST="localhost"
PORT=8006
NUM_PROMPTS=20
REQUEST_RATE=5

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL_NAME="$2"
            shift 2
            ;;
        --local-model)
            MODEL_NAME=$LOCAL_MODEL_PATH
            shift
            ;;
        --host)
            HOST="$2"
            shift 2
            ;;
        --port)
            PORT="$2"
            shift 2
            ;;
        --num-prompts)
            NUM_PROMPTS="$2"
            shift 2
            ;;
        --request-rate)
            REQUEST_RATE="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --model MODEL_NAME   Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
            echo "  --local-model        Use local model path (convenience option)"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use -h or --help for usage information"
            exit 1
            ;;
    esac
done

vllm bench serve \
    --model $MODEL_NAME \
    --host $HOST \
    --port $PORT \
    --num-prompts $NUM_PROMPTS \
    --request-rate $REQUEST_RATE
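As a quick smoke test against a running server, the script can be invoked with its default-matching flags. The file name below is assumed for illustration (this view does not show file paths); the flags are the ones parsed above:

# hypothetical file name bench_elastic_ep.sh
bash bench_elastic_ep.sh --host localhost --port 8006 --num-prompts 20 --request-rate 5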
Lines changed: 53 additions & 0 deletions

#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
import sys

import requests


def scale(host, port, new_dp_size):
    url = f"http://{host}:{port}/scale_elastic_ep"
    payload = {"new_data_parallel_size": new_dp_size}
    headers = {"Content-Type": "application/json"}

    print(f"Sending scale request to {url}")
    print(f"Payload: {json.dumps(payload, indent=2)}")

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=300)

        print(f"Status Code: {response.status_code}")
        print(f"Response: {response.text}")

        if response.status_code == 200:
            print("Scale up/down request successful!")
            return True
        else:
            print("Scale up/down request failed!")
            return False

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Test scale up/down functionality")
    parser.add_argument("--host", default="localhost", help="API server host")
    parser.add_argument("--port", type=int, default=8006, help="API server port")
    parser.add_argument(
        "--new-dp-size", type=int, default=2, help="New data parallel size"
    )

    args = parser.parse_args()

    success = scale(args.host, args.port, args.new_dp_size)
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
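Since the script only POSTs JSON to the server's /scale_elastic_ep endpoint, the same request can be issued directly with curl; host, port, and payload below match the script's defaults:

curl -X POST http://localhost:8006/scale_elastic_ep \
  -H "Content-Type: application/json" \
  -d '{"new_data_parallel_size": 2}'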
Lines changed: 72 additions & 0 deletions

#!/bin/bash

HOST="0.0.0.0"
PORT=8006
DATA_PARALLEL_SIZE=4
REDUNDANT_EXPERTS=0
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"

while [[ $# -gt 0 ]]; do
    case $1 in
        --dp)
            DATA_PARALLEL_SIZE="$2"
            shift 2
            ;;
        --re)
            REDUNDANT_EXPERTS="$2"
            shift 2
            ;;
        --host)
            HOST="$2"
            shift 2
            ;;
        --port)
            PORT="$2"
            shift 2
            ;;
        --model)
            MODEL_NAME="$2"
            shift 2
            ;;
        --local-model)
            MODEL_NAME=$LOCAL_MODEL_PATH
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --dp SIZE            Set data parallel size (default: 4)"
            echo "  --re SIZE            Set redundant experts (default: 0)"
            echo "  --host HOST          Set host address (default: 0.0.0.0)"
            echo "  --port PORT          Set port number (default: 8006)"
            echo "  --model MODEL_NAME   Set model name or path"
            echo "  -h, --help           Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use -h or --help for usage information"
            exit 1
            ;;
    esac
done

echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"

export RAY_DEDUP_LOGS=0
export VLLM_USE_V1=1
export VLLM_ALL2ALL_BACKEND="pplx"
export VLLM_USE_DEEP_GEMM=1

vllm serve $MODEL_NAME \
    --data-parallel-size $DATA_PARALLEL_SIZE \
    --data-parallel-size-local $DATA_PARALLEL_SIZE \
    --data-parallel-backend ray \
    --enforce-eager \
    --enable-expert-parallel \
    --enable-eplb \
    --num-redundant-experts $REDUNDANT_EXPERTS \
    --trust-remote-code \
    --host $HOST \
    --port $PORT
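An example launch, assuming the script is saved as serve.sh (the name is illustrative; --dp and --re are the flags defined above):

# start the server with DP=4 and 2 redundant experts for EPLB
bash serve.sh --dp 4 --re 2 --host 0.0.0.0 --port 8006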
Lines changed: 92 additions & 0 deletions

From 18c0599c2f07ec965132efa25961dc8179c2dda3 Mon Sep 17 00:00:00 2001
From: Yongji Wu <[email protected]>
Date: Tue, 20 May 2025 13:41:12 -0700
Subject: [PATCH] fix reinit issues due to states not cleaned up

fix double free
---
 src/host/init/init.cu                         | 10 ++++++++++
 .../internal/host/nvshmemi_mem_transport.hpp  | 15 +++++++++++++++
 src/modules/bootstrap/uid/bootstrap_uid.cpp   |  5 +++++
 3 files changed, 30 insertions(+)

diff --git a/src/host/init/init.cu b/src/host/init/init.cu
index b1c5dbf..1fecb4b 100644
--- a/src/host/init/init.cu
+++ b/src/host/init/init.cu
@@ -43,6 +43,8 @@
 #include "internal/host/nvshmemi_types.h"
 #include "internal/host/shared_memory.h"
 #include "internal/host/nvshmemi_symmetric_heap.hpp"
+// eep-dev
+#include "internal/host/nvshmemi_mem_transport.hpp"
 
 extern __constant__ nvshmemi_device_host_state_t nvshmemi_device_state_d;
 static std::map<void *, int> registered_device_states;
@@ -1293,6 +1295,14 @@ void nvshmemid_hostlib_finalize(void *device_ctx, void *transport_device_ctx) {
         /* Multi-init Multi-fini*/
         nvshmemi_state = NULL;
         nvshmemi_device_state.nvshmemi_is_nvshmem_initialized = 0;
+
+        // eep-dev
+        nvshmemi_mem_p2p_transport::destroy_instance();
+        nvshmemi_mem_remote_transport::destroy_instance();
+        free(nvshmemi_default_session);
+        nvshmemi_default_session = nullptr;
+        nvshmemi_device_state.nvshmemi_is_nvshmem_bootstrapped = false;
+
         nvshmemi_is_device_state_ready = false;
     } else
         nvshmemi_boot_handle.barrier(&nvshmemi_boot_handle);
diff --git a/src/include/internal/host/nvshmemi_mem_transport.hpp b/src/include/internal/host/nvshmemi_mem_transport.hpp
index 2495844..e4f408a 100644
--- a/src/include/internal/host/nvshmemi_mem_transport.hpp
+++ b/src/include/internal/host/nvshmemi_mem_transport.hpp
@@ -36,6 +36,13 @@ class nvshmemi_mem_p2p_transport final {
             return p2p_objref_;
         }
     }
+    // eep-dev
+    static void destroy_instance(void) {
+        if (p2p_objref_ != nullptr) {
+            delete p2p_objref_;
+            p2p_objref_ = nullptr;
+        }
+    }
 
     void print_mem_handle(int pe_id, int transport_idx, nvshmemi_symmetric_heap &obj);
 
@@ -87,6 +94,14 @@ class nvshmemi_mem_remote_transport final {
         }
     }
 
+    // eep-dev
+    static void destroy_instance(void) {
+        if (remote_objref_ != nullptr) {
+            delete remote_objref_;
+            remote_objref_ = nullptr;
+        }
+    }
+
     int gather_mem_handles(nvshmemi_symmetric_heap &obj, uint64_t heap_offset, size_t size);
     /* On-demand registration and release of memory */
     int register_mem_handle(nvshmem_mem_handle_t *local_handles, int transport_idx,
diff --git a/src/modules/bootstrap/uid/bootstrap_uid.cpp b/src/modules/bootstrap/uid/bootstrap_uid.cpp
index a1fa748..788fa96 100644
--- a/src/modules/bootstrap/uid/bootstrap_uid.cpp
+++ b/src/modules/bootstrap/uid/bootstrap_uid.cpp
@@ -630,6 +630,11 @@ int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t* handle, const int abi
     // Discover the network for bootstrap, if not done previously.
     // This code needs to be stateful to be able to be called multiple times by the caller
     BOOTSTRAP_CHECK(bootstrap_net_init());
+    // eep-dev
+    if (handle->pre_init_ops != nullptr) {
+        BOOTSTRAP_PTR_FREE(handle->pre_init_ops);
+        handle->pre_init_ops = nullptr;
+    }
     if (handle->pre_init_ops == nullptr) {
         BOOTSTRAP_CALLOC(&handle->pre_init_ops, 1);
         handle->pre_init_ops->get_unique_id = bootstrap_get_unique_id;
-- 
2.43.0
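This NVSHMEM patch (eep_nvshmem.patch) is applied on top of DeepEP's nvshmem.patch by the install script below; done by hand from inside the extracted NVSHMEM source tree, the steps would be roughly:

# paths mirror the install script's layout (eep_nvshmem.patch two levels up from nvshmem_src)
git init
git apply -vvv nvshmem.patch
git apply --reject --whitespace=fix ../../eep_nvshmem.patch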
Lines changed: 86 additions & 0 deletions

#!/bin/bash

set -ex

# Default workspace directory
WORKSPACE=$(pwd)/eep_kernels_workspace
INSTALL_NVSHMEM=true

# Parse command line arguments
while getopts "w:n" opt; do
    case $opt in
        w)
            WORKSPACE="$OPTARG"
            ;;
        n)
            INSTALL_NVSHMEM=false
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            exit 1
            ;;
    esac
done

if [ ! -d "$WORKSPACE" ]; then
    mkdir -p $WORKSPACE
fi


# install dependencies if not installed
pip3 install cmake torch ninja

# build nvshmem
pushd $WORKSPACE
# Reset NVSHMEM build if requested
if [ "$INSTALL_NVSHMEM" = true ]; then
    mkdir -p nvshmem_src
    wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
    tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
    pushd nvshmem_src
    wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
    git init
    git apply -vvv nvshmem.patch
    git apply --reject --whitespace=fix ../../eep_nvshmem.patch
else
    pushd nvshmem_src
fi

# assume CUDA_HOME is set correctly
if [ -z "$CUDA_HOME" ]; then
    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
    exit 1
fi

# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1

export NVSHMEM_SHMEM_SUPPORT=0
export NVSHMEM_UCX_SUPPORT=0
export NVSHMEM_USE_NCCL=0
export NVSHMEM_PMIX_SUPPORT=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_IBRC_SUPPORT=0
export NVSHMEM_BUILD_TESTS=0
export NVSHMEM_BUILD_EXAMPLES=0
export NVSHMEM_MPI_SUPPORT=0
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
export NVSHMEM_BUILD_TXZ_PACKAGE=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0

cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
cmake --build $WORKSPACE/nvshmem_build/ --target install

popd

export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH

# build and install pplx, require pytorch installed
pushd $WORKSPACE
git clone https://github.com/ppl-ai/pplx-kernels
cd pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v
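Typical invocations, assuming the script is saved as install_eep_kernels.sh (name assumed for illustration; -w and -n are the getopts flags defined above):

# build NVSHMEM and pplx-kernels into a custom workspace
bash install_eep_kernels.sh -w /opt/eep_workspace
# reuse a previously downloaded and patched nvshmem_src, skipping the NVSHMEM fetch
bash install_eep_kernels.sh -n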

vllm/config.py

Lines changed: 13 additions & 0 deletions

The new helper synchronizes the usable KV-cache size across a data-parallel group: each rank contributes its local value (with the -1 "not yet measured" sentinel mapped to int64 max so it never wins), and a MIN all-reduce returns the smallest size to every rank. As the inline comment notes, all_reduce is used instead of broadcast because a stateless DP group cannot rely on global ranks.

@@ -2008,6 +2008,19 @@ def has_unfinished_dp(dp_group: "ProcessGroup",
         aggregated_has_unfinished = bool(tensor.item())
         return aggregated_has_unfinished
 
+    @staticmethod
+    def sync_kv_cache_memory_size(dp_group: "ProcessGroup",
+                                  kv_cache_memory: int) -> int:
+        if kv_cache_memory == -1:
+            kv_cache_memory = torch.iinfo(torch.int64).max
+        tensor = torch.tensor([kv_cache_memory],
+                              dtype=torch.int64,
+                              device="cpu")
+        # we cannot use broadcast for stateless dp group since it depends
+        # on global rank
+        torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
+        return tensor.item()
+
     def compute_hash(self):
         """
         Provide a hash that uniquely identifies all the configs
