Skip to content

[DO NOT MERGE] Experiments #368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 18 commits into
base: main
Choose a base branch
from
Draft
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ data:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 1Gi
sizeLimit: 16Gi
{{ `{{- if .HFModelName }}` }}
- name: model-cache
emptyDir: {}
Expand Down Expand Up @@ -185,6 +185,8 @@ data:
- name: VLLM_LOGGING_LEVEL
value: {{ .Values.modelservice.vllm.logLevel }}
{{- end }}
- name: VLLM_IS_PREFILL
value: "1"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5557"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
Expand Down Expand Up @@ -230,7 +232,7 @@ data:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 1Gi
sizeLimit: 16Gi
{{ `{{ if .HFModelName }}` }}
- name: model-cache
emptyDir: {}
Expand Down
4 changes: 2 additions & 2 deletions charts/llm-d/templates/sample-application/modelservice.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
{{- range .Values.sampleApplication.decode.extraArgs }}
- {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
{{- end }}
resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }}
env:
{{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
- name: HF_TOKEN
Expand All @@ -49,7 +49,7 @@ spec:
{{- range .Values.sampleApplication.prefill.extraArgs }}
- {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
{{- end }}
resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
resources: {{ .Values.sampleApplication.prefill.resources | toYaml | nindent 8 }}
env:
{{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
- name: HF_TOKEN
Expand Down
169 changes: 114 additions & 55 deletions charts/llm-d/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -10471,6 +10471,65 @@
"description": "number of desired decode replicas",
"required": [],
"title": "replicas"
},
"resources": {
"description": "ResourceRequirements describes the compute resource requirements.",
"properties": {
"claims": {
"description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
"items": {
"description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
"properties": {
"name": {
"description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
"type": "string"
},
"request": {
"description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
"type": "string"
}
},
"required": [
"name"
],
"type": "object"
},
"type": "array",
"x-kubernetes-list-map-keys": [
"name"
],
"x-kubernetes-list-type": "map"
},
"limits": {
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
}
]
},
"description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
"type": "object"
},
"requests": {
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
}
]
},
"description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
"type": "object"
}
},
"type": "object"
}
},
"required": [],
Expand Down Expand Up @@ -10688,69 +10747,69 @@
"description": "number of desired prefill replicas",
"required": [],
"title": "replicas"
}
},
"required": [],
"title": "prefill",
"type": "object"
},
"resources": {
"description": "ResourceRequirements describes the compute resource requirements.",
"properties": {
"claims": {
"description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
"items": {
"description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
"properties": {
"name": {
"description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
"type": "string"
},
"resources": {
"description": "ResourceRequirements describes the compute resource requirements.",
"properties": {
"claims": {
"description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
"items": {
"description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
"properties": {
"name": {
"description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
"type": "string"
},
"request": {
"description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
"type": "string"
}
},
"required": [
"name"
],
"type": "object"
},
"request": {
"description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
"type": "string"
}
"type": "array",
"x-kubernetes-list-map-keys": [
"name"
],
"x-kubernetes-list-type": "map"
},
"required": [
"name"
],
"type": "object"
},
"type": "array",
"x-kubernetes-list-map-keys": [
"name"
],
"x-kubernetes-list-type": "map"
},
"limits": {
"additionalProperties": {
"oneOf": [
{
"type": "string"
"limits": {
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
}
]
},
{
"type": "number"
}
]
},
"description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
"type": "object"
},
"requests": {
"additionalProperties": {
"oneOf": [
{
"type": "string"
"description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
"type": "object"
},
"requests": {
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
}
]
},
{
"type": "number"
}
]
"description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
"type": "object"
}
},
"description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
"type": "object"
}
},
"required": [],
"title": "prefill",
"type": "object"
}
},
Expand Down
36 changes: 24 additions & 12 deletions charts/llm-d/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,22 +125,22 @@ sampleApplication:
# -- Key within the secret under which the token is located
key: HF_TOKEN

# @schema
# $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
# @schema
# -- Modify resource limits/requests available to the pods
# -- Resource requests/limits
# <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"

# -- InferencePool port configuration
inferencePoolPort: 8000

prefill:
# @schema
# $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
# @schema
# -- Modify resource limits/requests available to the pods
# -- Resource requests/limits
# <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"

# -- number of desired prefill replicas
replicas: 1

Expand All @@ -152,6 +152,18 @@ sampleApplication:
extraArgs: []

decode:
# @schema
# $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
# @schema
# -- Modify resource requests/limits available to the decode pods
# <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"

# -- number of desired decode replicas
replicas: 1

Expand Down
77 changes: 77 additions & 0 deletions quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Benchmark values: 4 prefill replicas and 1 decode replica serving
# RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic.
# NOTE(review): the nesting of model.auth.hfToken, modelservice.epp and the
# top-level redis key should be confirmed against the chart's values.yaml.
sampleApplication:
  baseConfigMapRefName: basic-gpu-with-nixl-preset

  model:
    modelArtifactURI: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
    modelName: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
    auth:
      # Secret name and key holding the Hugging Face token.
      hfToken:
        name: llm-d-hf-token
        key: HF_TOKEN

  # Prefill: four replicas, one GPU each (tensor-parallel size 1).
  prefill:
    replicas: 4
    resources:
      limits:
        nvidia.com/gpu: 1
        rdma/ib: 1
      requests:
        nvidia.com/gpu: 1
        rdma/ib: 1
        cpu: "32"
        memory: 128Gi
    # Flag and value are passed as separate list items; keep them paired.
    extraArgs:
      - "--tensor-parallel-size"
      - "1"
      - "--disable-log-requests"
      - "--max-model-len"
      - "32768"
      - "--block-size"
      - "128"

  # Decode: one replica spanning four GPUs (tensor-parallel size 4).
  decode:
    replicas: 1
    resources:
      limits:
        nvidia.com/gpu: 4
        rdma/ib: 1
      requests:
        nvidia.com/gpu: 4
        rdma/ib: 1
        cpu: "32"
        memory: 128Gi
    # The --tensor-parallel-size value matches the GPU count requested above.
    extraArgs:
      - "--tensor-parallel-size"
      - "4"
      - "--disable-log-requests"
      - "--max-model-len"
      - "32768"
      - "--block-size"
      - "128"

modelservice:
  vllm:
    # Custom vLLM image used for this benchmark run.
    image:
      registry: docker.io
      repository: robertgouldshaw2/vllm-nixl
      tag: nixl-oh-debug-fixed-0.3

  epp:
    # Environment overrides for epp. Values are strings, so the booleans are
    # quoted. Only the load-aware scorers and PD_ENABLED are switched on.
    defaultEnvVarsOverride:
      - name: ENABLE_KVCACHE_AWARE_SCORER
        value: "false"
      - name: ENABLE_PREFIX_AWARE_SCORER
        value: "false"
      - name: ENABLE_LOAD_AWARE_SCORER
        value: "true"
      - name: ENABLE_SESSION_AWARE_SCORER
        value: "false"
      - name: PD_ENABLED
        value: "true"
      - name: PD_PROMPT_LEN_THRESHOLD
        value: "10"
      - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
        value: "false"
      - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
        value: "true"
      - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
        value: "false"
      - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
        value: "false"

redis:
  enabled: false
Loading
Loading