llm-d · robertgshaw2-redhat · Jun 6, 2025 · Jun 8, 2025 · Jun 8, 2025 · Jun 8, 2025
diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -129,7 +129,7 @@ data:
             - name: dshm
               emptyDir:
                 medium: Memory
-                sizeLimit: 1Gi
+                sizeLimit: 16Gi
             {{ `{{- if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -185,6 +185,8 @@ data:
                 - name: VLLM_LOGGING_LEVEL
                   value: {{ .Values.modelservice.vllm.logLevel }}
                 {{- end }}
+                - name: VLLM_IS_PREFILL
+                  value: "1"
                 - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                   value: "5557"
                 - name: VLLM_NIXL_SIDE_CHANNEL_HOST
@@ -230,7 +232,7 @@ data:
             - name: dshm
               emptyDir:
                 medium: Memory
-                sizeLimit: 1Gi
+                sizeLimit: 16Gi
             {{ `{{ if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}

diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml
@@ -30,7 +30,7 @@ spec:
       {{- range .Values.sampleApplication.decode.extraArgs }}
       - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
       {{- end }}
-      resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
+      resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }}
       env:
       {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
       - name: HF_TOKEN
@@ -49,7 +49,7 @@ spec:
       {{- range .Values.sampleApplication.prefill.extraArgs }}
       - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
       {{- end }}
-      resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
+      resources: {{ .Values.sampleApplication.prefill.resources | toYaml | nindent 8 }}
       env:
       {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
       - name: HF_TOKEN

diff --git a/charts/llm-d/values.schema.json b/charts/llm-d/values.schema.json
@@ -10471,6 +10471,65 @@
                             "description": "number of desired decode replicas",
                             "required": [],
                             "title": "replicas"
+                        },
+                        "resources": {
+                            "description": "ResourceRequirements describes the compute resource requirements.",
+                            "properties": {
+                                "claims": {
+                                    "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
+                                    "items": {
+                                        "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
+                                        "properties": {
+                                            "name": {
+                                                "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
+                                                "type": "string"
+                                            },
+                                            "request": {
+                                                "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
+                                                "type": "string"
+                                            }
+                                        },
+                                        "required": [
+                                            "name"
+                                        ],
+                                        "type": "object"
+                                    },
+                                    "type": "array",
+                                    "x-kubernetes-list-map-keys": [
+                                        "name"
+                                    ],
+                                    "x-kubernetes-list-type": "map"
+                                },
+                                "limits": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
+                                    },
+                                    "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                },
+                                "requests": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
+                                    },
+                                    "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                }
+                            },
+                            "type": "object"
                         }
                     },
                     "required": [],
@@ -10688,69 +10747,69 @@
                             "description": "number of desired prefill replicas",
                             "required": [],
                             "title": "replicas"
-                        }
-                    },
-                    "required": [],
-                    "title": "prefill",
-                    "type": "object"
-                },
-                "resources": {
-                    "description": "ResourceRequirements describes the compute resource requirements.",
-                    "properties": {
-                        "claims": {
-                            "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
-                            "items": {
-                                "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
-                                "properties": {
-                                    "name": {
-                                        "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
-                                        "type": "string"
+                        },
+                        "resources": {
+                            "description": "ResourceRequirements describes the compute resource requirements.",
+                            "properties": {
+                                "claims": {
+                                    "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
+                                    "items": {
+                                        "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
+                                        "properties": {
+                                            "name": {
+                                                "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
+                                                "type": "string"
+                                            },
+                                            "request": {
+                                                "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
+                                                "type": "string"
+                                            }
+                                        },
+                                        "required": [
+                                            "name"
+                                        ],
+                                        "type": "object"
                                     },
-                                    "request": {
-                                        "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
-                                        "type": "string"
-                                    }
+                                    "type": "array",
+                                    "x-kubernetes-list-map-keys": [
+                                        "name"
+                                    ],
+                                    "x-kubernetes-list-type": "map"
                                 },
-                                "required": [
-                                    "name"
-                                ],
-                                "type": "object"
-                            },
-                            "type": "array",
-                            "x-kubernetes-list-map-keys": [
-                                "name"
-                            ],
-                            "x-kubernetes-list-type": "map"
-                        },
-                        "limits": {
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "string"
+                                "limits": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
                                     },
-                                    {
-                                        "type": "number"
-                                    }
-                                ]
-                            },
-                            "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
-                            "type": "object"
-                        },
-                        "requests": {
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "string"
+                                    "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                },
+                                "requests": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
                                     },
-                                    {
-                                        "type": "number"
-                                    }
-                                ]
+                                    "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                }
                             },
-                            "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
                             "type": "object"
                         }
                     },
+                    "required": [],
+                    "title": "prefill",
                     "type": "object"
                 }
             },

diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
@@ -125,22 +125,22 @@ sampleApplication:
         # -- Key within the secret under which the token is located
         key: HF_TOKEN
 
-  # @schema
-  # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
-  # @schema
-  # -- Modify resource limits/requests available to the pods
-  # -- Resource requests/limits
-  # <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
-  resources:
-    limits:
-      nvidia.com/gpu: "1"
-    requests:
-      nvidia.com/gpu: "1"
-
   # -- InferencePool port configuration
   inferencePoolPort: 8000
 
   prefill:
+    # @schema
+    # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
+    # @schema
+    # -- Resource requests/limits for the prefill pods
+    # (sized separately from `decode.resources`)
+    # <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
+    resources:
+      limits:
+        nvidia.com/gpu: "1"
+      requests:
+        nvidia.com/gpu: "1"
+
     # -- number of desired prefill replicas
     replicas: 1
 
@@ -152,6 +152,18 @@ sampleApplication:
     extraArgs: []
 
   decode:
+    # @schema
+    # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
+    # @schema
+    # -- Resource requests/limits for the decode pods
+    # (sized separately from `prefill.resources`)
+    # <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
+    resources:
+      limits:
+        nvidia.com/gpu: "1"
+      requests:
+        nvidia.com/gpu: "1"
+
     # -- number of desired decode replicas
     replicas: 1
 

diff --git a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
@@ -0,0 +1,84 @@
+# Benchmark values: 4 prefill replicas (1 GPU each, TP=1) feeding 1 decode
+# replica (4 GPUs, TP=4), serving Llama 3.3 70B Instruct FP8 via the NIXL preset.
+sampleApplication:
+  baseConfigMapRefName: basic-gpu-with-nixl-preset
+  model:
+    modelArtifactURI: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+    modelName: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+    auth:
+      hfToken:
+        name: llm-d-hf-token
+        key: HF_TOKEN
+  prefill:
+    replicas: 4
+    resources:
+      limits:
+        nvidia.com/gpu: "1"
+        rdma/ib: "1"
+      requests:
+        nvidia.com/gpu: "1"
+        rdma/ib: "1"
+        cpu: "32"
+        memory: 128Gi
+    # --tensor-parallel-size equals the nvidia.com/gpu count requested above.
+    extraArgs:
+      - "--tensor-parallel-size"
+      - "1"
+      - "--disable-log-requests"
+      - "--max-model-len"
+      - "32768"
+      - "--block-size"
+      - "128"
+  decode:
+    replicas: 1
+    resources:
+      limits:
+        nvidia.com/gpu: "4"
+        rdma/ib: "1"
+      requests:
+        nvidia.com/gpu: "4"
+        rdma/ib: "1"
+        cpu: "32"
+        memory: 128Gi
+    # --tensor-parallel-size equals the nvidia.com/gpu count requested above.
+    extraArgs:
+      - "--tensor-parallel-size"
+      - "4"
+      - "--disable-log-requests"
+      - "--max-model-len"
+      - "32768"
+      - "--block-size"
+      - "128"
+modelservice:
+  vllm:
+    # NOTE(review): personal Docker Hub repository with a debug tag — confirm
+    # this is the intended image before reusing the example.
+    image:
+      registry: docker.io
+      repository: robertgouldshaw2/vllm-nixl
+      tag: nixl-oh-debug-fixed-0.3
+  epp:
+    # Values stay quoted: these are name/value env var entries, which take strings.
+    defaultEnvVarsOverride:
+      - name: ENABLE_KVCACHE_AWARE_SCORER
+        value: "false"
+      - name: ENABLE_PREFIX_AWARE_SCORER
+        value: "false"
+      - name: ENABLE_LOAD_AWARE_SCORER
+        value: "true"
+      - name: ENABLE_SESSION_AWARE_SCORER
+        value: "false"
+      - name: PD_ENABLED
+        value: "true"
+      - name: PD_PROMPT_LEN_THRESHOLD
+        value: "10"
+      - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
+        value: "false"
+      - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
+        value: "true"
+      - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
+        value: "false"
+      - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
+        value: "false"
+redis:
+  enabled: false