From f7506017ced54cae1b8c41705acc4aea82304693 Mon Sep 17 00:00:00 2001
From: mnmehta <30246802+mnmehta@users.noreply.github.com>
Date: Thu, 5 Jun 2025 23:09:57 -0700
Subject: [PATCH 01/18] For quick validation use 1st decode pod if there are
 multiple pods (#305)

Signed-off-by: mnmehta <30246802+mnmehta@users.noreply.github.com>

updated

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>

updated

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
---
 .../sample-application/modelservice.yaml      |   4 +-
 charts/llm-d/values.yaml                      |  36 +-
 .../examples/rob-benchmarking/2P1D-het.yaml   | 697 ++++++++++++++++++
 quickstart/examples/rob-benchmarking/Justfile |  50 ++
 .../examples/rob-benchmarking/Justfile.remote |  36 +
 .../benchmark-interactive-pod.yaml            |  32 +
 quickstart/test-request.sh                    |   2 +-
 7 files changed, 842 insertions(+), 15 deletions(-)
 create mode 100644 quickstart/examples/rob-benchmarking/2P1D-het.yaml
 create mode 100644 quickstart/examples/rob-benchmarking/Justfile
 create mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote
 create mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml
index 6ba5c22..efa35d6 100644
--- a/charts/llm-d/templates/sample-application/modelservice.yaml
+++ b/charts/llm-d/templates/sample-application/modelservice.yaml
@@ -30,7 +30,7 @@ spec:
       {{- range .Values.sampleApplication.decode.extraArgs }}
       - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
       {{- end }}
-      resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
+      resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }}
       env:
       {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
       - name: HF_TOKEN
@@ -49,7 +49,7 @@ spec:
       {{- range .Values.sampleApplication.prefill.extraArgs }}
       - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
       {{- end }}
-      resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }}
+      resources: {{ .Values.sampleApplication.prefill.resources | toYaml | nindent 8 }}
       env:
       {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
       - name: HF_TOKEN
diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
index 0d9e000..d0aa57a 100644
--- a/charts/llm-d/values.yaml
+++ b/charts/llm-d/values.yaml
@@ -125,22 +125,22 @@ sampleApplication:
         # -- Key within the secret under which the token is located
         key: HF_TOKEN
 
-  # @schema
-  # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
-  # @schema
-  # -- Modify resource limits/requests available to the pods
-  # -- Resource requests/limits
-  # <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
-  resources:
-    limits:
-      nvidia.com/gpu: "1"
-    requests:
-      nvidia.com/gpu: "1"
-
   # -- InferencePool port configuration
   inferencePoolPort: 8000
 
   prefill:
+    # @schema
+    # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
+    # @schema
+    # -- Modify resource limits/requests available to the pods
+    # -- Resource requests/limits
+    # <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
+    resources:
+      limits:
+        nvidia.com/gpu: "1"
+      requests:
+        nvidia.com/gpu: "1"
+
     # -- number of desired prefill replicas
     replicas: 1
 
@@ -152,6 +152,18 @@ sampleApplication:
     extraArgs: []
 
   decode:
+    # @schema
+    # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements
+    # @schema
+    # -- Modify resource limits/requests available to the pods
+    # -- Resource requests/limits
+    # <br /> Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
+    resources:
+      limits:
+        nvidia.com/gpu: "1"
+      requests:
+        nvidia.com/gpu: "1"
+
     # -- number of desired decode replicas
     replicas: 1
 
diff --git a/quickstart/examples/rob-benchmarking/2P1D-het.yaml b/quickstart/examples/rob-benchmarking/2P1D-het.yaml
new file mode 100644
index 0000000..2dfc7b4
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/2P1D-het.yaml
@@ -0,0 +1,697 @@
+# yaml-language-server: $schema=values.schema.json
+
+# Default values for the llm-d chart.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# -- Global parameters
+# Global Docker image parameters
+# Please, note that this will override the image parameters, including dependencies, configured to use the global value
+# Current available global Docker image parameters: imageRegistry, imagePullSecrets and storageClass
+# @default -- See below
+global:
+    # -- Global Docker image registry
+    imageRegistry: ""
+
+    # @schema
+    # items:
+    #   type: string
+    # @schema
+    # -- Global Docker registry secret names as an array
+    # </br> E.g. `imagePullSecrets: [myRegistryKeySecretName]`
+    imagePullSecrets: []
+
+    security:
+        allowInsecureImages: true
+
+# @schema
+# additionalProperties: true
+# @schema
+# -- Parameters for bitnami.common dependency
+common: {}
+
+# -- Common parameters
+# -- Override Kubernetes version
+kubeVersion: ""
+
+# -- String to partially override common.names.fullname
+nameOverride: ""
+
+# -- String to fully override common.names.fullname
+fullnameOverride: ""
+
+# -- Default Kubernetes cluster domain
+clusterDomain: cluster.local
+
+# @schema
+# additionalProperties: true
+# @schema
+# -- Labels to add to all deployed objects
+commonLabels: {}
+
+# @schema
+# additionalProperties: true
+# @schema
+# -- Annotations to add to all deployed objects
+commonAnnotations: {}
+
+# @schema
+# items:
+#   type: [string, object]
+# @schema
+# -- Array of extra objects to deploy with the release
+extraDeploy: []
+
+# -- Helm tests
+test:
+    # -- Enable rendering of helm test resources
+    enabled: false
+
+    # @default -- See below
+    image:
+        # -- Test connection pod image registry
+        registry: quay.io
+
+        # -- Test connection pod image repository. Note that the image needs to have both the `sh` and `curl` binaries in it.
+        repository: curl/curl
+
+        # -- Test connection pod image tag. Note that the image needs to have both the `sh` and `curl` binaries in it.
+        tag: latest
+
+        # -- Specify a imagePullPolicy
+        imagePullPolicy: "Always"
+
+        # @schema
+        # items:
+        #   type: string
+        # @schema
+        # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
+        pullSecrets: []
+
+# -- Sample application deploying a p-d pair of specific model
+# @default -- See below
+sampleApplication:
+    baseConfigMapRefName: basic-gpu-with-nixl-preset
+
+    # -- Enable rendering of sample application resources
+    enabled: true
+
+    model:
+        # -- Fully qualified pvc URI: pvc://<pvc-name>/<model-path>
+        modelArtifactURI: hf://RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic
+
+        # -- Name of the model
+        modelName: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
+
+        # -- Aliases to the Model named vllm will serve with
+        servedModelNames: []
+
+        auth:
+            # -- HF token auth config via k8s secret.
+            hfToken:
+                # -- Name of the secret to create to store your huggingface token
+                name: llm-d-hf-token
+                # -- Value of the token. Do not set this but use `envsubst` in conjunction with the helm chart
+                key: HF_TOKEN
+
+    # -- InferencePool port configuration
+    inferencePoolPort: 8000
+
+    prefill:
+        # -- number of desired prefill replicas
+        replicas: 2
+
+        # @schema
+        # items:
+        #   type: string
+        # @schema
+        # -- args to add to the prefill deployment
+        extraArgs:
+            - "--tensor-parallel-size"
+            - "1"
+            - "--disable-log-requests"
+            - "--max-model-len"
+            - "32768"
+            - "--distributed-executor-backend"
+            - "mp"
+            - "--block-size"
+            - "128"
+            - "--max-num-batched-tokens"
+            - "32768"
+
+    decode:
+        # -- number of desired decode replicas
+        replicas: 1
+
+        # @schema
+        # items:
+        #   type: string
+        # @schema
+        # -- args to add to the decode deployment
+        extraArgs:
+            - "--tensor-parallel-size"
+            - "4"
+            - "--disable-log-requests"
+            - "--max-model-len"
+            - "32768"
+            - "--distributed-executor-backend"
+            - "mp"
+            - "--block-size"
+            - "128"
+
+# -- Gateway configuration
+# @default -- See below
+gateway:
+    # -- Deploy resources related to Gateway
+    enabled: true
+
+    # --  String to fully override gateway.fullname
+    fullnameOverride: ""
+
+    # -- String to partially override gateway.fullname
+    nameOverride: ""
+
+    # -- Gateway class that determines the backend used
+    # Currently supported values: "kgateway" or "istio"
+    gatewayClassName: kgateway
+
+    # @schema
+    # additionalProperties: true
+    # @schema
+    # -- Additional annotations provided to the Gateway resource
+    annotations: {}
+
+    # Special parameters applied to kGateway via GatewayParameters resource
+    kGatewayParameters:
+        # @schema
+        # type: [number, boolean]
+        # @schema
+        proxyUID: false
+
+    # @schema
+    # items:
+    #  type: object
+    #  properties:
+    #    name:
+    #      description: Name is the name of the Listener. This name MUST be unique within a Gateway
+    #      type: string
+    #    path:
+    #      description: Path to expose via Ingress
+    #      type: string
+    #    port:
+    #      description: Port is the network port. Multiple listeners may use the same port, subject to the Listener compatibility rules
+    #      type: integer
+    #      minimum: 1
+    #      maximum: 65535
+    #    protocol:
+    #      description: Protocol specifies the network protocol this listener expects to receive
+    #      type: string
+    # @schema
+    # Set of listeners exposed via the Gateway, also propagated to the Ingress if enabled
+    listeners:
+        - name: default
+          path: /
+          port: 80
+          protocol: HTTP
+
+    # -- Gateway's service type. Ingress is only available if the service type is set to NodePort. Accepted values: ["LoadBalancer", "NodePort"]
+    serviceType: NodePort
+
+# -- Ingress configuration
+# @default -- See below
+ingress:
+    # -- Deploy Ingress
+    enabled: true
+
+    # -- Name of the IngressClass cluster resource which defines which controller will implement the resource (e.g nginx)
+    ingressClassName: ""
+
+    # @schema
+    # additionalProperties: true
+    # @schema
+    # -- Additional annotations for the Ingress resource
+    annotations: {}
+
+    # -- Hostname to be used to expose the NodePort service to the inferencing gateway
+    host: ""
+
+    # -- List of additional hostnames to be covered with this ingress record (e.g. a CNAME)
+    # <!-- E.g.
+    # extraHosts:
+    #   - name: llm-d.env.example.com
+    #     path: / (Optional)
+    #     pathType: Prefix (Optional)
+    #     port: 7007 (Optional) -->
+    extraHosts: []
+
+    # -- Path to be used to expose the full route to access the inferencing gateway
+    path: "/"
+
+    # -- Ingress TLS parameters
+    tls:
+        # -- Enable TLS configuration for the host defined at `ingress.host` parameter
+        enabled: false
+
+        # -- The name to which the TLS Secret will be called
+        secretName: ""
+
+    # @schema
+    # items:
+    #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.networking.v1.IngressTLS
+    # @schema
+    # -- The TLS configuration for additional hostnames to be covered with this ingress record.
+    # <br /> Ref: https://kubernetes.io/docs/concepts/services-networking/ingress/#tls
+    # <!-- E.g.
+    # extraTls:
+    #   - hosts:
+    #     - llm-d.env.example.com
+    #     secretName: llm-d-env -->
+    extraTls: []
+
+    # -- used as part of the host dirivation if not specified from OCP cluster domain (dont edit)
+    clusterRouterBase: ""
+
+# -- Model service controller configuration
+# @default -- See below
+modelservice:
+    # -- Toggle to deploy modelservice controller related resources
+    enabled: true
+
+    # -- Enable metrics gathering via podMonitor / ServiceMonitor
+    metrics:
+        # -- Enable metrics scraping from prefill and decode services, see `model
+        enabled: true
+
+        # -- Prometheus ServiceMonitor configuration
+        # <br /> Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md
+        # @default -- See below
+        serviceMonitor:
+            # @schema
+            # additionalProperties: true
+            # @schema
+            # -- Additional annotations provided to the ServiceMonitor
+            annotations: {}
+
+            # @schema
+            # additionalProperties: true
+            # @schema
+            # -- Additional labels provided to the ServiceMonitor
+            labels: {}
+
+            # -- ServiceMonitor endpoint port
+            port: "vllm"
+
+            # -- ServiceMonitor endpoint path
+            path: "/metrics"
+
+            # -- ServiceMonitor endpoint interval at which metrics should be scraped
+            interval: "15s"
+
+            # -- ServiceMonitor namespace selector
+            namespaceSelector:
+                any: false
+
+                # @schema
+                # items:
+                #   type: string
+                # @schema
+                matchNames: []
+
+            # -- ServiceMonitor selector matchLabels
+            # </br> matchLabels must match labels on modelservice Services
+            selector:
+                # @schema
+                # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector
+                # @schema
+                matchLabels: {}
+
+    # --  String to fully override modelservice.fullname
+    fullnameOverride: ""
+
+    # --  String to partially override modelservice.fullname
+    nameOverride: ""
+
+    # -- Number of controller replicas
+    replicas: 1
+
+    # -- Modelservice controller image, please change only if appropriate adjustments to the CRD are being made
+    # @default -- See below
+    image:
+        # -- Model Service controller image registry
+        registry: ghcr.io
+
+        # -- Model Service controller image repository
+        repository: llm-d/llm-d-model-service
+
+        # -- Model Service controller image tag
+        tag: "0.0.10"
+
+        # -- Specify a imagePullPolicy
+        imagePullPolicy: "Always"
+
+        # @schema
+        # items:
+        #   type: string
+        # @schema
+        # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
+        pullSecrets: []
+
+    # -- Endpoint picker configuration
+    # @default -- See below
+    epp:
+        # -- Endpoint picker image used in ModelService CR presets
+        # @default -- See below
+        image:
+            # -- Endpoint picker image registry
+            registry: ghcr.io
+
+            # -- Endpoint picker image repository
+            repository: llm-d/llm-d-inference-scheduler
+
+            # -- Endpoint picker image tag
+            tag: 0.0.2
+
+            # -- Specify a imagePullPolicy
+            imagePullPolicy: "Always"
+
+            # @schema
+            # items:
+            #   type: string
+            # @schema
+            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
+            pullSecrets: []
+
+        # -- Enable metrics gathering via podMonitor / ServiceMonitor
+        metrics:
+            # -- Enable metrics scraping from endpoint picker service
+            enabled: true
+
+            # -- Prometheus ServiceMonitor configuration
+            # <br /> Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md
+            # @default -- See below
+            serviceMonitor:
+                # @schema
+                # additionalProperties: true
+                # @schema
+                # -- Additional annotations provided to the ServiceMonitor
+                annotations: {}
+
+                # @schema
+                # additionalProperties: true
+                # @schema
+                # -- Additional labels provided to the ServiceMonitor
+                labels: {}
+
+                # -- ServiceMonitor endpoint port
+                port: "metrics"
+
+                # -- ServiceMonitor endpoint path
+                path: "/metrics"
+
+                # -- ServiceMonitor endpoint interval at which metrics should be scraped
+                interval: "10s"
+
+                # -- ServiceMonitor namespace selector
+                namespaceSelector:
+                    any: false
+
+                    # @schema
+                    # items:
+                    #   type: string
+                    # @schema
+                    matchNames: []
+
+                # -- ServiceMonitor selector matchLabels
+                # </br> matchLabels must match labels on modelservice Services
+                selector:
+                    # @schema
+                    # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector
+                    # @schema
+                    matchLabels: {}
+
+        # -- Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again.
+        # Ref: https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md#scorers--configuration
+        defaultEnvVars:
+            - name: ENABLE_KVCACHE_AWARE_SCORER
+              value: "false"
+            - name: KVCACHE_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: KVCACHE_INDEXER_REDIS_ADDR
+              value: '{{ if .Values.redis.enabled }}{{ include "redis.master.service.fullurl" . }}{{ end }}'
+            - name: ENABLE_PREFIX_AWARE_SCORER
+              value: "true"
+            - name: PREFIX_AWARE_SCORER_WEIGHT
+              value: "2"
+            - name: ENABLE_LOAD_AWARE_SCORER
+              value: "true"
+            - name: LOAD_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: ENABLE_SESSION_AWARE_SCORER
+              value: "false"
+            - name: SESSION_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: PD_ENABLED
+              value: "true"
+            - name: PD_PROMPT_LEN_THRESHOLD
+              value: "512"
+            - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
+              value: "false"
+            - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
+              value: "true"
+            - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
+              value: "true"
+            - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT
+              value: "2"
+            - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
+              value: "false"
+            - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: DECODE_ENABLE_KVCACHE_AWARE_SCORER
+              value: "false"
+            - name: DECODE_KVCACHE_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: DECODE_ENABLE_LOAD_AWARE_SCORER
+              value: "true"
+            - name: DECODE_LOAD_AWARE_SCORER_WEIGHT
+              value: "1"
+            - name: DECODE_ENABLE_PREFIX_AWARE_SCORER
+              value: "true"
+            - name: DECODE_PREFIX_AWARE_SCORER_WEIGHT
+              value: "2"
+            - name: DECODE_ENABLE_SESSION_AWARE_SCORER
+              value: "false"
+            - name: DECODE_SESSION_AWARE_SCORER_WEIGHT
+              value: "1"
+
+        # @schema
+        # items:
+        #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar
+        # @schema
+        # -- Additional environment variables for endpoint picker
+        defaultEnvVarsOverride: []
+
+    # -- Prefill options
+    # @default -- See below
+    prefill:
+        # @schema
+        # items:
+        #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration
+        # @schema
+        # -- Tolerations configuration to deploy prefill pods to tainted nodes
+        # @default -- See below
+        tolerations:
+            # -- default NVIDIA GPU toleration
+            - key: nvidia.com/gpu
+              operator: Exists
+              effect: NoSchedule
+
+    # -- Decode options
+    # @default -- See below
+    decode:
+        # @schema
+        # items:
+        #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration
+        # @schema
+        # -- Tolerations configuration to deploy decode pods to tainted nodes
+        # @default -- See below
+        tolerations:
+            # -- default NVIDIA GPU toleration
+            - key: nvidia.com/gpu
+              operator: Exists
+              effect: NoSchedule
+
+    # -- vLLM container options
+    # @default -- See below
+    vllm:
+        # -- vLLM image used in ModelService CR presets
+        # @default -- See below
+        image:
+            # -- llm-d image registry
+            registry: ghcr.io
+
+            # -- llm-d image repository
+            repository: llm-d/llm-d-dev
+
+            # -- llm-d image tag
+            tag: 0.0.10
+
+            # -- Specify a imagePullPolicy
+            imagePullPolicy: "IfNotPresent"
+
+            # @schema
+            # items:
+            #   type: string
+            # @schema
+            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
+            pullSecrets: []
+
+        # -- Enable metrics gathering via podMonitor / ServiceMonitor
+        metrics:
+            # -- Enable metrics scraping from prefill & decode services
+            enabled: true
+
+    # -- Routing proxy container options
+    # @default -- See below
+    routingProxy:
+        # -- Routing proxy image used in ModelService CR presets
+        image:
+            # -- Routing proxy image registry
+            registry: ghcr.io
+
+            # -- Routing proxy image repository
+            repository: llm-d/llm-d-routing-sidecar
+
+            # -- Routing proxy image tag
+            tag: "0.0.6"
+
+            # -- Specify a imagePullPolicy
+            imagePullPolicy: "IfNotPresent"
+
+            # @schema
+            # items:
+            #   type: string
+            # @schema
+            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
+            pullSecrets: []
+
+    # -- llm-d inference simulator container options
+    # @default -- See below
+    inferenceSimulator:
+        # -- llm-d inference simulator image used in ModelService CR presets
+        # @default -- See below
+        image:
+            # -- llm-d inference simulator image registry
+            registry: ghcr.io
+
+            # -- llm-d inference simulator image repository
+            repository: llm-d/llm-d-inference-sim
+
+            # -- llm-d inference simulator image tag
+            tag: "0.0.4"
+
+            # -- Specify a imagePullPolicy
+            imagePullPolicy: "IfNotPresent"
+
+            # @schema
+            # items:
+            #   type: string
+            # @schema
+            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
+            pullSecrets: []
+
+    # @schema
+    # additionalProperties: true
+    # @schema
+    # -- Annotations to add to all modelservice resources
+    annotations: {}
+
+    # @schema
+    # additionalProperties: true
+    # @schema
+    # -- Pod annotations for modelservice
+    podAnnotations: {}
+
+    # @schema
+    # additionalProperties: true
+    # @schema
+    # -- Pod labels for modelservice
+    podLabels: {}
+
+    # Model service controller settings
+    service:
+        # -- Toggle to deploy a Service resource for Model service controller
+        enabled: true
+
+        # -- Port number exposed from Model Service controller
+        port: 8443
+
+        # -- Service type
+        type: ClusterIP
+
+    # -- Service Account Configuration
+    # @default -- See below
+    serviceAccount:
+        # -- Enable the creation of a ServiceAccount for Modelservice pods
+        create: true
+
+        # --  String to fully override modelservice.serviceAccountName, defaults to modelservice.fullname
+        fullnameOverride: ""
+
+        # --  String to partially override modelservice.serviceAccountName, defaults to modelservice.fullname
+        nameOverride: ""
+
+        # @schema
+        # additionalProperties: true
+        # @schema
+        # -- Additional custom labels to the service ServiceAccount.
+        labels: {}
+
+        # @schema
+        # additionalProperties: true
+        # @schema
+        # -- Additional custom annotations for the ServiceAccount.
+        annotations: {}
+
+    rbac:
+        # -- Enable the creation of RBAC resources
+        create: true
+
+# @schema
+# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json
+# @schema
+# -- Bitnami/Redis chart configuration
+# @default -- Use sane defaults for minimal Redis deployment
+redis:
+    enabled: false
+    auth:
+        enabled: false
+        existingSecretPasswordKey: ""
+        existingSecret: ""
+    architecture: standalone
+    image:
+        registry: quay.io
+        repository: sclorg/redis-7-c9s
+        tag: c9s
+    master:
+        kind: Deployment
+        resources:
+            limits:
+                memory: "256Mi"
+                cpu: "250m"
+            requests:
+                memory: "128Mi"
+                cpu: "100m"
+        persistence:
+            enabled: true
+            size: "5Gi"
+        pdb:
+            create: false
+        service:
+            ports:
+                redis: 8100
+    networkPolicy:
+        enabled: false
diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
new file mode 100644
index 0000000..209e44f
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -0,0 +1,50 @@
+NAMESPACE := "pete-davidson"
+MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
+
+logs POD:
+    kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1"
+
+get-ips:
+    just get-pods | awk '/^redhatai-llama-4-maverick-17b-128e-instruct-fp8-(decode|prefill)/ {print $6}'
+get-pods:
+    kubectl get pods -n {{NAMESPACE}} -o wide
+
+hf-token:
+  kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n {{NAMESPACE}}
+
+[working-directory: '../quickstart']
+install VALUES:
+    ./llmd-installer.sh \
+        --hf-token $HF_TOKEN \
+        --namespace {{NAMESPACE}} \
+        --storage-class shared-vast --storage-size 300Gi \
+        --values-file $PWD/../project/{{VALUES}}
+
+start VALUES: 
+    just install {{VALUES}} && \
+    just hf-token && \
+    just start-bench
+
+[working-directory: '../quickstart']
+uninstall VALUES:
+    ./llmd-installer.sh \
+        --hf-token $HF_TOKEN \
+        --namespace {{NAMESPACE}} \
+        --storage-class shared-vast  --storage-size 300Gi \
+        --values-file $PWD/../project/{{VALUES}} \
+        --uninstall
+
+gh-token GH_TOKEN:
+    kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}}
+
+# Interactive benchmark commands:
+start-bench:
+    kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml
+
+delete-bench:
+    kubectl delete pod -n {{NAMESPACE}} benchmark-interactive
+
+exec-bench:
+    kubectl cp reset_prefixes.sh {{NAMESPACE}}/benchmark-interactive:/app/reset_prefixes.sh && \
+    kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \
+    kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
new file mode 100644
index 0000000..bbec981
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/Justfile.remote
@@ -0,0 +1,36 @@
+# Use this Justfile within the cluster.
+
+# MODEL := "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-FP8"
+MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+BASE_URL := "http://llm-d-inference-gateway"
+
+eval:
+    lm_eval --model local-completions --tasks gsm8k \
+    --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=50,max_retries=3,tokenized_requests=False \
+    --limit 100
+
+benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+    python vllm/benchmarks/benchmark_serving.py \
+        --base-url {{BASE_URL}} \
+        --model {{MODEL}} \
+        --dataset-name random \
+        --random-input-len {{INPUT_LEN}} \
+        --random-output-len {{OUTPUT_LEN}}  \
+        --request-rate {{RR}} \
+        --seed $(date +%M%H%M%S) \
+        --num-prompts {{NUM_REQUESTS}} \
+        --ignore-eos
+
+# just benchmark 4 1000 15000 5000 <-- current 1P3D setup
+#
+benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+    python vllm/benchmarks/benchmark_serving.py \
+        --base-url http://{{POD_IP}}:8000 \
+        --model {{MODEL}} \
+        --dataset-name random \
+        --random-input-len {{INPUT_LEN}} \
+        --random-output-len {{OUTPUT_LEN}}  \
+        --request-rate {{RR}} \
+        --seed $(date +%M%H%M%S) \
+        --num-prompts {{NUM_REQUESTS}} \
+        --ignore-eos
diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
new file mode 100644
index 0000000..bcb6434
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
@@ -0,0 +1,32 @@
+# benchmark-client-interactive-pod.yaml
+apiVersion: v1
+kind: Pod
+metadata:
+    name: benchmark-interactive
+    labels:
+        app: benchmark-interactive # Labels for organization
+spec:
+    containers:
+        - name: benchmark-runner
+          image: "quay.io/tms/pd-disagg-benchmark:0.0.6"
+          imagePullPolicy: Always
+          stdin: true
+          tty: true
+          resources:
+              requests:
+                  cpu: "16"
+                  memory: "64Gi"
+              limits:
+                  cpu: "16"
+                  memory: "64Gi"
+          env:
+              - name: PROXY_HOST
+                value: "custom-llm-proxy-service"
+              - name: PROXY_PORT
+                value: "80"
+              - name: HF_TOKEN
+                valueFrom:
+                    secretKeyRef:
+                        name: hf-token-secret # set up with just hf_token
+                        key: HF_TOKEN
+    restartPolicy: Never
diff --git a/quickstart/test-request.sh b/quickstart/test-request.sh
index 5635240..26f0afc 100755
--- a/quickstart/test-request.sh
+++ b/quickstart/test-request.sh
@@ -89,7 +89,7 @@ validation() {
   # Discover the decode pod IP
   POD_IP=$(kubectl get pods -n "$NAMESPACE" \
     -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.podIP}{"\n"}{end}' \
-    | grep decode | awk '{print $2}')
+    | grep decode | awk '{print $2}' | head -1)
 
   if [[ -z "$POD_IP" ]]; then
       echo "Error: no decode pod found in namespace $NAMESPACE"

From 36ab0c996e53bda040febc3602a65525cce307f1 Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <robertgshaw2@gmail.com>
Date: Sun, 8 Jun 2025 13:17:08 +0000
Subject: [PATCH 02/18] remove examples

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
---
 .../examples/rob-benchmarking/2P1D-het.yaml   | 697 ------------------
 quickstart/examples/rob-benchmarking/Justfile |  50 --
 .../examples/rob-benchmarking/Justfile.remote |  36 -
 .../benchmark-interactive-pod.yaml            |  32 -
 4 files changed, 815 deletions(-)
 delete mode 100644 quickstart/examples/rob-benchmarking/2P1D-het.yaml
 delete mode 100644 quickstart/examples/rob-benchmarking/Justfile
 delete mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote
 delete mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml

diff --git a/quickstart/examples/rob-benchmarking/2P1D-het.yaml b/quickstart/examples/rob-benchmarking/2P1D-het.yaml
deleted file mode 100644
index 2dfc7b4..0000000
--- a/quickstart/examples/rob-benchmarking/2P1D-het.yaml
+++ /dev/null
@@ -1,697 +0,0 @@
-# yaml-language-server: $schema=values.schema.json
-
-# Default values for the llm-d chart.
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-# -- Global parameters
-# Global Docker image parameters
-# Please, note that this will override the image parameters, including dependencies, configured to use the global value
-# Current available global Docker image parameters: imageRegistry, imagePullSecrets and storageClass
-# @default -- See below
-global:
-    # -- Global Docker image registry
-    imageRegistry: ""
-
-    # @schema
-    # items:
-    #   type: string
-    # @schema
-    # -- Global Docker registry secret names as an array
-    # </br> E.g. `imagePullSecrets: [myRegistryKeySecretName]`
-    imagePullSecrets: []
-
-    security:
-        allowInsecureImages: true
-
-# @schema
-# additionalProperties: true
-# @schema
-# -- Parameters for bitnami.common dependency
-common: {}
-
-# -- Common parameters
-# -- Override Kubernetes version
-kubeVersion: ""
-
-# -- String to partially override common.names.fullname
-nameOverride: ""
-
-# -- String to fully override common.names.fullname
-fullnameOverride: ""
-
-# -- Default Kubernetes cluster domain
-clusterDomain: cluster.local
-
-# @schema
-# additionalProperties: true
-# @schema
-# -- Labels to add to all deployed objects
-commonLabels: {}
-
-# @schema
-# additionalProperties: true
-# @schema
-# -- Annotations to add to all deployed objects
-commonAnnotations: {}
-
-# @schema
-# items:
-#   type: [string, object]
-# @schema
-# -- Array of extra objects to deploy with the release
-extraDeploy: []
-
-# -- Helm tests
-test:
-    # -- Enable rendering of helm test resources
-    enabled: false
-
-    # @default -- See below
-    image:
-        # -- Test connection pod image registry
-        registry: quay.io
-
-        # -- Test connection pod image repository. Note that the image needs to have both the `sh` and `curl` binaries in it.
-        repository: curl/curl
-
-        # -- Test connection pod image tag. Note that the image needs to have both the `sh` and `curl` binaries in it.
-        tag: latest
-
-        # -- Specify a imagePullPolicy
-        imagePullPolicy: "Always"
-
-        # @schema
-        # items:
-        #   type: string
-        # @schema
-        # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
-        pullSecrets: []
-
-# -- Sample application deploying a p-d pair of specific model
-# @default -- See below
-sampleApplication:
-    baseConfigMapRefName: basic-gpu-with-nixl-preset
-
-    # -- Enable rendering of sample application resources
-    enabled: true
-
-    model:
-        # -- Fully qualified pvc URI: pvc://<pvc-name>/<model-path>
-        modelArtifactURI: hf://RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic
-
-        # -- Name of the model
-        modelName: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
-
-        # -- Aliases to the Model named vllm will serve with
-        servedModelNames: []
-
-        auth:
-            # -- HF token auth config via k8s secret.
-            hfToken:
-                # -- Name of the secret to create to store your huggingface token
-                name: llm-d-hf-token
-                # -- Value of the token. Do not set this but use `envsubst` in conjunction with the helm chart
-                key: HF_TOKEN
-
-    # -- InferencePool port configuration
-    inferencePoolPort: 8000
-
-    prefill:
-        # -- number of desired prefill replicas
-        replicas: 2
-
-        # @schema
-        # items:
-        #   type: string
-        # @schema
-        # -- args to add to the prefill deployment
-        extraArgs:
-            - "--tensor-parallel-size"
-            - "1"
-            - "--disable-log-requests"
-            - "--max-model-len"
-            - "32768"
-            - "--distributed-executor-backend"
-            - "mp"
-            - "--block-size"
-            - "128"
-            - "--max-num-batched-tokens"
-            - "32768"
-
-    decode:
-        # -- number of desired decode replicas
-        replicas: 1
-
-        # @schema
-        # items:
-        #   type: string
-        # @schema
-        # -- args to add to the decode deployment
-        extraArgs:
-            - "--tensor-parallel-size"
-            - "4"
-            - "--disable-log-requests"
-            - "--max-model-len"
-            - "32768"
-            - "--distributed-executor-backend"
-            - "mp"
-            - "--block-size"
-            - "128"
-
-# -- Gateway configuration
-# @default -- See below
-gateway:
-    # -- Deploy resources related to Gateway
-    enabled: true
-
-    # --  String to fully override gateway.fullname
-    fullnameOverride: ""
-
-    # -- String to partially override gateway.fullname
-    nameOverride: ""
-
-    # -- Gateway class that determines the backend used
-    # Currently supported values: "kgateway" or "istio"
-    gatewayClassName: kgateway
-
-    # @schema
-    # additionalProperties: true
-    # @schema
-    # -- Additional annotations provided to the Gateway resource
-    annotations: {}
-
-    # Special parameters applied to kGateway via GatewayParameters resource
-    kGatewayParameters:
-        # @schema
-        # type: [number, boolean]
-        # @schema
-        proxyUID: false
-
-    # @schema
-    # items:
-    #  type: object
-    #  properties:
-    #    name:
-    #      description: Name is the name of the Listener. This name MUST be unique within a Gateway
-    #      type: string
-    #    path:
-    #      description: Path to expose via Ingress
-    #      type: string
-    #    port:
-    #      description: Port is the network port. Multiple listeners may use the same port, subject to the Listener compatibility rules
-    #      type: integer
-    #      minimum: 1
-    #      maximum: 65535
-    #    protocol:
-    #      description: Protocol specifies the network protocol this listener expects to receive
-    #      type: string
-    # @schema
-    # Set of listeners exposed via the Gateway, also propagated to the Ingress if enabled
-    listeners:
-        - name: default
-          path: /
-          port: 80
-          protocol: HTTP
-
-    # -- Gateway's service type. Ingress is only available if the service type is set to NodePort. Accepted values: ["LoadBalancer", "NodePort"]
-    serviceType: NodePort
-
-# -- Ingress configuration
-# @default -- See below
-ingress:
-    # -- Deploy Ingress
-    enabled: true
-
-    # -- Name of the IngressClass cluster resource which defines which controller will implement the resource (e.g nginx)
-    ingressClassName: ""
-
-    # @schema
-    # additionalProperties: true
-    # @schema
-    # -- Additional annotations for the Ingress resource
-    annotations: {}
-
-    # -- Hostname to be used to expose the NodePort service to the inferencing gateway
-    host: ""
-
-    # -- List of additional hostnames to be covered with this ingress record (e.g. a CNAME)
-    # <!-- E.g.
-    # extraHosts:
-    #   - name: llm-d.env.example.com
-    #     path: / (Optional)
-    #     pathType: Prefix (Optional)
-    #     port: 7007 (Optional) -->
-    extraHosts: []
-
-    # -- Path to be used to expose the full route to access the inferencing gateway
-    path: "/"
-
-    # -- Ingress TLS parameters
-    tls:
-        # -- Enable TLS configuration for the host defined at `ingress.host` parameter
-        enabled: false
-
-        # -- The name to which the TLS Secret will be called
-        secretName: ""
-
-    # @schema
-    # items:
-    #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.networking.v1.IngressTLS
-    # @schema
-    # -- The TLS configuration for additional hostnames to be covered with this ingress record.
-    # <br /> Ref: https://kubernetes.io/docs/concepts/services-networking/ingress/#tls
-    # <!-- E.g.
-    # extraTls:
-    #   - hosts:
-    #     - llm-d.env.example.com
-    #     secretName: llm-d-env -->
-    extraTls: []
-
-    # -- used as part of the host dirivation if not specified from OCP cluster domain (dont edit)
-    clusterRouterBase: ""
-
-# -- Model service controller configuration
-# @default -- See below
-modelservice:
-    # -- Toggle to deploy modelservice controller related resources
-    enabled: true
-
-    # -- Enable metrics gathering via podMonitor / ServiceMonitor
-    metrics:
-        # -- Enable metrics scraping from prefill and decode services, see `model
-        enabled: true
-
-        # -- Prometheus ServiceMonitor configuration
-        # <br /> Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md
-        # @default -- See below
-        serviceMonitor:
-            # @schema
-            # additionalProperties: true
-            # @schema
-            # -- Additional annotations provided to the ServiceMonitor
-            annotations: {}
-
-            # @schema
-            # additionalProperties: true
-            # @schema
-            # -- Additional labels provided to the ServiceMonitor
-            labels: {}
-
-            # -- ServiceMonitor endpoint port
-            port: "vllm"
-
-            # -- ServiceMonitor endpoint path
-            path: "/metrics"
-
-            # -- ServiceMonitor endpoint interval at which metrics should be scraped
-            interval: "15s"
-
-            # -- ServiceMonitor namespace selector
-            namespaceSelector:
-                any: false
-
-                # @schema
-                # items:
-                #   type: string
-                # @schema
-                matchNames: []
-
-            # -- ServiceMonitor selector matchLabels
-            # </br> matchLabels must match labels on modelservice Services
-            selector:
-                # @schema
-                # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector
-                # @schema
-                matchLabels: {}
-
-    # --  String to fully override modelservice.fullname
-    fullnameOverride: ""
-
-    # --  String to partially override modelservice.fullname
-    nameOverride: ""
-
-    # -- Number of controller replicas
-    replicas: 1
-
-    # -- Modelservice controller image, please change only if appropriate adjustments to the CRD are being made
-    # @default -- See below
-    image:
-        # -- Model Service controller image registry
-        registry: ghcr.io
-
-        # -- Model Service controller image repository
-        repository: llm-d/llm-d-model-service
-
-        # -- Model Service controller image tag
-        tag: "0.0.10"
-
-        # -- Specify a imagePullPolicy
-        imagePullPolicy: "Always"
-
-        # @schema
-        # items:
-        #   type: string
-        # @schema
-        # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
-        pullSecrets: []
-
-    # -- Endpoint picker configuration
-    # @default -- See below
-    epp:
-        # -- Endpoint picker image used in ModelService CR presets
-        # @default -- See below
-        image:
-            # -- Endpoint picker image registry
-            registry: ghcr.io
-
-            # -- Endpoint picker image repository
-            repository: llm-d/llm-d-inference-scheduler
-
-            # -- Endpoint picker image tag
-            tag: 0.0.2
-
-            # -- Specify a imagePullPolicy
-            imagePullPolicy: "Always"
-
-            # @schema
-            # items:
-            #   type: string
-            # @schema
-            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
-            pullSecrets: []
-
-        # -- Enable metrics gathering via podMonitor / ServiceMonitor
-        metrics:
-            # -- Enable metrics scraping from endpoint picker service
-            enabled: true
-
-            # -- Prometheus ServiceMonitor configuration
-            # <br /> Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md
-            # @default -- See below
-            serviceMonitor:
-                # @schema
-                # additionalProperties: true
-                # @schema
-                # -- Additional annotations provided to the ServiceMonitor
-                annotations: {}
-
-                # @schema
-                # additionalProperties: true
-                # @schema
-                # -- Additional labels provided to the ServiceMonitor
-                labels: {}
-
-                # -- ServiceMonitor endpoint port
-                port: "metrics"
-
-                # -- ServiceMonitor endpoint path
-                path: "/metrics"
-
-                # -- ServiceMonitor endpoint interval at which metrics should be scraped
-                interval: "10s"
-
-                # -- ServiceMonitor namespace selector
-                namespaceSelector:
-                    any: false
-
-                    # @schema
-                    # items:
-                    #   type: string
-                    # @schema
-                    matchNames: []
-
-                # -- ServiceMonitor selector matchLabels
-                # </br> matchLabels must match labels on modelservice Services
-                selector:
-                    # @schema
-                    # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector
-                    # @schema
-                    matchLabels: {}
-
-        # -- Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again.
-        # Ref: https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md#scorers--configuration
-        defaultEnvVars:
-            - name: ENABLE_KVCACHE_AWARE_SCORER
-              value: "false"
-            - name: KVCACHE_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: KVCACHE_INDEXER_REDIS_ADDR
-              value: '{{ if .Values.redis.enabled }}{{ include "redis.master.service.fullurl" . }}{{ end }}'
-            - name: ENABLE_PREFIX_AWARE_SCORER
-              value: "true"
-            - name: PREFIX_AWARE_SCORER_WEIGHT
-              value: "2"
-            - name: ENABLE_LOAD_AWARE_SCORER
-              value: "true"
-            - name: LOAD_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: ENABLE_SESSION_AWARE_SCORER
-              value: "false"
-            - name: SESSION_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: PD_ENABLED
-              value: "true"
-            - name: PD_PROMPT_LEN_THRESHOLD
-              value: "512"
-            - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
-              value: "false"
-            - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
-              value: "true"
-            - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
-              value: "true"
-            - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT
-              value: "2"
-            - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
-              value: "false"
-            - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: DECODE_ENABLE_KVCACHE_AWARE_SCORER
-              value: "false"
-            - name: DECODE_KVCACHE_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: DECODE_ENABLE_LOAD_AWARE_SCORER
-              value: "true"
-            - name: DECODE_LOAD_AWARE_SCORER_WEIGHT
-              value: "1"
-            - name: DECODE_ENABLE_PREFIX_AWARE_SCORER
-              value: "true"
-            - name: DECODE_PREFIX_AWARE_SCORER_WEIGHT
-              value: "2"
-            - name: DECODE_ENABLE_SESSION_AWARE_SCORER
-              value: "false"
-            - name: DECODE_SESSION_AWARE_SCORER_WEIGHT
-              value: "1"
-
-        # @schema
-        # items:
-        #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar
-        # @schema
-        # -- Additional environment variables for endpoint picker
-        defaultEnvVarsOverride: []
-
-    # -- Prefill options
-    # @default -- See below
-    prefill:
-        # @schema
-        # items:
-        #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration
-        # @schema
-        # -- Tolerations configuration to deploy prefill pods to tainted nodes
-        # @default -- See below
-        tolerations:
-            # -- default NVIDIA GPU toleration
-            - key: nvidia.com/gpu
-              operator: Exists
-              effect: NoSchedule
-
-    # -- Decode options
-    # @default -- See below
-    decode:
-        # @schema
-        # items:
-        #   $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration
-        # @schema
-        # -- Tolerations configuration to deploy decode pods to tainted nodes
-        # @default -- See below
-        tolerations:
-            # -- default NVIDIA GPU toleration
-            - key: nvidia.com/gpu
-              operator: Exists
-              effect: NoSchedule
-
-    # -- vLLM container options
-    # @default -- See below
-    vllm:
-        # -- vLLM image used in ModelService CR presets
-        # @default -- See below
-        image:
-            # -- llm-d image registry
-            registry: ghcr.io
-
-            # -- llm-d image repository
-            repository: llm-d/llm-d-dev
-
-            # -- llm-d image tag
-            tag: 0.0.10
-
-            # -- Specify a imagePullPolicy
-            imagePullPolicy: "IfNotPresent"
-
-            # @schema
-            # items:
-            #   type: string
-            # @schema
-            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
-            pullSecrets: []
-
-        # -- Enable metrics gathering via podMonitor / ServiceMonitor
-        metrics:
-            # -- Enable metrics scraping from prefill & decode services
-            enabled: true
-
-    # -- Routing proxy container options
-    # @default -- See below
-    routingProxy:
-        # -- Routing proxy image used in ModelService CR presets
-        image:
-            # -- Routing proxy image registry
-            registry: ghcr.io
-
-            # -- Routing proxy image repository
-            repository: llm-d/llm-d-routing-sidecar
-
-            # -- Routing proxy image tag
-            tag: "0.0.6"
-
-            # -- Specify a imagePullPolicy
-            imagePullPolicy: "IfNotPresent"
-
-            # @schema
-            # items:
-            #   type: string
-            # @schema
-            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
-            pullSecrets: []
-
-    # -- llm-d inference simulator container options
-    # @default -- See below
-    inferenceSimulator:
-        # -- llm-d inference simulator image used in ModelService CR presets
-        # @default -- See below
-        image:
-            # -- llm-d inference simulator image registry
-            registry: ghcr.io
-
-            # -- llm-d inference simulator image repository
-            repository: llm-d/llm-d-inference-sim
-
-            # -- llm-d inference simulator image tag
-            tag: "0.0.4"
-
-            # -- Specify a imagePullPolicy
-            imagePullPolicy: "IfNotPresent"
-
-            # @schema
-            # items:
-            #   type: string
-            # @schema
-            # -- Optionally specify an array of imagePullSecrets (evaluated as templates)
-            pullSecrets: []
-
-    # @schema
-    # additionalProperties: true
-    # @schema
-    # -- Annotations to add to all modelservice resources
-    annotations: {}
-
-    # @schema
-    # additionalProperties: true
-    # @schema
-    # -- Pod annotations for modelservice
-    podAnnotations: {}
-
-    # @schema
-    # additionalProperties: true
-    # @schema
-    # -- Pod labels for modelservice
-    podLabels: {}
-
-    # Model service controller settings
-    service:
-        # -- Toggle to deploy a Service resource for Model service controller
-        enabled: true
-
-        # -- Port number exposed from Model Service controller
-        port: 8443
-
-        # -- Service type
-        type: ClusterIP
-
-    # -- Service Account Configuration
-    # @default -- See below
-    serviceAccount:
-        # -- Enable the creation of a ServiceAccount for Modelservice pods
-        create: true
-
-        # --  String to fully override modelservice.serviceAccountName, defaults to modelservice.fullname
-        fullnameOverride: ""
-
-        # --  String to partially override modelservice.serviceAccountName, defaults to modelservice.fullname
-        nameOverride: ""
-
-        # @schema
-        # additionalProperties: true
-        # @schema
-        # -- Additional custom labels to the service ServiceAccount.
-        labels: {}
-
-        # @schema
-        # additionalProperties: true
-        # @schema
-        # -- Additional custom annotations for the ServiceAccount.
-        annotations: {}
-
-    rbac:
-        # -- Enable the creation of RBAC resources
-        create: true
-
-# @schema
-# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json
-# @schema
-# -- Bitnami/Redis chart configuration
-# @default -- Use sane defaults for minimal Redis deployment
-redis:
-    enabled: false
-    auth:
-        enabled: false
-        existingSecretPasswordKey: ""
-        existingSecret: ""
-    architecture: standalone
-    image:
-        registry: quay.io
-        repository: sclorg/redis-7-c9s
-        tag: c9s
-    master:
-        kind: Deployment
-        resources:
-            limits:
-                memory: "256Mi"
-                cpu: "250m"
-            requests:
-                memory: "128Mi"
-                cpu: "100m"
-        persistence:
-            enabled: true
-            size: "5Gi"
-        pdb:
-            create: false
-        service:
-            ports:
-                redis: 8100
-    networkPolicy:
-        enabled: false
diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
deleted file mode 100644
index 209e44f..0000000
--- a/quickstart/examples/rob-benchmarking/Justfile
+++ /dev/null
@@ -1,50 +0,0 @@
-NAMESPACE := "pete-davidson"
-MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
-
-logs POD:
-    kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1"
-
-get-ips:
-    just get-pods | awk '/^redhatai-llama-4-maverick-17b-128e-instruct-fp8-(decode|prefill)/ {print $6}'
-get-pods:
-    kubectl get pods -n {{NAMESPACE}} -o wide
-
-hf-token:
-  kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n {{NAMESPACE}}
-
-[working-directory: '../quickstart']
-install VALUES:
-    ./llmd-installer.sh \
-        --hf-token $HF_TOKEN \
-        --namespace {{NAMESPACE}} \
-        --storage-class shared-vast --storage-size 300Gi \
-        --values-file $PWD/../project/{{VALUES}}
-
-start VALUES: 
-    just install {{VALUES}} && \
-    just hf-token && \
-    just start-bench
-
-[working-directory: '../quickstart']
-uninstall VALUES:
-    ./llmd-installer.sh \
-        --hf-token $HF_TOKEN \
-        --namespace {{NAMESPACE}} \
-        --storage-class shared-vast  --storage-size 300Gi \
-        --values-file $PWD/../project/{{VALUES}} \
-        --uninstall
-
-gh-token GH_TOKEN:
-    kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}}
-
-# Interactive benchmark commands:
-start-bench:
-    kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml
-
-delete-bench:
-    kubectl delete pod -n {{NAMESPACE}} benchmark-interactive
-
-exec-bench:
-    kubectl cp reset_prefixes.sh {{NAMESPACE}}/benchmark-interactive:/app/reset_prefixes.sh && \
-    kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \
-    kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
deleted file mode 100644
index bbec981..0000000
--- a/quickstart/examples/rob-benchmarking/Justfile.remote
+++ /dev/null
@@ -1,36 +0,0 @@
-# Use this Justfile within the cluster.
-
-# MODEL := "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-FP8"
-MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-BASE_URL := "http://llm-d-inference-gateway"
-
-eval:
-    lm_eval --model local-completions --tasks gsm8k \
-    --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=50,max_retries=3,tokenized_requests=False \
-    --limit 100
-
-benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
-    python vllm/benchmarks/benchmark_serving.py \
-        --base-url {{BASE_URL}} \
-        --model {{MODEL}} \
-        --dataset-name random \
-        --random-input-len {{INPUT_LEN}} \
-        --random-output-len {{OUTPUT_LEN}}  \
-        --request-rate {{RR}} \
-        --seed $(date +%M%H%M%S) \
-        --num-prompts {{NUM_REQUESTS}} \
-        --ignore-eos
-
-# just benchmark 4 1000 15000 5000 <-- current 1P3D setup
-#
-benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
-    python vllm/benchmarks/benchmark_serving.py \
-        --base-url http://{{POD_IP}}:8000 \
-        --model {{MODEL}} \
-        --dataset-name random \
-        --random-input-len {{INPUT_LEN}} \
-        --random-output-len {{OUTPUT_LEN}}  \
-        --request-rate {{RR}} \
-        --seed $(date +%M%H%M%S) \
-        --num-prompts {{NUM_REQUESTS}} \
-        --ignore-eos
diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
deleted file mode 100644
index bcb6434..0000000
--- a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-# benchmark-client-interactive-pod.yaml
-apiVersion: v1
-kind: Pod
-metadata:
-    name: benchmark-interactive
-    labels:
-        app: benchmark-interactive # Labels for organization
-spec:
-    containers:
-        - name: benchmark-runner
-          image: "quay.io/tms/pd-disagg-benchmark:0.0.6"
-          imagePullPolicy: Always
-          stdin: true
-          tty: true
-          resources:
-              requests:
-                  cpu: "16"
-                  memory: "64Gi"
-              limits:
-                  cpu: "16"
-                  memory: "64Gi"
-          env:
-              - name: PROXY_HOST
-                value: "custom-llm-proxy-service"
-              - name: PROXY_PORT
-                value: "80"
-              - name: HF_TOKEN
-                valueFrom:
-                    secretKeyRef:
-                        name: hf-token-secret # set up with just hf_token
-                        key: HF_TOKEN
-    restartPolicy: Never

From ff8ae72771267ab1e40022a3d462c18e1265884a Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <robertgshaw2@gmail.com>
Date: Sun, 8 Jun 2025 13:17:38 +0000
Subject: [PATCH 03/18] fix typo

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
---
 charts/llm-d/templates/sample-application/modelservice.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml
index efa35d6..913a086 100644
--- a/charts/llm-d/templates/sample-application/modelservice.yaml
+++ b/charts/llm-d/templates/sample-application/modelservice.yaml
@@ -49,7 +49,7 @@ spec:
       {{- range .Values.sampleApplication.prefill.extraArgs }}
       - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }}
       {{- end }}
-      resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }}
+      resources: {{ .Values.sampleApplication.prefill.resources | toYaml | nindent 8 }}
       env:
       {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }}
       - name: HF_TOKEN

From e117b30deb708d67ea705115ab1a86ec4ff89716 Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <robertgshaw2@gmail.com>
Date: Sun, 8 Jun 2025 13:18:15 +0000
Subject: [PATCH 04/18] revert `head -1` decode pod selection in test-request.sh

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
---
 quickstart/test-request.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quickstart/test-request.sh b/quickstart/test-request.sh
index 26f0afc..5635240 100755
--- a/quickstart/test-request.sh
+++ b/quickstart/test-request.sh
@@ -89,7 +89,7 @@ validation() {
   # Discover the decode pod IP
   POD_IP=$(kubectl get pods -n "$NAMESPACE" \
     -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.podIP}{"\n"}{end}' \
-    | grep decode | awk '{print $2}' | head -1)
+    | grep decode | awk '{print $2}')
 
   if [[ -z "$POD_IP" ]]; then
       echo "Error: no decode pod found in namespace $NAMESPACE"

From 282ee2ad6a1e79d21e1ab72bd1b9a5d56716fada Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <robertgshaw2@gmail.com>
Date: Sun, 8 Jun 2025 13:58:55 +0000
Subject: [PATCH 05/18] updated schema

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
---
 charts/llm-d/values.schema.json | 169 +++++++++++++++++++++-----------
 1 file changed, 114 insertions(+), 55 deletions(-)

diff --git a/charts/llm-d/values.schema.json b/charts/llm-d/values.schema.json
index a1910e9..b405e0a 100644
--- a/charts/llm-d/values.schema.json
+++ b/charts/llm-d/values.schema.json
@@ -10471,6 +10471,65 @@
                             "description": "number of desired decode replicas",
                             "required": [],
                             "title": "replicas"
+                        },
+                        "resources": {
+                            "description": "ResourceRequirements describes the compute resource requirements.",
+                            "properties": {
+                                "claims": {
+                                    "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
+                                    "items": {
+                                        "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
+                                        "properties": {
+                                            "name": {
+                                                "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
+                                                "type": "string"
+                                            },
+                                            "request": {
+                                                "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
+                                                "type": "string"
+                                            }
+                                        },
+                                        "required": [
+                                            "name"
+                                        ],
+                                        "type": "object"
+                                    },
+                                    "type": "array",
+                                    "x-kubernetes-list-map-keys": [
+                                        "name"
+                                    ],
+                                    "x-kubernetes-list-type": "map"
+                                },
+                                "limits": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
+                                    },
+                                    "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                },
+                                "requests": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
+                                    },
+                                    "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                }
+                            },
+                            "type": "object"
                         }
                     },
                     "required": [],
@@ -10688,69 +10747,69 @@
                             "description": "number of desired prefill replicas",
                             "required": [],
                             "title": "replicas"
-                        }
-                    },
-                    "required": [],
-                    "title": "prefill",
-                    "type": "object"
-                },
-                "resources": {
-                    "description": "ResourceRequirements describes the compute resource requirements.",
-                    "properties": {
-                        "claims": {
-                            "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
-                            "items": {
-                                "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
-                                "properties": {
-                                    "name": {
-                                        "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
-                                        "type": "string"
+                        },
+                        "resources": {
+                            "description": "ResourceRequirements describes the compute resource requirements.",
+                            "properties": {
+                                "claims": {
+                                    "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.",
+                                    "items": {
+                                        "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.",
+                                        "properties": {
+                                            "name": {
+                                                "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.",
+                                                "type": "string"
+                                            },
+                                            "request": {
+                                                "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
+                                                "type": "string"
+                                            }
+                                        },
+                                        "required": [
+                                            "name"
+                                        ],
+                                        "type": "object"
                                     },
-                                    "request": {
-                                        "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.",
-                                        "type": "string"
-                                    }
+                                    "type": "array",
+                                    "x-kubernetes-list-map-keys": [
+                                        "name"
+                                    ],
+                                    "x-kubernetes-list-type": "map"
                                 },
-                                "required": [
-                                    "name"
-                                ],
-                                "type": "object"
-                            },
-                            "type": "array",
-                            "x-kubernetes-list-map-keys": [
-                                "name"
-                            ],
-                            "x-kubernetes-list-type": "map"
-                        },
-                        "limits": {
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "string"
+                                "limits": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
                                     },
-                                    {
-                                        "type": "number"
-                                    }
-                                ]
-                            },
-                            "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
-                            "type": "object"
-                        },
-                        "requests": {
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "string"
+                                    "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                },
+                                "requests": {
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "number"
+                                            }
+                                        ]
                                     },
-                                    {
-                                        "type": "number"
-                                    }
-                                ]
+                                    "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
+                                    "type": "object"
+                                }
                             },
-                            "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/",
                             "type": "object"
                         }
                     },
+                    "required": [],
+                    "title": "prefill",
                     "type": "object"
                 }
             },

From fba496a884b201ffae38fffcb964dc5fb4d26e5a Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <robertgshaw2@gmail.com>
Date: Tue, 1 Jul 2025 13:02:48 +0000
Subject: [PATCH 06/18] Add rob-benchmarking example and tune NIXL preset env and shm size

---
 .../presets/basic-gpu-with-nixl-preset.yaml   | 16 ++--
 .../examples/rob-benchmarking/4p1d-het.yaml   | 84 +++++++++++++++++++
 quickstart/examples/rob-benchmarking/Justfile | 40 +++++++++
 .../examples/rob-benchmarking/Justfile.remote | 44 ++++++++++
 .../benchmark-interactive-pod.yaml            | 32 +++++++
 5 files changed, 210 insertions(+), 6 deletions(-)
 create mode 100644 quickstart/examples/rob-benchmarking/4p1d-het.yaml
 create mode 100644 quickstart/examples/rob-benchmarking/Justfile
 create mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote
 create mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index e84b680..bb312aa 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -84,6 +84,10 @@ data:
                 - name: VLLM_LOGGING_LEVEL
                   value: {{ .Values.modelservice.vllm.logLevel }}
                 {{- end }}
+                - name: USE_BATCHED
+                  value: "0"
+                - name: VLLM_IS_PREFILL
+                  value: "1"
                 - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                   value: "5557"
                 - name: VLLM_NIXL_SIDE_CHANNEL_HOST
@@ -97,8 +101,6 @@ data:
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
                   value: ${POD_IP}:8200
-                - name: UCX_TLS
-                  value: "^cuda_ipc"
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -129,7 +131,7 @@ data:
             - name: dshm
               emptyDir:
                 medium: Memory
-                sizeLimit: 1Gi
+                sizeLimit: 16Gi
             {{ `{{- if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -185,6 +187,10 @@ data:
                 - name: VLLM_LOGGING_LEVEL
                   value: {{ .Values.modelservice.vllm.logLevel }}
                 {{- end }}
+                - name: USE_BATCHED
+                  value: "0"
+                - name: VLLM_IS_PREFILL
+                  value: "1"
                 - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                   value: "5557"
                 - name: VLLM_NIXL_SIDE_CHANNEL_HOST
@@ -198,8 +204,6 @@ data:
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
                   value: ${POD_IP}:8200
-                - name: UCX_TLS
-                  value: "^cuda_ipc"
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -230,7 +234,7 @@ data:
             - name: dshm
               emptyDir:
                 medium: Memory
-                sizeLimit: 1Gi
+                sizeLimit: 16Gi
             {{ `{{ if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
diff --git a/quickstart/examples/rob-benchmarking/4p1d-het.yaml b/quickstart/examples/rob-benchmarking/4p1d-het.yaml
new file mode 100644
index 0000000..7801080
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/4p1d-het.yaml
@@ -0,0 +1,84 @@
+sampleApplication:
+    baseConfigMapRefName: basic-gpu-with-nixl-preset
+    model:
+        modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8
+        modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+        auth:
+            hfToken:
+                name: llm-d-hf-token
+                key: HF_TOKEN
+    prefill:
+        replicas: 1
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+          requests:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+            cpu: "32"
+            memory: 128Gi
+        extraArgs:
+            - "--tensor-parallel-size"
+            - "1"
+            - "--disable-log-requests"
+            - "--max-model-len"
+            - "32768"
+            - "--block-size"
+            - "128"
+            - "--enforce-eager"
+            # - "--num-gpu-blocks-override"
+            # - "60000"
+
+    decode:
+        replicas: 1
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+          requests:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+            cpu: "32"
+            memory: 128Gi
+        extraArgs:
+            - "--tensor-parallel-size"
+            - "1"
+            - "--disable-log-requests"
+            - "--max-model-len"
+            - "32768"
+            - "--block-size"
+            - "128"
+            - "--enforce-eager"
+            # - "--num-gpu-blocks-override"
+            # - "60000"
+modelservice:
+  vllm:
+    image:
+      registry: docker.io    
+      repository: robertgouldshaw2/vllm-nixl
+      tag: launch-debug-0.10
+  epp:
+    defaultEnvVarsOverride:
+      - name: ENABLE_KVCACHE_AWARE_SCORER
+        value: "false"
+      - name: ENABLE_PREFIX_AWARE_SCORER
+        value: "true"
+      - name: ENABLE_LOAD_AWARE_SCORER
+        value: "true"
+      - name: ENABLE_SESSION_AWARE_SCORER
+        value: "false"
+      - name: PD_ENABLED
+        value: "true"
+      - name: PD_PROMPT_LEN_THRESHOLD
+        value: "10"
+      - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
+        value: "false"
+      - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
+        value: "true"
+      - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
+        value: "true"
+      - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
+        value: "false"
+redis:
+  enabled: false
\ No newline at end of file
diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
new file mode 100644
index 0000000..3c794f6
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -0,0 +1,40 @@
+NAMESPACE := "pete-davidson"
+
+logs POD:
+    kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1"
+
+get-ips:
+    just get-pods | awk '/^qwen-qwen3-30b-a3b-(decode|prefill)/ {print $6}'
+get-pods:
+    kubectl get pods -n {{NAMESPACE}} -o wide
+
+hf-token:
+  kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}}
+
+[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
+install VALUES:
+    ./llmd-installer.sh \
+        --namespace {{NAMESPACE}} \
+        --storage-class shared-vast --storage-size 300Gi \
+        --values-file ./examples/rob-benchmarking/{{VALUES}} --skip-infra
+
+[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
+uninstall:
+    ./llmd-installer.sh \
+        --namespace {{NAMESPACE}} \
+        --uninstall --skip-infra
+
+gh-token GH_TOKEN:
+    kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}}
+
+# Interactive benchmark commands:
+start-bench:
+    kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}}
+    kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml
+
+delete-bench:
+    kubectl delete pod -n {{NAMESPACE}} benchmark-interactive
+
+exec-bench:
+    kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \
+    kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
new file mode 100644
index 0000000..ebdad42
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/Justfile.remote
@@ -0,0 +1,44 @@
+# Use this Justfile within the cluster.
+
+MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+BASE_URL := "http://llm-d-inference-gateway"
+
+eval:
+    lm_eval --model local-completions --tasks gsm8k \
+    --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
+    --limit 1000
+
+benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+    python vllm/benchmarks/benchmark_serving.py \
+        --base-url {{BASE_URL}} \
+        --model {{MODEL}} \
+        --dataset-name random \
+        --random-input-len {{INPUT_LEN}} \
+        --random-output-len {{OUTPUT_LEN}}  \
+        --request-rate {{RR}} \
+        --seed $(date +%s) \
+        --num-prompts {{NUM_REQUESTS}} \
+        --ignore-eos
+
+# just benchmark 4 1000 15000 5000 <-- current 1P3D setup
+benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+    python vllm/benchmarks/benchmark_serving.py \
+        --base-url http://{{POD_IP}}:8000 \
+        --model {{MODEL}} \
+        --dataset-name random \
+        --random-input-len {{INPUT_LEN}} \
+        --random-output-len {{OUTPUT_LEN}}  \
+        --request-rate {{RR}} \
+        --seed $(date +%s) \
+        --num-prompts {{NUM_REQUESTS}} \
+        --ignore-eos
+
+send_request:
+  curl -X POST {{BASE_URL}}/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{ \
+      "model": "{{MODEL}}", \
+      "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
+      "max_tokens": 150, \
+      "temperature": 0.7 \
+    }'
diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
new file mode 100644
index 0000000..bcb6434
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
@@ -0,0 +1,32 @@
+# benchmark-interactive-pod.yaml: interactive benchmark client Pod, entered with "just exec-bench".
+apiVersion: v1
+kind: Pod
+metadata:
+    name: benchmark-interactive
+    labels:
+        app: benchmark-interactive # Labels for organization
+spec:
+    containers:
+        - name: benchmark-runner
+          image: "quay.io/tms/pd-disagg-benchmark:0.0.6"
+          imagePullPolicy: Always
+          stdin: true
+          tty: true
+          resources:
+              requests:
+                  cpu: "16"
+                  memory: "64Gi"
+              limits:
+                  cpu: "16"
+                  memory: "64Gi"
+          env:
+              - name: PROXY_HOST
+                value: "custom-llm-proxy-service"
+              - name: PROXY_PORT
+                value: "80"
+              - name: HF_TOKEN
+                valueFrom:
+                    secretKeyRef:
+                        name: hf-token-secret # created by "just hf-token" (or "just start-bench")
+                        key: HF_TOKEN
+    restartPolicy: Never

From 8b7549a4eb5be69e3a861f335267e1e9ac17bd75 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 3 Jul 2025 16:21:11 -0400
Subject: [PATCH 07/18] updated

Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
---
 .../presets/basic-gpu-preset.yaml             |  4 +++
 quickstart/examples/rob-benchmarking/Justfile | 20 +++++++++---
 .../examples/rob-benchmarking/Justfile.remote | 32 ++++++++++++++++++-
 .../benchmark-interactive-pod.yaml            |  2 +-
 4 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml
index 1a3480b..b2c5fd1 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml
@@ -75,6 +75,8 @@ data:
                 - "--port"
                 - "8001"
               env:
+                - name: VLLM_USE_V1
+                  value: "1"
                 - name: HOME
                   value: /home
                 {{- if .Values.modelservice.vllm.logLevel }}
@@ -154,6 +156,8 @@ data:
                 - "--port"
                 - "8000"
               env:
+                - name: VLLM_USE_V1
+                  value: "1"
                 - name: HOME
                   value: /home
                 {{ if .Values.modelservice.vllm.logLevel }}
diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
index 3c794f6..2135ba6 100644
--- a/quickstart/examples/rob-benchmarking/Justfile
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -1,24 +1,23 @@
+HF_TOKEN := "my_token"
 NAMESPACE := "pete-davidson"
 
 logs POD:
-    kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1"
+    kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1"
 
-get-ips:
-    just get-pods | awk '/^qwen-qwen3-30b-a3b-(decode|prefill)/ {print $6}'
 get-pods:
     kubectl get pods -n {{NAMESPACE}} -o wide
 
 hf-token:
   kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}}
 
-[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
+[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart']
 install VALUES:
     ./llmd-installer.sh \
         --namespace {{NAMESPACE}} \
         --storage-class shared-vast --storage-size 300Gi \
         --values-file ./examples/rob-benchmarking/{{VALUES}} --skip-infra
 
-[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
+[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart']
 uninstall:
     ./llmd-installer.sh \
         --namespace {{NAMESPACE}} \
@@ -32,9 +31,20 @@ start-bench:
     kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}}
     kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml
 
+start-bench-2:
+    kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod-2.yaml
+
 delete-bench:
     kubectl delete pod -n {{NAMESPACE}} benchmark-interactive
 
 exec-bench:
+    kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep.sh && \
+    kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep-sharegpt.sh && \
     kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \
     kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash
+
+exec-bench-2:
+    kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep.sh && \
+    kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep-sharegpt.sh && \
+    kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive-2:/app/Justfile && \
+    kubectl exec -it -n {{NAMESPACE}} benchmark-interactive-2 -- /bin/bash
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
index ebdad42..952cbf5 100644
--- a/quickstart/examples/rob-benchmarking/Justfile.remote
+++ b/quickstart/examples/rob-benchmarking/Justfile.remote
@@ -1,8 +1,23 @@
 # Use this Justfile within the cluster.
 
-MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+MODEL := "meta-llama/Llama-3.1-8B-Instruct"
+# MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
 BASE_URL := "http://llm-d-inference-gateway"
 
+pull:
+  cd vllm && git pull
+
+download_sharegpt:
+  apt-get update && apt-get install -y wget && wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+sweep POD_IP:
+  uv pip install pybase64 && cd vllm && git pull && cd .. && \
+  MODEL='{{MODEL}}' POD_IP='{{POD_IP}}' bash ./sweep.sh
+
+sweep_sharegpt POD_IP:
+  uv pip install pybase64 && cd vllm && git pull && cd .. && \
+  MODEL='{{MODEL}}' POD_IP='{{POD_IP}}' bash ./sweep-sharegpt.sh
+
 eval:
     lm_eval --model local-completions --tasks gsm8k \
     --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
@@ -20,12 +35,27 @@ benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
         --num-prompts {{NUM_REQUESTS}} \
         --ignore-eos
 
+benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+  python vllm/benchmarks/benchmark_serving.py \
+        --base-url http://{{POD_IP}}:8000 \
+        --model {{MODEL}} \
+        --dataset-name random \
+        --random-input-len {{INPUT_LEN}} \
+        --random-output-len {{OUTPUT_LEN}} \
+        --max-concurrency {{CONCURRENCY}} \
+        --num-prompts {{NUM_REQUESTS}} \
+        --seed $(date +%s) \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --metric-percentiles 90,95,99 \
+        --ignore-eos
+
 # just benchmark 4 1000 15000 5000 <-- current 1P3D setup
 benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
     python vllm/benchmarks/benchmark_serving.py \
         --base-url http://{{POD_IP}}:8000 \
         --model {{MODEL}} \
         --dataset-name random \
+        --random-prefix-len 600 \
         --random-input-len {{INPUT_LEN}} \
         --random-output-len {{OUTPUT_LEN}}  \
         --request-rate {{RR}} \
diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
index bcb6434..0a0eeb2 100644
--- a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
+++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml
@@ -4,7 +4,7 @@ kind: Pod
 metadata:
     name: benchmark-interactive
     labels:
-        app: benchmark-interactive # Labels for organization
+        app: benchmark-interactive
 spec:
     containers:
         - name: benchmark-runner

From 1c2226d34565bb27a3acecb904efc0ccc8c31314 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 3 Jul 2025 20:46:57 +0000
Subject: [PATCH 08/18] updated

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 quickstart/examples/rob-benchmarking/Justfile | 20 +++++--------------
 .../{4p1d-het.yaml => tp-8.yaml}              | 20 +++++++++----------
 2 files changed, 15 insertions(+), 25 deletions(-)
 rename quickstart/examples/rob-benchmarking/{4p1d-het.yaml => tp-8.yaml} (86%)

diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
index 2135ba6..14a056e 100644
--- a/quickstart/examples/rob-benchmarking/Justfile
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -1,4 +1,3 @@
-HF_TOKEN := "my_token"
 NAMESPACE := "pete-davidson"
 
 logs POD:
@@ -8,32 +7,29 @@ get-pods:
     kubectl get pods -n {{NAMESPACE}} -o wide
 
 hf-token:
-  kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}}
+  kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n {{NAMESPACE}}
 
-[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart']
+[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
 install VALUES:
     ./llmd-installer.sh \
         --namespace {{NAMESPACE}} \
         --storage-class shared-vast --storage-size 300Gi \
         --values-file ./examples/rob-benchmarking/{{VALUES}} --skip-infra
 
-[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart']
+[working-directory: '/home/rshaw/llm-d-deployer/quickstart']
 uninstall:
     ./llmd-installer.sh \
         --namespace {{NAMESPACE}} \
-        --uninstall --skip-infra
+        --uninstall
 
 gh-token GH_TOKEN:
     kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}}
 
 # Interactive benchmark commands:
 start-bench:
-    kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}}
+    kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n {{NAMESPACE}}
     kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml
 
-start-bench-2:
-    kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod-2.yaml
-
 delete-bench:
     kubectl delete pod -n {{NAMESPACE}} benchmark-interactive
 
@@ -42,9 +38,3 @@ exec-bench:
     kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep-sharegpt.sh && \
     kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \
     kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash
-
-exec-bench-2:
-    kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep.sh && \
-    kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep-sharegpt.sh && \
-    kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive-2:/app/Justfile && \
-    kubectl exec -it -n {{NAMESPACE}} benchmark-interactive-2 -- /bin/bash
diff --git a/quickstart/examples/rob-benchmarking/4p1d-het.yaml b/quickstart/examples/rob-benchmarking/tp-8.yaml
similarity index 86%
rename from quickstart/examples/rob-benchmarking/4p1d-het.yaml
rename to quickstart/examples/rob-benchmarking/tp-8.yaml
index 7801080..d8be566 100644
--- a/quickstart/examples/rob-benchmarking/4p1d-het.yaml
+++ b/quickstart/examples/rob-benchmarking/tp-8.yaml
@@ -11,33 +11,33 @@ sampleApplication:
         replicas: 1
         resources:
           limits:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu: 8
             rdma/ib: 1
           requests:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu: 8
             rdma/ib: 1
             cpu: "32"
             memory: 128Gi
         extraArgs:
             - "--tensor-parallel-size"
-            - "1"
+            - "8"
             - "--disable-log-requests"
             - "--max-model-len"
             - "32768"
             - "--block-size"
             - "128"
             - "--enforce-eager"
-            # - "--num-gpu-blocks-override"
-            # - "60000"
+            - "--num-gpu-blocks-override"
+            - "60000"
 
     decode:
         replicas: 1
         resources:
           limits:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu: 8
             rdma/ib: 1
           requests:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu: 8
             rdma/ib: 1
             cpu: "32"
             memory: 128Gi
@@ -50,14 +50,14 @@ sampleApplication:
             - "--block-size"
             - "128"
             - "--enforce-eager"
-            # - "--num-gpu-blocks-override"
-            # - "60000"
+            - "--num-gpu-blocks-override"
+            - "60000"
 modelservice:
   vllm:
     image:
       registry: docker.io    
       repository: robertgouldshaw2/vllm-nixl
-      tag: launch-debug-0.10
+      tag: nixl-oh-debug-0.1
   epp:
     defaultEnvVarsOverride:
       - name: ENABLE_KVCACHE_AWARE_SCORER

From c43e5900d6b2e1bc8b53a4ec8f99e8cbf6d4d103 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Mon, 7 Jul 2025 00:48:20 +0000
Subject: [PATCH 09/18] Add tp-1.yaml and benchmark_one recipe; remove sweep recipes

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 quickstart/examples/rob-benchmarking/Justfile |  5 +-
 .../examples/rob-benchmarking/Justfile.remote | 37 +++------
 .../examples/rob-benchmarking/tp-1.yaml       | 79 +++++++++++++++++++
 .../examples/rob-benchmarking/tp-8.yaml       |  9 +--
 4 files changed, 93 insertions(+), 37 deletions(-)
 create mode 100644 quickstart/examples/rob-benchmarking/tp-1.yaml

diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
index 14a056e..e69ae46 100644
--- a/quickstart/examples/rob-benchmarking/Justfile
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -20,21 +20,18 @@ install VALUES:
 uninstall:
     ./llmd-installer.sh \
         --namespace {{NAMESPACE}} \
-        --uninstall
+        --uninstall --skip-infra
 
 gh-token GH_TOKEN:
     kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}}
 
 # Interactive benchmark commands:
 start-bench:
-    kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n {{NAMESPACE}}
     kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml
 
 delete-bench:
     kubectl delete pod -n {{NAMESPACE}} benchmark-interactive
 
 exec-bench:
-    kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep.sh && \
-    kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep-sharegpt.sh && \
     kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \
     kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
index 952cbf5..97911e0 100644
--- a/quickstart/examples/rob-benchmarking/Justfile.remote
+++ b/quickstart/examples/rob-benchmarking/Justfile.remote
@@ -1,28 +1,26 @@
 # Use this Justfile within the cluster.
 
-MODEL := "meta-llama/Llama-3.1-8B-Instruct"
-# MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
 BASE_URL := "http://llm-d-inference-gateway"
 
 pull:
   cd vllm && git pull
 
-download_sharegpt:
-  apt update && apt install wget && wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-sweep POD_IP:
-  uv pip install pybase64 && cd vllm && git pull && cd .. && \
-  MODEL={{MODEL}} POD_IP={{POD_IP}} bash ./sweep.sh   
-
-sweep_sharegpt POD_IP:
-  uv pip install pybase64 && cd vllm && git pull && cd .. && \
-  MODEL={{MODEL}} POD_IP={{POD_IP}} bash ./sweep-sharegpt.sh   
-
 eval:
     lm_eval --model local-completions --tasks gsm8k \
     --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
     --limit 1000
 
+benchmark_one INPUT_LEN:
+  cd vllm && git fetch && git checkout 3c6fd286b40ada67bba98216ed410bb3a0d38b16 && uv pip install pybase64 && \
+    python benchmarks/benchmark_one_concurrent.py \
+      --base-url {{BASE_URL}} \
+      --model {{MODEL}} \
+      --input-len {{INPUT_LEN}} \
+      --output-len 1 \
+      --num-requests 10 \
+      --seed $(date +%s)
+
 benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
     python vllm/benchmarks/benchmark_serving.py \
         --base-url {{BASE_URL}} \
@@ -49,19 +47,6 @@ benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN
         --metric-percentiles 90,95,99 \
         --ignore-eos
 
-# just benchmark 4 1000 15000 5000 <-- current 1P3D setup
-benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
-    python vllm/benchmarks/benchmark_serving.py \
-        --base-url http://{{POD_IP}}:8000 \
-        --model {{MODEL}} \
-        --dataset-name random \
-        --random-prefix-len 600 \
-        --random-input-len {{INPUT_LEN}} \
-        --random-output-len {{OUTPUT_LEN}}  \
-        --request-rate {{RR}} \
-        --seed $(date +%M%H%M%S) \
-        --num-prompts {{NUM_REQUESTS}} \
-        --ignore-eos
 
 send_request:
   curl -X POST {{BASE_URL}}/v1/completions \
diff --git a/quickstart/examples/rob-benchmarking/tp-1.yaml b/quickstart/examples/rob-benchmarking/tp-1.yaml
new file mode 100644
index 0000000..9bda5ee
--- /dev/null
+++ b/quickstart/examples/rob-benchmarking/tp-1.yaml
@@ -0,0 +1,79 @@
+sampleApplication:
+    baseConfigMapRefName: basic-gpu-with-nixl-preset
+    model:
+        modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8
+        modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+        auth:
+            hfToken:
+                name: llm-d-hf-token
+                key: HF_TOKEN
+    prefill:
+        replicas: 1
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+          requests:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+            cpu: "32"
+            memory: 128Gi
+        extraArgs:
+            - "--tensor-parallel-size"
+            - "1"
+            - "--disable-log-requests"
+            - "--max-model-len"
+            - "32768"
+            - "--block-size"
+            - "128"
+            - "--enforce-eager"
+    decode:
+        replicas: 1
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+          requests:
+            nvidia.com/gpu: 1
+            rdma/ib: 1
+            cpu: "32"
+            memory: 128Gi
+        extraArgs:
+            - "--tensor-parallel-size"
+            - "1"
+            - "--disable-log-requests"
+            - "--max-model-len"
+            - "32768"
+            - "--block-size"
+            - "128"
+            - "--enforce-eager"
+modelservice:
+  vllm:
+    image:
+      registry: docker.io    
+      repository: robertgouldshaw2/vllm-nixl
+      tag: nixl-oh-debug-0.3
+  epp:
+    defaultEnvVarsOverride:
+      - name: ENABLE_KVCACHE_AWARE_SCORER
+        value: "false"
+      - name: ENABLE_PREFIX_AWARE_SCORER
+        value: "true"
+      - name: ENABLE_LOAD_AWARE_SCORER
+        value: "true"
+      - name: ENABLE_SESSION_AWARE_SCORER
+        value: "false"
+      - name: PD_ENABLED
+        value: "true"
+      - name: PD_PROMPT_LEN_THRESHOLD
+        value: "10"
+      - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
+        value: "false"
+      - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
+        value: "true"
+      - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
+        value: "true"
+      - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
+        value: "false"
+redis:
+  enabled: false
\ No newline at end of file
diff --git a/quickstart/examples/rob-benchmarking/tp-8.yaml b/quickstart/examples/rob-benchmarking/tp-8.yaml
index d8be566..21ed6c8 100644
--- a/quickstart/examples/rob-benchmarking/tp-8.yaml
+++ b/quickstart/examples/rob-benchmarking/tp-8.yaml
@@ -27,9 +27,6 @@ sampleApplication:
             - "--block-size"
             - "128"
             - "--enforce-eager"
-            - "--num-gpu-blocks-override"
-            - "60000"
-
     decode:
         replicas: 1
         resources:
@@ -43,21 +40,19 @@ sampleApplication:
             memory: 128Gi
         extraArgs:
             - "--tensor-parallel-size"
-            - "1"
+            - "8"
             - "--disable-log-requests"
             - "--max-model-len"
             - "32768"
             - "--block-size"
             - "128"
             - "--enforce-eager"
-            - "--num-gpu-blocks-override"
-            - "60000"
 modelservice:
   vllm:
     image:
       registry: docker.io    
       repository: robertgouldshaw2/vllm-nixl
-      tag: nixl-oh-debug-0.1
+      tag: nixl-oh-debug-0.3
   epp:
     defaultEnvVarsOverride:
       - name: ENABLE_KVCACHE_AWARE_SCORER

From 82d1951c4f1e7609b6700de662ca05ba4d3b8147 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Mon, 7 Jul 2025 01:36:35 +0000
Subject: [PATCH 10/18] Bump tp-8.yaml image tag to nixl-oh-debug-fixed-0.1

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 quickstart/examples/rob-benchmarking/tp-8.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quickstart/examples/rob-benchmarking/tp-8.yaml b/quickstart/examples/rob-benchmarking/tp-8.yaml
index 21ed6c8..5699d43 100644
--- a/quickstart/examples/rob-benchmarking/tp-8.yaml
+++ b/quickstart/examples/rob-benchmarking/tp-8.yaml
@@ -52,7 +52,7 @@ modelservice:
     image:
       registry: docker.io    
       repository: robertgouldshaw2/vllm-nixl
-      tag: nixl-oh-debug-0.3
+      tag: nixl-oh-debug-fixed-0.1
   epp:
     defaultEnvVarsOverride:
       - name: ENABLE_KVCACHE_AWARE_SCORER

From b90e3896fafa670b9b5039c48c94a7031a29809d Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Mon, 7 Jul 2025 01:38:40 +0000
Subject: [PATCH 11/18] Remove USE_BATCHED (and one VLLM_IS_PREFILL) env var from NIXL preset

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 .../modelservice/presets/basic-gpu-with-nixl-preset.yaml    | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index bb312aa..eebedd8 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -84,10 +84,6 @@ data:
                 - name: VLLM_LOGGING_LEVEL
                   value: {{ .Values.modelservice.vllm.logLevel }}
                 {{- end }}
-                - name: USE_BATCHED
-                  value: "0"
-                - name: VLLM_IS_PREFILL
-                  value: "1"
                 - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                   value: "5557"
                 - name: VLLM_NIXL_SIDE_CHANNEL_HOST
@@ -187,8 +183,6 @@ data:
                 - name: VLLM_LOGGING_LEVEL
                   value: {{ .Values.modelservice.vllm.logLevel }}
                 {{- end }}
-                - name: USE_BATCHED
-                  value: "0"
                 - name: VLLM_IS_PREFILL
                   value: "1"
                 - name: VLLM_NIXL_SIDE_CHANNEL_PORT

From 7083f624c81d613aa0c927199b610aa0dedfc450 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 10 Jul 2025 02:36:37 +0000
Subject: [PATCH 12/18] Rename tp-8.yaml to 4p-1d-llama-70b.yaml; benchmark Llama 3.3 70B

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 .../{tp-8.yaml => 4p-1d-llama-70b.yaml}       | 20 +++++++---------
 quickstart/examples/rob-benchmarking/Justfile |  7 ++++--
 .../examples/rob-benchmarking/Justfile.remote | 24 ++++++++++++++-----
 3 files changed, 32 insertions(+), 19 deletions(-)
 rename quickstart/examples/rob-benchmarking/{tp-8.yaml => 4p-1d-llama-70b.yaml} (82%)

diff --git a/quickstart/examples/rob-benchmarking/tp-8.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
similarity index 82%
rename from quickstart/examples/rob-benchmarking/tp-8.yaml
rename to quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
index 5699d43..19bd780 100644
--- a/quickstart/examples/rob-benchmarking/tp-8.yaml
+++ b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
@@ -1,52 +1,50 @@
 sampleApplication:
     baseConfigMapRefName: basic-gpu-with-nixl-preset
     model:
-        modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8
-        modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+        modelArtifactURI: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
+        modelName: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
         auth:
             hfToken:
                 name: llm-d-hf-token
                 key: HF_TOKEN
     prefill:
-        replicas: 1
+        replicas: 4
         resources:
           limits:
-            nvidia.com/gpu: 8
+            nvidia.com/gpu: 1
             rdma/ib: 1
           requests:
-            nvidia.com/gpu: 8
+            nvidia.com/gpu: 1
             rdma/ib: 1
             cpu: "32"
             memory: 128Gi
         extraArgs:
             - "--tensor-parallel-size"
-            - "8"
+            - "1"
             - "--disable-log-requests"
             - "--max-model-len"
             - "32768"
             - "--block-size"
             - "128"
-            - "--enforce-eager"
     decode:
         replicas: 1
         resources:
           limits:
-            nvidia.com/gpu: 8
+            nvidia.com/gpu: 4
             rdma/ib: 1
           requests:
-            nvidia.com/gpu: 8
+            nvidia.com/gpu: 4
             rdma/ib: 1
             cpu: "32"
             memory: 128Gi
         extraArgs:
             - "--tensor-parallel-size"
-            - "8"
+            - "4"
             - "--disable-log-requests"
             - "--max-model-len"
             - "32768"
             - "--block-size"
             - "128"
-            - "--enforce-eager"
 modelservice:
   vllm:
     image:
diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
index e69ae46..622c01f 100644
--- a/quickstart/examples/rob-benchmarking/Justfile
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -1,7 +1,10 @@
 NAMESPACE := "pete-davidson"
 
-logs POD:
-    kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1"
+logs:
+    kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v "TRANSFER BATCHED"
+
+logs-stats:
+    kubectl logs -f $POD -n {{NAMESPACE}} | grep -e "Engine 000:"
 
 get-pods:
     kubectl get pods -n {{NAMESPACE}} -o wide
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
index 97911e0..284e5a8 100644
--- a/quickstart/examples/rob-benchmarking/Justfile.remote
+++ b/quickstart/examples/rob-benchmarking/Justfile.remote
@@ -1,6 +1,6 @@
 # Use this Justfile within the cluster.
 
-MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
+MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
 BASE_URL := "http://llm-d-inference-gateway"
 
 pull:
@@ -8,7 +8,7 @@ pull:
 
 eval:
     lm_eval --model local-completions --tasks gsm8k \
-    --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
+    --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=0,tokenized_requests=False \
     --limit 1000
 
 benchmark_one INPUT_LEN:
@@ -21,19 +21,31 @@ benchmark_one INPUT_LEN:
       --num-requests 10 \
       --seed $(date +%s)
 
-benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+benchmark_one_no_pd POD_IP INPUT_LEN:
+  cd vllm && git fetch && git checkout 3c6fd286b40ada67bba98216ed410bb3a0d38b16 && uv pip install pybase64 && \
+    python benchmarks/benchmark_one_concurrent.py \
+      --base-url http://{{POD_IP}}:8000 \
+      --model {{MODEL}} \
+      --input-len {{INPUT_LEN}} \
+      --output-len 1 \
+      --num-requests 10 \
+      --seed $(date +%s)
+
+benchmark CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
     python vllm/benchmarks/benchmark_serving.py \
         --base-url {{BASE_URL}} \
         --model {{MODEL}} \
         --dataset-name random \
         --random-input-len {{INPUT_LEN}} \
         --random-output-len {{OUTPUT_LEN}}  \
-        --request-rate {{RR}} \
+        --max-concurrency {{CONCURRENCY}} \
         --seed $(date +%M%H%M%S) \
         --num-prompts {{NUM_REQUESTS}} \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --metric-percentiles 90,95,99 \
         --ignore-eos
 
-benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
+benchmark_no_pd POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN:
   python vllm/benchmarks/benchmark_serving.py \
         --base-url http://{{POD_IP}}:8000 \
         --model {{MODEL}} \
@@ -42,7 +54,7 @@ benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN
         --random-output-len {{OUTPUT_LEN}} \
         --max-concurrency {{CONCURRENCY}} \
         --num-prompts {{NUM_REQUESTS}} \
-        --seed $(date +%s) \
+        --seed $(date +%M%H%M%S) \
         --percentile-metrics ttft,tpot,itl,e2el \
         --metric-percentiles 90,95,99 \
         --ignore-eos

From 568a58231c3d34e3eadce87383277282da742a71 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 10 Jul 2025 17:30:56 +0000
Subject: [PATCH 13/18] Disable prefix/load-aware scorers; bump image tag; add VLLM_LOG_XFER_TIME

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 .../presets/basic-gpu-with-nixl-preset.yaml            |  2 ++
 .../examples/rob-benchmarking/4p-1d-llama-70b.yaml     | 10 +++++-----
 quickstart/examples/rob-benchmarking/Justfile          |  2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index eebedd8..b3d67a6 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -84,6 +84,8 @@ data:
                 - name: VLLM_LOGGING_LEVEL
                   value: {{ .Values.modelservice.vllm.logLevel }}
                 {{- end }}
+                - name: VLLM_LOG_XFER_TIME
+                  value: "0"
                 - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                   value: "5557"
                 - name: VLLM_NIXL_SIDE_CHANNEL_HOST
diff --git a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
index 19bd780..6d6ccf7 100644
--- a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
+++ b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
@@ -50,15 +50,15 @@ modelservice:
     image:
       registry: docker.io    
       repository: robertgouldshaw2/vllm-nixl
-      tag: nixl-oh-debug-fixed-0.1
+      tag: nixl-oh-debug-fixed-0.3
   epp:
     defaultEnvVarsOverride:
       - name: ENABLE_KVCACHE_AWARE_SCORER
         value: "false"
       - name: ENABLE_PREFIX_AWARE_SCORER
-        value: "true"
+        value: "false"
       - name: ENABLE_LOAD_AWARE_SCORER
-        value: "true"
+        value: "false"
       - name: ENABLE_SESSION_AWARE_SCORER
         value: "false"
       - name: PD_ENABLED
@@ -68,9 +68,9 @@ modelservice:
       - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
         value: "false"
       - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
-        value: "true"
+        value: "false"
       - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
-        value: "true"
+        value: "false"
       - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
         value: "false"
 redis:
diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
index 622c01f..6a52a8d 100644
--- a/quickstart/examples/rob-benchmarking/Justfile
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -1,7 +1,7 @@
 NAMESPACE := "pete-davidson"
 
 logs:
-    kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v "TRANSFER BATCHED"
+    kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v ".get_finished" | grep -v ".transfer_batched"
 
 logs-stats:
     kubectl logs -f $POD -n {{NAMESPACE}} | grep -e "Engine 000:"

From c7ef50e5cf248ccf8b3c818a36dc98f330353cce Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 10 Jul 2025 17:31:58 +0000
Subject: [PATCH 14/18] Delete tp-1.yaml benchmarking example

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 .../examples/rob-benchmarking/tp-1.yaml       | 79 -------------------
 1 file changed, 79 deletions(-)
 delete mode 100644 quickstart/examples/rob-benchmarking/tp-1.yaml

diff --git a/quickstart/examples/rob-benchmarking/tp-1.yaml b/quickstart/examples/rob-benchmarking/tp-1.yaml
deleted file mode 100644
index 9bda5ee..0000000
--- a/quickstart/examples/rob-benchmarking/tp-1.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-sampleApplication:
-    baseConfigMapRefName: basic-gpu-with-nixl-preset
-    model:
-        modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8
-        modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
-        auth:
-            hfToken:
-                name: llm-d-hf-token
-                key: HF_TOKEN
-    prefill:
-        replicas: 1
-        resources:
-          limits:
-            nvidia.com/gpu: 1
-            rdma/ib: 1
-          requests:
-            nvidia.com/gpu: 1
-            rdma/ib: 1
-            cpu: "32"
-            memory: 128Gi
-        extraArgs:
-            - "--tensor-parallel-size"
-            - "1"
-            - "--disable-log-requests"
-            - "--max-model-len"
-            - "32768"
-            - "--block-size"
-            - "128"
-            - "--enforce-eager"
-    decode:
-        replicas: 1
-        resources:
-          limits:
-            nvidia.com/gpu: 1
-            rdma/ib: 1
-          requests:
-            nvidia.com/gpu: 1
-            rdma/ib: 1
-            cpu: "32"
-            memory: 128Gi
-        extraArgs:
-            - "--tensor-parallel-size"
-            - "1"
-            - "--disable-log-requests"
-            - "--max-model-len"
-            - "32768"
-            - "--block-size"
-            - "128"
-            - "--enforce-eager"
-modelservice:
-  vllm:
-    image:
-      registry: docker.io    
-      repository: robertgouldshaw2/vllm-nixl
-      tag: nixl-oh-debug-0.3
-  epp:
-    defaultEnvVarsOverride:
-      - name: ENABLE_KVCACHE_AWARE_SCORER
-        value: "false"
-      - name: ENABLE_PREFIX_AWARE_SCORER
-        value: "true"
-      - name: ENABLE_LOAD_AWARE_SCORER
-        value: "true"
-      - name: ENABLE_SESSION_AWARE_SCORER
-        value: "false"
-      - name: PD_ENABLED
-        value: "true"
-      - name: PD_PROMPT_LEN_THRESHOLD
-        value: "10"
-      - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
-        value: "false"
-      - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
-        value: "true"
-      - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
-        value: "true"
-      - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
-        value: "false"
-redis:
-  enabled: false
\ No newline at end of file

From 0d5ecc1b971d077ee7de3ccda1626c39ebc6e293 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 10 Jul 2025 17:33:00 +0000
Subject: [PATCH 15/18] Remove VLLM_USE_V1 env var from basic-gpu-preset

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 .../templates/modelservice/presets/basic-gpu-preset.yaml      | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml
index b2c5fd1..1a3480b 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml
@@ -75,8 +75,6 @@ data:
                 - "--port"
                 - "8001"
               env:
-                - name: VLLM_USE_V1
-                  value: "1"
                 - name: HOME
                   value: /home
                 {{- if .Values.modelservice.vllm.logLevel }}
@@ -156,8 +154,6 @@ data:
                 - "--port"
                 - "8000"
               env:
-                - name: VLLM_USE_V1
-                  value: "1"
                 - name: HOME
                   value: /home
                 {{ if .Values.modelservice.vllm.logLevel }}

From 78754690fb84a9c8fef46103acf805210c7d3196 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 10 Jul 2025 17:34:52 +0000
Subject: [PATCH 16/18] Drop VLLM_LOG_XFER_TIME; set UCX_TLS=^cuda_ipc in NIXL preset

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 .../modelservice/presets/basic-gpu-with-nixl-preset.yaml    | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index b3d67a6..3c21815 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -84,8 +84,6 @@ data:
                 - name: VLLM_LOGGING_LEVEL
                   value: {{ .Values.modelservice.vllm.logLevel }}
                 {{- end }}
-                - name: VLLM_LOG_XFER_TIME
-                  value: "0"
                 - name: VLLM_NIXL_SIDE_CHANNEL_PORT
                   value: "5557"
                 - name: VLLM_NIXL_SIDE_CHANNEL_HOST
@@ -99,6 +97,8 @@ data:
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
                   value: ${POD_IP}:8200
+                - name: UCX_TLS
+                  value: "^cuda_ipc"
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -200,6 +200,8 @@ data:
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
                   value: ${POD_IP}:8200
+                - name: UCX_TLS
+                  value: "^cuda_ipc"
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}

From 4edbb2d5a82c6374a011b36b14414a5d3ab6786e Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 10 Jul 2025 17:46:34 +0000
Subject: [PATCH 17/18] Re-enable load-aware scorers in 4p-1d-llama-70b.yaml

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
index 6d6ccf7..c3b13a1 100644
--- a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
+++ b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml
@@ -58,7 +58,7 @@ modelservice:
       - name: ENABLE_PREFIX_AWARE_SCORER
         value: "false"
       - name: ENABLE_LOAD_AWARE_SCORER
-        value: "false"
+        value: "true"
       - name: ENABLE_SESSION_AWARE_SCORER
         value: "false"
       - name: PD_ENABLED
@@ -68,7 +68,7 @@ modelservice:
       - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
         value: "false"
       - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
-        value: "false"
+        value: "true"
       - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
         value: "false"
       - name: PREFILL_ENABLE_SESSION_AWARE_SCORER

From 155838fc884186641615cd8056511bc0847f3b66 Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Fri, 11 Jul 2025 01:59:09 +0000
Subject: [PATCH 18/18] Restore POD argument for logs recipes; switch MODEL to Llama-4-Scout

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 quickstart/examples/rob-benchmarking/Justfile        | 8 ++++----
 quickstart/examples/rob-benchmarking/Justfile.remote | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile
index 6a52a8d..4b2c3f0 100644
--- a/quickstart/examples/rob-benchmarking/Justfile
+++ b/quickstart/examples/rob-benchmarking/Justfile
@@ -1,10 +1,10 @@
 NAMESPACE := "pete-davidson"
 
-logs:
-    kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v ".get_finished" | grep -v ".transfer_batched"
+logs POD:
+    kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v ".get_finished" | grep -v ".transfer_batched"
 
-logs-stats:
-    kubectl logs -f $POD -n {{NAMESPACE}} | grep -e "Engine 000:"
+logs-stats POD:
+    kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -e "Engine 000:"
 
 get-pods:
     kubectl get pods -n {{NAMESPACE}} -o wide
diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote
index 284e5a8..4e3d64f 100644
--- a/quickstart/examples/rob-benchmarking/Justfile.remote
+++ b/quickstart/examples/rob-benchmarking/Justfile.remote
@@ -1,6 +1,6 @@
 # Use this Justfile within the cluster.
 
-MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic"
 BASE_URL := "http://llm-d-inference-gateway"
 
 pull: