From f7506017ced54cae1b8c41705acc4aea82304693 Mon Sep 17 00:00:00 2001 From: mnmehta <30246802+mnmehta@users.noreply.github.com> Date: Thu, 5 Jun 2025 23:09:57 -0700 Subject: [PATCH 01/18] For quick validation use 1st decode pod if there are multiple pods (#305) Signed-off-by: mnmehta <30246802+mnmehta@users.noreply.github.com> updated Signed-off-by: rshaw@neuralmagic.com updated Signed-off-by: rshaw@neuralmagic.com --- .../sample-application/modelservice.yaml | 4 +- charts/llm-d/values.yaml | 36 +- .../examples/rob-benchmarking/2P1D-het.yaml | 697 ++++++++++++++++++ quickstart/examples/rob-benchmarking/Justfile | 50 ++ .../examples/rob-benchmarking/Justfile.remote | 36 + .../benchmark-interactive-pod.yaml | 32 + quickstart/test-request.sh | 2 +- 7 files changed, 842 insertions(+), 15 deletions(-) create mode 100644 quickstart/examples/rob-benchmarking/2P1D-het.yaml create mode 100644 quickstart/examples/rob-benchmarking/Justfile create mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote create mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml index 6ba5c22..efa35d6 100644 --- a/charts/llm-d/templates/sample-application/modelservice.yaml +++ b/charts/llm-d/templates/sample-application/modelservice.yaml @@ -30,7 +30,7 @@ spec: {{- range .Values.sampleApplication.decode.extraArgs }} - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }} {{- end }} - resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }} + resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }} env: {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }} - name: HF_TOKEN @@ -49,7 +49,7 @@ spec: {{- range .Values.sampleApplication.prefill.extraArgs }} - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }} {{- end }} - resources: {{ .Values.sampleApplication.resources | toYaml | nindent 8 }} + resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }} env: {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }} - name: HF_TOKEN diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml index 0d9e000..d0aa57a 100644 --- a/charts/llm-d/values.yaml +++ b/charts/llm-d/values.yaml @@ -125,22 +125,22 @@ sampleApplication: # -- Key within the secret under which the token is located key: HF_TOKEN - # @schema - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements - # @schema - # -- Modify resource limits/requests available to the pods - # -- Resource requests/limits - #
Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container - resources: - limits: - nvidia.com/gpu: "1" - requests: - nvidia.com/gpu: "1" - # -- InferencePool port configuration inferencePoolPort: 8000 prefill: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements + # @schema + # -- Modify resource limits/requests available to the pods + # -- Resource requests/limits + #
Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + # -- number of desired prefill replicas replicas: 1 @@ -152,6 +152,18 @@ sampleApplication: extraArgs: [] decode: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements + # @schema + # -- Modify resource limits/requests available to the pods + # -- Resource requests/limits + #
Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + # -- number of desired decode replicas replicas: 1 diff --git a/quickstart/examples/rob-benchmarking/2P1D-het.yaml b/quickstart/examples/rob-benchmarking/2P1D-het.yaml new file mode 100644 index 0000000..2dfc7b4 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/2P1D-het.yaml @@ -0,0 +1,697 @@ +# yaml-language-server: $schema=values.schema.json + +# Default values for the llm-d chart. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# -- Global parameters +# Global Docker image parameters +# Please, note that this will override the image parameters, including dependencies, configured to use the global value +# Current available global Docker image parameters: imageRegistry, imagePullSecrets and storageClass +# @default -- See below +global: + # -- Global Docker image registry + imageRegistry: "" + + # @schema + # items: + # type: string + # @schema + # -- Global Docker registry secret names as an array + #
E.g. `imagePullSecrets: [myRegistryKeySecretName]` + imagePullSecrets: [] + + security: + allowInsecureImages: true + +# @schema +# additionalProperties: true +# @schema +# -- Parameters for bitnami.common dependency +common: {} + +# -- Common parameters +# -- Override Kubernetes version +kubeVersion: "" + +# -- String to partially override common.names.fullname +nameOverride: "" + +# -- String to fully override common.names.fullname +fullnameOverride: "" + +# -- Default Kubernetes cluster domain +clusterDomain: cluster.local + +# @schema +# additionalProperties: true +# @schema +# -- Labels to add to all deployed objects +commonLabels: {} + +# @schema +# additionalProperties: true +# @schema +# -- Annotations to add to all deployed objects +commonAnnotations: {} + +# @schema +# items: +# type: [string, object] +# @schema +# -- Array of extra objects to deploy with the release +extraDeploy: [] + +# -- Helm tests +test: + # -- Enable rendering of helm test resources + enabled: false + + # @default -- See below + image: + # -- Test connection pod image registry + registry: quay.io + + # -- Test connection pod image repository. Note that the image needs to have both the `sh` and `curl` binaries in it. + repository: curl/curl + + # -- Test connection pod image tag. Note that the image needs to have both the `sh` and `curl` binaries in it. + tag: latest + + # -- Specify a imagePullPolicy + imagePullPolicy: "Always" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + +# -- Sample application deploying a p-d pair of specific model +# @default -- See below +sampleApplication: + baseConfigMapRefName: basic-gpu-with-nixl-preset + + # -- Enable rendering of sample application resources + enabled: true + + model: + # -- Fully qualified pvc URI: pvc:/// + modelArtifactURI: hf://RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic + + # -- Name of the model + modelName: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" + + # -- Aliases to the Model named vllm will serve with + servedModelNames: [] + + auth: + # -- HF token auth config via k8s secret. + hfToken: + # -- Name of the secret to create to store your huggingface token + name: llm-d-hf-token + # -- Value of the token. Do not set this but use `envsubst` in conjunction with the helm chart + key: HF_TOKEN + + # -- InferencePool port configuration + inferencePoolPort: 8000 + + prefill: + # -- number of desired prefill replicas + replicas: 2 + + # @schema + # items: + # type: string + # @schema + # -- args to add to the prefill deployment + extraArgs: + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--distributed-executor-backend" + - "mp" + - "--block-size" + - "128" + - "--max-num-batched-tokens" + - "32768" + + decode: + # -- number of desired decode replicas + replicas: 1 + + # @schema + # items: + # type: string + # @schema + # -- args to add to the decode deployment + extraArgs: + - "--tensor-parallel-size" + - "4" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--distributed-executor-backend" + - "mp" + - "--block-size" + - "128" + +# -- Gateway configuration +# @default -- See below +gateway: + # -- Deploy resources related to Gateway + enabled: true + + # -- String to fully override gateway.fullname + fullnameOverride: "" + + # -- String to partially override gateway.fullname + nameOverride: "" + + # -- Gateway class that determines the backend used + # Currently supported values: "kgateway" or "istio" + gatewayClassName: kgateway + + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations provided to the Gateway resource + annotations: {} + + # Special parameters applied to kGateway via GatewayParameters resource + kGatewayParameters: + # @schema + # type: [number, boolean] + # @schema + proxyUID: false + + # @schema + # items: + # type: object + # properties: + # name: + # description: Name is the name of the Listener. This name MUST be unique within a Gateway + # type: string + # path: + # description: Path to expose via Ingress + # type: string + # port: + # description: Port is the network port. Multiple listeners may use the same port, subject to the Listener compatibility rules + # type: integer + # minimum: 1 + # maximum: 65535 + # protocol: + # description: Protocol specifies the network protocol this listener expects to receive + # type: string + # @schema + # Set of listeners exposed via the Gateway, also propagated to the Ingress if enabled + listeners: + - name: default + path: / + port: 80 + protocol: HTTP + + # -- Gateway's service type. Ingress is only available if the service type is set to NodePort. Accepted values: ["LoadBalancer", "NodePort"] + serviceType: NodePort + +# -- Ingress configuration +# @default -- See below +ingress: + # -- Deploy Ingress + enabled: true + + # -- Name of the IngressClass cluster resource which defines which controller will implement the resource (e.g nginx) + ingressClassName: "" + + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations for the Ingress resource + annotations: {} + + # -- Hostname to be used to expose the NodePort service to the inferencing gateway + host: "" + + # -- List of additional hostnames to be covered with this ingress record (e.g. a CNAME) + # + extraHosts: [] + + # -- Path to be used to expose the full route to access the inferencing gateway + path: "/" + + # -- Ingress TLS parameters + tls: + # -- Enable TLS configuration for the host defined at `ingress.host` parameter + enabled: false + + # -- The name to which the TLS Secret will be called + secretName: "" + + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.networking.v1.IngressTLS + # @schema + # -- The TLS configuration for additional hostnames to be covered with this ingress record. + #
Ref: https://kubernetes.io/docs/concepts/services-networking/ingress/#tls + # + extraTls: [] + + # -- used as part of the host dirivation if not specified from OCP cluster domain (dont edit) + clusterRouterBase: "" + +# -- Model service controller configuration +# @default -- See below +modelservice: + # -- Toggle to deploy modelservice controller related resources + enabled: true + + # -- Enable metrics gathering via podMonitor / ServiceMonitor + metrics: + # -- Enable metrics scraping from prefill and decode services, see `model + enabled: true + + # -- Prometheus ServiceMonitor configuration + #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md + # @default -- See below + serviceMonitor: + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations provided to the ServiceMonitor + annotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Additional labels provided to the ServiceMonitor + labels: {} + + # -- ServiceMonitor endpoint port + port: "vllm" + + # -- ServiceMonitor endpoint path + path: "/metrics" + + # -- ServiceMonitor endpoint interval at which metrics should be scraped + interval: "15s" + + # -- ServiceMonitor namespace selector + namespaceSelector: + any: false + + # @schema + # items: + # type: string + # @schema + matchNames: [] + + # -- ServiceMonitor selector matchLabels + #
matchLabels must match labels on modelservice Services + selector: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector + # @schema + matchLabels: {} + + # -- String to fully override modelservice.fullname + fullnameOverride: "" + + # -- String to partially override modelservice.fullname + nameOverride: "" + + # -- Number of controller replicas + replicas: 1 + + # -- Modelservice controller image, please change only if appropriate adjustments to the CRD are being made + # @default -- See below + image: + # -- Model Service controller image registry + registry: ghcr.io + + # -- Model Service controller image repository + repository: llm-d/llm-d-model-service + + # -- Model Service controller image tag + tag: "0.0.10" + + # -- Specify a imagePullPolicy + imagePullPolicy: "Always" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- Endpoint picker configuration + # @default -- See below + epp: + # -- Endpoint picker image used in ModelService CR presets + # @default -- See below + image: + # -- Endpoint picker image registry + registry: ghcr.io + + # -- Endpoint picker image repository + repository: llm-d/llm-d-inference-scheduler + + # -- Endpoint picker image tag + tag: 0.0.2 + + # -- Specify a imagePullPolicy + imagePullPolicy: "Always" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- Enable metrics gathering via podMonitor / ServiceMonitor + metrics: + # -- Enable metrics scraping from endpoint picker service + enabled: true + + # -- Prometheus ServiceMonitor configuration + #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md + # @default -- See below + serviceMonitor: + # @schema + # additionalProperties: true + # @schema + # -- Additional annotations provided to the ServiceMonitor + annotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Additional labels provided to the ServiceMonitor + labels: {} + + # -- ServiceMonitor endpoint port + port: "metrics" + + # -- ServiceMonitor endpoint path + path: "/metrics" + + # -- ServiceMonitor endpoint interval at which metrics should be scraped + interval: "10s" + + # -- ServiceMonitor namespace selector + namespaceSelector: + any: false + + # @schema + # items: + # type: string + # @schema + matchNames: [] + + # -- ServiceMonitor selector matchLabels + #
matchLabels must match labels on modelservice Services + selector: + # @schema + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector + # @schema + matchLabels: {} + + # -- Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again. + # Ref: https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md#scorers--configuration + defaultEnvVars: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + value: '{{ if .Values.redis.enabled }}{{ include "redis.master.service.fullurl" . }}{{ end }}' + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "512" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: DECODE_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: DECODE_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: DECODE_ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: DECODE_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: DECODE_ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: DECODE_PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: DECODE_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: DECODE_SESSION_AWARE_SCORER_WEIGHT + value: "1" + + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar + # @schema + # -- Additional environment variables for endpoint picker + defaultEnvVarsOverride: [] + + # -- Prefill options + # @default -- See below + prefill: + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration + # @schema + # -- Tolerations configuration to deploy prefill pods to tainted nodes + # @default -- See below + tolerations: + # -- default NVIDIA GPU toleration + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # -- Decode options + # @default -- See below + decode: + # @schema + # items: + # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration + # @schema + # -- Tolerations configuration to deploy decode pods to tainted nodes + # @default -- See below + tolerations: + # -- default NVIDIA GPU toleration + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # -- vLLM container options + # @default -- See below + vllm: + # -- vLLM image used in ModelService CR presets + # @default -- See below + image: + # -- llm-d image registry + registry: ghcr.io + + # -- llm-d image repository + repository: llm-d/llm-d-dev + + # -- llm-d image tag + tag: 0.0.10 + + # -- Specify a imagePullPolicy + imagePullPolicy: "IfNotPresent" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- Enable metrics gathering via podMonitor / ServiceMonitor + metrics: + # -- Enable metrics scraping from prefill & decode services + enabled: true + + # -- Routing proxy container options + # @default -- See below + routingProxy: + # -- Routing proxy image used in ModelService CR presets + image: + # -- Routing proxy image registry + registry: ghcr.io + + # -- Routing proxy image repository + repository: llm-d/llm-d-routing-sidecar + + # -- Routing proxy image tag + tag: "0.0.6" + + # -- Specify a imagePullPolicy + imagePullPolicy: "IfNotPresent" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # -- llm-d inference simulator container options + # @default -- See below + inferenceSimulator: + # -- llm-d inference simulator image used in ModelService CR presets + # @default -- See below + image: + # -- llm-d inference simulator image registry + registry: ghcr.io + + # -- llm-d inference simulator image repository + repository: llm-d/llm-d-inference-sim + + # -- llm-d inference simulator image tag + tag: "0.0.4" + + # -- Specify a imagePullPolicy + imagePullPolicy: "IfNotPresent" + + # @schema + # items: + # type: string + # @schema + # -- Optionally specify an array of imagePullSecrets (evaluated as templates) + pullSecrets: [] + + # @schema + # additionalProperties: true + # @schema + # -- Annotations to add to all modelservice resources + annotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Pod annotations for modelservice + podAnnotations: {} + + # @schema + # additionalProperties: true + # @schema + # -- Pod labels for modelservice + podLabels: {} + + # Model service controller settings + service: + # -- Toggle to deploy a Service resource for Model service controller + enabled: true + + # -- Port number exposed from Model Service controller + port: 8443 + + # -- Service type + type: ClusterIP + + # -- Service Account Configuration + # @default -- See below + serviceAccount: + # -- Enable the creation of a ServiceAccount for Modelservice pods + create: true + + # -- String to fully override modelservice.serviceAccountName, defaults to modelservice.fullname + fullnameOverride: "" + + # -- String to partially override modelservice.serviceAccountName, defaults to modelservice.fullname + nameOverride: "" + + # @schema + # additionalProperties: true + # @schema + # -- Additional custom labels to the service ServiceAccount. + labels: {} + + # @schema + # additionalProperties: true + # @schema + # -- Additional custom annotations for the ServiceAccount. + annotations: {} + + rbac: + # -- Enable the creation of RBAC resources + create: true + +# @schema +# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json +# @schema +# -- Bitnami/Redis chart configuration +# @default -- Use sane defaults for minimal Redis deployment +redis: + enabled: false + auth: + enabled: false + existingSecretPasswordKey: "" + existingSecret: "" + architecture: standalone + image: + registry: quay.io + repository: sclorg/redis-7-c9s + tag: c9s + master: + kind: Deployment + resources: + limits: + memory: "256Mi" + cpu: "250m" + requests: + memory: "128Mi" + cpu: "100m" + persistence: + enabled: true + size: "5Gi" + pdb: + create: false + service: + ports: + redis: 8100 + networkPolicy: + enabled: false diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile new file mode 100644 index 0000000..209e44f --- /dev/null +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -0,0 +1,50 @@ +NAMESPACE := "pete-davidson" +MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" + +logs POD: + kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1" + +get-ips: + just get-pods | awk '/^redhatai-llama-4-maverick-17b-128e-instruct-fp8-(decode|prefill)/ {print $6}' +get-pods: + kubectl get pods -n {{NAMESPACE}} -o wide + +hf-token: + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n {{NAMESPACE}} + +[working-directory: '../quickstart'] +install VALUES: + ./llmd-installer.sh \ + --hf-token $HF_TOKEN \ + --namespace {{NAMESPACE}} \ + --storage-class shared-vast --storage-size 300Gi \ + --values-file $PWD/../project/{{VALUES}} + +start VALUES: + just install {{VALUES}} && \ + just hf-token && \ + just start-bench + +[working-directory: '../quickstart'] +uninstall VALUES: + ./llmd-installer.sh \ + --hf-token $HF_TOKEN \ + --namespace {{NAMESPACE}} \ + --storage-class shared-vast --storage-size 300Gi \ + --values-file $PWD/../project/{{VALUES}} \ + --uninstall + +gh-token GH_TOKEN: + kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}} + +# Interactive benchmark commands: +start-bench: + kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml + +delete-bench: + kubectl delete pod -n {{NAMESPACE}} benchmark-interactive + +exec-bench: + kubectl cp reset_prefixes.sh {{NAMESPACE}}/benchmark-interactive:/app/reset_prefixes.sh && \ + kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ + kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote new file mode 100644 index 0000000..bbec981 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/Justfile.remote @@ -0,0 +1,36 @@ +# Use this Justfile within the cluster. + +# MODEL := "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-FP8" +MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" +BASE_URL := "http://llm-d-inference-gateway" + +eval: + lm_eval --model local-completions --tasks gsm8k \ + --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=50,max_retries=3,tokenized_requests=False \ + --limit 100 + +benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: + python vllm/benchmarks/benchmark_serving.py \ + --base-url {{BASE_URL}} \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len {{INPUT_LEN}} \ + --random-output-len {{OUTPUT_LEN}} \ + --request-rate {{RR}} \ + --seed $(date +%M%H%M%S) \ + --num-prompts {{NUM_REQUESTS}} \ + --ignore-eos + +# just benchmark 4 1000 15000 5000 <-- current 1P3D setup +# +benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: + python vllm/benchmarks/benchmark_serving.py \ + --base-url http://{{POD_IP}}:8000 \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len {{INPUT_LEN}} \ + --random-output-len {{OUTPUT_LEN}} \ + --request-rate {{RR}} \ + --seed $(date +%M%H%M%S) \ + --num-prompts {{NUM_REQUESTS}} \ + --ignore-eos diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml new file mode 100644 index 0000000..bcb6434 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml @@ -0,0 +1,32 @@ +# benchmark-client-interactive-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: benchmark-interactive + labels: + app: benchmark-interactive # Labels for organization +spec: + containers: + - name: benchmark-runner + image: "quay.io/tms/pd-disagg-benchmark:0.0.6" + imagePullPolicy: Always + stdin: true + tty: true + resources: + requests: + cpu: "16" + memory: "64Gi" + limits: + cpu: "16" + memory: "64Gi" + env: + - name: PROXY_HOST + value: "custom-llm-proxy-service" + - name: PROXY_PORT + value: "80" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret # set up with just hf_token + key: HF_TOKEN + restartPolicy: Never diff --git a/quickstart/test-request.sh b/quickstart/test-request.sh index 5635240..26f0afc 100755 --- a/quickstart/test-request.sh +++ b/quickstart/test-request.sh @@ -89,7 +89,7 @@ validation() { # Discover the decode pod IP POD_IP=$(kubectl get pods -n "$NAMESPACE" \ -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.podIP}{"\n"}{end}' \ - | grep decode | awk '{print $2}') + | grep decode | awk '{print $2}' | head -1) if [[ -z "$POD_IP" ]]; then echo "Error: no decode pod found in namespace $NAMESPACE" From 36ab0c996e53bda040febc3602a65525cce307f1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:17:08 +0000 Subject: [PATCH 02/18] rmove examples Signed-off-by: rshaw@neuralmagic.com --- .../examples/rob-benchmarking/2P1D-het.yaml | 697 ------------------ quickstart/examples/rob-benchmarking/Justfile | 50 -- .../examples/rob-benchmarking/Justfile.remote | 36 - .../benchmark-interactive-pod.yaml | 32 - 4 files changed, 815 deletions(-) delete mode 100644 quickstart/examples/rob-benchmarking/2P1D-het.yaml delete mode 100644 quickstart/examples/rob-benchmarking/Justfile delete mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote delete mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml diff --git a/quickstart/examples/rob-benchmarking/2P1D-het.yaml b/quickstart/examples/rob-benchmarking/2P1D-het.yaml deleted file mode 100644 index 2dfc7b4..0000000 --- a/quickstart/examples/rob-benchmarking/2P1D-het.yaml +++ /dev/null @@ -1,697 +0,0 @@ -# yaml-language-server: $schema=values.schema.json - -# Default values for the llm-d chart. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -# -- Global parameters -# Global Docker image parameters -# Please, note that this will override the image parameters, including dependencies, configured to use the global value -# Current available global Docker image parameters: imageRegistry, imagePullSecrets and storageClass -# @default -- See below -global: - # -- Global Docker image registry - imageRegistry: "" - - # @schema - # items: - # type: string - # @schema - # -- Global Docker registry secret names as an array - #
E.g. `imagePullSecrets: [myRegistryKeySecretName]` - imagePullSecrets: [] - - security: - allowInsecureImages: true - -# @schema -# additionalProperties: true -# @schema -# -- Parameters for bitnami.common dependency -common: {} - -# -- Common parameters -# -- Override Kubernetes version -kubeVersion: "" - -# -- String to partially override common.names.fullname -nameOverride: "" - -# -- String to fully override common.names.fullname -fullnameOverride: "" - -# -- Default Kubernetes cluster domain -clusterDomain: cluster.local - -# @schema -# additionalProperties: true -# @schema -# -- Labels to add to all deployed objects -commonLabels: {} - -# @schema -# additionalProperties: true -# @schema -# -- Annotations to add to all deployed objects -commonAnnotations: {} - -# @schema -# items: -# type: [string, object] -# @schema -# -- Array of extra objects to deploy with the release -extraDeploy: [] - -# -- Helm tests -test: - # -- Enable rendering of helm test resources - enabled: false - - # @default -- See below - image: - # -- Test connection pod image registry - registry: quay.io - - # -- Test connection pod image repository. Note that the image needs to have both the `sh` and `curl` binaries in it. - repository: curl/curl - - # -- Test connection pod image tag. Note that the image needs to have both the `sh` and `curl` binaries in it. - tag: latest - - # -- Specify a imagePullPolicy - imagePullPolicy: "Always" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - -# -- Sample application deploying a p-d pair of specific model -# @default -- See below -sampleApplication: - baseConfigMapRefName: basic-gpu-with-nixl-preset - - # -- Enable rendering of sample application resources - enabled: true - - model: - # -- Fully qualified pvc URI: pvc:/// - modelArtifactURI: hf://RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic - - # -- Name of the model - modelName: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" - - # -- Aliases to the Model named vllm will serve with - servedModelNames: [] - - auth: - # -- HF token auth config via k8s secret. - hfToken: - # -- Name of the secret to create to store your huggingface token - name: llm-d-hf-token - # -- Value of the token. Do not set this but use `envsubst` in conjunction with the helm chart - key: HF_TOKEN - - # -- InferencePool port configuration - inferencePoolPort: 8000 - - prefill: - # -- number of desired prefill replicas - replicas: 2 - - # @schema - # items: - # type: string - # @schema - # -- args to add to the prefill deployment - extraArgs: - - "--tensor-parallel-size" - - "1" - - "--disable-log-requests" - - "--max-model-len" - - "32768" - - "--distributed-executor-backend" - - "mp" - - "--block-size" - - "128" - - "--max-num-batched-tokens" - - "32768" - - decode: - # -- number of desired decode replicas - replicas: 1 - - # @schema - # items: - # type: string - # @schema - # -- args to add to the decode deployment - extraArgs: - - "--tensor-parallel-size" - - "4" - - "--disable-log-requests" - - "--max-model-len" - - "32768" - - "--distributed-executor-backend" - - "mp" - - "--block-size" - - "128" - -# -- Gateway configuration -# @default -- See below -gateway: - # -- Deploy resources related to Gateway - enabled: true - - # -- String to fully override gateway.fullname - fullnameOverride: "" - - # -- String to partially override gateway.fullname - nameOverride: "" - - # -- Gateway class that determines the backend used - # Currently supported values: "kgateway" or "istio" - gatewayClassName: kgateway - - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations provided to the Gateway resource - annotations: {} - - # Special parameters applied to kGateway via GatewayParameters resource - kGatewayParameters: - # @schema - # type: [number, boolean] - # @schema - proxyUID: false - - # @schema - # items: - # type: object - # properties: - # name: - # description: Name is the name of the Listener. This name MUST be unique within a Gateway - # type: string - # path: - # description: Path to expose via Ingress - # type: string - # port: - # description: Port is the network port. Multiple listeners may use the same port, subject to the Listener compatibility rules - # type: integer - # minimum: 1 - # maximum: 65535 - # protocol: - # description: Protocol specifies the network protocol this listener expects to receive - # type: string - # @schema - # Set of listeners exposed via the Gateway, also propagated to the Ingress if enabled - listeners: - - name: default - path: / - port: 80 - protocol: HTTP - - # -- Gateway's service type. Ingress is only available if the service type is set to NodePort. Accepted values: ["LoadBalancer", "NodePort"] - serviceType: NodePort - -# -- Ingress configuration -# @default -- See below -ingress: - # -- Deploy Ingress - enabled: true - - # -- Name of the IngressClass cluster resource which defines which controller will implement the resource (e.g nginx) - ingressClassName: "" - - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations for the Ingress resource - annotations: {} - - # -- Hostname to be used to expose the NodePort service to the inferencing gateway - host: "" - - # -- List of additional hostnames to be covered with this ingress record (e.g. a CNAME) - # - extraHosts: [] - - # -- Path to be used to expose the full route to access the inferencing gateway - path: "/" - - # -- Ingress TLS parameters - tls: - # -- Enable TLS configuration for the host defined at `ingress.host` parameter - enabled: false - - # -- The name to which the TLS Secret will be called - secretName: "" - - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.networking.v1.IngressTLS - # @schema - # -- The TLS configuration for additional hostnames to be covered with this ingress record. - #
Ref: https://kubernetes.io/docs/concepts/services-networking/ingress/#tls - # - extraTls: [] - - # -- used as part of the host dirivation if not specified from OCP cluster domain (dont edit) - clusterRouterBase: "" - -# -- Model service controller configuration -# @default -- See below -modelservice: - # -- Toggle to deploy modelservice controller related resources - enabled: true - - # -- Enable metrics gathering via podMonitor / ServiceMonitor - metrics: - # -- Enable metrics scraping from prefill and decode services, see `model - enabled: true - - # -- Prometheus ServiceMonitor configuration - #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md - # @default -- See below - serviceMonitor: - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations provided to the ServiceMonitor - annotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Additional labels provided to the ServiceMonitor - labels: {} - - # -- ServiceMonitor endpoint port - port: "vllm" - - # -- ServiceMonitor endpoint path - path: "/metrics" - - # -- ServiceMonitor endpoint interval at which metrics should be scraped - interval: "15s" - - # -- ServiceMonitor namespace selector - namespaceSelector: - any: false - - # @schema - # items: - # type: string - # @schema - matchNames: [] - - # -- ServiceMonitor selector matchLabels - #
matchLabels must match labels on modelservice Services - selector: - # @schema - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector - # @schema - matchLabels: {} - - # -- String to fully override modelservice.fullname - fullnameOverride: "" - - # -- String to partially override modelservice.fullname - nameOverride: "" - - # -- Number of controller replicas - replicas: 1 - - # -- Modelservice controller image, please change only if appropriate adjustments to the CRD are being made - # @default -- See below - image: - # -- Model Service controller image registry - registry: ghcr.io - - # -- Model Service controller image repository - repository: llm-d/llm-d-model-service - - # -- Model Service controller image tag - tag: "0.0.10" - - # -- Specify a imagePullPolicy - imagePullPolicy: "Always" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- Endpoint picker configuration - # @default -- See below - epp: - # -- Endpoint picker image used in ModelService CR presets - # @default -- See below - image: - # -- Endpoint picker image registry - registry: ghcr.io - - # -- Endpoint picker image repository - repository: llm-d/llm-d-inference-scheduler - - # -- Endpoint picker image tag - tag: 0.0.2 - - # -- Specify a imagePullPolicy - imagePullPolicy: "Always" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- Enable metrics gathering via podMonitor / ServiceMonitor - metrics: - # -- Enable metrics scraping from endpoint picker service - enabled: true - - # -- Prometheus ServiceMonitor configuration - #
Ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md - # @default -- See below - serviceMonitor: - # @schema - # additionalProperties: true - # @schema - # -- Additional annotations provided to the ServiceMonitor - annotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Additional labels provided to the ServiceMonitor - labels: {} - - # -- ServiceMonitor endpoint port - port: "metrics" - - # -- ServiceMonitor endpoint path - path: "/metrics" - - # -- ServiceMonitor endpoint interval at which metrics should be scraped - interval: "10s" - - # -- ServiceMonitor namespace selector - namespaceSelector: - any: false - - # @schema - # items: - # type: string - # @schema - matchNames: [] - - # -- ServiceMonitor selector matchLabels - #
matchLabels must match labels on modelservice Services - selector: - # @schema - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector - # @schema - matchLabels: {} - - # -- Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again. - # Ref: https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md#scorers--configuration - defaultEnvVars: - - name: ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: KVCACHE_AWARE_SCORER_WEIGHT - value: "1" - - name: KVCACHE_INDEXER_REDIS_ADDR - value: '{{ if .Values.redis.enabled }}{{ include "redis.master.service.fullurl" . }}{{ end }}' - - name: ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: PREFIX_AWARE_SCORER_WEIGHT - value: "2" - - name: ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: LOAD_AWARE_SCORER_WEIGHT - value: "1" - - name: ENABLE_SESSION_AWARE_SCORER - value: "false" - - name: SESSION_AWARE_SCORER_WEIGHT - value: "1" - - name: PD_ENABLED - value: "true" - - name: PD_PROMPT_LEN_THRESHOLD - value: "512" - - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT - value: "1" - - name: PREFILL_ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT - value: "1" - - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT - value: "2" - - name: PREFILL_ENABLE_SESSION_AWARE_SCORER - value: "false" - - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT - value: "1" - - name: DECODE_ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: DECODE_KVCACHE_AWARE_SCORER_WEIGHT - value: "1" - - name: DECODE_ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: DECODE_LOAD_AWARE_SCORER_WEIGHT - value: "1" - - name: DECODE_ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: DECODE_PREFIX_AWARE_SCORER_WEIGHT - value: "2" - - name: DECODE_ENABLE_SESSION_AWARE_SCORER - value: "false" - - name: DECODE_SESSION_AWARE_SCORER_WEIGHT - value: "1" - - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar - # @schema - # -- Additional environment variables for endpoint picker - defaultEnvVarsOverride: [] - - # -- Prefill options - # @default -- See below - prefill: - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration - # @schema - # -- Tolerations configuration to deploy prefill pods to tainted nodes - # @default -- See below - tolerations: - # -- default NVIDIA GPU toleration - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - # -- Decode options - # @default -- See below - decode: - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.Toleration - # @schema - # -- Tolerations configuration to deploy decode pods to tainted nodes - # @default -- See below - tolerations: - # -- default NVIDIA GPU toleration - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - # -- vLLM container options - # @default -- See below - vllm: - # -- vLLM image used in ModelService CR presets - # @default -- See below - image: - # -- llm-d image registry - registry: ghcr.io - - # -- llm-d image repository - repository: llm-d/llm-d-dev - - # -- llm-d image tag - tag: 0.0.10 - - # -- Specify a imagePullPolicy - imagePullPolicy: "IfNotPresent" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- Enable metrics gathering via podMonitor / ServiceMonitor - metrics: - # -- Enable metrics scraping from prefill & decode services - enabled: true - - # -- Routing proxy container options - # @default -- See below - routingProxy: - # -- Routing proxy image used in ModelService CR presets - image: - # -- Routing proxy image registry - registry: ghcr.io - - # -- Routing proxy image repository - repository: llm-d/llm-d-routing-sidecar - - # -- Routing proxy image tag - tag: "0.0.6" - - # -- Specify a imagePullPolicy - imagePullPolicy: "IfNotPresent" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # -- llm-d inference simulator container options - # @default -- See below - inferenceSimulator: - # -- llm-d inference simulator image used in ModelService CR presets - # @default -- See below - image: - # -- llm-d inference simulator image registry - registry: ghcr.io - - # -- llm-d inference simulator image repository - repository: llm-d/llm-d-inference-sim - - # -- llm-d inference simulator image tag - tag: "0.0.4" - - # -- Specify a imagePullPolicy - imagePullPolicy: "IfNotPresent" - - # @schema - # items: - # type: string - # @schema - # -- Optionally specify an array of imagePullSecrets (evaluated as templates) - pullSecrets: [] - - # @schema - # additionalProperties: true - # @schema - # -- Annotations to add to all modelservice resources - annotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Pod annotations for modelservice - podAnnotations: {} - - # @schema - # additionalProperties: true - # @schema - # -- Pod labels for modelservice - podLabels: {} - - # Model service controller settings - service: - # -- Toggle to deploy a Service resource for Model service controller - enabled: true - - # -- Port number exposed from Model Service controller - port: 8443 - - # -- Service type - type: ClusterIP - - # -- Service Account Configuration - # @default -- See below - serviceAccount: - # -- Enable the creation of a ServiceAccount for Modelservice pods - create: true - - # -- String to fully override modelservice.serviceAccountName, defaults to modelservice.fullname - fullnameOverride: "" - - # -- String to partially override modelservice.serviceAccountName, defaults to modelservice.fullname - nameOverride: "" - - # @schema - # additionalProperties: true - # @schema - # -- Additional custom labels to the service ServiceAccount. - labels: {} - - # @schema - # additionalProperties: true - # @schema - # -- Additional custom annotations for the ServiceAccount. - annotations: {} - - rbac: - # -- Enable the creation of RBAC resources - create: true - -# @schema -# $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json -# @schema -# -- Bitnami/Redis chart configuration -# @default -- Use sane defaults for minimal Redis deployment -redis: - enabled: false - auth: - enabled: false - existingSecretPasswordKey: "" - existingSecret: "" - architecture: standalone - image: - registry: quay.io - repository: sclorg/redis-7-c9s - tag: c9s - master: - kind: Deployment - resources: - limits: - memory: "256Mi" - cpu: "250m" - requests: - memory: "128Mi" - cpu: "100m" - persistence: - enabled: true - size: "5Gi" - pdb: - create: false - service: - ports: - redis: 8100 - networkPolicy: - enabled: false diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile deleted file mode 100644 index 209e44f..0000000 --- a/quickstart/examples/rob-benchmarking/Justfile +++ /dev/null @@ -1,50 +0,0 @@ -NAMESPACE := "pete-davidson" -MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" - -logs POD: - kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1" - -get-ips: - just get-pods | awk '/^redhatai-llama-4-maverick-17b-128e-instruct-fp8-(decode|prefill)/ {print $6}' -get-pods: - kubectl get pods -n {{NAMESPACE}} -o wide - -hf-token: - kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=$HF_TOKEN -n {{NAMESPACE}} - -[working-directory: '../quickstart'] -install VALUES: - ./llmd-installer.sh \ - --hf-token $HF_TOKEN \ - --namespace {{NAMESPACE}} \ - --storage-class shared-vast --storage-size 300Gi \ - --values-file $PWD/../project/{{VALUES}} - -start VALUES: - just install {{VALUES}} && \ - just hf-token && \ - just start-bench - -[working-directory: '../quickstart'] -uninstall VALUES: - ./llmd-installer.sh \ - --hf-token $HF_TOKEN \ - --namespace {{NAMESPACE}} \ - --storage-class shared-vast --storage-size 300Gi \ - --values-file $PWD/../project/{{VALUES}} \ - --uninstall - -gh-token GH_TOKEN: - kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}} - -# Interactive benchmark commands: -start-bench: - kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml - -delete-bench: - kubectl delete pod -n {{NAMESPACE}} benchmark-interactive - -exec-bench: - kubectl cp reset_prefixes.sh {{NAMESPACE}}/benchmark-interactive:/app/reset_prefixes.sh && \ - kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ - kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote deleted file mode 100644 index bbec981..0000000 --- a/quickstart/examples/rob-benchmarking/Justfile.remote +++ /dev/null @@ -1,36 +0,0 @@ -# Use this Justfile within the cluster. - -# MODEL := "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-FP8" -MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" -BASE_URL := "http://llm-d-inference-gateway" - -eval: - lm_eval --model local-completions --tasks gsm8k \ - --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=50,max_retries=3,tokenized_requests=False \ - --limit 100 - -benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: - python vllm/benchmarks/benchmark_serving.py \ - --base-url {{BASE_URL}} \ - --model {{MODEL}} \ - --dataset-name random \ - --random-input-len {{INPUT_LEN}} \ - --random-output-len {{OUTPUT_LEN}} \ - --request-rate {{RR}} \ - --seed $(date +%M%H%M%S) \ - --num-prompts {{NUM_REQUESTS}} \ - --ignore-eos - -# just benchmark 4 1000 15000 5000 <-- current 1P3D setup -# -benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: - python vllm/benchmarks/benchmark_serving.py \ - --base-url http://{{POD_IP}}:8000 \ - --model {{MODEL}} \ - --dataset-name random \ - --random-input-len {{INPUT_LEN}} \ - --random-output-len {{OUTPUT_LEN}} \ - --request-rate {{RR}} \ - --seed $(date +%M%H%M%S) \ - --num-prompts {{NUM_REQUESTS}} \ - --ignore-eos diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml deleted file mode 100644 index bcb6434..0000000 --- a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# benchmark-client-interactive-pod.yaml -apiVersion: v1 -kind: Pod -metadata: - name: benchmark-interactive - labels: - app: benchmark-interactive # Labels for organization -spec: - containers: - - name: benchmark-runner - image: "quay.io/tms/pd-disagg-benchmark:0.0.6" - imagePullPolicy: Always - stdin: true - tty: true - resources: - requests: - cpu: "16" - memory: "64Gi" - limits: - cpu: "16" - memory: "64Gi" - env: - - name: PROXY_HOST - value: "custom-llm-proxy-service" - - name: PROXY_PORT - value: "80" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret # set up with just hf_token - key: HF_TOKEN - restartPolicy: Never From ff8ae72771267ab1e40022a3d462c18e1265884a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:17:38 +0000 Subject: [PATCH 03/18] fix typo Signed-off-by: rshaw@neuralmagic.com --- charts/llm-d/templates/sample-application/modelservice.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml index efa35d6..913a086 100644 --- a/charts/llm-d/templates/sample-application/modelservice.yaml +++ b/charts/llm-d/templates/sample-application/modelservice.yaml @@ -49,7 +49,7 @@ spec: {{- range .Values.sampleApplication.prefill.extraArgs }} - {{ include "common.tplvalues.render" ( dict "value" . "context" $) | quote }} {{- end }} - resources: {{ .Values.sampleApplication.decode.resources | toYaml | nindent 8 }} + resources: {{ .Values.sampleApplication.prefill.resources | toYaml | nindent 8 }} env: {{- if eq (include "sampleApplication.modelArtifactType" . ) "hf" }} - name: HF_TOKEN From e117b30deb708d67ea705115ab1a86ec4ff89716 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:18:15 +0000 Subject: [PATCH 04/18] fix Signed-off-by: rshaw@neuralmagic.com --- quickstart/test-request.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quickstart/test-request.sh b/quickstart/test-request.sh index 26f0afc..5635240 100755 --- a/quickstart/test-request.sh +++ b/quickstart/test-request.sh @@ -89,7 +89,7 @@ validation() { # Discover the decode pod IP POD_IP=$(kubectl get pods -n "$NAMESPACE" \ -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.podIP}{"\n"}{end}' \ - | grep decode | awk '{print $2}' | head -1) + | grep decode | awk '{print $2}') if [[ -z "$POD_IP" ]]; then echo "Error: no decode pod found in namespace $NAMESPACE" From 282ee2ad6a1e79d21e1ab72bd1b9a5d56716fada Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 8 Jun 2025 13:58:55 +0000 Subject: [PATCH 05/18] updated schema Signed-off-by: rshaw@neuralmagic.com --- charts/llm-d/values.schema.json | 169 +++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 55 deletions(-) diff --git a/charts/llm-d/values.schema.json b/charts/llm-d/values.schema.json index a1910e9..b405e0a 100644 --- a/charts/llm-d/values.schema.json +++ b/charts/llm-d/values.schema.json @@ -10471,6 +10471,65 @@ "description": "number of desired decode replicas", "required": [], "title": "replicas" + }, + "resources": { + "description": "ResourceRequirements describes the compute resource requirements.", + "properties": { + "claims": { + "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.", + "items": { + "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.", + "properties": { + "name": { + "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", + "type": "string" + }, + "request": { + "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.", + "type": "string" + } + }, + "required": [ + "name" + ], + "type": "object" + }, + "type": "array", + "x-kubernetes-list-map-keys": [ + "name" + ], + "x-kubernetes-list-type": "map" + }, + "limits": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] + }, + "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + }, + "requests": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] + }, + "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + } + }, + "type": "object" } }, "required": [], @@ -10688,69 +10747,69 @@ "description": "number of desired prefill replicas", "required": [], "title": "replicas" - } - }, - "required": [], - "title": "prefill", - "type": "object" - }, - "resources": { - "description": "ResourceRequirements describes the compute resource requirements.", - "properties": { - "claims": { - "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.", - "items": { - "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.", - "properties": { - "name": { - "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", - "type": "string" + }, + "resources": { + "description": "ResourceRequirements describes the compute resource requirements.", + "properties": { + "claims": { + "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.", + "items": { + "description": "ResourceClaim references one entry in PodSpec.ResourceClaims.", + "properties": { + "name": { + "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", + "type": "string" + }, + "request": { + "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.", + "type": "string" + } + }, + "required": [ + "name" + ], + "type": "object" }, - "request": { - "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.", - "type": "string" - } + "type": "array", + "x-kubernetes-list-map-keys": [ + "name" + ], + "x-kubernetes-list-type": "map" }, - "required": [ - "name" - ], - "type": "object" - }, - "type": "array", - "x-kubernetes-list-map-keys": [ - "name" - ], - "x-kubernetes-list-type": "map" - }, - "limits": { - "additionalProperties": { - "oneOf": [ - { - "type": "string" + "limits": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] }, - { - "type": "number" - } - ] - }, - "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", - "type": "object" - }, - "requests": { - "additionalProperties": { - "oneOf": [ - { - "type": "string" + "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + }, + "requests": { + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] }, - { - "type": "number" - } - ] + "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "type": "object" + } }, - "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", "type": "object" } }, + "required": [], + "title": "prefill", "type": "object" } }, From fba496a884b201ffae38fffcb964dc5fb4d26e5a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 1 Jul 2025 13:02:48 +0000 Subject: [PATCH 06/18] Your new single commit message --- .../presets/basic-gpu-with-nixl-preset.yaml | 16 ++-- .../examples/rob-benchmarking/4p1d-het.yaml | 84 +++++++++++++++++++ quickstart/examples/rob-benchmarking/Justfile | 40 +++++++++ .../examples/rob-benchmarking/Justfile.remote | 44 ++++++++++ .../benchmark-interactive-pod.yaml | 32 +++++++ 5 files changed, 210 insertions(+), 6 deletions(-) create mode 100644 quickstart/examples/rob-benchmarking/4p1d-het.yaml create mode 100644 quickstart/examples/rob-benchmarking/Justfile create mode 100644 quickstart/examples/rob-benchmarking/Justfile.remote create mode 100644 quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index e84b680..bb312aa 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -84,6 +84,10 @@ data: - name: VLLM_LOGGING_LEVEL value: {{ .Values.modelservice.vllm.logLevel }} {{- end }} + - name: USE_BATCHED + value: "0" + - name: VLLM_IS_PREFILL + value: "1" - name: VLLM_NIXL_SIDE_CHANNEL_PORT value: "5557" - name: VLLM_NIXL_SIDE_CHANNEL_HOST @@ -97,8 +101,6 @@ data: fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL value: ${POD_IP}:8200 - - name: UCX_TLS - value: "^cuda_ipc" {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -129,7 +131,7 @@ data: - name: dshm emptyDir: medium: Memory - sizeLimit: 1Gi + sizeLimit: 16Gi {{ `{{- if .HFModelName }}` }} - name: model-cache emptyDir: {} @@ -185,6 +187,10 @@ data: - name: VLLM_LOGGING_LEVEL value: {{ .Values.modelservice.vllm.logLevel }} {{- end }} + - name: USE_BATCHED + value: "0" + - name: VLLM_IS_PREFILL + value: "1" - name: VLLM_NIXL_SIDE_CHANNEL_PORT value: "5557" - name: VLLM_NIXL_SIDE_CHANNEL_HOST @@ -198,8 +204,6 @@ data: fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL value: ${POD_IP}:8200 - - name: UCX_TLS - value: "^cuda_ipc" {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -230,7 +234,7 @@ data: - name: dshm emptyDir: medium: Memory - sizeLimit: 1Gi + sizeLimit: 16Gi {{ `{{ if .HFModelName }}` }} - name: model-cache emptyDir: {} diff --git a/quickstart/examples/rob-benchmarking/4p1d-het.yaml b/quickstart/examples/rob-benchmarking/4p1d-het.yaml new file mode 100644 index 0000000..7801080 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/4p1d-het.yaml @@ -0,0 +1,84 @@ +sampleApplication: + baseConfigMapRefName: basic-gpu-with-nixl-preset + model: + modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8 + modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" + auth: + hfToken: + name: llm-d-hf-token + key: HF_TOKEN + prefill: + replicas: 1 + resources: + limits: + nvidia.com/gpu: 1 + rdma/ib: 1 + requests: + nvidia.com/gpu: 1 + rdma/ib: 1 + cpu: "32" + memory: 128Gi + extraArgs: + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--block-size" + - "128" + - "--enforce-eager" + # - "--num-gpu-blocks-override" + # - "60000" + + decode: + replicas: 1 + resources: + limits: + nvidia.com/gpu: 1 + rdma/ib: 1 + requests: + nvidia.com/gpu: 1 + rdma/ib: 1 + cpu: "32" + memory: 128Gi + extraArgs: + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--block-size" + - "128" + - "--enforce-eager" + # - "--num-gpu-blocks-override" + # - "60000" +modelservice: + vllm: + image: + registry: docker.io + repository: robertgouldshaw2/vllm-nixl + tag: launch-debug-0.10 + epp: + defaultEnvVarsOverride: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" +redis: + enabled: false \ No newline at end of file diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile new file mode 100644 index 0000000..3c794f6 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -0,0 +1,40 @@ +NAMESPACE := "pete-davidson" + +logs POD: + kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1" + +get-ips: + just get-pods | awk '/^qwen-qwen3-30b-a3b-(decode|prefill)/ {print $6}' +get-pods: + kubectl get pods -n {{NAMESPACE}} -o wide + +hf-token: + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}} + +[working-directory: '/home/rshaw/llm-d-deployer/quickstart'] +install VALUES: + ./llmd-installer.sh \ + --namespace {{NAMESPACE}} \ + --storage-class shared-vast --storage-size 300Gi \ + --values-file ./examples/rob-benchmarking/{{VALUES}} --skip-infra + +[working-directory: '/home/rshaw/llm-d-deployer/quickstart'] +uninstall: + ./llmd-installer.sh \ + --namespace {{NAMESPACE}} \ + --uninstall --skip-infra + +gh-token GH_TOKEN: + kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}} + +# Interactive benchmark commands: +start-bench: + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}} + kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml + +delete-bench: + kubectl delete pod -n {{NAMESPACE}} benchmark-interactive + +exec-bench: + kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ + kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote new file mode 100644 index 0000000..ebdad42 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/Justfile.remote @@ -0,0 +1,44 @@ +# Use this Justfile within the cluster. + +MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" +BASE_URL := "http://llm-d-inference-gateway" + +eval: + lm_eval --model local-completions --tasks gsm8k \ + --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \ + --limit 1000 + +benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: + python vllm/benchmarks/benchmark_serving.py \ + --base-url {{BASE_URL}} \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len {{INPUT_LEN}} \ + --random-output-len {{OUTPUT_LEN}} \ + --request-rate {{RR}} \ + --seed $(date +%M%H%M%S) \ + --num-prompts {{NUM_REQUESTS}} \ + --ignore-eos + +# just benchmark 4 1000 15000 5000 <-- current 1P3D setup +benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: + python vllm/benchmarks/benchmark_serving.py \ + --base-url http://{{POD_IP}}:8000 \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len {{INPUT_LEN}} \ + --random-output-len {{OUTPUT_LEN}} \ + --request-rate {{RR}} \ + --seed $(date +%M%H%M%S) \ + --num-prompts {{NUM_REQUESTS}} \ + --ignore-eos + +send_request: + curl -X POST {{BASE_URL}}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ \ + "model": "{{MODEL}}", \ + "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \ + "max_tokens": 150, \ + "temperature": 0.7 \ + }' diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml new file mode 100644 index 0000000..bcb6434 --- /dev/null +++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml @@ -0,0 +1,32 @@ +# benchmark-client-interactive-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: benchmark-interactive + labels: + app: benchmark-interactive # Labels for organization +spec: + containers: + - name: benchmark-runner + image: "quay.io/tms/pd-disagg-benchmark:0.0.6" + imagePullPolicy: Always + stdin: true + tty: true + resources: + requests: + cpu: "16" + memory: "64Gi" + limits: + cpu: "16" + memory: "64Gi" + env: + - name: PROXY_HOST + value: "custom-llm-proxy-service" + - name: PROXY_PORT + value: "80" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret # set up with just hf_token + key: HF_TOKEN + restartPolicy: Never From 8b7549a4eb5be69e3a861f335267e1e9ac17bd75 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 3 Jul 2025 16:21:11 -0400 Subject: [PATCH 07/18] updated Signed-off-by: Robert Shaw --- .../presets/basic-gpu-preset.yaml | 4 +++ quickstart/examples/rob-benchmarking/Justfile | 20 +++++++++--- .../examples/rob-benchmarking/Justfile.remote | 32 ++++++++++++++++++- .../benchmark-interactive-pod.yaml | 2 +- 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml index 1a3480b..b2c5fd1 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml @@ -75,6 +75,8 @@ data: - "--port" - "8001" env: + - name: VLLM_USE_V1 + value: "1" - name: HOME value: /home {{- if .Values.modelservice.vllm.logLevel }} @@ -154,6 +156,8 @@ data: - "--port" - "8000" env: + - name: VLLM_USE_V1 + value: "1" - name: HOME value: /home {{ if .Values.modelservice.vllm.logLevel }} diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile index 3c794f6..2135ba6 100644 --- a/quickstart/examples/rob-benchmarking/Justfile +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -1,24 +1,23 @@ +HF_TOKEN := "my_token" NAMESPACE := "pete-davidson" logs POD: - kubectl logs -f {{POD}} | grep -v "GET /metrics HTTP/1.1" + kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" -get-ips: - just get-pods | awk '/^qwen-qwen3-30b-a3b-(decode|prefill)/ {print $6}' get-pods: kubectl get pods -n {{NAMESPACE}} -o wide hf-token: kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}} -[working-directory: '/home/rshaw/llm-d-deployer/quickstart'] +[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart'] install VALUES: ./llmd-installer.sh \ --namespace {{NAMESPACE}} \ --storage-class shared-vast --storage-size 300Gi \ --values-file ./examples/rob-benchmarking/{{VALUES}} --skip-infra -[working-directory: '/home/rshaw/llm-d-deployer/quickstart'] +[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart'] uninstall: ./llmd-installer.sh \ --namespace {{NAMESPACE}} \ @@ -32,9 +31,20 @@ start-bench: kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}} kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml +start-bench-2: + kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod-2.yaml + delete-bench: kubectl delete pod -n {{NAMESPACE}} benchmark-interactive exec-bench: + kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep.sh && \ + kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep-sharegpt.sh && \ kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash + +exec-bench-2: + kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep.sh && \ + kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep-sharegpt.sh && \ + kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive-2:/app/Justfile && \ + kubectl exec -it -n {{NAMESPACE}} benchmark-interactive-2 -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote index ebdad42..952cbf5 100644 --- a/quickstart/examples/rob-benchmarking/Justfile.remote +++ b/quickstart/examples/rob-benchmarking/Justfile.remote @@ -1,8 +1,23 @@ # Use this Justfile within the cluster. -MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" +MODEL := "meta-llama/Llama-3.1-8B-Instruct" +# MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic" BASE_URL := "http://llm-d-inference-gateway" +pull: + cd vllm && git pull + +download_sharegpt: + apt update && apt install wget && wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +sweep POD_IP: + uv pip install pybase64 && cd vllm && git pull && cd .. && \ + MODEL={{MODEL}} POD_IP={{POD_IP}} bash ./sweep.sh + +sweep_sharegpt POD_IP: + uv pip install pybase64 && cd vllm && git pull && cd .. && \ + MODEL={{MODEL}} POD_IP={{POD_IP}} bash ./sweep-sharegpt.sh + eval: lm_eval --model local-completions --tasks gsm8k \ --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \ @@ -20,12 +35,27 @@ benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: --num-prompts {{NUM_REQUESTS}} \ --ignore-eos +benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN: + python vllm/benchmarks/benchmark_serving.py \ + --base-url http://{{POD_IP}}:8000 \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len {{INPUT_LEN}} \ + --random-output-len {{OUTPUT_LEN}} \ + --max-concurrency {{CONCURRENCY}} \ + --num-prompts {{NUM_REQUESTS}} \ + --seed $(date +%s) \ + --percentile-metrics ttft,tpot,itl,e2el \ + --metric-percentiles 90,95,99 \ + --ignore-eos + # just benchmark 4 1000 15000 5000 <-- current 1P3D setup benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: python vllm/benchmarks/benchmark_serving.py \ --base-url http://{{POD_IP}}:8000 \ --model {{MODEL}} \ --dataset-name random \ + --random-prefix-len 600 \ --random-input-len {{INPUT_LEN}} \ --random-output-len {{OUTPUT_LEN}} \ --request-rate {{RR}} \ diff --git a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml index bcb6434..0a0eeb2 100644 --- a/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml +++ b/quickstart/examples/rob-benchmarking/benchmark-interactive-pod.yaml @@ -4,7 +4,7 @@ kind: Pod metadata: name: benchmark-interactive labels: - app: benchmark-interactive # Labels for organization + app: benchmark-interactive spec: containers: - name: benchmark-runner From 1c2226d34565bb27a3acecb904efc0ccc8c31314 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 3 Jul 2025 20:46:57 +0000 Subject: [PATCH 08/18] updated Signed-off-by: Robert Shaw --- quickstart/examples/rob-benchmarking/Justfile | 20 +++++-------------- .../{4p1d-het.yaml => tp-8.yaml} | 20 +++++++++---------- 2 files changed, 15 insertions(+), 25 deletions(-) rename quickstart/examples/rob-benchmarking/{4p1d-het.yaml => tp-8.yaml} (86%) diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile index 2135ba6..14a056e 100644 --- a/quickstart/examples/rob-benchmarking/Justfile +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -1,4 +1,3 @@ -HF_TOKEN := "my_token" NAMESPACE := "pete-davidson" logs POD: @@ -8,32 +7,29 @@ get-pods: kubectl get pods -n {{NAMESPACE}} -o wide hf-token: - kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}} + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n {{NAMESPACE}} -[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart'] +[working-directory: '/home/rshaw/llm-d-deployer/quickstart'] install VALUES: ./llmd-installer.sh \ --namespace {{NAMESPACE}} \ --storage-class shared-vast --storage-size 300Gi \ --values-file ./examples/rob-benchmarking/{{VALUES}} --skip-infra -[working-directory: '/Users/robertgshaw/llm-d-deployer/quickstart'] +[working-directory: '/home/rshaw/llm-d-deployer/quickstart'] uninstall: ./llmd-installer.sh \ --namespace {{NAMESPACE}} \ - --uninstall --skip-infra + --uninstall gh-token GH_TOKEN: kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}} # Interactive benchmark commands: start-bench: - kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN={{HF_TOKEN}} -n {{NAMESPACE}} + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n {{NAMESPACE}} kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml -start-bench-2: - kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod-2.yaml - delete-bench: kubectl delete pod -n {{NAMESPACE}} benchmark-interactive @@ -42,9 +38,3 @@ exec-bench: kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep-sharegpt.sh && \ kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash - -exec-bench-2: - kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep.sh && \ - kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive-2:/app/sweep-sharegpt.sh && \ - kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive-2:/app/Justfile && \ - kubectl exec -it -n {{NAMESPACE}} benchmark-interactive-2 -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/4p1d-het.yaml b/quickstart/examples/rob-benchmarking/tp-8.yaml similarity index 86% rename from quickstart/examples/rob-benchmarking/4p1d-het.yaml rename to quickstart/examples/rob-benchmarking/tp-8.yaml index 7801080..d8be566 100644 --- a/quickstart/examples/rob-benchmarking/4p1d-het.yaml +++ b/quickstart/examples/rob-benchmarking/tp-8.yaml @@ -11,33 +11,33 @@ sampleApplication: replicas: 1 resources: limits: - nvidia.com/gpu: 1 + nvidia.com/gpu: 8 rdma/ib: 1 requests: - nvidia.com/gpu: 1 + nvidia.com/gpu: 8 rdma/ib: 1 cpu: "32" memory: 128Gi extraArgs: - "--tensor-parallel-size" - - "1" + - "8" - "--disable-log-requests" - "--max-model-len" - "32768" - "--block-size" - "128" - "--enforce-eager" - # - "--num-gpu-blocks-override" - # - "60000" + - "--num-gpu-blocks-override" + - "60000" decode: replicas: 1 resources: limits: - nvidia.com/gpu: 1 + nvidia.com/gpu: 8 rdma/ib: 1 requests: - nvidia.com/gpu: 1 + nvidia.com/gpu: 8 rdma/ib: 1 cpu: "32" memory: 128Gi @@ -50,14 +50,14 @@ sampleApplication: - "--block-size" - "128" - "--enforce-eager" - # - "--num-gpu-blocks-override" - # - "60000" + - "--num-gpu-blocks-override" + - "60000" modelservice: vllm: image: registry: docker.io repository: robertgouldshaw2/vllm-nixl - tag: launch-debug-0.10 + tag: nixl-oh-debug-0.1 epp: defaultEnvVarsOverride: - name: ENABLE_KVCACHE_AWARE_SCORER From c43e5900d6b2e1bc8b53a4ec8f99e8cbf6d4d103 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 7 Jul 2025 00:48:20 +0000 Subject: [PATCH 09/18] updated Signed-off-by: Robert Shaw --- quickstart/examples/rob-benchmarking/Justfile | 5 +- .../examples/rob-benchmarking/Justfile.remote | 37 +++------ .../examples/rob-benchmarking/tp-1.yaml | 79 +++++++++++++++++++ .../examples/rob-benchmarking/tp-8.yaml | 9 +-- 4 files changed, 93 insertions(+), 37 deletions(-) create mode 100644 quickstart/examples/rob-benchmarking/tp-1.yaml diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile index 14a056e..e69ae46 100644 --- a/quickstart/examples/rob-benchmarking/Justfile +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -20,21 +20,18 @@ install VALUES: uninstall: ./llmd-installer.sh \ --namespace {{NAMESPACE}} \ - --uninstall + --uninstall --skip-infra gh-token GH_TOKEN: kubectl create secret generic gh-token-secret --from-literal=GH_TOKEN='{{GH_TOKEN}}' -n {{NAMESPACE}} # Interactive benchmark commands: start-bench: - kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n {{NAMESPACE}} kubectl apply -n {{NAMESPACE}} -f benchmark-interactive-pod.yaml delete-bench: kubectl delete pod -n {{NAMESPACE}} benchmark-interactive exec-bench: - kubectl cp sweep.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep.sh && \ - kubectl cp sweep-sharegpt.sh {{NAMESPACE}}/benchmark-interactive:/app/sweep-sharegpt.sh && \ kubectl cp Justfile.remote {{NAMESPACE}}/benchmark-interactive:/app/Justfile && \ kubectl exec -it -n {{NAMESPACE}} benchmark-interactive -- /bin/bash diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote index 952cbf5..97911e0 100644 --- a/quickstart/examples/rob-benchmarking/Justfile.remote +++ b/quickstart/examples/rob-benchmarking/Justfile.remote @@ -1,28 +1,26 @@ # Use this Justfile within the cluster. -MODEL := "meta-llama/Llama-3.1-8B-Instruct" -# MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic" +MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" BASE_URL := "http://llm-d-inference-gateway" pull: cd vllm && git pull -download_sharegpt: - apt update && apt install wget && wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -sweep POD_IP: - uv pip install pybase64 && cd vllm && git pull && cd .. && \ - MODEL={{MODEL}} POD_IP={{POD_IP}} bash ./sweep.sh - -sweep_sharegpt POD_IP: - uv pip install pybase64 && cd vllm && git pull && cd .. && \ - MODEL={{MODEL}} POD_IP={{POD_IP}} bash ./sweep-sharegpt.sh - eval: lm_eval --model local-completions --tasks gsm8k \ --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \ --limit 1000 +benchmark_one INPUT_LEN: + cd vllm && git fetch && git checkout 3c6fd286b40ada67bba98216ed410bb3a0d38b16 && uv pip install pybase64 && \ + python benchmarks/benchmark_one_concurrent.py \ + --base-url {{BASE_URL}} \ + --model {{MODEL}} \ + --input-len {{INPUT_LEN}} \ + --output-len 1 \ + --num-requests 10 \ + --seed $(date +%s) + benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: python vllm/benchmarks/benchmark_serving.py \ --base-url {{BASE_URL}} \ @@ -49,19 +47,6 @@ benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN --metric-percentiles 90,95,99 \ --ignore-eos -# just benchmark 4 1000 15000 5000 <-- current 1P3D setup -benchmark_no_pd POD_IP RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: - python vllm/benchmarks/benchmark_serving.py \ - --base-url http://{{POD_IP}}:8000 \ - --model {{MODEL}} \ - --dataset-name random \ - --random-prefix-len 600 \ - --random-input-len {{INPUT_LEN}} \ - --random-output-len {{OUTPUT_LEN}} \ - --request-rate {{RR}} \ - --seed $(date +%M%H%M%S) \ - --num-prompts {{NUM_REQUESTS}} \ - --ignore-eos send_request: curl -X POST {{BASE_URL}}/v1/completions \ diff --git a/quickstart/examples/rob-benchmarking/tp-1.yaml b/quickstart/examples/rob-benchmarking/tp-1.yaml new file mode 100644 index 0000000..9bda5ee --- /dev/null +++ b/quickstart/examples/rob-benchmarking/tp-1.yaml @@ -0,0 +1,79 @@ +sampleApplication: + baseConfigMapRefName: basic-gpu-with-nixl-preset + model: + modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8 + modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" + auth: + hfToken: + name: llm-d-hf-token + key: HF_TOKEN + prefill: + replicas: 1 + resources: + limits: + nvidia.com/gpu: 1 + rdma/ib: 1 + requests: + nvidia.com/gpu: 1 + rdma/ib: 1 + cpu: "32" + memory: 128Gi + extraArgs: + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--block-size" + - "128" + - "--enforce-eager" + decode: + replicas: 1 + resources: + limits: + nvidia.com/gpu: 1 + rdma/ib: 1 + requests: + nvidia.com/gpu: 1 + rdma/ib: 1 + cpu: "32" + memory: 128Gi + extraArgs: + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--max-model-len" + - "32768" + - "--block-size" + - "128" + - "--enforce-eager" +modelservice: + vllm: + image: + registry: docker.io + repository: robertgouldshaw2/vllm-nixl + tag: nixl-oh-debug-0.3 + epp: + defaultEnvVarsOverride: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" +redis: + enabled: false \ No newline at end of file diff --git a/quickstart/examples/rob-benchmarking/tp-8.yaml b/quickstart/examples/rob-benchmarking/tp-8.yaml index d8be566..21ed6c8 100644 --- a/quickstart/examples/rob-benchmarking/tp-8.yaml +++ b/quickstart/examples/rob-benchmarking/tp-8.yaml @@ -27,9 +27,6 @@ sampleApplication: - "--block-size" - "128" - "--enforce-eager" - - "--num-gpu-blocks-override" - - "60000" - decode: replicas: 1 resources: @@ -43,21 +40,19 @@ sampleApplication: memory: 128Gi extraArgs: - "--tensor-parallel-size" - - "1" + - "8" - "--disable-log-requests" - "--max-model-len" - "32768" - "--block-size" - "128" - "--enforce-eager" - - "--num-gpu-blocks-override" - - "60000" modelservice: vllm: image: registry: docker.io repository: robertgouldshaw2/vllm-nixl - tag: nixl-oh-debug-0.1 + tag: nixl-oh-debug-0.3 epp: defaultEnvVarsOverride: - name: ENABLE_KVCACHE_AWARE_SCORER From 82d1951c4f1e7609b6700de662ca05ba4d3b8147 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 7 Jul 2025 01:36:35 +0000 Subject: [PATCH 10/18] updated Signed-off-by: Robert Shaw --- quickstart/examples/rob-benchmarking/tp-8.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quickstart/examples/rob-benchmarking/tp-8.yaml b/quickstart/examples/rob-benchmarking/tp-8.yaml index 21ed6c8..5699d43 100644 --- a/quickstart/examples/rob-benchmarking/tp-8.yaml +++ b/quickstart/examples/rob-benchmarking/tp-8.yaml @@ -52,7 +52,7 @@ modelservice: image: registry: docker.io repository: robertgouldshaw2/vllm-nixl - tag: nixl-oh-debug-0.3 + tag: nixl-oh-debug-fixed-0.1 epp: defaultEnvVarsOverride: - name: ENABLE_KVCACHE_AWARE_SCORER From b90e3896fafa670b9b5039c48c94a7031a29809d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 7 Jul 2025 01:38:40 +0000 Subject: [PATCH 11/18] updated Signed-off-by: Robert Shaw --- .../modelservice/presets/basic-gpu-with-nixl-preset.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index bb312aa..eebedd8 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -84,10 +84,6 @@ data: - name: VLLM_LOGGING_LEVEL value: {{ .Values.modelservice.vllm.logLevel }} {{- end }} - - name: USE_BATCHED - value: "0" - - name: VLLM_IS_PREFILL - value: "1" - name: VLLM_NIXL_SIDE_CHANNEL_PORT value: "5557" - name: VLLM_NIXL_SIDE_CHANNEL_HOST @@ -187,8 +183,6 @@ data: - name: VLLM_LOGGING_LEVEL value: {{ .Values.modelservice.vllm.logLevel }} {{- end }} - - name: USE_BATCHED - value: "0" - name: VLLM_IS_PREFILL value: "1" - name: VLLM_NIXL_SIDE_CHANNEL_PORT From 7083f624c81d613aa0c927199b610aa0dedfc450 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 10 Jul 2025 02:36:37 +0000 Subject: [PATCH 12/18] updated Signed-off-by: Robert Shaw --- .../{tp-8.yaml => 4p-1d-llama-70b.yaml} | 20 +++++++--------- quickstart/examples/rob-benchmarking/Justfile | 7 ++++-- .../examples/rob-benchmarking/Justfile.remote | 24 ++++++++++++++----- 3 files changed, 32 insertions(+), 19 deletions(-) rename quickstart/examples/rob-benchmarking/{tp-8.yaml => 4p-1d-llama-70b.yaml} (82%) diff --git a/quickstart/examples/rob-benchmarking/tp-8.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml similarity index 82% rename from quickstart/examples/rob-benchmarking/tp-8.yaml rename to quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml index 5699d43..19bd780 100644 --- a/quickstart/examples/rob-benchmarking/tp-8.yaml +++ b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml @@ -1,52 +1,50 @@ sampleApplication: baseConfigMapRefName: basic-gpu-with-nixl-preset model: - modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8 - modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" + modelArtifactURI: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic + modelName: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" auth: hfToken: name: llm-d-hf-token key: HF_TOKEN prefill: - replicas: 1 + replicas: 4 resources: limits: - nvidia.com/gpu: 8 + nvidia.com/gpu: 1 rdma/ib: 1 requests: - nvidia.com/gpu: 8 + nvidia.com/gpu: 1 rdma/ib: 1 cpu: "32" memory: 128Gi extraArgs: - "--tensor-parallel-size" - - "8" + - "1" - "--disable-log-requests" - "--max-model-len" - "32768" - "--block-size" - "128" - - "--enforce-eager" decode: replicas: 1 resources: limits: - nvidia.com/gpu: 8 + nvidia.com/gpu: 4 rdma/ib: 1 requests: - nvidia.com/gpu: 8 + nvidia.com/gpu: 4 rdma/ib: 1 cpu: "32" memory: 128Gi extraArgs: - "--tensor-parallel-size" - - "8" + - "4" - "--disable-log-requests" - "--max-model-len" - "32768" - "--block-size" - "128" - - "--enforce-eager" modelservice: vllm: image: diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile index e69ae46..622c01f 100644 --- a/quickstart/examples/rob-benchmarking/Justfile +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -1,7 +1,10 @@ NAMESPACE := "pete-davidson" -logs POD: - kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" +logs: + kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v "TRANSFER BATCHED" + +logs-stats: + kubectl logs -f $POD -n {{NAMESPACE}} | grep -e "Engine 000:" get-pods: kubectl get pods -n {{NAMESPACE}} -o wide diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote index 97911e0..284e5a8 100644 --- a/quickstart/examples/rob-benchmarking/Justfile.remote +++ b/quickstart/examples/rob-benchmarking/Justfile.remote @@ -1,6 +1,6 @@ # Use this Justfile within the cluster. -MODEL := "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" +MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" BASE_URL := "http://llm-d-inference-gateway" pull: @@ -8,7 +8,7 @@ pull: eval: lm_eval --model local-completions --tasks gsm8k \ - --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \ + --model_args model={{MODEL}},base_url={{BASE_URL}}/v1/completions,num_concurrent=100,max_retries=0,tokenized_requests=False \ --limit 1000 benchmark_one INPUT_LEN: @@ -21,19 +21,31 @@ benchmark_one INPUT_LEN: --num-requests 10 \ --seed $(date +%s) -benchmark RR NUM_REQUESTS INPUT_LEN OUTPUT_LEN: +benchmark_one_no_pd POD_IP INPUT_LEN: + cd vllm && git fetch && git checkout 3c6fd286b40ada67bba98216ed410bb3a0d38b16 && uv pip install pybase64 && \ + python benchmarks/benchmark_one_concurrent.py \ + --base-url http://{{POD_IP}}:8000 \ + --model {{MODEL}} \ + --input-len {{INPUT_LEN}} \ + --output-len 1 \ + --num-requests 10 \ + --seed $(date +%s) + +benchmark CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN: python vllm/benchmarks/benchmark_serving.py \ --base-url {{BASE_URL}} \ --model {{MODEL}} \ --dataset-name random \ --random-input-len {{INPUT_LEN}} \ --random-output-len {{OUTPUT_LEN}} \ - --request-rate {{RR}} \ + --max-concurrency {{CONCURRENCY}} \ --seed $(date +%M%H%M%S) \ --num-prompts {{NUM_REQUESTS}} \ + --percentile-metrics ttft,tpot,itl,e2el \ + --metric-percentiles 90,95,99 \ --ignore-eos -benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN: +benchmark_no_pd POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN: python vllm/benchmarks/benchmark_serving.py \ --base-url http://{{POD_IP}}:8000 \ --model {{MODEL}} \ @@ -42,7 +54,7 @@ benchmark_no_pd_concurrency POD_IP CONCURRENCY NUM_REQUESTS INPUT_LEN OUTPUT_LEN --random-output-len {{OUTPUT_LEN}} \ --max-concurrency {{CONCURRENCY}} \ --num-prompts {{NUM_REQUESTS}} \ - --seed $(date +%s) \ + --seed $(date +%M%H%M%S) \ --percentile-metrics ttft,tpot,itl,e2el \ --metric-percentiles 90,95,99 \ --ignore-eos From 568a58231c3d34e3eadce87383277282da742a71 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 10 Jul 2025 17:30:56 +0000 Subject: [PATCH 13/18] updated Signed-off-by: Robert Shaw --- .../presets/basic-gpu-with-nixl-preset.yaml | 2 ++ .../examples/rob-benchmarking/4p-1d-llama-70b.yaml | 10 +++++----- quickstart/examples/rob-benchmarking/Justfile | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index eebedd8..b3d67a6 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -84,6 +84,8 @@ data: - name: VLLM_LOGGING_LEVEL value: {{ .Values.modelservice.vllm.logLevel }} {{- end }} + - name: VLLM_LOG_XFER_TIME + value: "0" - name: VLLM_NIXL_SIDE_CHANNEL_PORT value: "5557" - name: VLLM_NIXL_SIDE_CHANNEL_HOST diff --git a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml index 19bd780..6d6ccf7 100644 --- a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml +++ b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml @@ -50,15 +50,15 @@ modelservice: image: registry: docker.io repository: robertgouldshaw2/vllm-nixl - tag: nixl-oh-debug-fixed-0.1 + tag: nixl-oh-debug-fixed-0.3 epp: defaultEnvVarsOverride: - name: ENABLE_KVCACHE_AWARE_SCORER value: "false" - name: ENABLE_PREFIX_AWARE_SCORER - value: "true" + value: "false" - name: ENABLE_LOAD_AWARE_SCORER - value: "true" + value: "false" - name: ENABLE_SESSION_AWARE_SCORER value: "false" - name: PD_ENABLED @@ -68,9 +68,9 @@ modelservice: - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER value: "false" - name: PREFILL_ENABLE_LOAD_AWARE_SCORER - value: "true" + value: "false" - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER - value: "true" + value: "false" - name: PREFILL_ENABLE_SESSION_AWARE_SCORER value: "false" redis: diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile index 622c01f..6a52a8d 100644 --- a/quickstart/examples/rob-benchmarking/Justfile +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -1,7 +1,7 @@ NAMESPACE := "pete-davidson" logs: - kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v "TRANSFER BATCHED" + kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v ".get_finished" | grep -v ".transfer_batched" logs-stats: kubectl logs -f $POD -n {{NAMESPACE}} | grep -e "Engine 000:" From c7ef50e5cf248ccf8b3c818a36dc98f330353cce Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 10 Jul 2025 17:31:58 +0000 Subject: [PATCH 14/18] updated Signed-off-by: Robert Shaw --- .../examples/rob-benchmarking/tp-1.yaml | 79 ------------------- 1 file changed, 79 deletions(-) delete mode 100644 quickstart/examples/rob-benchmarking/tp-1.yaml diff --git a/quickstart/examples/rob-benchmarking/tp-1.yaml b/quickstart/examples/rob-benchmarking/tp-1.yaml deleted file mode 100644 index 9bda5ee..0000000 --- a/quickstart/examples/rob-benchmarking/tp-1.yaml +++ /dev/null @@ -1,79 +0,0 @@ -sampleApplication: - baseConfigMapRefName: basic-gpu-with-nixl-preset - model: - modelArtifactURI: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8 - modelName: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" - auth: - hfToken: - name: llm-d-hf-token - key: HF_TOKEN - prefill: - replicas: 1 - resources: - limits: - nvidia.com/gpu: 1 - rdma/ib: 1 - requests: - nvidia.com/gpu: 1 - rdma/ib: 1 - cpu: "32" - memory: 128Gi - extraArgs: - - "--tensor-parallel-size" - - "1" - - "--disable-log-requests" - - "--max-model-len" - - "32768" - - "--block-size" - - "128" - - "--enforce-eager" - decode: - replicas: 1 - resources: - limits: - nvidia.com/gpu: 1 - rdma/ib: 1 - requests: - nvidia.com/gpu: 1 - rdma/ib: 1 - cpu: "32" - memory: 128Gi - extraArgs: - - "--tensor-parallel-size" - - "1" - - "--disable-log-requests" - - "--max-model-len" - - "32768" - - "--block-size" - - "128" - - "--enforce-eager" -modelservice: - vllm: - image: - registry: docker.io - repository: robertgouldshaw2/vllm-nixl - tag: nixl-oh-debug-0.3 - epp: - defaultEnvVarsOverride: - - name: ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: ENABLE_SESSION_AWARE_SCORER - value: "false" - - name: PD_ENABLED - value: "true" - - name: PD_PROMPT_LEN_THRESHOLD - value: "10" - - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER - value: "false" - - name: PREFILL_ENABLE_LOAD_AWARE_SCORER - value: "true" - - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER - value: "true" - - name: PREFILL_ENABLE_SESSION_AWARE_SCORER - value: "false" -redis: - enabled: false \ No newline at end of file From 0d5ecc1b971d077ee7de3ccda1626c39ebc6e293 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 10 Jul 2025 17:33:00 +0000 Subject: [PATCH 15/18] updated Signed-off-by: Robert Shaw --- .../templates/modelservice/presets/basic-gpu-preset.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml index b2c5fd1..1a3480b 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-preset.yaml @@ -75,8 +75,6 @@ data: - "--port" - "8001" env: - - name: VLLM_USE_V1 - value: "1" - name: HOME value: /home {{- if .Values.modelservice.vllm.logLevel }} @@ -156,8 +154,6 @@ data: - "--port" - "8000" env: - - name: VLLM_USE_V1 - value: "1" - name: HOME value: /home {{ if .Values.modelservice.vllm.logLevel }} From 78754690fb84a9c8fef46103acf805210c7d3196 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 10 Jul 2025 17:34:52 +0000 Subject: [PATCH 16/18] slim Signed-off-by: Robert Shaw --- .../modelservice/presets/basic-gpu-with-nixl-preset.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index b3d67a6..3c21815 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -84,8 +84,6 @@ data: - name: VLLM_LOGGING_LEVEL value: {{ .Values.modelservice.vllm.logLevel }} {{- end }} - - name: VLLM_LOG_XFER_TIME - value: "0" - name: VLLM_NIXL_SIDE_CHANNEL_PORT value: "5557" - name: VLLM_NIXL_SIDE_CHANNEL_HOST @@ -99,6 +97,8 @@ data: fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL value: ${POD_IP}:8200 + - name: UCX_TLS + value: "^cuda_ipc" {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -200,6 +200,8 @@ data: fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL value: ${POD_IP}:8200 + - name: UCX_TLS + value: "^cuda_ipc" {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} From 4edbb2d5a82c6374a011b36b14414a5d3ab6786e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 10 Jul 2025 17:46:34 +0000 Subject: [PATCH 17/18] updated to load aware Signed-off-by: Robert Shaw --- quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml index 6d6ccf7..c3b13a1 100644 --- a/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml +++ b/quickstart/examples/rob-benchmarking/4p-1d-llama-70b.yaml @@ -58,7 +58,7 @@ modelservice: - name: ENABLE_PREFIX_AWARE_SCORER value: "false" - name: ENABLE_LOAD_AWARE_SCORER - value: "false" + value: "true" - name: ENABLE_SESSION_AWARE_SCORER value: "false" - name: PD_ENABLED @@ -68,7 +68,7 @@ modelservice: - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER value: "false" - name: PREFILL_ENABLE_LOAD_AWARE_SCORER - value: "false" + value: "true" - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER value: "false" - name: PREFILL_ENABLE_SESSION_AWARE_SCORER From 155838fc884186641615cd8056511bc0847f3b66 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 11 Jul 2025 01:59:09 +0000 Subject: [PATCH 18/18] updated Signed-off-by: Robert Shaw --- quickstart/examples/rob-benchmarking/Justfile | 8 ++++---- quickstart/examples/rob-benchmarking/Justfile.remote | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/quickstart/examples/rob-benchmarking/Justfile b/quickstart/examples/rob-benchmarking/Justfile index 6a52a8d..4b2c3f0 100644 --- a/quickstart/examples/rob-benchmarking/Justfile +++ b/quickstart/examples/rob-benchmarking/Justfile @@ -1,10 +1,10 @@ NAMESPACE := "pete-davidson" -logs: - kubectl logs -f $POD -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v ".get_finished" | grep -v ".transfer_batched" +logs POD: + kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -v "GET /metrics HTTP/1.1" | grep -v ".get_finished" | grep -v ".transfer_batched" -logs-stats: - kubectl logs -f $POD -n {{NAMESPACE}} | grep -e "Engine 000:" +logs-stats POD: + kubectl logs -f {{POD}} -n {{NAMESPACE}} | grep -e "Engine 000:" get-pods: kubectl get pods -n {{NAMESPACE}} -o wide diff --git a/quickstart/examples/rob-benchmarking/Justfile.remote b/quickstart/examples/rob-benchmarking/Justfile.remote index 284e5a8..4e3d64f 100644 --- a/quickstart/examples/rob-benchmarking/Justfile.remote +++ b/quickstart/examples/rob-benchmarking/Justfile.remote @@ -1,6 +1,6 @@ # Use this Justfile within the cluster. -MODEL := "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" +MODEL := "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" BASE_URL := "http://llm-d-inference-gateway" pull: