From 486ba5f20b740bb06cd30f66359579077164c27b Mon Sep 17 00:00:00 2001 From: Brent Salisbury Date: Sun, 8 Jun 2025 23:36:22 -0400 Subject: [PATCH] Add /v1/chat/completions to quickstart validation script Signed-off-by: Brent Salisbury --- quickstart/README-minikube.md | 4 +-- quickstart/test-request.sh | 62 +++++++++++++++++++++++++++++++---- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/quickstart/README-minikube.md b/quickstart/README-minikube.md index 09e93ab..b63043c 100644 --- a/quickstart/README-minikube.md +++ b/quickstart/README-minikube.md @@ -162,8 +162,8 @@ export HF_TOKEN="your-token" The inference-gateway serves as the HTTP ingress point for all inference requests in our deployment. It’s implemented as a Kubernetes Gateway (`gateway.networking.k8s.io/v1`) using whichever `gatewayClassName` you’ve chosen, either `kgateway` or `istio` and sits in front of your inference pods to handle path-based routing, load-balancing, -retries, and metrics. All calls to `/v1/models` and `/v1/completions` flow through this gateway to the appropriate -`decode` or `prefill` services. +retries, and metrics. API requests such as `/v1/models`, `/v1/completions` and `/v1/chat/completions` are exposed via the +gateway and routed to the appropriate `decode` or `prefill` services. ```bash # ------------------------------------------------------------------------- diff --git a/quickstart/test-request.sh b/quickstart/test-request.sh index 26f0afc..60a19ce 100755 --- a/quickstart/test-request.sh +++ b/quickstart/test-request.sh @@ -122,7 +122,7 @@ validation() { echo # ── 2) POST /v1/completions on decode pod ────────────────────────────────── - echo "2 -> Sending a completion request to the decode pod at ${POD_IP}…" + echo "2 -> Sending a /v1/completions request to the decode pod at ${POD_IP}…" ID=$(gen_id) kubectl run --rm -i curl-"$ID" \ --namespace "$NAMESPACE" \ @@ -136,9 +136,25 @@ validation() { }' echo - # 3) GET /v1/models via the gateway + # ── 3) POST /v1/chat/completions on decode pod ───────────────────────────── + echo "3 -> Sending a /v1/chat/completions request to the decode pod at ${POD_IP}…" + ID=$(gen_id) + kubectl run --rm -i curl-"$ID" \ + --namespace "$NAMESPACE" \ + --image=curlimages/curl --restart=Never -- \ + curl -sS -X POST http://${POD_IP}:8000/v1/chat/completions \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model":"'"$MODEL_ID"'", + "messages":[{"role":"user","content":"Who are you?"}], + "stream":false + }' + echo + + # ── 4) GET /v1/models via the gateway ─────────────────────────────────────── GATEWAY_ADDR=$(kubectl get gateway -n "$NAMESPACE" | tail -n1 | awk '{print $3}') - echo "3 -> Fetching available models via the gateway at ${GATEWAY_ADDR}…" + echo "4 -> Fetching available models via the gateway at ${GATEWAY_ADDR}…" ID=$(gen_id) GW_JSON=$(kubectl run --rm -i curl-"$ID" \ --namespace "$NAMESPACE" \ @@ -156,8 +172,8 @@ validation() { fi echo - # ── 4) POST /v1/completions via gateway ──────────────────────────────────── - echo "4 -> Sending a completion request via the gateway at ${GATEWAY_ADDR} with model '${MODEL_ID}'…" + # ── 5) POST /v1/completions via gateway ──────────────────────────────────── + echo "5 -> Sending a /v1/completions request via the gateway at ${GATEWAY_ADDR} with model '${MODEL_ID}'…" ID=$(gen_id) kubectl run --rm -i curl-"$ID" \ --namespace "$NAMESPACE" \ @@ -170,6 +186,22 @@ validation() { "prompt":"Who are you?" }' echo + + # ── 6) POST /v1/chat/completions via gateway ────────────────────────────── + echo "6 -> Sending a /v1/chat/completions request via the gateway at ${GATEWAY_ADDR} with model '${MODEL_ID}'…" + ID=$(gen_id) + kubectl run --rm -i curl-"$ID" \ + --namespace "$NAMESPACE" \ + --image=curlimages/curl --restart=Never -- \ + curl -sS -X POST http://${GATEWAY_ADDR}/v1/chat/completions \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model":"'"$MODEL_ID"'", + "messages":[{"role":"user","content":"Who are you?"}], + "stream":false + }' + echo } # ── Minikube gateway validation ─────────────────────────────────────────────── @@ -178,7 +210,7 @@ minikube_validation() { echo "Minikube validation: hitting gateway DNS at ${SVC_HOST}" # 1) GET /v1/models via DNS gateway - echo "1 -> GET /v1/models via DNS at ${SVC_HOST}…" + echo "1 -> Sending a GET /v1/models via DNS at ${SVC_HOST}…" ID=$(gen_id) LIST_JSON=$(kubectl run --rm -i curl-"$ID" \ --namespace "$NAMESPACE" \ @@ -203,7 +235,7 @@ minikube_validation() { echo # 2) POST /v1/completions via DNS gateway - echo "2 -> POST /v1/completions via DNS at ${SVC_HOST} with model '${MODEL_ID}'…" + echo "2 -> Sending a POST /v1/completions via DNS at ${SVC_HOST} with model '${MODEL_ID}'…" ID=$(gen_id) kubectl run --rm -i curl-"$ID" \ --namespace "$NAMESPACE" \ @@ -216,6 +248,22 @@ minikube_validation() { "prompt":"You are a helpful AI assistant." }' echo + + # ── 3) POST /v1/chat/completions via DNS gateway + echo "3 -> Sending a POST /v1/chat/completions via DNS at ${SVC_HOST} with model '${MODEL_ID}'…" + ID=$(gen_id) + kubectl run --rm -i curl-"$ID" \ + --namespace "$NAMESPACE" \ + --image=curlimages/curl --restart=Never -- \ + curl -sS -X POST http://${SVC_HOST}/v1/chat/completions \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model":"'"$MODEL_ID"'", + "messages":[{"role":"user","content":"You are a helpful AI assistant."}], + "stream":false + }' + echo } # ── Main ───────────────────────────────────────────