From 02c7f2bf8051addc3abafa9d03d39794d4bdb9c1 Mon Sep 17 00:00:00 2001
From: Chitoku YATO
Date: Fri, 14 Mar 2025 22:06:37 -0700
Subject: [PATCH 1/2] Add Max Context Len value to work around the context len
 1 error

---
 docs/portal/dist/db.json | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/portal/dist/db.json b/docs/portal/dist/db.json
index cca31b1e..d44c48f7 100644
--- a/docs/portal/dist/db.json
+++ b/docs/portal/dist/db.json
@@ -64,14 +64,15 @@
         "name": "dustynv/vllm:0.7.4-r36.4.0-cu128-24.04",
         "docker_image": "dustynv/vllm:0.7.4-r36.4.0-cu128-24.04",
         "docker_cmd": "vllm serve ${MODEL}",
-        "docker_args": "--host=${SERVER_ADDR} --port=${SERVER_PORT} --dtype=auto --max-num-seqs=${MAX_BATCH_SIZE} --max-model-len=${MAX_CONTEXT_LEN} --gpu-memory-utilization=0.75",
+        "docker_args": "--host=${SERVER_ADDR} --port=${SERVER_PORT} --dtype=auto --max-num-seqs=${MAX_BATCH_SIZE} --max-model-len=${MAX_CONTEXT_LEN} --gpu-memory-utilization=0.757",
         "docker_options": "-it --rm",
         "server_host": "0.0.0.0:9000",
         "tags": [
           "container",
           "vllm",
           "l4t-r36"
-        ]
+        ],
+        "max_context_len": 8192
       },
       "sudonim": {
         "docker_cmd": "sudonim serve",

From aba652ab30b174bfbfd41a16e2a4fe98bf2f3365 Mon Sep 17 00:00:00 2001
From: Chitoku YATO
Date: Fri, 14 Mar 2025 22:35:39 -0700
Subject: [PATCH 2/2] Prefill hf_token with ENV variable notation

---
 docs/portal/dist/db.json | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/portal/dist/db.json b/docs/portal/dist/db.json
index d44c48f7..baf5bfa1 100644
--- a/docs/portal/dist/db.json
+++ b/docs/portal/dist/db.json
@@ -64,7 +64,7 @@
         "name": "dustynv/vllm:0.7.4-r36.4.0-cu128-24.04",
         "docker_image": "dustynv/vllm:0.7.4-r36.4.0-cu128-24.04",
         "docker_cmd": "vllm serve ${MODEL}",
-        "docker_args": "--host=${SERVER_ADDR} --port=${SERVER_PORT} --dtype=auto --max-num-seqs=${MAX_BATCH_SIZE} --max-model-len=${MAX_CONTEXT_LEN} --gpu-memory-utilization=0.757",
+        "docker_args": "--host=${SERVER_ADDR} --port=${SERVER_PORT} --dtype=auto --max-num-seqs=${MAX_BATCH_SIZE} --max-model-len=${MAX_CONTEXT_LEN} --gpu-memory-utilization=0.75",
         "docker_options": "-it --rm",
         "server_host": "0.0.0.0:9000",
         "tags": [
@@ -72,7 +72,8 @@
           "vllm",
           "l4t-r36"
         ],
-        "max_context_len": 8192
+        "max_context_len": 8192,
+        "hf_token": "${HUGGINGFACE_TOKEN}"
       },
       "sudonim": {
         "docker_cmd": "sudonim serve",
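
For context on why these two values matter, below is a minimal sketch of how a front end like the docs portal might expand the `${VAR}` placeholders in this `db.json` entry. It is illustrative only and assumes a simple `string.Template`-style substitution; the variable names and the default values here are hypothetical, not the portal's actual code. It shows the failure mode PATCH 1/2 works around (an unset `MAX_CONTEXT_LEN` leaving `--max-model-len` without a usable value) and how the `${HUGGINGFACE_TOKEN}` notation from PATCH 2/2 resolves against the environment.

```python
# Hypothetical sketch, not the portal's actual code: expand the
# ${VAR} placeholders of a db.json entry against the environment,
# falling back to defaults derived from the entry itself.
import os
from string import Template

entry = {
    "docker_cmd": "vllm serve ${MODEL}",
    "docker_args": (
        "--host=${SERVER_ADDR} --port=${SERVER_PORT} --dtype=auto "
        "--max-num-seqs=${MAX_BATCH_SIZE} --max-model-len=${MAX_CONTEXT_LEN} "
        "--gpu-memory-utilization=0.75"
    ),
    "server_host": "0.0.0.0:9000",
    "max_context_len": 8192,             # added by PATCH 1/2
    "hf_token": "${HUGGINGFACE_TOKEN}",  # added by PATCH 2/2
}

addr, port = entry["server_host"].split(":")
defaults = {
    "SERVER_ADDR": addr,
    "SERVER_PORT": port,
    "MAX_BATCH_SIZE": "1",  # illustrative default
    # Without the max_context_len default from PATCH 1/2, an unset
    # MAX_CONTEXT_LEN has no fallback, and --max-model-len ends up
    # empty or degenerate (the "context len 1" error the patch avoids).
    "MAX_CONTEXT_LEN": str(entry["max_context_len"]),
}
values = {**defaults, **os.environ}  # real environment variables win

# safe_substitute leaves unresolved placeholders (e.g. ${MODEL})
# intact instead of raising KeyError.
args = Template(entry["docker_args"]).safe_substitute(values)
print(args)

# The ${HUGGINGFACE_TOKEN} notation from PATCH 2/2 expands the same
# way; os.path.expandvars leaves it untouched if the variable is unset.
print(os.path.expandvars(entry["hf_token"]))
```

Running it as, say, `MAX_CONTEXT_LEN=4096 HUGGINGFACE_TOKEN=hf_xxx python expand.py` shows the environment overriding the entry-derived defaults, which is the behavior the `${...}` notation in both patches relies on.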