@@ -70,7 +70,14 @@ logRequests: true
70
70
# define valid model values and the upstream server start
71
71
models :
72
72
" llama " :
73
- cmd : llama-server --port 8999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf
73
+ # multiline for readability
74
+ cmd : >
75
+ llama-server --port 8999
76
+ --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
77
+
78
+ # environment variables to pass to the command
79
+ env :
80
+ - " CUDA_VISIBLE_DEVICES=0"
74
81
75
82
# where to reach the server started by cmd, make sure the ports match
76
83
proxy : http://127.0.0.1:8999
@@ -91,16 +98,9 @@ models:
91
98
# default: 0 = never unload model
92
99
ttl : 60
93
100
94
- " qwen " :
95
- # environment variables to pass to the command
96
- env :
97
- - " CUDA_VISIBLE_DEVICES=0"
98
-
99
- # multiline for readability
100
- cmd : >
101
- llama-server --port 8999
102
- --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
103
- proxy : http://127.0.0.1:8999
101
+ # `useModelName` overrides the model name in the request
102
+ # and sends a specific name to the upstream server
103
+ useModelName : " qwen:qwq"
104
104
105
105
# unlisted models do not show up in /v1/models or /upstream lists
106
106
# but they can still be requested as normal
@@ -117,23 +117,16 @@ models:
117
117
ghcr.io/ggerganov/llama.cpp:server
118
118
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
119
119
120
- # `useModelName` will send a specific model name to the upstream server
121
- # overriding whatever was set in the request
122
- " qwq " :
123
- proxy : http://127.0.0.1:11434
124
- cmd : my-server
125
- useModelName : " qwen:qwq"
126
-
127
- # profiles make it easy to managing multi model (and gpu) configurations.
120
+ # profiles eliminate swapping by running multiple models at the same time
128
121
#
129
122
# Tips:
130
123
# - each model must be listening on a unique address and port
131
124
# - the model name is in this format: "profile_name:model", like "coding:qwen"
132
125
# - the profile will load and unload all models in the profile at the same time
133
126
profiles :
134
127
coding :
135
- - " qwen"
136
128
- " llama"
129
+ - " qwen-unlisted"
137
130
` ` `
138
131
139
132
### Use Case Examples
0 commit comments