
Commit aa385c9

feat(server,engine): use nim as runtime (#872)
1 parent 16b51ca commit aa385c9

File tree

23 files changed: +668 -38 lines changed

common/pkg/api/completions.go

Lines changed: 49 additions & 1 deletion
@@ -35,18 +35,23 @@ func ConvertCreateChatCompletionRequestToProto(body []byte) ([]byte, error) {
 }
 
 // ConvertCreateChatCompletionRequestToOpenAI converts the request to the OpenAI format.
-func ConvertCreateChatCompletionRequestToOpenAI(body []byte) ([]byte, error) {
+func ConvertCreateChatCompletionRequestToOpenAI(body []byte, needStringFormat bool) ([]byte, error) {
 	fs := []convertF{
 		// The order of the functions is the opposite of the ConvertCreateChatCompletionRequestToProto.
 		//
 		// We don't have a function that corresponds to convertContentStringToArray as the conversion
 		// doesn't break the OpenAI API spec.
 		convertEncodedTopP,
+		convertEncodedTopP,
 		convertEncodedTemperature,
 		convertEncodedChatTemplateKwargs,
 		convertEncodedFunctionParameters,
 		convertToolChoiceObject,
 	}
+	if needStringFormat {
+		// NIM expects the content field to be a string.
+		fs = append([]convertF{convertContentArrayToString}, fs...)
+	}
 	return applyConvertFuncs(body, fs)
 }
 
@@ -265,3 +270,46 @@ func convertContentStringToArray(r map[string]interface{}) error {
 	}
 	return nil
 }
+
+// convertContentArrayToString converts the content array back to a string for OpenAI format compatibility.
+func convertContentArrayToString(r map[string]interface{}) error {
+	msgs, ok := r["messages"]
+	if !ok {
+		return nil
+	}
+
+	for _, msg := range msgs.([]interface{}) {
+		m := msg.(map[string]interface{})
+		content, ok := m["content"]
+		if !ok {
+			continue
+		}
+
+		// If content is already a string, no conversion needed
+		if _, ok := content.(string); ok {
+			continue
+		}
+
+		// If content is an array, convert it to a string format OpenAI expects
+		if contentArr, ok := content.([]interface{}); ok && len(contentArr) > 0 {
+			// For text-only content, extract just the text
+			if len(contentArr) == 1 {
+				if contentItem, ok := contentArr[0].(map[string]interface{}); ok {
+					if contentType, ok := contentItem["type"].(string); ok && contentType == contentTypeText {
+						if text, ok := contentItem["text"].(string); ok {
+							m["content"] = text
+							continue
+						}
+					} else {
+						// TODO(guangrui): Handle non-text content.
+						return fmt.Errorf("unsupported content type: %s", contentType)
+					}
+				}
+			} else {
+				// TODO(guangrui): Handle more complex content arrays.
+				return fmt.Errorf("content array with multiple items is not supported")
+			}
+		}
+	}
+	return nil
+}
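
To make the new conversion concrete, here is a minimal, self-contained sketch of the flattening it performs (illustrative only; it mirrors the happy path of convertContentArrayToString rather than reusing the package's code):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	in := []byte(`{"messages": [{"role": "user", "content": [{"type": "text", "text": "Hello, world!"}]}]}`)

	var req map[string]interface{}
	if err := json.Unmarshal(in, &req); err != nil {
		panic(err)
	}

	for _, msg := range req["messages"].([]interface{}) {
		m := msg.(map[string]interface{})
		arr, ok := m["content"].([]interface{})
		if !ok || len(arr) != 1 {
			continue // plain strings pass through; the real helper errors on multi-item arrays
		}
		if item, ok := arr[0].(map[string]interface{}); ok && item["type"] == "text" {
			m["content"] = item["text"] // flatten the single text item to a plain string
		}
	}

	out, _ := json.Marshal(req)
	fmt.Println(string(out))
	// Output: {"messages":[{"content":"Hello, world!","role":"user"}]}
}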

common/pkg/api/completions_test.go

Lines changed: 74 additions & 0 deletions
@@ -544,3 +544,77 @@ func TestConvertContentStringToArray(t *testing.T) {
 		})
 	}
 }
+
+func TestConvertContentArrayToString(t *testing.T) {
+	tcs := []struct {
+		name    string
+		body    string
+		want    string
+		wantErr bool
+	}{
+		{
+			name: "no messages field",
+			body: `{"model": "gpt-4"}`,
+			want: `{"model": "gpt-4"}`,
+		},
+		{
+			name: "content already string",
+			body: `{"messages": [{"role": "user", "content": "Hello, world!"}]}`,
+			want: `{"messages": [{"role": "user", "content": "Hello, world!"}]}`,
+		},
+		{
+			name: "no content field",
+			body: `{"messages": [{"role": "user"}]}`,
+			want: `{"messages": [{"role": "user"}]}`,
+		},
+		{
+			name: "single text content in array",
+			body: `{"messages": [{"role": "user", "content": [{"type": "text", "text": "Hello, world!"}]}]}`,
+			want: `{"messages": [{"role": "user", "content": "Hello, world!"}]}`,
+		},
+		{
+			name: "multiple messages mixed formats",
+			body: `{"messages": [
+				{"role": "system", "content": "You are a helpful assistant."},
+				{"role": "user", "content": [{"type": "text", "text": "What's the weather?"}]}
+			]}`,
+			want: `{"messages": [
+				{"role": "system", "content": "You are a helpful assistant."},
+				{"role": "user", "content": "What's the weather?"}
+			]}`,
+		},
+		{
+			name:    "non-text content type",
+			body:    `{"messages": [{"role": "user", "content": [{"type": "image", "image_url": {"url": "https://example.com/image.jpg"}}]}]}`,
+			wantErr: true,
+		},
+		{
+			name:    "multiple content items",
+			body:    `{"messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}, {"type": "text", "text": "world"}]}]}`,
+			wantErr: true,
+		},
+	}
+
+	for _, tc := range tcs {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := applyConvertFuncs([]byte(tc.body), []convertF{convertContentArrayToString})
+
+			if tc.wantErr {
+				assert.Error(t, err)
+				return
+			}
+
+			assert.NoError(t, err)
+
+			// Compare as parsed JSON to ignore formatting differences
+			var gotJSON, wantJSON map[string]interface{}
+			err = json.Unmarshal(got, &gotJSON)
+			assert.NoError(t, err)
+
+			err = json.Unmarshal([]byte(tc.want), &wantJSON)
+			assert.NoError(t, err)
+
+			assert.Equal(t, wantJSON, gotJSON)
+		})
+	}
+}
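
Assuming a checkout of the repository, the new table test runs on its own with: go test -run TestConvertContentArrayToString ./common/pkg/api/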

deployments/engine/templates/_helpers.tpl

Lines changed: 9 additions & 0 deletions
@@ -58,6 +58,15 @@ Create the name of the service account to use
 {{ default (include "inference-manager-engine.fullname" .) .Values.serviceAccount.name }}
 {{- end -}}
 
+{{/*
+For inline NGC key, create image pull secret
+*/}}
+{{- define "inference-manager-engine.generatedImagePullSecret" -}}
+{{- if .Values.nim.ngcApiKey }}
+{{- printf "{\"auths\":{\"nvcr.io\":{\"username\":\"$oauthtoken\",\"password\":\"%s\"}}}" .Values.nim.ngcApiKey | b64enc }}
+{{- end }}
+{{- end }}
+
 {{/*
 Do nothing, just for validation.
 */}}
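
Before base64 encoding, the printf above renders the standard dockerconfigjson payload, with the literal username $oauthtoken that NGC expects: {"auths":{"nvcr.io":{"username":"$oauthtoken","password":"<ngcApiKey>"}}} (the placeholder stands for the configured key).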

deployments/engine/templates/configmap.yaml

Lines changed: 8 additions & 0 deletions
@@ -60,6 +60,14 @@ data:
     vllm:
       dynamicLoRALoading: {{ .Values.vllm.dynamicLoRALoading }}
       loggingLevel: {{ .Values.vllm.loggingLevel }}
+    {{- with .Values.nim }}
+    nim:
+      ngcApiKey: {{ .ngcApiKey | b64enc }}
+      {{- with .models }}
+      models:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    {{- end }}
     model:
       default:
         runtimeName: {{ .Values.model.default.runtimeName }}
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+{{- if .Values.nim.ngcApiKey -}}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-secret
+  labels:
+    {{- include "inference-manager-engine.labels" . | nindent 4 }}
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: {{ template "inference-manager-engine.generatedImagePullSecret" . }}
+
+---
+
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-api
+  labels:
+    {{- include "inference-manager-engine.labels" . | nindent 4 }}
+type: Opaque
+data:
+  NGC_API_KEY: {{ .Values.nim.ngcApiKey | b64enc }}
+{{- end -}}
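
As a sanity check after install (command is illustrative; add namespace and release flags as appropriate), kubectl get secret ngc-secret -o jsonpath='{.data.\.dockerconfigjson}' | base64 -d should print the auths JSON shown above.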

deployments/engine/values.schema.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

deployments/engine/values.yaml

Lines changed: 30 additions & 0 deletions
@@ -277,6 +277,36 @@ vllm:
   # Logging level of VLLM.
   loggingLevel: ERROR
 
+# nim is the settings for using NVIDIA NIM (NVIDIA Inference Microservices) as the serving engine.
+nim:
+  # The NIM API key to use for accessing the NIM API.
+  # +docs:type=string
+  ngcApiKey: ""
+  # The NIM models to use.
+  # For example:
+  # models:
+  #   meta/llama-3.1-8b-instruct:
+  #     image: nvcr.io/nim/meta/llama-3.1-8b-instruct:1.3.3
+  #     imagePullPolicy: IfNotPresent
+  #     modelName: meta/llama-3.1-8b-instruct
+  #     modelVersion: 1.3.3
+  #     openaiPort: 8000
+  #     logLevel: DEBUG
+  #     resources:
+  #       requests:
+  #         cpu: 0
+  #         memory: 0
+  #       limits:
+  #         cpu: 0
+  #         memory: 0
+  #         nvidia.com/gpu: 1
+  #     volume:
+  #       storageClassName: "standard"
+  #       size: "50Gi"
+  #       accessMode: "ReadWriteOnce"
+  # +docs:type=property
+  models: {}
+
 autoscaler:
   # If set to true, the request base autoscaler will be enabled.
   # NOTE: In ollama dynamic-model-loading mode, volume sharing is required.
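
A minimal values override that turns this on might look as follows (illustrative; the file name and the trimmed-down model settings are assumptions based on the commented example above):

# my-nim-values.yaml (hypothetical override file)
nim:
  ngcApiKey: "<your NGC API key>"
  models:
    meta/llama-3.1-8b-instruct:
      image: nvcr.io/nim/meta/llama-3.1-8b-instruct:1.3.3
      modelName: meta/llama-3.1-8b-instruct
      openaiPort: 8000
      resources:
        limits:
          nvidia.com/gpu: 1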

deployments/server/templates/configmap.yaml

Lines changed: 5 additions & 0 deletions
@@ -47,3 +47,8 @@ data:
     gracefulShutdownTimeout: {{ .Values.gracefulShutdownTimeout }}
     serverPodLabelKey: app.kubernetes.io/name
     serverPodLabelValue: {{ include "inference-manager-server.name" . }}
+    {{- with .Values.nimModels }}
+    nimModels:
+      {{- toYaml . | nindent 4 }}
+    {{- end }}
+

deployments/server/values.schema.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

deployments/server/values.yaml

Lines changed: 6 additions & 0 deletions
@@ -89,6 +89,12 @@ vectorStoreManagerServerAddr: vector-store-manager-server-grpc:8081
 # The address of the vector-store-manager-server to call internal vector-store APIs.
 vectorStoreManagerInternalServerAddr: vector-store-manager-server-internal-grpc:8083
 
+# The array of model names to be served by NIM backend.
+# For example:
+# nimModels:
+# - meta/llama-3.1-8b-instruct
+# +docs:type=property
+nimModels: []
 
 engineHeartbeat:
   # Set to true to enable heartbeats from the server to engines.
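
This list plausibly feeds the server's decision to call ConvertCreateChatCompletionRequestToOpenAI with needStringFormat=true for NIM-served models, since NIM expects string content; that wiring is an inference from the pieces in this diff.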

engine/cmd/run.go

Lines changed: 47 additions & 22 deletions
@@ -173,6 +173,7 @@ func run(ctx context.Context, c *config.Config, ns string, lv int) error {
 		modelManager runtime.ModelManager
 	)
 
+	nimModels := make(map[string]bool)
 	errCh := make(chan error)
 	if c.Ollama.DynamicModelLoading {
 		pullerAddr := fmt.Sprintf("%s:%d", ollamaClient.GetName(""), c.Runtime.PullerPort)
@@ -185,27 +186,43 @@ func run(ctx context.Context, c *config.Config, ns string, lv int) error {
 		modelManager = ollamaManager
 
 	} else {
+		clients := map[string]runtime.Client{
+			config.RuntimeNameOllama: ollamaClient,
+			config.RuntimeNameVLLM: runtime.NewVLLMClient(
+				mgr.GetClient(),
+				ns,
+				owner,
+				&c.Runtime,
+				processedConfig,
+				modelClient,
+				&c.VLLM,
+			),
+			config.RuntimeNameTriton: runtime.NewTritonClient(
+				mgr.GetClient(),
+				ns,
+				owner,
+				&c.Runtime,
+				processedConfig,
+			),
+		}
+
+		nimClients := make(map[string]runtime.Client)
+		for _, model := range c.NIM.Models {
+			nimClients[model.ModelName] = runtime.NewNIMClient(
+				mgr.GetClient(),
+				ns,
+				owner,
+				&c.Runtime,
+				&c.NIM,
+				&model,
+			)
+			nimModels[model.ModelName] = true
+		}
+
 		rtClientFactory := &clientFactory{
-			config: c,
-			clients: map[string]runtime.Client{
-				config.RuntimeNameOllama: ollamaClient,
-				config.RuntimeNameVLLM: runtime.NewVLLMClient(
-					mgr.GetClient(),
-					ns,
-					owner,
-					&c.Runtime,
-					processedConfig,
-					modelClient,
-					&c.VLLM,
-				),
-				config.RuntimeNameTriton: runtime.NewTritonClient(
-					mgr.GetClient(),
-					ns,
-					owner,
-					&c.Runtime,
-					processedConfig,
-				),
-			},
+			config:     c,
+			clients:    clients,
+			nimClients: nimClients,
 		}
 
 		rtManager := runtime.NewManager(
@@ -215,6 +232,7 @@ func run(ctx context.Context, c *config.Config, ns string, lv int) error {
 		modelClient,
 		c.VLLM.DynamicLoRALoading,
 		c.Runtime.PullerPort,
+		nimModels,
 	)
 	if err := rtManager.SetupWithManager(mgr, leaderElection); err != nil {
 		return err
@@ -278,6 +296,7 @@ func run(ctx context.Context, c *config.Config, ns string, lv int) error {
 		logger,
 		collector,
 		c.GracefulShutdownTimeout,
+		nimModels,
 	)
 	if err := p.SetupWithManager(mgr, leaderElection); err != nil {
 		return err
@@ -323,11 +342,17 @@ func (f *grpcClientFactory) Create() (processor.ProcessTasksClient, func(), error) {
 }
 
 type clientFactory struct {
-	config  *config.Config
-	clients map[string]runtime.Client
+	config     *config.Config
+	clients    map[string]runtime.Client
+	nimClients map[string]runtime.Client
 }
 
 func (f *clientFactory) New(modelID string) (runtime.Client, error) {
+	// skip processing model config if the model is served by NIM runtime.
+	if _, ok := f.config.NIM.Models[modelID]; ok {
+		return f.nimClients[modelID], nil
+	}
+
 	mci := config.NewProcessedModelConfig(f.config).ModelConfigItem(modelID)
 	c, ok := f.clients[mci.RuntimeName]
 	if !ok {
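
Since the factory change is the heart of the routing, here is a compilable miniature of that dispatch (types and names simplified; not the engine's actual code):

package sketch

import "fmt"

// Client stands in for the engine's runtime.Client interface (simplified).
type Client interface{ IsReady() bool }

// resolveClient mirrors the dispatch order of clientFactory.New above: a model
// served by NIM gets its dedicated per-model client and skips the
// processed-model-config lookup; everything else is routed by runtime name.
func resolveClient(nimClients, clients map[string]Client, modelID, runtimeName string) (Client, error) {
	if c, ok := nimClients[modelID]; ok {
		return c, nil // NIM-managed model: one client per model
	}
	c, ok := clients[runtimeName] // e.g. ollama, vllm, triton
	if !ok {
		return nil, fmt.Errorf("unsupported runtime: %s", runtimeName)
	}
	return c, nil
}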

0 commit comments