
Add Mistral AI Chat Completion support to Inference Plugin #128538

Merged
Changes from 12 commits

Commits (24)
f7dc246
Add Mistral AI Chat Completion support to Inference Plugin
Jan-Kazlouski-elastic May 27, 2025
0aa8da8
Add changelog file
Jan-Kazlouski-elastic May 27, 2025
c3a8716
Fix tests and typos
Jan-Kazlouski-elastic May 27, 2025
69f16b3
Merge remote-tracking branch 'refs/remotes/origin/main' into feature/…
Jan-Kazlouski-elastic May 29, 2025
91f8ccf
Refactor Mistral chat completion integration and add tests
Jan-Kazlouski-elastic May 29, 2025
ff81e36
Refactor Mistral error response handling and extract StreamingErrorRe…
Jan-Kazlouski-elastic May 29, 2025
5a9ce48
Add Mistral chat completion request and response tests
Jan-Kazlouski-elastic May 30, 2025
17dead3
Enhance error response documentation and clarify StreamingErrorRespon…
Jan-Kazlouski-elastic Jun 1, 2025
74b3df6
Refactor Mistral chat completion request handling and introduce skip …
Jan-Kazlouski-elastic Jun 1, 2025
d50bc76
Refactor MistralChatCompletionServiceSettings to include rateLimitSet…
Jan-Kazlouski-elastic Jun 1, 2025
4824f12
Enhance MistralErrorResponse documentation with detailed error examples
Jan-Kazlouski-elastic Jun 1, 2025
158622e
Add comment for Mistral-specific 422 validation error in OpenAiRespon…
Jan-Kazlouski-elastic Jun 1, 2025
60df2f7
Merge remote-tracking branch 'origin/main' into feature/mistral-chat-…
Jan-Kazlouski-elastic Jun 1, 2025
34ca847
[CI] Auto commit changes from spotless
Jun 2, 2025
cc13241
Merge remote-tracking branch 'origin/main' into feature/mistral-chat-…
Jan-Kazlouski-elastic Jun 2, 2025
24c52e8
Refactor OpenAiUnifiedChatCompletionRequestEntity to remove unused fi…
Jan-Kazlouski-elastic Jun 2, 2025
f184fc7
Refactor UnifiedChatCompletionRequestEntity and UnifiedCompletionRequ…
Jan-Kazlouski-elastic Jun 2, 2025
5cc7402
Refactor MistralChatCompletionRequestEntityTests to improve JSON asse…
Jan-Kazlouski-elastic Jun 2, 2025
977bfc4
Add unit tests for MistralUnifiedChatCompletionResponseHandler to val…
Jan-Kazlouski-elastic Jun 4, 2025
f49fac2
Add unit tests for MistralService
Jan-Kazlouski-elastic Jun 4, 2025
7505915
Merge remote-tracking branch 'origin/main' into feature/mistral-chat-…
Jan-Kazlouski-elastic Jun 4, 2025
fb2be46
Merge remote-tracking branch 'origin/main' into feature/mistral-chat-…
Jan-Kazlouski-elastic Jun 4, 2025
68a5432
Update expected service count in testGetServicesWithCompletionTaskType
Jan-Kazlouski-elastic Jun 4, 2025
102da20
Merge remote-tracking branch 'origin/main' into feature/mistral-chat-…
Jan-Kazlouski-elastic Jun 4, 2025
2 changes: 1 addition & 1 deletion docs/changelog/128538.yaml
@@ -1,5 +1,5 @@
pr: 128538
summary: "[ML] Add Mistral Chat Completion support to the Inference Plugin"
summary: "Added Mistral Chat Completion support to the Inference Plugin"
area: Machine Learning
type: enhancement
issues: []
10 changes: 8 additions & 2 deletions server/src/main/java/org/elasticsearch/TransportVersions.java
@@ -182,7 +182,9 @@ static TransportVersion def(int id) {
public static final TransportVersion RERANKER_FAILURES_ALLOWED_8_19 = def(8_841_0_35);
public static final TransportVersion ML_INFERENCE_HUGGING_FACE_RERANK_ADDED_8_19 = def(8_841_0_36);
public static final TransportVersion ML_INFERENCE_SAGEMAKER_CHAT_COMPLETION_8_19 = def(8_841_0_37);
public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED_8_19 = def(8_841_0_38);
public static final TransportVersion ML_INFERENCE_VERTEXAI_CHATCOMPLETION_ADDED_8_19 = def(8_841_0_38);
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED_8_19 = def(8_841_0_39);
public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED_8_19 = def(8_841_0_40);
public static final TransportVersion V_9_0_0 = def(9_000_0_09);
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10);
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_2 = def(9_000_0_11);
@@ -268,7 +270,11 @@ static TransportVersion def(int id) {
public static final TransportVersion ML_INFERENCE_HUGGING_FACE_RERANK_ADDED = def(9_080_0_00);
public static final TransportVersion SETTINGS_IN_DATA_STREAMS_DRY_RUN = def(9_081_0_00);
public static final TransportVersion ML_INFERENCE_SAGEMAKER_CHAT_COMPLETION = def(9_082_0_00);
public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED = def(9_083_0_00);
public static final TransportVersion ML_INFERENCE_VERTEXAI_CHATCOMPLETION_ADDED = def(9_083_0_00);
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED = def(9_084_0_00);
public static final TransportVersion ESQL_LIMIT_ROW_SIZE = def(9_085_0_00);
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_086_0_00);
public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED = def(9_087_0_00);
/*
* STOP! READ THIS FIRST! No, really,
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _
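The constant was renumbered because new transport versions (Vertex AI chat completion, the custom service, and the ES|QL changes) landed on main while this PR was open. As a hedged sketch, not code from this diff, such a constant is typically consulted when gating newly serialized fields; the concrete Mistral call sites are not shown here:

// Sketch only: typical gating on a TransportVersion constant during serialization.
@Override
public void writeTo(StreamOutput out) throws IOException {
    if (out.getTransportVersion().onOrAfter(TransportVersions.ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED)) {
        // write the fields introduced alongside this version
    }
}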
@@ -78,6 +78,11 @@ public record UnifiedCompletionRequest(
* {@link #MAX_COMPLETION_TOKENS_FIELD}. Providers are expected to pass in their supported field name.
*/
private static final String MAX_TOKENS_PARAM = "max_tokens_field";
/**
* Some providers don't support the stream_options field.
* This parameter is used to skip the stream_options field in the JSON output.
*/
public static final String SKIP_STREAM_OPTIONS_PARAM = "skip_stream_options";

/**
* Creates a {@link org.elasticsearch.xcontent.ToXContent.Params} that causes ToXContent to include the key values:
@@ -91,6 +96,23 @@ public static Params withMaxTokens(String modelId, Params params) {
);
}

/**
* Creates a {@link org.elasticsearch.xcontent.ToXContent.Params} that causes ToXContent to include the key values:
* - Key: {@link #MODEL_FIELD}, Value: modelId
* - Key: {@link #MAX_TOKENS_FIELD}, Value: {@link #MAX_TOKENS_FIELD}
* - Key: {@link #SKIP_STREAM_OPTIONS_PARAM}, Value: "true"
*/
public static Params withMaxTokensAndSkipStreamOptionsField(String modelId, Params params) {
return new DelegatingMapParams(
Map.ofEntries(
Map.entry(MODEL_ID_PARAM, modelId),
Map.entry(MAX_TOKENS_PARAM, MAX_TOKENS_FIELD),
Map.entry(SKIP_STREAM_OPTIONS_PARAM, Boolean.TRUE.toString())
),
params
);
}

/**
* Creates a {@link org.elasticsearch.xcontent.ToXContent.Params} that causes ToXContent to include the key values:
* - Key: {@link #MODEL_FIELD}, Value: modelId
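A hedged usage sketch of the new factory method (imports omitted; the model id below is an assumed value, purely for illustration): a provider that rejects stream_options builds its Params with the new helper, and serialization code reads the flag back via paramAsBoolean.

// Sketch only: "mistral-small-latest" is an assumed model id, not taken from this PR.
ToXContent.Params params = UnifiedCompletionRequest.withMaxTokensAndSkipStreamOptionsField(
    "mistral-small-latest",
    ToXContent.EMPTY_PARAMS
);
// A request entity serializing with these params can then omit stream_options:
boolean skipStreamOptions = params.paramAsBoolean(UnifiedCompletionRequest.SKIP_STREAM_OPTIONS_PARAM, false); // true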
@@ -154,15 +154,16 @@ public void testGetServicesWithCompletionTaskType() throws IOException {
"openai",
"streaming_completion_test_service",
"hugging_face",
"amazon_sagemaker"
"amazon_sagemaker",
"mistral"
).toArray()
)
);
}

public void testGetServicesWithChatCompletionTaskType() throws IOException {
List<Object> services = getServices(TaskType.CHAT_COMPLETION);
assertThat(services.size(), equalTo(7));
assertThat(services.size(), equalTo(8));

var providers = providers(services);

@@ -176,7 +177,8 @@ public void testGetServicesWithChatCompletionTaskType() throws IOException {
"streaming_completion_test_service",
"hugging_face",
"amazon_sagemaker",
"googlevertexai"
"googlevertexai",
"mistral"
).toArray()
)
);
@@ -21,12 +21,13 @@
* A pattern is emerging in how external providers provide error responses.
*
* At a minimum, these return:
* <pre><code>
* {
*     "error": {
* "message": "(error message)"
* }
* }
*
* </code></pre>
* Others may return additional information such as error codes specific to the service.
*
* This currently covers error handling for Azure AI Studio, however this pattern
@@ -0,0 +1,128 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.inference.external.response.streaming;

import org.elasticsearch.core.Nullable;
import org.elasticsearch.xcontent.ConstructingObjectParser;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xpack.inference.external.http.HttpResult;
import org.elasticsearch.xpack.inference.external.http.retry.ErrorResponse;
import org.elasticsearch.xpack.inference.external.response.ErrorMessageResponseEntity;

import java.util.Objects;
import java.util.Optional;

/**
* Represents an error response from a streaming inference service.
* This class extends {@link ErrorResponse} and provides additional fields
* specific to streaming errors, such as code, param, and type.
* An example error response for a streaming service might look like:
* <pre><code>
* {
* "error": {
* "message": "Invalid input",
* "code": "400",
* "param": "input",
* "type": "invalid_request_error"
* }
* }
* </code></pre>
* TODO: {@link ErrorMessageResponseEntity} is nearly identical to this, but doesn't parse as many fields. We must remove the duplication.
*/
public class StreamingErrorResponse extends ErrorResponse {
Contributor:
Can you add a comment with an example error message that this would parse? Let's also add a TODO to note that ErrorMessageResponseEntity (https://github.com/elastic/elasticsearch/blob/main/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/external/response/ErrorMessageResponseEntity.java) is nearly identical (it doesn't parse as many fields) and we should remove the duplication.

Jan-Kazlouski-elastic (Contributor Author), Jun 1, 2025:
Done.

private static final ConstructingObjectParser<Optional<ErrorResponse>, Void> ERROR_PARSER = new ConstructingObjectParser<>(
"streaming_error",
true,
args -> Optional.ofNullable((StreamingErrorResponse) args[0])
);
private static final ConstructingObjectParser<StreamingErrorResponse, Void> ERROR_BODY_PARSER = new ConstructingObjectParser<>(
"streaming_error",
true,
args -> new StreamingErrorResponse((String) args[0], (String) args[1], (String) args[2], (String) args[3])
);

static {
ERROR_BODY_PARSER.declareString(ConstructingObjectParser.constructorArg(), new ParseField("message"));
ERROR_BODY_PARSER.declareStringOrNull(ConstructingObjectParser.optionalConstructorArg(), new ParseField("code"));
ERROR_BODY_PARSER.declareStringOrNull(ConstructingObjectParser.optionalConstructorArg(), new ParseField("param"));
ERROR_BODY_PARSER.declareString(ConstructingObjectParser.constructorArg(), new ParseField("type"));

ERROR_PARSER.declareObjectOrNull(
ConstructingObjectParser.optionalConstructorArg(),
ERROR_BODY_PARSER,
null,
new ParseField("error")
);
}

/**
* Standard error response parser. This can be overridden for those subclasses that
* have a different error response structure.
* @param response The error response as an HttpResult
*/
public static ErrorResponse fromResponse(HttpResult response) {
try (
XContentParser parser = XContentFactory.xContent(XContentType.JSON)
.createParser(XContentParserConfiguration.EMPTY, response.body())
) {
return ERROR_PARSER.apply(parser, null).orElse(ErrorResponse.UNDEFINED_ERROR);
} catch (Exception e) {
// swallow the error
}

return ErrorResponse.UNDEFINED_ERROR;
}

/**
* Standard error response parser. This can be overridden for those subclasses that
* have a different error response structure.
* @param response The error response as a string
*/
public static ErrorResponse fromString(String response) {
try (
XContentParser parser = XContentFactory.xContent(XContentType.JSON).createParser(XContentParserConfiguration.EMPTY, response)
) {
return ERROR_PARSER.apply(parser, null).orElse(ErrorResponse.UNDEFINED_ERROR);
} catch (Exception e) {
// swallow the error
}

return ErrorResponse.UNDEFINED_ERROR;
}

@Nullable
private final String code;
@Nullable
private final String param;
private final String type;

StreamingErrorResponse(String errorMessage, @Nullable String code, @Nullable String param, String type) {
super(errorMessage);
this.code = code;
this.param = param;
this.type = Objects.requireNonNull(type);
}

@Nullable
public String code() {
return code;
}

@Nullable
public String param() {
return param;
}

public String type() {
return type;
}
}
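For context, a hedged sketch of how the parser behaves on the payload documented in the class javadoc (not part of this PR's diff; the getter names below are those declared in this class):

// Sketch only: feeding the javadoc example through the string parser.
ErrorResponse error = StreamingErrorResponse.fromString(
    "{\"error\":{\"message\":\"Invalid input\",\"code\":\"400\",\"param\":\"input\",\"type\":\"invalid_request_error\"}}"
);
if (error instanceof StreamingErrorResponse streamingError) {
    String type = streamingError.type();   // "invalid_request_error"
    String code = streamingError.code();   // "400"
    String param = streamingError.param(); // "input"
}
// Any payload that does not match this shape falls back to ErrorResponse.UNDEFINED_ERROR.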
@@ -15,6 +15,8 @@
import java.io.IOException;
import java.util.Objects;

import static org.elasticsearch.inference.UnifiedCompletionRequest.SKIP_STREAM_OPTIONS_PARAM;

/**
* Represents a unified chat completion request entity.
* This class is used to convert the unified chat input into a format that can be serialized to XContent.
@@ -46,20 +48,13 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field(NUMBER_OF_RETURNED_CHOICES_FIELD, 1);

builder.field(STREAM_FIELD, stream);
if (stream) {
fillStreamOptionsFields(builder);
// If request is streamed and skip stream options parameter is not true, include stream options in the request.
if (stream == true && params.paramAsBoolean(SKIP_STREAM_OPTIONS_PARAM, false) == false) {
Contributor:
nit: How about we reverse the naming? "skip" plus false reads close to a double negative to me, so maybe:
if (stream && params.paramAsBoolean(INCLUDE_STREAM_OPTIONS_PARAM, true) == true) {

Jan-Kazlouski-elastic (Contributor Author), Jun 2, 2025:
Good thinking. Defaulting the boolean to true means we don't have to set it for every other provider. I took another look at CONTRIBUTING.md: according to it, we should use an == check on boolean values only when checking for "false", so I replaced it with:
stream && params.paramAsBoolean(INCLUDE_STREAM_OPTIONS_PARAM, true)
I also extended the javadoc for INCLUDE_STREAM_OPTIONS_PARAM.

builder.startObject(STREAM_OPTIONS_FIELD);
builder.field(INCLUDE_USAGE_FIELD, true);
builder.endObject();
}

return builder;
}

/**
* This method is used to fill the stream options fields in the request entity.
* It is called when the stream option is set to true.
*/
protected void fillStreamOptionsFields(XContentBuilder builder) throws IOException {
builder.startObject(STREAM_OPTIONS_FIELD);
builder.field(INCLUDE_USAGE_FIELD, true);
builder.endObject();
}
}
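The practical effect, sketched under the assumption that the surrounding fields are unchanged: the flag only controls whether stream_options is emitted alongside stream.

// Sketch of the serialized tail of the request body:
// - stream == true, SKIP_STREAM_OPTIONS_PARAM unset (the default for existing providers):
//     "stream": true,
//     "stream_options": { "include_usage": true }
// - stream == true, SKIP_STREAM_OPTIONS_PARAM set to "true" (the Mistral chat completion path):
//     "stream": true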
@@ -0,0 +1,29 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.inference.services.mistral;

import org.elasticsearch.xpack.inference.external.http.retry.ResponseParser;
import org.elasticsearch.xpack.inference.services.mistral.response.MistralErrorResponse;
import org.elasticsearch.xpack.inference.services.openai.OpenAiChatCompletionResponseHandler;

/**
* Handles non-streaming completion responses for Mistral models, extending the OpenAI completion response handler.
* This class is specifically designed to handle Mistral's error response format.
*/
public class MistralCompletionResponseHandler extends OpenAiChatCompletionResponseHandler {

/**
* Constructs a MistralCompletionResponseHandler with the specified request type and response parser.
*
* @param requestType The type of request being handled (e.g., "mistral completions").
* @param parseFunction The function to parse the response.
*/
public MistralCompletionResponseHandler(String requestType, ResponseParser parseFunction) {
super(requestType, parseFunction, MistralErrorResponse::fromResponse);
}
}
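For context, a hedged sketch of how an action creator might instantiate this handler. The parse function shown is an assumption based on Mistral's OpenAI-compatible completions API; the actual MistralActionCreator wiring is not part of this diff.

// Sketch only: OpenAiChatCompletionResponseEntity::fromResponse is an assumed parser, for illustration.
static final ResponseHandler COMPLETION_HANDLER = new MistralCompletionResponseHandler(
    "mistral completions",
    OpenAiChatCompletionResponseEntity::fromResponse
);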
@@ -99,15 +99,15 @@ protected void doInfer(
var actionCreator = new MistralActionCreator(getSender(), getServiceComponents());

switch (model) {
case MistralEmbeddingsModel mistralEmbeddingsModel -> {
var action = mistralEmbeddingsModel.accept(actionCreator, taskSettings);
action.execute(inputs, timeout, listener);
}
case MistralChatCompletionModel mistralChatCompletionModel -> {
var action = mistralChatCompletionModel.accept(actionCreator);
action.execute(inputs, timeout, listener);
}
default -> listener.onFailure(createInvalidModelException(model));
case MistralEmbeddingsModel mistralEmbeddingsModel:
mistralEmbeddingsModel.accept(actionCreator, taskSettings).execute(inputs, timeout, listener);
break;
case MistralChatCompletionModel mistralChatCompletionModel:
mistralChatCompletionModel.accept(actionCreator).execute(inputs, timeout, listener);
break;
default:
listener.onFailure(createInvalidModelException(model));
break;
}
}

@@ -292,27 +270,23 @@ private static MistralModel createModel(
String failureMessage,
ConfigurationParseContext context
) {
return switch (taskType) {
case TEXT_EMBEDDING -> new MistralEmbeddingsModel(
modelId,
taskType,
NAME,
serviceSettings,
taskSettings,
chunkingSettings,
secretSettings,
context
);
case CHAT_COMPLETION, COMPLETION -> new MistralChatCompletionModel(
modelId,
taskType,
NAME,
serviceSettings,
secretSettings,
context
);
default -> throw new ElasticsearchStatusException(failureMessage, RestStatus.BAD_REQUEST);
};
switch (taskType) {
case TEXT_EMBEDDING:
return new MistralEmbeddingsModel(
modelId,
taskType,
NAME,
serviceSettings,
taskSettings,
chunkingSettings,
secretSettings,
context
);
case CHAT_COMPLETION, COMPLETION:
return new MistralChatCompletionModel(modelId, taskType, NAME, serviceSettings, secretSettings, context);
default:
throw new ElasticsearchStatusException(failureMessage, RestStatus.BAD_REQUEST);
}
}

private MistralModel createModelFromPersistent(