milvus: add array data type for collection create (#23219)

rgupta2508 · Rohit Gupta · zc277584121 · web-flow · commit aff50a1e6f8e · 2024-08-28T16:55:57.000Z
Add array data type support when creating a Milvus vector store collection, configured through the new optional `metadata_schema` argument. Thank you for contributing to LangChain! - [x] **PR title**: "package: description" - Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [x] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change - **Issue:** the issue # it fixes, if applicable - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Signed-off-by: ChengZi <chen.zhang@zilliz.com> Co-authored-by: Rohit Gupta <rohit.gupta2@walmart.com> Co-authored-by: ChengZi <chen.zhang@zilliz.com> Co-authored-by: Erick Friis <erick@langchain.dev>
diff --git a/libs/partners/milvus/langchain_milvus/vectorstores/milvus.py b/libs/partners/milvus/langchain_milvus/vectorstores/milvus.py
@@ -240,6 +240,7 @@ def __init__(
         replica_number: int = 1,
         timeout: Optional[float] = None,
         num_shards: Optional[int] = None,
+        metadata_schema: Optional[dict[str, Any]] = None,
     ):
         """Initialize the Milvus vector store."""
         try:
@@ -310,6 +311,7 @@ def __init__(
         self.replica_number = replica_number
         self.timeout = timeout
         self.num_shards = num_shards
+        self.metadata_schema = metadata_schema
 
         # Create the connection to the server
         if connection_args is None:
@@ -472,24 +474,47 @@ def _create_collection(
                         )
                         raise ValueError(f"Metadata key {key} is reserved.")
                     # Infer the corresponding datatype of the metadata
-                    dtype = infer_dtype_bydata(value)
-                    # Datatype isn't compatible
-                    if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
-                        logger.error(
-                            (
-                                "Failure to create collection, "
-                                "unrecognized dtype for key: %s"
-                            ),
-                            key,
-                        )
-                        raise ValueError(f"Unrecognized datatype for {key}.")
-                    # Datatype is a string/varchar equivalent
-                    elif dtype == DataType.VARCHAR:
+                    if (
+                        key in self.metadata_schema  # type: ignore
+                        and "dtype" in self.metadata_schema[key]  # type: ignore
+                    ):
+                        kwargs = self.metadata_schema[key].get("kwargs", {})  # type: ignore
                         fields.append(
-                            FieldSchema(key, DataType.VARCHAR, max_length=65_535)
+                            FieldSchema(
+                                name=key,
+                                dtype=self.metadata_schema[key]["dtype"],  # type: ignore
+                                **kwargs,
+                            )
                         )
                     else:
-                        fields.append(FieldSchema(key, dtype))
+                        dtype = infer_dtype_bydata(value)
+                        # Datatype isn't compatible
+                        if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
+                            logger.error(
+                                (
+                                    "Failure to create collection, "
+                                    "unrecognized dtype for key: %s"
+                                ),
+                                key,
+                            )
+                            raise ValueError(f"Unrecognized datatype for {key}.")
+                        # Datatype is a string/varchar equivalent
+                        elif dtype == DataType.VARCHAR:
+                            fields.append(
+                                FieldSchema(key, DataType.VARCHAR, max_length=65_535)
+                            )
+                        # infer_dtype_bydata cannot recognize the array type yet,
+                        # so this branch is currently unreachable.
+                        # It may need to be revisited once
+                        # infer_dtype_bydata can recognize the array type.
+                        # https://github.com/milvus-io/pymilvus/issues/2165
+                        elif dtype == DataType.ARRAY:
+                            kwargs = self.metadata_schema[key]["kwargs"]  # type: ignore
+                            fields.append(
+                                FieldSchema(name=key, dtype=DataType.ARRAY, **kwargs)
+                            )
+                        else:
+                            fields.append(FieldSchema(key, dtype))
 
         # Create the text field
         fields.append(
diff --git a/libs/partners/milvus/tests/integration_tests/vectorstores/test_milvus.py b/libs/partners/milvus/tests/integration_tests/vectorstores/test_milvus.py
@@ -39,6 +39,7 @@ def _milvus_from_texts(
         # connection_args={"uri": "http://127.0.0.1:19530"},
         connection_args={"uri": "./milvus_demo.db"},
         drop_old=drop,
+        consistency_level="Strong",
         **kwargs,
     )
 
@@ -303,6 +304,51 @@ def test_milvus_enable_dynamic_field_with_partition_key() -> None:
     }
 
 
+def test_milvus_array_field() -> None:
+    """Manually specify metadata schema, including an array_field.
+    For more information about array data type and filtering, please refer to
+    https://milvus.io/docs/array_data_type.md
+    """
+    from pymilvus import DataType
+
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"id": i, "array_field": [i, i + 1, i + 2]} for i in range(len(texts))]
+
+    # Manually specify metadata schema, including an array_field.
+    # If some fields are not specified, Milvus will automatically infer their schemas.
+    docsearch = _milvus_from_texts(
+        metadatas=metadatas,
+        metadata_schema={
+            "array_field": {
+                "dtype": DataType.ARRAY,
+                "kwargs": {"element_type": DataType.INT64, "max_capacity": 50},
+            },
+            # "id": {
+            #     "dtype": DataType.INT64,
+            # }
+        },
+    )
+    output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
+    assert len(output) == 2
+    output = docsearch.similarity_search(
+        "foo", k=10, expr="ARRAY_CONTAINS(array_field, 3)"
+    )
+    assert len(output) == 2
+
+    # If we use enable_dynamic_field,
+    # there is no need to manually specify metadata schema.
+    docsearch = _milvus_from_texts(
+        enable_dynamic_field=True,
+        metadatas=metadatas,
+    )
+    output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
+    assert len(output) == 2
+    output = docsearch.similarity_search(
+        "foo", k=10, expr="ARRAY_CONTAINS(array_field, 3)"
+    )
+    assert len(output) == 2
+
+
 # if __name__ == "__main__":
 #     test_milvus()
 #     test_milvus_vector_search()
@@ -319,3 +365,4 @@ def test_milvus_enable_dynamic_field_with_partition_key() -> None:
 #     test_milvus_enable_dynamic_field()
 #     test_milvus_disable_dynamic_field()
 #     test_milvus_metadata_field()
+#     test_milvus_array_field()