Skip to content

Commit aff50a1

Browse files
rgupta2508, Rohit Gupta, zc277584121, Erick Friis
authored
milvus: add array data type for collection create (#23219)
Add array data type for Milvus vector store collection creation. Thank you for contributing to LangChain! - [x] **PR title**: "package: description" - Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [x] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change - **Issue:** the issue # it fixes, if applicable - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Signed-off-by: ChengZi <[email protected]> Co-authored-by: Rohit Gupta <[email protected]> Co-authored-by: ChengZi <[email protected]> Co-authored-by: Erick Friis <[email protected]>
1 parent 754f3c4 commit aff50a1

File tree

2 files changed

+87
-15
lines changed

2 files changed

+87
-15
lines changed

libs/partners/milvus/langchain_milvus/vectorstores/milvus.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ def __init__(
240240
replica_number: int = 1,
241241
timeout: Optional[float] = None,
242242
num_shards: Optional[int] = None,
243+
metadata_schema: Optional[dict[str, Any]] = None,
243244
):
244245
"""Initialize the Milvus vector store."""
245246
try:
@@ -310,6 +311,7 @@ def __init__(
310311
self.replica_number = replica_number
311312
self.timeout = timeout
312313
self.num_shards = num_shards
314+
self.metadata_schema = metadata_schema
313315

314316
# Create the connection to the server
315317
if connection_args is None:
@@ -472,24 +474,47 @@ def _create_collection(
472474
)
473475
raise ValueError(f"Metadata key {key} is reserved.")
474476
# Infer the corresponding datatype of the metadata
475-
dtype = infer_dtype_bydata(value)
476-
# Datatype isn't compatible
477-
if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
478-
logger.error(
479-
(
480-
"Failure to create collection, "
481-
"unrecognized dtype for key: %s"
482-
),
483-
key,
484-
)
485-
raise ValueError(f"Unrecognized datatype for {key}.")
486-
# Datatype is a string/varchar equivalent
487-
elif dtype == DataType.VARCHAR:
477+
if (
478+
key in self.metadata_schema # type: ignore
479+
and "dtype" in self.metadata_schema[key] # type: ignore
480+
):
481+
kwargs = self.metadata_schema[key].get("kwargs", {}) # type: ignore
488482
fields.append(
489-
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
483+
FieldSchema(
484+
name=key,
485+
dtype=self.metadata_schema[key]["dtype"], # type: ignore
486+
**kwargs,
487+
)
490488
)
491489
else:
492-
fields.append(FieldSchema(key, dtype))
490+
dtype = infer_dtype_bydata(value)
491+
# Datatype isn't compatible
492+
if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
493+
logger.error(
494+
(
495+
"Failure to create collection, "
496+
"unrecognized dtype for key: %s"
497+
),
498+
key,
499+
)
500+
raise ValueError(f"Unrecognized datatype for {key}.")
501+
# Datatype is a string/varchar equivalent
502+
elif dtype == DataType.VARCHAR:
503+
fields.append(
504+
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
505+
)
506+
# infer_dtype_bydata currently can't recognize array type,
507+
# so this branch is currently unreachable.
508+
# This branch may need to be modified in the future once
509+
# infer_dtype_bydata can recognize array type.
510+
# https://github.com/milvus-io/pymilvus/issues/2165
511+
elif dtype == DataType.ARRAY:
512+
kwargs = self.metadata_schema[key]["kwargs"] # type: ignore
513+
fields.append(
514+
FieldSchema(name=key, dtype=DataType.ARRAY, **kwargs)
515+
)
516+
else:
517+
fields.append(FieldSchema(key, dtype))
493518

494519
# Create the text field
495520
fields.append(

libs/partners/milvus/tests/integration_tests/vectorstores/test_milvus.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def _milvus_from_texts(
3939
# connection_args={"uri": "http://127.0.0.1:19530"},
4040
connection_args={"uri": "./milvus_demo.db"},
4141
drop_old=drop,
42+
consistency_level="Strong",
4243
**kwargs,
4344
)
4445

@@ -303,6 +304,51 @@ def test_milvus_enable_dynamic_field_with_partition_key() -> None:
303304
}
304305

305306

307+
def test_milvus_array_field() -> None:
308+
"""Manually specify metadata schema, including an array_field.
309+
For more information about array data type and filtering, please refer to
310+
https://milvus.io/docs/array_data_type.md
311+
"""
312+
from pymilvus import DataType
313+
314+
texts = ["foo", "bar", "baz"]
315+
metadatas = [{"id": i, "array_field": [i, i + 1, i + 2]} for i in range(len(texts))]
316+
317+
# Manually specify metadata schema, including an array_field.
318+
# If some fields are not specified, Milvus will automatically infer their schemas.
319+
docsearch = _milvus_from_texts(
320+
metadatas=metadatas,
321+
metadata_schema={
322+
"array_field": {
323+
"dtype": DataType.ARRAY,
324+
"kwargs": {"element_type": DataType.INT64, "max_capacity": 50},
325+
},
326+
# "id": {
327+
# "dtype": DataType.INT64,
328+
# }
329+
},
330+
)
331+
output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
332+
assert len(output) == 2
333+
output = docsearch.similarity_search(
334+
"foo", k=10, expr="ARRAY_CONTAINS(array_field, 3)"
335+
)
336+
assert len(output) == 2
337+
338+
# If we use enable_dynamic_field,
339+
# there is no need to manually specify metadata schema.
340+
docsearch = _milvus_from_texts(
341+
enable_dynamic_field=True,
342+
metadatas=metadatas,
343+
)
344+
output = docsearch.similarity_search("foo", k=10, expr="array_field[0] < 2")
345+
assert len(output) == 2
346+
output = docsearch.similarity_search(
347+
"foo", k=10, expr="ARRAY_CONTAINS(array_field, 3)"
348+
)
349+
assert len(output) == 2
350+
351+
306352
# if __name__ == "__main__":
307353
# test_milvus()
308354
# test_milvus_vector_search()
@@ -319,3 +365,4 @@ def test_milvus_enable_dynamic_field_with_partition_key() -> None:
319365
# test_milvus_enable_dynamic_field()
320366
# test_milvus_disable_dynamic_field()
321367
# test_milvus_metadata_field()
368+
# test_milvus_array_field()

0 commit comments

Comments
 (0)