From 6fafde4c56357d683d8588df32769253a009ddce Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Mon, 13 Oct 2025 06:59:31 +0000 Subject: [PATCH 1/6] fix(docs): update command to install packages using uv & update port to default port used in tests --- DEVELOPMENT.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 37095a3..b26725d 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -13,7 +13,7 @@ source .venv/bin/activate Install package in editable mode. ```shell -poetry install --with dev,test,lint +uv sync --group test ``` Start PostgreSQL/PGVector. @@ -22,7 +22,7 @@ docker run --rm -it --name pgvector-container \ -e POSTGRES_USER=langchain \ -e POSTGRES_PASSWORD=langchain \ -e POSTGRES_DB=langchain_test \ - -p 6024:5432 pgvector/pgvector:pg16 \ + -p 5432:5432 pgvector/pgvector:pg16 \ postgres -c log_statement=all ``` From 5b3c3c73633c3317dbef5265afbec647a4e3ec27 Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Mon, 13 Oct 2025 08:02:34 +0000 Subject: [PATCH 2/6] feat: json metadata filtering --- langchain_postgres/v2/async_vectorstore.py | 41 ++++- .../fixtures/metadata_filtering_data.py | 173 ++++++++++++++++++ .../v2/test_async_pg_vectorstore_search.py | 9 +- .../v2/test_pg_vectorstore_search.py | 11 +- 4 files changed, 221 insertions(+), 13 deletions(-) diff --git a/langchain_postgres/v2/async_vectorstore.py b/langchain_postgres/v2/async_vectorstore.py index 8382b3e..302dcbd 100644 --- a/langchain_postgres/v2/async_vectorstore.py +++ b/langchain_postgres/v2/async_vectorstore.py @@ -2,6 +2,7 @@ from __future__ import annotations import copy +import datetime import json import uuid from typing import Any, Callable, Iterable, Optional, Sequence @@ -54,6 +55,16 @@ .union(SPECIAL_CASED_OPERATORS) ) +PYTHON_TO_POSTGRES_TYPE_MAP = { + int: "INTEGER", + float: "FLOAT", + str: "TEXT", + bool: "BOOLEAN", + datetime.date: "DATE", + datetime.datetime: "TIMESTAMP", + datetime.time: "TIME", +} + class AsyncPGVectorStore(VectorStore): """Postgres Vector Store class""" @@ -1096,19 +1107,33 @@ def _handle_field_filter( operator = "$eq" filter_value = value + field_selector = field + if self.metadata_json_column is not None and field not in self.metadata_columns and field not in ( + self.id_column, + self.content_column, + self.embedding_column + ): + filter_value_type = type(filter_value[0]) if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) else type(filter_value) + postgres_type = PYTHON_TO_POSTGRES_TYPE_MAP.get(filter_value_type) + if postgres_type is None: + raise ValueError(f"Unsupported type: {filter_value_type}") + field_selector = f"{self.metadata_json_column}->>'{field}'" + if postgres_type != "TEXT" and operator != "$exists": + field_selector = f"({field_selector})::{postgres_type}" + suffix_id = str(uuid.uuid4()).split("-")[0] if operator in COMPARISONS_TO_NATIVE: # Then we implement an equality filter # native is trusted input native = COMPARISONS_TO_NATIVE[operator] param_name = f"{field}_{suffix_id}" - return f"{field} {native} :{param_name}", {f"{param_name}": filter_value} + return f"{field_selector} {native} :{param_name}", {f"{param_name}": filter_value} elif operator == "$between": # Use AND with two comparisons low, high = filter_value low_param_name = f"{field}_low_{suffix_id}" high_param_name = f"{field}_high_{suffix_id}" - return f"({field} BETWEEN :{low_param_name} AND :{high_param_name})", { + return f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", { f"{low_param_name}": low, f"{high_param_name}": high, } @@ -1126,18 +1151,18 @@ def _handle_field_filter( ) param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}" if operator == "$in": - return f"{field} = ANY(:{param_name})", {f"{param_name}": filter_value} + return f"{field_selector} = ANY(:{param_name})", {f"{param_name}": filter_value} else: # i.e. $nin - return f"{field} <> ALL (:{param_name})", { + return f"{field_selector} <> ALL (:{param_name})", { f"{param_name}": filter_value } elif operator in {"$like", "$ilike"}: param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}" if operator == "$like": - return f"({field} LIKE :{param_name})", {f"{param_name}": filter_value} + return f"({field_selector} LIKE :{param_name})", {f"{param_name}": filter_value} else: # i.e. $ilike - return f"({field} ILIKE :{param_name})", {f"{param_name}": filter_value} + return f"({field_selector} ILIKE :{param_name})", {f"{param_name}": filter_value} elif operator == "$exists": if not isinstance(filter_value, bool): raise ValueError( @@ -1146,9 +1171,9 @@ def _handle_field_filter( ) else: if filter_value: - return f"({field} IS NOT NULL)", {} + return f"({field_selector} IS NOT NULL)", {} else: - return f"({field} IS NULL)", {} + return f"({field_selector} IS NULL)", {} else: raise NotImplementedError() diff --git a/tests/unit_tests/fixtures/metadata_filtering_data.py b/tests/unit_tests/fixtures/metadata_filtering_data.py index 8df8c01..684eef4 100644 --- a/tests/unit_tests/fixtures/metadata_filtering_data.py +++ b/tests/unit_tests/fixtures/metadata_filtering_data.py @@ -239,6 +239,179 @@ {"inventory_location": {"$exists": False}}, ["WB003"], ), + # JSON metadata filter + ( + {"code_json": "FT004"}, + ["FT004"], + ), + ( + {"name_json": "Smart Fitness Tracker"}, + ["FT004"], + ), + ( + {"is_available_json": True}, + ["WH001", "FT004", "EC002"], + ), + ( + {"code_json": "WH001", "is_available_json": True}, + ["WH001"], + ), + ( + {"available_quantity_json": {"$eq": 10}}, + ["EC002"], + ), + ( + {"available_quantity_json": {"$ne": 0}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"available_quantity_json": {"$gt": 60}}, + ["FT004"], + ), + ( + {"available_quantity_json": {"$gte": 50}}, + ["WH001", "FT004"], + ), + ( + {"available_quantity_json": {"$lt": 5}}, + ["WB003"], + ), + ( + {"available_quantity_json": {"$lte": 10}}, + ["WB003", "EC002"], + ), + ( + {"code_json": {"$eq": "WH001"}}, + ["WH001"], + ), + ( + {"code_json": {"$ne": "WB003"}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"name_json": {"$gt": "Wireless Headphones"}}, + [], + ), + ( + {"name_json": {"$gte": "Wireless Headphones"}}, + ["WH001"], + ), + ( + {"name_json": {"$lt": "Smart Fitness Tracker"}}, + ["EC002"], + ), + ( + {"name_json": {"$lte": "Smart Fitness Tracker"}}, + ["FT004", "EC002"], + ), + ( + {"is_available_json": {"$eq": True}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"is_available_json": {"$ne": True}}, + ["WB003"], + ), + ( + {"price_json": {"$gt": 200.0}}, + ["EC002"], + ), + ( + {"price_json": {"$gte": 149.99}}, + ["WH001", "EC002"], + ), + ( + {"price_json": {"$lt": 50.0}}, + ["WB003"], + ), + ( + {"price_json": {"$lte": 79.95}}, + ["FT004", "WB003"], + ), + ( + {"$or": [{"code_json": "WH001"}, {"code_json": "EC002"}]}, + ["WH001", "EC002"], + ), + ( + {"$or": [{"code_json": "WH001"}, {"available_quantity_json": 10}]}, + ["WH001", "EC002"], + ), + ( + {"$and": [{"code_json": "WH001"}, {"code_json": "EC002"}]}, + [], + ), + ( + {"$not": {"code_json": "WB003"}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": [{"code_json": "WB003"}]}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": {"available_quantity_json": 0}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": [{"available_quantity_json": 0}]}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": {"is_available_json": True}}, + ["WB003"], + ), + ( + {"$not": [{"is_available_json": True}]}, + ["WB003"], + ), + ( + {"$not": {"price_json": {"$gt": 150.0}}}, + ["WH001", "FT004", "WB003"], + ), + ( + {"$not": [{"price_json": {"$gt": 150.0}}]}, + ["WH001", "FT004", "WB003"], + ), + ( + {"available_quantity_json": {"$between": (40, 60)}}, + ["WH001"], + ), + ( + {"name_json": {"$in": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, + ["FT004", "WB003"], + ), + ( + {"available_quantity_json": {"$in": [0, 10]}}, + ["WB003", "EC002"], + ), + ( + {"name_json": {"$nin": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, + ["WH001", "EC002"], + ), + ( + {"available_quantity_json": {"$nin": [50, 0, 10]}}, + ["FT004"], + ), + ( + {"name_json": {"$like": "Wireless%"}}, + ["WH001"], + ), + ( + {"name_json": {"$like": "%less%"}}, + ["WH001", "WB003"], + ), + ( + {"$or": [{"code_json": {"$like": "WH00%"}}, {"code_json": {"$like": "EC00%"}}]}, + ["WH001", "EC002"], + ), + ( + {"tags_json": {"$exists": False}}, + [], + ), + ( + {"inventory_location_json": {"$exists": False}}, + ["WB003"], + ) ] NEGATIVE_TEST_CASES = [ diff --git a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py index 16c70fd..7211659 100644 --- a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py @@ -46,7 +46,12 @@ embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] filter_docs = [ - Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) + Document( + page_content=texts[i], + metadata=( + METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()} + ) + ) for i in range(len(texts)) ] # Documents designed for hybrid search testing hybrid_docs_content = { @@ -194,7 +199,7 @@ async def vs_custom_filter( Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=False, + store_metadata=True, ) vs_custom_filter = await AsyncPGVectorStore.create( diff --git a/tests/unit_tests/v2/test_pg_vectorstore_search.py b/tests/unit_tests/v2/test_pg_vectorstore_search.py index 7815a25..0ca690d 100644 --- a/tests/unit_tests/v2/test_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_pg_vectorstore_search.py @@ -42,7 +42,12 @@ Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) ] filter_docs = [ - Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) + Document( + page_content=texts[i], + metadata=( + METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()} + ) + ) for i in range(len(texts)) ] embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] @@ -141,7 +146,7 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=False, + store_metadata=True, overwrite_existing=True, ) @@ -352,7 +357,7 @@ async def vs_custom_filter_sync( Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=False, + store_metadata=True, overwrite_existing=True, ) From 0f18ff01a0d9316201d0093886fc2c821f22485c Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Fri, 17 Oct 2025 06:19:28 +0000 Subject: [PATCH 3/6] docs: update documentation on filtering --- examples/pg_vectorstore.ipynb | 2 +- examples/pg_vectorstore_how_to.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pg_vectorstore.ipynb b/examples/pg_vectorstore.ipynb index 2c20e90..a6e3837 100644 --- a/examples/pg_vectorstore.ipynb +++ b/examples/pg_vectorstore.ipynb @@ -359,7 +359,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To enable search with filters, it is necessary to declare the columns that you want to filter on when creating the table. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n", + "To achieve performant search with filters, it is crucial to declare the columns you want to filter on within the `metadata_columns` when creating the table, as filtering directly on these columns is far more efficient than attempting to filter on fields within a metadata JSON column. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n", "\n", "`PGVectorStore` currently supports the following operators.\n", "\n", diff --git a/examples/pg_vectorstore_how_to.ipynb b/examples/pg_vectorstore_how_to.ipynb index 2c5e75a..fb38bfa 100644 --- a/examples/pg_vectorstore_how_to.ipynb +++ b/examples/pg_vectorstore_how_to.ipynb @@ -530,7 +530,7 @@ "source": [ "### Search for documents with metadata filter\n", "\n", - "A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns.\n", + "A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns for efficient filtering.\n", "\n", "`PGVectorStore` currently supports the following operators and all Postgres data types.\n", "\n", @@ -645,7 +645,7 @@ "\n", "- **`metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"]`**: These columns are treated as metadata for each product. Metadata provides additional information about a product, such as its name, category, price, quantity available, SKU (Stock Keeping Unit), and an image URL. This information is useful for displaying product details in search results or for filtering and categorization.\n", "\n", - "- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns.\n" + "- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns. Note that filtering on fields within the JSON but not in `metadata_columns` will be less efficient.\n" ] }, { From 0f56c8ef5833033e109cb02d441da8107a1f5669 Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Mon, 3 Nov 2025 11:52:06 +0530 Subject: [PATCH 4/6] chore: Update DEVELOPMENT.md Co-authored-by: Averi Kitsch --- DEVELOPMENT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index d7b07be..7cedf45 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -22,7 +22,7 @@ docker run --rm -it --name pgvector-container \ -e POSTGRES_USER=langchain \ -e POSTGRES_PASSWORD=langchain \ -e POSTGRES_DB=langchain_test \ - -p 5432:5432 pgvector/pgvector:pg16 \ + -p 6024:5432 pgvector/pgvector:pg16 \ postgres -c log_statement=all ``` From 440f3a6657010243f2b390285c8c98dd8e0a2947 Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Mon, 3 Nov 2025 07:47:46 +0000 Subject: [PATCH 5/6] chore: separate testcases for testing metadata json filtering --- .../fixtures/metadata_filtering_data.py | 173 ------------------ .../v2/test_async_pg_vectorstore_search.py | 41 ++++- .../v2/test_pg_vectorstore_search.py | 75 +++++++- 3 files changed, 101 insertions(+), 188 deletions(-) diff --git a/tests/unit_tests/fixtures/metadata_filtering_data.py b/tests/unit_tests/fixtures/metadata_filtering_data.py index 684eef4..8df8c01 100644 --- a/tests/unit_tests/fixtures/metadata_filtering_data.py +++ b/tests/unit_tests/fixtures/metadata_filtering_data.py @@ -239,179 +239,6 @@ {"inventory_location": {"$exists": False}}, ["WB003"], ), - # JSON metadata filter - ( - {"code_json": "FT004"}, - ["FT004"], - ), - ( - {"name_json": "Smart Fitness Tracker"}, - ["FT004"], - ), - ( - {"is_available_json": True}, - ["WH001", "FT004", "EC002"], - ), - ( - {"code_json": "WH001", "is_available_json": True}, - ["WH001"], - ), - ( - {"available_quantity_json": {"$eq": 10}}, - ["EC002"], - ), - ( - {"available_quantity_json": {"$ne": 0}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"available_quantity_json": {"$gt": 60}}, - ["FT004"], - ), - ( - {"available_quantity_json": {"$gte": 50}}, - ["WH001", "FT004"], - ), - ( - {"available_quantity_json": {"$lt": 5}}, - ["WB003"], - ), - ( - {"available_quantity_json": {"$lte": 10}}, - ["WB003", "EC002"], - ), - ( - {"code_json": {"$eq": "WH001"}}, - ["WH001"], - ), - ( - {"code_json": {"$ne": "WB003"}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"name_json": {"$gt": "Wireless Headphones"}}, - [], - ), - ( - {"name_json": {"$gte": "Wireless Headphones"}}, - ["WH001"], - ), - ( - {"name_json": {"$lt": "Smart Fitness Tracker"}}, - ["EC002"], - ), - ( - {"name_json": {"$lte": "Smart Fitness Tracker"}}, - ["FT004", "EC002"], - ), - ( - {"is_available_json": {"$eq": True}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"is_available_json": {"$ne": True}}, - ["WB003"], - ), - ( - {"price_json": {"$gt": 200.0}}, - ["EC002"], - ), - ( - {"price_json": {"$gte": 149.99}}, - ["WH001", "EC002"], - ), - ( - {"price_json": {"$lt": 50.0}}, - ["WB003"], - ), - ( - {"price_json": {"$lte": 79.95}}, - ["FT004", "WB003"], - ), - ( - {"$or": [{"code_json": "WH001"}, {"code_json": "EC002"}]}, - ["WH001", "EC002"], - ), - ( - {"$or": [{"code_json": "WH001"}, {"available_quantity_json": 10}]}, - ["WH001", "EC002"], - ), - ( - {"$and": [{"code_json": "WH001"}, {"code_json": "EC002"}]}, - [], - ), - ( - {"$not": {"code_json": "WB003"}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": [{"code_json": "WB003"}]}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": {"available_quantity_json": 0}}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": [{"available_quantity_json": 0}]}, - ["WH001", "FT004", "EC002"], - ), - ( - {"$not": {"is_available_json": True}}, - ["WB003"], - ), - ( - {"$not": [{"is_available_json": True}]}, - ["WB003"], - ), - ( - {"$not": {"price_json": {"$gt": 150.0}}}, - ["WH001", "FT004", "WB003"], - ), - ( - {"$not": [{"price_json": {"$gt": 150.0}}]}, - ["WH001", "FT004", "WB003"], - ), - ( - {"available_quantity_json": {"$between": (40, 60)}}, - ["WH001"], - ), - ( - {"name_json": {"$in": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, - ["FT004", "WB003"], - ), - ( - {"available_quantity_json": {"$in": [0, 10]}}, - ["WB003", "EC002"], - ), - ( - {"name_json": {"$nin": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, - ["WH001", "EC002"], - ), - ( - {"available_quantity_json": {"$nin": [50, 0, 10]}}, - ["FT004"], - ), - ( - {"name_json": {"$like": "Wireless%"}}, - ["WH001"], - ), - ( - {"name_json": {"$like": "%less%"}}, - ["WH001", "WB003"], - ), - ( - {"$or": [{"code_json": {"$like": "WH00%"}}, {"code_json": {"$like": "EC00%"}}]}, - ["WH001", "EC002"], - ), - ( - {"tags_json": {"$exists": False}}, - [], - ), - ( - {"inventory_location_json": {"$exists": False}}, - ["WB003"], - ) ] NEGATIVE_TEST_CASES = [ diff --git a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py index c5b09b1..c4b7c28 100644 --- a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py @@ -27,6 +27,7 @@ HYBRID_SEARCH_TABLE1 = "test_table_hybrid1" + str(uuid.uuid4()).replace("-", "_") HYBRID_SEARCH_TABLE2 = "test_table_hybrid2" + str(uuid.uuid4()).replace("-", "_") CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_") VECTOR_SIZE = 768 sync_method_exception_str = "Sync methods are not implemented for AsyncPGVectorStore. Use PGVectorStore interface instead." @@ -46,12 +47,7 @@ embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] filter_docs = [ - Document( - page_content=texts[i], - metadata=( - METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()} - ) - ) for i in range(len(texts)) + Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) ] # Documents designed for hybrid search testing hybrid_docs_content = { @@ -199,7 +195,7 @@ async def vs_custom_filter( Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=True, + store_metadata=False, ) vs_custom_filter = await AsyncPGVectorStore.create( @@ -220,6 +216,24 @@ async def vs_custom_filter( await vs_custom_filter.aadd_documents(filter_docs, ids=ids) yield vs_custom_filter + @pytest_asyncio.fixture(scope="class") + async def vs_metadata_json( + self, engine: PGEngine + ) -> AsyncIterator[AsyncPGVectorStore]: + await engine._ainit_vectorstore_table( + CUSTOM_METADATA_JSON_TABLE, + VECTOR_SIZE, + store_metadata=True, + ) + + vs_metadata_json = await AsyncPGVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_METADATA_JSON_TABLE, + ) + await vs_metadata_json.aadd_documents(filter_docs, ids=ids) + yield vs_metadata_json + async def test_asimilarity_search_score(self, vs: AsyncPGVectorStore) -> None: results = await vs.asimilarity_search_with_score("foo") assert len(results) == 4 @@ -375,6 +389,19 @@ async def test_vectorstore_with_metadata_filters( ) assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter + @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES) + async def test_vectorstore_with_json_metadata_filters( + self, + vs_metadata_json: AsyncPGVectorStore, + test_filter: dict, + expected_ids: list[str], + ) -> None: + """Test end to end construction and search on json metadata.""" + docs = await vs_metadata_json.asimilarity_search( + "meow", k=5, filter=test_filter + ) + assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter + async def test_asimilarity_hybrid_search(self, vs: AsyncPGVectorStore) -> None: results = await vs.asimilarity_search( "foo", k=1, hybrid_search_config=HybridSearchConfig() diff --git a/tests/unit_tests/v2/test_pg_vectorstore_search.py b/tests/unit_tests/v2/test_pg_vectorstore_search.py index 0ca690d..3fa5108 100644 --- a/tests/unit_tests/v2/test_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_pg_vectorstore_search.py @@ -27,6 +27,8 @@ CUSTOM_TABLE = "custom" + str(uuid.uuid4()).replace("-", "_") CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_") CUSTOM_FILTER_TABLE_SYNC = "custom_filter_sync" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_METADATA_JSON_TABLE_SYNC = "custom_metadata_json_sync" + str(uuid.uuid4()).replace("-", "_") VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) @@ -42,12 +44,7 @@ Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) ] filter_docs = [ - Document( - page_content=texts[i], - metadata=( - METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()} - ) - ) for i in range(len(texts)) + Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) ] embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] @@ -146,7 +143,7 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=True, + store_metadata=False, overwrite_existing=True, ) @@ -168,6 +165,24 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor await vs_custom_filter.aadd_documents(filter_docs, ids=ids) yield vs_custom_filter + @pytest_asyncio.fixture(scope="class") + async def vs_metadata_json( + self, engine: PGEngine + ) -> AsyncIterator[PGVectorStore]: + await engine.ainit_vectorstore_table( + CUSTOM_METADATA_JSON_TABLE, + VECTOR_SIZE, + store_metadata=True, + ) + + vs_metadata_json = await PGVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_METADATA_JSON_TABLE, + ) + await vs_metadata_json.aadd_documents(filter_docs, ids=ids) + yield vs_metadata_json + async def test_asimilarity_search_score(self, vs: PGVectorStore) -> None: results = await vs.asimilarity_search_with_score("foo") assert len(results) == 4 @@ -270,6 +285,19 @@ async def test_vectorstore_with_metadata_filters( "meow", k=5, filter=test_filter ) assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter + + @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES) + async def test_vectorstore_with_json_metadata_filters( + self, + vs_metadata_json: PGVectorStore, + test_filter: dict, + expected_ids: list[str], + ) -> None: + """Test end to end construction and search on json metadata.""" + docs = await vs_metadata_json.asimilarity_search( + "meow", k=5, filter=test_filter + ) + assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter async def test_asimilarity_hybrid_search(self, vs: PGVectorStore) -> None: results = await vs.asimilarity_search( @@ -357,7 +385,7 @@ async def vs_custom_filter_sync( Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=True, + store_metadata=False, overwrite_existing=True, ) @@ -380,6 +408,24 @@ async def vs_custom_filter_sync( vs_custom_filter_sync.add_documents(filter_docs, ids=ids) yield vs_custom_filter_sync + @pytest_asyncio.fixture(scope="class") + async def vs_metadata_json_sync( + self, engine_sync: PGEngine + ) -> AsyncIterator[PGVectorStore]: + engine_sync.init_vectorstore_table( + CUSTOM_METADATA_JSON_TABLE_SYNC, + VECTOR_SIZE, + store_metadata=True, + ) + + vs_metadata_json_sync = await PGVectorStore.create( + engine_sync, + embedding_service=embeddings_service, + table_name=CUSTOM_METADATA_JSON_TABLE_SYNC, + ) + vs_metadata_json_sync.add_documents(filter_docs, ids=ids) + yield vs_metadata_json_sync + def test_similarity_search_score(self, vs_custom: PGVectorStore) -> None: results = vs_custom.similarity_search_with_score("foo") assert len(results) == 4 @@ -434,6 +480,19 @@ def test_sync_vectorstore_with_metadata_filters( docs = vs_custom_filter_sync.similarity_search("meow", k=5, filter=test_filter) assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter + @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES) + def test_sync_vectorstore_with_json_metadata_filters( + self, + vs_metadata_json_sync: PGVectorStore, + test_filter: dict, + expected_ids: list[str], + ) -> None: + """Test end to end construction and search on json metadata.""" + docs = vs_metadata_json_sync.similarity_search( + "meow", k=5, filter=test_filter + ) + assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter + @pytest.mark.parametrize("test_filter", NEGATIVE_TEST_CASES) def test_metadata_filter_negative_tests( self, vs_custom_filter_sync: PGVectorStore, test_filter: dict From e46218133bffe88312e865b6c84a6eccdc2a3929 Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Mon, 17 Nov 2025 05:27:22 +0000 Subject: [PATCH 6/6] chore: ruff format --- langchain_postgres/v2/async_vectorstore.py | 42 +++++++++++++------ .../v2/test_async_pg_vectorstore_search.py | 4 +- .../v2/test_pg_vectorstore_search.py | 18 ++++---- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/langchain_postgres/v2/async_vectorstore.py b/langchain_postgres/v2/async_vectorstore.py index 5eaae84..011e9e8 100644 --- a/langchain_postgres/v2/async_vectorstore.py +++ b/langchain_postgres/v2/async_vectorstore.py @@ -1105,12 +1105,17 @@ def _handle_field_filter( filter_value = value field_selector = field - if self.metadata_json_column is not None and field not in self.metadata_columns and field not in ( - self.id_column, - self.content_column, - self.embedding_column + if ( + self.metadata_json_column is not None + and field not in self.metadata_columns + and field + not in (self.id_column, self.content_column, self.embedding_column) ): - filter_value_type = type(filter_value[0]) if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) else type(filter_value) + filter_value_type = ( + type(filter_value[0]) + if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) + else type(filter_value) + ) postgres_type = PYTHON_TO_POSTGRES_TYPE_MAP.get(filter_value_type) if postgres_type is None: raise ValueError(f"Unsupported type: {filter_value_type}") @@ -1124,16 +1129,21 @@ def _handle_field_filter( # native is trusted input native = COMPARISONS_TO_NATIVE[operator] param_name = f"{field}_{suffix_id}" - return f"{field_selector} {native} :{param_name}", {f"{param_name}": filter_value} + return f"{field_selector} {native} :{param_name}", { + f"{param_name}": filter_value + } elif operator == "$between": # Use AND with two comparisons low, high = filter_value low_param_name = f"{field}_low_{suffix_id}" high_param_name = f"{field}_high_{suffix_id}" - return f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", { - f"{low_param_name}": low, - f"{high_param_name}": high, - } + return ( + f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", + { + f"{low_param_name}": low, + f"{high_param_name}": high, + }, + ) elif operator in {"$in", "$nin"}: # We'll do force coercion to text for val in filter_value: @@ -1148,7 +1158,9 @@ def _handle_field_filter( ) param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}" if operator == "$in": - return f"{field_selector} = ANY(:{param_name})", {f"{param_name}": filter_value} + return f"{field_selector} = ANY(:{param_name})", { + f"{param_name}": filter_value + } else: # i.e. $nin return f"{field_selector} <> ALL (:{param_name})", { f"{param_name}": filter_value @@ -1157,9 +1169,13 @@ def _handle_field_filter( elif operator in {"$like", "$ilike"}: param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}" if operator == "$like": - return f"({field_selector} LIKE :{param_name})", {f"{param_name}": filter_value} + return f"({field_selector} LIKE :{param_name})", { + f"{param_name}": filter_value + } else: # i.e. $ilike - return f"({field_selector} ILIKE :{param_name})", {f"{param_name}": filter_value} + return f"({field_selector} ILIKE :{param_name})", { + f"{param_name}": filter_value + } elif operator == "$exists": if not isinstance(filter_value, bool): raise ValueError( diff --git a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py index c4b7c28..2a63f58 100644 --- a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py @@ -27,7 +27,9 @@ HYBRID_SEARCH_TABLE1 = "test_table_hybrid1" + str(uuid.uuid4()).replace("-", "_") HYBRID_SEARCH_TABLE2 = "test_table_hybrid2" + str(uuid.uuid4()).replace("-", "_") CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace( + "-", "_" +) VECTOR_SIZE = 768 sync_method_exception_str = "Sync methods are not implemented for AsyncPGVectorStore. Use PGVectorStore interface instead." diff --git a/tests/unit_tests/v2/test_pg_vectorstore_search.py b/tests/unit_tests/v2/test_pg_vectorstore_search.py index 3fa5108..b0d35e8 100644 --- a/tests/unit_tests/v2/test_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_pg_vectorstore_search.py @@ -27,8 +27,12 @@ CUSTOM_TABLE = "custom" + str(uuid.uuid4()).replace("-", "_") CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_") CUSTOM_FILTER_TABLE_SYNC = "custom_filter_sync" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_") -CUSTOM_METADATA_JSON_TABLE_SYNC = "custom_metadata_json_sync" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace( + "-", "_" +) +CUSTOM_METADATA_JSON_TABLE_SYNC = "custom_metadata_json_sync" + str( + uuid.uuid4() +).replace("-", "_") VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) @@ -166,9 +170,7 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor yield vs_custom_filter @pytest_asyncio.fixture(scope="class") - async def vs_metadata_json( - self, engine: PGEngine - ) -> AsyncIterator[PGVectorStore]: + async def vs_metadata_json(self, engine: PGEngine) -> AsyncIterator[PGVectorStore]: await engine.ainit_vectorstore_table( CUSTOM_METADATA_JSON_TABLE, VECTOR_SIZE, @@ -285,7 +287,7 @@ async def test_vectorstore_with_metadata_filters( "meow", k=5, filter=test_filter ) assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter - + @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES) async def test_vectorstore_with_json_metadata_filters( self, @@ -488,9 +490,7 @@ def test_sync_vectorstore_with_json_metadata_filters( expected_ids: list[str], ) -> None: """Test end to end construction and search on json metadata.""" - docs = vs_metadata_json_sync.similarity_search( - "meow", k=5, filter=test_filter - ) + docs = vs_metadata_json_sync.similarity_search("meow", k=5, filter=test_filter) assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter @pytest.mark.parametrize("test_filter", NEGATIVE_TEST_CASES)