From 7ff2c569dfd4adcb50baf7f5ba921b3eef5615c9 Mon Sep 17 00:00:00 2001 From: Ondrej Sedlacek Date: Thu, 21 Nov 2024 10:48:53 +0100 Subject: [PATCH 1/2] API: Split `list_entity_type_eids` in two. - Added endpoint to get only documents. - Added endpoint to get only count. - Deprecated original endpoint. --- dp3/api/internal/entity_response_models.py | 6 ++ dp3/api/routers/entity.py | 112 +++++++++++++++++---- dp3/database/snapshots.py | 95 +++++++++++++++-- 3 files changed, 184 insertions(+), 29 deletions(-) diff --git a/dp3/api/internal/entity_response_models.py b/dp3/api/internal/entity_response_models.py index 728f7701..7ab717d5 100644 --- a/dp3/api/internal/entity_response_models.py +++ b/dp3/api/internal/entity_response_models.py @@ -52,6 +52,12 @@ class EntityEidList(BaseModel): data: EntityEidSnapshots +class EntityEidCount(BaseModel): + """Total count of documents available under specified filter.""" + + total_count: int + + class EntityEidData(BaseModel): """Data of entity eid diff --git a/dp3/api/routers/entity.py b/dp3/api/routers/entity.py index 0fed92bf..74433fdd 100644 --- a/dp3/api/routers/entity.py +++ b/dp3/api/routers/entity.py @@ -8,6 +8,7 @@ from dp3.api.internal.entity_response_models import ( EntityEidAttrValue, EntityEidAttrValueOrHistory, + EntityEidCount, EntityEidData, EntityEidList, EntityEidMasterRecord, @@ -76,8 +77,29 @@ def get_eid_snapshots_handler( router = APIRouter(dependencies=[Depends(check_etype)]) +def _validate_snapshot_filters(fulltext_filters, generic_filter): + if not fulltext_filters: + fulltext_filters = {} + if not isinstance(fulltext_filters, dict): + raise HTTPException(status_code=400, detail="Fulltext filter is invalid") + + if not generic_filter: + generic_filter = {} + if not isinstance(generic_filter, dict): + raise HTTPException(status_code=400, detail="Generic filter is invalid") + + for attr in fulltext_filters: + ftr = fulltext_filters[attr] + if not isinstance(ftr, str): + raise HTTPException(status_code=400, 
detail=f"Filter '{ftr}' is not string") + + return fulltext_filters, generic_filter + + @router.get( - "/{etype}", responses={400: {"description": "Query can't be processed", "model": ErrorResponse}} + "/{etype}", + responses={400: {"description": "Query can't be processed", "model": ErrorResponse}}, + deprecated=True, ) async def list_entity_type_eids( etype: str, @@ -88,7 +110,47 @@ async def list_entity_type_eids( ) -> EntityEidList: """List latest snapshots of all `id`s present in database under `etype`. + Deprecated in favor of `/entity/{etype}/get` and `/entity/{etype}/count` endpoints, + which provide more flexibility and better performance. + + See `/entity/{etype}/get` for more information. + """ + fulltext_filters, generic_filter = _validate_snapshot_filters(fulltext_filters, generic_filter) + + try: + cursor, total_count = DB.snapshots.get_latest(etype, fulltext_filters, generic_filter) + cursor_page = cursor.skip(skip).limit(limit) + except DatabaseError as e: + raise HTTPException(status_code=400, detail=str(e)) from e + + time_created = None + + # Remove _id field + result = [r["last"] for r in cursor_page] + for r in result: + time_created = r["_time_created"] + del r["_time_created"] + + return EntityEidList( + time_created=time_created, count=len(result), total_count=total_count, data=result + ) + + +@router.get( + "/{etype}/get", + responses={400: {"description": "Query can't be processed", "model": ErrorResponse}}, +) +async def get_entity_type_eids( + etype: str, + fulltext_filters: Json = None, + generic_filter: Json = None, + skip: NonNegativeInt = 0, + limit: NonNegativeInt = 20, +) -> EntityEidList: + """List latest snapshots of all `id`s present in database under `etype`. + Contains only latest snapshot. + The `total_count` returned is always 0, use `/entity/{etype}/count` to get total count. Uses pagination. Setting `limit` to 0 is interpreted as no limit (return all results). 
@@ -164,23 +226,10 @@ async def list_entity_type_eids( Generic and fulltext filters are merged - fulltext overrides conflicting keys. """ - if not fulltext_filters: - fulltext_filters = {} - if not isinstance(fulltext_filters, dict): - raise HTTPException(status_code=400, detail="Fulltext filter is invalid") - - if not generic_filter: - generic_filter = {} - if not isinstance(generic_filter, dict): - raise HTTPException(status_code=400, detail="Generic filter is invalid") - - for attr in fulltext_filters: - ftr = fulltext_filters[attr] - if not isinstance(ftr, str): - raise HTTPException(status_code=400, detail=f"Filter '{ftr}' is not string") + fulltext_filters, generic_filter = _validate_snapshot_filters(fulltext_filters, generic_filter) try: - cursor, total_count = DB.snapshots.get_latest(etype, fulltext_filters, generic_filter) + cursor = DB.snapshots.find_latest(etype, fulltext_filters, generic_filter) cursor_page = cursor.skip(skip).limit(limit) except DatabaseError as e: raise HTTPException(status_code=400, detail=str(e)) from e @@ -193,9 +242,34 @@ async def list_entity_type_eids( time_created = r["_time_created"] del r["_time_created"] - return EntityEidList( - time_created=time_created, count=len(result), total_count=total_count, data=result - ) + return EntityEidList(time_created=time_created, count=len(result), total_count=0, data=result) + + +@router.get( + "/{etype}/count", + responses={400: {"description": "Query can't be processed", "model": ErrorResponse}}, +) +async def count_entity_type_eids( + etype: str, + fulltext_filters: Json = None, + generic_filter: Json = None, +) -> EntityEidCount: + """Count latest snapshots of all `id`s present in database under `etype`. + + Returns only count of documents matching `generic_filter` and `fulltext_filters`, + see `/entity/{etype}/get` documentation for details. + + Note that responses from this endpoint may take much longer than `/entity/{etype}/get` + for large datasets. 
+ """ + fulltext_filters, generic_filter = _validate_snapshot_filters(fulltext_filters, generic_filter) + + try: + count = DB.snapshots.count_latest(etype, fulltext_filters, generic_filter) + except DatabaseError as e: + raise HTTPException(status_code=400, detail=str(e)) from e + + return EntityEidCount(total_count=count) @router.get("/{etype}/{eid}") diff --git a/dp3/database/snapshots.py b/dp3/database/snapshots.py index 9e34e06f..150ae2d3 100644 --- a/dp3/database/snapshots.py +++ b/dp3/database/snapshots.py @@ -185,13 +185,59 @@ def get_latest( May raise `SnapshotCollectionError` if query is invalid. """ snapshot_col = self._col() + query = self._prepare_latest_query(fulltext_filters or {}, generic_filter or {}) - if not fulltext_filters: - fulltext_filters = {} + try: + return snapshot_col.find(query, {"last": 1}).sort( + [("_id", pymongo.ASCENDING)] + ), snapshot_col.count_documents(query) + except OperationFailure as e: + raise SnapshotCollectionError(f"Query is invalid: {e}") from e + + def find_latest( + self, + fulltext_filters: Optional[dict[str, str]] = None, + generic_filter: Optional[dict[str, Any]] = None, + ) -> Cursor: + """Find latest snapshots of given `etype`. + + See [`get_latest`][dp3.database.snapshots.SnapshotCollectionContainer.get_latest] + for more information. + + Returns only documents matching `generic_filter` and `fulltext_filters`, + does not count them. + """ + query = self._prepare_latest_query(fulltext_filters or {}, generic_filter or {}) + try: + return self._col().find(query, {"last": 1}).sort([("_id", pymongo.ASCENDING)]) + except OperationFailure as e: + raise SnapshotCollectionError(f"Query is invalid: {e}") from e - if not generic_filter: - generic_filter = {} + def count_latest( + self, + fulltext_filters: Optional[dict[str, str]] = None, + generic_filter: Optional[dict[str, Any]] = None, + ) -> int: + """Count latest snapshots of given `etype`. 
+ + See [`get_latest`][dp3.database.snapshots.SnapshotCollectionContainer.get_latest] + for more information. + + Returns only count of documents matching `generic_filter` and `fulltext_filters`. + + Note that this method may take much longer than `get_latest` on larger databases, + as it does count all documents, not just return the first few. + """ + query = self._prepare_latest_query(fulltext_filters or {}, generic_filter or {}) + try: + return self._col().count_documents(query) + except OperationFailure as e: + raise SnapshotCollectionError(f"Query is invalid: {e}") from e + def _prepare_latest_query( + self, fulltext_filters: dict[str, str], generic_filter: dict[str, Any] + ): + """Prepare query for get_latest method.""" # Create base of query try: query = search_and_replace(generic_filter) @@ -222,12 +268,7 @@ def get_latest( else: query["last." + attr] = fulltext_filter - try: - return snapshot_col.find(query, {"last": 1}).sort( - [("_id", pymongo.ASCENDING)] - ), snapshot_col.count_documents(query) - except OperationFailure as e: - raise SnapshotCollectionError(f"Query is invalid: {e}") from e + return query def get_by_eid( self, eid: AnyEidT, t1: Optional[datetime] = None, t2: Optional[datetime] = None @@ -778,6 +819,40 @@ def get_latest( """ return self[entity_type].get_latest(fulltext_filters, generic_filter) + def find_latest( + self, + entity_type: str, + fulltext_filters: Optional[dict[str, str]] = None, + generic_filter: Optional[dict[str, Any]] = None, + ) -> Cursor: + """Find latest snapshots of given `etype`. + + see [`get_latest`][dp3.database.snapshots.SnapshotCollectionContainer.get_latest] + for more information. + + Returns only documents matching `generic_filter` and `fulltext_filters`, + does not count them. 
+ """ + return self[entity_type].find_latest(fulltext_filters, generic_filter) + + def count_latest( + self, + entity_type: str, + fulltext_filters: Optional[dict[str, str]] = None, + generic_filter: Optional[dict[str, Any]] = None, + ) -> int: + """Count latest snapshots of given `etype`. + + see [`get_latest`][dp3.database.snapshots.SnapshotCollectionContainer.get_latest] + for more information. + + Returns only count of documents matching `generic_filter` and `fulltext_filters`. + + Note that this method may take much longer than `get_latest` on larger databases, + as it does count all documents, not just return the first few. + """ + return self[entity_type].count_latest(fulltext_filters, generic_filter) + def get_by_eid( self, entity_type: str, From 61dd0de008f1358a7d6a970355dc856113c02141 Mon Sep 17 00:00:00 2001 From: Ondrej Sedlacek Date: Thu, 21 Nov 2024 10:49:27 +0100 Subject: [PATCH 2/2] Docs: Update API page. --- docs/api.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++- mkdocs.yml | 2 ++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 191690a2..b4508064 100644 --- a/docs/api.md +++ b/docs/api.md @@ -7,7 +7,9 @@ There are several API endpoints: - [`GET /`](#index): check if API is running (just returns `It works!` message) - [`POST /datapoints`](#insert-datapoints): insert datapoints into DP³ -- [`GET /entity/<entity_type>`](#list-entities): list current snapshots of all entities of given type +- ~~[`GET /entity/<entity_type>`](#list-entities): list current snapshots of all entities of given type~~ +- [`GET /entity/<entity_type>/get`](#get-entities): get current snapshots of entities of entity type +- [`GET /entity/<entity_type>/count`](#count-entities): get total document count for query of entity type - [`GET /entity/<entity_type>/<entity_id>`](#get-eid-data): get data of entity with given entity id - [`GET /entity/<entity_type>/<entity_id>/get/<attr_id>`](#get-attr-value): get attribute value - [`GET /entity/<entity_type>/<entity_id>/set/<attr_id>`](#set-attr-value): set attribute value @@ -198,8 +200,46 @@ v -> 
some_embedded_dict_field ## List entities +!!! warning "Deprecated" + + This endpoint is deprecated and will be removed in the future. + Use [`GET /entity/<entity_type>/get`](#get-entities) to get paged documents and + [`GET /entity/<entity_type>/count`](#count-entities) to get total document count for query. + List latest snapshots of all ids present in database under entity type, filtered by `generic_filter` and `fulltext_filters`. +Contains only the latest snapshot per entity. + +Counts all results for given query. + +### Request + +`GET /entity/<entity_type>` + +**Optional query parameters:** + +- skip: how many entities to skip (default: 0) +- limit: how many entities to return (default: 20) +- fulltext_filters: dictionary of fulltext filters (default: no filters) +- generic_filter: dictionary of generic filters (default: no filters) + +### Response + +```json +{ + "time_created": "2023-07-04T12:10:38.827Z", + "data": [ + {} + ] +} +``` + +--- + +## Get entities + +Get a list of latest snapshots of all ids present in database under entity type, +filtered by `generic_filter` and `fulltext_filters`. Contains only the latest snapshot per entity. Uses pagination, default limit is 20, setting to 0 will return all results. @@ -244,6 +284,31 @@ Generic and fulltext filters are merged - fulltext overrides conflicting keys. --- +## Count entities + +Count latest snapshots of all ids present in database under entity type, +filtered by `generic_filter` and `fulltext_filters`. +See [`GET /entity/<entity_type>/get`](#get-entities) for details on filter format. + +### Request + +`GET /entity/<entity_type>/count` + +**Optional query parameters:** + +- fulltext_filters: dictionary of fulltext filters (default: no filters) +- generic_filter: dictionary of generic filters (default: no filters) + +### Response + +```json +{ + "total_count": 0 +} +``` + +--- + ## Get Eid data Get data of entity type's eid. 
diff --git a/mkdocs.yml b/mkdocs.yml index 14620d7d..4f8a1e17 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,6 +82,8 @@ markdown_extensions: # Additional attribute lists (used e.g. for image size) - attr_list - md_in_html + # Strike-through + - pymdownx.tilde plugins: # Default search bar