diff --git a/Dockerfile b/Dockerfile
index 71244404..4f1d8d2e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.13-slim
 LABEL authors="Oleksandr Khoroshevskyi, Nathan Sheffield"
 
 RUN apt-get update
@@ -29,9 +29,10 @@ RUN apt-get install -y build-essential
 RUN pip install uv
 
 # Install CPU-only pytorch, eliminating huge nvidia dependencies
-RUN pip install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
-RUN pip install https://github.com/pepkit/pipestat/archive/refs/heads/dev.zip
+#pip install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
+RUN uv pip install torch --index-url https://download.pytorch.org/whl/cpu --system
+# RUN uv pip install https://github.com/pepkit/pipestat/archive/refs/heads/dev.zip --system
 
 RUN uv pip install -r requirements/requirements-all.txt --no-cache-dir --system
 
-CMD ["uvicorn", "bedhost.main:app", "--host", "0.0.0.0", "--port", "80"]
+CMD ["uvicorn", "bedhost.main:app", "--host", "0.0.0.0", "--port", "80"]
\ No newline at end of file
diff --git a/bedhost/_version.py b/bedhost/_version.py
index ea370a8e..def467e0 100644
--- a/bedhost/_version.py
+++ b/bedhost/_version.py
@@ -1 +1 @@
-__version__ = "0.12.0"
+__version__ = "0.12.1"
diff --git a/bedhost/routers/bed_api.py b/bedhost/routers/bed_api.py
index 5cbd9b5a..1e745321 100644
--- a/bedhost/routers/bed_api.py
+++ b/bedhost/routers/bed_api.py
@@ -65,7 +65,9 @@ async def get_example_bed_record():
     response_model=BedListResult,
 )
 async def list_beds(
-    limit: int = 1000,
+    limit: int = Query(
+        1000, ge=1, le=10000, description="Limit (1-10000), default 1000"
+    ),
     offset: int = 0,
     genome: str = Query(
         default=None, description="filter by genome of the bed file. e.g. 'hg38'"
@@ -77,6 +79,7 @@ async def list_beds(
     """
     Returns list of BED files in the database with optional filters.
     """
+
     return bbagent.bed.get_ids_list(
         limit=limit, offset=offset, genome=genome, bed_compliance=bed_compliance
     )
@@ -388,7 +391,7 @@ async def text_to_bed_search(
     assay: Optional[Union[str, None]] = None,
     limit: int = 10,
     offset: int = 0,
-    test_request: bool = test_query_parameter,
+    test_request: bool = test_query_parameter,  # needed for usage tracking in @count_requests
 ):
     """
     Search for a BedFile by a text query.
@@ -398,31 +401,8 @@ async def text_to_bed_search(
     """
     _LOGGER.info(
-        f"Searching for: '{query}' with limit='{limit}' and offset='{offset}' and genome='{genome}'"
+        f"Searching for: '{query}' with limit='{limit}' and offset='{offset}' and genome='{genome}' and assay='{assay}'"
     )
-    #
-    # # results_sql = bbagent.bed.sql_search(
-    # #     query, limit=round(limit / 2, 0), offset=round(offset / 2, 0)
-    # # )
-    # #
-    # # if results_sql.count > results_sql.offset:
-    # #     qdrant_offset = offset - results_sql.offset
-    # # else:
-    # #     qdrant_offset = offset - results_sql.count
-    # #
-    # # results_qdr = bbagent.bed.text_to_bed_search(
-    # #     query, limit=limit, offset=qdrant_offset - 1 if qdrant_offset > 0 else 0
-    # # )
-    # #
-    # # results = BedListSearchResult(
-    # #     count=results_qdr.count,
-    # #     limit=limit,
-    # #     offset=offset,
-    # #     results=(results_sql.results + results_qdr.results)[0:limit],
-    # # )
-    query = query.strip()
-    #
-    # if not genome or genome == "hg38":
 
     spaceless_query = query.replace(" ", "")
     if len(spaceless_query) == 32 and spaceless_query == query:
@@ -483,17 +463,54 @@ async def text_to_bed_search(
         if result.count != 0:
             return result
 
-    results = bbagent.bed.semantic_search(
+    # # Basic semantic search
+    # results = bbagent.bed.semantic_search(
+    #     query,
+    #     genome_alias=genome,
+    #     assay=assay,
+    #     limit=limit,
+    #     offset=offset,
+    # )
+
+    # # Hybrid search
+    results = bbagent.bed.hybrid_search(
         query,
         genome_alias=genome,
         assay=assay,
         limit=limit,
         offset=offset,
     )
+    return results
 
-    if results:
-        return results
-    raise HTTPException(status_code=404, detail="No records found")
+    # # # Bi-vec search
+    #
+    # # This is disabled for now, as it is sql search mix, which we don't want to mix
+    # # results_sql = bbagent.bed.sql_search(
+    # #     query, limit=round(limit / 2, 0), offset=round(offset / 2, 0)
+    # # )
+    # #
+    # # if results_sql.count > results_sql.offset:
+    # #     qdrant_offset = offset - results_sql.offset
+    # # else:
+    # #     qdrant_offset = offset - results_sql.count
+    # # results_qdr = bbagent.bed.text_to_bed_search(
+    # #     query, limit=limit, offset=qdrant_offset - 1 if qdrant_offset > 0 else 0
+    # # )
+    # # results = BedListSearchResult(
+    # #     count=results_qdr.count,
+    # #     limit=limit,
+    # #     offset=offset,
+    # # )
+    # # print("results:", results_qdr)
+    # #
+    # # raise HTTPException(status_code=404, detail="No records found")
+    #
+    #
+    # results_qdr = bbagent.bed.text_to_bed_search(
+    #     query, limit=limit, offset=offset
+    # )
+
+    return results_qdr
 
 
 @router.get(
diff --git a/deployment/config/api-dev.bedbase.org.yaml b/deployment/config/api-dev.bedbase.org.yaml
index 79da07a2..dca70f95 100644
--- a/deployment/config/api-dev.bedbase.org.yaml
+++ b/deployment/config/api-dev.bedbase.org.yaml
@@ -4,7 +4,8 @@ path:
   # region2vec: databio/r2v-pretrained-for-search
   region2vec: databio/r2v-encode-hg38
   vec2vec: 'databio/v2v-sentencetransformers-encode'
-  umap_model: "https://huggingface.co/databio/bedbase-umap/resolve/main/hg38_umap_umap_model.joblib"
+  umap_model: "https://huggingface.co/databio/bedbase-umap/resolve/main/hg38_umap_umap_model_3_13.joblib"
+  sparse_model: "prithivida/Splade_PP_en_v2"
 database:
   host: $POSTGRES_HOST
   port: 5432
@@ -17,6 +18,7 @@ qdrant:
   api_key: $QDRANT_API_KEY
   file_collection: bedbase
   text_collection: bed_text
+  hybrid_collection: bedbase_sparse_collection
 server:
   host: 0.0.0.0
   port: 8000
diff --git a/deployment/config/api.bedbase.org.yaml b/deployment/config/api.bedbase.org.yaml
index 9c483233..ac4e7f9a 100644
--- a/deployment/config/api.bedbase.org.yaml
+++ b/deployment/config/api.bedbase.org.yaml
@@ -1,10 +1,11 @@
 path:
   remote_url_base: http://data.bedbase.org/
-  text2vec: "sentence-transformers/all-MiniLM-L6-v2"
+  text2vec: 'sentence-transformers/all-MiniLM-L6-v2'
   # region2vec: databio/r2v-pretrained-for-search
   region2vec: databio/r2v-encode-hg38
-  vec2vec: "databio/v2v-sentencetransformers-encode"
-  umap_model: "https://huggingface.co/databio/bedbase-umap/resolve/main/hg38_umap_umap_model.joblib"
+  vec2vec: 'databio/v2v-sentencetransformers-encode'
+  umap_model: "https://huggingface.co/databio/bedbase-umap/resolve/main/hg38_umap_umap_model_3_13.joblib"
+  sparse_model: "prithivida/Splade_PP_en_v2"
 database:
   host: $POSTGRES_HOST
   port: 5432
@@ -17,6 +18,7 @@ qdrant:
   api_key: $QDRANT_API_KEY
   file_collection: bedbase
   text_collection: bed_text
+  hybrid_collection: bedbase_sparse_collection
 server:
   host: 0.0.0.0
   port: 8000
diff --git a/dev.Dockerfile b/dev.Dockerfile
index b67d284c..4f1d8d2e 100644
--- a/dev.Dockerfile
+++ b/dev.Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.13-slim
 LABEL authors="Oleksandr Khoroshevskyi, Nathan Sheffield"
 
 RUN apt-get update
@@ -29,8 +29,9 @@ RUN apt-get install -y build-essential
 RUN pip install uv
 
 # Install CPU-only pytorch, eliminating huge nvidia dependencies
-RUN pip install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
-RUN pip install https://github.com/pepkit/pipestat/archive/refs/heads/dev.zip
+#pip install torch==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
+RUN uv pip install torch --index-url https://download.pytorch.org/whl/cpu --system
+# RUN uv pip install https://github.com/pepkit/pipestat/archive/refs/heads/dev.zip --system
 
 RUN uv pip install -r requirements/requirements-all.txt --no-cache-dir --system
 
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 35b1911d..eaa44f47 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,5 +1,5 @@
-# bbconf @ git+https://github.com/databio/bbconf.git@umap#egg=bbconf
-bbconf>=0.13.0
+# bbconf @ git+https://github.com/databio/bbconf.git@dev#egg=bbconf
+bbconf>=0.14.1
 fastapi>=0.103.0
 logmuse>=0.2.7
 markdown
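A note on the bounded `limit` parameter added to `list_beds` above: wrapping the default in `Query(1000, ge=1, le=10000, ...)` means FastAPI validates the value before the handler runs and answers out-of-range requests with a 422. A minimal, self-contained sketch of that behavior (the `/bed/list` path and the handler body here are illustrative, not bedhost's actual routing):

```python
# Minimal sketch of a bounded query parameter; not bedhost code.
from fastapi import FastAPI, Query
from fastapi.testclient import TestClient

app = FastAPI()


@app.get("/bed/list")  # illustrative path, not the real bedhost route
async def list_beds(
    limit: int = Query(1000, ge=1, le=10000, description="Limit (1-10000), default 1000"),
    offset: int = 0,
):
    # A real handler would query the backend; echoing the parameters is enough here.
    return {"limit": limit, "offset": offset}


client = TestClient(app)
assert client.get("/bed/list").json()["limit"] == 1000                        # default applied
assert client.get("/bed/list", params={"limit": 50000}).status_code == 422    # above le=10000
assert client.get("/bed/list", params={"limit": 0}).status_code == 422        # below ge=1
```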
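The switch from `semantic_search` to `bbagent.bed.hybrid_search`, together with the new `sparse_model` and `hybrid_collection` config keys, points at dense + sparse retrieval fused on the Qdrant side. The actual implementation lives in bbconf (>= 0.14.1) and is not part of this diff; the sketch below only illustrates how such a fused query can look with qdrant-client >= 1.10, assuming the collection carries named "dense" and "sparse" vectors, that the query text has already been encoded with the configured text2vec and SPLADE models, and that a 5x prefetch factor is acceptable (all of these are assumptions, not bbconf's choices):

```python
# Illustrative hybrid (dense + sparse) query against a Qdrant collection such as
# the `bedbase_sparse_collection` configured above. NOT the bbconf implementation.
from typing import Sequence

from qdrant_client import QdrantClient, models


def hybrid_query(
    client: QdrantClient,
    dense_vector: Sequence[float],
    sparse_indices: Sequence[int],
    sparse_values: Sequence[float],
    limit: int = 10,
    offset: int = 0,
):
    return client.query_points(
        collection_name="bedbase_sparse_collection",
        prefetch=[
            # Candidate set from the dense (semantic) embedding
            models.Prefetch(query=list(dense_vector), using="dense", limit=limit * 5),
            # Candidate set from the sparse (SPLADE-style lexical) embedding
            models.Prefetch(
                query=models.SparseVector(
                    indices=list(sparse_indices), values=list(sparse_values)
                ),
                using="sparse",
                limit=limit * 5,
            ),
        ],
        # Reciprocal Rank Fusion merges the two candidate lists into one ranking
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
        offset=offset,
    )
```

The appeal of doing the fusion in Qdrant rather than in the API layer is visible in the diff itself: the older, commented-out "Bi-vec" branch tried to interleave SQL and vector results manually, including offset bookkeeping, which the single fused query avoids.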