42 changes: 0 additions & 42 deletions .github/workflows/ci.yml

This file was deleted.

13 changes: 9 additions & 4 deletions app.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import asyncio
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import streamlit as st
@@ -35,18 +36,22 @@
template = """{prompt}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)
prompt_template = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.1")
chain = prompt | model
chain = prompt_template | model

# Main content
col1, col2 = st.columns(2)
with col1:
    question = st.text_input("Enter your question here")
    if question:
        with st.spinner("Thinking..."):
            prompt = lit_review_prompt(question)
            answer = chain.invoke(prompt)
            # Streamlit runs the script from top to bottom on each interaction.
            # For a simple, one-off async call like this, asyncio.run() is a
            # pragmatic choice. Apps that juggle several coroutines may need a
            # persistent event loop instead (see the sketch after this diff).
            prompt = asyncio.run(lit_review_prompt(question))
            answer = chain.invoke({"prompt": prompt})
            st.success("Done!")
            st.markdown(f"**Answer:** {answer}")
    else:
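For context on the comment above, here is a minimal sketch of the "persistent event loop" alternative it alludes to: parking one long-lived loop in st.session_state instead of spinning up a fresh one with asyncio.run() on every rerun. This is an illustration, not part of the PR; get_event_loop and slow_lookup are hypothetical names.

import asyncio
import streamlit as st

def get_event_loop():
    # st.session_state survives Streamlit reruns, so the loop is
    # created once per browser session and then reused.
    if "event_loop" not in st.session_state:
        st.session_state.event_loop = asyncio.new_event_loop()
    return st.session_state.event_loop

async def slow_lookup(question: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a real async call
    return f"answer to {question!r}"

loop = get_event_loop()
answer = loop.run_until_complete(slow_lookup("example"))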
17 changes: 10 additions & 7 deletions literaturereviewbot/prompts/literature_review.py
@@ -1,11 +1,14 @@
from literaturereviewbot.search_pubmed import query as pubmed_query
from literaturereviewbot import search
from literaturereviewbot.documents import index_documents, retrieve_documents


def generate_prompt(question):
    _, abstract_arr, pubmed_ids = pubmed_query(question)
async def generate_prompt(question):
    _, abstract_arr, ids = await search.query(question)
    if not abstract_arr:
        return "I don't have enough information to answer this question."

    vector_store = index_documents(
        query=question, abstract_arr=abstract_arr, ids_arr=pubmed_ids
        query=question, abstract_arr=abstract_arr, ids_arr=ids
    )
    documents = retrieve_documents(question, vector_store)
    messages = [
@@ -14,7 +17,7 @@ def generate_prompt(question):
"content_type": "instructions",
"content": """
You are a professional biomedical researcher.
You will be given a series of article abstracts.
You will be given a series of article abstracts.
The information in your response should exclusively come from the content type 'abstracts'.
If no relevant information is found in the abstracts, you can say 'I don't have enough information to answer this question'.
""",
@@ -23,8 +26,8 @@
    for i, doc in enumerate(documents):
        try:
            content = doc.page_content if hasattr(doc, "page_content") else ""
        except Exception as e:
            print(f"Error processing document: {doc}, error: {e}")
        except Exception:
            print(doc)
            content = ""
            continue
        messages.append(
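To make the shape of the assembled prompt concrete, here is a hypothetical sketch of what generate_prompt builds: one 'instructions' entry followed by one entry per retrieved abstract. The exact keys of the appended entries are cut off in this diff, so the 'abstracts' entry below is an assumption based on the instructions text, not confirmed by the PR.

# Hypothetical illustration only; retrieval is stubbed out and the
# shape of the appended entries is assumed, not shown in this diff.
documents = ["Abstract one ...", "Abstract two ..."]
messages = [
    {
        "content_type": "instructions",
        "content": "You are a professional biomedical researcher. ...",
    },
]
for doc in documents:
    messages.append({"content_type": "abstracts", "content": doc})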
40 changes: 40 additions & 0 deletions literaturereviewbot/search.py
@@ -0,0 +1,40 @@
import asyncio
from . import search_pubmed, search_biorxiv

async def query(query_text=""):
"""
Query both PubMed and bioRxiv for a given text.

This function queries both PubMed and bioRxiv asynchronously and
combines the results.

Args:
query_text (str): The text to search for.

Returns:
tuple: A tuple containing three lists: combined citations,
combined abstracts, and combined IDs (PMIDs and DOIs).
"""
# We need to make sure the pubmed query is async
pubmed_task = asyncio.create_task(search_pubmed.query(query_text))
biorxiv_task = asyncio.create_task(search_biorxiv.query(query_text))

results = await asyncio.gather(pubmed_task, biorxiv_task)

pubmed_citations, pubmed_abstracts, pubmed_ids = results[0]
biorxiv_citations, biorxiv_abstracts, biorxiv_dois = results[1]

combined_citations = pubmed_citations + biorxiv_citations
combined_abstracts = pubmed_abstracts + biorxiv_abstracts
combined_ids = pubmed_ids + biorxiv_dois

return combined_citations, combined_abstracts, combined_ids

if __name__ == '__main__':
    async def main():
        citations, abstracts, ids = await query("crispr")
        print(f"Found {len(citations)} articles.")
        for c in citations:
            print(c)

    asyncio.run(main())
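As the comment in query() notes, the create_task calls assume search_pubmed.query is already a coroutine function. If it were still a plain synchronous function, one way to schedule it alongside the httpx-based bioRxiv client is asyncio.to_thread. A minimal sketch under that assumption; the stub functions below are illustrative, not this repo's API:

import asyncio

def sync_pubmed_query(text):
    # Stand-in for a blocking PubMed client; illustrative only.
    return ([f"citation for {text}"], [f"abstract for {text}"], ["12345"])

async def async_biorxiv_query(text):
    # Stand-in for the httpx-based bioRxiv client; illustrative only.
    await asyncio.sleep(0.1)
    return ([f"preprint citation for {text}"], ["preprint abstract"], ["10.1101/x"])

async def combined(text):
    # asyncio.to_thread runs the blocking call in a worker thread and
    # returns an awaitable, so gather can drive both concurrently.
    pubmed, biorxiv = await asyncio.gather(
        asyncio.to_thread(sync_pubmed_query, text),
        async_biorxiv_query(text),
    )
    return pubmed, biorxiv

if __name__ == "__main__":
    print(asyncio.run(combined("crispr")))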
81 changes: 81 additions & 0 deletions literaturereviewbot/search_biorxiv.py
@@ -0,0 +1,81 @@
import asyncio
import httpx
from datetime import datetime, timedelta

PAGE_SIZE = 100

async def query(query_text=""):
"""
Search bioRxiv for a given query text.

This function fetches preprints from the last 30 days from the bioRxiv API
and filters them based on whether the query text appears in the title or abstract.

Args:
query_text (str): The text to search for.

Returns:
tuple: A tuple containing three lists: citations, abstracts, and DOIs.
"""
date_to = datetime.now()
date_from = date_to - timedelta(days=30)

date_to_str = date_to.strftime('%Y-%m-%d')
date_from_str = date_from.strftime('%Y-%m-%d')

url = f"https://api.biorxiv.org/details/biorxiv/{date_from_str}/{date_to_str}"

citations = []
abstracts = []
dois = []

async with httpx.AsyncClient() as client:
cursor = 0
while True:
paginated_url = f"{url}/{cursor}/json"
response = await client.get(paginated_url)

if response.status_code != 200:
break

data = response.json()
messages = data.get('messages', [{}])
if messages and messages[0].get('status', '') == 'no results':
break

for article in data.get('collection', []):
title = article.get('title', '')
abstract = article.get('abstract', '')

if query_text.lower() in title.lower() or query_text.lower() in abstract.lower():
authors = article.get('authors', [])
author_str = ", ".join([f"{a.get('name', '')}" for a in authors])
doi = article.get('doi', '')
date = article.get('date', '')

citation = f"{author_str}. {title}. bioRxiv {doi} ({date})"

citations.append(citation)
abstracts.append(abstract)
dois.append(doi)

# bioRxiv API returns 100 results at a time. We need to paginate.
# The 'count' in messages gives total results for the query.
# 'cursor' is the starting point of the next page.
try:
count = int(messages[0].get('count', 0))
cursor = int(messages[0].get('cursor', 0)) + PAGE_SIZE
except (ValueError, TypeError):
break

if cursor >= count:
break

return citations, abstracts, dois

if __name__ == '__main__':
    async def main():
        citations, abstracts, dois = await query("crispr")
        for c in citations:
            print(c)

    asyncio.run(main())
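As a standalone illustration of the cursor pagination the loop above implements, here is a minimal sketch that simply drains every page for a date interval, with the keyword filtering stripped out. It assumes, as the comment above does, that the messages block carries 'cursor' and 'total' fields; fetch_all is a hypothetical helper, not part of this PR.

import asyncio
import httpx

async def fetch_all(date_from: str, date_to: str, page_size: int = 100):
    # Drain every page of the details endpoint for one date interval.
    base = f"https://api.biorxiv.org/details/biorxiv/{date_from}/{date_to}"
    articles, cursor = [], 0
    async with httpx.AsyncClient() as client:
        while True:
            resp = await client.get(f"{base}/{cursor}/json")
            resp.raise_for_status()
            data = resp.json()
            articles.extend(data.get("collection", []))
            msg = data.get("messages", [{}])[0]
            total = int(msg.get("total", 0))
            cursor += page_size
            if cursor >= total:  # past the last page
                break
    return articles

if __name__ == "__main__":
    found = asyncio.run(fetch_all("2024-01-01", "2024-01-31"))
    print(f"Fetched {len(found)} preprints.")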