42 changes: 0 additions & 42 deletions .github/workflows/ci.yml

This file was deleted.

13 changes: 9 additions & 4 deletions app.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import asyncio
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import streamlit as st
@@ -35,18 +36,22 @@
template = """{prompt}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)
prompt_template = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.1")
chain = prompt | model
chain = prompt_template | model

# Main content
col1, col2 = st.columns(2)
with col1:
    question = st.text_input("Enter your question here")
    if question:
        with st.spinner("Thinking..."):
            prompt = lit_review_prompt(question)
            answer = chain.invoke(prompt)
            # Streamlit runs the script from top to bottom on each interaction.
            # For a simple, one-off async call like this, asyncio.run() is a
            # pragmatic choice. Apps that juggle several coroutines may need a
            # persistent event loop instead (see the sketch after this diff).
            prompt = asyncio.run(lit_review_prompt(question))
            answer = chain.invoke({"prompt": prompt})
            st.success("Done!")
            st.markdown(f"**Answer:** {answer}")
    else:
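For context on the comment above, here is a minimal sketch of the "persistent event loop" alternative it alludes to: parking one long-lived loop in st.session_state instead of spinning up a fresh one with asyncio.run() on every rerun. This is an illustration, not part of the PR; get_event_loop and slow_lookup are hypothetical names.

import asyncio
import streamlit as st

def get_event_loop():
    # st.session_state survives Streamlit reruns, so the loop is
    # created once per browser session and then reused.
    if "event_loop" not in st.session_state:
        st.session_state.event_loop = asyncio.new_event_loop()
    return st.session_state.event_loop

async def slow_lookup(question: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a real async call
    return f"answer to {question!r}"

loop = get_event_loop()
answer = loop.run_until_complete(slow_lookup("example"))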
17 changes: 10 additions & 7 deletions literaturereviewbot/prompts/literature_review.py
@@ -1,11 +1,14 @@
from literaturereviewbot.search_pubmed import query as pubmed_query
from literaturereviewbot import search
from literaturereviewbot.documents import index_documents, retrieve_documents


def generate_prompt(question):
    _, abstract_arr, pubmed_ids = pubmed_query(question)
async def generate_prompt(question):
    _, abstract_arr, ids = await search.query(question)
    if not abstract_arr:
        return "I don't have enough information to answer this question."

    vector_store = index_documents(
        query=question, abstract_arr=abstract_arr, ids_arr=pubmed_ids
        query=question, abstract_arr=abstract_arr, ids_arr=ids
    )
    documents = retrieve_documents(question, vector_store)
    messages = [
@@ -14,7 +17,7 @@ def generate_prompt(question):
"content_type": "instructions",
"content": """
You are a professional biomedical researcher.
You will be given a series of article abstracts.
You will be given a series of article abstracts.
The information in your response should exclusively come from the content type 'abstracts'.
If no relevant information is found in the abstracts, you can say 'I don't have enough information to answer this question'.
""",
@@ -23,8 +26,8 @@
    for i, doc in enumerate(documents):
        try:
            content = doc.page_content if hasattr(doc, "page_content") else ""
        except Exception as e:
            print(f"Error processing document: {doc}, error: {e}")
        except Exception:
            print(doc)
            content = ""
            continue
        messages.append(
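To make the shape of the assembled prompt concrete, here is a hypothetical sketch of what generate_prompt builds: one 'instructions' entry followed by one entry per retrieved abstract. The exact keys of the appended entries are cut off in this diff, so the 'abstracts' entry below is an assumption based on the instructions text, not confirmed by the PR.

# Hypothetical illustration only; retrieval is stubbed out and the
# shape of the appended entries is assumed, not shown in this diff.
documents = ["Abstract one ...", "Abstract two ..."]
messages = [
    {
        "content_type": "instructions",
        "content": "You are a professional biomedical researcher. ...",
    },
]
for doc in documents:
    messages.append({"content_type": "abstracts", "content": doc})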
40 changes: 40 additions & 0 deletions literaturereviewbot/search.py
@@ -0,0 +1,40 @@
import asyncio
from . import search_pubmed, search_biorxiv

async def query(query_text=""):
"""
Query both PubMed and bioRxiv for a given text.

This function queries both PubMed and bioRxiv asynchronously and
combines the results.

Args:
query_text (str): The text to search for.

Returns:
tuple: A tuple containing three lists: combined citations,
combined abstracts, and combined IDs (PMIDs and DOIs).
"""
# We need to make sure the pubmed query is async
pubmed_task = asyncio.create_task(search_pubmed.query(query_text))
biorxiv_task = asyncio.create_task(search_biorxiv.query(query_text))

results = await asyncio.gather(pubmed_task, biorxiv_task)

pubmed_citations, pubmed_abstracts, pubmed_ids = results[0]
biorxiv_citations, biorxiv_abstracts, biorxiv_dois = results[1]

combined_citations = pubmed_citations + biorxiv_citations
combined_abstracts = pubmed_abstracts + biorxiv_abstracts
combined_ids = pubmed_ids + biorxiv_dois

return combined_citations, combined_abstracts, combined_ids

if __name__ == '__main__':
    async def main():
        citations, abstracts, ids = await query("crispr")
        print(f"Found {len(citations)} articles.")
        for c in citations:
            print(c)

    asyncio.run(main())
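As the comment in query() notes, the create_task calls assume search_pubmed.query is already a coroutine function. If it were still a plain synchronous function, one way to schedule it alongside the httpx-based bioRxiv client is asyncio.to_thread. A minimal sketch under that assumption; the stub functions below are illustrative, not this repo's API:

import asyncio

def sync_pubmed_query(text):
    # Stand-in for a blocking PubMed client; illustrative only.
    return ([f"citation for {text}"], [f"abstract for {text}"], ["12345"])

async def async_biorxiv_query(text):
    # Stand-in for the httpx-based bioRxiv client; illustrative only.
    await asyncio.sleep(0.1)
    return ([f"preprint citation for {text}"], ["preprint abstract"], ["10.1101/x"])

async def combined(text):
    # asyncio.to_thread runs the blocking call in a worker thread and
    # returns an awaitable, so gather can drive both concurrently.
    pubmed, biorxiv = await asyncio.gather(
        asyncio.to_thread(sync_pubmed_query, text),
        async_biorxiv_query(text),
    )
    return pubmed, biorxiv

if __name__ == "__main__":
    print(asyncio.run(combined("crispr")))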
81 changes: 81 additions & 0 deletions literaturereviewbot/search_biorxiv.py
@@ -0,0 +1,81 @@
import asyncio
import httpx
from datetime import datetime, timedelta

PAGE_SIZE = 100

async def query(query_text=""):
"""
Search bioRxiv for a given query text.

This function fetches preprints from the last 30 days from the bioRxiv API
and filters them based on whether the query text appears in the title or abstract.

Args:
query_text (str): The text to search for.

Returns:
tuple: A tuple containing three lists: citations, abstracts, and DOIs.
"""
date_to = datetime.now()
date_from = date_to - timedelta(days=30)

date_to_str = date_to.strftime('%Y-%m-%d')
date_from_str = date_from.strftime('%Y-%m-%d')

url = f"https://api.biorxiv.org/details/biorxiv/{date_from_str}/{date_to_str}"

citations = []
abstracts = []
dois = []

async with httpx.AsyncClient() as client:
cursor = 0
while True:
paginated_url = f"{url}/{cursor}/json"
response = await client.get(paginated_url)

if response.status_code != 200:
break

data = response.json()
messages = data.get('messages', [{}])
if messages and messages[0].get('status', '') == 'no results':
break

for article in data.get('collection', []):
title = article.get('title', '')
abstract = article.get('abstract', '')

if query_text.lower() in title.lower() or query_text.lower() in abstract.lower():
authors = article.get('authors', [])
author_str = ", ".join([f"{a.get('name', '')}" for a in authors])
doi = article.get('doi', '')
date = article.get('date', '')

citation = f"{author_str}. {title}. bioRxiv {doi} ({date})"

citations.append(citation)
abstracts.append(abstract)
dois.append(doi)

# bioRxiv API returns 100 results at a time. We need to paginate.
# The 'count' in messages gives total results for the query.
# 'cursor' is the starting point of the next page.
try:
count = int(messages[0].get('count', 0))
cursor = int(messages[0].get('cursor', 0)) + PAGE_SIZE
except (ValueError, TypeError):
break

if cursor >= count:
break

return citations, abstracts, dois

if __name__ == '__main__':
    async def main():
        citations, abstracts, dois = await query("crispr")
        for c in citations:
            print(c)

    asyncio.run(main())
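As a standalone illustration of the cursor pagination the loop above implements, here is a minimal sketch that simply drains every page for a date interval, with the keyword filtering stripped out. It assumes, as the comment above does, that the messages block carries 'cursor' and 'total' fields; fetch_all is a hypothetical helper, not part of this PR.

import asyncio
import httpx

async def fetch_all(date_from: str, date_to: str, page_size: int = 100):
    # Drain every page of the details endpoint for one date interval.
    base = f"https://api.biorxiv.org/details/biorxiv/{date_from}/{date_to}"
    articles, cursor = [], 0
    async with httpx.AsyncClient() as client:
        while True:
            resp = await client.get(f"{base}/{cursor}/json")
            resp.raise_for_status()
            data = resp.json()
            articles.extend(data.get("collection", []))
            msg = data.get("messages", [{}])[0]
            total = int(msg.get("total", 0))
            cursor += page_size
            if cursor >= total:  # past the last page
                break
    return articles

if __name__ == "__main__":
    found = asyncio.run(fetch_all("2024-01-01", "2024-01-31"))
    print(f"Fetched {len(found)} preprints.")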