diff --git a/apps/local-rag-pdf/README.md b/apps/local-rag-pdf/README.md
index 05008310..f4a7c03a 100644
--- a/apps/local-rag-pdf/README.md
+++ b/apps/local-rag-pdf/README.md
@@ -1 +1,83 @@
-# local-rag-deepseek-mongodb
+# Local RAG with PDF, Ollama, and MongoDB Atlas
+
+This application demonstrates a Retrieval-Augmented Generation (RAG) pipeline using **Ollama** for local LLMs and embeddings, and **MongoDB Atlas** as the vector store. It allows users to upload a PDF, index its content, and ask questions based on the document's context.
+
+## Features
+
+- **PDF Ingestion**: Upload and parse PDF documents.
+- **Chunking & Embedding**: Splits text into manageable chunks and generates embeddings using Ollama.
+- **Vector Storage**: Stores embeddings in MongoDB Atlas Vector Search.
+- **Context-Aware QA**: Retrieves relevant context to answer user queries using a local LLM.
+- **Conversation History**: Maintains context across multiple turns of conversation.
+
+## Prerequisites
+
+Before running this application, ensure you have the following:
+
+1. **Python 3.9+**: Installed on your system.
+2. **MongoDB Atlas Cluster**:
+   - Create a [free account](https://www.mongodb.com/cloud/atlas/register).
+   - Deploy a cluster (the M0 sandbox tier is sufficient).
+   - Get your connection string.
+3. **Ollama**:
+   - Download and install [Ollama](https://ollama.com/).
+   - Pull the required models:
+     ```bash
+     ollama pull llama3
+     ollama pull nomic-embed-text
+     ```
+     *(Note: You can configure different models in `config.yaml`.)*
+
+## Installation
+
+1. **Clone the repository** (if you haven't already):
+   ```bash
+   git clone
+   cd apps/local-rag-pdf
+   ```
+
+2. **Create a virtual environment**:
+   ```bash
+   python -m venv .venv
+   source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+   ```
+
+3. **Install dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. **Configure the application**:
+   - Open `config.yaml`.
+   - Update `mongo_connection_str` with your Atlas connection string.
+   - (Optional) Change `llm_model` or `embedding_model` if you want to use different Ollama models.
+
+## Usage
+
+1. **Run the application**:
+   ```bash
+   streamlit run app.py
+   ```
+
+2. **Interact with the UI**:
+   - Upload a PDF file using the sidebar.
+   - Wait for the ingestion process to complete (check the logs in the terminal).
+   - Type your question in the chat input box.
+
+## Architecture
+
+1. **User** uploads a PDF.
+2. **PyMuPDF** extracts text from the PDF.
+3. **LangChain** splits the text into chunks.
+4. **Ollama** generates vector embeddings for each chunk.
+5. **MongoDB Atlas** stores these embeddings.
+6. When a **User** asks a question:
+   - The question is embedded using **Ollama**.
+   - **MongoDB Atlas** performs a vector search to find relevant chunks.
+   - The retrieved chunks + the question are sent to the **Ollama** LLM.
+   - The LLM generates a response based on the context.
+
+## Troubleshooting
+
+- **Connection Error**: Ensure your IP address is whitelisted in MongoDB Atlas Network Access.
+- **Ollama Error**: Make sure the Ollama service is running locally (`ollama serve`).
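Reviewer note: the Architecture section describes the flow only in prose, so here is a minimal end-to-end sketch of the same six steps using the libraries this patch imports in `rag_module.py`. The PDF path, connection string, database/collection names, and index name are illustrative placeholders, not values taken from this repo.

```python
# Minimal sketch of the README's Architecture flow (steps 1-6).
# Placeholders: PDF path, connection string, db/collection/index names.
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pymongo import MongoClient

# Steps 1-3: load the PDF and split it into overlapping chunks.
docs = PyMuPDFLoader(file_path="example.pdf").load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)

# Steps 4-5: embed each chunk with Ollama and store it in Atlas.
collection = MongoClient("mongodb+srv://...")["rag_db"]["documents"]
vector_store = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    index_name="vector_index",
)
vector_store.add_documents(documents=chunks)

# Step 6: embed the question, vector-search for relevant chunks,
# and let the local LLM answer from that context.
question = "What is this document about?"
retrieved = vector_store.as_retriever(search_kwargs={"k": 5}).invoke(question)
context = "\n\n".join(doc.page_content for doc in retrieved)
answer = ChatOllama(model="llama3").invoke(
    f"Answer using only this context:\n{context}\n\nQuestion: {question}"
)
print(answer.content)
```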
diff --git a/apps/local-rag-pdf/app.py b/apps/local-rag-pdf/app.py
index 6d30a62e..24236bd5 100644
--- a/apps/local-rag-pdf/app.py
+++ b/apps/local-rag-pdf/app.py
@@ -63,6 +63,8 @@ def process_query():
             conversation_history=conversation_history,
             k=st.session_state["retrieval_k"],
             score_threshold=st.session_state["retrieval_threshold"],
+            search_type=st.session_state.get("search_type", "similarity"),
+            lambda_mult=st.session_state.get("lambda_mult", 0.5),
         )
     except ValueError as e:
         agent_text = str(e)
@@ -141,6 +143,32 @@ def page():
     # Display messages and text input
     display_messages()
 
+    # Sidebar settings
+    with st.sidebar:
+        st.header("Retrieval Settings")
+        search_type = st.radio(
+            "Search Type",
+            options=["similarity", "mmr"],
+            format_func=lambda x: "Similarity"
+            if x == "similarity"
+            else "MMR (Diversity)",
+            index=0,
+        )
+
+        lambda_mult = 0.5
+        if search_type == "mmr":
+            lambda_mult = st.slider(
+                "Diversity (Lambda)",
+                min_value=0.0,
+                max_value=1.0,
+                value=0.5,
+                step=0.1,
+                help="0.0 = Maximum Diversity, 1.0 = Maximum Relevance",
+            )
+
+        st.session_state["search_type"] = search_type
+        st.session_state["lambda_mult"] = lambda_mult
+
     # Accept user input using the new chat input
     prompt = st.chat_input("Type your message here...")
     if prompt:
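Reviewer note on the new slider: `lambda_mult` follows LangChain's MMR convention, where 1.0 means pure relevance and 0.0 means maximum diversity, which matches the help text above. For documentation purposes only, here is a toy sketch of the objective MMR optimizes; LangChain computes this internally, and none of this is new API:

```python
# Toy illustration of Maximal Marginal Relevance (MMR) selection.
# query_sim[i]: similarity of chunk i to the query.
# doc_sims[i][j]: similarity between chunks i and j.
def mmr_select(query_sim, doc_sims, k=5, lambda_mult=0.5):
    selected, candidates = [], list(range(len(query_sim)))
    while candidates and len(selected) < k:
        def score(i):
            # Penalize chunks similar to ones already picked.
            redundancy = max((doc_sims[i][j] for j in selected), default=0.0)
            # lambda_mult = 1.0 -> pure relevance; 0.0 -> maximum diversity.
            return lambda_mult * query_sim[i] - (1 - lambda_mult) * redundancy
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected
```

With `lambda_mult=0.5`, a chunk nearly identical to an already-selected one scores below a less relevant but novel chunk; that trade-off is the "Diversity" the sidebar label refers to.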
"mongo_connection_str", + "database_name", + "collection_name", + ] + for key in required_keys: + if key not in config: + logger.error(f"Missing configuration key: {key}") + raise KeyError(f"Missing configuration key: {key}") + llm_model = config["llm_model"] embedding_model = config["embedding_model"] mongo_connection_str = config["mongo_connection_str"] @@ -96,23 +108,31 @@ def upload_and_index_pdf(self, pdf_file_path: str): """ Upload and index a PDF file, chunk its contents, and store the embeddings in MongoDB Atlas. """ - logger.info(f"Starting ingestion for file: {pdf_file_path}") - docs = PyPDFLoader(file_path=pdf_file_path).load() + try: + logger.info(f"Starting ingestion for file: {pdf_file_path}") + docs = PyMuPDFLoader(file_path=pdf_file_path).load() + + if not docs: + logger.warning(f"No content found in file: {pdf_file_path}") + return - logger.info(f"Loaded {len(docs)} pages from {pdf_file_path}") + logger.info(f"Loaded {len(docs)} pages from {pdf_file_path}") - chunks = self.text_splitter.split_documents(docs) - logger.info(f"Split into {len(chunks)} document chunks") + chunks = self.text_splitter.split_documents(docs) + logger.info(f"Split into {len(chunks)} document chunks") - # Optional: Log some sample chunks for verification - for i, chunk in enumerate(chunks[:3]): - logger.debug(f"Chunk {i+1} Content: {chunk.page_content[:200]}...") + # Optional: Log some sample chunks for verification + for i, chunk in enumerate(chunks[:3]): + logger.debug(f"Chunk {i+1} Content: {chunk.page_content[:200]}...") - chunks = filter_complex_metadata(chunks) + chunks = filter_complex_metadata(chunks) - # Add documents to vector store and check embeddings - self.vector_store.add_documents(documents=chunks) - logger.info("Document embeddings stored successfully in MongoDB Atlas.") + # Add documents to vector store and check embeddings + self.vector_store.add_documents(documents=chunks) + logger.info("Document embeddings stored successfully in MongoDB Atlas.") + except Exception as e: + logger.error(f"Failed to ingest PDF file {pdf_file_path}: {e}") + raise def query_with_context( self, @@ -120,6 +140,8 @@ def query_with_context( conversation_history: Optional[list] = None, k: int = 5, score_threshold: float = 0.2, + search_type: str = "similarity", + lambda_mult: float = 0.5, ): """ Answer a query using the RAG pipeline with verbose debugging and conversation history. @@ -129,6 +151,8 @@ def query_with_context( - conversation_history (list): List of previous messages in the conversation. - k (int): Number of retrieved documents. - score_threshold (float): Similarity score threshold for retrieval. + - search_type (str): Type of search ("similarity" or "mmr"). + - lambda_mult (float): Diversity factor for MMR (0.0 to 1.0). Returns: - str: The assistant's response. @@ -136,11 +160,26 @@ def query_with_context( if not self.vector_store: raise ValueError("No vector store found. Please ingest a document first.") - if not self.retriever: - self.retriever = self.vector_store.as_retriever( - search_type="similarity_score_threshold", - search_kwargs={"k": k, "score_threshold": score_threshold}, - ) + # Reset retriever if search parameters change (simplified approach) + # In a more complex app, we might check if params changed. + # Here we just re-create it to be safe and simple. 
+ search_kwargs = {"k": k} + + if search_type == "similarity": + search_kwargs["score_threshold"] = score_threshold + search_type_arg = "similarity_score_threshold" + elif search_type == "mmr": + search_kwargs["lambda_mult"] = lambda_mult + search_type_arg = "mmr" + else: + # Fallback + search_kwargs["score_threshold"] = score_threshold + search_type_arg = "similarity_score_threshold" + + self.retriever = self.vector_store.as_retriever( + search_type=search_type_arg, + search_kwargs=search_kwargs, + ) # Generate and log query embeddings query_embedding = self.embeddings.embed_query(query) diff --git a/apps/local-rag-pdf/rag_module.py:Zone.Identifier b/apps/local-rag-pdf/rag_module.py:Zone.Identifier new file mode 100644 index 00000000..2986c515 Binary files /dev/null and b/apps/local-rag-pdf/rag_module.py:Zone.Identifier differ diff --git a/apps/local-rag-pdf/requirements.txt b/apps/local-rag-pdf/requirements.txt index 454e6cdf..083404bf 100644 --- a/apps/local-rag-pdf/requirements.txt +++ b/apps/local-rag-pdf/requirements.txt @@ -4,4 +4,4 @@ langchain_ollama langchain_community langchain-mongodb pymongo -pypdf +pymupdf diff --git a/apps/local-rag-pdf/requirements.txt:Zone.Identifier b/apps/local-rag-pdf/requirements.txt:Zone.Identifier new file mode 100644 index 00000000..2986c515 Binary files /dev/null and b/apps/local-rag-pdf/requirements.txt:Zone.Identifier differ