From 9dd7937613c50a5583eff4093946b5a7056baecc Mon Sep 17 00:00:00 2001 From: Mateus Suman Carpenter Date: Mon, 24 Nov 2025 00:26:02 -0300 Subject: [PATCH] feat: upgrade PDF loader to PyMuPDF and add MMR search --- apps/local-rag-pdf/README.md | 84 +++++++++++++++++- apps/local-rag-pdf/README.md:Zone.Identifier | Bin 0 -> 93 bytes .../__pycache__/rag_module.cpython-312.pyc | Bin 0 -> 9932 bytes apps/local-rag-pdf/app.py | 28 ++++++ apps/local-rag-pdf/app.py:Zone.Identifier | Bin 0 -> 92 bytes .../local-rag-pdf/config.yaml:Zone.Identifier | Bin 0 -> 92 bytes apps/local-rag-pdf/rag_module.py | 81 ++++++++++++----- .../rag_module.py:Zone.Identifier | Bin 0 -> 92 bytes apps/local-rag-pdf/requirements.txt | 2 +- .../requirements.txt:Zone.Identifier | Bin 0 -> 92 bytes 10 files changed, 172 insertions(+), 23 deletions(-) create mode 100644 apps/local-rag-pdf/README.md:Zone.Identifier create mode 100644 apps/local-rag-pdf/__pycache__/rag_module.cpython-312.pyc create mode 100644 apps/local-rag-pdf/app.py:Zone.Identifier create mode 100644 apps/local-rag-pdf/config.yaml:Zone.Identifier create mode 100644 apps/local-rag-pdf/rag_module.py:Zone.Identifier create mode 100644 apps/local-rag-pdf/requirements.txt:Zone.Identifier diff --git a/apps/local-rag-pdf/README.md b/apps/local-rag-pdf/README.md index 05008310..f4a7c03a 100644 --- a/apps/local-rag-pdf/README.md +++ b/apps/local-rag-pdf/README.md @@ -1 +1,83 @@ -# local-rag-deepseek-mongodb +# Local RAG with PDF, Ollama, and MongoDB Atlas + +This application demonstrates a Retrieval-Augmented Generation (RAG) pipeline using **Ollama** for local LLMs and embeddings, and **MongoDB Atlas** as the vector store. It allows users to upload a PDF, index its content, and ask questions based on the document's context. + +## Features + +- **PDF Ingestion**: Upload and parse PDF documents. +- **Chunking & Embedding**: Splits text into manageable chunks and generates embeddings using Ollama. 
+- **Vector Storage**: Stores embeddings in MongoDB Atlas Vector Search. +- **Context-Aware QA**: Retrieves relevant context to answer user queries using a local LLM. +- **Conversation History**: Maintains context across multiple turns of conversation. + +## Prerequisites + +Before running this application, ensure you have the following: + +1. **Python 3.9+**: Installed on your system. +2. **MongoDB Atlas Cluster**: + - Create a [free account](https://www.mongodb.com/cloud/atlas/register). + - Deploy a cluster (M0 sandbox is sufficient). + - Get your connection string. +3. **Ollama**: + - Download and install [Ollama](https://ollama.com/). + - Pull the required models: + ```bash + ollama pull llama3 + ollama pull nomic-embed-text + ``` + *(Note: You can configure different models in `config.yaml`)* + +## Installation + +1. **Clone the repository** (if you haven't already): + ```bash + git clone YOUR_REPOSITORY_URL + cd YOUR_REPOSITORY_DIR/apps/local-rag-pdf + ``` + +2. **Create a virtual environment**: + ```bash + python -m venv .venv + source .venv/bin/activate # On Windows: .venv\Scripts\activate + ``` + +3. **Install dependencies**: + ```bash + pip install -r requirements.txt + ``` + +4. **Configure the application**: + - Open `config.yaml`. + - Update `mongo_connection_str` with your Atlas connection string. + - (Optional) Change `llm_model` or `embedding_model` if you want to use different Ollama models. + +## Usage + +1. **Run the application**: + ```bash + streamlit run app.py + ``` + +2. **Interact with the UI**: + - Upload a PDF file using the sidebar. + - Wait for the ingestion process to complete (check the logs in the terminal). + - Type your question in the chat input box. + +## Architecture + +1. **User** uploads a PDF. +2. **PyMuPDF** extracts text from the PDF. +3. **LangChain** splits the text into chunks. +4. **Ollama** generates vector embeddings for each chunk. +5. **MongoDB Atlas** stores these embeddings. +6. 
When a **User** asks a question: + - The question is embedded using **Ollama**. + - **MongoDB Atlas** performs a vector search to find relevant chunks. + - The retrieved chunks + the question are sent to the **Ollama** LLM. + - The LLM generates a response based on the context. + +## Troubleshooting + +- **Connection Error**: Ensure your IP address is whitelisted in MongoDB Atlas Network Access. +- **Ollama Error**: Make sure the Ollama service is running locally (`ollama serve`). diff --git a/apps/local-rag-pdf/README.md:Zone.Identifier b/apps/local-rag-pdf/README.md:Zone.Identifier new file mode 100644 index 0000000000000000000000000000000000000000..be3d70db482aee00311ec98148fcbed059faa828 GIT binary patch literal 93 zcma!!%Fjy;DN4*MPD?F{<>dl#JyUFrdAWj8fg(kzMWIDGw$4^Dp~b01#WBwLxdo*q si7EL-F)sP#c{%xsDaA4Fsd4j@Krjj*lf@1R3nXA0lEoN;Asd`zb%SPXOCCwi%m}a| zCsc~slB;r+@MZ1XUMk+(d|(T%&L91etJb2@m~th~Bhs!bkH?(Vq@X z1ZdtR2GdOwO*HQocchyqnrYr6wxmN7p>*p+EB*G0ZRz%jcAEEz9qI5ym?0J-xOx9w z7$-jT+tct(QUkSGb1IP3%jWM|{$5}UCz5*Q$23BhPBYQXx9~xsXVStq-Lg-x7IKF0 zJ8lua`L3nbZlZUNiH4rw$5@n6UFT<&WH!TzkoRAhJD0mKe0ns?@qz>e?UPAS5u|t` zo1PVg8}YQDa6G4QQH$!91trc4S94RU7qiy{>1tLMpiyvKk$!i@8jOyDAD7I9XPvrEG3$8j3s4WiwOR;Zr9Sk(1vR5=vGY7dR;~ z4ZV7h(Roqi(wy2bqBwfE9`+ze6dE@bGxy`GPStJ6PV0bbCvV+|nq%uA7$_Xb zorer;lPRndvr}@6u1P}Mj&!1ilKmT$Zc#Erapo;kVDkQfnYR|K!{q$~gv{Ftwu0pv zBQu}66oXsJu8Gd{;w=T+ugRAdT?rYYIQ8nrx=$DzDS&WMZ1Z-=*=WuIIXlfcA?Khu z7v!8Y=T_W0zk>aqOYf&Z;>KtceMw2dv4z4IiGr$1A08I#o>$)@z{TLMwCXgeHa%B= z^FCA0KD}J&)N=(V?+0_L$qM)ej2e>KspsqO0#%-Z8|MhrLA|CxQr&vK{!aDQ^Lj36 z7>Rk0X^nR4_4PL&EO;8$=mouY`~INR7{5*ls@}x!xYvAFGeR)x7QGJW_v-ojI~A?x z^&H=FFQiKnbfvCa+l{-mPRX}Hf5r;UwMExmFV>wA+Id)nQXD4$a3+5+@Vo_KwqZ=39-pDM@ zfM>xIpd53vXp7OvWD~hGI5lX((;Aor^e9uO5%X;6amZA~z5!ijrMaV9C_Xf|j!)M} z=6}RR0q0tCNW)t9@{jo-QA5+WG~?Motk1E58j!=aB##1uQ6-YHaDe3!;OD`EX#!&R z%}T&r5_qM z0*YKlIm)WOYZU0jlNnyPp*j*-&_f|=13fZyRCOC#u6ih4O^usjV8S)rLvcBgm4x_Y zCW(**M&#j>pb|3Zznt%=K`C||7F%HJyOAG@woxChI>0+k36g3TBq=MYuCv122<9X- 
zr4m{<35~ewF~%WFXn|Beu5et|K$e8oTJ)*hL`)c$DJaw==9UCSN&=LVqF&V|3*uxA`G6!5 zq^N#L_#g*M&c|Ul%c>R9zqq4SHw*@B2MkNbVQ2;nUs5{`%N2)SlS(|EZ_>FoJx{}=n0)8)v(O1vByL9S5T8eTkq`}}(7a5;2%<c>Sc94^VYx8yVKOWRqqRWcYSGm z0dz1RUBSxRcz48rLjYl4BIG?-gL&J$-PD%>SL&nUJp~)@%|ik3_Z94^P`ysi0e61{ z>jPif_0cfo)5A8R;FhIN!5A5UXp+KnOT&{gpklqEV7(Wtb2N=~yESR5Q_k6T6U6|P zd%%FsJ4~$+KQui24qQ3Dx!?eeGp*kzgtyH*K5|@Vt`qHK*pg3sbhC7sA^__4saIe* zdRjsCKfqGx!6p?MT_-p~MF!sq0D)Rm>hbEv9IzOxdoFtTt~obC zI)3>H$~x~DLQKUzLB-B{(7ED-j*?Ws^Mi#bEFf8`R5XV31k3U z00Dk9?|kHR?c@fdsm<3ADf`?4^KFvwuq$D>Mnb5q+7N}Q&g-0%fd;Bo6fzPhIJh_J z7ESKWEo1afd*@&7*t1{(orNmiKI4| zwG4h01thCXQnk~p$_O$VCQVgp-YZmtjB!wdn)5^mli#U(vh8YTfQLLpz1oJW!`=MQ z*ZkSpg|nr|kyZc7$EA)l72g{VTi83tD=n|Cx11=qoLFmly*T{GmUeV*f!l#n?@w3H zt-e<3Ia_HOtv0oNS7Wf$_x0{&{#cp|*1q-DBjwg3 ztG2b)KLl{hS7)YU7oyi@ea3~THE;&1Q-05nA@(; zna>9`=el%lnY+XMCb4|^&WkIT*9Xs(2hTjPJ!mNno+}NEJxG+gE`HlIzG20Nz^?7! zowwIo2a2b`k@wW3fDF)mnEz+^hic87_U5=i=#{ zs3gpu$?$$I(Aq`x3^16+NA}jESrDnx{KzK^gbJ)lX5Lw_^VS)_ZjuWE5Ki893>fhC zc~>R`H4d%D0X0rqoF3^skaJ)3EF7UwcQNxomd}6(#9(|j4 zD~2cB&j0c5MlWolH+)&0Gk7L9@8x~e^Y|6RcNN^a)EYgKiQhK<7lHr{Xai=6!J&pH zWgN2W6{cBw3Ld%5)RW->>(sp2Q|&O(Za1oC)66`knYC>3^%y0gPr;xud3o==&(zCy zvJ4fxe5>Z=+X}u^MCVt3gRI-BtUDULd;ubzd^m;Y^MgLxy>;HN7~Wc^fOTOTI)cwU>@}#tLqNlWHHs0?(Yavhe841^?WEtWj|Pxj zVB3{9?i%KUCSKdgxTg@@ie?MJaS~;@Vc1|E9Z;O4F$aJV)COsR8H_bxWOFt-D~ND5 zt%W$jHeuA?6yZd0VK!Q<7CSISAmCa6qAd`PH^mALYMt$aI8F2@dkG^OaCgur8*$b; z7&*A51KldJ``~n~K-*DxvDrx$A_CWv*__O#!O>Hw0e&8*VIH2D>kPXO@CR@h%Rx9v zLLU05vL0TM%`F;?FuQLOU;uI$Po|S1Cnc3RmeSB*1U1w&nyj(q#4XI~2AtK!0LC1L zQM2vrzFo4xcNYujYgalg?TVV2m<)e?G{bNo0+%8%m`S)2LDU9+=gwVZ_Z^BIVgcj4 z6gvc53vM#a>d;gZw>f9zZ~ zNBf}n4zavP8wCWB06^>X`{ZceJ*>z7q!E~h^d=^!F*yT?%6L?gw|G2T?zYgY9)`co z0Z7imNf8mj`%Pez;MBVvGWZp<$iS(p*+%BbKU(32ih(Oc=>oKdkf$YfNVUR{w0kBB zHlMqXO$iy{#;kNAe^5u5RAh0yBB_Di8%pdLji<>cVg~tt0#g}d{QpD#e3N{$Yi!lJ ze0Y8L;cs^zF7F!qUl~#NCv6=})GAFtR&{C;Rb9ZIN-Mt$?zRwU2CQ zJ0nPEv1AMrl$2^qWs?~Rl}$q7*8*z@W#Cjn5&ab`#Yxd!L3aYxaH{Q60xAIh)s00Q 
z)LjkMglZgg)PO!@dW4Xr%g7JcR1JdK!sVX=v9M$YZfWtX0JnasOP7%h8lWE;bf2I_ zzM6l$cKOI(Ue^U+B)5Z_-6R_^(iB<`vKa7c-dXCpyw?17$@ezIvm#wf+>)#4f9!2r zI`EBmZ?*qK@mwhaX!}65`{=^?s=s4t)|8i@R4dqUp3OVVJDGSHi2FsDb6=5xMtQgb+ zQ=f~=gUfxT(2Et7%rW86>aUHLQyT0oFqI*3YD~Drua%sb!0we`c9#{DLl%Xi;vs87C*fGA?&8rk1D&y*4oFa9bHeoj!sW8_$);H?3%B)8fgA( zW?|+j9E>85A)i`EJ+)i>VK^fNyVn9ePn}jjyJ02nz_WlZfY2)y*Q?c*j>TiQk1Yq+ zS`HOmnlwNEXnA^hqSSh*;(DnXY%d0-*FizHm?VVv__v8i2e=7^wDxQtoU214IGm6; z8Z`RMS)$hbdnShlKT)8*U*8oHje=yg-#$PLAOQ|aq9j|~7g+c#m}UvQ8JyTn&OhG= zRzpxUi%eU*Bs-faBHpIvk zjW25|7dl>Xod8|K04Npd@0h&ygN7LXA|PrXdxBwwslWRIpMQN#`WyU$KX@L$7WHF9 z8*eV+an%#o?)f3_i-UFJM6Jdd$6y2$TLBD6uR&Ak9ZYb!ApQZUOSJ-iQtcE3Npo2D zAtb*ebWGGAND(Z>1QE6T2_(13A8a#EZD$yJ_wvvt!S}{R%T9aq%I-}9uZ>d-IW$n+ z(Om7_Q|;`lcCk-`ml=D<<6!fK9W!7Edis`?FESfW`spIR;D(!K!MOBncxl#0+QJ)t znhlV)&P~c=Gh?yayKBRvBP+mwqPtWl-h^Nr#{tJ_c+)vbA*fXMh+Jlh8cc9r>2wb6 z0p?=5Bc?yP0H{@q3GLmEw5DA&3MI#KN>Y?*#7Av26H7vDO3Ys6pb}4i5*!nV+SNdj zrN6O(2si%7MwZZn{D@2K)1L|%DBhUn+eBhvHKK^RYQ&#mAeh_^UgAv-{yn9xnDbY~? 
literal 0 HcmV?d00001 diff --git a/apps/local-rag-pdf/app.py b/apps/local-rag-pdf/app.py index 6d30a62e..24236bd5 100644 --- a/apps/local-rag-pdf/app.py +++ b/apps/local-rag-pdf/app.py @@ -63,6 +63,8 @@ def process_query(): conversation_history=conversation_history, k=st.session_state["retrieval_k"], score_threshold=st.session_state["retrieval_threshold"], + search_type=st.session_state.get("search_type", "similarity"), + lambda_mult=st.session_state.get("lambda_mult", 0.5), ) except ValueError as e: agent_text = str(e) @@ -141,6 +143,32 @@ def page(): # Display messages and text input display_messages() + # Sidebar settings + with st.sidebar: + st.header("Retrieval Settings") + search_type = st.radio( + "Search Type", + options=["similarity", "mmr"], + format_func=lambda x: "Similarity" + if x == "similarity" + else "MMR (Diversity)", + index=0, + ) + + lambda_mult = 0.5 + if search_type == "mmr": + lambda_mult = st.slider( + "Diversity (Lambda)", + min_value=0.0, + max_value=1.0, + value=0.5, + step=0.1, + help="0.0 = Maximum Diversity, 1.0 = Maximum Relevance", + ) + + st.session_state["search_type"] = search_type + st.session_state["lambda_mult"] = lambda_mult + # Accept user input using the new chat input prompt = st.chat_input("Type your message here...") if prompt: diff --git a/apps/local-rag-pdf/app.py:Zone.Identifier b/apps/local-rag-pdf/app.py:Zone.Identifier new file mode 100644 index 0000000000000000000000000000000000000000..2986c5156db70723e57552d0761cf6f8bf32406a GIT binary patch literal 92 zcma!!%Fjy;DN4*MPD?F{<>dl#JyUFrdAWj8fg(kzMWIDGw$4^Dp~b01#WBwLxdo*q ri7EL-F)sP#c{%xsDaA4Fsddl#JyUFrdAWj8fg(kzMWIDGw$4^Dp~b01#WBwLxdo*q ri7EL-F)sP#c{%xsDaA4Fsddl#JyUFrdAWj8fg(kzMWIDGw$4^Dp~b01#WBwLxdo*q ri7EL-F)sP#c{%xsDaA4Fsddl#JyUFrdAWj8fg(kzMWIDGw$4^Dp~b01#WBwLxdo*q ri7EL-F)sP#c{%xsDaA4Fsd