Skip to content

Commit b4de839

Browse files
authored
Several confluence loader improvements (langchain-ai#3300)
This PR makes several improvements: - Previously it was not possible to load spaces of more than 100 pages. The `limit` was being used both as an overall page limit *and* as a per-request pagination limit. This, in combination with the fact that Atlassian seems to use a server-side hard limit of 100 when page content is expanded, meant it wasn't possible to download more than 100 pages. Now `limit` is used *only* as a per-request pagination limit, and `max_pages` is introduced as the way to limit the total number of pages returned by the paginator. - Document metadata now includes `source` (the source URL), making it compatible with `RetrievalQAWithSourcesChain`. - It is now possible to include inline and footer comments. - It is now possible to pass `verify_ssl=False` and other parameters to the Confluence object for use cases that require it.
1 parent 651cb62 commit b4de839

File tree

2 files changed

+87
-24
lines changed

2 files changed

+87
-24
lines changed

langchain/document_loaders/confluence.py

Lines changed: 71 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
6060
:type min_retry_seconds: Optional[int], optional
6161
:param max_retry_seconds: defaults to 10
6262
:type max_retry_seconds: Optional[int], optional
63+
:param confluence_kwargs: additional kwargs to initialize confluence with
64+
:type confluence_kwargs: dict, optional
6365
:raises ValueError: Errors while validating input
6466
:raises ImportError: Required dependencies not installed.
6567
"""
@@ -74,7 +76,9 @@ def __init__(
7476
number_of_retries: Optional[int] = 3,
7577
min_retry_seconds: Optional[int] = 2,
7678
max_retry_seconds: Optional[int] = 10,
79+
confluence_kwargs: Optional[dict] = None,
7780
):
81+
confluence_kwargs = confluence_kwargs or {}
7882
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
7983
if errors:
8084
raise ValueError(f"Error(s) while validating input: {errors}")
@@ -93,10 +97,16 @@ def __init__(
9397
)
9498

9599
if oauth2:
96-
self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
100+
self.confluence = Confluence(
101+
url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
102+
)
97103
else:
98104
self.confluence = Confluence(
99-
url=url, username=username, password=api_key, cloud=cloud
105+
url=url,
106+
username=username,
107+
password=api_key,
108+
cloud=cloud,
109+
**confluence_kwargs,
100110
)
101111

102112
@staticmethod
@@ -147,7 +157,9 @@ def load(
147157
label: Optional[str] = None,
148158
cql: Optional[str] = None,
149159
include_attachments: bool = False,
160+
include_comments: bool = False,
150161
limit: Optional[int] = 50,
162+
max_pages: Optional[int] = 1000,
151163
) -> List[Document]:
152164
"""
153165
:param space_key: Space key retrieved from a confluence URL, defaults to None
@@ -160,8 +172,12 @@ def load(
160172
:type cql: Optional[str], optional
161173
:param include_attachments: defaults to False
162174
:type include_attachments: bool, optional
163-
:param limit: Maximum number of pages to retrieve, defaults to 50
175+
:param include_comments: defaults to False
176+
:type include_comments: bool, optional
177+
:param limit: Maximum number of pages to retrieve per request, defaults to 50
164178
:type limit: int, optional
179+
:param max_pages: Maximum number of pages to retrieve in total, defaults 1000
180+
:type max_pages: int, optional
165181
:raises ValueError: _description_
166182
:raises ImportError: _description_
167183
:return: _description_
@@ -191,29 +207,41 @@ def load(
191207
self.confluence.get_all_pages_from_space,
192208
space=space_key,
193209
limit=limit,
210+
max_pages=max_pages,
194211
expand="body.storage.value",
195212
)
196213
for page in pages:
197-
doc = self.process_page(page, include_attachments, text_maker)
214+
doc = self.process_page(
215+
page, include_attachments, include_comments, text_maker
216+
)
198217
docs.append(doc)
199218

200219
if label:
201220
pages = self.paginate_request(
202221
self.confluence.get_all_pages_by_label,
203222
label=label,
204223
limit=limit,
224+
max_pages=max_pages,
205225
expand="body.storage.value",
206226
)
207227
for page in pages:
208-
doc = self.process_page(page, include_attachments, text_maker)
228+
doc = self.process_page(
229+
page, include_attachments, include_comments, text_maker
230+
)
209231
docs.append(doc)
210232

211233
if cql:
212234
pages = self.paginate_request(
213-
self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
235+
self.confluence.cql,
236+
cql=cql,
237+
limit=limit,
238+
max_pages=max_pages,
239+
expand="body.storage.value",
214240
)
215241
for page in pages:
216-
doc = self.process_page(page, include_attachments, text_maker)
242+
doc = self.process_page(
243+
page, include_attachments, include_comments, text_maker
244+
)
217245
docs.append(doc)
218246

219247
if page_ids:
@@ -231,19 +259,23 @@ def load(
231259
before_sleep=before_sleep_log(logger, logging.WARNING),
232260
)(self.confluence.get_page_by_id)
233261
page = get_page(page_id=page_id, expand="body.storage.value")
234-
doc = self.process_page(page, include_attachments, text_maker)
262+
doc = self.process_page(
263+
page, include_attachments, include_comments, text_maker
264+
)
235265
docs.append(doc)
236266

237267
return docs
238268

239269
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
240270
"""Paginate the various methods to retrieve groups of pages.
241271
242-
Unforunately, due to page size, sometimes the Confluence API
243-
doesn't match the limit value. Also, due to the Atlassian Python
272+
Unfortunately, due to page size, sometimes the Confluence API
273+
doesn't match the limit value. If `limit` is >100 confluence
274+
seems to cap the response to 100. Also, due to the Atlassian Python
244275
package, we don't get the "next" values from the "_links" key because
245276
they only return the value from the results key. So here, the pagination
246-
starts from 0 and goes until the limit. We have to manually check if there
277+
starts from 0 and goes until the max_pages, getting the `limit` number
278+
of pages with each request. We have to manually check if there
247279
are more docs based on the length of the returned list of pages, rather than
248280
just checking for the presence of a `next` key in the response like this page
249281
would have you do:
@@ -255,10 +287,9 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
255287
:rtype: List
256288
"""
257289

258-
limit = kwargs["limit"]
259-
page = 0
260-
docs = []
261-
while page < limit:
290+
max_pages = kwargs.pop("max_pages")
291+
docs: List[dict] = []
292+
while len(docs) < max_pages:
262293
get_pages = retry(
263294
reraise=True,
264295
stop=stop_after_attempt(
@@ -271,16 +302,18 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
271302
),
272303
before_sleep=before_sleep_log(logger, logging.WARNING),
273304
)(retrieval_method)
274-
batch = get_pages(**kwargs, start=page)
275-
if len(batch) < limit:
276-
page = limit
277-
else:
278-
page += len(batch)
305+
batch = get_pages(**kwargs, start=len(docs))
306+
if not batch:
307+
break
279308
docs.extend(batch)
280-
return docs
309+
return docs[:max_pages]
281310

282311
def process_page(
283-
self, page: dict, include_attachments: bool, text_maker: Any
312+
self,
313+
page: dict,
314+
include_attachments: bool,
315+
include_comments: bool,
316+
text_maker: Any,
284317
) -> Document:
285318
if include_attachments:
286319
attachment_texts = self.process_attachment(page["id"])
@@ -289,8 +322,23 @@ def process_page(
289322
text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
290323
attachment_texts
291324
)
325+
if include_comments:
326+
comments = self.confluence.get_page_comments(
327+
page["id"], expand="body.view.value", depth="all"
328+
)["results"]
329+
comment_texts = [
330+
text_maker.handle(comment["body"]["view"]["value"])
331+
for comment in comments
332+
]
333+
text = text + "".join(comment_texts)
334+
292335
return Document(
293-
page_content=text, metadata={"title": page["title"], "id": page["id"]}
336+
page_content=text,
337+
metadata={
338+
"title": page["title"],
339+
"id": page["id"],
340+
"source": self.base_url.strip("/") + page["_links"]["webui"],
341+
},
294342
)
295343

296344
def process_attachment(self, page_id: str) -> List[str]:

tests/integration_tests/document_loaders/test_confluence.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
1919
assert docs[0].page_content is not None
2020
assert docs[0].metadata["id"] == "33189"
2121
assert docs[0].metadata["title"] == "An easy intro to using Confluence"
22+
assert docs[0].metadata["source"] == (
23+
"https://templates.atlassian.net/wiki/"
24+
"spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
25+
)
2226

2327

2428
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
3337
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
3438
def test_confluence_pagination() -> None:
3539
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
36-
docs = loader.load(space_key="RD", limit=5)
40+
# this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
41+
docs = loader.load(space_key="RD", limit=3, max_pages=5)
3742

3843
assert len(docs) == 5
3944
assert docs[0].page_content is not None
45+
46+
47+
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
48+
def test_pass_confluence_kwargs() -> None:
49+
loader = ConfluenceLoader(
50+
url="https://templates.atlassian.net/wiki/",
51+
confluence_kwargs={"verify_ssl": False},
52+
)
53+
54+
assert loader.confluence.verify_ssl is False

0 commit comments

Comments
 (0)