Skip to content

Commit b4de839

Browse files
authored
Several confluence loader improvements (langchain-ai#3300)
This PR makes several improvements: - Previously it was not possible to load spaces of more than 100 pages. The `limit` was being used both as an overall page limit *and* as a per-request pagination limit. This, in combination with the fact that Atlassian seems to use a server-side hard limit of 100 when page content is expanded, meant it wasn't possible to download more than 100 pages. Now `limit` is used *only* as a per-request pagination limit, and `max_pages` is introduced as the way to limit the total number of pages returned by the paginator. - Document metadata now includes `source` (the source URL), making it compatible with `RetrievalQAWithSourcesChain`. - It is now possible to include inline and footer comments. - It is now possible to pass `verify_ssl=False` and other parameters to the Confluence object for use cases that require it.
1 parent 651cb62 commit b4de839

File tree

2 files changed

+87
-24
lines changed

2 files changed

+87
-24
lines changed

langchain/document_loaders/confluence.py

Lines changed: 71 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
6060
:type min_retry_seconds: Optional[int], optional
6161
:param max_retry_seconds: defaults to 10
6262
:type max_retry_seconds: Optional[int], optional
63+
:param confluence_kwargs: additional kwargs to initialize confluence with
64+
:type confluence_kwargs: dict, optional
6365
:raises ValueError: Errors while validating input
6466
:raises ImportError: Required dependencies not installed.
6567
"""
@@ -74,7 +76,9 @@ def __init__(
7476
number_of_retries: Optional[int] = 3,
7577
min_retry_seconds: Optional[int] = 2,
7678
max_retry_seconds: Optional[int] = 10,
79+
confluence_kwargs: Optional[dict] = None,
7780
):
81+
confluence_kwargs = confluence_kwargs or {}
7882
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
7983
if errors:
8084
raise ValueError(f"Error(s) while validating input: {errors}")
@@ -93,10 +97,16 @@ def __init__(
9397
)
9498

9599
if oauth2:
96-
self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
100+
self.confluence = Confluence(
101+
url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
102+
)
97103
else:
98104
self.confluence = Confluence(
99-
url=url, username=username, password=api_key, cloud=cloud
105+
url=url,
106+
username=username,
107+
password=api_key,
108+
cloud=cloud,
109+
**confluence_kwargs,
100110
)
101111

102112
@staticmethod
@@ -147,7 +157,9 @@ def load(
147157
label: Optional[str] = None,
148158
cql: Optional[str] = None,
149159
include_attachments: bool = False,
160+
include_comments: bool = False,
150161
limit: Optional[int] = 50,
162+
max_pages: Optional[int] = 1000,
151163
) -> List[Document]:
152164
"""
153165
:param space_key: Space key retrieved from a confluence URL, defaults to None
@@ -160,8 +172,12 @@ def load(
160172
:type cql: Optional[str], optional
161173
:param include_attachments: defaults to False
162174
:type include_attachments: bool, optional
163-
:param limit: Maximum number of pages to retrieve, defaults to 50
175+
:param include_comments: defaults to False
176+
:type include_comments: bool, optional
177+
:param limit: Maximum number of pages to retrieve per request, defaults to 50
164178
:type limit: int, optional
179+
:param max_pages: Maximum number of pages to retrieve in total, defaults 1000
180+
:type max_pages: int, optional
165181
:raises ValueError: _description_
166182
:raises ImportError: _description_
167183
:return: _description_
@@ -191,29 +207,41 @@ def load(
191207
self.confluence.get_all_pages_from_space,
192208
space=space_key,
193209
limit=limit,
210+
max_pages=max_pages,
194211
expand="body.storage.value",
195212
)
196213
for page in pages:
197-
doc = self.process_page(page, include_attachments, text_maker)
214+
doc = self.process_page(
215+
page, include_attachments, include_comments, text_maker
216+
)
198217
docs.append(doc)
199218

200219
if label:
201220
pages = self.paginate_request(
202221
self.confluence.get_all_pages_by_label,
203222
label=label,
204223
limit=limit,
224+
max_pages=max_pages,
205225
expand="body.storage.value",
206226
)
207227
for page in pages:
208-
doc = self.process_page(page, include_attachments, text_maker)
228+
doc = self.process_page(
229+
page, include_attachments, include_comments, text_maker
230+
)
209231
docs.append(doc)
210232

211233
if cql:
212234
pages = self.paginate_request(
213-
self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
235+
self.confluence.cql,
236+
cql=cql,
237+
limit=limit,
238+
max_pages=max_pages,
239+
expand="body.storage.value",
214240
)
215241
for page in pages:
216-
doc = self.process_page(page, include_attachments, text_maker)
242+
doc = self.process_page(
243+
page, include_attachments, include_comments, text_maker
244+
)
217245
docs.append(doc)
218246

219247
if page_ids:
@@ -231,19 +259,23 @@ def load(
231259
before_sleep=before_sleep_log(logger, logging.WARNING),
232260
)(self.confluence.get_page_by_id)
233261
page = get_page(page_id=page_id, expand="body.storage.value")
234-
doc = self.process_page(page, include_attachments, text_maker)
262+
doc = self.process_page(
263+
page, include_attachments, include_comments, text_maker
264+
)
235265
docs.append(doc)
236266

237267
return docs
238268

239269
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
240270
"""Paginate the various methods to retrieve groups of pages.
241271
242-
Unforunately, due to page size, sometimes the Confluence API
243-
doesn't match the limit value. Also, due to the Atlassian Python
272+
Unfortunately, due to page size, sometimes the Confluence API
273+
doesn't match the limit value. If `limit` is >100 confluence
274+
seems to cap the response to 100. Also, due to the Atlassian Python
244275
package, we don't get the "next" values from the "_links" key because
245276
they only return the value from the results key. So here, the pagination
246-
starts from 0 and goes until the limit. We have to manually check if there
277+
starts from 0 and goes until the max_pages, getting the `limit` number
278+
of pages with each request. We have to manually check if there
247279
are more docs based on the length of the returned list of pages, rather than
248280
just checking for the presence of a `next` key in the response like this page
249281
would have you do:
@@ -255,10 +287,9 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
255287
:rtype: List
256288
"""
257289

258-
limit = kwargs["limit"]
259-
page = 0
260-
docs = []
261-
while page < limit:
290+
max_pages = kwargs.pop("max_pages")
291+
docs: List[dict] = []
292+
while len(docs) < max_pages:
262293
get_pages = retry(
263294
reraise=True,
264295
stop=stop_after_attempt(
@@ -271,16 +302,18 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
271302
),
272303
before_sleep=before_sleep_log(logger, logging.WARNING),
273304
)(retrieval_method)
274-
batch = get_pages(**kwargs, start=page)
275-
if len(batch) < limit:
276-
page = limit
277-
else:
278-
page += len(batch)
305+
batch = get_pages(**kwargs, start=len(docs))
306+
if not batch:
307+
break
279308
docs.extend(batch)
280-
return docs
309+
return docs[:max_pages]
281310

282311
def process_page(
283-
self, page: dict, include_attachments: bool, text_maker: Any
312+
self,
313+
page: dict,
314+
include_attachments: bool,
315+
include_comments: bool,
316+
text_maker: Any,
284317
) -> Document:
285318
if include_attachments:
286319
attachment_texts = self.process_attachment(page["id"])
@@ -289,8 +322,23 @@ def process_page(
289322
text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
290323
attachment_texts
291324
)
325+
if include_comments:
326+
comments = self.confluence.get_page_comments(
327+
page["id"], expand="body.view.value", depth="all"
328+
)["results"]
329+
comment_texts = [
330+
text_maker.handle(comment["body"]["view"]["value"])
331+
for comment in comments
332+
]
333+
text = text + "".join(comment_texts)
334+
292335
return Document(
293-
page_content=text, metadata={"title": page["title"], "id": page["id"]}
336+
page_content=text,
337+
metadata={
338+
"title": page["title"],
339+
"id": page["id"],
340+
"source": self.base_url.strip("/") + page["_links"]["webui"],
341+
},
294342
)
295343

296344
def process_attachment(self, page_id: str) -> List[str]:

tests/integration_tests/document_loaders/test_confluence.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
1919
assert docs[0].page_content is not None
2020
assert docs[0].metadata["id"] == "33189"
2121
assert docs[0].metadata["title"] == "An easy intro to using Confluence"
22+
assert docs[0].metadata["source"] == (
23+
"https://templates.atlassian.net/wiki/"
24+
"spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
25+
)
2226

2327

2428
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
3337
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
3438
def test_confluence_pagination() -> None:
3539
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
36-
docs = loader.load(space_key="RD", limit=5)
40+
# this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
41+
docs = loader.load(space_key="RD", limit=3, max_pages=5)
3742

3843
assert len(docs) == 5
3944
assert docs[0].page_content is not None
45+
46+
47+
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
48+
def test_pass_confluence_kwargs() -> None:
49+
loader = ConfluenceLoader(
50+
url="https://templates.atlassian.net/wiki/",
51+
confluence_kwargs={"verify_ssl": False},
52+
)
53+
54+
assert loader.confluence.verify_ssl is False

0 commit comments

Comments
 (0)