@@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
     :type min_retry_seconds: Optional[int], optional
     :param max_retry_seconds: defaults to 10
     :type max_retry_seconds: Optional[int], optional
+    :param confluence_kwargs: additional kwargs to initialize confluence with
+    :type confluence_kwargs: dict, optional
     :raises ValueError: Errors while validating input
     :raises ImportError: Required dependencies not installed.
     """
@@ -74,7 +76,9 @@ def __init__(
         number_of_retries: Optional[int] = 3,
         min_retry_seconds: Optional[int] = 2,
         max_retry_seconds: Optional[int] = 10,
+        confluence_kwargs: Optional[dict] = None,
     ):
+        confluence_kwargs = confluence_kwargs or {}
         errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
         if errors:
             raise ValueError(f"Error(s) while validating input: {errors}")
@@ -93,10 +97,16 @@ def __init__(
             )

         if oauth2:
-            self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
+            self.confluence = Confluence(
+                url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
+            )
         else:
             self.confluence = Confluence(
-                url=url, username=username, password=api_key, cloud=cloud
+                url=url,
+                username=username,
+                password=api_key,
+                cloud=cloud,
+                **confluence_kwargs,
             )

     @staticmethod
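The new confluence_kwargs dict is forwarded unchanged to the underlying atlassian-python-api Confluence client in both the OAuth2 and the username/API-key branches. A minimal usage sketch, with placeholder URL and credentials, assuming the client accepts a timeout option (not part of this change):

    from langchain.document_loaders import ConfluenceLoader

    # Extra client options pass straight through to Confluence(...).
    loader = ConfluenceLoader(
        url="https://example.atlassian.net/wiki",  # placeholder site
        username="me@example.com",                 # placeholder credentials
        api_key="my-api-token",
        confluence_kwargs={"timeout": 60},         # assumed client option
    )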
@@ -147,7 +157,9 @@ def load(
         label: Optional[str] = None,
         cql: Optional[str] = None,
         include_attachments: bool = False,
+        include_comments: bool = False,
         limit: Optional[int] = 50,
+        max_pages: Optional[int] = 1000,
     ) -> List[Document]:
         """
         :param space_key: Space key retrieved from a confluence URL, defaults to None
@@ -160,8 +172,12 @@ def load(
         :type cql: Optional[str], optional
         :param include_attachments: defaults to False
         :type include_attachments: bool, optional
-        :param limit: Maximum number of pages to retrieve, defaults to 50
+        :param include_comments: defaults to False
+        :type include_comments: bool, optional
+        :param limit: Maximum number of pages to retrieve per request, defaults to 50
         :type limit: int, optional
+        :param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
+        :type max_pages: int, optional
         :raises ValueError: _description_
         :raises ImportError: _description_
         :return: _description_
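Taken together, limit now controls the page size of each API request while max_pages caps the total number of pages fetched, and include_comments folds page comments into each document's text. A usage sketch with placeholder values:

    # Pull at most 1000 pages from one space, 50 per request,
    # appending each page's comments to its text.
    docs = loader.load(
        space_key="ENG",        # placeholder space key
        include_comments=True,
        limit=50,               # batch size per request
        max_pages=1000,         # overall cap across batches
    )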
@@ -191,29 +207,41 @@ def load(
                 self.confluence.get_all_pages_from_space,
                 space=space_key,
                 limit=limit,
+                max_pages=max_pages,
                 expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if label:
             pages = self.paginate_request(
                 self.confluence.get_all_pages_by_label,
                 label=label,
                 limit=limit,
+                max_pages=max_pages,
                 expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if cql:
             pages = self.paginate_request(
-                self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
+                self.confluence.cql,
+                cql=cql,
+                limit=limit,
+                max_pages=max_pages,
+                expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if page_ids:
@@ -231,19 +259,23 @@ def load(
                     before_sleep=before_sleep_log(logger, logging.WARNING),
                 )(self.confluence.get_page_by_id)
                 page = get_page(page_id=page_id, expand="body.storage.value")
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         return docs

     def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
         """Paginate the various methods to retrieve groups of pages.

-        Unforunately, due to page size, sometimes the Confluence API
-        doesn't match the limit value. Also, due to the Atlassian Python
+        Unfortunately, due to page size, sometimes the Confluence API
+        doesn't match the limit value. If `limit` is >100, Confluence
+        seems to cap the response at 100. Also, due to the Atlassian Python
         package, we don't get the "next" values from the "_links" key because
         they only return the value from the results key. So here, the pagination
-        starts from 0 and goes until the limit. We have to manually check if there
+        starts from 0 and goes until `max_pages`, getting `limit` pages
+        with each request. We have to manually check if there
         are more docs based on the length of the returned list of pages, rather than
         just checking for the presence of a `next` key in the response like this page
         would have you do:
@@ -255,10 +287,9 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
         :rtype: List
         """

-        limit = kwargs["limit"]
-        page = 0
-        docs = []
-        while page < limit:
+        max_pages = kwargs.pop("max_pages")
+        docs: List[dict] = []
+        while len(docs) < max_pages:
             get_pages = retry(
                 reraise=True,
                 stop=stop_after_attempt(
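The rewritten loop pages by offset: each call starts at len(docs), the loop stops when a batch comes back empty or the max_pages cap is reached, and a final slice trims any overshoot. A stripped-down sketch of the same pattern, with the tenacity retry wrapper omitted and fetch_batch standing in for the wrapped retrieval method:

    from typing import Any, Callable, List

    def paginate(
        fetch_batch: Callable[..., List[dict]], max_pages: int, **kwargs: Any
    ) -> List[dict]:
        docs: List[dict] = []
        while len(docs) < max_pages:
            # Request the next chunk, offset by what we already hold.
            batch = fetch_batch(**kwargs, start=len(docs))
            if not batch:  # the API has no more results
                break
            docs.extend(batch)
        return docs[:max_pages]  # trim overshoot from the final batch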
@@ -271,16 +302,18 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
                 ),
                 before_sleep=before_sleep_log(logger, logging.WARNING),
             )(retrieval_method)
-            batch = get_pages(**kwargs, start=page)
-            if len(batch) < limit:
-                page = limit
-            else:
-                page += len(batch)
+            batch = get_pages(**kwargs, start=len(docs))
+            if not batch:
+                break
             docs.extend(batch)
-        return docs
+        return docs[:max_pages]

     def process_page(
-        self, page: dict, include_attachments: bool, text_maker: Any
+        self,
+        page: dict,
+        include_attachments: bool,
+        include_comments: bool,
+        text_maker: Any,
     ) -> Document:
         if include_attachments:
             attachment_texts = self.process_attachment(page["id"])
@@ -289,8 +322,23 @@ def process_page(
         text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
             attachment_texts
         )
+        if include_comments:
+            comments = self.confluence.get_page_comments(
+                page["id"], expand="body.view.value", depth="all"
+            )["results"]
+            comment_texts = [
+                text_maker.handle(comment["body"]["view"]["value"])
+                for comment in comments
+            ]
+            text = text + "".join(comment_texts)
+
         return Document(
-            page_content=text, metadata={"title": page["title"], "id": page["id"]}
+            page_content=text,
+            metadata={
+                "title": page["title"],
+                "id": page["id"],
+                "source": self.base_url.strip("/") + page["_links"]["webui"],
+            },
         )

     def process_attachment(self, page_id: str) -> List[str]:
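Each Document now also records a source URL assembled from the loader's base_url and the page's webui link, alongside the existing title and id. A quick way to inspect the new metadata after loading (the URL shape in the comment is illustrative):

    for doc in docs:
        # e.g. "https://example.atlassian.net/wiki/spaces/ENG/pages/123456"
        print(doc.metadata["title"], doc.metadata["source"])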