diff --git a/.gitignore b/.gitignore index bb91773..3cb2889 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ __pycache__/ PDS .DS_Store +/styles/00 archive/ +00 archive/ +.idea/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..bb133c6 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,190 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/ "null"), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html "null"). + +## \[2.5.0\] - 2025-11-22 + +Introduction of the "Architecture Sandbox" for offline restructuring. + +### Added + +- **Architecture Sandbox:** Introduced `create_editor.py` and `patch_sidebar.py`. Users can now generate a visual Drag & Drop editor (`editor_sidebar.html`) to restructure the exported documentation offline and apply changes massively using the patcher. + +- **Robust Editor Generation:** The editor generator now uses a safe string concatenation approach to avoid syntax errors and supports creating a working copy of the sidebar structure (`sidebar_edit.md`). + + +### Changed + +- **CSS Strategy:** Refined the "Two-Layer" styling approach (Standard + Custom) to be more robust in the documentation and implementation. + + +## \[2.4.1\] - 2025-11-21 + +UI/UX Improvements and Bug Fixes. + +### Added + +- **Metadata Injection:** Page Title, Author, and Modification Date are now injected directly into the HTML Body (top of the page) for better readability. + +- **Automatic Time-stamping:** Output folders are now automatically named with `YYYY-MM-DD HHMM [Title]` to support clean versioned backups. + +- **Persistent Sidebar:** The sidebar width is now remembered across page loads using `localStorage`. + +- **Absolute Links in Markdown:** The generated `sidebar.md` uses absolute file URIs to support opening links in external editors like Logseq or WebStorm directly. + + +### Fixed + +- **Empty Page Bug:** Fixed an issue where pages with empty bodies (folders) resulted in 0-byte HTML files. Now generates a proper HTML skeleton with title and sidebar. + +- **Markdown Patching:** Updated `patch_sidebar.py` to handle absolute file URIs correctly. + +- **UI Layout:** Optimized Sidebar/Content padding and Hamburger button alignment. + + +## \[2.4.0\] - 2025-11-21 + +Advanced Filtering and Tree Logic Update. + +### Added + +- **Label Forest Mode:** The `label` command now supports deep recursion ("Forest Export"). It finds all pages with the include-label and treats them as roots for full tree exports. + +- **Label Pruning:** Added `--exclude-label` to prune subtrees based on a specific label (e.g., 'archived') during recursion. + + +## \[2.3.0\] - 2025-11-21 + +Enterprise Performance & Usability Release. + +### Added + +- **Recursive Inventory:** Changed scanning logic to use `/child/page` API endpoints. This ensures the export respects the **manual sort order** of Confluence. + +- **Multithreading:** Added `-t/--threads` argument to parallelize page downloads (Phase 2), significantly improving performance on large spaces. 
+ +- **Tree Pruning (ID):** Added `--exclude-page-id` to skip specific branches during recursion. + +- **JS Resizer:** The sidebar now has a robust JavaScript-based drag-handle for resizing. + +- **UX Improvements:** + + - Fixed Hamburger position (top-left). + + - Added "Heartbeat" visualization during inventory scan. + + - Added VPN Reminder for Data Center profiles. + + +### Changed + +- **Architecture:** Split process into a strict "Inventory Phase" (Serial, Recursive for sorting) and "Download Phase" (Parallel). + + +## \[2.2.0\] - 2025-11-20 + +Introduction of Static Sidebar Injection. + +### Added + +- **Static Sidebar Injection:** Automatically generates a hierarchical navigation tree and injects it into every HTML page. + +- **Inventory Phase:** Scans all pages/metadata _before_ downloading content to allow for accurate progress bars (`tqdm`) and global tree generation. + +- **Smart Linking:** Improved detection of dead/external links vs. local links based on the inventory. + +- **CSS Auto-Discovery:** The script automatically detects and applies `site.css` from the local `styles/` folder. + +- **Multi-CSS Support:** Allows layering multiple CSS files (Standard + Custom). + +- **`sidebar.html` Export:** Saves the generated sidebar tree as a separate file. + + +### Changed + +- **HTML Layout:** Pages are now wrapped in a Flexbox layout container to support the sidebar. + +- **Logging:** Cleaned up library logging to support progress bars. + + +## \[2.1.0\] - 2025-11-19 + +Major functionality restore and improvement ("Visual Copy" release). + +### Added + +- **HTML Processing with BeautifulSoup:** Re-introduced intelligent HTML parsing. + + - **Image Downloading:** Automatically detects embedded images/emoticons, downloads them, and rewrites HTML links to local paths (`../attachments/`). + + - **Link Sanitizing:** Attempts to rewrite Confluence internal links to relative filenames. + + - **Metadata Injection (Head):** Injects Title, Page ID, and Labels into the HTML ``. + +- **Export View:** Switched API fetch from `storage` format to `export_view` (or `view`) to get rendered HTML (resolves macros like TOC). + +- **Attachment Downloading:** Downloads _all_ attachments of a page via API list, not just those embedded in the text. + + +### Changed + +- **HTML First:** The primary output format is now processed HTML (`export_view`). RST export is optional via `-R`. + +- **Dependencies:** Added `beautifulsoup4` to requirements. + +- **CSS handling:** Improved relative pathing for robust offline viewing. + + +## \[2.0.0\] - 2025-11-17 + +This version introduces a major architectural refactoring to support both Confluence Cloud and Data Center. + +### Added + +- **Confluence Data Center Support:** The script now supports both Confluence Cloud (`--profile cloud`) and Data Center (`--profile dc`). + +- **Configuration File (`confluence_products.ini`):** All platform-specific values (API URL templates, auth methods, base paths) are now defined in this external INI file. + +- **Data Center Authentication:** Added support for Bearer Token (Personal Access Token) authentication. + +- **New `label` Command:** Added support for dumping all pages with a specific label. + +- **Troubleshooting Hints:** Added specific error messages for Data Center users when authentication fails (Intranet/VPN warning). + +- **Documentation:** Added `CONTRIBUTING.md` and `CHANGELOG.md`. 
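To make the new authentication split concrete: Cloud uses Basic auth (account e-mail plus API token), while Data Center sends a Bearer header with a Personal Access Token. A minimal sketch, assuming the `CONFLUENCE_USER`/`CONFLUENCE_TOKEN` environment variables documented in the README (the `build_auth` helper name is illustrative, not the shipped API):

```python
import os
from requests.auth import HTTPBasicAuth

def build_auth(profile: str):
    """Return an (auth, headers) pair for requests, per platform profile."""
    token = os.environ["CONFLUENCE_TOKEN"]
    if profile == "cloud":
        # Cloud: Basic auth with the account e-mail and an API token
        return HTTPBasicAuth(os.environ["CONFLUENCE_USER"], token), {}
    # Data Center: Personal Access Token sent as a Bearer header
    return None, {"Authorization": f"Bearer {token}"}
```

The pair plugs straight into requests, e.g. `requests.get(url, auth=auth, headers=headers, timeout=30)`.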
+ + +### Changed + +- **\[BREAKING CHANGE\] CLI Architecture (Sub-Commands):** The script's interface has been completely modernized, replacing the `-m`/`--mode` flag with sub-commands (like `git`). + + - **REMOVED:** The `-m`/`--mode` flag. + + - **REMOVED:** The `-s`/`--site` argument. + + - **ADDED:** Sub-commands: `single`, `tree`, `space`, `all-spaces`, `label`. + + - **ADDED (Global):** `--base-url`, `--profile`, `--context-path`. + +- **Refactored `myModules.py`:** All API functions are now platform-agnostic. Hardcoded URLs removed. + +- **Internationalization:** All code comments translated to English. + + +_History below this line is from the original author (jgoldin-skillz)._ + +## \[1.0.2\] - 2022-03-03 + +- Bugfixes + + +## \[1.0.1\] - 2022-03-03 + +- Added `confluenceDumpWithPython.py` + + +## \[1.0.0\] - 2022-03-01 + +- Initial version \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c854a43 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,60 @@ +# Contributing Guide + +This document explains the internal architecture of this script, primarily for future contributors or the original author when reviewing pull requests. + +## Architectural Overview + +The codebase has been refactored from a simple linear scraper to a multi-phase, multithreaded export engine designed for enterprise stability. + +### Core Philosophy + +- **Inventory First:** We scan the entire structure _before_ downloading content. This allows for accurate progress bars (`tqdm`) and correct sorting. + +- **Static & Self-Contained:** The output HTML must work without a server, internet connection, or JavaScript dependencies (Zero-Dependency). + +- **Platform Agnostic:** `confluence_products.ini` abstracts the differences between Cloud and Data Center. + + +### The Export Pipeline (`confluenceDumpWithPython.py`) + +1. **Inventory Phase (Serial):** + + - Uses recursion (e.g., `recursive_scan`) to walk the Confluence tree using `child/page` endpoints. + + - This guarantees the sidebar matches the _manual sort order_ of Confluence (unlike CQL search). + + - Applies pruning (excludes) at this stage to save time. + + - Generates `sidebar.md` (Markdown representation) and `sidebar.html` (HTML Tree). + +2. **Download Phase (Parallel):** + + - Uses `ThreadPoolExecutor` to fetch `export_view` HTML and attachments in parallel. + + - Calls `myModules.process_page_content` to sanitize HTML (BeautifulSoup). + +3. **Injection Phase:** + + - Injects the pre-calculated Sidebar, Metadata, and CSS into every downloaded page. + + +### The Editor Workflow (`create_editor.py` & `patch_sidebar.py`) + +We treat the exported structure as mutable. + +- **`create_editor.py`**: Parses the `sidebar.md` and generates a standalone HTML Single-Page-Application using vanilla JavaScript. It allows Drag & Drop reordering. + +- **`patch_sidebar.py`**: Parses the modified Markdown and re-injects the new navigation tree into all existing HTML files. + + +## Key Files + +- **`confluenceDumpWithPython.py`**: Main entry point and orchestration. + +- **`myModules.py`**: API abstraction, BeautifulSoup logic, and HTML templating. + +- **`confluence_products.ini`**: URL templates for Cloud vs. DC. + +- **`create_editor.py`**: Tool to generate the visual sidebar editor. + +- **`patch_sidebar.py`**: Tool to apply structure changes. 
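To make the phase split concrete, here is a minimal sketch of the Inventory/Download pipeline described above, with illustrative function names (the real orchestration lives in `confluenceDumpWithPython.py`):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def export_tree(root_id, fetch_child_ids, download_page, threads=4):
    """Serial inventory first, then parallel downloads."""
    # Phase 1 (serial): recursive walk over the child/page endpoints,
    # preserving Confluence's manual sort order for the sidebar.
    ordered_ids = []
    def scan(page_id):
        ordered_ids.append(page_id)
        for child_id in fetch_child_ids(page_id):
            scan(child_id)
    scan(root_id)
    # Phase 2 (parallel): once the tree is known, fetch order no longer matters.
    with ThreadPoolExecutor(max_workers=threads) as pool:
        futures = [pool.submit(download_page, pid) for pid in ordered_ids]
        for future in as_completed(futures):
            future.result()  # surface download errors
    return ordered_ids
```

Only the content fetch is parallelized; keeping the inventory serial is what guarantees a correctly ordered sidebar.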
\ No newline at end of file diff --git a/README.md b/README.md index 207040a..369ad17 100644 --- a/README.md +++ b/README.md @@ -1,142 +1,181 @@ -# Confluence Dump With Python - -Dump Confluence pages using Python (requests) in HTML and RST format, including embedded pictures and attachments. -References to downloaded files will be updated to their local relative path. - -## Description - -Nonetheless, the refactoring will require only 2 files and accept command-line args: -* `myModules.py`: Contains all the required functions. -* `confluenceDumpWithPython.py`: Script to use with the following command line args: - * `-m, --mode`: The export mode, `single`, `space`, `bylabel`, `pageprops` (required). - * Note: Only `single`, `pageprops` and `space` have been implemented so far. - * `-S, --site`: The Atlassian Site (required). - * `-s, --space`: The Space Key (if needed). - * `-p, --page`: The Page ID (if needed). - * `-l, --label`: The Page label (if needed). - * `-x, --sphinx`: The `_images` and `_static` folders are placed at the root of the export folder, instead of together with the exported HTML files. - * `--notags`: Does not add the tags directives to the rst files (when the `sphinx-tags` addon is not used). -* `updatePageLinks.py`: Update online confluence links to the local files that have been downloaded so far. - * `--folder`: Folder containing the files to update. - * `--test`: Instead of overwriting the original .rst files, it will create updated ones with `zout_` as a prefix. -* `getPageEditorVersion.py`: Get the editor version from single pages or all pages in a space. - * `--site`: The Atlassian Site (required). - * `--page`: Page ID (either/or) - * `--space`: Space Key (either/or) - -For CSS Styling, it uses the `confluence.css` from Confluence that can be obtained by using the Workaround described in: https://jira.atlassian.com/browse/CONFSERVER-40907. -The `site.css` file included with Confluence UI HTML exports is not as complete as the one above. - -### Folder and file structure: - -* The default output folder is `output/` under the same path as the script. -* A folder with the Space name, Page Properties report page, single page name or Page Label name will be created under the output folder. -* By default, the `_images/` and `_static/` folders will be placed in the page|space|pageprops|label folder. - * The `--sphinx` command line option will put those folder directly under the output folder -* The file `styles/confluence.css` will be copied into the defined `_static/` - -## What it does - -* Leverages the Confluence Cloud API -* Puts Confluence meta data like Page ID and Page Labels, in the HTML headers and RST fields. -* beautifulsoup is used to parse HTML to get and update content, ie. change remote links to local links. -* Download for every page, all attachments, emoticons and embedded files. +# Confluence Dump with Python -## Requirements - -* declare system variables: - * `atlassianAPIToken` - * `atlassianUserEmail` - -### Dependencies - -* python3 - * requests - * beautifulsoup4 - * Pillow (handle images) - * pandoc & pypandoc (convert to RST) - * re - -### Installing - -* Clone repo. -* Install dependencies. -* Declare system variables for Atlassian API Token. +This script exports content from a Confluence instance (Cloud or Data Center) using various modes. 
-### Executing program +**Key Features:** +- **Visual Fidelity & Sidebar:** Creates a visually faithful copy of Confluence pages, including a **fully functional, static navigation sidebar** on the left—something even the standard Confluence export does not provide. + +- **Offline Browsing:** Localizes images and links, and downloads **all** attachments (PDFs, Office docs, etc.) for complete offline access. + +- **Recursive Inventory:** Scans the tree hierarchy to ensure the **correct sort order** (manual Confluence order) in the sidebar. + +- **Metadata Injection:** Automatically adds Page Title, Author, and Modification Date to the top of every page. + +- **Versioning:** Automatically creates timestamped output subfolders (e.g., `2025-11-21 1400 Space IT`) for clean history management. This allows you to run the script repeatedly (e.g., after changes in Confluence) and maintain a history of snapshots without overwriting previous exports. + +- **Performance:** Supports **Multithreaded** downloading (`--threads`) to speed up the export of large spaces. + +- **Tree Pruning:** Exclude specific branches with `--exclude-page-id` or `--exclude-label`. + +- **Index Sandbox:** Includes visual tools to manually restructure the navigation tree via Drag & Drop and apply it to the downloaded files without affecting Confluence. + -* How to download a single page based on its ID. +## Platform Support -``` -confluenceDumpWithPython.py -m single -S -p [] [--sphinx] -``` +This script supports both: -* How to download Page Properties and all the contained pages. +- **Confluence Cloud** + +- **Confluence Data Center** + -``` -confluenceDumpWithPython.py -m pageprops -S -p [] [--sphinx] -``` +The platform-specific API paths and authentication methods are defined in the `confluence_products.ini` file. -* How to download a whole Space. +> **⚠️ Note on Cloud Verification:** The support for **Confluence Cloud** has been carefully ported to the new modular architecture based on the original codebase. However, this refactoring was developed and tested against a **Confluence Data Center** environment. +> +> While the logic remains consistent with the previous version, the Cloud mode has **not yet been verified in a live environment** by the current maintainer due to lack of access. If you encounter issues with Cloud authentication or API paths, please open an issue or submit a Pull Request. -``` -confluenceDumpWithPython.py -m space -S -s [] -``` +## Missing Features / Ideas -## Help +- **Incremental Update:** Currently, the script always performs a full export. An update mode that only downloads changed pages would be a valuable addition. + -No special advice other than: -* make sure that your Atlassian API Token is valid. -* the username for the Cloud Atlassian API is the e-mail address. - -## Authors - -Contributors names and contact info - -@dernorberto - -## Improvements - -- [ ] Add export based on page label. -- [x] Add links to Downloads for the corresponding pages. -- [x] Update all links from downloaded pages to the local copies. -- [x] Add to headers the parent page and page labels. -- [ ] Create an index of the pages to use as a TOC. -- [ ] Create a page layout to display TOC + articles. -- [x] Copy `styles/site.css` into `output/styles/` if not present. -- [ ] Allow using with Confluence Server. 
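For reference, the versioned snapshot folders mentioned under Key Features follow a simple naming scheme; the sketch below mirrors the logic in `confluenceDumpWithPython.py` and is shown only to document the format:

```python
import os
import re
from datetime import datetime

def versioned_outdir(base_outdir: str, run_title: str) -> str:
    """Build the timestamped subfolder name, e.g. 'output/2025-11-21 1400 Space IT'."""
    stamp = datetime.now().strftime("%Y-%m-%d %H%M")
    safe = re.sub(r'[<>:"/\\|?*]', '_', run_title).strip().strip('.')
    return os.path.join(base_outdir, f"{stamp} {safe}")
```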
+## Requirements -## Issues +- Python 3.x + +- `requests`, `beautifulsoup4`, `tqdm` + +- `pypandoc` (optional, only needed for RST export) + -* It does not like very long attachment files, you'll need to rename them in Confluence before the dump. -* Pages previously migrated from Confluence Server might have issues with old emoticons. The best is to convert the pages to the New Editor, which will replace the missing emoticons. + pip install -r requirements.txt + + + +## Authentication + +Authentication is handled via environment variables, based on the profile you select. + +### For Confluence Cloud (`--profile cloud`) -## Version History -* 1.4 - * Refactoring into a more simple file setup (`confluenceDumpWithPython.py` & `myModules.py`) -* 1.3 - * Added Space export (flat folder structure) -* 1.2 - * Added better HTML header and footer. - * Added page labels to HTML headers. - * Improved output folder argument logic. -* 1.1 - * Added Papge Properties dump and other smaller things -* 1.0 - * Initial Release + export CONFLUENCE_USER="your-email@example.com" + export CONFLUENCE_TOKEN="YourApiTokenHere" + + -## legacy/ folder with previous version of scripts +### For Confluence Data Center (`--profile dc`) -Purpose of the files: -1. `confluenceExportHTMLrequestsByLabel.py`: download a set of pages based on one (or more) page Labels. -2. `confluenceExportHTMLrequestsSingle.py`: download a single page by supplying the page ID as an argument. -3. `confluenceExportHTMLrequestsPagePropertiesReport.py`: download page properties and all the pages in the report by supplying the page ID as an argument. -4. `confluenceExportHTMLrequestsPagesInSpace.py`: download all pages from a space. + export CONFLUENCE_TOKEN="YourPersonalAccessTokenHere" + + + +**⚠️ Troubleshooting Note for Data Center:** If authentication fails (Intranet/SSO blocks), ensure you are on VPN and PATs are enabled. + +## Exporting with CSS Styling + +The script uses a robust **Two-Layer Styling Strategy**. + +### Layer 1: Standard CSS (Default) + +The project folder contains a `styles/` directory. If a CSS file exists there (e.g., `styles/site.css`), it is **automatically applied** to every export. + +### Layer 2: Custom CSS (Optional) + +Use `--css-file "/path/to/my_custom.css"` to apply specific overrides. This file will be loaded **after** the standard CSS. + +## Usage + +### General Syntax + + python3 confluenceDumpWithPython.py [GLOBAL_OPTIONS] [COMMAND_OPTIONS] + + + +### Global Options + + -o OUTDIR, --outdir OUTDIR + The output directory (will be created) + --base-url BASE_URL Confluence Base URL (e.g., '[https://confluence.corp.com](https://confluence.corp.com)') + --profile PROFILE Platform profile ('cloud' or 'dc') + --context-path PATH (DC only) Context path (e.g., '/wiki') + --threads THREADS, -t THREADS + Number of download threads (Default: 1) + --exclude-page-id ID Exclude a page ID and its children (can be repeated) + --no-vpn-reminder Skip the VPN check confirmation (DC only) + --css-file CSS_FILE Path to custom CSS file + -R, --rst Export pages as RST (requires pypandoc) + + + +### Commands + +- **`space`**: Dumps an entire space. Starts at the Space Homepage and recurses down. + + - `-sp`, `--space-key`: The Key of the space. + +- **`tree`**: Dumps a specific page and all its descendants. + + - `-p`, `--pageid`: The Root Page ID. + +- **`single`**: Dumps a single page. + + - `-p`, `--pageid`: The Page ID. + +- **`label`**: Dumps pages by label ("Forest Mode"). 
Finds all pages with the label and treats them as roots for recursion. + + - `-l`, `--label`: The label to include. + + - `--exclude-label`: Exclude subtrees that have this specific label (e.g. 'archived'). + +- **`all-spaces`**: Dumps all visible spaces. + + +### Examples + +**1\. Data Center: Entire Space, 8 Threads, Exclude Archive** + + python3 confluenceDumpWithPython.py \ + --base-url "[https://confluence.corp.com](https://confluence.corp.com)" \ + --profile dc \ + --context-path "/wiki" \ + -o "./dump_it" \ + -t 8 \ + --exclude-page-id "999999" \ + space -sp "IT" + + + +**2\. Cloud: Single Page Tree** -## License + python3 confluenceDumpWithPython.py \ + --base-url "[https://myteam.atlassian.net](https://myteam.atlassian.net)" \ + --profile cloud \ + -o "./dump_tree" \ + tree -p "12345" + + + +## Index Restructuring Sandbox + +This additional toolset allows you to re-organize the pages and sub-pages structure (the index) of your export locally. This is useful for testing structural changes or cleaning up the navigation flow without touching Confluence or re-downloading pages. + +**The Workflow:** + +1. **Generate Editor:** Create a visual Drag & Drop editor for the index of all exported pages. + + python3 create_editor.py --site-dir "./output/2025-01-01 Space IT" + + + +2. **Edit:** Open `editor_sidebar.html` in your browser. Move pages, create folders, delete items. + +3. **Save:** Click "Copy Markdown" in the editor and paste the content into a new file `sidebar_edit.md` in the site directory. + +4. **Apply:** Patch the new index structure into all **downloaded** HTML files. + + python3 patch_sidebar.py --site-dir "./output/2025-01-01 Space IT" -This project is licensed under the MIT License - see the LICENSE.txt file for details -## Acknowledgments diff --git a/confluenceDumpWithPython.py b/confluenceDumpWithPython.py index 5e68b33..dd54e34 100644 --- a/confluenceDumpWithPython.py +++ b/confluenceDumpWithPython.py @@ -1,265 +1,527 @@ -import os.path +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +This script dumps content from a Confluence instance (Cloud or Data Center). 
+Features: +- Recursive Inventory Scan (Correct Sort Order) +- Multithreaded Downloading +- HTML Processing with BeautifulSoup (Images, Links, Sidebar, Resizer) +- Static Sidebar Injection +- CSS Auto-Discovery +- Label-based Tree Pruning +- Automatic Timestamped Subdirectories +""" + import argparse -import confluence_dump.myModules as myModules +import os +import sys +import json +import shutil +import glob +import time +import re +from datetime import datetime +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor, as_completed +from confluence_dump import myModules -"""Dump Confluence content using Python +# --- External Libraries --- +try: + import pypandoc +except ImportError: + pypandoc = None -Args: - mode: Download mode - site: Site to export from - space: Space to export from - page: Page to export - outdir: Folder to export to (optional) - sphinx: Sphinx compatible folder structure (optional) - notags: Do not add tags to rst files (optional) +try: + from tqdm import tqdm +except ImportError: + def tqdm(iterable, **kwargs): + return iterable +# --- Global Config & State --- +platform_config = {} +auth_info = {} +all_pages_metadata = [] +seen_metadata_ids = set() +global_sidebar_html = "" -Returns: - HTML and RST files inside the default or custom output folder -""" +# --- Helper Functions --- + +def sanitize_filename(filename): + """ Sanitizes a string to be safe for directory names. """ + s = re.sub(r'[<>:"/\\|?*]', '_', filename) + # Strip whitespace and dots from ends + return s.strip().strip('.') + + +def get_run_title(args, base_url, platform_config, auth_info): + """ Determines the semantic title for the output folder based on the command. """ + if args.command == 'all-spaces': + return "all spaces" + elif args.command == 'space': + return f"Space {args.space_key}" + elif args.command == 'label': + return f"Export {args.label}" + elif args.command in ('single', 'tree'): + try: + context_path = args.context_path + page_data = myModules.get_page_basic(args.pageid, base_url, platform_config, auth_info, context_path) + if page_data and 'title' in page_data: + return page_data['title'] + else: + return f"Page {args.pageid}" + except Exception as e: + print(f"Warning: Could not fetch page title: {e}", file=sys.stderr) + return f"Page {args.pageid}" + return "Export" + + +# --- Processing Helpers --- + +def collect_page_metadata(page_full): + try: + page_id = page_full.get('id') + if not page_id or page_id in seen_metadata_ids: + return + title = page_full.get('title') + ancestors = page_full.get('ancestors', []) + parent_id = ancestors[-1]['id'] if ancestors else None + all_pages_metadata.append({'id': page_id, 'title': title, 'parent_id': parent_id}) + seen_metadata_ids.add(page_id) + except Exception as e: + print(f"Warning: Could not collect metadata for index: {e}", file=sys.stderr) + + +def save_page_attachments(page_id, attachments, base_url, auth_info): + if not attachments or 'results' not in attachments: return + for att in attachments['results']: + download_path = att.get('_links', {}).get('download') + filename = att.get('title') + if download_path and filename: + if download_path.startswith('/'): + full_url = base_url.rstrip('/') + download_path + else: + full_url = base_url.rstrip('/') + '/' + download_path + local_path = os.path.join(myModules.outdir_attachments, filename) + myModules.download_file(full_url, local_path, auth_info) + + +def convert_rst(page_id, page_body, outdir_pages): + if pypandoc is None: return + page_filename_rst = 
f"{outdir_pages}{page_id}.rst" + try: + pypandoc.convert_text(page_body, 'rst', format='html', outputfile=page_filename_rst) + except Exception as e: + print(f" Error converting RST for {page_id}: {e}", file=sys.stderr) + + +# --- Tree Generation --- + +def build_tree_structure(target_ids): + tree_map = {} + pages_map = {} + relevant_pages = [p for p in all_pages_metadata if p['id'] in target_ids] + for page in relevant_pages: + pid = page['id'] + parent = page['parent_id'] + pages_map[pid] = page + if parent not in tree_map: tree_map[parent] = [] + tree_map[parent].append(pid) + downloaded_ids = set(pages_map.keys()) + root_ids = [] + for page in relevant_pages: + parent = page['parent_id'] + if parent is None or parent not in downloaded_ids: + root_ids.append(page['id']) + return tree_map, pages_map, root_ids + + +def generate_tree_html(target_ids): + tree_map, pages_map, root_ids = build_tree_structure(target_ids) + + # Added \n for readability + def build_branch(parent_id): + if parent_id not in tree_map: return "" + html = "
    \n" + for child_id in tree_map[parent_id]: + if child_id not in pages_map: continue + child = pages_map[child_id] + title = child['title'] + link = f'{title}' + + if child_id in tree_map: + sub_tree = build_branch(child_id) + html += f'
  • {link}{sub_tree}
  • \n' + else: + html += f'
  • {link}
  • \n' + html += "
\n" + return html + + sidebar = '\n' + return sidebar + + +def generate_tree_markdown(target_ids): + tree_map, pages_map, root_ids = build_tree_structure(target_ids) + md_lines = [] + pages_dir_abs = os.path.abspath(myModules.outdir_pages) + pages_uri = Path(pages_dir_abs).as_uri() + + def build_branch_md(parent_id, level): + if parent_id not in tree_map: return + indent = " " * level + for child_id in tree_map[parent_id]: + if child_id not in pages_map: continue + child = pages_map[child_id] + md_lines.append(f"{indent}- [{child['title']}]({pages_uri}/{child_id}.html)") + if child_id in tree_map: + build_branch_md(child_id, level + 1) + + for rid in root_ids: + if rid not in pages_map: continue + page = pages_map[rid] + md_lines.append(f"- [{page['title']}]({pages_uri}/{rid}.html)") + if rid in tree_map: + build_branch_md(rid, 1) + + return "\n".join(md_lines) + + +def save_sidebars(outdir, target_ids): + global global_sidebar_html + global_sidebar_html = generate_tree_html(target_ids) + with open(os.path.join(outdir, 'sidebar.html'), 'w', encoding='utf-8') as f: + f.write(global_sidebar_html) + + sidebar_md = generate_tree_markdown(target_ids) + with open(os.path.join(outdir, 'sidebar.md'), 'w', encoding='utf-8') as f: + f.write(sidebar_md) + + # Create sidebar_orig.md as backup (Golden Master) + with open(os.path.join(outdir, 'sidebar_orig.md'), 'w', encoding='utf-8') as f: + f.write(sidebar_md) + + +# --- Core Logic --- + +def process_page(page_id, global_args, active_css_files=None, exported_page_ids=None, verbose=True): + if verbose: print(f"\nProcessing page ID: {page_id}") + page_full = myModules.get_page_full(page_id, global_args.base_url, platform_config, auth_info, + global_args.context_path) + if not page_full: + print(f" Warning: Could not fetch page {page_id}. Skipping.", file=sys.stderr) + return + if verbose: collect_page_metadata(page_full) + + raw_html = page_full.get('body', {}).get('export_view', {}).get('value') + if not raw_html: + raw_html = page_full.get('body', {}).get('view', {}).get('value', '') + + processed_html = myModules.process_page_content( + raw_html, page_full, global_args.base_url, auth_info, active_css_files, exported_page_ids, global_sidebar_html + ) + + html_filename = os.path.join(myModules.outdir_pages, f"{page_id}.html") + with open(html_filename, 'w', encoding='utf-8') as f: + f.write(processed_html) + + page_attachments = myModules.get_page_attachments(page_id, global_args.base_url, platform_config, auth_info, + global_args.context_path) + save_page_attachments(page_id, page_attachments, global_args.base_url, auth_info) + + json_filename = os.path.join(myModules.outdir_pages, f"{page_id}.json") + page_full['body_processed'] = processed_html + with open(json_filename, 'w', encoding='utf-8') as f: + json.dump(page_full, f, indent=4, ensure_ascii=False) + + if global_args.rst: + convert_rst(page_id, processed_html, myModules.outdir_pages) + + +# --- Index Generation (Restored) --- + +def build_index_html(output_dir, css_files=None): + """ Generates an index.html file listing all downloaded pages hierarchically. """ + print("\nGenerating global index.html...") + tree_map, pages_map, root_ids = build_tree_structure(set(p['id'] for p in all_pages_metadata)) + + # Added \n for readability + def build_list_html(parent_id): + if parent_id not in tree_map: return "" + html = "
    \n" + for child_id in tree_map[parent_id]: + if child_id in pages_map: + child = pages_map[child_id] + html += f'
  • {child["title"]}' + html += build_list_html(child_id) + html += '
  • \n' + html += "
\n" + return html + + body_html = "

Confluence Export Index

    \n" + for rid in root_ids: + page = pages_map[rid] + body_html += f'
  • {page["title"]}' + body_html += build_list_html(rid) + body_html += '
  • \n' + body_html += "
\n" + + css_links = "" + if css_files: + for css in css_files: + clean_css = css.replace('../', '') + css_links += f'\n' + + full_html = f"""Index{css_links}{body_html}""" + + with open(os.path.join(output_dir, "index.html"), 'w', encoding='utf-8') as f: + f.write(full_html) + + +# --- Recursive Inventory & Scanning --- + +def recursive_scan(page_id, args, exclude_ids, scanned_count, exclude_label=None): + if page_id in exclude_ids: + print(f" [Excluded by ID] Pruning tree at page {page_id}", file=sys.stderr) + return [] + + tree_ids = [page_id] + scanned_count[0] += 1 + if scanned_count[0] % 10 == 0: + sys.stderr.write(f"\rScanned {scanned_count[0]} pages...") + sys.stderr.flush() + + while True: + children_data = myModules.get_child_pages(page_id, args.base_url, platform_config, auth_info, args.context_path) + if not children_data or 'results' not in children_data: break + children = children_data['results'] + if not children: break + + for child in children: + child_id = child['id'] + if exclude_label: + labels = [l['name'] for l in child.get('metadata', {}).get('labels', {}).get('results', [])] + if exclude_label in labels: + print(f" [Excluded by Label '{exclude_label}'] Pruning tree at page {child_id}", file=sys.stderr) + continue + collect_page_metadata(child) + tree_ids.extend(recursive_scan(child_id, args, exclude_ids, scanned_count, exclude_label)) + break + return tree_ids + + +def scan_space_inventory(args, exclude_ids): + print("Phase 1: Recursive Inventory Scan...") + scanned_count = [0] + homepage = myModules.get_space_homepage(args.space_key, args.base_url, platform_config, auth_info, + args.context_path) + if not homepage: + print("Error: Could not find Space Homepage.", file=sys.stderr) + return [], [] + root_id = homepage['id'] + collect_page_metadata(homepage) + all_ids_ordered = recursive_scan(root_id, args, exclude_ids, scanned_count) + print(f"\nInventory complete. Found {len(all_ids_ordered)} pages.") + return set(all_ids_ordered), all_ids_ordered + + +def scan_tree_inventory(root_id, args, exclude_ids): + print("Phase 1: Recursive Tree Scan...") + scanned_count = [0] + root_page = myModules.get_page_full(root_id, args.base_url, platform_config, auth_info, args.context_path) + if root_page: collect_page_metadata(root_page) + all_ids_ordered = recursive_scan(root_id, args, exclude_ids, scanned_count) + print(f"\nInventory complete. Found {len(all_ids_ordered)} pages.") + return set(all_ids_ordered), all_ids_ordered + + +def scan_label_forest_inventory(args, exclude_ids): + print(f"Phase 1: Label Forest Scan (Roots: '{args.label}')...") + scanned_count = [0] + root_pages = [] + start = 0 + while True: + res = myModules.get_pages_by_label(args.label, start, 200, args.base_url, platform_config, auth_info, + args.context_path) + if not res or not res.get('results'): break + for p in res['results']: + if p['id'] in exclude_ids: continue + root_pages.append(p) + start += 200 + + full_forest_ids = [] + exclude_label = getattr(args, 'exclude_label', None) + for root in root_pages: + collect_page_metadata(root) + branch_ids = recursive_scan(root['id'], args, exclude_ids, scanned_count, exclude_label) + full_forest_ids.extend(branch_ids) + unique_ordered = list(dict.fromkeys(full_forest_ids)) + print(f"\nInventory complete. 
Found {len(unique_ordered)} unique pages.") + return set(unique_ordered), unique_ordered + + +# --- Mode Handlers --- + +def run_download_phase(args, all_pages_list, target_ids, active_css_files): + print(f"Phase 2: Downloading & Processing {len(all_pages_list)} pages with {args.threads} threads...") + with ThreadPoolExecutor(max_workers=args.threads) as executor: + futures = [] + for pid in all_pages_list: + futures.append(executor.submit(process_page, pid, args, active_css_files, target_ids, verbose=False)) + for _ in tqdm(as_completed(futures), total=len(futures), desc="Downloading", unit="page"): + pass + + +def handle_space(args, active_css_files, exclude_ids): + print(f"Starting 'space' dump for {args.space_key}") + target_ids, all_pages_list = scan_space_inventory(args, exclude_ids) + save_sidebars(args.outdir, target_ids) + run_download_phase(args, all_pages_list, target_ids, active_css_files) + + +def handle_tree(args, active_css_files, exclude_ids): + print(f"Starting 'tree' dump for {args.pageid}") + target_ids, all_pages_list = scan_tree_inventory(args.pageid, args, exclude_ids) + save_sidebars(args.outdir, target_ids) + run_download_phase(args, all_pages_list, target_ids, active_css_files) + + +def handle_label(args, active_css_files, exclude_ids): + print(f"Starting 'label' dump for {args.label}") + target_ids, all_pages_list = scan_label_forest_inventory(args, exclude_ids) + save_sidebars(args.outdir, target_ids) + run_download_phase(args, all_pages_list, target_ids, active_css_files) + + +def handle_single(args, active_css_files, exclude_ids): + print(f"Starting 'single' dump for {args.pageid}") + root = myModules.get_page_full(args.pageid, args.base_url, platform_config, auth_info, args.context_path) + if root: collect_page_metadata(root) + save_sidebars(args.outdir, {args.pageid}) + process_page(args.pageid, args, active_css_files, {args.pageid}, verbose=True) + + +def handle_all_spaces(args, active_css_files, exclude_ids): + print("Starting 'all-spaces' dump...") + spaces = myModules.get_all_spaces(args.base_url, platform_config, auth_info, args.context_path) + if spaces and 'results' in spaces: + for s in spaces['results']: + print(f"\n--- Processing Space: {s['key']} ---") + global all_pages_metadata, global_sidebar_html, seen_metadata_ids + all_pages_metadata = [] + seen_metadata_ids = set() + s_args = argparse.Namespace(**vars(args)) + s_args.space_key = s['key'] + handle_space(s_args, active_css_files, exclude_ids) + + +# --- Main --- + +def main(): + parser = argparse.ArgumentParser( + description="Confluence Dump (Cloud/DC) with HTML Processing", + formatter_class=argparse.RawTextHelpFormatter + ) + + g = parser.add_argument_group('Global Options') + g.add_argument('-o', '--outdir', required=True, help="Output directory") + g.add_argument('--base-url', required=True, help="Confluence Base URL") + g.add_argument('--profile', required=True, help="cloud or dc") + g.add_argument('--context-path', default=None, help="Context path (DC only)") + g.add_argument('--css-file', default=None, help="Path to custom CSS file") + g.add_argument('-R', '--rst', action='store_true', help="Also export RST") + g.add_argument('-t', '--threads', type=int, default=1, help="Number of threads for download (Default: 1)") + g.add_argument('--exclude-page-id', action='append', help="Exclude a page ID and its children") + g.add_argument('--no-vpn-reminder', action='store_true', help="Skip the VPN check confirmation for Data Center") + + subs = parser.add_subparsers(dest='command', 
required=True, title="Commands") + + p_single = subs.add_parser('single', help="Dump a single page") + p_single.add_argument('-p', '--pageid', required=True, help="Page ID") + p_single.set_defaults(func=handle_single) + + p_tree = subs.add_parser('tree', help="Dump a page tree (Recursive)") + p_tree.add_argument('-p', '--pageid', required=True, help="Root Page ID") + p_tree.set_defaults(func=handle_tree) + + p_space = subs.add_parser('space', help="Dump an entire space (Recursive from Homepage)") + p_space.add_argument('-sp', '--space-key', required=True, help="Space Key") + p_space.set_defaults(func=handle_space) + + p_label = subs.add_parser('label', help="Dump pages by label") + p_label.add_argument('-l', '--label', required=True, help="Label Name") + p_label.set_defaults(func=handle_label) + + p_all = subs.add_parser('all-spaces', help="Dump all visible spaces") + p_all.set_defaults(func=handle_all_spaces) + + args = parser.parse_args() + + global platform_config, auth_info + active_css_files = [] + exclude_ids = set(args.exclude_page_id) if args.exclude_page_id else set() + + try: + platform_config = myModules.load_platform_config(args.profile) + auth_info = myModules.get_auth_config(platform_config) + + if args.profile == 'dc' and not args.no_vpn_reminder: + print("\n[!] DATA CENTER CHECK: Are you connected to the VPN/Intranet?") + input(" Press Enter to confirm connection (or Ctrl+C to cancel)...") + + # --- Auto-Subfolder Generation --- + timestamp = datetime.now().strftime("%Y-%m-%d %H%M") + run_title = get_run_title(args, args.base_url, platform_config, auth_info) + safe_title = sanitize_filename(run_title) + + new_outdir = os.path.join(args.outdir, f"{timestamp} {safe_title}") + print(f"Creating new output directory: {new_outdir}") + args.outdir = new_outdir + + myModules.setup_output_directories(args.outdir) + myModules.set_variables() + + local_styles_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'styles') + if os.path.exists(local_styles_dir): + for f in glob.glob(os.path.join(local_styles_dir, "*.css")): + if "site.css" in f: + target = os.path.join(myModules.outdir_styles, os.path.basename(f)) + shutil.copy(f, target) + active_css_files.append(f"../styles/{os.path.basename(f)}") + if args.css_file and os.path.exists(args.css_file): + target = os.path.join(myModules.outdir_styles, os.path.basename(args.css_file)) + shutil.copy(args.css_file, target) + active_css_files.append(f"../styles/{os.path.basename(args.css_file)}") + + except KeyboardInterrupt: + print("\nAborted by user.") + sys.exit(0) + except Exception as e: + print(f"Init Error: {e}", file=sys.stderr) + sys.exit(1) + + try: + args.func(args, active_css_files, exclude_ids) + # Build global index (optional, but good fallback) + build_index_html(args.outdir, active_css_files) + print(f"\nDump Complete. 
Output in {args.outdir}") + except Exception as e: + print(f"Execution Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() -parser = argparse.ArgumentParser() -parser.add_argument('--mode', '-m', dest='mode', - choices=['single', 'space', 'bylabel', 'pageprops'], - help='Chose a download mode', required=True) -parser.add_argument('--site', '-S', type=str, - help='Atlassian Site', required=True) -parser.add_argument('--space', '-s', type=str, - help='Space Key') -parser.add_argument('--page', '-p', type=int, - help='Page ID') -parser.add_argument('--label', '-l', type=str, - help='Page label') -parser.add_argument('--outdir', '-o', type=str, default='output', - help='Folder for export', required=False) -parser.add_argument('--sphinx', '-x', action='store_true', default=False, - help='Sphinx compatible folder structure', required=False) -parser.add_argument('--tags', action='store_true', default=False, - help='Add labels as .. tags::', required=False) -parser.add_argument('--html', action='store_true', default=False, - help='Include .html file in export (default is only .rst)', required=False) -parser.add_argument('--no-rst', action='store_false', dest="rst", default=True, - help='Disable .rst file in export', required=False) -parser.add_argument('--showlabels', action='store_true', default=False, - help='Export .rst files with the page labels at the bottom', required=False) - -args = parser.parse_args() -atlassian_site = args.site -if args.mode == 'single': - print(f"Exporting a single page (Sphinx set to {args.sphinx})") - page_id = args.page -elif args.mode == 'space': - print(f"Exporting a whole space (Sphinx set to {args.sphinx})") - space_key = args.space -elif args.mode == 'bylabel': - print(f"Exporting all pages with a common label (Sphinx set to {args.sphinx})") -elif args.mode == 'pageprops': - print(f"Exporting a Page Properties page with all its children (Sphinx set to {args.sphinx})") - -my_attachments = [] -my_embeds = [] -my_embeds_externals = [] -my_emoticons = [] -my_emoticons_list = [] - -user_name = os.environ["atlassianUserEmail"] -api_token = os.environ["atlassianAPIToken"] - -sphinx_compatible = args.sphinx -sphinx_tags = args.tags -print("Sphinx set to " + str(sphinx_compatible)) -atlassian_site = args.site -my_outdir_base = args.outdir -if args.mode == 'single': - ############ - ## SINGLE ## - ############ - page_id = args.page - page_name = myModules.get_page_name(atlassian_site,page_id,user_name,api_token) - - my_body_export_view = myModules.get_body_export_view(atlassian_site,page_id, - user_name,api_token).json() - my_body_export_view_html = my_body_export_view['body']['export_view']['value'] - my_body_export_view_title = my_body_export_view['title'].replace("/","-")\ - .replace(",","").replace("&","And").replace(":","-") - - server_url = f"https://{atlassian_site}.atlassian.net/wiki/api/v2/spaces/?limit=250" - - page_url = f"{my_body_export_view['_links']['base']}{my_body_export_view['_links']['webui']}" - page_parent = myModules.get_page_parent(atlassian_site,page_id,user_name,api_token) - - my_outdir_base = os.path.join(my_outdir_base,f"{page_id}-{my_body_export_view_title}") # sets outdir to path under page_name - my_outdir_content = my_outdir_base - -# if args.sphinx is False: -# my_outdir_base = os.path.join(my_outdir_base,f"{page_id}-{my_body_export_view_title}") # sets outdir to path under page_name -# my_outdir_content = my_outdir_base -# else: -# my_outdir_content = my_outdir_base - my_outdirs = [] - my_outdirs = 
myModules.mk_outdirs(my_outdir_base) # attachments, embeds, scripts - my_page_labels = myModules.get_page_labels(atlassian_site,page_id,user_name,api_token) - print(f"Base export folder is \"{my_outdir_base}\" and the Content goes to \"{my_outdir_content}\"") - myModules.dump_html(atlassian_site,my_body_export_view_html,my_body_export_view_title,page_id,my_outdir_base, my_outdir_content,my_page_labels,page_parent,user_name,api_token,sphinx_compatible,sphinx_tags,arg_html_output=args.html,arg_rst_output=args.rst) - print("Done!") -elif args.mode == 'space': - ########### - ## SPACE ## - ########### - all_spaces_full = myModules.get_spaces_all(atlassian_site,user_name,api_token) # get a dump of all spaces - all_spaces_short = [] # initialize list for less detailed list of spaces - i = 0 - for n in all_spaces_full: - i = i +1 - all_spaces_short.append({ # append the list of spaces - 'space_key' : n['key'], - 'space_id' : n['id'], - 'space_name' : n['name'], - 'homepage_id' : n['homepageId'], - 'spaceDescription' : n['description'], - }) - if (n['key'] == space_key) or n['key'] == str.upper(space_key) or n['key'] == str.lower(space_key): - print("Found space: " + n['key']) - space_id = n['id'] - space_name = n['name'] - current_parent = n['homepageId'] - my_outdir_content = os.path.join(my_outdir_base,f"{space_id}-{space_name}") - os.makedirs(my_outdir_content, exist_ok=True) - if args.sphinx is False: - my_outdir_base = my_outdir_content - - #print("my_outdir_base: " + my_outdir_base) - #print("my_outdir_content: " + my_outdir_content) - - if space_key == "" or space_key is None: # if the supplied space key can't be found - print("Could not find Space Key in this site") - else: - space_title = myModules.get_space_title(atlassian_site,space_id,user_name,api_token) - # - # get list of pages from space - # - all_pages_full = myModules.get_pages_from_space(atlassian_site,space_id,user_name,api_token) - all_pages_short = [] - i = 0 - for n in all_pages_full: - i = i + 1 - all_pages_short.append({ - 'page_id' : n['id'], - 'pageTitle' : n['title'], - 'parentId' : n['parentId'], - 'space_id' : n['spaceId'], - } - ) - # put it all together - print(f"{len(all_pages_short)} pages to export") - page_counter = 0 - for p in all_pages_short: - page_counter = page_counter + 1 - my_body_export_view = myModules.get_body_export_view(atlassian_site,p['page_id'],user_name,api_token).json() - my_body_export_view_html = my_body_export_view['body']['export_view']['value'] - my_body_export_view_name = p['pageTitle'] - my_body_export_view_title = p['pageTitle'].replace("/","-").replace(",","").replace("&","And").replace(" ","_") # added .replace(" ","_") so that filenames have _ as a separator - print() - print(f"Getting page #{page_counter}/{len(all_pages_short)}, {my_body_export_view_title}, {p['page_id']}") - my_body_export_view_labels = myModules.get_page_labels(atlassian_site,p['page_id'],user_name,api_token) - #my_body_export_view_labels = ",".join(myModules.get_page_labels(atlassian_site,p['page_id'],user_name,api_token)) - mypage_url = f"{my_body_export_view['_links']['base']}{my_body_export_view['_links']['webui']}" - print(f"dump_html arg sphinx_compatible = {sphinx_compatible}") - myModules.dump_html(atlassian_site,my_body_export_view_html,my_body_export_view_title,p['page_id'],my_outdir_base,my_outdir_content,my_body_export_view_labels,p['parentId'],user_name,api_token,sphinx_compatible,sphinx_tags,arg_html_output=args.html,arg_rst_output=args.rst) - print("Done!") -elif args.mode == 'pageprops': - 
############### - ## PAGEPROPS ## - ############### - my_page_properties_children = [] - my_page_properties_children_dict = {} - - page_id = args.page - # - # Get Page Properties REPORT - # - print("Getting Page Properties Report Details") - my_report_export_view = myModules.get_body_export_view(atlassian_site,page_id,user_name,api_token).json() - my_report_export_view_title = my_report_export_view['title'].replace("/","-").replace(",","").replace("&","And").replace(":","-") - my_report_export_view_html = my_report_export_view['body']['export_view']['value'] - my_report_export_viewName = myModules.get_page_name(atlassian_site,page_id,user_name,api_token) - my_report_export_view_labels = myModules.get_page_labels(atlassian_site,page_id,user_name,api_token) - my_report_export_page_url = f"{my_report_export_view['_links']['base']}{my_report_export_view['_links']['webui']}" - my_report_export_page_parent = myModules.get_page_parent(atlassian_site,page_id,user_name,api_token) - my_report_export_html_filename = f"{my_report_export_view_title}.html" - # str(my_report_export_view_title) + '.html' - # my outdirs - my_outdir_content = os.path.join(my_outdir_base,str(page_id) + "-" + str(my_report_export_view_title)) - #print("my_outdir_base: " + my_outdir_base) - #print("my_outdir_content: " + my_outdir_content) - if args.sphinx is False: - my_outdir_base = my_outdir_content - - my_outdirs = [] - my_outdirs = myModules.mk_outdirs(my_outdir_base) # attachments, embeds, scripts - # get info abbout children - #my_page_properties_children = myModules.get_page_properties_children(atlassian_site,my_report_export_view_html,my_outdir_content,user_name,api_token)[0] # list - #my_page_properties_children_dict = myModules.get_page_properties_children(atlassian_site,my_report_export_view_html,my_outdir_content,user_name,api_token)[1] # dict - (my_page_properties_children,my_page_properties_children_dict) = myModules.get_page_properties_children(atlassian_site,my_report_export_view_html,my_outdir_content,user_name,api_token) - # - # Get Page Properties CHILDREN - # - page_counter = 0 - for p in my_page_properties_children: - page_counter = page_counter + 1 - #print("Handling child: " + p) - my_child_export_view = myModules.get_body_export_view(atlassian_site,p,user_name,api_token).json() - my_child_export_view_html = my_child_export_view['body']['export_view']['value'] - my_child_export_view_name = my_page_properties_children_dict[p]['Name'] - my_child_export_view_labels = myModules.get_page_labels(atlassian_site,p,user_name,api_token) - my_child_export_view_title = my_child_export_view['title'] ##.replace("/","-").replace(":","-").replace(" ","_") - print(f"Getting Child page #{page_counter}/{len(my_page_properties_children)}, {my_child_export_view_title}, {my_page_properties_children_dict[str(p)]['ID']}") - #print("Getting Child page #" + str(page_counter) + '/' + str(len(my_page_properties_children)) + ', ' + my_child_export_view_title + ', ' + my_page_properties_children_dict[str(p)]['ID']) - my_child_export_page_url = f"{my_child_export_view['_links']['base']}{my_child_export_view['_links']['webui']}" - #my_child_export_page_url = str(my_child_export_view['_links']['base']) + str(my_child_export_view['_links']['webui']) - my_child_export_page_parent = myModules.get_page_parent(atlassian_site,p,user_name,api_token) - html_file_name = (f"{my_page_properties_children_dict[p]['Name']}.html").replace(":","-").replace(" ","_") - #html_file_name = 
my_page_properties_children_dict[p]['Name'].replace(":","-").replace(" ","_") + '.html' - my_page_properties_children_dict[str(p)].update({"Filename": html_file_name}) - - myModules.dump_html( - arg_site=atlassian_site, - arg_html=my_child_export_view_html, - arg_title=my_child_export_view_title, - arg_page_id=p, - arg_outdir_base=my_outdir_base, - arg_outdir_content=my_outdir_content, - arg_page_labels=my_child_export_view_labels, - arg_page_parent=my_child_export_page_parent, - arg_username=user_name, - arg_api_token=api_token, - arg_sphinx_compatible=sphinx_compatible, - arg_sphinx_tags=sphinx_tags, - arg_type="reportchild", - arg_html_output=args.html, - arg_rst_output=args.rst, - arg_show_labels=args.showlabels - ) # creates html files for every child - myModules.dump_html( - arg_site=atlassian_site, - arg_html=my_report_export_view_html, - arg_title=my_report_export_view_title, - arg_page_id=page_id, - arg_outdir_base=my_outdir_base, - arg_outdir_content=my_outdir_content, - arg_page_labels=my_report_export_view_labels, - arg_page_parent=my_report_export_page_parent, - arg_username=user_name, - arg_api_token=api_token, - arg_sphinx_compatible=sphinx_compatible, - arg_sphinx_tags=sphinx_tags, - arg_type="report", - arg_html_output=args.html, - arg_rst_output=args.rst, - arg_show_labels=args.showlabels - ) # finally creating the HTML for the report page - print("Done!") -else: - print("No script mode defined in the command line") +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/confluence_dump/myModules.py b/confluence_dump/myModules.py index 056bfe0..e1f1e74 100644 --- a/confluence_dump/myModules.py +++ b/confluence_dump/myModules.py @@ -1,459 +1,558 @@ -import shutil -import requests -import os.path -import json -from requests.auth import HTTPBasicAuth -from bs4 import BeautifulSoup as bs +# -*- coding: utf-8 -*- +""" +Module to abstract Confluence API calls and provide local file/directory utilities. +Supports both Confluence Cloud and Data Center platforms. +Includes BeautifulSoup logic for HTML processing (downloading assets, fixing links, injecting sidebar). 
+""" + +import os import sys -import pypandoc -from PIL import Image +import requests +import configparser import re +from requests.auth import HTTPBasicAuth +from urllib.parse import unquote, urlparse +from bs4 import BeautifulSoup, Comment +from datetime import datetime -""" -Arguments needed to run these functions centrally: -* outdirs: outdir, attach_dir, emoticonDir, styles_dir -* page details: Title, ID, Parent, orig URL, Space -* space details: Title, ID, site -* Confluence API: Username, Password +# --- Globals for output directories --- +outdir_base = "" +outdir_pages = "" +outdir_attachments = "" +outdir_styles = "" +outdir_logs = "" -CURRENT STATE -* fixed getting output folders -* next up: getAttachments -""" -# -# Set path for where script is -# -script_dir = os.path.dirname(os.path.abspath(__file__)) -attach_dir = "_images/" -emoticons_dir = "_images/" -styles_dir = "_static/" +# --- Setup Functions --- def set_variables(): - """Set variables for export folders""" - dict_vars = {} - dict_vars['attach_dir'] = "_images/" - dict_vars['emoticons_dir'] = "_images/" - dict_vars['styles_dir'] = "_static/" - attach_dir = "_images/" - emoticons_dir = "_images/" - styles_dir = "_static/" - return(dict_vars) -# -# Create the output folders, set to match Sphynx structure -# -def set_dirs(arg_outdir="output"): # setting default to output - """Set output folders paths for attachments, emoticons and styles""" - my_vars = set_variables() - outdir_attach = os.path.join(arg_outdir,my_vars['attach_dir']) - outdir_emoticons = os.path.join(arg_outdir,my_vars['emoticons_dir']) - outdir_styles = os.path.join(arg_outdir,my_vars['styles_dir']) - return[outdir_attach, outdir_emoticons, outdir_styles] # returns a list - -def mk_outdirs(arg_outdir="output"): # setting default to output - """Create the output folders""" - my_vars = set_variables() - outdir_list = set_dirs(arg_outdir) - outdir_attach = outdir_list[0] - outdir_emoticons = outdir_list[1] - outdir_styles = outdir_list[2] - os.makedirs(arg_outdir, exist_ok=True) - os.makedirs(outdir_attach, exist_ok=True) - os.makedirs(outdir_emoticons, exist_ok=True) - os.makedirs(outdir_styles, exist_ok=True) - if not os.path.exists(os.path.join(outdir_styles, 'confluence.css')): - shutil.copy(os.path.join(script_dir, "styles", "confluence.css"), os.path.join(outdir_styles, "confluence.css")) - return(outdir_list) - -def get_space_title(arg_site,arg_space_id,arg_username,arg_api_token): - """Get Title of a space - - Args: - arg_site: The site name - arg_space_id: ID of the space - arg_username: Username for auth - arg_api_token: API token for auth - - Returns: - response (string): The title of the space - """ - server_url = (f"https://{arg_site}.atlassian.net/wiki/api/v2/spaces/{arg_space_id}") - - response = requests.get(server_url, auth=(arg_username, arg_api_token),timeout=30).json()['name'] - return(response) - -def get_spaces_all(arg_site,arg_username,arg_api_token): - server_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/spaces/?limit=250" - response = requests.get(server_url, auth=(arg_username,arg_api_token),timeout=30) - response.raise_for_status() # raises exception when not a 2xx response - space_list = response.json()['results'] - while 'next' in response.json()['_links'].keys(): - cursorserver_url = f"{server_url}&cursor{response.json()['_links']['next'].split('cursor')[1]}" - response = requests.get(cursorserver_url, auth=(arg_username,arg_api_token),timeout=30) - space_list = space_list + response.json()['results'] - return(space_list) 
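The removed Cloud helpers above and below all repeat the same v2 cursor-pagination loop; condensed here into one reference sketch (the `fetch_all` name is illustrative):

```python
import requests

def fetch_all(server_url, username, api_token, timeout=30):
    """Follow Confluence v2 '_links.next' cursors until the last page."""
    response = requests.get(server_url, auth=(username, api_token), timeout=timeout)
    response.raise_for_status()
    data = response.json()
    results = data['results']
    while 'next' in data.get('_links', {}):
        cursor_part = data['_links']['next'].split('cursor')[1]  # yields "=<token>"
        response = requests.get(f"{server_url}&cursor{cursor_part}", auth=(username, api_token), timeout=timeout)
        response.raise_for_status()
        data = response.json()
        results += data['results']
    return results
```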
-
-def get_pages_from_space(arg_site,arg_space_id,arg_username,arg_api_token):
-    page_list = []
-    server_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/spaces/{arg_space_id}/pages?status=current&limit=250"
-    response = requests.get(server_url, auth=(arg_username,arg_api_token),timeout=30)
-    page_list = response.json()['results']
-    while 'next' in response.json()['_links'].keys():
-        cursorserver_url = f"{server_url}&cursor{response.json()['_links']['next'].split('cursor')[1]}"
-        response = requests.get(cursorserver_url, auth=(arg_username,arg_api_token),timeout=30)
-        page_list = page_list + response.json()['results']
-    return(page_list)
-
-def get_body_export_view(arg_site,arg_page_id,arg_username,arg_api_token):
-    server_url = f"https://{arg_site}.atlassian.net/wiki/rest/api/content/{arg_page_id}?expand=body.export_view"
-    response = requests.get(server_url, auth=(arg_username, arg_api_token))
-    return(response)
-
-def get_page_name(arg_site,arg_page_id,arg_username,arg_api_token):
-    server_url = f"https://{arg_site}.atlassian.net/wiki/rest/api/content/{arg_page_id}"
-    r_pagetree = requests.get(server_url, auth=(arg_username, arg_api_token),timeout=30)
-    return(r_pagetree.json()['id'] + "_" + r_pagetree.json()['title'])
-
-def get_page_parent(arg_site,arg_page_id,arg_username,arg_api_token):
-    server_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/pages/{arg_page_id}"
-    response = requests.get(server_url, auth=(arg_username, arg_api_token),timeout=30)
-    return(response.json()['parentId'])
-
-def get_page_last_modified(arg_site,arg_page_id,arg_username,arg_api_token):
-    server_url = f"https://{arg_site}.atlassian.net/wiki/rest/api/content/{arg_page_id}?expand=history.lastUpdated"
-    response = requests.get(server_url, auth=(arg_username, arg_api_token),timeout=30)
-    data = response.json()
-    # Check if 'lastUpdated' exists and is not empty
-    last_updated = data.get('history', {}).get('lastUpdated')
-    if last_updated:
-        return last_updated['when']
-    # Fallback to 'createdDate' if 'lastUpdated' is not available
-    return data['history']['createdDate']
-
-def remove_illegal_characters(input):
-    return re.sub(r'[^\w_\.\- ]+', '_', input)
-
-def get_attachments(arg_site,arg_page_id,arg_outdir_attach,arg_username,arg_api_token):
-    my_attachments_list = []
-    server_url = f"https://{arg_site}.atlassian.net/wiki/rest/api/content/{arg_page_id}?expand=children.attachment"
-    response = requests.get(server_url, auth=(arg_username, arg_api_token),timeout=30)
-    my_attachments = response.json()['children']['attachment']['results']
-    for attachment in my_attachments:
-        attachment_title = remove_illegal_characters(requests.utils.unquote(attachment['title']).replace(" ","_").replace(":","-")) # I want attachments without spaces
-        attachment_file_path = os.path.join(arg_outdir_attach,attachment_title)
-        if not os.path.exists(attachment_file_path):
-            print(f"Downloading: {attachment_title}")
-            try:
-                attachment_url = f"https://{arg_site}.atlassian.net/wiki{attachment['_links']['download']}"
-                request_attachment = requests.get(attachment_url, auth=(arg_username, arg_api_token),allow_redirects=True,timeout=30)
-                open(attachment_file_path, 'wb').write(request_attachment.content)
-            except:
-                print(f"WARNING: Skipping attachment file {attachment_file_path} due to issues.
-                       url: {attachment_url}")
-        my_attachments_list.append(attachment_title)
-    return(my_attachments_list)
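Editor's note: `remove_illegal_characters` plus the surrounding `unquote`/`replace` chain is what turns attachment titles into safe file names. A small, self-contained illustration of that pipeline (the title value is hypothetical):

```python
import re
from urllib.parse import unquote

def remove_illegal_characters(name):
    # same pattern as the removed helper above
    return re.sub(r'[^\w_\.\- ]+', '_', name)

title = "Design%20Review:%20Q3.pdf"  # hypothetical URL-encoded attachment title
safe_name = remove_illegal_characters(unquote(title).replace(" ", "_").replace(":", "-"))
print(safe_name)  # -> Design_Review-_Q3.pdf
```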
url: {attachment_url}") - my_attachments_list.append(attachment_title) - return(my_attachments_list) - -# get page labels -def get_page_labels(arg_site,arg_page_id,arg_username,arg_api_token): - html_labels = [] - server_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/pages/{arg_page_id}/labels" - response = requests.get(server_url, auth=(arg_username,arg_api_token),timeout=30).json() - for l in response['results']: - html_labels.append(l['name']) - print(f"Label: {l['name']}") - html_labels = ", ".join(html_labels) - print(f"Page labels: {html_labels}") - return(html_labels) - -def get_page_properties_children(arg_site,arg_html,arg_outdir,arg_username,arg_api_token): - my_page_properties_children = [] - my_page_properties_children_dict = {} - soup = bs(arg_html, "html.parser") - my_page_properties_items = soup.findAll('td',class_="title") - my_page_properties_items_counter = 0 - for n in my_page_properties_items: - my_page_id = str(n['data-content-id']) - my_page_properties_children.append(str(n['data-content-id'])) - my_page_properties_items_counter = my_page_properties_items_counter + 1 - my_page_name = get_page_name(arg_site,int(my_page_id),arg_username,arg_api_token).rsplit('_',1)[1].replace(":","-").replace(" ","_").replace("%20","_") # replace offending characters from file name - my_page_properties_children_dict.update({ my_page_id:{}}) - my_page_properties_children_dict[my_page_id].update({"ID": my_page_id}) - my_page_properties_children_dict[my_page_id].update({"Name": my_page_name}) - print( f"{my_page_properties_items_counter} Page Properties Children Pages") - return[my_page_properties_children,my_page_properties_children_dict] - -def get_editor_version(arg_site,arg_page_id,arg_username,arg_api_token): - server_url = f"https://{arg_site}.atlassian.net/wiki/rest/api/content/{arg_page_id}?expand=metadata.properties.editor" - response = requests.get(server_url, auth=(arg_username, arg_api_token)) - return(response) - -def dump_html( - arg_site, - arg_html, - arg_title, - arg_page_id, - arg_outdir_base, - arg_outdir_content, - arg_page_labels, - arg_page_parent, - arg_username, - arg_api_token, - arg_sphinx_compatible=True, - arg_sphinx_tags=False, - arg_type="", - arg_html_output=False, - arg_rst_output=True, - arg_show_labels=False - ): - """Create HTML and RST files - - Args: - arg_site: Name of the Confluence Site - arg_html: HTML Content to use for page - arg_title: Title of the page - arg_page_id: Page ID - arg_outdir_base: Base output folder - arg_outdir_content: Output folder for Content - arg_page_labels: Labels of the page - arg_page_parent: Parent of the page - arg_username: Username for authentication - arg_api_token: API Token for authentication - arg_sphinx_compatible: Place _static and _images folder at root of output folder - arg_sphinx_tags: Add tags to output RST - arg_type: For Page Properties, the type of page: "report", "child" or "common" if it's not for Page Properties - - Returns: - HTML, RST and all attachments, embeds and emoticons - """ - my_vars = set_variables() - my_emoticons_list = [] - my_outdir_content = arg_outdir_content - #my_outdir_content = os.path.join(arg_outdir_base,str(arg_page_id) + "-" + str(arg_title)) # this is for html and rst files - os.makedirs(my_outdir_content, exist_ok=True) - #myOutdir = os.path.join(arg_outdir,str(arg_page_id) + "-" + str(arg_title)) - my_outdirs = mk_outdirs(arg_outdir_base) # this is for everything for _images and _static - my_vars = set_variables() # create a dict with the 3 folder paths: attach, 
emoticons, styles - - soup = bs(arg_html, "html.parser") - - # - # removing elements we don't need like - # *
\n" - f"\n" - f"{arg_title}\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"\n" - f"

{arg_title}

\n" - f"

Original URL: {arg_title}


\n" - ) - - - myFooter = (f"\n" - f"" - ) - # - # At the end of the page, put a link to all attachments. - # - if arg_sphinx_compatible == True: - attach_dir = "../" + my_vars['attach_dir'] - else: - attach_dir = my_vars['attach_dir'] - if len(my_attachments) > 0: - my_pre_footer = "

Attachments

    " - for attachment in my_attachments: - my_pre_footer += (f"
  1. {attachment}
  2. ") - my_pre_footer += "

" - - # - # Putting HTML together - # - pretty_html = soup.prettify() - html_file = open(html_file_path, 'w', encoding='utf-8') - html_file.write(my_header) - html_file.write(pretty_html) - if len(my_attachments) > 0: - html_file.write(my_pre_footer) - html_file.write(myFooter) - html_file.close() - if arg_html_output == True: - print(f"Exported HTML file {html_file_path}") - # - # convert html to rst - # - if not arg_rst_output: - return page_url, html_file_path - - rst_file_name = f"{html_file_name.replace('html','rst')}" - rst_file_path = os.path.join(my_outdir_content,rst_file_name) + c_path = '' + path_params['context_path'] = c_path + + return f"{base_url.rstrip('/')}{template.format(**path_params)}" + + +def _execute_get_request(url, auth_info, params=None): + """ Executes GET request. Prints ERRORS to stderr, but stays silent on success. """ + headers = {"Accept": "application/json"} + if isinstance(auth_info, dict): headers.update(auth_info) + try: - output_rst = pypandoc.convert_file(str(html_file_path), 'rst', format='html',extra_args=['--standalone','--wrap=none','--list-tables']) + resp = requests.get(url, headers=headers, auth=auth_info if not isinstance(auth_info, dict) else None, + params=params) + resp.raise_for_status() + + # Robust check for HTML responses (SSO redirects) with specific hints + if 'application/json' not in resp.headers.get('Content-Type', ''): + print(f"Error: Non-JSON response from {url} (Content-Type: {resp.headers.get('Content-Type')})", + file=sys.stderr) + print("This often happens on authentication failure (redirect to HTML login page).", file=sys.stderr) + + if isinstance(auth_info, dict): + print("\n[Data Center Hint]: Are you connected to the VPN/Intranet?", file=sys.stderr) + print("Many companies block API access from outside the corporate network.", file=sys.stderr) + print("Also ensure that Personal Access Tokens are not disabled by an SSO policy.", file=sys.stderr) + else: + print("\n[Cloud Hint]: Please verify your CONFLUENCE_USER and CONFLUENCE_TOKEN environment variables.", + file=sys.stderr) + + return None + + return resp.json() except Exception as e: - print("There was an issue generating an RST file from the page.") - print(e) + print(f"Request Error: {e}", file=sys.stderr) + return None + + +def get_page_view_url(base_url, platform_config, context_path_override, spaceKey, pageId): + path_params = {'spaceKey': spaceKey, 'pageId': pageId} + return _build_api_url(base_url, platform_config, context_path_override, 'url_view_page', path_params) + + +# --- Downloader --- + +def download_file(url, local_filename, auth_info): + headers = {} + auth_obj = None + if isinstance(auth_info, dict): + headers.update(auth_info) else: - ## - ## RST Header with Page Metadata - ## - if (arg_sphinx_compatible == True): - rst_page_header = (f":conf_pagetype: {arg_type}\n" - f":conf_pageid: {arg_page_id}\n" - f":conf_parent: {arg_page_parent}\n" - f":conf_labels: {arg_page_labels}\n" - f":doc_title: {arg_title}\n" - f"\n" - ) + auth_obj = auth_info + + try: + with requests.get(url, headers=headers, auth=auth_obj, stream=True) as r: + r.raise_for_status() + with open(local_filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + return True + except Exception as e: + print(f"Download Error {url}: {e}", file=sys.stderr) + return False + + +# --- HTML Processor (The Core Logic) --- + +def inject_sidebar(soup, sidebar_html, current_page_id): + """ + Injects the sidebar HTML, wraps content in a layout, + and highlights the current 
+# --- HTML Processor (The Core Logic) ---
+
+def inject_sidebar(soup, sidebar_html, current_page_id):
+    """
+    Injects the sidebar HTML, wraps content in a layout,
+    and highlights the current page / opens parent folders.
+    """
+    if not sidebar_html: return soup
+
+    # 1. Parse Sidebar
+    sidebar_soup = BeautifulSoup(sidebar_html, 'html.parser')
+
+    # 2. Highlight Current Page & Open Parents
+    target_href = f"{current_page_id}.html"
+    active_link = sidebar_soup.find('a', href=target_href)
+
+    if active_link:
+        # Add active class
+        active_link['class'] = active_link.get('class', []) + ['active-page']
+
+        # Walk up the tree and set 'open' on all <details> ancestors
+        parent = active_link.parent
+        while parent:
+            if parent.name == 'details':
+                parent['open'] = ''  # Set attribute
+            parent = parent.parent
+
+    # 3. Create Layout Wrapper
+    if soup.body:
+        # Toggle Button
+        toggle_btn = soup.new_tag('button', id='sidebar-toggle', attrs={'title': 'Toggle Sidebar'})
+        toggle_btn.string = "☰"
+
+        layout_div = soup.new_tag('div', attrs={'class': 'layout-container'})
+
+        aside = soup.new_tag('aside', id='sidebar')
+        aside.append(Comment(" CONFLUENCE-SIDEBAR-START "))
+
+        if sidebar_soup.body:
+            for child in list(sidebar_soup.body.children):
+                aside.append(child)
         else:
-            rst_page_header = (f".. meta::\n"
-                               f"   :confluencePageId: {arg_page_id} \n"
-                               f"   :confluencePageLabels: {arg_page_labels} \n"
-                               f"   :confluencePageParent: {arg_page_parent} \n"
-                               f"\n"
-                               )
-        ## Footer with list of page labels
-        if arg_show_labels == True:
-            footer_rst = (f"...."
-                          f"\n"
-                          f"\n**Page labels**: {arg_page_labels} \n")
+            for child in list(sidebar_soup.children):
+                aside.append(child)
+
+        aside.append(Comment(" CONFLUENCE-SIDEBAR-END "))
+
+        # JS Resizer Handle
+        resizer = soup.new_tag('div', id='resizer')
+
+        main_content = soup.new_tag('main', id='content')
+
+        # Move existing body content into main
+        for content in list(soup.body.contents):
+            main_content.append(content)
+
+        layout_div.append(aside)
+        layout_div.append(resizer)
+        layout_div.append(main_content)
+
+        soup.body.clear()
+        soup.body.append(toggle_btn)
+        soup.body.append(layout_div)
+
+        # Inject Scripts (Toggle + Resizer + Persistence)
+        script = soup.new_tag('script')
+        script.string = """
+        document.addEventListener('DOMContentLoaded', function() {
+            const btn = document.getElementById('sidebar-toggle');
+            const sidebar = document.getElementById('sidebar');
+            const resizer = document.getElementById('resizer');
+
+            const savedWidth = localStorage.getItem('sidebarWidth');
+            if (savedWidth && sidebar) {
+                sidebar.style.width = savedWidth;
+                sidebar.style.flexBasis = savedWidth;
+            }
+
+            if (btn && sidebar) {
+                btn.addEventListener('click', function() {
+                    sidebar.classList.toggle('collapsed');
+                });
+            }
+
+            if (resizer && sidebar) {
+                let isResizing = false;
+                resizer.addEventListener('mousedown', (e) => {
+                    isResizing = true;
+                    document.body.style.cursor = 'col-resize';
+                    resizer.classList.add('active');
+                });
+                document.addEventListener('mousemove', (e) => {
+                    if (!isResizing) return;
+                    let newWidth = e.clientX;
+                    if (newWidth < 50) newWidth = 50;
+                    if (newWidth > window.innerWidth * 0.6) newWidth = window.innerWidth * 0.6;
+                    sidebar.style.width = newWidth + 'px';
+                    sidebar.style.flexBasis = newWidth + 'px';
+                });
+                document.addEventListener('mouseup', () => {
+                    if (isResizing) {
+                        localStorage.setItem('sidebarWidth', sidebar.style.width);
+                    }
+                    isResizing = false;
+                    document.body.style.cursor = 'default';
+                    resizer.classList.remove('active');
+                });
+            }
+        });
+        """
+        soup.body.append(script)
+
+    return soup
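Editor's note: `inject_sidebar` expects an already-rendered page soup plus the shared sidebar fragment; matching works purely on the `{page_id}.html` href, so the caller must pass the same id used for the output file name. A minimal sketch (inputs hypothetical, function as defined above):

```python
from bs4 import BeautifulSoup

page_soup = BeautifulSoup("<html><body><p>Hello</p></body></html>", "html.parser")
sidebar_fragment = ('<ul><li><details><summary>Docs</summary>'
                    '<ul><li><a href="12345.html">Page</a></li></ul>'
                    '</details></li></ul>')

result = inject_sidebar(page_soup, sidebar_fragment, "12345")
# the matching link now carries class="active-page" and its <details> ancestors are open
print(result.find("a", class_="active-page"))
```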
+def process_page_content(html_content, page_metadata, base_url, auth_info, css_files=None, exported_page_ids=None,
+                         sidebar_html=None):
+    """
+    Parses the HTML content using BeautifulSoup.
+    1. Injects Metadata (Head).
+    2. Injects Page Title & Modification Info (Body Top).
+    3. Downloads images & fixes links.
+    4. Injects CSS & Sidebar.
+    """
+    # Handle empty content gracefully
+    soup = BeautifulSoup(html_content or "", 'html.parser')
+
+    valid_ids = set(exported_page_ids) if exported_page_ids else set()
+    page_id = page_metadata.get('id')
+
+    # 1. Metadata Injection (Head)
+    if not soup.head:
+        head = soup.new_tag('head')
+        soup.insert(0, head)
+
+    title_string = page_metadata.get('title', 'Untitled')
+    title_tag = soup.new_tag('title')
+    title_tag.string = title_string
+    soup.head.append(title_tag)
+
+    meta_id = soup.new_tag('meta', attrs={'name': 'confluence-page-id', 'content': page_id})
+    soup.head.append(meta_id)
+
+    labels = [l['name'] for l in page_metadata.get('metadata', {}).get('labels', {}).get('results', [])]
+    meta_labels = soup.new_tag('meta', attrs={'name': 'confluence-labels', 'content': ', '.join(labels)})
+    soup.head.append(meta_labels)
+
+    # --- Inject Title & Metadata in Body ---
+    if not soup.body:
+        body = soup.new_tag('body')
+        # Move any loose children to body (if any existed in empty string scenario)
+        for element in list(soup.children):
+            if element.name != 'head':
+                body.append(element)
+        soup.append(body)
+
+    # Construct Header Block
+    h1 = soup.new_tag('h1')
+    h1.string = title_string
+
+    # Metadata Line
+    version_info = page_metadata.get('version', {})
+    author_name = "Unknown"
+    date_str = "Unknown Date"
+
+    if 'by' in version_info and 'displayName' in version_info['by']:
+        author_name = version_info['by']['displayName']
+
+    if 'when' in version_info:
+        try:
+            dt = datetime.strptime(version_info['when'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
+            date_str = dt.strftime("%d. %b %Y")
+        except ValueError:  # unexpected timestamp format: fall back to the raw value
+            date_str = version_info['when']
+
+    meta_div = soup.new_tag('div', attrs={'class': 'page-metadata'})
+    meta_ul = soup.new_tag('ul')
+    meta_li = soup.new_tag('li', attrs={'class': 'page-metadata-modification-info'})
+
+    meta_li.append("Last changed by ")
+    span_author = soup.new_tag('span', attrs={'class': 'author'})
+    span_author.string = author_name
+    meta_li.append(span_author)
+    meta_li.append(" on ")
+    span_date = soup.new_tag('span', attrs={'class': 'last-modified'})
+    span_date.string = date_str
+    meta_li.append(span_date)
+
+    meta_ul.append(meta_li)
+    meta_div.append(meta_ul)
+
+    soup.body.insert(0, meta_div)
+    soup.body.insert(0, h1)
+
+    # CSS Injection
+    style_tag = soup.new_tag('style')
+    style_tag.string = """
+    /* Global Reset */
+    *, *::before, *::after { box-sizing: border-box; }
+
+    body { margin: 0; padding: 0; font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; }
+    .layout-container { display: flex; height: 100vh; overflow: hidden; }
+
+    /* Sidebar Styling */
+    #sidebar {
+        flex: 0 0 auto;
+        width: 350px;
+        min-width: 50px;
+        border-right: 1px solid #ddd;
+        overflow-y: auto;
+        padding: 10px;
+        padding-left: 15px;
+        padding-top: 60px;
+        padding-right: 4px;
+        background: #f4f5f7;
+        font-size: 14px;
+        resize: horizontal;
+        position: relative;
+        transition: width 0.2s, padding 0.2s;
+    }
+
+    #sidebar.collapsed {
+        width: 0px !important; min-width: 0 !important; padding: 0; border: none; overflow: hidden; flex-basis: 0 !important;
+    }
+
+    /* Resizer Handle */
+    #resizer {
+        width: 5px; cursor: col-resize; background-color: transparent; border-left: 1px solid #eee; transition: background-color 0.2s; flex: 0 0 auto; z-index: 10;
+    }
+    #resizer:hover, #resizer.active { background-color: #4c9aff; }
+
+    /* Toggle Button */
+    #sidebar-toggle {
+        position: fixed; top: 15px; left: 15px; z-index: 9999;
+        background: rgba(255, 255, 255, 0.9); border: 1px solid #ccc; border-radius: 4px; box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+        font-size: 20px; cursor: pointer; color: #42526e;
+        width: 32px; height: 32px; line-height: 30px; text-align: center; padding: 0;
+    }
+    #sidebar-toggle:hover { background: #ebecf0; }
+
+    /* Content Area */
+    #content {
+        flex: 1;
+        overflow-y: auto;
+        padding: 40px 30px !important;
+        max-width: 100%;
+    }
+
+    /* Page Title & Metadata Styling */
+    h1 { margin-top: 0; color: #172b4d; font-size: 2em; font-weight: 600; }
+    .page-metadata { margin-bottom: 20px; font-size: 12px; color: #6b778c; }
+    .page-metadata ul { list-style: none; padding: 0; margin: 0; }
+    .page-metadata li { display: inline-block; margin-right: 10px; }
+
+    /* Sidebar Tree */
+    #sidebar ul { list-style: none; padding-left: 28px; margin: 0; }
+    #sidebar li { margin: 4px 0; white-space: normal; word-wrap: break-word; }
+    #sidebar li.leaf { list-style: disc; margin-left: 18px; }
+    #sidebar li.folder { list-style: none; }
+
+    #sidebar summary { cursor: pointer; font-weight: 500; margin-bottom: 2px; color: #42526e; outline: none; }
+    #sidebar summary > a { color: #42526e; text-decoration: none; }
+    #sidebar summary > a:hover { text-decoration: underline; color: #0052cc; }
+
+    #sidebar a { text-decoration: none; color: #42526e; }
+    #sidebar a:hover { color: #0052cc; text-decoration: underline; }
+    #sidebar a.active-page { color: #0052cc; font-weight: bold; }
+
+    #sidebar details > summary { list-style: none; }
+    #sidebar details > summary::-webkit-details-marker { display: none; }
+    #sidebar details > summary::before { content: '▶'; display: inline-block; font-size: 10px; margin-right: 6px; color: #6b778c; transition: transform 0.2s; }
+    #sidebar details[open] > summary::before { transform: rotate(90deg); }
+    """
+    soup.head.append(style_tag)
+
+    if css_files:
+        for css_path in css_files:
+            link_css = soup.new_tag('link', attrs={'rel': 'stylesheet', 'href': css_path, 'type': 'text/css'})
+            soup.head.append(link_css)
+
+    # 2. Image Downloading & rewriting
+    for img in soup.find_all('img'):
+        src = img.get('src')
+        if not src: continue
+
+        if src.startswith('/'):
+            full_url = base_url.rstrip('/') + src
+        else:
+            full_url = src
+
+        if '/download/' in src or '/images/icons/' in src:
+            filename = unquote(os.path.basename(urlparse(src).path))
+            local_path = os.path.join(outdir_attachments, filename)
+
+            if download_file(full_url, local_path, auth_info):
+                img['src'] = f"../attachments/{filename}"
+            else:
+                print(f"  Warning: Could not download image {src}", file=sys.stderr)
+
+    # 3. Link Rewriting
+    for a in soup.find_all('a'):
+        href = a.get('href')
+        if not href: continue
+
+        target_id = None
+        linked_id = a.get('data-linked-resource-id')
+        resource_type = a.get('data-linked-resource-type')
+
+        if linked_id and (not resource_type or resource_type == 'page'):
+            target_id = linked_id
+        elif '/pages/' in href:
+            match = re.search(r'/pages/(\d+)', href)
+            if match: target_id = match.group(1)
+        elif 'pageId=' in href:
+            try:
+                target_id = re.search(r'pageId=(\d+)', href).group(1)
+            except AttributeError:  # no pageId match in href
+                pass
+
+        if target_id and target_id in valid_ids:
+            a['href'] = f"{target_id}.html"
         else:
-            footer_rst = ""
-
-        rst_file = open(rst_file_path, 'w', encoding='utf-8')
-        rst_file.write(rst_page_header)
-        rst_file.write(output_rst)
-        rst_file.write(footer_rst)
-        rst_file.close()
-        print(f"Exported RST file: {rst_file_path}")
-    if arg_html_output == False:
-        os.remove(html_file_path)
-    return page_url, rst_file_path
+            if href.startswith('/'):
+                a['href'] = base_url.rstrip('/') + href
+
+    # 4. Sidebar Injection
+    if sidebar_html:
+        soup = inject_sidebar(soup, sidebar_html, page_id)
+
+    return str(soup)
+
+
+# ... (API Calls unchanged) ...
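Editor's note: unlike the removed v2 helpers, the CQL-based calls below page with `start`/`limit` and leave the looping to the caller; a batch smaller than `limit` marks the end. A sketch of such a driver around `get_pages_from_space` (defined just below), assuming the standard `results` array in the search payload:

```python
def iter_space_pages(spaceKey, base_url, platform_config, auth_info,
                     context_path_override, limit=200):
    """Yield every page of a space by stepping start/limit through the CQL search (sketch)."""
    start = 0
    while True:
        batch = get_pages_from_space(spaceKey, start, limit, base_url,
                                     platform_config, auth_info, context_path_override)
        results = (batch or {}).get('results', [])
        yield from results
        if len(results) < limit:
            break  # short batch: no more pages
        start += limit
```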
+def get_page_full(pageId, base_url, platform_config, auth_info, context_path_override):
+    params_export = {'expand': 'body.export_view,version,ancestors,space,metadata.labels'}
+    path_params = {'pageId': pageId}
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_get_page', path_params)
+    return _execute_get_request(url, auth_info, params=params_export)
+
+
+def get_child_pages(pageId, base_url, platform_config, auth_info, context_path_override):
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_get_child_pages', {'pageId': pageId})
+    params = {'limit': 200, 'expand': 'ancestors,metadata.labels'}
+    return _execute_get_request(url, auth_info, params=params)
+
+
+def get_space_homepage(spaceKey, base_url, platform_config, auth_info, context_path_override):
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_get_space', {'spaceKey': spaceKey})
+    space_info = _execute_get_request(url, auth_info, {'expand': 'homepage'})
+    if space_info and 'homepage' in space_info:
+        return space_info['homepage']
+    return None
+
+
+def get_page_basic(pageId, base_url, platform_config, auth_info, context_path_override):
+    path_params = {'pageId': pageId}
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_get_page', path_params)
+    return _execute_get_request(url, auth_info)
+
+
+def get_page_children(pageId, base_url, platform_config, auth_info, context_path_override):
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_cql_search')
+    params = {'cql': f'parent={pageId}', 'limit': 200, 'expand': 'ancestors'}
+    return _execute_get_request(url, auth_info, params=params)
+
+
+def get_page_attachments(pageId, base_url, platform_config, auth_info, context_path_override):
+    path_params = {'pageId': pageId}
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_get_attachments', path_params)
+    params = {'limit': 200}
+    return _execute_get_request(url, auth_info, params=params)
+
+
+def get_pages_from_space(spaceKey, start, limit, base_url, platform_config, auth_info, context_path_override):
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_cql_search')
+    params = {'cql': f'space="{spaceKey}"', 'start': start, 'limit': limit, 'expand': 'ancestors'}
+    return _execute_get_request(url, auth_info, params=params)
+
+
+def get_pages_by_label(label, start, limit, base_url, platform_config, auth_info, context_path_override):
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_cql_search')
+    params = {'cql': f'label="{label}"', 'start': start, 'limit': limit, 'expand': 'ancestors'}
+    return _execute_get_request(url, auth_info, params=params)
+
+
+def get_all_spaces(base_url, platform_config, auth_info, context_path_override):
+    url = _build_api_url(base_url, platform_config, context_path_override, 'url_get_all_spaces')
+    params = {'limit': 200}
+    return _execute_get_request(url, auth_info, params=params)
\ No newline at end of file
diff --git a/confluence_products.ini b/confluence_products.ini
new file mode 100644
index 0000000..ff66113
--- /dev/null
+++ b/confluence_products.ini
@@ -0,0 +1,23 @@
+[cloud]
+platform_type = cloud
+auth_method = basic_api_token
+base_path = /wiki
+url_get_page = /rest/api/content/{pageId}
+url_get_child_pages = /rest/api/content/{pageId}/child/page
+url_get_space = /rest/api/space/{spaceKey}
+url_get_attachments = /rest/api/content/{pageId}/child/attachment
+url_cql_search =
/rest/api/content/search +url_get_all_spaces = /rest/api/space +url_view_page = /spaces/{spaceKey}/pages/{pageId} + +[dc] +platform_type = dc +auth_method = bearer_pat +default_context_path = /wiki +url_get_page = {context_path}/rest/api/content/{pageId} +url_get_child_pages = {context_path}/rest/api/content/{pageId}/child/page +url_get_space = {context_path}/rest/api/space/{spaceKey} +url_get_attachments = {context_path}/rest/api/content/{pageId}/child/attachment +url_cql_search = {context_path}/rest/api/content/search +url_get_all_spaces = {context_path}/rest/api/space +url_view_page = {context_path}/pages/viewpage.action?pageId={pageId} \ No newline at end of file diff --git a/create_editor.py b/create_editor.py new file mode 100644 index 0000000..573e0c2 --- /dev/null +++ b/create_editor.py @@ -0,0 +1,659 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Confluence Sidebar Editor Generator (Robust Concatenation Version) +---------------------------------------------------------------- +Generates a standalone HTML editor ('editor_sidebar.html') from 'sidebar.md'. +Uses direct string concatenation to avoid Python formatting issues with JS syntax. + +Usage: + python3 create_editor.py --site-dir "./output/TIMESTAMP Space X" +""" + +import os +import sys +import argparse +import re +import html +import shutil +from urllib.parse import unquote, urlparse + +# --- CSS CONTENT --- +CSS_CONTENT = """ + :root { + --bg-color: #f4f5f7; + --text-color: #172b4d; + --link-color: #0052cc; + --border-color: #dfe1e6; + --hover-color: #ebecf0; + --selected-color: #deebff; + --drop-target-color: #b3d4ff; + --danger-color: #de350b; + --folder-line-color: #999; /* Darker line for better visibility */ + --insert-line-color: #0052cc; + } + + * { box-sizing: border-box; } + + body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; padding: 0; margin: 0; background: var(--bg-color); height: 100vh; display: flex; flex-direction: column; } + + header { background: white; padding: 10px 20px; border-bottom: 1px solid var(--border-color); display: flex; align-items: center; justify-content: space-between; box-shadow: 0 1px 3px rgba(0,0,0,0.05); z-index: 10; flex: 0 0 auto; } + + .toolbar-group { display: flex; gap: 10px; align-items: center; } + + h1 { margin: 0; font-size: 1.1rem; color: var(--text-color); margin-right: 20px; } + + button { background: white; border: 1px solid var(--border-color); color: var(--text-color); padding: 6px 12px; border-radius: 3px; cursor: pointer; font-size: 13px; } + button:hover { background: var(--hover-color); } + button.primary { background: var(--link-color); color: white; border-color: var(--link-color); font-weight: 500; } + button.primary:hover { background: #0065ff; } + + input[type="text"] { padding: 6px 10px; border: 1px solid var(--border-color); border-radius: 3px; width: 200px; font-size: 13px; } + + .workspace { flex: 1; display: flex; overflow: hidden; } + .tree-panel { flex: 1; overflow-y: auto; padding: 20px; background: white; } + + /* Tree Structure Styling */ + ul { list-style: none; padding: 0; margin: 0; } + + /* Nested lists indentation logic: + Shift right and draw a border on the left to create the 'tree line' effect. 
+ */ + ul li ul { + margin-left: 11px; /* Aligns line roughly with center of toggle icon above */ + padding-left: 20px; /* Push content away from the line */ + border-left: 1px solid var(--folder-line-color); + } + + #root-tree { border-left: none; margin-left: 0; } + + li { margin: 0; position: relative; } + + .node-row { + display: flex; align-items: center; + padding: 4px 8px; + border-radius: 3px; + border: 2px solid transparent; + margin-bottom: 2px; + cursor: default; + user-select: none; + transition: background 0.1s; + } + .node-row:hover { background: var(--hover-color); } + .node-row.deleted { opacity: 0.5; text-decoration: line-through; background: #fff0f0; } + + .dragging { opacity: 0.4; background: #eee; } + + /* Drop Zones */ + .drag-over-top { border-top: 3px solid var(--insert-line-color) !important; background: transparent !important; } + .drag-over-bottom { border-bottom: 3px solid var(--insert-line-color) !important; background: transparent !important; } + .drag-over-middle { background: var(--drop-target-color) !important; border: 2px dashed var(--insert-line-color) !important; opacity: 0.9; } + + .toggle-icon { width: 24px; height: 24px; text-align: center; cursor: pointer; color: #6b778c; font-size: 12px; line-height: 24px; margin-right: 2px; border-radius: 3px; } + .toggle-icon:hover { background: rgba(0,0,0,0.1); color: var(--text-color); } + .toggle-icon.leaf { visibility: hidden; } + + .drag-handle { cursor: grab; color: #b3bac5; margin-right: 8px; font-size: 16px; padding: 0 4px; } + .drag-handle:hover { color: var(--text-color); background: #eee; border-radius: 3px; } + + .node-icon { margin-right: 8px; font-size: 16px; } + .node-title { flex: 1; font-size: 14px; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; padding: 2px 5px; border-radius: 3px; border: 1px solid transparent; } + .node-title:focus { border-color: var(--link-color); background: white; outline: none; } + + .actions { display: none; margin-left: 10px; gap: 4px; align-items: center; } + .node-row:hover .actions { display: flex; } + .action-btn { padding: 2px 8px; font-size: 11px; border: 1px solid #ccc; background: #fff; color: #42526e; border-radius: 3px; cursor: pointer; } + .action-btn:hover { background: #ebecf0; color: var(--text-color); } + .btn-del:hover { background: #ffebe6; color: var(--danger-color); border-color: var(--danger-color); } + .btn-link { font-weight: bold; color: var(--link-color); } + + .hidden { display: none !important; } + + #toast { position: fixed; bottom: 20px; right: 20px; background: #333; color: white; padding: 10px 20px; border-radius: 4px; opacity: 0; transition: opacity 0.3s; pointer-events: none; } + #toast.show { opacity: 1; } + + #md-output-container { display: none; position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); background: white; padding: 20px; box-shadow: 0 5px 20px rgba(0,0,0,0.2); border-radius: 8px; z-index: 100; width: 80%; height: 80%; flex-direction: column; } + #md-textarea { flex: 1; width: 100%; margin-bottom: 10px; font-family: monospace; } + #overlay { display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.5); z-index: 99; } +""" + +# --- 2. JS CONTENT --- +JS_CONTENT = """ +console.log("0. JS Block Started."); + +const app = {}; +app.dragSrcEl = null; +app.root = null; + +app.init = function() { + console.log("1. 
app.init() called"); + this.root = document.getElementById('root-tree'); + + if (!this.root) { + console.error("CRITICAL: #root-tree not found"); + return; + } + + this.root.addEventListener('click', (e) => { + const target = e.target; + if (target.classList.contains('toggle-icon')) { + this.toggle(target); + } else if (target.classList.contains('btn-del')) { + this.deleteNode(target); + } else if (target.classList.contains('btn-add')) { + this.addChild(target); + } else if (target.classList.contains('btn-exp')) { + this.toggleRecursive(target, true); + } else if (target.classList.contains('btn-col')) { + this.toggleRecursive(target, false); + } else if (target.classList.contains('btn-link')) { + const row = target.closest('.node-row'); + const titleDiv = row.querySelector('.node-title'); + const href = titleDiv.getAttribute('data-href'); + if (href) window.open(href, '_blank'); + } + }); + + this.refreshDnD(); + + const countEl = document.getElementById('node-count-display'); + if(countEl && window.EDITOR_NODE_COUNT) { + countEl.innerText = window.EDITOR_NODE_COUNT; + } + console.log("2. App initialized."); +}; + +app.updateFolderState = function(li) { + const ul = li.querySelector('ul'); + const toggle = li.querySelector('.node-row .toggle-icon'); + const icon = li.querySelector('.node-row .node-icon'); + + if (!ul || ul.children.length === 0) { + toggle.classList.add('leaf'); + toggle.innerText = '●'; + icon.innerText = '📄'; + } else { + toggle.classList.remove('leaf'); + if (ul.classList.contains('hidden')) { + toggle.innerText = '▶'; + } else { + toggle.innerText = '▼'; + } + icon.innerText = '📂'; + } +}; + +app.refreshDnD = function() { + const rows = document.querySelectorAll('.node-row'); + rows.forEach(row => { + row.setAttribute('draggable', 'true'); + row.ondragstart = this.handleDragStart.bind(this); + row.ondragover = this.handleDragOver; + row.ondragenter = this.handleDragEnter; + row.ondragleave = this.handleDragLeave; + row.ondrop = this.handleDrop.bind(this); + row.ondragend = this.handleDragEnd; + }); +}; + +app.handleDragStart = function(e) { + this.dragSrcEl = e.target.closest('li'); + e.dataTransfer.effectAllowed = 'move'; + e.dataTransfer.setData('text/html', e.target.innerHTML); + e.target.classList.add('dragging'); +}; + +app.handleDragOver = function(e) { + if (e.preventDefault) e.preventDefault(); + e.dataTransfer.dropEffect = 'move'; + + const row = e.currentTarget; + const rect = row.getBoundingClientRect(); + const relY = e.clientY - rect.top; + const height = rect.height; + + row.classList.remove('drag-over-top', 'drag-over-bottom', 'drag-over-middle'); + + if (relY < height * 0.25) { + row.classList.add('drag-over-top'); + } else if (relY > height * 0.75) { + row.classList.add('drag-over-bottom'); + } else { + row.classList.add('drag-over-middle'); + } + return false; +}; + +app.handleDragEnter = function(e) { + if (e.currentTarget.classList) e.currentTarget.classList.add('drag-over-middle'); +}; + +app.handleDragLeave = function(e) { + if (e.currentTarget.classList) e.currentTarget.classList.remove('drag-over-top', 'drag-over-bottom', 'drag-over-middle'); +}; + +app.handleDrop = function(e) { + if (e.stopPropagation) e.stopPropagation(); + + const targetRow = e.currentTarget; + const targetLi = targetRow.closest('li'); + + if (this.dragSrcEl === targetLi || this.dragSrcEl.contains(targetLi)) { + targetRow.classList.remove('drag-over-top', 'drag-over-bottom', 'drag-over-middle'); + return false; + } + + const rect = targetRow.getBoundingClientRect(); + const 
relY = e.clientY - rect.top; + const height = rect.height; + + if (relY < height * 0.25) { + targetLi.parentNode.insertBefore(this.dragSrcEl, targetLi); + } else if (relY > height * 0.75) { + targetLi.parentNode.insertBefore(this.dragSrcEl, targetLi.nextSibling); + } else { + let ul = targetLi.querySelector('ul'); + if (!ul) { + ul = document.createElement('ul'); + targetLi.appendChild(ul); + const toggle = targetRow.querySelector('.toggle-icon'); + toggle.classList.remove('leaf'); + toggle.innerText = '▼'; + const icon = targetRow.querySelector('.node-icon'); + icon.innerText = '📂'; + } + ul.classList.remove('hidden'); + ul.appendChild(this.dragSrcEl); + } + + this.dragSrcEl.querySelector('.node-row').classList.remove('dragging'); + this.updateFolderState(targetLi); + const oldParent = this.dragSrcEl.parentElement.closest('li'); + if (oldParent) this.updateFolderState(oldParent); + + targetRow.classList.remove('drag-over-top', 'drag-over-bottom', 'drag-over-middle'); + return false; +}; + +app.handleDragEnd = function(e) { + document.querySelectorAll('.node-row').forEach(row => { + row.classList.remove('drag-over-top', 'drag-over-bottom', 'drag-over-middle', 'dragging'); + }); +}; + +app.toggle = function(el) { + if (el.classList.contains('leaf')) return; + const li = el.closest('li'); + const ul = li.querySelector('ul'); + if (ul) { + if (ul.classList.contains('hidden')) { + ul.classList.remove('hidden'); + el.innerText = '▼'; + } else { + ul.classList.add('hidden'); + el.innerText = '▶'; + } + } +}; + +app.toggleRecursive = function(btn, expand) { + const rootLi = btn.closest('li'); + const processUl = (ul) => { if (expand) ul.classList.remove('hidden'); else ul.classList.add('hidden'); }; + const processToggle = (t) => { if (!t.classList.contains('leaf')) t.innerText = expand ? 
'▼' : '▶'; }; + + const rootUl = rootLi.querySelector('ul'); + const rootToggle = rootLi.querySelector('.node-row .toggle-icon'); + if (rootUl) processUl(rootUl); + if (rootToggle) processToggle(rootToggle); + + rootLi.querySelectorAll('ul').forEach(processUl); + rootLi.querySelectorAll('.toggle-icon').forEach(processToggle); +}; + +app.expandAll = function() { + document.querySelectorAll('ul.hidden').forEach(ul => ul.classList.remove('hidden')); + document.querySelectorAll('.toggle-icon:not(.leaf)').forEach(el => el.innerText = '▼'); +}; + +app.collapseAll = function() { + document.querySelectorAll('#root-tree ul').forEach(ul => ul.classList.add('hidden')); + document.querySelectorAll('.toggle-icon:not(.leaf)').forEach(el => el.innerText = '▶'); +}; + +app.expandToLevel = function(level) { + this.collapseAll(); + const expandRecursive = (ul, currentLevel) => { + if (currentLevel >= level) return; + + Array.from(ul.children).forEach(li => { + const childUl = li.querySelector('ul'); + const toggle = li.querySelector('.node-row .toggle-icon'); + + if (childUl) { + childUl.classList.remove('hidden'); + if (toggle && !toggle.classList.contains('leaf')) { + toggle.innerText = '▼'; + } + expandRecursive(childUl, currentLevel + 1); + } + }); + }; + if (this.root) expandRecursive(this.root, 0); +}; + +app.deleteNode = function(btn) { + const row = btn.closest('.node-row'); + row.classList.toggle('deleted'); +}; + +app.addChild = function(btn) { + const parentLi = btn.closest('li'); + let ul = parentLi.querySelector('ul'); + + if (!ul) { + ul = document.createElement('ul'); + parentLi.appendChild(ul); + } + ul.classList.remove('hidden'); + + const newLi = app.createNodeElement("New Page", null); + ul.prepend(newLi); + + this.updateFolderState(parentLi); + this.refreshDnD(); +}; + +app.newItem = function() { + const newLi = app.createNodeElement("New Root Item", null); + app.root.prepend(newLi); + app.refreshDnD(); +}; + +app.createNodeElement = function(title, href) { + const li = document.createElement('li'); + // FIX: Use JS ternary operator for optional href check, not Python 'if' + // And simple string concatenation for robustness + const displayStyle = href ? '' : 'style="display:none"'; + + li.innerHTML = ` +
+ + + 📄 +
${title}
+
+ + + + + +
+
+ `; + return li; +}; + +app.filter = function(term) { + term = term.toLowerCase(); + const items = document.querySelectorAll('li'); + + if (!term) { + items.forEach(li => li.classList.remove('hidden')); + return; + } + + items.forEach(li => li.classList.add('hidden')); + + document.querySelectorAll('.node-title').forEach(div => { + if (div.innerText.toLowerCase().includes(term)) { + let li = div.closest('li'); + li.classList.remove('hidden'); + let parent = li.parentElement.closest('li'); + while (parent) { + parent.classList.remove('hidden'); + const ul = parent.querySelector('ul'); + if (ul) ul.classList.remove('hidden'); + parent = parent.parentElement.closest('li'); + } + } + }); +}; + +app.generateMarkdown = function() { + let md = ""; + function walk(ul, level) { + Array.from(ul.children).forEach(li => { + const row = li.querySelector('.node-row'); + if (row.classList.contains('deleted')) return; + const titleDiv = row.querySelector('.node-title'); + const title = titleDiv.innerText.trim(); + const href = titleDiv.getAttribute('data-href'); + const indent = " ".repeat(level); + if (href) { + md += `${indent}- [${title}](${href})\\n`; + } else { + md += `${indent}- ${title}\\n`; + } + const childUl = li.querySelector('ul'); + if (childUl && childUl.children.length > 0) { + walk(childUl, level + 1); + } + }); + } + walk(this.root, 0); + const textarea = document.getElementById('md-textarea'); + textarea.value = md; + document.getElementById('overlay').style.display = 'block'; + document.getElementById('md-output-container').style.display = 'flex'; + textarea.select(); +}; + +app.copyToClipboard = function() { + const el = document.getElementById('md-textarea'); + el.select(); + document.execCommand('copy'); + const t = document.getElementById('toast'); + t.classList.add('show'); + setTimeout(() => t.classList.remove('show'), 2000); +}; + +app.closeModal = function() { + document.getElementById('overlay').style.display = 'none'; + document.getElementById('md-output-container').style.display = 'none'; +}; + +document.addEventListener('DOMContentLoaded', () => { + app.init(); +}); +""" + + +# --- Python Parser Logic --- + +class Node: + def __init__(self, title, href=None, level=0): + self.title = title + self.href = href + self.level = level + self.children = [] + + +def parse_markdown(md_content): + lines = md_content.splitlines() + root = Node("root", level=-1) + stack = [root] + node_count = 0 + + link_pattern = re.compile(r'\[(.*?)\]\((.*?)\)') + + for line in lines: + stripped = line.strip() + if not stripped or not stripped.startswith('-'): + continue + + raw_indent = line[:line.find('-')] + level = raw_indent.count('\t') + (raw_indent.count(' ') // 2) + content = stripped[1:].strip() + + match = link_pattern.search(content) + if match: + title = match.group(1) + raw_href = match.group(2) + filename = os.path.basename(unquote(urlparse(raw_href).path)) + href = f"pages/{filename}" + else: + title = content + href = None + + node = Node(title, href, level) + node_count += 1 + + while len(stack) > 1 and stack[-1].level >= level: + stack.pop() + + stack[-1].children.append(node) + stack.append(node) + + return root, node_count + + +def render_editor_html(node): + if not node.children: return "" + + ul_class = "hidden" if node.level >= 0 else "" + html_out = f"
    \n" + + for child in node.children: + has_children = len(child.children) > 0 + icon = "📂" if has_children else "📄" + arrow = "▶" if has_children else "▶" + toggle_class = "" if has_children else "leaf" + + safe_title = html.escape(child.title) + safe_href = html.escape(child.href) if child.href else "" + href_attr = f'data-href="{safe_href}"' if child.href else "" + + html_out += f'
  • \n' + html_out += f'
    \n' + html_out += f'{arrow}\n' + html_out += f'\n' + html_out += f'{icon}\n' + html_out += f'
    {safe_title}
    \n' + html_out += '
    \n' + + # Logic: Show link button only if href is present + # Since this is Python generating HTML string, we use Python 'if' + link_style = '' if child.href else 'style="display:none"' + html_out += f'\n' + + html_out += '\n' + html_out += '\n' + html_out += '\n' + html_out += '\n' + html_out += '
    \n' + html_out += '
    \n' # End row + + if has_children: + html_out += render_editor_html(child) + + html_out += "
  • \n" + + html_out += "
\n" + return html_out + + +def main(): + parser = argparse.ArgumentParser(description="Generate Sidebar Editor") + parser.add_argument('--site-dir', required=True, help="Directory containing sidebar.md") + args = parser.parse_args() + + site_dir = args.site_dir + source_path = os.path.join(site_dir, "sidebar.md") + edit_path = os.path.join(site_dir, "sidebar_edit.md") + out_html_path = os.path.join(site_dir, "editor_sidebar.html") + + md_to_parse = source_path + if os.path.exists(edit_path): + print(f"Found working copy: {edit_path}") + md_to_parse = edit_path + elif os.path.exists(source_path): + print(f"Creating working copy from: {source_path}") + shutil.copy(source_path, edit_path) + md_to_parse = edit_path + else: + print(f"Error: No sidebar.md found in {site_dir}") + sys.exit(1) + + with open(md_to_parse, 'r', encoding='utf-8') as f: + root, count = parse_markdown(f.read()) + + tree_html = render_editor_html(root) + + # Fix root ul: ensure it's visible and has ID + tree_html = tree_html.replace("