From d0bde467e0ba01ba16a06a1d011f1efdb8c048c0 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Tue, 4 Nov 2025 16:01:06 -0800 Subject: [PATCH 1/2] Lowercase heading IDs This switches from ASCII lowercase to Unicode lowercase when generating heading IDs. This brings mdbook more in line with other tools and sites when they generate heading IDs. The generation still isn't 100% the same as other tools and sites, but it produces the same IDs in most cases. Closes https://github.com/rust-lang/mdBook/issues/1059 --- CHANGELOG.md | 7 +++++-- crates/mdbook-html/src/utils.rs | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 856201265c..8db03518b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,8 +58,11 @@ The following is a summary of the changes that may require your attention when u [#2847](https://github.com/rust-lang/mdBook/pull/2847) - Added support for admonitions. These are enabled by default, with the option `output.html.admonitions` to disable it. [#2851](https://github.com/rust-lang/mdBook/pull/2851) -- Headers that start or end with HTML characters like `<`, `&`, or `>` now replace those characters in the link ID with `-` instead of being stripped. This brings the header ID generation closer to other tools and sites. - [#2844](https://github.com/rust-lang/mdBook/pull/2844) +- Header ID generation has some minor changes to bring the ID generation closer to other tools and sites: + - IDs now use Unicode lowercase instead of ASCII lowercase. + [#2922](https://github.com/rust-lang/mdBook/pull/2922) + - Headers that start or end with HTML characters like `<`, `&`, or `>` now replace those characters in the link ID with `-` instead of being stripped. 
+ [#2844](https://github.com/rust-lang/mdBook/pull/2844) ### CLI changes diff --git a/crates/mdbook-html/src/utils.rs b/crates/mdbook-html/src/utils.rs index 6c17b8d5a3..b5bc2d7c18 100644 --- a/crates/mdbook-html/src/utils.rs +++ b/crates/mdbook-html/src/utils.rs @@ -76,10 +76,11 @@ pub(crate) fn unique_id(id: &str, used: &mut HashSet<String>) -> String { pub(crate) fn id_from_content(content: &str) -> String { content .trim() + .to_lowercase() .chars() .filter_map(|ch| { if ch.is_alphanumeric() || ch == '_' || ch == '-' { - Some(ch.to_ascii_lowercase()) + Some(ch) } else if ch.is_whitespace() { Some('-') } else { @@ -120,6 +121,6 @@ mod tests { assert_eq!(id_from_content("한국어"), "한국어"); assert_eq!(id_from_content(""), ""); assert_eq!(id_from_content("中文標題 CJK title"), "中文標題-cjk-title"); - assert_eq!(id_from_content("Über"), "Über"); + assert_eq!(id_from_content("Über"), "über"); } } From 051fc9f01db3ae776c1d8e251d0321961a967b72 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Tue, 4 Nov 2025 16:04:14 -0800 Subject: [PATCH 2/2] Add a comment about the intent of HTML id generation --- crates/mdbook-html/src/utils.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/mdbook-html/src/utils.rs b/crates/mdbook-html/src/utils.rs index b5bc2d7c18..68f42a4094 100644 --- a/crates/mdbook-html/src/utils.rs +++ b/crates/mdbook-html/src/utils.rs @@ -74,6 +74,15 @@ pub(crate) fn unique_id(id: &str, used: &mut HashSet<String>) -> String { /// Generates an HTML id from the given text. pub(crate) fn id_from_content(content: &str) -> String { + // This is intended to be close to how header ID generation is done in + // other sites and tools, but is not 100% the same. Not all sites and + // tools use the same algorithm. 
See these for more information: + // + // - https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#section-links + // - https://docs.gitlab.com/user/markdown/#heading-ids-and-links + // - https://pandoc.org/MANUAL.html#extension-auto_identifiers + // - https://kramdown.gettalong.org/converter/html#auto-ids + // - https://docs.rs/comrak/latest/comrak/options/struct.Extension.html#structfield.header_ids content .trim() .to_lowercase()