From 4417f8cb0a1ff65a27a495987ff8f770b350dbcf Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Wed, 5 Nov 2025 10:45:46 -0800 Subject: [PATCH 1/6] Add a test for unclosed HTML tags --- tests/testsuite/rendering.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/testsuite/rendering.rs b/tests/testsuite/rendering.rs index 7f2fffa92a..961710ede5 100644 --- a/tests/testsuite/rendering.rs +++ b/tests/testsuite/rendering.rs @@ -239,3 +239,25 @@ fn html_blocks() { fn code_block_fenced_with_indent() { BookTest::from_dir("rendering/code_blocks_fenced_with_indent").check_all_main_files(); } + +// Unclosed HTML tags. +// +// Note that the HTML parsing algorithm is much more complicated than what +// this is checking. +#[test] +fn unclosed_html_tags() { + BookTest::init(|_| {}) + .change_file("src/chapter_1.md", "
xfooxyz") + .run("build", |cmd| { + cmd.expect_stderr(str![[r#" + INFO Book building has started + INFO Running the html backend + INFO HTML book written to `[ROOT]/book` + +"#]]); + }) + .check_main_file( + "book/chapter_1.html", + str!["
xfooxyz
"], + ); +} From 1e190137c3bcca27ceebd95d6737696dbb875d8c Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Wed, 5 Nov 2025 11:14:39 -0800 Subject: [PATCH 2/6] Add a check for unclosed elements on the stack This checks for any unclosed elements when processing is finished. This is intended to detect invalid HTML in the source, or bugs in the tree builder. Raw HTML elements generate a warning (which in the future will be a configurable lint). All other sync errors are internal errors as they are not expected, and it would be helpful to know if they ever happen. --- crates/mdbook-html/src/html/tree.rs | 35 +++++++++++++++++++++++++++++ tests/testsuite/rendering.rs | 3 +++ 2 files changed, 38 insertions(+) diff --git a/crates/mdbook-html/src/html/tree.rs b/crates/mdbook-html/src/html/tree.rs index 4184740d09..383c75b4b0 100644 --- a/crates/mdbook-html/src/html/tree.rs +++ b/crates/mdbook-html/src/html/tree.rs @@ -378,6 +378,7 @@ where } } } + self.finish_stack(); self.collect_footnote_defs(); } @@ -736,6 +737,40 @@ where output } + /// Deals with any unclosed elements on the stack. + fn finish_stack(&mut self) { + while let Some(node_id) = self.tag_stack.pop() { + let node = self.tree.get(node_id).unwrap().value(); + match node { + Node::Fragment => {} + Node::Element(el) => { + if el.was_raw { + warn!( + "unclosed HTML tag `<{}>` found in `{}`", + el.name.local, + self.options.path.display() + ); + } else { + panic!( + "internal error: expected empty tag stack.\n + path: `{}`\n\ + element={el:?}", + self.options.path.display() + ); + } + } + node => { + panic!( + "internal error: expected empty tag stack.\n + path: `{}`\n\ + node={node:?}", + self.options.path.display() + ); + } + } + } + } + /// Appends a new footnote reference. fn footnote_reference(&mut self, name: CowStr<'event>) { let len = self.footnote_numbers.len() + 1; diff --git a/tests/testsuite/rendering.rs b/tests/testsuite/rendering.rs index 961710ede5..f77f08401a 100644 --- a/tests/testsuite/rendering.rs +++ b/tests/testsuite/rendering.rs @@ -252,6 +252,9 @@ fn unclosed_html_tags() { cmd.expect_stderr(str![[r#" INFO Book building has started INFO Running the html backend + WARN unclosed HTML tag `` found in `chapter_1.md` + WARN unclosed HTML tag `` found in `chapter_1.md` + WARN unclosed HTML tag `
` found in `chapter_1.md` INFO HTML book written to `[ROOT]/book` "#]]); From 1646e4923ae7798538077fb7d62847ca44b442c2 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Wed, 5 Nov 2025 11:17:48 -0800 Subject: [PATCH 3/6] Add a test for HTML tags out of sync --- tests/testsuite/rendering.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/testsuite/rendering.rs b/tests/testsuite/rendering.rs index f77f08401a..13c157cf01 100644 --- a/tests/testsuite/rendering.rs +++ b/tests/testsuite/rendering.rs @@ -264,3 +264,28 @@ fn unclosed_html_tags() { str!["
xfooxyz
"], ); } + +// Test for HTML tags out of sync. +#[test] +fn unbalanced_html_tags() { + BookTest::init(|_| {}) + .change_file("src/chapter_1.md", "
xfoo
") + .run("build", |cmd| { + cmd.expect_stderr(str![[r#" + INFO Book building has started + INFO Running the html backend +ERROR internal error: HTML tag stack out of sync. + + path: `chapter_1.md` +current=Element(Element { name: QualName { prefix: None, ns: Atom('http://www.w3.org/1999/xhtml' type=static), local: Atom('span' type=inline) }, attrs: {}, self_closing: false, was_raw: true }) +pop name: div + WARN unclosed HTML tag `
` found in `chapter_1.md` + INFO HTML book written to `[ROOT]/book` + +"#]]); + }) + .check_main_file( + "book/chapter_1.html", + str!["
xfoo
"], + ); +} From 5905bf1d853d8a61780fe44f98308abae949ae2e Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Wed, 5 Nov 2025 11:33:58 -0800 Subject: [PATCH 4/6] Factor out Token::TagToken to combat rightwards drift --- crates/mdbook-html/src/html/tree.rs | 72 +++++++++++++++-------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/crates/mdbook-html/src/html/tree.rs b/crates/mdbook-html/src/html/tree.rs index 383c75b4b0..b732ac441b 100644 --- a/crates/mdbook-html/src/html/tree.rs +++ b/crates/mdbook-html/src/html/tree.rs @@ -607,40 +607,10 @@ where trace!("html token={token:?}"); match token { Token::DoctypeToken(_) => {} - Token::TagToken(tag) => { - match tag.kind { - TagKind::StartTag => { - let is_closed = is_void_element(&tag.name) || tag.self_closing; - is_raw = matches!(&*tag.name, "script" | "style"); - let name = QualName::new(None, html5ever::ns!(html), tag.name); - let attrs = tag - .attrs - .into_iter() - .map(|attr| (attr.name, attr.value)) - .collect(); - let mut el = Element { - name, - attrs, - self_closing: tag.self_closing, - was_raw: true, - }; - fix_html_link(&mut el); - self.push(Node::Element(el)); - if is_closed { - // No end element. - self.pop(); - } - } - TagKind::EndTag => { - is_raw = false; - if self.is_html_tag_matching(&tag.name) { - self.pop(); - } - // else the stack is corrupt. I'm not really sure - // what to do here... - } - } - } + Token::TagToken(tag) => match tag.kind { + TagKind::StartTag => self.start_html_tag(tag, &mut is_raw), + TagKind::EndTag => self.end_html_tag(tag, &mut is_raw), + }, Token::CommentToken(comment) => { self.append(Node::Comment(comment)); } @@ -665,6 +635,40 @@ where } } + /// Adds an open HTML tag. + fn start_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) { + let is_closed = is_void_element(&tag.name) || tag.self_closing; + *is_raw = matches!(&*tag.name, "script" | "style"); + let name = QualName::new(None, html5ever::ns!(html), tag.name); + let attrs = tag + .attrs + .into_iter() + .map(|attr| (attr.name, attr.value)) + .collect(); + let mut el = Element { + name, + attrs, + self_closing: tag.self_closing, + was_raw: true, + }; + fix_html_link(&mut el); + self.push(Node::Element(el)); + if is_closed { + // No end element. + self.pop(); + } + } + + /// Closes the given HTML tag. + fn end_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) { + *is_raw = false; + if self.is_html_tag_matching(&tag.name) { + self.pop(); + } + // else the stack is corrupt. I'm not really sure + // what to do here... + } + /// This is used to verify HTML parsing keeps the stack of tags in sync. fn is_html_tag_matching(&self, name: &str) -> bool { let current = self.tree.get(self.current_node).unwrap().value(); From 22065ebc79efbf5b0d6ffe36673139aafcd90db6 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Wed, 5 Nov 2025 11:42:43 -0800 Subject: [PATCH 5/6] Give a warning for unclosed HTML tags This changes the internal error message to a warning to let the user know that the HTML tags are unbalanced. In the future this will be a denyable lint. This is a very primitive approach of just ignoring the end tag. Ideally it should recover using the standard HTML parsing algorithm, since there is a chance that there will be a cascade of errors under certain unbalanced situations. --- crates/mdbook-html/src/html/tree.rs | 27 +++++++++++++++------------ tests/testsuite/rendering.rs | 12 +++--------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/crates/mdbook-html/src/html/tree.rs b/crates/mdbook-html/src/html/tree.rs index b732ac441b..4e1d8f3539 100644 --- a/crates/mdbook-html/src/html/tree.rs +++ b/crates/mdbook-html/src/html/tree.rs @@ -19,7 +19,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, CowStr, Event, LinkType, Tag, Tag use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::ops::Deref; -use tracing::{error, trace, warn}; +use tracing::{trace, warn}; /// Helper to create a [`QualName`]. macro_rules! attr_qual_name { @@ -664,9 +664,18 @@ where *is_raw = false; if self.is_html_tag_matching(&tag.name) { self.pop(); + } else { + // The proper thing to do here is to recover. However, the HTML + // parsing algorithm for that is quite complex. See + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + // and the adoption agency algorithm. + warn!( + "unexpected HTML end tag `` found in `{}`\n\ + Check that the HTML tags are properly balanced.", + tag.name, + self.options.path.display() + ); } - // else the stack is corrupt. I'm not really sure - // what to do here... } /// This is used to verify HTML parsing keeps the stack of tags in sync. @@ -675,16 +684,10 @@ where if let Node::Element(el) = current && el.name() == name { - return true; + true + } else { + false } - error!( - "internal error: HTML tag stack out of sync.\n - path: `{}`\n\ - current={current:?}\n\ - pop name: {name}", - self.options.path.display() - ); - false } /// Eats all pulldown-cmark events until the next `End` matching the diff --git a/tests/testsuite/rendering.rs b/tests/testsuite/rendering.rs index 13c157cf01..e0e770398d 100644 --- a/tests/testsuite/rendering.rs +++ b/tests/testsuite/rendering.rs @@ -274,18 +274,12 @@ fn unbalanced_html_tags() { cmd.expect_stderr(str![[r#" INFO Book building has started INFO Running the html backend -ERROR internal error: HTML tag stack out of sync. - - path: `chapter_1.md` -current=Element(Element { name: QualName { prefix: None, ns: Atom('http://www.w3.org/1999/xhtml' type=static), local: Atom('span' type=inline) }, attrs: {}, self_closing: false, was_raw: true }) -pop name: div + WARN unexpected HTML end tag `
` found in `chapter_1.md` +Check that the HTML tags are properly balanced. WARN unclosed HTML tag `
` found in `chapter_1.md` INFO HTML book written to `[ROOT]/book` "#]]); }) - .check_main_file( - "book/chapter_1.html", - str!["
xfoo
"], - ); + .check_main_file("book/chapter_1.html", str!["
xfoo
"]); } From f0117ec3dfcafbe554d020e38ebbe2005c35cdb4 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Wed, 5 Nov 2025 11:45:46 -0800 Subject: [PATCH 6/6] Add a comment about synchronizing the event stack --- crates/mdbook-html/src/html/tree.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/mdbook-html/src/html/tree.rs b/crates/mdbook-html/src/html/tree.rs index 4e1d8f3539..432cf7bea0 100644 --- a/crates/mdbook-html/src/html/tree.rs +++ b/crates/mdbook-html/src/html/tree.rs @@ -307,6 +307,8 @@ where match event { Event::Start(tag) => self.start_tag(tag), Event::End(tag) => { + // TODO: This should validate that the event stack is + // properly synchronized with the tag stack. self.pop(); match tag { TagEnd::TableHead => {