diff --git a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
index b492289a..6cddeaff 100644
--- a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
+++ b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
@@ -21,6 +21,67 @@ open MarkdownUtils
 let internal htmlEncode (code: string) =
     code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
 
+/// Encodes emoji and symbol characters as decimal HTML numeric character references.
+///
+/// Two groups of characters are encoded:
+///  - every well-formed surrogate pair, i.e. every code point at or above U+10000;
+///  - BMP code points in U+2000-U+2BFF (general punctuation, arrows, dingbats and other symbols).
+/// Everything else, lone surrogates included, is copied through unchanged, so ordinary
+/// international text (Cyrillic, CJK, ...) is preserved. When no character needs encoding
+/// (which covers null and empty input) the original string instance is returned.
+let internal encodeHighUnicode (text: string) =
+    if String.IsNullOrEmpty text then
+        text
+    else
+        // The builder is created lazily, on the first character that needs encoding.
+        let mutable sb: System.Text.StringBuilder voption = ValueNone
+        let mutable i = 0
+
+        while i < text.Length do
+            let c = text.[i]
+
+            let needsEncoding, codePoint, skipNext =
+                if
+                    Char.IsHighSurrogate c
+                    && i + 1 < text.Length
+                    && Char.IsLowSurrogate text.[i + 1]
+                then
+                    // Well-formed surrogate pair: always encoded, and it consumes two UTF-16 units.
+                    true, Char.ConvertToUtf32(c, text.[i + 1]), true
+                else
+                    let codePoint = int c
+                    // U+2000-U+2BFF: General Punctuation through Miscellaneous Symbols and Arrows.
+                    (codePoint >= 0x2000 && codePoint <= 0x2BFF), codePoint, false
+
+            if needsEncoding then
+                let builder =
+                    match sb with
+                    | ValueSome existing -> existing
+                    | ValueNone ->
+                        // First encoded character: copy the untouched prefix text.[0 .. i - 1].
+                        let created = System.Text.StringBuilder(text.Length + 16)
+
+                        if i > 0 then
+                            created.Append(text, 0, i) |> ignore
+
+                        sb <- ValueSome created
+                        created
+
+                // Appended piecewise rather than via sprintf to avoid an intermediate string.
+                builder.Append("&#").Append(codePoint).Append(';') |> ignore
+            else
+                // Before the first encoded character nothing is copied; the prefix is added lazily.
+                match sb with
+                | ValueSome builder -> builder.Append c |> ignore
+                | ValueNone -> ()
+
+            i <- i + (if skipNext then 2 else 1)
+
+        // No builder means no character needed encoding: hand back the input itself.
+        match sb with
+        | ValueNone -> text
+        | ValueSome builder -> builder.ToString()
+
 /// Basic escaping as done by Markdown including quotes
 let internal htmlEncodeQuotes (code: string) =
     (htmlEncode code).Replace("\"", "&quot;")
@@ -78,7 +139,7 @@ let rec internal formatSpan (ctx: FormattingContext) span =
     | AnchorLink(id, _) -> ctx.Writer.Write(" ")
     | EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render())
-    | Literal(str, _) -> ctx.Writer.Write(str)
+    | Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str)
     | HardLineBreak(_) -> ctx.Writer.Write("<br />" + ctx.Newline)
     | IndirectLink(body, _, LookupKey ctx.Links (link, title), _)
     | DirectLink(body, link, title, _) ->
diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs
index c38eedf3..ecf82fd9 100644
--- a/tests/FSharp.Markdown.Tests/Markdown.fs
+++ b/tests/FSharp.Markdown.Tests/Markdown.fs
@@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () =
     |> Markdown.ToHtml
     |> should contain "

a &gt; & b

"
 
+[<Test>]
+let ``Emojis are encoded as HTML numeric entities`` () =
+    let html = "Like this 🎉🚧⭐⚠️✅" |> Markdown.ToHtml
+    html |> should contain "&#127881;" // 🎉 U+1F389 party popper (surrogate pair)
+    html |> should contain "&#128679;" // 🚧 U+1F6A7 construction (surrogate pair)
+    html |> should contain "&#11088;" // ⭐ U+2B50 star (BMP, inside U+2000-U+2BFF)
+    html |> should contain "&#9888;" // ⚠️ U+26A0 warning (the trailing U+FE0F selector is not encoded)
+    html |> should contain "&#9989;" // ✅ U+2705 check mark (BMP, inside U+2000-U+2BFF)
+
+[<Test>]
+let ``Regular text without emojis is not modified`` () =
+    // Cyrillic and CJK lie outside the encoded ranges, so this text must pass through unchanged
+    let html = "This is regular text with пристаням Cyrillic and 中文 Chinese" |> Markdown.ToHtml
+    html |> should contain "пристаням"
+    html |> should contain "中文"
+    html |> should not' (contain "&#") // No HTML entities for regular international text
+
+[<Test>]
+let ``List without blank line after heading`` () =
+    // Test the issue mentioned in comment: https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382
+    let markdown =
+        """# This is my title
+- this list
+- should render"""
+
+    let html = Markdown.ToHtml markdown
+    // Check if list is rendered as a separate element, not part of heading
+    html |> should contain "<h1>This is my title</h1>"
+    html |> should contain "<ul>"
+    html |> should contain "<li>this list</li>"
+    html |> should contain "<li>should render</li>"
+
 [<Test>]
 let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () =
     let doc = """hi""" |> Markdown.Parse