diff --git a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
index b492289a..6cddeaff 100644
--- a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
+++ b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
@@ -21,6 +21,68 @@ open MarkdownUtils
let internal htmlEncode (code: string) =
    // '&' must be replaced first; otherwise the ampersands introduced by the
    // '<' and '>' replacements would themselves be escaped a second time.
    code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
+/// Encode emojis and symbol characters as HTML numeric character references (`&#NNNN;`, decimal).
+///
+/// Two classes of characters are rewritten:
+///  - every valid surrogate pair, i.e. every code point at U+10000 and above (where most emojis live);
+///  - single UTF-16 code units in the range U+2000..U+2BFF (punctuation, arrows, dingbats and similar symbols).
+/// Everything else, including unpaired surrogates, is copied through unchanged, so ordinary
+/// international text inside the Basic Multilingual Plane is preserved as-is.
+///
+/// Returns the input string itself when it is null or empty, or when no character needed encoding.
+let internal encodeHighUnicode (text: string) =
+    if String.IsNullOrEmpty text then
+        text
+    else
+        // The builder is created lazily so that text without any encodable character
+        // is returned without allocating.
+        let mutable sb: System.Text.StringBuilder voption = ValueNone
+        let mutable i = 0
+
+        while i < text.Length do
+            let c = text.[i]
+
+            // width = number of UTF-16 code units consumed by this iteration.
+            let needsEncoding, codePoint, width =
+                if
+                    Char.IsHighSurrogate c
+                    && i + 1 < text.Length
+                    && Char.IsLowSurrogate text.[i + 1]
+                then
+                    // Valid surrogate pair: always encoded, and both code units are consumed.
+                    true, Char.ConvertToUtf32(c, text.[i + 1]), 2
+                else
+                    let codePoint = int c
+                    // Within the BMP only the symbol range U+2000..U+2BFF is encoded.
+                    (codePoint >= 0x2000 && codePoint <= 0x2BFF), codePoint, 1
+
+            if needsEncoding then
+                let builder =
+                    match sb with
+                    | ValueSome existing -> existing
+                    | ValueNone ->
+                        // First character that needs encoding: start the builder with
+                        // the untouched prefix that was skipped so far.
+                        let created = System.Text.StringBuilder(text.Length + 16)
+
+                        if i > 0 then
+                            created.Append(text, 0, i) |> ignore
+
+                        sb <- ValueSome created
+                        created
+
+                // Numeric character reference "&#<decimal code point>;", appended piecewise
+                // to avoid building an intermediate string.
+                builder.Append("&#").Append(codePoint).Append(';') |> ignore
+            else
+                // Plain characters only need copying once a builder exists.
+                match sb with
+                | ValueSome builder -> builder.Append c |> ignore
+                | ValueNone -> ()
+
+            i <- i + width
+
+        // No builder means nothing was encoded: hand back the original string.
+        match sb with
+        | ValueNone -> text
+        | ValueSome builder -> builder.ToString()
+
/// Basic escaping as done by Markdown including quotes
let internal htmlEncodeQuotes (code: string) =
    (htmlEncode code).Replace("\"", "&quot;")
@@ -78,7 +140,7 @@ let rec internal formatSpan (ctx: FormattingContext) span =
| AnchorLink(id, _) -> ctx.Writer.Write(" ")
| EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render())
- | Literal(str, _) -> ctx.Writer.Write(str)
+ | Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str)
| HardLineBreak(_) -> ctx.Writer.Write("
" + ctx.Newline)
| IndirectLink(body, _, LookupKey ctx.Links (link, title), _)
| DirectLink(body, link, title, _) ->
diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs
index c38eedf3..ecf82fd9 100644
--- a/tests/FSharp.Markdown.Tests/Markdown.fs
+++ b/tests/FSharp.Markdown.Tests/Markdown.fs
@@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () =
|> Markdown.ToHtml
|> should contain "
a > & b