Fix multi-byte Unicode handling in files (#370)

krassowski · web-flow · commit 3900efdeefef · 2025-12-10T16:05:59.000Z
* Add tests for multi-byte Unicode handling

* Fix the Unicode handling issue
diff --git a/jupyter_ydoc/yunicode.py b/jupyter_ydoc/yunicode.py
@@ -74,30 +74,66 @@ def set(self, value: str) -> None:
             # to avoid side-effects such as cursor jumping to the top
             return
 
+        before_bytes = old_value.encode("utf-8")
+        after_bytes = value.encode("utf-8")
+
         with self._ydoc.transaction():
-            matcher = SequenceMatcher(a=old_value, b=value)
+            matcher = SequenceMatcher(a=before_bytes, b=after_bytes)
 
             if (
                 matcher.real_quick_ratio() >= SIMILARITY_THREESHOLD
                 and matcher.ratio() >= SIMILARITY_THREESHOLD
             ):
                 operations = matcher.get_opcodes()
-                offset = 0
+
+                # Fix byte ranges and check for problematic overlaps
+                fixed_operations = []
+                prev_end = 0
+                prev_tag = None
+                has_overlap = False
+
                 for tag, i1, i2, j1, j2 in operations:
-                    match tag:
-                        case "replace":
-                            self._ysource[i1 + offset : i2 + offset] = value[j1:j2]
-                            offset += (j2 - j1) - (i2 - i1)
-                        case "delete":
-                            del self._ysource[i1 + offset : i2 + offset]
-                            offset -= i2 - i1
-                        case "insert":
-                            self._ysource.insert(i1 + offset, value[j1:j2])
-                            offset += j2 - j1
-                        case "equal":
-                            pass
-                        case _:
-                            raise ValueError(f"Unknown tag '{tag}' in sequence matcher")
+                    # Fix byte ranges to proper UTF-8 character boundaries
+                    i1_fixed, i2_fixed = _fix_byte_range_to_char_boundary(before_bytes, i1, i2)
+                    j1_fixed, j2_fixed = _fix_byte_range_to_char_boundary(after_bytes, j1, j2)
+
+                    # Check whether the widened range overlaps the previous non-"equal"
+                    # operation, which can happen with grapheme clusters (emoji + modifiers, etc.)
+                    if i1_fixed < prev_end and prev_tag != "equal":
+                        has_overlap = True
+                        break
+
+                    prev_end = i2_fixed
+                    prev_tag = tag
+                    fixed_operations.append((tag, i1_fixed, i2_fixed, j1_fixed, j2_fixed))
+
+                # If we detected overlapping operations, fall back to hard reload
+                if has_overlap:
+                    self._ysource.clear()
+                    if value:
+                        self._ysource += value
+                else:
+                    # Apply granular operations
+                    offset = 0
+                    for tag, i1, i2, j1, j2 in fixed_operations:
+                        match tag:
+                            case "replace":
+                                self._ysource[i1 + offset : i2 + offset] = after_bytes[
+                                    j1:j2
+                                ].decode("utf-8")
+                                offset += (j2 - j1) - (i2 - i1)
+                            case "delete":
+                                del self._ysource[i1 + offset : i2 + offset]
+                                offset -= i2 - i1
+                            case "insert":
+                                self._ysource.insert(
+                                    i1 + offset, after_bytes[j1:j2].decode("utf-8")
+                                )
+                                offset += j2 - j1
+                            case "equal":
+                                pass
+                            case _:
+                                raise ValueError(f"Unknown tag '{tag}' in sequence matcher")
             else:
                 # for very different strings, just replace the whole content;
                 # this avoids generating a huge number of operations
@@ -118,3 +154,28 @@ def observe(self, callback: Callable[[str, Any], None]) -> None:
         self.unobserve()
         self._subscriptions[self._ystate] = self._ystate.observe(partial(callback, "state"))
         self._subscriptions[self._ysource] = self._ysource.observe(partial(callback, "source"))
+
+
+def _is_utf8_continuation_byte(byte: int) -> bool:
+    """Check if a byte is a UTF-8 continuation byte (10xxxxxx)."""
+    return (byte & 0xC0) == 0x80
+
+
+def _fix_byte_range_to_char_boundary(data: bytes, start: int, end: int) -> tuple[int, int]:
+    """
+    Widen a byte range so that both ends fall on UTF-8 character boundaries.
+
+    :param data: The byte data.
+    :param start: The start byte index.
+    :param end: The end byte index.
+    :return: A tuple of (adjusted_start, adjusted_end).
+    """
+    # Move start backward to the beginning of a UTF-8 character
+    while start > 0 and start < len(data) and _is_utf8_continuation_byte(data[start]):
+        start -= 1
+
+    # Move end forward to the end of a UTF-8 character
+    while end < len(data) and _is_utf8_continuation_byte(data[end]):
+        end += 1
+
+    return start, end
diff --git a/tests/test_yunicode.py b/tests/test_yunicode.py
@@ -1,6 +1,7 @@
 # Copyright (c) Jupyter Development Team.
 # Distributed under the terms of the Modified BSD License.
 
+import pytest
 from pycrdt import TextEvent
 from utils import ExpectedEvent
 
@@ -188,3 +189,125 @@ def record_changes(topic, event):
     assert source_events == [
         ExpectedEvent(TextEvent, delta=[{"delete": 109}, {"insert": twinkle_lyrics}])
     ]
+
+
+@pytest.mark.parametrize(
+    "initial, updated, granular",
+    [
+        (
+            # emojis swapped
+            "I like security 🎨 but I really love painting 🔒",
+            "I like security 🔒 but I really love painting 🎨",
+            True,
+        ),
+        (
+            # text changes, emojis stay in place
+            "Here is a rocket: ⭐ and a star: 🚀",
+            "Here is a star: ⭐ and a rocket: 🚀",
+            True,
+        ),
+        (
+            # change of text and emojis
+            "Here are some happy faces: 😀😁😂",
+            "Here are some sad faces: 😞😢😭",
+            True,
+        ),
+        (
+            # change of characters with combining marks
+            "Combining characters: á é í ó ú",
+            "Combining characters: ú ó í é á",
+            True,
+        ),
+        (
+            "Flags: 🇺🇸🇬🇧🇨🇦",
+            "Flags: 🇨🇦🇬🇧🇺🇸",
+            True,
+        ),
+        (
+            # Emoji with skin tone modifiers
+            "Waving hands: 👋👋🏻👋🏿",
+            "Waving hands: 👋🏿👋🏻👋",
+            False,  # overlapping grapheme clusters, should trigger hard reload
+        ),
+        (
+            # Zero-width joiner sequences (family emoji)
+            "A family 👨‍👩‍👧‍👦 (with two children)",
+            "A family 👨‍👩‍👧 (with one child)",
+            True,
+        ),
+        (
+            # Mixed RTL/LTR text
+            "Hello שלום world",
+            "Hello עולם world",
+            True,
+        ),
+        (
+            # Zero-width characters
+            "Word​break vs Word​​break",  # zero-width space
+            "Word​​break vs Word​break",
+            True,
+        ),
+        (
+            # Keycap sequences
+            "Numbers: 1️⃣2️⃣3️⃣",
+            "Numbers: 3️⃣2️⃣1️⃣",
+            True,
+        ),
+        (
+            # Mixed emoji presentation styles
+            "Text style: ☺︎ vs emoji style: ☺️",
+            "Text style: ☹︎ vs emoji style: ☹️",
+            True,
+        ),
+        (
+            # NFD vs NFC normalization (é can be one or two codepoints)
+            "Café" + "\u0301",  # e + combining acute accent at the end
+            "Caff" + "\u0301",
+            True,
+        ),
+        (
+            # Emoji at boundaries
+            "👋 middle text 🎉",
+            "🎉 middle text 👋",
+            True,
+        ),
+        (
+            # Consecutive emoji with different byte lengths
+            "Grinning face + a flag 😀🏴",
+            "Grinning face + a flag 🏴😀",
+            False,  # overlapping grapheme clusters, should trigger hard reload
+        ),
+        (
+            # Japanese characters
+            "こんにちは世界",
+            "こんにちは地球",
+            True,
+        ),
+        (
+            # Julia math operators
+            "x ∈ [1, 2, 3] && y ≥ 0",
+            "x ∉ [1, 2, 3] || y ≤ 0",
+            True,
+        ),
+    ],
+)
+def test_multibyte_unicode(initial, updated, granular):
+    text = YUnicode()
+    text.set(initial)
+
+    changes = []
+
+    def record_changes(topic, event):
+        changes.append((topic, event))  # pragma: nocover
+
+    text.observe(record_changes)
+
+    text.set(updated)
+
+    assert len(changes) == 1
+    source_events = [e for t, e in changes if t == "source"]
+
+    expected_min_delta_length = 3 if granular else 2
+
+    assert len(source_events[0].delta) >= expected_min_delta_length
+    assert text.get() == updated