Skip to content

Commit 3900efd

Browse files
authored
Fix multi-byte Unicode handling in files (#370)
* Add tests for multi-byte Unicode handling * Fix the Unicode handling issue
1 parent 3ceafd5 commit 3900efd

File tree

2 files changed

+200
-16
lines changed

2 files changed

+200
-16
lines changed

jupyter_ydoc/yunicode.py

Lines changed: 77 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,30 +74,66 @@ def set(self, value: str) -> None:
7474
# to avoid side-effects such as cursor jumping to the top
7575
return
7676

77+
before_bytes = old_value.encode("utf-8")
78+
after_bytes = value.encode("utf-8")
79+
7780
with self._ydoc.transaction():
78-
matcher = SequenceMatcher(a=old_value, b=value)
81+
matcher = SequenceMatcher(a=before_bytes, b=after_bytes)
7982

8083
if (
8184
matcher.real_quick_ratio() >= SIMILARITY_THREESHOLD
8285
and matcher.ratio() >= SIMILARITY_THREESHOLD
8386
):
8487
operations = matcher.get_opcodes()
85-
offset = 0
88+
89+
# Fix byte ranges and check for problematic overlaps
90+
fixed_operations = []
91+
prev_end = 0
92+
prev_tag = None
93+
has_overlap = False
94+
8695
for tag, i1, i2, j1, j2 in operations:
87-
match tag:
88-
case "replace":
89-
self._ysource[i1 + offset : i2 + offset] = value[j1:j2]
90-
offset += (j2 - j1) - (i2 - i1)
91-
case "delete":
92-
del self._ysource[i1 + offset : i2 + offset]
93-
offset -= i2 - i1
94-
case "insert":
95-
self._ysource.insert(i1 + offset, value[j1:j2])
96-
offset += j2 - j1
97-
case "equal":
98-
pass
99-
case _:
100-
raise ValueError(f"Unknown tag '{tag}' in sequence matcher")
96+
# Fix byte ranges to proper UTF-8 character boundaries
97+
i1_fixed, i2_fixed = _fix_byte_range_to_char_boundary(before_bytes, i1, i2)
98+
j1_fixed, j2_fixed = _fix_byte_range_to_char_boundary(after_bytes, j1, j2)
99+
100+
# Check if this operation overlaps with the previous one
101+
# which can happen with grapheme clusters (emoji + modifiers, etc.)
102+
if i1_fixed < prev_end and prev_tag != "equal":
103+
has_overlap = True
104+
break
105+
106+
prev_end = i2_fixed
107+
prev_tag = tag
108+
fixed_operations.append((tag, i1_fixed, i2_fixed, j1_fixed, j2_fixed))
109+
110+
# If we detected overlapping operations, fall back to hard reload
111+
if has_overlap:
112+
self._ysource.clear()
113+
if value:
114+
self._ysource += value
115+
else:
116+
# Apply granular operations
117+
offset = 0
118+
for tag, i1, i2, j1, j2 in fixed_operations:
119+
match tag:
120+
case "replace":
121+
self._ysource[i1 + offset : i2 + offset] = after_bytes[
122+
j1:j2
123+
].decode("utf-8")
124+
offset += (j2 - j1) - (i2 - i1)
125+
case "delete":
126+
del self._ysource[i1 + offset : i2 + offset]
127+
offset -= i2 - i1
128+
case "insert":
129+
self._ysource.insert(
130+
i1 + offset, after_bytes[j1:j2].decode("utf-8")
131+
)
132+
offset += j2 - j1
133+
case "equal":
134+
pass
135+
case _:
136+
raise ValueError(f"Unknown tag '{tag}' in sequence matcher")
101137
else:
102138
# for very different strings, just replace the whole content;
103139
# this avoids generating a huge number of operations
@@ -118,3 +154,28 @@ def observe(self, callback: Callable[[str, Any], None]) -> None:
118154
self.unobserve()
119155
self._subscriptions[self._ystate] = self._ystate.observe(partial(callback, "state"))
120156
self._subscriptions[self._ysource] = self._ysource.observe(partial(callback, "source"))
157+
158+
159+
def _is_utf8_continuation_byte(byte: int) -> bool:
160+
"""Check if a byte is a UTF-8 continuation byte (10xxxxxx)."""
161+
return (byte & 0xC0) == 0x80
162+
163+
164+
def _fix_byte_range_to_char_boundary(data: bytes, start: int, end: int) -> tuple[int, int]:
    """
    Widen a byte range so neither end points at a UTF-8 continuation byte.

    The range is only ever widened: ``start`` can move left and ``end`` can
    move right, never the other way round.

    :param data: The byte data the indices refer to.
    :param start: The start byte index of the range.
    :param end: The end byte index of the range.
    :return: A tuple of (adjusted_start, adjusted_end).
    """
    size = len(data)

    # Step the start index left while it points at a continuation byte.
    # An index of 0, or one at/after the end of the data, is left as is.
    while 0 < start < size and _is_utf8_continuation_byte(data[start]):
        start -= 1

    # Step the end index right while the byte it points at is a
    # continuation byte, stopping once the end of the data is reached.
    while end < size and _is_utf8_continuation_byte(data[end]):
        end += 1

    return start, end

tests/test_yunicode.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) Jupyter Development Team.
22
# Distributed under the terms of the Modified BSD License.
33

4+
import pytest
45
from pycrdt import TextEvent
56
from utils import ExpectedEvent
67

@@ -188,3 +189,125 @@ def record_changes(topic, event):
188189
assert source_events == [
189190
ExpectedEvent(TextEvent, delta=[{"delete": 109}, {"insert": twinkle_lyrics}])
190191
]
192+
193+
194+
@pytest.mark.parametrize(
195+
"initial, updated, granular",
196+
[
197+
(
198+
# emojis swapped
199+
"I like security 🎨 but I really love painting 🔒",
200+
"I like security 🔒 but I really love painting 🎨",
201+
True,
202+
),
203+
(
204+
# text changes, emojis stay in place
205+
"Here is a rocket: ⭐ and a star: 🚀",
206+
"Here is a star: ⭐ and a rocket: 🚀",
207+
True,
208+
),
209+
(
210+
# change of text and emojis
211+
"Here are some happy faces: 😀😁😂",
212+
"Here are some sad faces: 😞😢😭",
213+
True,
214+
),
215+
(
216+
# change of characters with combining marks
217+
"Combining characters: á é í ó ú",
218+
"Combining characters: ú ó í é á",
219+
True,
220+
),
221+
(
222+
"Flags: 🇺🇸🇬🇧🇨🇦",
223+
"Flags: 🇨🇦🇬🇧🇺🇸",
224+
True,
225+
),
226+
(
227+
# Emoji with skin tone modifiers
228+
"Waving hands: 👋👋🏻👋🏿",
229+
"Waving hands: 👋🏿👋🏻👋",
230+
False, # overlapping grapheme clusters, should trigger hard reload
231+
),
232+
(
233+
# Zero-width joiner sequences (family emoji)
234+
"A family 👨‍👩‍👧‍👦 (with two children)",
235+
"A family 👨‍👩‍👧 (with one child)",
236+
True,
237+
),
238+
(
239+
# Mixed RTL/LTR text
240+
"Hello שלום world",
241+
"Hello עולם world",
242+
True,
243+
),
244+
(
245+
# Zero-width characters
246+
"Word​break vs Word​​break", # zero-width space
247+
"Word​​break vs Word​break",
248+
True,
249+
),
250+
(
251+
# Keycap sequences
252+
"Numbers: 1️⃣2️⃣3️⃣",
253+
"Numbers: 3️⃣2️⃣1️⃣",
254+
True,
255+
),
256+
(
257+
# Mixed emoji presentation styles
258+
"Text style: ☺︎ vs emoji style: ☺️",
259+
"Text style: ☹︎ vs emoji style: ☹️",
260+
True,
261+
),
262+
(
263+
# NFD vs NFC normalization (é can be one or two codepoints)
264+
"Café" + "\u0301", # e + combining acute accent at the end
265+
"Caff" + "\u0301",
266+
True,
267+
),
268+
(
269+
# Emoji at boundaries
270+
"👋 middle text 🎉",
271+
"🎉 middle text 👋",
272+
True,
273+
),
274+
(
275+
# Consecutive emoji with different byte lengths
276+
"Grinning face + a flag 😀🏴",
277+
"Grinning face + a flag 🏴😀",
278+
False, # overlapping grapheme clusters, should trigger hard reload
279+
),
280+
(
281+
# Japanese characters
282+
"こんにちは世界",
283+
"こんにちは地球",
284+
True,
285+
),
286+
(
287+
# Julia math operators
288+
"x ∈ [1, 2, 3] && y ≥ 0",
289+
"x ∉ [1, 2, 3] || y ≤ 0",
290+
True,
291+
),
292+
],
293+
)
294+
def test_multibyte_unicode(initial, updated, granular):
    """
    Check that replacing ``initial`` with ``updated`` leaves the right content.

    :param initial: The text the document starts with.
    :param updated: The text set after the observer is attached.
    :param granular: Whether the update is expected to be applied as
        granular operations (at least three delta entries) rather than
        as a wholesale replacement (at least two delta entries).
    """
    document = YUnicode()
    document.set(initial)

    recorded = []

    def on_change(topic, event):
        recorded.append((topic, event))  # pragma: nocover

    # Subscribe only after the initial content is in place, so that only
    # the second ``set`` call is recorded.
    document.observe(on_change)
    document.set(updated)

    # Exactly one change notification is expected for the update.
    assert len(recorded) == 1
    source_events = [event for topic, event in recorded if topic == "source"]
    delta = source_events[0].delta

    if granular:
        min_delta_entries = 3
    else:
        min_delta_entries = 2

    assert len(delta) >= min_delta_entries
    assert document.get() == updated

0 commit comments

Comments
 (0)