@@ -74,30 +74,66 @@ def set(self, value: str) -> None:
7474 # to avoid side-effects such as cursor jumping to the top
7575 return
7676
77+ before_bytes = old_value .encode ("utf-8" )
78+ after_bytes = value .encode ("utf-8" )
79+
7780 with self ._ydoc .transaction ():
78- matcher = SequenceMatcher (a = old_value , b = value )
81+ matcher = SequenceMatcher (a = before_bytes , b = after_bytes )
7982
8083 if (
8184 matcher .real_quick_ratio () >= SIMILARITY_THREESHOLD
8285 and matcher .ratio () >= SIMILARITY_THREESHOLD
8386 ):
8487 operations = matcher .get_opcodes ()
85- offset = 0
88+
89+ # Fix byte ranges and check for problematic overlaps
90+ fixed_operations = []
91+ prev_end = 0
92+ prev_tag = None
93+ has_overlap = False
94+
8695 for tag , i1 , i2 , j1 , j2 in operations :
87- match tag :
88- case "replace" :
89- self ._ysource [i1 + offset : i2 + offset ] = value [j1 :j2 ]
90- offset += (j2 - j1 ) - (i2 - i1 )
91- case "delete" :
92- del self ._ysource [i1 + offset : i2 + offset ]
93- offset -= i2 - i1
94- case "insert" :
95- self ._ysource .insert (i1 + offset , value [j1 :j2 ])
96- offset += j2 - j1
97- case "equal" :
98- pass
99- case _:
100- raise ValueError (f"Unknown tag '{ tag } ' in sequence matcher" )
96+ # Fix byte ranges to proper UTF-8 character boundaries
97+ i1_fixed , i2_fixed = _fix_byte_range_to_char_boundary (before_bytes , i1 , i2 )
98+ j1_fixed , j2_fixed = _fix_byte_range_to_char_boundary (after_bytes , j1 , j2 )
99+
100+ # Check if this operation overlaps with the previous one
101+ # which can happen with grapheme clusters (emoji + modifiers, etc.)
102+ if i1_fixed < prev_end and prev_tag != "equal" :
103+ has_overlap = True
104+ break
105+
106+ prev_end = i2_fixed
107+ prev_tag = tag
108+ fixed_operations .append ((tag , i1_fixed , i2_fixed , j1_fixed , j2_fixed ))
109+
110+ # If we detected overlapping operations, fall back to hard reload
111+ if has_overlap :
112+ self ._ysource .clear ()
113+ if value :
114+ self ._ysource += value
115+ else :
116+ # Apply granular operations
117+ offset = 0
118+ for tag , i1 , i2 , j1 , j2 in fixed_operations :
119+ match tag :
120+ case "replace" :
121+ self ._ysource [i1 + offset : i2 + offset ] = after_bytes [
122+ j1 :j2
123+ ].decode ("utf-8" )
124+ offset += (j2 - j1 ) - (i2 - i1 )
125+ case "delete" :
126+ del self ._ysource [i1 + offset : i2 + offset ]
127+ offset -= i2 - i1
128+ case "insert" :
129+ self ._ysource .insert (
130+ i1 + offset , after_bytes [j1 :j2 ].decode ("utf-8" )
131+ )
132+ offset += j2 - j1
133+ case "equal" :
134+ pass
135+ case _:
136+ raise ValueError (f"Unknown tag '{ tag } ' in sequence matcher" )
101137 else :
102138 # for very different strings, just replace the whole content;
103139 # this avoids generating a huge number of operations
@@ -118,3 +154,28 @@ def observe(self, callback: Callable[[str, Any], None]) -> None:
118154 self .unobserve ()
119155 self ._subscriptions [self ._ystate ] = self ._ystate .observe (partial (callback , "state" ))
120156 self ._subscriptions [self ._ysource ] = self ._ysource .observe (partial (callback , "source" ))
157+
158+
159+ def _is_utf8_continuation_byte (byte : int ) -> bool :
160+ """Check if a byte is a UTF-8 continuation byte (10xxxxxx)."""
161+ return (byte & 0xC0 ) == 0x80
162+
163+
def _fix_byte_range_to_char_boundary(data: bytes, start: int, end: int) -> tuple[int, int]:
    """
    Adjust byte indices to proper UTF-8 character boundaries.

    The range is only ever widened: ``start`` may move backward and ``end``
    may move forward, so that neither index points at a continuation byte of
    ``data``. Note that an empty range (``start == end``) located inside a
    multi-byte character is widened to cover that whole character, so the
    returned range is not guaranteed to be empty when the input range was.

    :param data: The byte data.
    :param start: The start byte index.
    :param end: The end byte index.
    :return: A tuple of (adjusted_start, adjusted_end).
    """
    size = len(data)

    # Move start backward to the beginning of a UTF-8 character; an index
    # equal to ``size`` is already a boundary and cannot be inspected.
    while 0 < start < size and _is_utf8_continuation_byte(data[start]):
        start -= 1

    # Move end forward past any continuation bytes, so that the (exclusive)
    # end index does not cut a UTF-8 character in half.
    while end < size and _is_utf8_continuation_byte(data[end]):
        end += 1

    return start, end
0 commit comments