@@ -76,12 +76,44 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
7676
// Debugging aid: renders a byte as its base-2 text, left-padded with '0' to 8 characters.
static Func<byte, string> byteToBinaryString = value => Convert.ToString(value, 2).PadLeft(8, '0');
7878
/// <summary>
/// Classifies a byte by its high bits and returns the pair of count corrections
/// associated with that UTF-8 lead-byte class. Intended to prevent double counting
/// when a too-long error falls on the edge of the processed region.
/// </summary>
/// <param name="headerByte">The byte to classify.</param>
/// <returns>
/// (1, 0) for a 2-byte lead (110xxxxx), (2, 0) for a 3-byte lead (1110xxxx),
/// (2, 1) for a 4-byte lead (11110xxx), and (0, 0) for any other byte
/// (ASCII, continuation bytes, and bytes of the form 11111xxx).
/// </returns>
public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte)
{
    // Shift the payload bits away so only the length-marker prefix remains.
    int topThreeBits = headerByte >> 5;
    if (topThreeBits == 0b110)
    {
        return (1, 0); // 2-byte sequence lead
    }

    int topFourBits = headerByte >> 4;
    if (topFourBits == 0b1110)
    {
        return (2, 0); // 3-byte sequence lead
    }

    int topFiveBits = headerByte >> 3;
    if (topFiveBits == 0b11110)
    {
        return (2, 1); // 4-byte sequence lead
    }

    // Not a multi-byte lead: nothing to adjust.
    return (0, 0);
}
101+
102+
79103 public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
80104 {
81105
82106 int extraLen = 0 ;
83107 bool foundLeadingBytes = false ;
84108
109+ // Remember the original buf position (before rewinding) so it can be compared with the invalid-byte pointer after validation
110+ byte * PinputPlusProcessedlength = buf ;
111+
112+
113+
114+ int TooLongErroronEdgeUtfadjust = 0 ;
115+ int TooLongErroronEdgeScalaradjust = 0 ;
116+
85117 for ( int i = 0 ; i <= howFarBack ; i ++ )
86118 {
87119 byte candidateByte = buf [ 0 - i ] ;
@@ -92,6 +124,8 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
92124 if ( foundLeadingBytes )
93125 {
94126
127+ ( TooLongErroronEdgeUtfadjust , TooLongErroronEdgeScalaradjust ) = GetFinalScalarUtfAdjustments ( candidateByte ) ;
128+
95129 buf -= i ;
96130 break ;
97131 }
@@ -108,6 +142,24 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
108142 byte * invalidBytePointer = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
109143 // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
110144
145+ bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
146+ bool isOneByteAfterProcessedLength = ( invalidBytePointer == PinputPlusProcessedlength ) ;
147+
148+
149+
150+ // // Print the byte value at the invalidBytePointer
151+
152+
153+
154+
155+ if ( isContinuationByte && isOneByteAfterProcessedLength )
156+ {
157+
158+ utf16CodeUnitCountAdjustment += TooLongErroronEdgeUtfadjust ;
159+ scalarCountAdjustment += TooLongErroronEdgeScalaradjust ;
160+
161+ }
162+
111163
112164 utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
113165 scalarCountAdjustment += TailScalarCountAdjustment ;
@@ -302,12 +354,12 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
302354 ( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
303355 }
304356
305- if ( TooLongErroronEdge )
306- {
307- asciibytes += adjustascii ;
308- contbytes += adjustcont ;
309- n4 += adjustn4 ;
310- }
357+ // if (TooLongErroronEdge)
358+ // {
359+ // asciibytes += adjustascii;
360+ // contbytes += adjustcont;
361+ // n4 += adjustn4;
362+ // }
311363
312364 var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
313365
@@ -698,7 +750,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
698750 if ( isContinuationByte && isOneByteAfterProcessedLength )
699751 {
700752
701- TooLongErroronEdge = true ;
753+ // TooLongErroronEdge = true;
702754 }
703755
704756
0 commit comments