@@ -10,72 +10,6 @@ namespace SimdUnicode
1010 public static class UTF8
1111 {
1212
13-
/// <summary>
/// Debugging helper: writes the given bytes to the console twice, first as hexadecimal
/// and then as binary, starting a new console line after every 16 bytes.
/// </summary>
/// <param name="bytes">Bytes to dump.</param>
/// <param name="highlightIndex">
/// Index of the byte to print in red. The loops start at index 0, so the default of -1
/// never matches and no byte is printed in red.
/// </param>
14- static void PrintHexAndBinary ( byte [ ] bytes , int highlightIndex = - 1 )
15- {
16- int chunkSize = 16 ; // 128 bits = 16 bytes
17-
18- // First pass: hexadecimal dump
19- Console . Write ( "Hex: " ) ;
20- for ( int i = 0 ; i < bytes . Length ; i ++ )
21- {
22- if ( i > 0 && i % chunkSize == 0 )
23- Console . WriteLine ( ) ; // New line after every 16 bytes
24-
25- if ( i == highlightIndex )
26- {
27- Console . ForegroundColor = ConsoleColor . Red ;
28- Console . Write ( $ "{ bytes [ i ] : X2} ") ;
29- Console . ResetColor ( ) ;
30- }
31- else if ( i % ( chunkSize * 2 ) == 0 ) // print green every 32 bytes (chunkSize * 2 = 256 bits); the red highlight above takes precedence
32- {
33- Console . ForegroundColor = ConsoleColor . Green ;
34- Console . Write ( $ "{ bytes [ i ] : X2} ") ;
35- Console . ResetColor ( ) ;
36- }
37- else
38- {
39- Console . Write ( $ "{ bytes [ i ] : X2} ") ;
40- }
41-
42- if ( ( i + 1 ) % chunkSize != 0 ) Console . Write ( " " ) ; // Add space between bytes but not at the end of the line
43- }
44- Console . WriteLine ( "\n " ) ; // New line for readability and to separate hex from binary
45-
46- // Second pass: binary dump, same layout and colouring rules as the hexadecimal pass
47- Console . Write ( "Binary: " ) ;
48- for ( int i = 0 ; i < bytes . Length ; i ++ )
49- {
50- if ( i > 0 && i % chunkSize == 0 )
51- Console . WriteLine ( ) ; // New line after every 16 bytes
52-
53- string binaryString = Convert . ToString ( bytes [ i ] , 2 ) . PadLeft ( 8 , '0' ) ; // base-2 text, left-padded with '0' to 8 characters
54- if ( i == highlightIndex )
55- {
56- Console . ForegroundColor = ConsoleColor . Red ;
57- Console . Write ( $ "{ binaryString } ") ;
58- Console . ResetColor ( ) ;
59- }
60- else if ( i % ( chunkSize * 2 ) == 0 ) // print green every 32 bytes (chunkSize * 2 = 256 bits)
61- {
62- Console . ForegroundColor = ConsoleColor . Green ;
63- Console . Write ( $ "{ binaryString } ") ;
64- Console . ResetColor ( ) ;
65- }
66- else
67- {
68- Console . Write ( $ "{ binaryString } ") ;
69- }
70-
71- if ( ( i + 1 ) % chunkSize != 0 ) Console . Write ( " " ) ; // Add space between bytes but not at the end of the line
72- }
73- Console . WriteLine ( ) ; // New line for readability
74- }
75-
76-
// Debugging aid: renders a byte as base-2 text, left-padded with '0' to 8 characters.
static Func<byte, string> byteToBinaryString = value =>
{
    string bits = Convert.ToString(value, 2);
    return bits.PadLeft(8, '0');
};
78-
7913// Prevents double counting in case there is a too-long error on the edge
8014 public static ( int utfAdjust , int scalarAdjust ) GetFinalScalarUtfAdjustments ( byte headerByte )
8115 {
@@ -92,7 +26,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
9226 // Check if the header byte belongs to a 4-byte UTF-8 character
9327 else if ( ( headerByte & 0b11111000 ) == 0b11110000 )
9428 {
95-
9629 return ( 2 , 1 ) ;
9730 }
9831 // Otherwise, it's a 1-byte character or continuation byte
@@ -107,10 +40,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
10740 bool foundLeadingBytes = false ;
10841
10942 // Print the byte value at the buf pointer
110- byte * PinputPlusProcessedlength = buf ;
111-
112-
113-
43+ byte * PinputPlusProcessedlength = buf ;
11444 int TooLongErroronEdgeUtfadjust = 0 ;
11545 int TooLongErroronEdgeScalaradjust = 0 ;
11646
@@ -119,8 +49,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
11949 byte candidateByte = buf [ 0 - i ] ;
12050 foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
12151
122-
123-
12452 if ( foundLeadingBytes )
12553 {
12654
@@ -140,27 +68,26 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
14068 int TailScalarCountAdjustment = 0 ;
14169
14270 byte * invalidBytePointer = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
143- // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
144-
145- bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
146- bool isOneByteAfterProcessedLength = ( invalidBytePointer == PinputPlusProcessedlength ) ;
147-
148-
149-
150- // // Print the byte value at the invalidBytePointer
151-
15271
72+ // We need to take care of eg
73+ // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
74+ // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 *11110000* 10011001 10101011 10000011
75+ // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
76+ // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
77+ // Without the following check, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
78+ // Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted.
79+ // But it does in the too-long case: if you take, for example, (1111---- 10----- 10----- 10-----) 10-----
80+ // the part between parentheses will be counted as valid, and thus scalaradjust/utfadjust will be incremented once too often
15381
82+ bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
83+ bool isOnEdge = ( invalidBytePointer == PinputPlusProcessedlength ) ;
15484
155- if ( isContinuationByte && isOneByteAfterProcessedLength )
85+ if ( isContinuationByte && isOnEdge )
15686 {
157-
15887 utf16CodeUnitCountAdjustment += TooLongErroronEdgeUtfadjust ;
15988 scalarCountAdjustment += TooLongErroronEdgeScalaradjust ;
160-
16189 }
16290
163-
16491 utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
16592 scalarCountAdjustment += TailScalarCountAdjustment ;
16693
@@ -295,7 +222,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
295222 const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS ;
296223
297224 // Assuming that a valid UTF-8 sequence ends at pInputBuffer,
298- // computes how many bytes are needed (eg what type of byte) to complete the last character. also counts the number of n4, n2 and ascii affected
225+ // computes how many bytes are needed to complete the last character. also counts the number of n4, n2 and ascii affected
299226 // This will return 1, 2, 3. If the whole byte sequence is valid UTF-8,
300227 // and this function returns returnedvalue>0, then the bytes at pInputBuffer[0],
301228 // ... pInputBuffer[returnedvalue - 1] should be continuation bytes.
@@ -309,8 +236,6 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
309236 {
310237 if ( ( pInputBuffer [ - i ] & 0b11000000 ) != 0b10000000 )
311238 {
312-
313-
314239 break ;
315240 }
316241 contbyteadjust -= 1 ;
@@ -330,19 +255,15 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
330255
/// <summary>
/// Turns the byte tallies gathered over a span into the UTF-16 code-unit and scalar count adjustments.
/// </summary>
/// <param name="asciibytes">Tally of ASCII bytes in the span.</param>
/// <param name="n4">Tally named n4 by the callers (presumably 4-byte sequences; confirm against the caller).</param>
/// <param name="contbytes">Tally of continuation bytes in the span.</param>
/// <param name="totalbyte">Total number of bytes in the span.</param>
/// <returns>The pair (utfadjust, scalaradjust); scalaradjust is always -n4.</returns>
public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
{
    // The two formulas below are the solution for (n2, n3) of the linear system
    //   totalbyte = asciibytes + 2*n2 + 3*n3 + 4*n4
    //   contbytes =              n2 + 2*n3 + 3*n4
    // i.e. presumably the counts of 2-byte and 3-byte sequences.
    int countN3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte;
    int countN2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte;

    // Weights: -2 for each n4 and each n3, -1 for each n2.
    int utf16Delta = -2 * n4 - 2 * countN3 - countN2;

    return (utf16Delta, -n4);
}
344265
345- public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes , bool TooLongErroronEdge = false )
266+ public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes )
346267 {
347268 // Calculate the total bytes from start_point to processedLength
348269 int totalbyte = processedLength - start_point ;
@@ -353,21 +274,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
353274 {
354275 ( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
355276 }
356-
357- // if (TooLongErroronEdge)
358- // {
359- // asciibytes += adjustascii;
360- // contbytes += adjustcont;
361- // n4 += adjustn4;
362- // }
363-
364277 var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
365-
366278 return ( utfadjust , scalaradjust ) ;
367279 }
368280
369-
370-
371281 public unsafe static byte * GetPointerToFirstInvalidByteSse ( byte * pInputBuffer , int inputLength )
372282 {
373283
@@ -522,10 +432,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
522432
523433 public unsafe static byte * GetPointerToFirstInvalidByteAvx2 ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
524434 {
525-
526-
527-
528-
529435 int processedLength = 0 ;
530436 int TempUtf16CodeUnitCountAdjustment = 0 ;
531437 int TempScalarCountAdjustment = 0 ;
@@ -678,7 +584,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
678584 //
679585 if ( ! Avx2 . TestZ ( prevIncomplete , prevIncomplete ) )
680586 {
681- // TODO : this path is not explicitly tested, write tests
587+ // Note/todo : this path is not yet explicitly tested
682588 int totalbyteasciierror = processedLength - start_point ;
683589 var ( utfadjustasciierror , scalaradjustasciierror ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyteasciierror ) ;
684590
@@ -713,49 +619,13 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
713619
714620 if ( ! Avx2 . TestZ ( error , error ) )
715621 {
716-
717-
718622 int off = processedLength > 32 ? processedLength - 32 : processedLength ; // this does not backup ff processedlength = 32
719-
720-
721623 byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
722- bool TooLongErroronEdge = false ;
723-
724624 utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment ;
725625 scalarCountAdjustment = TailScalarCodeUnitCountAdjustment ;
726626
727-
728-
729- // We need to take care of eg
730- // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
731- // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
732- // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
733- // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
734- // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
735- // Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted.
736- // But it does in the too-long case: if you take, for example, (1111---- 10----- 10----- 10-----) 10-----
737- // the part between parentheses will be counted as valid, and thus scalaradjust will be incremented once too often.
738- // If this error arrives at the edge between two SIMD vectors, that is where problems abound
739-
740- // Calculate the offset of the invalid byte pointer from the start of the input buffer
741- ulong offsetFromStart = ( ulong ) ( invalidBytePointer - pInputBuffer ) ;
742-
743- // Debugging output
744-
745- bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
746-
747- bool isOneByteAfterProcessedLength = ( invalidBytePointer == pInputBuffer + processedLength ) ;
748-
749-
750- if ( isContinuationByte && isOneByteAfterProcessedLength )
751- {
752-
753- // TooLongErroronEdge = true;
754- }
755-
756-
757627 int totalbyteasciierror = processedLength - start_point ;
758- var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes , TooLongErroronEdge ) ;
628+ var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes ) ;
759629
760630 utf16CodeUnitCountAdjustment += utfadjustasciierror ;
761631 scalarCountAdjustment += scalaradjustasciierror ;
@@ -769,13 +639,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
769639 {
770640 // We have an unterminated sequence.
771641 var ( totalbyteadjustment , i , tempascii , tempcont , tempn4 ) = adjustmentFactor ( pInputBuffer + processedLength + 32 ) ;
772-
773642 processedLength -= i ;
774643 n4 += tempn4 ;
775644 contbytes += tempcont ;
776-
777645 }
778646
647+ // (Nick Nuon) The counting of continuation bytes can probably be optimized:
648+ // The draft had something like this line:
649+ // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc));
650+ // but this actually counts the number of 2 consecutive continuation bytes.
651+ // I put in something that was bound to work regardless, as a slow but temporary fix:
652+
779653 Vector256 < byte > top2bits = Vector256 . Create ( ( byte ) 0b11000000 ) ; // Mask to isolate the two most significant bits
780654 Vector256 < byte > contbytemask = Vector256 . Create ( ( byte ) 0b10000000 ) ; // The expected pattern for continuation bytes: 10xxxxxx
781655
@@ -797,10 +671,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
797671 asciibytes += ( int ) ( 32 - Popcnt . PopCount ( ( uint ) mask ) ) ;
798672 }
799673
800- // There are two possible scenarios here: either
801- // A) it arrives flush on the border, i.e. it doesn't need to be processed further, or
802- // B) there are some bytes remaining, in which case we need to call the scalar function.
803- // Either way we need to calculate n2, n3 and update the utf16adjust and scalaradjust
804674 int totalbyte = processedLength - start_point ;
805675 var ( utf16adjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte ) ;
806676
0 commit comments