@@ -10,7 +10,7 @@ namespace SimdUnicode
1010 public static class UTF8
1111 {
1212
13- // helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
13+ // debug helper: prints a green byte every 32 bytes and a red byte at a given index
1414static void PrintHexAndBinary ( byte [ ] bytes , int highlightIndex = - 1 )
1515{
1616 int chunkSize = 16 ; // 128 bits = 16 bytes
@@ -78,19 +78,20 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
7878
7979 public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
8080 {
81-
82- int TempUtf16CodeUnitCountAdjustment = 0 ;
83- int TempScalarCountAdjustment = 0 ;
84-
81+ // Console.WriteLine("CALLING REWIND");
8582 int extraLen = 0 ;
8683 bool foundLeadingBytes = false ;
8784
8885 for ( int i = 0 ; i <= howFarBack ; i ++ )
8986 {
9087 byte candidateByte = buf [ 0 - i ] ;
9188 foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
89+ Console . WriteLine ( $ "Rewinding byte to offset { - i } : { candidateByte : X2} ") ;
90+ Console . WriteLine ( foundLeadingBytes ) ;
91+
9292 if ( foundLeadingBytes )
93- {
93+ {
94+ Console . WriteLine ( "Found leading byte" ) ;
9495 buf -= i ;
9596 break ;
9697 }
@@ -101,13 +102,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
101102 return buf - howFarBack ;
102103 }
103104
104- utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment ;
105- scalarCountAdjustment += TempScalarCountAdjustment ;
106-
107105 int TailUtf16CodeUnitCountAdjustment = 0 ;
108106 int TailScalarCountAdjustment = 0 ;
109107
110108 byte * invalidBytePointer = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
109+ // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
110+
111111
112112 utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
113113 scalarCountAdjustment += TailScalarCountAdjustment ;
@@ -219,7 +219,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
219219 }
220220 else
221221 {
222- // we may have a continuation
222+ // we may have a continuation/too long error
223223 utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment ;
224224 scalarCountAdjustment = TempScalarCountAdjustment ;
225225 return pInputBuffer + pos ;
@@ -257,12 +257,11 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
257257 {
258258 if ( ( pInputBuffer [ - i ] & 0b11000000 ) != 0b10000000 )
259259 {
260- string binaryString = Convert . ToString ( pInputBuffer [ - i ] , 2 ) . PadLeft ( 8 , '0' ) ;
261- // Console.WriteLine($"Stopping at byte {binaryString}"); //debug
260+ string binaryString = Convert . ToString ( pInputBuffer [ - i ] , 2 ) . PadLeft ( 8 , '0' ) ; //debug
261+ Console . WriteLine ( $ "Stopping at byte { binaryString } ") ; //debug
262262 break ;
263263 }
264264 contbyteadjust -= 1 ;
265-
266265 }
267266 if ( ( pInputBuffer [ - i ] & 0b10000000 ) == 0 ) {
268267 return ( 0 , i , - 1 , contbyteadjust , 0 ) ; // We must have that i == 1
@@ -279,19 +278,41 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
279278
// Derives the UTF-16 code-unit and scalar count adjustments from four byte counters.
// n3 and n2 are the unique solution of the two equations
//   totalbyte = asciibytes + 2*n2 + 3*n3 + 4*n4
//   contbytes = n2 + 2*n3 + 3*n4
// i.e. presumably the number of 3-byte and 2-byte sequences, given the counts of ASCII bytes,
// 4-byte sequences and continuation bytes - TODO confirm against the callers.
// Returns the tuple (utfadjust, scalaradjust), where utfadjust = -2*n4 - 2*n3 - n2 and scalaradjust = -n4.
// NOTE(review): the Console.WriteLine debug lines below are active (not commented out) in this revision,
// so every call writes to the console - confirm this is intended before merging.
280279 public static ( int utfadjust , int scalaradjust ) CalculateN2N3FinalSIMDAdjustments ( int asciibytes , int n4 , int contbytes , int totalbyte )
281280 {
282- // Console.WriteLine("---------"); //debug
283- // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug
281+ Console . WriteLine ( "---------" ) ; //debug
282+ Console . WriteLine ( "CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte ) ; //debug
284283 int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte ; // solved from the two equations in the header comment
285284 int n2 = - 2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte ; // solved from the same two equations
286285 int utfadjust = - 2 * n4 - 2 * n3 - n2 ; // -1 per n2, -2 per n3 and -2 per n4
287286 int scalaradjust = - n4 ; // -1 per n4
288287
289- // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug
288+ Console . WriteLine ( "CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust ) ; //debug
290289
291290 return ( utfadjust , scalaradjust ) ;
292291 }
293292
294- public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes )
293+ // public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false
294+ // {
295+ // // Calculate the total bytes from start_point to processedLength
296+ // int totalbyte = processedLength - start_point;
297+ // int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0;
298+
299+ // // Adjust the length to include a complete character, if necessary
300+ // if (totalbyte > 0)
301+ // {
302+ // (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
303+ // }
304+
305+ // // Pseudocode:
306+ // // if 'TooLongErroronEdge' bool is true then
307+ // // then substract (remove) adjustascii, adjustcont, adjustn4 from their respective counterpart in the following function:
308+
309+ // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte);
310+
311+
312+ // return (utfadjust, scalaradjust);
313+ // }
314+
315+ public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes , bool TooLongErroronEdge = false )
295316 {
296317 // Calculate the total bytes from start_point to processedLength
297318 int totalbyte = processedLength - start_point ;
@@ -300,17 +321,25 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
300321 // Adjust the length to include a complete character, if necessary
301322 if ( totalbyte > 0 )
302323 {
303- ( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
324+ ( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
304325 }
305326
306- // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte);
307- var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
327+ // Adjust the counters if 'TooLongErroronEdge' is true
328+ if ( TooLongErroronEdge )
329+ {
330+ // If you can figure out why this makes a difference, you're golden
331+ asciibytes += adjustascii ;
332+ contbytes += adjustcont ;
333+ n4 += adjustn4 ;
334+ }
308335
336+ var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
309337
310338 return ( utfadjust , scalaradjust ) ;
311339 }
312340
313341
342+
314343 public unsafe static byte * GetPointerToFirstInvalidByteSse ( byte * pInputBuffer , int inputLength )
315344 {
316345
@@ -465,9 +494,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
465494
466495 public unsafe static byte * GetPointerToFirstInvalidByteAvx2 ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
467496 {
468- // Console.ForegroundColor = ConsoleColor.Blue; //debug
469- // Console.WriteLine("-------------------------------------");//debug
470- // Console.ResetColor();//debug
497+ Console . ForegroundColor = ConsoleColor . Blue ; //debug
498+ Console . WriteLine ( "-------------------------------------" ) ; //debug
499+ Console . ResetColor ( ) ; //debug
471500
472501 int processedLength = 0 ;
473502 int TempUtf16CodeUnitCountAdjustment = 0 ;
@@ -659,23 +688,100 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
659688 Vector256 < byte > must23 = Avx2 . Or ( isThirdByte , isFourthByte ) ;
660689 Vector256 < byte > must23As80 = Avx2 . And ( must23 , v80 ) ;
661690 Vector256 < byte > error = Avx2 . Xor ( must23As80 , sc ) ;
662- if ( ! Avx2 . TestZ ( error , error ) )
663- {
664- // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
665- int totalbyteasciierror = processedLength - start_point ;
666- var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes ) ;
691+ // if (!Avx2.TestZ(error, error))
692+ // {
693+ // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
667694
668- utf16CodeUnitCountAdjustment = utfadjustasciierror ;
669- scalarCountAdjustment = scalaradjustasciierror ;
695+ // int off = processedLength >= 32 ? processedLength - 32 : processedLength;
696+ // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
697+
698+ // utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
699+ // scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
700+
701+ // // We need to take care of eg
702+ // // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
703+ // // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
704+ // // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
705+ // // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
706+ // // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
707+ // // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
708+ // // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
709+ // // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
710+ // // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
711+
712+ // // so in short , we want to solve this error while at the same time not disturbing anything else
713+ // // we know that there is a continuation on the edge eg at the 64 byte, we need te check that
714+ // // *TODO:Fill code here *
715+ // // Peudocode for now
716+ // // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
717+ // // pass on true to the
718+
719+
720+ // int totalbyteasciierror = processedLength - start_point;
721+ // var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
722+
723+ // utf16CodeUnitCountAdjustment += utfadjustasciierror;
724+ // scalarCountAdjustment += scalaradjustasciierror;
725+
726+ // TailScalarCodeUnitCountAdjustment =0;
727+ // TailUtf16CodeUnitCountAdjustment =0;
670728
671- TailScalarCodeUnitCountAdjustment = 0 ;
672- TailUtf16CodeUnitCountAdjustment = 0 ;
729+
730+
731+ // return invalidBytePointer;
732+ // }
733+
734+ if ( ! Avx2 . TestZ ( error , error ) )
735+ {
736+ Console . WriteLine ( $ "--Error! @ { processedLength } bytes") ; //debug
673737
674738 int off = processedLength >= 32 ? processedLength - 32 : processedLength ;
675739 byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
740+ bool TooLongErroronEdge = false ;
741+
742+ utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment ;
743+ scalarCountAdjustment = TailScalarCodeUnitCountAdjustment ;
744+
745+ Console . WriteLine ( $ "RewindScalarValidation's function utf16adjust:{ TailUtf16CodeUnitCountAdjustment } , scalaradjust:{ TailScalarCodeUnitCountAdjustment } ") ;
746+
747+ // We need to take care of eg
748+ // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
749+ // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
750+ // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
751+ // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
752+ // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
753+ // Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted,
754+ // but it does in the too-long case: take for example (1111---- 10----- 10----- 10-----) 10-----
755+ // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too often
756+ // If this error arrives at the edge between two SIMD vectors, that is where problems arise: the rewind scalar function will back up
757+
758+ // So, in short, we want to fix this error without disturbing anything else.
759+ // We know that there is a continuation byte on the edge, e.g. at the 64th byte; we need to check that.
760+ // TODO: fill in code here
761+ // Pseudocode for now:
762+ // if the invalid byte is of type 10XX XXXX and the invalid byte pointer % 32 bytes == 0 then
763+ // pass true on to calculateErrorPathadjust (as TooLongErroronEdge)
764+
765+ // Calculate the offset of the invalid byte pointer from the start of the input buffer
766+ ulong offsetFromStart = ( ulong ) ( invalidBytePointer - pInputBuffer ) ;
767+
768+ // Debugging output
769+ bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
770+ bool isOneByteAfterProcessedLength = ( invalidBytePointer == pInputBuffer + processedLength ) ;
771+
772+ // if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives
773+ if ( isContinuationByte && isOneByteAfterProcessedLength )
774+ {
775+ Console . WriteLine ( "Triggering TooLongErrorOnEdge adjustment" ) ;
776+ TooLongErroronEdge = true ;
777+ }
676778
677- utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
678- scalarCountAdjustment += TailScalarCodeUnitCountAdjustment ;
779+
780+ int totalbyteasciierror = processedLength - start_point ;
781+ var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes , TooLongErroronEdge ) ;
782+
783+ utf16CodeUnitCountAdjustment += utfadjustasciierror ;
784+ scalarCountAdjustment += scalaradjustasciierror ;
679785
680786 return invalidBytePointer ;
681787 }
@@ -690,7 +796,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
690796 processedLength -= i ;
691797 n4 += tempn4 ; // this is + because the adjustment function returns something negative already
692798 contbytes += tempcont ;
693- // Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
799+ Console . WriteLine ( $ "Unterminated! @ { processedLength } Backing up by { i } ") ; //debug
694800 }
695801
696802
@@ -763,6 +869,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
763869 {
764870 utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment ;
765871 scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment ;
872+
766873 // An invalid byte was found by the scalar function
767874 return invalidBytePointer ;
768875 }
0 commit comments