@@ -671,22 +671,15 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
671671 contbytes += tempcont ;
672672 }
673673
674- // (Nick Nuon)The counts for continuous bytes can probably be optimized:
675- // The draft had something like this line:
676- // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc));
677- // this actually counts the number of 2 consecutive continuous bytes
678- // I put something that was bound to be working regardless as a slow but temporary fix:
679-
680- Vector256 < byte > top2bits = Vector256 . Create ( ( byte ) 0b11000000 ) ; // Mask to isolate the two most significant bits
681- Vector256 < byte > contbytemask = Vector256 . Create ( ( byte ) 0b10000000 ) ; // The expected pattern for continuation bytes: 10xxxxxx
682-
683- // Apply the mask and compare
684- Vector256 < byte > maskedData = Avx2 . And ( currentBlock , top2bits ) ;
685- Vector256 < byte > compareResult = Avx2 . CompareEqual ( maskedData , contbytemask ) ;
686- // Move mask to get integer representation
687- contbytes += ( int ) Popcnt . PopCount ( ( uint ) Avx2 . MoveMask ( compareResult ) ) ;
688-
689-
674+ // We update the continuation bytes count using two SIMD instructions (Avx2.CompareGreaterThan and Avx2.MoveMask).
675+ // Then we popcount the resulting mask of non-continuation bytes and subtract that count from 32 (the block size).
676+ // We use the fact that, in two's complement, -65 is 0b10111111, so we can use CompareGreaterThan
677+ // to find continuation bytes: any byte greater than -65 is not a continuation byte. E.g., the next one
678+ // is 0b11000000 (-64) and so forth. The smallest possible value is -128, which is 0b10000000.
679+
680+ Vector256 < sbyte > largestcont = Vector256 . Create ( ( sbyte ) - 65 ) ; // -65 => 0b10111111
681+ uint noncont = ( uint ) Avx2 . MoveMask ( Avx2 . CompareGreaterThan ( Vector256 . AsSByte ( currentBlock ) , largestcont ) ) ;
682+ contbytes += ( int ) ( 32 - Popcnt . PopCount ( noncont ) ) ;
690683
691684 // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
692685 n4 += ( int ) Popcnt . PopCount ( ( uint ) Avx2 . MoveMask ( Avx2 . SubtractSaturate ( currentBlock , fourthByte ) ) ) ;
0 commit comments