@@ -128,6 +128,160 @@ public static class UTF8
128128 const byte OVERLONG_4 = 1 << 6 ;
129129 const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS ;
130130
/// <summary>
/// Validates a byte buffer as UTF-8 using 128-bit SSE2/SSSE3 instructions and
/// locates the first byte that breaks validity.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first byte of the buffer to validate.</param>
/// <param name="inputLength">Number of bytes available at <paramref name="pInputBuffer"/>.</param>
/// <returns>
/// <paramref name="pInputBuffer"/> itself when the pointer is null or the length is not positive;
/// <c>pInputBuffer + inputLength</c> when no invalid byte is found; otherwise the pointer produced
/// by the scalar/rewind helpers for the offending position.
/// </returns>
/// <remarks>
/// Only SSE2 and SSSE3 instructions are used so that the method is safe to call whenever
/// <c>Ssse3.IsSupported</c> is true.
/// </remarks>
public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
{
    int processedLength = 0;

    // True when the last block validated by the SIMD loop ends in the middle of a
    // multi-byte sequence; the tail handling below must then revisit that sequence.
    bool endsIncomplete = false;

    if (pInputBuffer == null || inputLength <= 0)
    {
        return pInputBuffer;
    }

    if (inputLength > 128)
    {
        // Skip any leading ASCII, 64 bytes at a time. A byte is non-ASCII exactly when its
        // most significant bit is set, so MoveMask on the OR of the four blocks is sufficient here.
        int asciirun = 0;
        for (; asciirun + 64 <= inputLength; asciirun += 64)
        {
            Vector128<byte> block1 = Sse2.LoadVector128(pInputBuffer + asciirun);
            Vector128<byte> block2 = Sse2.LoadVector128(pInputBuffer + asciirun + 16);
            Vector128<byte> block3 = Sse2.LoadVector128(pInputBuffer + asciirun + 32);
            Vector128<byte> block4 = Sse2.LoadVector128(pInputBuffer + asciirun + 48);

            Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4));
            if (Sse2.MoveMask(or) != 0)
            {
                break;
            }
        }
        processedLength = asciirun;

        if (processedLength + 16 < inputLength)
        {
            // We still have work to do!
            Vector128<byte> zero = Vector128<byte>.Zero;
            Vector128<byte> prevInputBlock = Vector128<byte>.Zero;

            // Saturating subtraction of these thresholds leaves a non-zero residue only in the
            // last three lanes, and only when a lead byte there starts a sequence that cannot
            // finish inside the block (4-byte lead in the last 3 bytes, 3-byte lead in the
            // last 2, 2-byte lead in the last one).
            Vector128<byte> maxValue = Vector128.Create(
                255, 255, 255, 255, 255, 255, 255, 255,
                255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
            Vector128<byte> prevIncomplete = Sse2.SubtractSaturate(prevInputBlock, maxValue);

            // Lookup table indexed by the high nibble of the previous byte.
            Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
                    TOO_SHORT | OVERLONG_2,
                    TOO_SHORT,
                    TOO_SHORT | OVERLONG_3 | SURROGATE,
                    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);

            // Lookup table indexed by the low nibble of the previous byte.
            Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
                    CARRY | OVERLONG_2,
                    CARRY,
                    CARRY,
                    CARRY | TOO_LARGE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000);

            // Lookup table indexed by the high nibble of the current byte.
            Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);

            Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
            Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
            Vector128<byte> v0f = Vector128.Create((byte)0x0F);
            Vector128<byte> v80 = Vector128.Create((byte)0x80);

            for (; processedLength + 16 <= inputLength; processedLength += 16)
            {
                Vector128<byte> currentBlock = Sse2.LoadVector128(pInputBuffer + processedLength);

                int mask = Sse2.MoveMask(currentBlock);
                if (mask == 0)
                {
                    // We have an ASCII block, no need to process it, but we need to check
                    // whether the previous block was incomplete.
                    // MoveMask alone would only look at bit 7 of each lane, and the residue in
                    // prevIncomplete never reaches bit 7, so compare every lane against zero.
                    if (Sse2.MoveMask(Sse2.CompareEqual(prevIncomplete, zero)) != 0xFFFF)
                    {
                        // NOTE(review): RewindAndValidateWithErrors is defined elsewhere; it is
                        // presumably responsible for locating the exact offending byte - confirm.
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = Vector128<byte>.Zero;
                }
                else
                {
                    // Contains non-ASCII characters, we need to do non-trivial processing.
                    Vector128<byte> prev1 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 1));
                    Vector128<byte> byte_1_high = Ssse3.Shuffle(shuf1, Sse2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
                    Vector128<byte> byte_1_low = Ssse3.Shuffle(shuf2, (prev1 & v0f));
                    Vector128<byte> byte_2_high = Ssse3.Shuffle(shuf3, Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
                    Vector128<byte> sc = Sse2.And(Sse2.And(byte_1_high, byte_1_low), byte_2_high);
                    Vector128<byte> prev2 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 2));
                    Vector128<byte> prev3 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 3));
                    prevInputBlock = currentBlock;
                    Vector128<byte> isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte);
                    Vector128<byte> isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte);
                    Vector128<byte> must23 = Sse2.Or(isThirdByte, isFourthByte);
                    Vector128<byte> must23As80 = Sse2.And(must23, v80);
                    Vector128<byte> error = Sse2.Xor(must23As80, sc);

                    // Any set bit in any lane is an error. The error flags occupy several bit
                    // positions (e.g. OVERLONG_4 is 1 << 6), so a sign-bit-only MoveMask would
                    // let most error classes through.
                    if (Sse2.MoveMask(Sse2.CompareEqual(error, zero)) != 0xFFFF)
                    {
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue);
                }
            }

            endsIncomplete = Sse2.MoveMask(Sse2.CompareEqual(prevIncomplete, zero)) != 0xFFFF;
        }
    }

    // We have processed all the blocks using SIMD, we need to process the remaining bytes.
    if (endsIncomplete)
    {
        // The last SIMD block stopped inside a multi-byte sequence. Step back onto its final
        // byte so that the backtracking below reaches the lead byte and the scalar routine
        // re-validates the whole sequence. This also covers the case where the input length is
        // a multiple of 16 and no tail bytes remain. The SIMD loop ran at least once here, so
        // processedLength >= 16 and the decrement cannot underflow.
        processedLength -= 1;
    }

    // Process the remaining bytes with the scalar function.
    if (processedLength < inputLength)
    {
        // We need to possibly backtrack to the start of the last code point.
        // Worst possible case is 4 bytes, where we need to backtrack 3 bytes:
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
        // (sbyte) value <= -65 is equivalent to the byte being in 0x80..0xBF, a continuation byte.
        if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
        {
            processedLength -= 1;
            if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
            {
                processedLength -= 1;
                if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
                {
                    processedLength -= 1;
                }
            }
        }
        byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
        if (invalidBytePointer != pInputBuffer + inputLength)
        {
            // An invalid byte was found by the scalar function.
            return invalidBytePointer;
        }
    }

    return pInputBuffer + inputLength;
}
283+
284+
131285 public unsafe static byte * GetPointerToFirstInvalidByteAvx2 ( byte * pInputBuffer , int inputLength )
132286 {
133287 int processedLength = 0 ;
@@ -461,16 +615,14 @@ public static class UTF8
461615 {
462616 return GetPointerToFirstInvalidByteAvx2 ( pInputBuffer , inputLength ) ;
463617 }
464- // TODO add support for other ISAs
465- //if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
466- //{
467- // return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength);
468- //GetPointerToFirstInvalidByteAvx2
469- //}
470- //if (Sse42.IsSupported)
471- //{
472- // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength);
473- //}
618+ /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
619+ {
620+ return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength);
621+ }*/
622+ if ( Ssse3 . IsSupported )
623+ {
624+ return GetPointerToFirstInvalidByteSse ( pInputBuffer , inputLength ) ;
625+ }
474626 return GetPointerToFirstInvalidByteScalar ( pInputBuffer , inputLength ) ;
475627 }
476628
0 commit comments