@@ -92,16 +92,69 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
9292 fixed ( char * pStart = & MemoryMarshal . GetReference ( s ) )
9393 {
9494 int i = 0 ;
95+
96+ /* PAR Unrolled twice:
97+ | Method | N | Mean | Error | StdDev |
98+ |----------------------- |---- |-----------:|---------:|---------:|
99+ | FastUnicodeIsAscii | 100 | 905.7 ns | 17.95 ns | 20.67 ns |
100+ | StandardUnicodeIsAscii | 100 | 2,502.4 ns | 49.67 ns | 66.31 ns |
101+ | RuntimeIsAscii | 100 | 2,522.8 ns | 32.70 ns | 30.59 ns |
102+ | FastUnicodeIsAscii | 200 | 649.3 ns | 10.24 ns | 9.57 ns |
103+ | StandardUnicodeIsAscii | 200 | 5,299.7 ns | 64.91 ns | 57.54 ns |
104+ | RuntimeIsAscii | 200 | 5,307.2 ns | 49.18 ns | 46.00 ns |
105+ | FastUnicodeIsAscii | 500 | 1,382.2 ns | 9.40 ns | 8.79 ns |
106+ | StandardUnicodeIsAscii | 500 | 6,127.7 ns | 57.69 ns | 48.18 ns |
107+ | RuntimeIsAscii | 500 | 6,258.2 ns | 62.05 ns | 58.05 ns | */
108+
109+ // if (s.Length > 16) // Adjusted for the unrolled loop
110+ // {
111+ // Vector128<ushort> total = Sse41.LoadDquVector128((ushort*)pStart);
112+ // i += 8;
113+
114+ // // Unrolling the loop by 2x
115+ // for (; i + 15 < s.Length; i += 16)
116+ // {
117+ // Vector128<ushort> raw1 = Sse41.LoadDquVector128((ushort*)pStart + i);
118+ // Vector128<ushort> raw2 = Sse41.LoadDquVector128((ushort*)pStart + i + 8);
119+
120+ // total = Sse2.Or(total, raw1);
121+ // total = Sse2.Or(total, raw2);
122+ // }
123+
124+ // Vector128<ushort> b127 = Vector128.Create((ushort)127);
125+ // Vector128<ushort> b = Sse41.Max(b127, total);
126+ // Vector128<ushort> b16 = Sse41.CompareEqual(b, b127);
127+ // int movemask = Sse2.MoveMask(b16.AsByte());
128+ // if (movemask != 0xffff)
129+ // {
130+ // return false;
131+ // }
132+ // }
133+
134+ // | Method | N | Mean | Error | StdDev |
135+ // |----------------------- |---- |-----------:|---------:|---------:|
136+ // | FastUnicodeIsAscii | 100 | 904.0 ns | 9.22 ns | 8.17 ns |
137+ // | StandardUnicodeIsAscii | 100 | 2,396.5 ns | 11.33 ns | 10.04 ns |
138+ // | RuntimeIsAscii | 100 | 2,498.8 ns | 42.35 ns | 37.54 ns |
139+ // | FastUnicodeIsAscii | 200 | 1,270.0 ns | 7.69 ns | 6.01 ns |
140+ // | StandardUnicodeIsAscii | 200 | 5,173.0 ns | 57.82 ns | 54.08 ns |
141+ // | RuntimeIsAscii | 200 | 5,197.5 ns | 15.40 ns | 13.65 ns |
142+ // | FastUnicodeIsAscii | 500 | 1,412.0 ns | 24.22 ns | 21.47 ns |
143+ // | StandardUnicodeIsAscii | 500 | 6,196.5 ns | 60.78 ns | 53.88 ns |
144+ // | RuntimeIsAscii | 500 | 6,215.5 ns | 96.43 ns | 90.20 ns |
145+
146+
95147 if ( s . Length > 16 ) // Adjusted for the unrolled loop
96148 {
97- Vector128 < ushort > total = Sse41 . LoadDquVector128 ( ( ushort * ) pStart ) ;
149+ // Using zeroed vector as initialization
150+ Vector128 < ushort > total = Vector128 < ushort > . Zero ;
98151 i += 8 ;
99152
100153 // Unrolling the loop by 2x
101- for ( ; i + 15 < s . Length ; i += 16 )
154+ for ( ; i + 16 < s . Length ; i += 16 )
102155 {
103- Vector128 < ushort > raw1 = Sse41 . LoadDquVector128 ( ( ushort * ) pStart + i ) ;
104- Vector128 < ushort > raw2 = Sse41 . LoadDquVector128 ( ( ushort * ) pStart + i + 8 ) ;
156+ Vector128 < ushort > raw1 = Sse41 . LoadDquVector128 ( ( ushort * ) pStart ) ;
157+ Vector128 < ushort > raw2 = Sse41 . LoadDquVector128 ( ( ushort * ) pStart + i ) ;
105158
106159 total = Sse2 . Or ( total , raw1 ) ;
107160 total = Sse2 . Or ( total , raw2 ) ;
@@ -116,6 +169,7 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
116169 return false ;
117170 }
118171 }
172+
119173 for ( ; i < s . Length ; i ++ )
120174 {
121175 if ( pStart [ i ] >= 128 ) return false ;
0 commit comments