66using System . Runtime . InteropServices ;
77
88
9- /* PAR:
10- | Method | N | Mean | Error | StdDev |
11- |----------------------- |---- |-----------:|---------:|---------:|
12- | FastUnicodeIsAscii | 100 | 652.6 ns | 2.20 ns | 1.95 ns |
13- | StandardUnicodeIsAscii | 100 | 2,466.5 ns | 21.77 ns | 20.36 ns |
14- | RuntimeIsAscii | 100 | 2,502.7 ns | 29.81 ns | 27.89 ns |
15- | FastUnicodeIsAscii | 200 | 1,300.8 ns | 17.95 ns | 14.99 ns |
16- | StandardUnicodeIsAscii | 200 | 5,216.6 ns | 62.48 ns | 55.38 ns |
17- | RuntimeIsAscii | 200 | 5,293.2 ns | 41.50 ns | 38.82 ns |
18- | FastUnicodeIsAscii | 500 | 2,978.6 ns | 34.99 ns | 32.73 ns |
19- | StandardUnicodeIsAscii | 500 | 6,172.9 ns | 74.53 ns | 69.71 ns |
20- | RuntimeIsAscii | 500 | 6,210.8 ns | 80.82 ns | 63.10 ns | */
219
2210
2311// Ideally, we would want to implement something that looks like
@@ -104,7 +92,22 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
10492 fixed ( char * pStart = & MemoryMarshal . GetReference ( s ) )
10593 {
10694 int i = 0 ;
107- if ( s . Length > 8 )
95+
96+ /* PAR: not unrolled
97+ | Method | N | Mean | Error | StdDev |
98+ |----------------------- |---- |-----------:|---------:|---------:|
99+ | FastUnicodeIsAscii | 100 | 652.6 ns | 2.20 ns | 1.95 ns |
100+ | StandardUnicodeIsAscii | 100 | 2,466.5 ns | 21.77 ns | 20.36 ns |
101+ | RuntimeIsAscii | 100 | 2,502.7 ns | 29.81 ns | 27.89 ns |
102+ | FastUnicodeIsAscii | 200 | 1,300.8 ns | 17.95 ns | 14.99 ns |
103+ | StandardUnicodeIsAscii | 200 | 5,216.6 ns | 62.48 ns | 55.38 ns |
104+ | RuntimeIsAscii | 200 | 5,293.2 ns | 41.50 ns | 38.82 ns |
105+ | FastUnicodeIsAscii | 500 | 2,978.6 ns | 34.99 ns | 32.73 ns |
106+ | StandardUnicodeIsAscii | 500 | 6,172.9 ns | 74.53 ns | 69.71 ns |
107+ | RuntimeIsAscii | 500 | 6,210.8 ns | 80.82 ns | 63.10 ns | */
108+
109+
110+ /* if (s.Length > 8)
108111 {
109112 Vector128<ushort> total = Sse41.LoadDquVector128((ushort*)pStart);
110113 i += 8;
@@ -113,7 +116,37 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
113116 {
114117 Vector128<ushort> raw = Sse41.LoadDquVector128((ushort*)pStart + i);
115118 total = Sse2.Or(total, raw);
119+ } */
120+
121+
122+ /* PAR: unrolled 2x
123+ | Method | N | Mean | Error | StdDev |
124+ |----------------------- |---- |-----------:|---------:|---------:|
125+ | FastUnicodeIsAscii | 100 | 905.7 ns | 17.95 ns | 20.67 ns |
126+ | StandardUnicodeIsAscii | 100 | 2,502.4 ns | 49.67 ns | 66.31 ns |
127+ | RuntimeIsAscii | 100 | 2,522.8 ns | 32.70 ns | 30.59 ns |
128+ | FastUnicodeIsAscii | 200 | 649.3 ns | 10.24 ns | 9.57 ns |
129+ | StandardUnicodeIsAscii | 200 | 5,299.7 ns | 64.91 ns | 57.54 ns |
130+ | RuntimeIsAscii | 200 | 5,307.2 ns | 49.18 ns | 46.00 ns |
131+ | FastUnicodeIsAscii | 500 | 1,382.2 ns | 9.40 ns | 8.79 ns |
132+ | StandardUnicodeIsAscii | 500 | 6,127.7 ns | 57.69 ns | 48.18 ns |
133+ | RuntimeIsAscii | 500 | 6,258.2 ns | 62.05 ns | 58.05 ns | */
134+
135+ if ( s . Length > 16 ) // Adjusted for the unrolled loop
136+ {
137+ Vector128 < ushort > total = Sse41 . LoadDquVector128 ( ( ushort * ) pStart ) ;
138+ i += 8 ;
139+
140+ // Unrolling the loop by 2x
141+ for ( ; i + 15 < s . Length ; i += 16 )
142+ {
143+ Vector128 < ushort > raw1 = Sse41 . LoadDquVector128 ( ( ushort * ) pStart + i ) ;
144+ Vector128 < ushort > raw2 = Sse41 . LoadDquVector128 ( ( ushort * ) pStart + i + 8 ) ;
145+
146+ total = Sse2 . Or ( total , raw1 ) ;
147+ total = Sse2 . Or ( total , raw2 ) ;
116148 }
149+
117150 Vector128 < ushort > b127 = Vector128 . Create ( ( ushort ) 127 ) ;
118151 Vector128 < ushort > b = Sse41 . Max ( b127 , total ) ;
119152 Vector128 < ushort > b16 = Sse41 . CompareEqual ( b , b127 ) ;
0 commit comments