diff --git a/csharp/Platform.Unsafe.Benchmarks/benchmark_results.txt b/csharp/Platform.Unsafe.Benchmarks/benchmark_results.txt new file mode 100644 index 0000000..25aebdc --- /dev/null +++ b/csharp/Platform.Unsafe.Benchmarks/benchmark_results.txt @@ -0,0 +1,686 @@ +// Validating benchmarks: +Assembly Platform.Unsafe.Benchmarks, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null is located in temp. If you are running benchmarks from xUnit you need to disable shadow copy. It's not supported by design. +// ***** BenchmarkRunner: Start ***** +// ***** Found 16 benchmark(s) in total ***** +// ***** Building 1 exe(s) in Parallel: Start ***** +// start dotnet restore /p:UseSharedCompilation=false /p:BuildInParallel=false /m:1 /p:Deterministic=true /p:Optimize=true in /tmp/gh-issue-solver-1757845403022/csharp/Platform.Unsafe.Benchmarks/bin/Release/net8/1a7668ac-c1b0-4053-8025-eef54642afd3 +// command took 7.39s and exited with 0 +// start dotnet build -c Release --no-restore /p:UseSharedCompilation=false /p:BuildInParallel=false /m:1 /p:Deterministic=true /p:Optimize=true in /tmp/gh-issue-solver-1757845403022/csharp/Platform.Unsafe.Benchmarks/bin/Release/net8/1a7668ac-c1b0-4053-8025-eef54642afd3 +// command took 18.56s and exited with 0 +// ***** Done, took 00:00:26 (26.53 sec) ***** +// Found 16 benchmarks: +// CopyBenchmarks.Copy64BytesToArray: DefaultJob +// CopyBenchmarks.CopyBlock64BytesToArray: DefaultJob +// CopyBenchmarks.Copy128BytesToArray: DefaultJob +// CopyBenchmarks.CopyBlock128BytesToArray: DefaultJob +// CopyBenchmarks.Copy2048BytesToArray: DefaultJob +// CopyBenchmarks.CopyBlock2048BytesToArray: DefaultJob +// CopyBenchmarks.Copy4194304BytesToArray: DefaultJob +// CopyBenchmarks.CopyBlock4194304BytesToArray: DefaultJob +// CopyBenchmarks.Copy64BytesFromArray: DefaultJob +// CopyBenchmarks.CopyBlock64BytesFromArray: DefaultJob +// CopyBenchmarks.Copy128BytesFromArray: DefaultJob +// CopyBenchmarks.CopyBlock128BytesFromArray: DefaultJob +// 
CopyBenchmarks.Copy2048BytesFromArray: DefaultJob +// CopyBenchmarks.CopyBlock2048BytesFromArray: DefaultJob +// CopyBenchmarks.Copy4194304BytesFromArray: DefaultJob +// CopyBenchmarks.CopyBlock4194304BytesFromArray: DefaultJob + +// ************************** +// Benchmark: CopyBenchmarks.Copy64BytesToArray: DefaultJob +// *** Execute *** +// Launch: 1 / 1 +// Execute: dotnet "1a7668ac-c1b0-4053-8025-eef54642afd3.dll" --benchmarkName "Platform.Unsafe.Benchmarks.CopyBenchmarks.Copy64BytesToArray" --job "Default" --benchmarkId 0 in /tmp/gh-issue-solver-1757845403022/csharp/Platform.Unsafe.Benchmarks/bin/Release/net8/1a7668ac-c1b0-4053-8025-eef54642afd3/bin/Release/net8.0 +Failed to set up high priority. Make sure you have the right permissions. Message: Permission denied +// BeforeAnythingElse + +// Benchmark Process Environment Information: +// Runtime=.NET 8.0.19 (8.0.1925.36514), X64 RyuJIT +// GC=Concurrent Workstation +// Job: DefaultJob + +OverheadJitting 1: 1 op, 824640.00 ns, 824.6400 us/op +WorkloadJitting 1: 1 op, 534939.00 ns, 534.9390 us/op + +OverheadJitting 2: 16 op, 709326.00 ns, 44.3329 us/op +WorkloadJitting 2: 16 op, 829904.00 ns, 51.8690 us/op + +WorkloadPilot 1: 16 op, 5092.00 ns, 318.2500 ns/op +WorkloadPilot 2: 32 op, 5382.00 ns, 168.1875 ns/op +WorkloadPilot 3: 64 op, 10208.00 ns, 159.5000 ns/op +WorkloadPilot 4: 128 op, 15572.00 ns, 121.6562 ns/op +WorkloadPilot 5: 256 op, 17420.00 ns, 68.0469 ns/op +WorkloadPilot 6: 512 op, 40222.00 ns, 78.5586 ns/op +WorkloadPilot 7: 1024 op, 61648.00 ns, 60.2031 ns/op +WorkloadPilot 8: 2048 op, 131328.00 ns, 64.1250 ns/op +WorkloadPilot 9: 4096 op, 289578.00 ns, 70.6978 ns/op +WorkloadPilot 10: 8192 op, 513887.00 ns, 62.7303 ns/op +WorkloadPilot 11: 16384 op, 962004.00 ns, 58.7161 ns/op +WorkloadPilot 12: 32768 op, 1996784.00 ns, 60.9370 ns/op +WorkloadPilot 13: 65536 op, 3974244.00 ns, 60.6422 ns/op +WorkloadPilot 14: 131072 op, 8172258.00 ns, 62.3494 ns/op +WorkloadPilot 15: 262144 op, 2252230.00 ns, 
8.5916 ns/op +WorkloadPilot 16: 524288 op, 3719045.00 ns, 7.0935 ns/op +WorkloadPilot 17: 1048576 op, 7225726.00 ns, 6.8910 ns/op +WorkloadPilot 18: 2097152 op, 14494823.00 ns, 6.9117 ns/op +WorkloadPilot 19: 4194304 op, 29837201.00 ns, 7.1137 ns/op +WorkloadPilot 20: 8388608 op, 76216547.00 ns, 9.0857 ns/op +WorkloadPilot 21: 16777216 op, 110339291.00 ns, 6.5767 ns/op +WorkloadPilot 22: 33554432 op, 316552023.00 ns, 9.4340 ns/op +WorkloadPilot 23: 67108864 op, 747306367.00 ns, 11.1357 ns/op + +OverheadWarmup 1: 67108864 op, 27290846.00 ns, 0.4067 ns/op +OverheadWarmup 2: 67108864 op, 8823854.00 ns, 0.1315 ns/op +OverheadWarmup 3: 67108864 op, 10491165.00 ns, 0.1563 ns/op +OverheadWarmup 4: 67108864 op, 9798353.00 ns, 0.1460 ns/op +OverheadWarmup 5: 67108864 op, 8364774.00 ns, 0.1246 ns/op +OverheadWarmup 6: 67108864 op, 14354336.00 ns, 0.2139 ns/op +OverheadWarmup 7: 67108864 op, 12040963.00 ns, 0.1794 ns/op + +OverheadActual 1: 67108864 op, 9139252.00 ns, 0.1362 ns/op +OverheadActual 2: 67108864 op, 4446391.00 ns, 0.0663 ns/op +OverheadActual 3: 67108864 op, 15881596.00 ns, 0.2367 ns/op +OverheadActual 4: 67108864 op, 4348711.00 ns, 0.0648 ns/op +OverheadActual 5: 67108864 op, 4221492.00 ns, 0.0629 ns/op +OverheadActual 6: 67108864 op, 7951085.00 ns, 0.1185 ns/op +OverheadActual 7: 67108864 op, 4973265.00 ns, 0.0741 ns/op +OverheadActual 8: 67108864 op, 4109778.00 ns, 0.0612 ns/op +OverheadActual 9: 67108864 op, 4748824.00 ns, 0.0708 ns/op +OverheadActual 10: 67108864 op, 12124646.00 ns, 0.1807 ns/op +OverheadActual 11: 67108864 op, 12520540.00 ns, 0.1866 ns/op +OverheadActual 12: 67108864 op, 11761689.00 ns, 0.1753 ns/op +OverheadActual 13: 67108864 op, 4820695.00 ns, 0.0718 ns/op +OverheadActual 14: 67108864 op, 4446091.00 ns, 0.0663 ns/op +OverheadActual 15: 67108864 op, 5485453.00 ns, 0.0817 ns/op +OverheadActual 16: 67108864 op, 6402223.00 ns, 0.0954 ns/op +OverheadActual 17: 67108864 op, 7180094.00 ns, 0.1070 ns/op +OverheadActual 18: 67108864 op, 
9084175.00 ns, 0.1354 ns/op +OverheadActual 19: 67108864 op, 13047070.00 ns, 0.1944 ns/op +OverheadActual 20: 67108864 op, 10276790.00 ns, 0.1531 ns/op + +WorkloadWarmup 1: 67108864 op, 729291421.00 ns, 10.8673 ns/op +WorkloadWarmup 2: 67108864 op, 602891364.00 ns, 8.9838 ns/op +WorkloadWarmup 3: 67108864 op, 463108233.00 ns, 6.9009 ns/op +WorkloadWarmup 4: 67108864 op, 465064462.00 ns, 6.9300 ns/op +WorkloadWarmup 5: 67108864 op, 627046213.00 ns, 9.3437 ns/op +WorkloadWarmup 6: 67108864 op, 1108693750.00 ns, 16.5208 ns/op +WorkloadWarmup 7: 67108864 op, 764560383.00 ns, 11.3928 ns/op +WorkloadWarmup 8: 67108864 op, 690774003.00 ns, 10.2933 ns/op +WorkloadWarmup 9: 67108864 op, 724270006.00 ns, 10.7925 ns/op +WorkloadWarmup 10: 67108864 op, 998163149.00 ns, 14.8738 ns/op +WorkloadWarmup 11: 67108864 op, 426795332.00 ns, 6.3597 ns/op + +// BeforeActualRun +WorkloadActual 1: 67108864 op, 752043497.00 ns, 11.2063 ns/op +WorkloadActual 2: 67108864 op, 729753282.00 ns, 10.8742 ns/op +WorkloadActual 3: 67108864 op, 427849773.00 ns, 6.3755 ns/op +WorkloadActual 4: 67108864 op, 426757032.00 ns, 6.3592 ns/op +WorkloadActual 5: 67108864 op, 411741487.00 ns, 6.1354 ns/op +WorkloadActual 6: 67108864 op, 460995283.00 ns, 6.8694 ns/op +WorkloadActual 7: 67108864 op, 575197773.00 ns, 8.5711 ns/op +WorkloadActual 8: 67108864 op, 764862252.00 ns, 11.3973 ns/op +WorkloadActual 9: 67108864 op, 750686684.00 ns, 11.1861 ns/op +WorkloadActual 10: 67108864 op, 505660423.00 ns, 7.5349 ns/op +WorkloadActual 11: 67108864 op, 447976712.00 ns, 6.6754 ns/op +WorkloadActual 12: 67108864 op, 524847949.00 ns, 7.8208 ns/op +WorkloadActual 13: 67108864 op, 532157789.00 ns, 7.9298 ns/op +WorkloadActual 14: 67108864 op, 491024911.00 ns, 7.3168 ns/op +WorkloadActual 15: 67108864 op, 452160282.00 ns, 6.7377 ns/op +WorkloadActual 16: 67108864 op, 523743691.00 ns, 7.8044 ns/op +WorkloadActual 17: 67108864 op, 532119864.00 ns, 7.9292 ns/op +WorkloadActual 18: 67108864 op, 408188993.00 ns, 6.0825 ns/op 
+WorkloadActual 19: 67108864 op, 922846467.00 ns, 13.7515 ns/op +WorkloadActual 20: 67108864 op, 534649438.00 ns, 7.9669 ns/op +WorkloadActual 21: 67108864 op, 459735747.00 ns, 6.8506 ns/op +WorkloadActual 22: 67108864 op, 344530589.00 ns, 5.1339 ns/op +WorkloadActual 23: 67108864 op, 680434969.00 ns, 10.1393 ns/op +WorkloadActual 24: 67108864 op, 861038207.00 ns, 12.8305 ns/op +WorkloadActual 25: 67108864 op, 589876740.00 ns, 8.7898 ns/op +WorkloadActual 26: 67108864 op, 666901386.00 ns, 9.9376 ns/op +WorkloadActual 27: 67108864 op, 960256099.00 ns, 14.3089 ns/op +WorkloadActual 28: 67108864 op, 1682999475.00 ns, 25.0786 ns/op +WorkloadActual 29: 67108864 op, 748893078.00 ns, 11.1594 ns/op +WorkloadActual 30: 67108864 op, 799334000.00 ns, 11.9110 ns/op +WorkloadActual 31: 67108864 op, 565257128.00 ns, 8.4230 ns/op +WorkloadActual 32: 67108864 op, 948673921.00 ns, 14.1363 ns/op +WorkloadActual 33: 67108864 op, 1421580596.00 ns, 21.1832 ns/op +WorkloadActual 34: 67108864 op, 1016177650.00 ns, 15.1422 ns/op +WorkloadActual 35: 67108864 op, 645003087.00 ns, 9.6113 ns/op +WorkloadActual 36: 67108864 op, 798629227.00 ns, 11.9005 ns/op +WorkloadActual 37: 67108864 op, 461667730.00 ns, 6.8794 ns/op +WorkloadActual 38: 67108864 op, 548035195.00 ns, 8.1664 ns/op +WorkloadActual 39: 67108864 op, 826680616.00 ns, 12.3185 ns/op +WorkloadActual 40: 67108864 op, 977064366.00 ns, 14.5594 ns/op +WorkloadActual 41: 67108864 op, 1102388733.00 ns, 16.4269 ns/op +WorkloadActual 42: 67108864 op, 977091246.00 ns, 14.5598 ns/op +WorkloadActual 43: 67108864 op, 750820281.00 ns, 11.1881 ns/op +WorkloadActual 44: 67108864 op, 888308302.00 ns, 13.2368 ns/op +WorkloadActual 45: 67108864 op, 1048772787.00 ns, 15.6279 ns/op +WorkloadActual 46: 67108864 op, 1344825371.00 ns, 20.0395 ns/op +WorkloadActual 47: 67108864 op, 870156595.00 ns, 12.9663 ns/op +WorkloadActual 48: 67108864 op, 1350750036.00 ns, 20.1277 ns/op +WorkloadActual 49: 67108864 op, 900166397.00 ns, 13.4135 ns/op +WorkloadActual 
50: 67108864 op, 893309702.00 ns, 13.3114 ns/op +WorkloadActual 51: 67108864 op, 852751073.00 ns, 12.7070 ns/op +WorkloadActual 52: 67108864 op, 1192131283.00 ns, 17.7641 ns/op +WorkloadActual 53: 67108864 op, 1009325216.00 ns, 15.0401 ns/op +WorkloadActual 54: 67108864 op, 1389997569.00 ns, 20.7126 ns/op +WorkloadActual 55: 67108864 op, 1656972148.00 ns, 24.6908 ns/op +WorkloadActual 56: 67108864 op, 1310702992.00 ns, 19.5310 ns/op +WorkloadActual 57: 67108864 op, 1789182466.00 ns, 26.6609 ns/op +WorkloadActual 58: 67108864 op, 1929570603.00 ns, 28.7528 ns/op +WorkloadActual 59: 67108864 op, 881608166.00 ns, 13.1370 ns/op +WorkloadActual 60: 67108864 op, 676398620.00 ns, 10.0791 ns/op +WorkloadActual 61: 67108864 op, 700576802.00 ns, 10.4394 ns/op +WorkloadActual 62: 67108864 op, 1025551310.00 ns, 15.2819 ns/op +WorkloadActual 63: 67108864 op, 946860780.00 ns, 14.1093 ns/op +WorkloadActual 64: 67108864 op, 821287086.00 ns, 12.2381 ns/op +WorkloadActual 65: 67108864 op, 991613521.00 ns, 14.7762 ns/op +WorkloadActual 66: 67108864 op, 1863003703.00 ns, 27.7609 ns/op +WorkloadActual 67: 67108864 op, 1432123485.00 ns, 21.3403 ns/op +WorkloadActual 68: 67108864 op, 1287946554.00 ns, 19.1919 ns/op +WorkloadActual 69: 67108864 op, 1095988941.00 ns, 16.3315 ns/op +WorkloadActual 70: 67108864 op, 718791421.00 ns, 10.7108 ns/op +WorkloadActual 71: 67108864 op, 743223535.00 ns, 11.0749 ns/op +WorkloadActual 72: 67108864 op, 748546791.00 ns, 11.1542 ns/op +WorkloadActual 73: 67108864 op, 715628222.00 ns, 10.6637 ns/op +WorkloadActual 74: 67108864 op, 769616181.00 ns, 11.4682 ns/op +WorkloadActual 75: 67108864 op, 916233027.00 ns, 13.6529 ns/op +WorkloadActual 76: 67108864 op, 730386554.00 ns, 10.8836 ns/op +WorkloadActual 77: 67108864 op, 816408637.00 ns, 12.1654 ns/op +WorkloadActual 78: 67108864 op, 793252646.00 ns, 11.8204 ns/op +WorkloadActual 79: 67108864 op, 713669999.00 ns, 10.6345 ns/op +WorkloadActual 80: 67108864 op, 890962513.00 ns, 13.2764 ns/op +WorkloadActual 81: 
67108864 op, 1006223982.00 ns, 14.9939 ns/op +WorkloadActual 82: 67108864 op, 1146847458.00 ns, 17.0894 ns/op +WorkloadActual 83: 67108864 op, 875981781.00 ns, 13.0531 ns/op +WorkloadActual 84: 67108864 op, 1006226309.00 ns, 14.9939 ns/op +WorkloadActual 85: 67108864 op, 806859046.00 ns, 12.0231 ns/op +WorkloadActual 86: 67108864 op, 1131140285.00 ns, 16.8553 ns/op +WorkloadActual 87: 67108864 op, 794726307.00 ns, 11.8423 ns/op +WorkloadActual 88: 67108864 op, 826394513.00 ns, 12.3142 ns/op +WorkloadActual 89: 67108864 op, 804663315.00 ns, 11.9904 ns/op +WorkloadActual 90: 67108864 op, 977451153.00 ns, 14.5652 ns/op +WorkloadActual 91: 67108864 op, 970291061.00 ns, 14.4585 ns/op +WorkloadActual 92: 67108864 op, 919989858.00 ns, 13.7089 ns/op +WorkloadActual 93: 67108864 op, 1045999230.00 ns, 15.5866 ns/op +WorkloadActual 94: 67108864 op, 917403114.00 ns, 13.6704 ns/op +WorkloadActual 95: 67108864 op, 1126767003.00 ns, 16.7901 ns/op +WorkloadActual 96: 67108864 op, 1112703216.00 ns, 16.5806 ns/op +WorkloadActual 97: 67108864 op, 1195220703.00 ns, 17.8102 ns/op +WorkloadActual 98: 67108864 op, 1087698743.00 ns, 16.2080 ns/op +WorkloadActual 99: 67108864 op, 1164798398.00 ns, 17.3568 ns/op +WorkloadActual 100: 67108864 op, 1349057358.00 ns, 20.1025 ns/op + +// AfterActualRun +WorkloadResult 1: 67108864 op, 745252338.50 ns, 11.1051 ns/op +WorkloadResult 2: 67108864 op, 722962123.50 ns, 10.7730 ns/op +WorkloadResult 3: 67108864 op, 421058614.50 ns, 6.2743 ns/op +WorkloadResult 4: 67108864 op, 419965873.50 ns, 6.2580 ns/op +WorkloadResult 5: 67108864 op, 404950328.50 ns, 6.0342 ns/op +WorkloadResult 6: 67108864 op, 454204124.50 ns, 6.7682 ns/op +WorkloadResult 7: 67108864 op, 568406614.50 ns, 8.4699 ns/op +WorkloadResult 8: 67108864 op, 758071093.50 ns, 11.2961 ns/op +WorkloadResult 9: 67108864 op, 743895525.50 ns, 11.0849 ns/op +WorkloadResult 10: 67108864 op, 498869264.50 ns, 7.4337 ns/op +WorkloadResult 11: 67108864 op, 441185553.50 ns, 6.5742 ns/op +WorkloadResult 
12: 67108864 op, 518056790.50 ns, 7.7196 ns/op +WorkloadResult 13: 67108864 op, 525366630.50 ns, 7.8286 ns/op +WorkloadResult 14: 67108864 op, 484233752.50 ns, 7.2156 ns/op +WorkloadResult 15: 67108864 op, 445369123.50 ns, 6.6365 ns/op +WorkloadResult 16: 67108864 op, 516952532.50 ns, 7.7032 ns/op +WorkloadResult 17: 67108864 op, 525328705.50 ns, 7.8280 ns/op +WorkloadResult 18: 67108864 op, 401397834.50 ns, 5.9813 ns/op +WorkloadResult 19: 67108864 op, 916055308.50 ns, 13.6503 ns/op +WorkloadResult 20: 67108864 op, 527858279.50 ns, 7.8657 ns/op +WorkloadResult 21: 67108864 op, 452944588.50 ns, 6.7494 ns/op +WorkloadResult 22: 67108864 op, 337739430.50 ns, 5.0327 ns/op +WorkloadResult 23: 67108864 op, 673643810.50 ns, 10.0381 ns/op +WorkloadResult 24: 67108864 op, 854247048.50 ns, 12.7293 ns/op +WorkloadResult 25: 67108864 op, 583085581.50 ns, 8.6887 ns/op +WorkloadResult 26: 67108864 op, 660110227.50 ns, 9.8364 ns/op +WorkloadResult 27: 67108864 op, 953464940.50 ns, 14.2077 ns/op +WorkloadResult 28: 67108864 op, 742101919.50 ns, 11.0582 ns/op +WorkloadResult 29: 67108864 op, 792542841.50 ns, 11.8098 ns/op +WorkloadResult 30: 67108864 op, 558465969.50 ns, 8.3218 ns/op +WorkloadResult 31: 67108864 op, 941882762.50 ns, 14.0351 ns/op +WorkloadResult 32: 67108864 op, 1414789437.50 ns, 21.0820 ns/op +WorkloadResult 33: 67108864 op, 1009386491.50 ns, 15.0410 ns/op +WorkloadResult 34: 67108864 op, 638211928.50 ns, 9.5101 ns/op +WorkloadResult 35: 67108864 op, 791838068.50 ns, 11.7993 ns/op +WorkloadResult 36: 67108864 op, 454876571.50 ns, 6.7782 ns/op +WorkloadResult 37: 67108864 op, 541244036.50 ns, 8.0652 ns/op +WorkloadResult 38: 67108864 op, 819889457.50 ns, 12.2173 ns/op +WorkloadResult 39: 67108864 op, 970273207.50 ns, 14.4582 ns/op +WorkloadResult 40: 67108864 op, 1095597574.50 ns, 16.3257 ns/op +WorkloadResult 41: 67108864 op, 970300087.50 ns, 14.4586 ns/op +WorkloadResult 42: 67108864 op, 744029122.50 ns, 11.0869 ns/op +WorkloadResult 43: 67108864 op, 
881517143.50 ns, 13.1356 ns/op +WorkloadResult 44: 67108864 op, 1041981628.50 ns, 15.5267 ns/op +WorkloadResult 45: 67108864 op, 1338034212.50 ns, 19.9383 ns/op +WorkloadResult 46: 67108864 op, 863365436.50 ns, 12.8651 ns/op +WorkloadResult 47: 67108864 op, 1343958877.50 ns, 20.0265 ns/op +WorkloadResult 48: 67108864 op, 893375238.50 ns, 13.3123 ns/op +WorkloadResult 49: 67108864 op, 886518543.50 ns, 13.2102 ns/op +WorkloadResult 50: 67108864 op, 845959914.50 ns, 12.6058 ns/op +WorkloadResult 51: 67108864 op, 1185340124.50 ns, 17.6629 ns/op +WorkloadResult 52: 67108864 op, 1002534057.50 ns, 14.9389 ns/op +WorkloadResult 53: 67108864 op, 1383206410.50 ns, 20.6114 ns/op +WorkloadResult 54: 67108864 op, 1303911833.50 ns, 19.4298 ns/op +WorkloadResult 55: 67108864 op, 874817007.50 ns, 13.0358 ns/op +WorkloadResult 56: 67108864 op, 669607461.50 ns, 9.9779 ns/op +WorkloadResult 57: 67108864 op, 693785643.50 ns, 10.3382 ns/op +WorkloadResult 58: 67108864 op, 1018760151.50 ns, 15.1807 ns/op +WorkloadResult 59: 67108864 op, 940069621.50 ns, 14.0081 ns/op +WorkloadResult 60: 67108864 op, 814495927.50 ns, 12.1369 ns/op +WorkloadResult 61: 67108864 op, 984822362.50 ns, 14.6750 ns/op +WorkloadResult 62: 67108864 op, 1425332326.50 ns, 21.2391 ns/op +WorkloadResult 63: 67108864 op, 1281155395.50 ns, 19.0907 ns/op +WorkloadResult 64: 67108864 op, 1089197782.50 ns, 16.2303 ns/op +WorkloadResult 65: 67108864 op, 712000262.50 ns, 10.6096 ns/op +WorkloadResult 66: 67108864 op, 736432376.50 ns, 10.9737 ns/op +WorkloadResult 67: 67108864 op, 741755632.50 ns, 11.0530 ns/op +WorkloadResult 68: 67108864 op, 708837063.50 ns, 10.5625 ns/op +WorkloadResult 69: 67108864 op, 762825022.50 ns, 11.3670 ns/op +WorkloadResult 70: 67108864 op, 909441868.50 ns, 13.5517 ns/op +WorkloadResult 71: 67108864 op, 723595395.50 ns, 10.7824 ns/op +WorkloadResult 72: 67108864 op, 809617478.50 ns, 12.0642 ns/op +WorkloadResult 73: 67108864 op, 786461487.50 ns, 11.7192 ns/op +WorkloadResult 74: 67108864 op, 
706878840.50 ns, 10.5333 ns/op +WorkloadResult 75: 67108864 op, 884171354.50 ns, 13.1752 ns/op +WorkloadResult 76: 67108864 op, 999432823.50 ns, 14.8927 ns/op +WorkloadResult 77: 67108864 op, 1140056299.50 ns, 16.9882 ns/op +WorkloadResult 78: 67108864 op, 869190622.50 ns, 12.9519 ns/op +WorkloadResult 79: 67108864 op, 999435150.50 ns, 14.8927 ns/op +WorkloadResult 80: 67108864 op, 800067887.50 ns, 11.9219 ns/op +WorkloadResult 81: 67108864 op, 1124349126.50 ns, 16.7541 ns/op +WorkloadResult 82: 67108864 op, 787935148.50 ns, 11.7411 ns/op +WorkloadResult 83: 67108864 op, 819603354.50 ns, 12.2130 ns/op +WorkloadResult 84: 67108864 op, 797872156.50 ns, 11.8892 ns/op +WorkloadResult 85: 67108864 op, 970659994.50 ns, 14.4640 ns/op +WorkloadResult 86: 67108864 op, 963499902.50 ns, 14.3573 ns/op +WorkloadResult 87: 67108864 op, 913198699.50 ns, 13.6077 ns/op +WorkloadResult 88: 67108864 op, 1039208071.50 ns, 15.4854 ns/op +WorkloadResult 89: 67108864 op, 910611955.50 ns, 13.5692 ns/op +WorkloadResult 90: 67108864 op, 1119975844.50 ns, 16.6889 ns/op +WorkloadResult 91: 67108864 op, 1105912057.50 ns, 16.4794 ns/op +WorkloadResult 92: 67108864 op, 1188429544.50 ns, 17.7090 ns/op +WorkloadResult 93: 67108864 op, 1080907584.50 ns, 16.1068 ns/op +WorkloadResult 94: 67108864 op, 1158007239.50 ns, 17.2557 ns/op +WorkloadResult 95: 67108864 op, 1342266199.50 ns, 20.0013 ns/op +GC: 0 0 0 672 67108864 +Threading: 0 0 67108864 + +// AfterAll +// Benchmark Process 1348070 has exited with code 0. 
+ +Mean = 12.457 ns, StdErr = 0.401 ns (3.22%), N = 95, StdDev = 3.909 ns +Min = 5.033 ns, Q1 = 10.008 ns, Median = 12.213 ns, Q3 = 14.893 ns, Max = 21.239 ns +IQR = 4.885 ns, LowerFence = 2.681 ns, UpperFence = 22.220 ns +ConfidenceInterval = [11.094 ns; 13.819 ns] (CI 99.9%), Margin = 1.362 ns (10.94% of Mean) +Skewness = 0.23, Kurtosis = 2.46, MValue = 3.23 + +// ************************** +// Benchmark: CopyBenchmarks.CopyBlock64BytesToArray: DefaultJob +// *** Execute *** +// Launch: 1 / 1 +// Execute: dotnet "1a7668ac-c1b0-4053-8025-eef54642afd3.dll" --benchmarkName "Platform.Unsafe.Benchmarks.CopyBenchmarks.CopyBlock64BytesToArray" --job "Default" --benchmarkId 1 in /tmp/gh-issue-solver-1757845403022/csharp/Platform.Unsafe.Benchmarks/bin/Release/net8/1a7668ac-c1b0-4053-8025-eef54642afd3/bin/Release/net8.0 +Failed to set up high priority. Make sure you have the right permissions. Message: Permission denied +// BeforeAnythingElse + +// Benchmark Process Environment Information: +// Runtime=.NET 8.0.19 (8.0.1925.36514), X64 RyuJIT +// GC=Concurrent Workstation +// Job: DefaultJob + +OverheadJitting 1: 1 op, 4751552.00 ns, 4.7516 ms/op +WorkloadJitting 1: 1 op, 4419279.00 ns, 4.4193 ms/op + +OverheadJitting 2: 16 op, 2322433.00 ns, 145.1521 us/op +WorkloadJitting 2: 16 op, 462312.00 ns, 28.8945 us/op + +WorkloadPilot 1: 16 op, 6106.00 ns, 381.6250 ns/op +WorkloadPilot 2: 32 op, 5825.00 ns, 182.0312 ns/op +WorkloadPilot 3: 64 op, 7890.00 ns, 123.2813 ns/op +WorkloadPilot 4: 128 op, 10949.00 ns, 85.5391 ns/op +WorkloadPilot 5: 256 op, 41673.00 ns, 162.7852 ns/op +WorkloadPilot 6: 512 op, 26857.00 ns, 52.4551 ns/op +WorkloadPilot 7: 1024 op, 50455.00 ns, 49.2725 ns/op +WorkloadPilot 8: 2048 op, 113424.00 ns, 55.3828 ns/op +WorkloadPilot 9: 4096 op, 207096.00 ns, 50.5605 ns/op +WorkloadPilot 10: 8192 op, 414042.00 ns, 50.5422 ns/op +WorkloadPilot 11: 16384 op, 767936.00 ns, 46.8711 ns/op +WorkloadPilot 12: 32768 op, 12500593.00 ns, 381.4878 ns/op +WorkloadPilot 13: 
65536 op, 27222414.00 ns, 415.3811 ns/op +WorkloadPilot 14: 131072 op, 30490014.00 ns, 232.6203 ns/op +WorkloadPilot 15: 262144 op, 6349202.00 ns, 24.2203 ns/op +WorkloadPilot 16: 524288 op, 10620151.00 ns, 20.2563 ns/op +WorkloadPilot 17: 1048576 op, 18139144.00 ns, 17.2988 ns/op +WorkloadPilot 18: 2097152 op, 44796916.00 ns, 21.3608 ns/op +WorkloadPilot 19: 4194304 op, 81454672.00 ns, 19.4203 ns/op +WorkloadPilot 20: 8388608 op, 268023059.00 ns, 31.9508 ns/op +WorkloadPilot 21: 16777216 op, 518355359.00 ns, 30.8964 ns/op + +OverheadWarmup 1: 16777216 op, 35134822.00 ns, 2.0942 ns/op +OverheadWarmup 2: 16777216 op, 4170506.00 ns, 0.2486 ns/op +OverheadWarmup 3: 16777216 op, 4412352.00 ns, 0.2630 ns/op +OverheadWarmup 4: 16777216 op, 5253151.00 ns, 0.3131 ns/op +OverheadWarmup 5: 16777216 op, 4155875.00 ns, 0.2477 ns/op +OverheadWarmup 6: 16777216 op, 4194052.00 ns, 0.2500 ns/op +OverheadWarmup 7: 16777216 op, 3200043.00 ns, 0.1907 ns/op + +OverheadActual 1: 16777216 op, 6247695.00 ns, 0.3724 ns/op +OverheadActual 2: 16777216 op, 4176842.00 ns, 0.2490 ns/op +OverheadActual 3: 16777216 op, 4198379.00 ns, 0.2502 ns/op +OverheadActual 4: 16777216 op, 5291935.00 ns, 0.3154 ns/op +OverheadActual 5: 16777216 op, 2171848.00 ns, 0.1295 ns/op +OverheadActual 6: 16777216 op, 5231024.00 ns, 0.3118 ns/op +OverheadActual 7: 16777216 op, 8165478.00 ns, 0.4867 ns/op +OverheadActual 8: 16777216 op, 4065201.00 ns, 0.2423 ns/op +OverheadActual 9: 16777216 op, 4102808.00 ns, 0.2445 ns/op +OverheadActual 10: 16777216 op, 5795890.00 ns, 0.3455 ns/op +OverheadActual 11: 16777216 op, 4234291.00 ns, 0.2524 ns/op +OverheadActual 12: 16777216 op, 8872629.00 ns, 0.5288 ns/op +OverheadActual 13: 16777216 op, 5409574.00 ns, 0.3224 ns/op +OverheadActual 14: 16777216 op, 8970705.00 ns, 0.5347 ns/op +OverheadActual 15: 16777216 op, 1108443.00 ns, 0.0661 ns/op +OverheadActual 16: 16777216 op, 3113793.00 ns, 0.1856 ns/op +OverheadActual 17: 16777216 op, 1189666.00 ns, 0.0709 ns/op +OverheadActual 
18: 16777216 op, 3151388.00 ns, 0.1878 ns/op +OverheadActual 19: 16777216 op, 3116373.00 ns, 0.1858 ns/op +OverheadActual 20: 16777216 op, 2196478.00 ns, 0.1309 ns/op + +WorkloadWarmup 1: 16777216 op, 304755005.00 ns, 18.1648 ns/op +WorkloadWarmup 2: 16777216 op, 178624593.00 ns, 10.6469 ns/op +WorkloadWarmup 3: 16777216 op, 188549747.00 ns, 11.2384 ns/op +WorkloadWarmup 4: 16777216 op, 252677044.00 ns, 15.0607 ns/op +WorkloadWarmup 5: 16777216 op, 160048389.00 ns, 9.5396 ns/op +WorkloadWarmup 6: 16777216 op, 182790851.00 ns, 10.8952 ns/op +WorkloadWarmup 7: 16777216 op, 200612322.00 ns, 11.9574 ns/op +WorkloadWarmup 8: 16777216 op, 334496126.00 ns, 19.9375 ns/op +WorkloadWarmup 9: 16777216 op, 327636328.00 ns, 19.5286 ns/op + +// BeforeActualRun +WorkloadActual 1: 16777216 op, 230166163.00 ns, 13.7190 ns/op +WorkloadActual 2: 16777216 op, 208156701.00 ns, 12.4071 ns/op +WorkloadActual 3: 16777216 op, 230396687.00 ns, 13.7327 ns/op +WorkloadActual 4: 16777216 op, 238390322.00 ns, 14.2092 ns/op +WorkloadActual 5: 16777216 op, 438207609.00 ns, 26.1192 ns/op +WorkloadActual 6: 16777216 op, 321455019.00 ns, 19.1602 ns/op +WorkloadActual 7: 16777216 op, 497893277.00 ns, 29.6768 ns/op +WorkloadActual 8: 16777216 op, 332380569.00 ns, 19.8114 ns/op +WorkloadActual 9: 16777216 op, 614371339.00 ns, 36.6194 ns/op +WorkloadActual 10: 16777216 op, 615681803.00 ns, 36.6975 ns/op +WorkloadActual 11: 16777216 op, 569704895.00 ns, 33.9571 ns/op +WorkloadActual 12: 16777216 op, 500679977.00 ns, 29.8429 ns/op +WorkloadActual 13: 16777216 op, 519731784.00 ns, 30.9784 ns/op +WorkloadActual 14: 16777216 op, 558980158.00 ns, 33.3178 ns/op +WorkloadActual 15: 16777216 op, 509070477.00 ns, 30.3430 ns/op +WorkloadActual 16: 16777216 op, 552822597.00 ns, 32.9508 ns/op +WorkloadActual 17: 16777216 op, 362093549.00 ns, 21.5825 ns/op +WorkloadActual 18: 16777216 op, 252703208.00 ns, 15.0623 ns/op +WorkloadActual 19: 16777216 op, 194795516.00 ns, 11.6107 ns/op +WorkloadActual 20: 16777216 op, 
190907631.00 ns, 11.3790 ns/op +WorkloadActual 21: 16777216 op, 174765573.00 ns, 10.4168 ns/op +WorkloadActual 22: 16777216 op, 185072319.00 ns, 11.0312 ns/op +WorkloadActual 23: 16777216 op, 229103597.00 ns, 13.6556 ns/op +WorkloadActual 24: 16777216 op, 207127513.00 ns, 12.3458 ns/op +WorkloadActual 25: 16777216 op, 210927036.00 ns, 12.5722 ns/op +WorkloadActual 26: 16777216 op, 156564169.00 ns, 9.3320 ns/op +WorkloadActual 27: 16777216 op, 292500867.00 ns, 17.4344 ns/op +WorkloadActual 28: 16777216 op, 327998011.00 ns, 19.5502 ns/op +WorkloadActual 29: 16777216 op, 172046613.00 ns, 10.2548 ns/op +WorkloadActual 30: 16777216 op, 206183979.00 ns, 12.2895 ns/op +WorkloadActual 31: 16777216 op, 159523869.00 ns, 9.5084 ns/op +WorkloadActual 32: 16777216 op, 162478875.00 ns, 9.6845 ns/op +WorkloadActual 33: 16777216 op, 185194686.00 ns, 11.0385 ns/op +WorkloadActual 34: 16777216 op, 235171554.00 ns, 14.0173 ns/op +WorkloadActual 35: 16777216 op, 311962133.00 ns, 18.5944 ns/op +WorkloadActual 36: 16777216 op, 219695847.00 ns, 13.0949 ns/op +WorkloadActual 37: 16777216 op, 263244786.00 ns, 15.6906 ns/op +WorkloadActual 38: 16777216 op, 188081227.00 ns, 11.2105 ns/op +WorkloadActual 39: 16777216 op, 281812573.00 ns, 16.7973 ns/op +WorkloadActual 40: 16777216 op, 214462502.00 ns, 12.7830 ns/op +WorkloadActual 41: 16777216 op, 176945895.00 ns, 10.5468 ns/op +WorkloadActual 42: 16777216 op, 190974991.00 ns, 11.3830 ns/op +WorkloadActual 43: 16777216 op, 169769341.00 ns, 10.1190 ns/op +WorkloadActual 44: 16777216 op, 180360116.00 ns, 10.7503 ns/op +WorkloadActual 45: 16777216 op, 190300308.00 ns, 11.3428 ns/op +WorkloadActual 46: 16777216 op, 179218767.00 ns, 10.6823 ns/op +WorkloadActual 47: 16777216 op, 181600705.00 ns, 10.8242 ns/op +WorkloadActual 48: 16777216 op, 191975442.00 ns, 11.4426 ns/op +WorkloadActual 49: 16777216 op, 163435317.00 ns, 9.7415 ns/op +WorkloadActual 50: 16777216 op, 168505938.00 ns, 10.0437 ns/op +WorkloadActual 51: 16777216 op, 307592669.00 ns, 
18.3340 ns/op +WorkloadActual 52: 16777216 op, 308202342.00 ns, 18.3703 ns/op +WorkloadActual 53: 16777216 op, 214934308.00 ns, 12.8111 ns/op +WorkloadActual 54: 16777216 op, 202081406.00 ns, 12.0450 ns/op +WorkloadActual 55: 16777216 op, 161461721.00 ns, 9.6239 ns/op +WorkloadActual 56: 16777216 op, 171953648.00 ns, 10.2492 ns/op +WorkloadActual 57: 16777216 op, 212922308.00 ns, 12.6912 ns/op +WorkloadActual 58: 16777216 op, 207528647.00 ns, 12.3697 ns/op +WorkloadActual 59: 16777216 op, 183736459.00 ns, 10.9515 ns/op +WorkloadActual 60: 16777216 op, 281900141.00 ns, 16.8026 ns/op +WorkloadActual 61: 16777216 op, 271490975.00 ns, 16.1821 ns/op +WorkloadActual 62: 16777216 op, 172980044.00 ns, 10.3104 ns/op +WorkloadActual 63: 16777216 op, 158014160.00 ns, 9.4184 ns/op +WorkloadActual 64: 16777216 op, 160614271.00 ns, 9.5734 ns/op +WorkloadActual 65: 16777216 op, 184858457.00 ns, 11.0184 ns/op +WorkloadActual 66: 16777216 op, 165762247.00 ns, 9.8802 ns/op +WorkloadActual 67: 16777216 op, 185784030.00 ns, 11.0736 ns/op +WorkloadActual 68: 16777216 op, 160796336.00 ns, 9.5842 ns/op +WorkloadActual 69: 16777216 op, 179487267.00 ns, 10.6983 ns/op +WorkloadActual 70: 16777216 op, 179644719.00 ns, 10.7077 ns/op +WorkloadActual 71: 16777216 op, 187790808.00 ns, 11.1932 ns/op +WorkloadActual 72: 16777216 op, 319767446.00 ns, 19.0596 ns/op +WorkloadActual 73: 16777216 op, 200441203.00 ns, 11.9472 ns/op +WorkloadActual 74: 16777216 op, 169245639.00 ns, 10.0878 ns/op +WorkloadActual 75: 16777216 op, 179952131.00 ns, 10.7260 ns/op +WorkloadActual 76: 16777216 op, 174059294.00 ns, 10.3747 ns/op +WorkloadActual 77: 16777216 op, 188955746.00 ns, 11.2626 ns/op +WorkloadActual 78: 16777216 op, 190808861.00 ns, 11.3731 ns/op +WorkloadActual 79: 16777216 op, 196237800.00 ns, 11.6967 ns/op +WorkloadActual 80: 16777216 op, 196310194.00 ns, 11.7010 ns/op +WorkloadActual 81: 16777216 op, 206722806.00 ns, 12.3216 ns/op +WorkloadActual 82: 16777216 op, 205941202.00 ns, 12.2751 ns/op 
+WorkloadActual 83: 16777216 op, 168663122.00 ns, 10.0531 ns/op +WorkloadActual 84: 16777216 op, 158878825.00 ns, 9.4699 ns/op +WorkloadActual 85: 16777216 op, 158348634.00 ns, 9.4383 ns/op +WorkloadActual 86: 16777216 op, 150543701.00 ns, 8.9731 ns/op +WorkloadActual 87: 16777216 op, 185659507.00 ns, 11.0662 ns/op +WorkloadActual 88: 16777216 op, 189369001.00 ns, 11.2873 ns/op +WorkloadActual 89: 16777216 op, 199623423.00 ns, 11.8985 ns/op +WorkloadActual 90: 16777216 op, 177584165.00 ns, 10.5848 ns/op +WorkloadActual 91: 16777216 op, 163944776.00 ns, 9.7719 ns/op +WorkloadActual 92: 16777216 op, 203126034.00 ns, 12.1073 ns/op +WorkloadActual 93: 16777216 op, 250893330.00 ns, 14.9544 ns/op +WorkloadActual 94: 16777216 op, 234553332.00 ns, 13.9805 ns/op +WorkloadActual 95: 16777216 op, 179352568.00 ns, 10.6902 ns/op +WorkloadActual 96: 16777216 op, 175916110.00 ns, 10.4854 ns/op +WorkloadActual 97: 16777216 op, 181244444.00 ns, 10.8030 ns/op +WorkloadActual 98: 16777216 op, 180124026.00 ns, 10.7362 ns/op +WorkloadActual 99: 16777216 op, 174302505.00 ns, 10.3892 ns/op +WorkloadActual 100: 16777216 op, 184283494.00 ns, 10.9842 ns/op + +// AfterActualRun +WorkloadResult 1: 16777216 op, 225978552.50 ns, 13.4694 ns/op +WorkloadResult 2: 16777216 op, 203969090.50 ns, 12.1575 ns/op +WorkloadResult 3: 16777216 op, 226209076.50 ns, 13.4831 ns/op +WorkloadResult 4: 16777216 op, 234202711.50 ns, 13.9596 ns/op +WorkloadResult 5: 16777216 op, 317267408.50 ns, 18.9106 ns/op +WorkloadResult 6: 16777216 op, 328192958.50 ns, 19.5618 ns/op +WorkloadResult 7: 16777216 op, 248515597.50 ns, 14.8127 ns/op +WorkloadResult 8: 16777216 op, 190607905.50 ns, 11.3611 ns/op +WorkloadResult 9: 16777216 op, 186720020.50 ns, 11.1294 ns/op +WorkloadResult 10: 16777216 op, 170577962.50 ns, 10.1672 ns/op +WorkloadResult 11: 16777216 op, 180884708.50 ns, 10.7816 ns/op +WorkloadResult 12: 16777216 op, 224915986.50 ns, 13.4060 ns/op +WorkloadResult 13: 16777216 op, 202939902.50 ns, 12.0962 ns/op 
+WorkloadResult 14: 16777216 op, 206739425.50 ns, 12.3226 ns/op +WorkloadResult 15: 16777216 op, 152376558.50 ns, 9.0824 ns/op +WorkloadResult 16: 16777216 op, 288313256.50 ns, 17.1848 ns/op +WorkloadResult 17: 16777216 op, 323810400.50 ns, 19.3006 ns/op +WorkloadResult 18: 16777216 op, 167859002.50 ns, 10.0052 ns/op +WorkloadResult 19: 16777216 op, 201996368.50 ns, 12.0399 ns/op +WorkloadResult 20: 16777216 op, 155336258.50 ns, 9.2588 ns/op +WorkloadResult 21: 16777216 op, 158291264.50 ns, 9.4349 ns/op +WorkloadResult 22: 16777216 op, 181007075.50 ns, 10.7889 ns/op +WorkloadResult 23: 16777216 op, 230983943.50 ns, 13.7677 ns/op +WorkloadResult 24: 16777216 op, 307774522.50 ns, 18.3448 ns/op +WorkloadResult 25: 16777216 op, 215508236.50 ns, 12.8453 ns/op +WorkloadResult 26: 16777216 op, 259057175.50 ns, 15.4410 ns/op +WorkloadResult 27: 16777216 op, 183893616.50 ns, 10.9609 ns/op +WorkloadResult 28: 16777216 op, 277624962.50 ns, 16.5477 ns/op +WorkloadResult 29: 16777216 op, 210274891.50 ns, 12.5334 ns/op +WorkloadResult 30: 16777216 op, 172758284.50 ns, 10.2972 ns/op +WorkloadResult 31: 16777216 op, 186787380.50 ns, 11.1334 ns/op +WorkloadResult 32: 16777216 op, 165581730.50 ns, 9.8694 ns/op +WorkloadResult 33: 16777216 op, 176172505.50 ns, 10.5007 ns/op +WorkloadResult 34: 16777216 op, 186112697.50 ns, 11.0932 ns/op +WorkloadResult 35: 16777216 op, 175031156.50 ns, 10.4327 ns/op +WorkloadResult 36: 16777216 op, 177413094.50 ns, 10.5746 ns/op +WorkloadResult 37: 16777216 op, 187787831.50 ns, 11.1930 ns/op +WorkloadResult 38: 16777216 op, 159247706.50 ns, 9.4919 ns/op +WorkloadResult 39: 16777216 op, 164318327.50 ns, 9.7941 ns/op +WorkloadResult 40: 16777216 op, 303405058.50 ns, 18.0844 ns/op +WorkloadResult 41: 16777216 op, 304014731.50 ns, 18.1207 ns/op +WorkloadResult 42: 16777216 op, 210746697.50 ns, 12.5615 ns/op +WorkloadResult 43: 16777216 op, 197893795.50 ns, 11.7954 ns/op +WorkloadResult 44: 16777216 op, 157274110.50 ns, 9.3743 ns/op +WorkloadResult 45: 
16777216 op, 167766037.50 ns, 9.9996 ns/op +WorkloadResult 46: 16777216 op, 208734697.50 ns, 12.4416 ns/op +WorkloadResult 47: 16777216 op, 203341036.50 ns, 12.1201 ns/op +WorkloadResult 48: 16777216 op, 179548848.50 ns, 10.7019 ns/op +WorkloadResult 49: 16777216 op, 277712530.50 ns, 16.5530 ns/op +WorkloadResult 50: 16777216 op, 267303364.50 ns, 15.9325 ns/op +WorkloadResult 51: 16777216 op, 168792433.50 ns, 10.0608 ns/op +WorkloadResult 52: 16777216 op, 153826549.50 ns, 9.1688 ns/op +WorkloadResult 53: 16777216 op, 156426660.50 ns, 9.3238 ns/op +WorkloadResult 54: 16777216 op, 180670846.50 ns, 10.7688 ns/op +WorkloadResult 55: 16777216 op, 161574636.50 ns, 9.6306 ns/op +WorkloadResult 56: 16777216 op, 181596419.50 ns, 10.8240 ns/op +WorkloadResult 57: 16777216 op, 156608725.50 ns, 9.3346 ns/op +WorkloadResult 58: 16777216 op, 175299656.50 ns, 10.4487 ns/op +WorkloadResult 59: 16777216 op, 175457108.50 ns, 10.4581 ns/op +WorkloadResult 60: 16777216 op, 183603197.50 ns, 10.9436 ns/op +WorkloadResult 61: 16777216 op, 315579835.50 ns, 18.8100 ns/op +WorkloadResult 62: 16777216 op, 196253592.50 ns, 11.6976 ns/op +WorkloadResult 63: 16777216 op, 165058028.50 ns, 9.8382 ns/op +WorkloadResult 64: 16777216 op, 175764520.50 ns, 10.4764 ns/op +WorkloadResult 65: 16777216 op, 169871683.50 ns, 10.1251 ns/op +WorkloadResult 66: 16777216 op, 184768135.50 ns, 11.0130 ns/op +WorkloadResult 67: 16777216 op, 186621250.50 ns, 11.1235 ns/op +WorkloadResult 68: 16777216 op, 192050189.50 ns, 11.4471 ns/op +WorkloadResult 69: 16777216 op, 192122583.50 ns, 11.4514 ns/op +WorkloadResult 70: 16777216 op, 202535195.50 ns, 12.0720 ns/op +WorkloadResult 71: 16777216 op, 201753591.50 ns, 12.0255 ns/op +WorkloadResult 72: 16777216 op, 164475511.50 ns, 9.8035 ns/op +WorkloadResult 73: 16777216 op, 154691214.50 ns, 9.2203 ns/op +WorkloadResult 74: 16777216 op, 154161023.50 ns, 9.1887 ns/op +WorkloadResult 75: 16777216 op, 146356090.50 ns, 8.7235 ns/op +WorkloadResult 76: 16777216 op, 181471896.50 
ns, 10.8166 ns/op +WorkloadResult 77: 16777216 op, 185181390.50 ns, 11.0377 ns/op +WorkloadResult 78: 16777216 op, 195435812.50 ns, 11.6489 ns/op +WorkloadResult 79: 16777216 op, 173396554.50 ns, 10.3352 ns/op +WorkloadResult 80: 16777216 op, 159757165.50 ns, 9.5223 ns/op +WorkloadResult 81: 16777216 op, 198938423.50 ns, 11.8577 ns/op +WorkloadResult 82: 16777216 op, 246705719.50 ns, 14.7048 ns/op +WorkloadResult 83: 16777216 op, 230365721.50 ns, 13.7309 ns/op +WorkloadResult 84: 16777216 op, 175164957.50 ns, 10.4406 ns/op +WorkloadResult 85: 16777216 op, 171728499.50 ns, 10.2358 ns/op +WorkloadResult 86: 16777216 op, 177056833.50 ns, 10.5534 ns/op +WorkloadResult 87: 16777216 op, 175936415.50 ns, 10.4866 ns/op +WorkloadResult 88: 16777216 op, 170114894.50 ns, 10.1396 ns/op +WorkloadResult 89: 16777216 op, 180095883.50 ns, 10.7346 ns/op +GC: 0 0 0 672 16777216 +Threading: 0 0 16777216 + +// AfterAll +// Benchmark Process 1348937 has exited with code 0. + +Mean = 11.926 ns, StdErr = 0.285 ns (2.39%), N = 89, StdDev = 2.684 ns +Min = 8.724 ns, Q1 = 10.167 ns, Median = 11.013 ns, Q3 = 12.533 ns, Max = 19.562 ns +IQR = 2.366 ns, LowerFence = 6.618 ns, UpperFence = 16.083 ns +ConfidenceInterval = [10.958 ns; 12.895 ns] (CI 99.9%), Margin = 0.969 ns (8.12% of Mean) +Skewness = 1.4, Kurtosis = 4.06, MValue = 3.41 + +// ************************** +// Benchmark: CopyBenchmarks.Copy128BytesToArray: DefaultJob +// *** Execute *** +// Launch: 1 / 1 +// Execute: dotnet "1a7668ac-c1b0-4053-8025-eef54642afd3.dll" --benchmarkName "Platform.Unsafe.Benchmarks.CopyBenchmarks.Copy128BytesToArray" --job "Default" --benchmarkId 2 in /tmp/gh-issue-solver-1757845403022/csharp/Platform.Unsafe.Benchmarks/bin/Release/net8/1a7668ac-c1b0-4053-8025-eef54642afd3/bin/Release/net8.0 +Failed to set up high priority. Make sure you have the right permissions. 
Message: Permission denied +// BeforeAnythingElse + +// Benchmark Process Environment Information: +// Runtime=.NET 8.0.19 (8.0.1925.36514), X64 RyuJIT +// GC=Concurrent Workstation +// Job: DefaultJob + +OverheadJitting 1: 1 op, 1604705.00 ns, 1.6047 ms/op +WorkloadJitting 1: 1 op, 345755.00 ns, 345.7550 us/op + +OverheadJitting 2: 16 op, 606017.00 ns, 37.8761 us/op +WorkloadJitting 2: 16 op, 504158.00 ns, 31.5099 us/op + +WorkloadPilot 1: 16 op, 4273.00 ns, 267.0625 ns/op +WorkloadPilot 2: 32 op, 4756.00 ns, 148.6250 ns/op +WorkloadPilot 3: 64 op, 8316.00 ns, 129.9375 ns/op +WorkloadPilot 4: 128 op, 9451.00 ns, 73.8359 ns/op +WorkloadPilot 5: 256 op, 22072.00 ns, 86.2188 ns/op +WorkloadPilot 6: 512 op, 33867.00 ns, 66.1465 ns/op +WorkloadPilot 7: 1024 op, 53959.00 ns, 52.6943 ns/op +WorkloadPilot 8: 2048 op, 93047.00 ns, 45.4331 ns/op +WorkloadPilot 9: 4096 op, 225164.00 ns, 54.9717 ns/op +WorkloadPilot 10: 8192 op, 322566.00 ns, 39.3757 ns/op +WorkloadPilot 11: 16384 op, 738285.00 ns, 45.0613 ns/op +WorkloadPilot 12: 32768 op, 5533447.00 ns, 168.8674 ns/op +WorkloadPilot 13: 65536 op, 5308653.00 ns, 81.0036 ns/op +WorkloadPilot 14: 131072 op, 13642595.00 ns, 104.0847 ns/op +WorkloadPilot 15: 262144 op, 6450917.00 ns, 24.6083 ns/op +WorkloadPilot 16: 524288 op, 10270720.00 ns, 19.5898 ns/op +WorkloadPilot 17: 1048576 op, 22091220.00 ns, 21.0678 ns/op +WorkloadPilot 18: 2097152 op, 38814838.00 ns, 18.5084 ns/op +WorkloadPilot 19: 4194304 op, 89000571.00 ns, 21.2194 ns/op +WorkloadPilot 20: 8388608 op, 217629534.00 ns, 25.9435 ns/op +WorkloadPilot 21: 16777216 op, 369190650.00 ns, 22.0055 ns/op +WorkloadPilot 22: 33554432 op, 1071401179.00 ns, 31.9302 ns/op + +OverheadWarmup 1: 33554432 op, 28872318.00 ns, 0.8605 ns/op +OverheadWarmup 2: 33554432 op, 6147717.00 ns, 0.1832 ns/op +OverheadWarmup 3: 33554432 op, 6358772.00 ns, 0.1895 ns/op +OverheadWarmup 4: 33554432 op, 6993436.00 ns, 0.2084 ns/op +OverheadWarmup 5: 33554432 op, 4782361.00 ns, 0.1425 ns/op 
+OverheadWarmup 6: 33554432 op, 3679125.00 ns, 0.1096 ns/op +OverheadWarmup 7: 33554432 op, 5686809.00 ns, 0.1695 ns/op +OverheadWarmup 8: 33554432 op, 5044906.00 ns, 0.1503 ns/op + +OverheadActual 1: 33554432 op, 5296567.00 ns, 0.1578 ns/op +OverheadActual 2: 33554432 op, 4952096.00 ns, 0.1476 ns/op +OverheadActual 3: 33554432 op, 3612070.00 ns, 0.1076 ns/op +OverheadActual 4: 33554432 op, 4145198.00 ns, 0.1235 ns/op +OverheadActual 5: 33554432 op, 3944582.00 ns, 0.1176 ns/op +OverheadActual 6: 33554432 op, 4251111.00 ns, 0.1267 ns/op +OverheadActual 7: 33554432 op, 3875058.00 ns, 0.1155 ns/op +OverheadActual 8: 33554432 op, 2823089.00 ns, 0.0841 ns/op +OverheadActual 9: 33554432 op, 1914645.00 ns, 0.0571 ns/op +OverheadActual 10: 33554432 op, 2676950.00 ns, 0.0798 ns/op +OverheadActual 11: 33554432 op, 2785537.00 ns, 0.0830 ns/op +OverheadActual 12: 33554432 op, 2698046.00 ns, 0.0804 ns/op +OverheadActual 13: 33554432 op, 3922376.00 ns, 0.1169 ns/op +OverheadActual 14: 33554432 op, 3672717.00 ns, 0.1095 ns/op +OverheadActual 15: 33554432 op, 2773662.00 ns, 0.0827 ns/op +OverheadActual 16: 33554432 op, 4223139.00 ns, 0.1259 ns/op +OverheadActual 17: 33554432 op, 5744005.00 ns, 0.1712 ns/op +OverheadActual 18: 33554432 op, 4372577.00 ns, 0.1303 ns/op +OverheadActual 19: 33554432 op, 4055200.00 ns, 0.1209 ns/op +OverheadActual 20: 33554432 op, 3187156.00 ns, 0.0950 ns/op + +WorkloadWarmup 1: 33554432 op, 893330680.00 ns, 26.6233 ns/op +WorkloadWarmup 2: 33554432 op, 622059101.00 ns, 18.5388 ns/op +WorkloadWarmup 3: 33554432 op, 983517098.00 ns, 29.3111 ns/op +WorkloadWarmup 4: 33554432 op, 822550790.00 ns, 24.5139 ns/op +WorkloadWarmup 5: 33554432 op, 617553337.00 ns, 18.4045 ns/op +WorkloadWarmup 6: 33554432 op, 681239095.00 ns, 20.3025 ns/op +// Benchmark Process 1349075 has exited with code 143. +Unhandled exception. 
\ No newline at end of file diff --git a/experiments/ImprovedMemoryBlockBenchmark.cs b/experiments/ImprovedMemoryBlockBenchmark.cs new file mode 100644 index 0000000..af940c8 --- /dev/null +++ b/experiments/ImprovedMemoryBlockBenchmark.cs @@ -0,0 +1,137 @@ +using System; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; +using Platform.Unsafe.Experiments; +using static System.Runtime.CompilerServices.Unsafe; + +#pragma warning disable CA1822 // Mark members as static + +namespace Platform.Unsafe.Experiments +{ + /// <summary> + /// Benchmark comparing current MemoryBlock.Zero with improved SIMD versions + /// </summary> + [SimpleJob] + [MemoryDiagnoser] + public unsafe class ImprovedMemoryBlockBenchmark + { + private static byte[] _smallArray; + private static byte[] _mediumArray; + private static byte[] _largeArray; + + [GlobalSetup] + public static void Setup() + { + _smallArray = new byte[256]; // 256 bytes - small block + _mediumArray = new byte[64 * 1024]; // 64KB - medium block + _largeArray = new byte[4 * 1024 * 1024]; // 4MB - large block + } + + // Small block benchmarks (256 bytes) + [Benchmark] + public void SmallBlock_Current() + { + fixed (byte* pointer = _smallArray) + { + MemoryBlock.Zero(pointer, _smallArray.Length); + } + } + + [Benchmark] + public void SmallBlock_SIMD() + { + fixed (byte* pointer = _smallArray) + { + ImprovedMemoryZero.ZeroSIMD(pointer, _smallArray.Length); + } + } + + [Benchmark] + public void SmallBlock_Adaptive() + { + fixed (byte* pointer = _smallArray) + { + ImprovedMemoryZero.ZeroAdaptive(pointer, _smallArray.Length); + } + } + + // Medium block benchmarks (64KB) + [Benchmark] + public void MediumBlock_Current() + { + fixed (byte* pointer = _mediumArray) + { + MemoryBlock.Zero(pointer, _mediumArray.Length); + } + } + + [Benchmark] + public void MediumBlock_SIMD() + { + fixed (byte* pointer = _mediumArray) + { + ImprovedMemoryZero.ZeroSIMD(pointer, _mediumArray.Length); + } + } + + [Benchmark] + public void MediumBlock_Adaptive() 
+ { + fixed (byte* pointer = _mediumArray) + { + ImprovedMemoryZero.ZeroAdaptive(pointer, _mediumArray.Length); + } + } + + // Large block benchmarks (4MB) + [Benchmark] + public void LargeBlock_Current() + { + fixed (byte* pointer = _largeArray) + { + MemoryBlock.Zero(pointer, _largeArray.Length); + } + } + + [Benchmark] + public void LargeBlock_SIMD() + { + fixed (byte* pointer = _largeArray) + { + ImprovedMemoryZero.ZeroSIMD(pointer, _largeArray.Length); + } + } + + [Benchmark] + public void LargeBlock_MultiThreadedSIMD() + { + fixed (byte* pointer = _largeArray) + { + ImprovedMemoryZero.ZeroMultiThreadedSIMD(pointer, _largeArray.Length); + } + } + + [Benchmark] + public void LargeBlock_Adaptive() + { + fixed (byte* pointer = _largeArray) + { + ImprovedMemoryZero.ZeroAdaptive(pointer, _largeArray.Length); + } + } + } + + /// <summary> + /// Simple console program to run the benchmarks + /// </summary> + public class Program + { + public static void Main(string[] args) + { + Console.WriteLine("Platform.Unsafe Memory Zeroing Performance Comparison"); + Console.WriteLine("===================================================="); + + var summary = BenchmarkRunner.Run<ImprovedMemoryBlockBenchmark>(); + } + } +} \ No newline at end of file diff --git a/experiments/ImprovedMemoryBlockBenchmark.csproj b/experiments/ImprovedMemoryBlockBenchmark.csproj new file mode 100644 index 0000000..01731b1 --- /dev/null +++ b/experiments/ImprovedMemoryBlockBenchmark.csproj @@ -0,0 +1,20 @@ + + + + Exe + net8 + false + true + latest + enable + + + + + + + + + + + \ No newline at end of file diff --git a/experiments/ImprovedMemoryZero.cs b/experiments/ImprovedMemoryZero.cs new file mode 100644 index 0000000..653cc94 --- /dev/null +++ b/experiments/ImprovedMemoryZero.cs @@ -0,0 +1,193 @@ +using System; +using System.Collections.Concurrent; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Threading.Tasks; +using static 
System.Runtime.CompilerServices.Unsafe; + +namespace Platform.Unsafe.Experiments +{ + /// <summary> + /// Experimental high-performance memory zeroing implementations using SIMD and modern .NET features + /// </summary> + public static unsafe class ImprovedMemoryZero + { + private static readonly bool IsAvx2Supported = Avx2.IsSupported; + private static readonly bool IsAvx512Supported = Avx512F.IsSupported; + private static readonly int ProcessorCount = Environment.ProcessorCount; + + /// <summary> + /// SIMD-optimized memory zeroing with vectorization + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void ZeroSIMD(void* pointer, long capacity) + { + if (capacity <= 0) return; + + var ptr = (byte*)pointer; + var remaining = capacity; + + // Use AVX-512 if available and beneficial (large blocks) + if (IsAvx512Supported && remaining >= 512 && Vector512.IsHardwareAccelerated) + { + remaining = ZeroWithAvx512(ptr, remaining); + ptr += capacity - remaining; + } + // Use AVX2 for medium to large blocks + else if (IsAvx2Supported && remaining >= 256) + { + remaining = ZeroWithAvx2(ptr, remaining); + ptr += capacity - remaining; + } + // Use generic Vector for smaller blocks or when AVX is not available + else if (Vector.IsHardwareAccelerated && remaining >= Vector<byte>.Count * 4) + { + remaining = ZeroWithVector(ptr, remaining); + ptr += capacity - remaining; + } + + // Handle remaining bytes with traditional method + if (remaining > 0) + { + var uintMaxValue = uint.MaxValue; + while (remaining > uintMaxValue) + { + InitBlock(ptr, 0, uintMaxValue); + remaining -= uintMaxValue; + ptr += uintMaxValue; + } + if (remaining > 0) + { + InitBlock(ptr, 0, unchecked((uint)remaining)); + } + } + } + + /// <summary> + /// Multi-threaded SIMD memory zeroing + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void ZeroMultiThreadedSIMD(void* pointer, long capacity) + { + if (capacity <= 0) return; + + // For small blocks, use single-threaded SIMD + const long singleThreadThreshold = 
64 * 1024; // 64KB + if (capacity < singleThreadThreshold) + { + ZeroSIMD(pointer, capacity); + return; + } + + // Determine optimal thread count + // Use fewer threads for memory-bound operations to avoid memory bandwidth saturation + var threads = Math.Min(ProcessorCount / 2, 4); // Max 4 threads to avoid memory bandwidth issues + if (threads <= 1) + { + ZeroSIMD(pointer, capacity); + return; + } + + // Partition work among threads + Parallel.ForEach( + Partitioner.Create(0L, capacity, capacity / threads), + new ParallelOptions { MaxDegreeOfParallelism = threads }, + range => + { + var ptr = (byte*)pointer + range.Item1; + var length = range.Item2 - range.Item1; + ZeroSIMD(ptr, length); + }); + } + + /// <summary> + /// Adaptive memory zeroing that chooses the best strategy based on size + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void ZeroAdaptive(void* pointer, long capacity) + { + if (capacity <= 0) return; + + // Small blocks: use simple InitBlock + if (capacity < 256) + { + var uintMaxValue = uint.MaxValue; + var ptr = (byte*)pointer; + var remaining = capacity; + + while (remaining > uintMaxValue) + { + InitBlock(ptr, 0, uintMaxValue); + remaining -= uintMaxValue; + ptr += uintMaxValue; + } + if (remaining > 0) + { + InitBlock(ptr, 0, unchecked((uint)remaining)); + } + return; + } + + // Medium blocks: use SIMD + if (capacity < 1024 * 1024) // 1MB + { + ZeroSIMD(pointer, capacity); + return; + } + + // Large blocks: use multi-threaded SIMD + ZeroMultiThreadedSIMD(pointer, capacity); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static long ZeroWithAvx512(byte* ptr, long remaining) + { + var vector512Zero = Vector512<byte>.Zero; + + // Process 512-bit (64-byte) chunks + while (remaining >= 64) + { + Avx512F.Store(ptr, vector512Zero); + ptr += 64; + remaining -= 64; + } + + return remaining; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static long ZeroWithAvx2(byte* ptr, long remaining) + { + var 
vector256Zero = Vector256<byte>.Zero; + + // Process 256-bit (32-byte) chunks + while (remaining >= 32) + { + Avx.Store(ptr, vector256Zero); + ptr += 32; + remaining -= 32; + } + + return remaining; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static long ZeroWithVector(byte* ptr, long remaining) + { + var vectorZero = Vector<byte>.Zero; + var vectorSize = Vector<byte>.Count; + + // Process Vector-sized chunks + while (remaining >= vectorSize) + { + vectorZero.CopyTo(new Span<byte>(ptr, vectorSize)); + ptr += vectorSize; + remaining -= vectorSize; + } + + return remaining; + } + } +} \ No newline at end of file diff --git a/experiments/MemoryZeroPerformanceAnalysis.md b/experiments/MemoryZeroPerformanceAnalysis.md new file mode 100644 index 0000000..787ea81 --- /dev/null +++ b/experiments/MemoryZeroPerformanceAnalysis.md @@ -0,0 +1,195 @@ +# Memory Zeroing Performance Analysis - Issue #12 + +## Executive Summary + +This document provides a comprehensive analysis of high-performance multi-threaded memory zeroing algorithms for the Platform.Unsafe library, specifically addressing issue #12: "Check if there is high-performance multi-thread version of algorithm". + +## Current Algorithm Analysis + +The current `MemoryBlock.Zero` implementation in `Platform.Unsafe/MemoryBlock.cs`: + +```csharp +public static void Zero(void* pointer, long capacity) +{ + // A way to prevent wasting resources due to Hyper-Threading. + var threads = Environment.ProcessorCount / 2; + if (threads <= 1) + { + ZeroBlock(pointer, 0, capacity); + } + else + { + // Using 2 threads because two-channel memory architecture is the most available type. + // CPUs mostly just wait for memory here. + threads = 2; + Parallel.ForEach(Partitioner.Create(0L, capacity), new ParallelOptions { MaxDegreeOfParallelism = threads }, range => ZeroBlock(pointer, range.Item1, range.Item2)); + } +} +``` + +### Current Algorithm Characteristics: +1. ✅ **Multi-threading**: Uses `Parallel.ForEach` with up to 2 threads +2. 
✅ **Memory Architecture Awareness**: Limits to 2 threads for dual-channel memory +3. ✅ **Hyper-Threading Consideration**: Halves the logical processor count to decide whether to parallelize at all +4. ❌ **SIMD Optimization**: Uses no explicit SIMD intrinsics +5. ❌ **Adaptive Sizing**: No size-based algorithm selection +6. ❌ **Modern Hardware Features**: Doesn't leverage AVX2/AVX-512 + +## Research Findings + +### 1. SIMD Performance Gains (2024) +- **2x-4x performance boost** with basic SIMD operations +- **9x acceleration** for single-threaded vectorized programs on AVX2 +- **40-50x acceleration** for multi-threaded programs on 8-core processors +- AVX-512 provides additional gains on compatible hardware (Ice Lake+) + +### 2. Memory Bandwidth Considerations +- Memory operations are often **bandwidth-bound** rather than compute-bound +- Optimal thread count: **2-4 threads** to avoid memory bandwidth saturation +- Current 2-thread limit is well-designed for this constraint + +### 3. .NET 8 Hardware Intrinsics +- `Vector512.IsHardwareAccelerated` for AVX-512 detection +- Improved SIMD support with better JIT optimization +- Hardware-specific intrinsics available via `System.Runtime.Intrinsics` + +## Recommended Improvements + +### 1. 
**SIMD-Enhanced Algorithm** + +```csharp +public static void ZeroSIMD(void* pointer, long capacity) +{ + if (capacity <= 0) return; + + var ptr = (byte*)pointer; + var remaining = capacity; + + // Use AVX-512 for large blocks if available + if (Vector512.IsHardwareAccelerated && remaining >= 512) + { + remaining = ZeroWithAvx512(ptr, remaining); + ptr += capacity - remaining; + } + // Use AVX2 for medium blocks + else if (Avx2.IsSupported && remaining >= 256) + { + remaining = ZeroWithAvx2(ptr, remaining); + ptr += capacity - remaining; + } + // Use generic Vector for smaller blocks + else if (Vector.IsHardwareAccelerated && remaining >= Vector<byte>.Count * 4) + { + remaining = ZeroWithVector(ptr, remaining); + ptr += capacity - remaining; + } + + // Handle remaining bytes with traditional method + if (remaining > 0) + { + InitBlock(ptr, 0, unchecked((uint)remaining)); + } +} +``` + +### 2. **Adaptive Algorithm Selection** + +```csharp +public static void ZeroAdaptive(void* pointer, long capacity) +{ + // Small blocks (< 256 bytes): Simple InitBlock + if (capacity < 256) + { + InitBlock(pointer, 0, unchecked((uint)capacity)); + return; + } + + // Medium blocks (256B - 1MB): SIMD only + if (capacity < 1024 * 1024) + { + ZeroSIMD(pointer, capacity); + return; + } + + // Large blocks (>= 1MB): Multi-threaded SIMD + ZeroMultiThreadedSIMD(pointer, capacity); +} +``` + +### 3. 
**Enhanced Multi-Threading Strategy** + +```csharp +public static void ZeroMultiThreadedSIMD(void* pointer, long capacity) +{ + // Determine optimal thread count (2-4 threads for memory bandwidth) + var threads = Math.Min(Math.Min(Environment.ProcessorCount / 2, 4), + (int)(capacity / (64 * 1024))); // 64KB per thread minimum + + if (threads <= 1) + { + ZeroSIMD(pointer, capacity); + return; + } + + Parallel.ForEach( + Partitioner.Create(0L, capacity, capacity / threads), + new ParallelOptions { MaxDegreeOfParallelism = threads }, + range => + { + var ptr = (byte*)pointer + range.Item1; + var length = range.Item2 - range.Item1; + ZeroSIMD(ptr, length); + }); +} +``` + +## Performance Expectations + +Based on research findings, the improved algorithms should provide: + +1. **Small blocks (< 256B)**: Minimal overhead, same performance +2. **Medium blocks (256B - 1MB)**: **2-4x improvement** with SIMD +3. **Large blocks (>= 1MB)**: **4-10x improvement** with multi-threaded SIMD + +## Implementation Recommendations + +### Phase 1: SIMD Enhancement +1. Add SIMD-based zeroing methods +2. Maintain backward compatibility +3. Add feature detection for hardware capabilities + +### Phase 2: Adaptive Algorithm +1. Implement size-based algorithm selection +2. Add benchmarks to verify performance gains +3. Update existing `Zero` method to use adaptive approach + +### Phase 3: Advanced Optimizations +1. Investigate cache-line alignment optimizations +2. Add support for non-temporal memory operations for very large blocks +3. 
Consider NUMA-aware threading for multi-socket systems + +## .NET Memory Allocation Analysis + +Regarding the TODO comment about AllocHGlobal/ReAllocHGlobal zero flag options: + +- **Current Status**: Neither `Marshal.AllocHGlobal` nor `Marshal.ReAllocHGlobal` provides a zero-memory flag +- **Recommendation**: Use the newer `NativeMemory` class (available since .NET 6), which provides `NativeMemory.AllocZeroed` +- **Alternative**: Continue using manual zeroing after allocation as currently implemented + +## Testing Strategy + +1. **Unit Tests**: Verify correctness across different sizes and alignments +2. **Performance Tests**: Benchmark against current implementation +3. **Hardware Tests**: Validate on systems with/without AVX2/AVX-512 +4. **Integration Tests**: Ensure compatibility with existing Platform.Unsafe usage + +## Conclusion + +The current algorithm is well-designed for multi-threading considerations but lacks modern SIMD optimizations. The recommended improvements are expected to provide significant performance gains (an estimated 2-10x, still to be confirmed by the benchmarks planned in Phase 2) while maintaining the existing sound architectural decisions around memory bandwidth management. + +The proposed solution takes a three-tier approach: +1. **Hardware detection** for optimal SIMD instruction selection +2. **Adaptive sizing** for appropriate algorithm selection +3. **Smart threading** to use the available memory bandwidth without saturating it + +These improvements align with modern .NET 8 capabilities and hardware trends while maintaining backward compatibility. \ No newline at end of file