@@ -135,81 +135,136 @@ pub struct CargoProcess<'a> {
135135 pub workspace_package : Option < String > ,
136136}
137137
138- /// Returns an optional list of P-cores, if the system has P-cores and E-cores.
139- /// This list *should* be in a format suitable for the `taskset` command.
140- #[ cfg( target_os = "linux" ) ]
141- fn performance_cores ( ) -> Option < & ' static String > {
142- use std:: sync:: LazyLock ;
143- static PERFORMANCE_CORES : LazyLock < Option < String > > = LazyLock :: new ( || {
144- if std:: fs:: exists ( "/sys/devices/cpu" )
145- . expect ( "Could not check if `/sys/devices/cpu` exists" )
146- {
147- // If /sys/devices/cpu exists, then this is not a hybrid CPU.
148- None
149- } else if std:: fs:: exists ( "/sys/devices/cpu_core" )
150- . expect ( "Could not check if `/sys/devices/cpu_core` exists!" )
151- {
152- // If /sys/devices/cpu_core exists, then this is a hybrid CPU.
153- eprintln ! ( "WARNING: hybrid Intel CPU detected." ) ;
154- eprintln ! ( "WARNING: test suite will only use P-cores, not E-cores" ) ;
155- Some (
156- std:: fs:: read_to_string ( "/sys/devices/cpu_core/cpus" )
157- . unwrap ( )
158- . trim ( )
159- . to_string ( ) ,
160- )
161- } else {
162- // If neither dir exists, then something is wrong, because `/sys/devices/cpu` has been
163- // in Linux for over a decade.
164- eprintln ! ( "WARNING: neither `/sys/devices/cpu` nor `/sys/devices/cpu_core` present" ) ;
165- eprintln ! ( "WARNING: unable to determine if CPU has a hybrid architecture" ) ;
166- None
167- }
168- } ) ;
169- ( * PERFORMANCE_CORES ) . as_ref ( )
138+ // Some CPUs have a hybrid architecture with a mixture of P-cores (performance) and E-cores (efficiency).
139+ // When benchmarking we use `taskset` to restrict execution to P-cores. Why?
140+ // 1. The instruction count info for E-cores is often incomplete, and a substantial chunk of events
141+ // is lost.
142+ // 2. The performance characteristics of E-cores are less reliable, so excluding them from the
143+ // benchmark makes things easier.
144+ // 3. An unpredictable mix of P-core and E-core execution can give inconsistent results.
145+ //
146+ // If a hybrid architecture is detected, this type is used to hold information about the P-cores.
147+ // The detection method used varies across platforms.
148+ #[ derive( Debug ) ]
149+ struct PCores {
150+ /// The number of P-cores.
151+ len : usize ,
152+ /// The list of P-cores, in a form suitable for passing to `taskset`.
153+ list : String ,
170154}
171155
172- #[ cfg( not( target_os = "linux" ) ) ]
173- // Modify this stub if you want to add support for P-/E-cores on more OSs
174- fn performance_cores ( ) -> Option < & ' static String > {
175- None
176- }
156+ static P_CORES : LazyLock < Option < PCores > > = LazyLock :: new ( p_cores) ;
157+
158+ cfg_if:: cfg_if! {
159+ if #[ cfg( all( target_os = "linux" , target_arch = "x86_64" ) ) ] {
160+ // On x86-64/Linux we look for the presence of `/sys/devices/cpu_core/` which indicates a
161+ // hybrid architecture.
162+ fn p_cores( ) -> Option <PCores > {
163+ if std:: fs:: exists( "/sys/devices/cpu" ) . unwrap( ) {
164+ // `/sys/devices/cpu` exists: this is not a hybrid CPU.
165+ None
166+ } else if std:: fs:: exists( "/sys/devices/cpu_core" ) . unwrap( ) {
167+ // `/sys/devices/cpu_core/` exists: this is a hybrid CPU, and the `cpus` file
168+ // within contains the list of P-cores. (`/sys/devices/cpu_atom/cpus` contains
169+ // the list of E-cores.)
170+ let list =
171+ std:: fs:: read_to_string( "/sys/devices/cpu_core/cpus" )
172+ . unwrap( )
173+ . trim( )
174+ . to_string( ) ;
175+ eprintln!(
176+ "WARNING: hybrid Intel CPU detected; test suite will only use P-cores: {list}"
177+ ) ;
178+ // Parse CPU list to extract the number of P-cores. This assumes the P-core ids are
179+ // contiguous, in format `m-n`.
180+ let ( first, last) = list
181+ . split_once( "-" )
182+ . unwrap_or_else( || panic!( "unsupported P-core list format: {list:?}." ) ) ;
183+ let first = first
184+ . parse:: <usize >( )
185+ . expect( "expected a number at the start of the P-core list" ) ;
186+ let last = last
187+ . parse:: <usize >( )
188+ . expect( "expected a number at the end of the P-core list" ) ;
189+ let len = last - first + 1 ; // e.g. "0-3" is four cores: [0, 1, 2, 3]
190+ Some ( PCores { len, list } )
191+ } else {
192+ // Neither dir exists: something is wrong, because `/sys/devices/cpu` has been
193+ // in Linux (on x86-64, at least) for over a decade.
194+ eprintln!(
195+ "WARNING: `/sys/devices/{{cpu,cpu_core}}` not found; \
196+ unable to determine if CPU has a hybrid architecture"
197+ ) ;
198+ None
199+ }
200+ }
201+ } else if #[ cfg( all( target_os = "linux" , target_arch = "aarch64" ) ) ] {
202+ // On ARM64/Linux there is no definitive way to distinguish P-cores from E-cores, so we
203+ // must use a heuristic.
204+ //
205+ // Each core has a listed "capacity", a performance estimate relative to the most powerful
206+ // core in the system (scaled 0-1024). For example, an ASUS GX10 Ascent has a Cortex-X925
207+ // with 10 P-cores and a Cortex-A725 with 10 E-cores. The reported capacities are:
208+ // * Cores 0- 4: 718 (E-cores in cluster 1 with 8MiB L3 cache)
209+ // * Cores 5- 9: 997 (P-cores in cluster 1 with 8MiB L3 cache)
210+ // * Cores 10-14: 731 (E-cores in cluster 2 with 16MiB L3 cache)
211+ // * Cores 15-18: 1017 (P-cores in cluster 2 with 16MiB L3 cache)
212+ // * Core 19: 1024 (P-core in cluster 2 with 16MiB L3 cache)
213+ //
214+ // The heuristic is that any core with a capacity at least 90% of the maximum capacity is
215+ // considered a P-core, and any other core is considered an E-core. (The 718/731 and
216+ // 997/1017 differences are presumably due to the L3 cache size. The reason for the
217+ // 1017/1024 difference is unclear. Even though the P-cores are not all identical, they are
218+ // close enough for our purposes.)
219+ fn p_cores( ) -> Option <PCores > {
220+ let mut caps = vec![ ] ;
221+ for i in 0 .. {
222+ let path = format!( "/sys/devices/system/cpu/cpu{i}/cpu_capacity" ) ;
223+ if !std:: fs:: exists( & path) . unwrap( ) {
224+ break ;
225+ }
226+ let cap = std:: fs:: read_to_string( & path) . unwrap( ) . trim( ) . parse:: <usize >( ) . unwrap( ) ;
227+ caps. push( ( i, cap) ) ;
228+ }
177229
178- #[ cfg( target_os = "linux" ) ]
179- /// Makes the benchmark run only on Performance cores.
180- fn run_on_p_cores ( path : & Path , cpu_list : & str ) -> Command {
181- // Parse CPU list to extract the number of P-cores!
182- // This assumes the P-core id's are continuous, in format `first_id-last_id`
183- let ( core_start, core_end) = cpu_list
184- . split_once ( "-" )
185- . unwrap_or_else ( || panic ! ( "Unsupported P-core list format: {cpu_list:?}." ) ) ;
186- let core_start: u32 = core_start
187- . parse ( )
188- . expect ( "Expected a number when parsing the start of the P-core list!" ) ;
189- let core_end: u32 = core_end
190- . parse ( )
191- . expect ( "Expected a number when parsing the end of the P-core list!" ) ;
192- let core_count = core_end - core_start + 1 ; // e.g. "0-3" is four cores: [0, 1, 2, 3]
193-
194- let mut cmd = Command :: new ( "taskset" ) ;
195- // Set job count to P-core count. This is done for 3 reasons:
196- // 1. The instruction count info for E-cores is often incomplete, and a substantial chunk of
197- // events is lost.
198- // 2. The performance characteristics of E-cores are less reliable, so excluding them from the
199- // benchmark makes things easier.
200- // 3. An unpredictable mix of P-core and E-core execution will give inconsistent results.
201- cmd. env ( "CARGO_BUILD_JOBS" , format ! ( "{core_count}" ) ) ;
202- // Pass the P-core list to taskset to pin task to the P-core.
203- cmd. arg ( "--cpu-list" ) ;
204- cmd. arg ( cpu_list) ;
205- cmd. arg ( path) ;
206- cmd
207- }
230+ if let Some ( max_cap) = caps. iter( ) . map( |( _, cap) | cap) . max( ) {
231+ // Filter out cores that fail the 90% capacity check.
232+ let cap_threshold = * max_cap as f64 * 0.9 ;
233+ let p_cores: Vec <_> = caps. iter( ) . filter_map( |( i, cap) | {
234+ if * cap as f64 >= cap_threshold {
235+ Some ( i. to_string( ) )
236+ } else {
237+ None
238+ }
239+ } ) . collect( ) ;
208240
209- #[ cfg( not( target_os = "linux" ) ) ]
210- // Modify this stub if you want to add support for P-cores/E-cores on more OSs.
211- fn run_on_p_cores ( _path : & Path , _cpu_list : & str ) -> Command {
212- todo ! ( "Can't run commands on the P-cores on this platform" ) ;
241+ if p_cores. len( ) == caps. len( ) {
242+ // All cores have roughly the same capacity; this is not a hybrid CPU.
243+ None
244+ } else {
245+ let list = p_cores. join( "," ) ;
246+ eprintln!(
247+ "WARNING: hybrid ARM CPU detected; test suite will only use P-cores: {list}"
248+ ) ;
249+ Some ( PCores {
250+ len: p_cores. len( ) ,
251+ list,
252+ } )
253+ }
254+ } else {
255+ eprintln!(
256+ "WARNING: `/sys/devices/system/cpu/cpu*/cpu_capacity` not found; \
257+ unable to determine if CPU has a hybrid architecture"
258+ ) ;
259+ None
260+ }
261+ }
262+ } else {
263+ // Modify this stub if you want to add support for hybrid architectures on more platforms.
264+ fn p_cores( ) -> Option <PCores > {
265+ None
266+ }
267+ }
213268}
214269
215270impl < ' a > CargoProcess < ' a > {
@@ -230,11 +285,17 @@ impl<'a> CargoProcess<'a> {
230285 }
231286
232287 fn base_command ( & self , cwd : & Path , subcommand : & str ) -> Command {
233- // Processors with P-core and E-cores require special handling.
234- let mut cmd = if let Some ( p_cores) = performance_cores ( ) {
235- run_on_p_cores ( Path :: new ( & self . toolchain . components . cargo ) , p_cores)
288+ let cargo_path = Path :: new ( & self . toolchain . components . cargo ) ;
289+ let mut cmd = if let Some ( p_cores) = ( * P_CORES ) . as_ref ( ) {
290+ // Processors with P-cores and E-cores require special handling.
291+ let mut cmd = Command :: new ( "taskset" ) ;
292+ cmd. env ( "CARGO_BUILD_JOBS" , p_cores. len . to_string ( ) ) ;
293+ cmd. arg ( "--cpu-list" ) ;
294+ cmd. arg ( & p_cores. list ) ;
295+ cmd. arg ( cargo_path) ;
296+ cmd
236297 } else {
237- Command :: new ( Path :: new ( & self . toolchain . components . cargo ) )
298+ Command :: new ( cargo_path )
238299 } ;
239300 cmd
240301 // Not all cargo invocations (e.g. `cargo clean`) need all of these
@@ -620,6 +681,11 @@ fn process_stat_output(
620681 let stdout = String :: from_utf8 ( output. stdout . clone ( ) ) . expect ( "utf8 output" ) ;
621682 let mut stats = Stats :: new ( ) ;
622683
684+ // ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
685+ // `armv8_pmuv3_1/branch-misses/`.
686+ #[ cfg( all( target_os = "linux" , target_arch = "aarch64" ) ) ]
687+ let arm_p_core_events_re = regex:: Regex :: new ( r"armv[0-9]_pmuv[0-9]_[0-9]/([^/]*)/" ) . unwrap ( ) ;
688+
623689 let mut self_profile_dir: Option < PathBuf > = None ;
624690 let mut self_profile_crate: Option < String > = None ;
625691 for line in stdout. lines ( ) {
@@ -670,24 +736,43 @@ fn process_stat_output(
670736 }
671737 } ;
672738 }
739+
673740 let mut parts = line. split ( ';' ) . map ( |s| s. trim ( ) ) ;
674741 let cnt = get ! ( parts. next( ) ) ;
742+ if cnt == "<not supported>" || cnt == "<not counted>" || cnt. is_empty ( ) {
743+ continue ;
744+ }
745+
675746 let _unit = get ! ( parts. next( ) ) ;
676- let mut name = get ! ( parts. next( ) ) ;
677- // Map P-core events to normal events
678- if name == "cpu_core/instructions:u/" {
679- name = "instructions:u" ;
747+
748+ #[ allow( unused_mut) ]
749+ let mut name = get ! ( parts. next( ) ) . to_string ( ) ;
750+ // Map P-core event names to normal event names.
751+ cfg_if:: cfg_if! {
752+ if #[ cfg( all( target_os = "linux" , target_arch = "x86_64" ) ) ] {
753+ if name == "cpu_core/instructions:u/" {
754+ name = "instructions:u" . to_string( ) ;
755+ }
756+ } else if #[ cfg( all( target_os = "linux" , target_arch = "aarch64" ) ) ] {
757+ // ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
758+ // `armv8_pmuv3_1/branch-misses/`.
759+ if let Some ( event) = arm_p_core_events_re. captures( & name) {
760+ name = event[ 1 ] . to_string( ) ;
761+ }
762+ }
680763 }
764+
681765 let _time = get ! ( parts. next( ) ) ;
766+
682767 let pct = get ! ( parts. next( ) ) ;
683- if cnt == "<not supported>" || cnt == "<not counted>" || cnt. is_empty ( ) {
684- continue ;
685- }
686768 if !pct. starts_with ( "100." ) {
769+ // If this fails, it's probably because the CPU has a hybrid architecture and the
770+ // metric is split across P-cores and E-cores. See `PCores`.
687771 panic ! ( "measurement of `{name}` only active for {pct}% of the time" ) ;
688772 }
773+
689774 stats. insert (
690- name. to_owned ( ) ,
775+ name,
691776 cnt. parse ( )
692777 . map_err ( |e| DeserializeStatError :: ParseError ( cnt. to_string ( ) , e) ) ?,
693778 ) ;
0 commit comments