rust-lang
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎collector/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎collector/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎collector/src/compile/benchmark/target.rs‎
Lines changed: 1 addition & 0 deletions b/‎collector/src/compile/benchmark/target.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎collector/src/compile/execute/mod.rs‎
Lines changed: 168 additions & 83 deletions b/‎collector/src/compile/execute/mod.rs‎
Lines changed: 168 additions & 83 deletions
diff --git a/‎site/frontend/package-lock.json‎
Lines changed: 0 additions & 4 deletions b/‎site/frontend/package-lock.json‎
Lines changed: 0 additions & 4 deletions
@@ -18,6 +18,7 @@ serde = { workspace = true, features = ["derive"] }
 serde_json = { workspace = true }
 tokio = { workspace = true, features = ["rt", "process"] }
 
+cfg-if = "1"
 thiserror = "2"
 tempfile = "3"
 libc = "0.2"
 
@@ -5,6 +5,7 @@ use std::{fmt, str::FromStr};
 /// https://doc.rust-lang.org/nightly/rustc/platform-support.html
 ///
 /// Presently we only support x86_64
+/// FIXME: we actually support Windows and aarch64, but that isn't captured here.
 #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, serde::Deserialize)]
 pub enum Target {
     /// `x86_64-unknown-linux-gnu`
 
@@ -135,81 +135,136 @@ pub struct CargoProcess<'a> {
     pub workspace_package: Option<String>,
 }
 
-/// Returns an optional list of P-cores, if the system has P-cores and E-cores.
-/// This list *should* be in a format suitable for the `taskset` command.
-#[cfg(target_os = "linux")]
-fn performance_cores() -> Option<&'static String> {
-    use std::sync::LazyLock;
-    static PERFORMANCE_CORES: LazyLock<Option<String>> = LazyLock::new(|| {
-        if std::fs::exists("/sys/devices/cpu")
-            .expect("Could not check if `/sys/devices/cpu` exists")
-        {
-            // If /sys/devices/cpu exists, then this is not a hybrid CPU.
-            None
-        } else if std::fs::exists("/sys/devices/cpu_core")
-            .expect("Could not check if `/sys/devices/cpu_core` exists!")
-        {
-            // If /sys/devices/cpu_core exists, then this is a hybrid CPU.
-            eprintln!("WARNING: hybrid Intel CPU detected.");
-            eprintln!("WARNING: test suite will only use P-cores, not E-cores");
-            Some(
-                std::fs::read_to_string("/sys/devices/cpu_core/cpus")
-                    .unwrap()
-                    .trim()
-                    .to_string(),
-            )
-        } else {
-            // If neither dir exists, then something is wrong, because `/sys/devices/cpu` has been
-            // in Linux for over a decade.
-            eprintln!("WARNING: neither `/sys/devices/cpu` nor `/sys/devices/cpu_core` present");
-            eprintln!("WARNING: unable to determine if CPU has a hybrid architecture");
-            None
-        }
-    });
-    (*PERFORMANCE_CORES).as_ref()
+// Some CPUs have a hybrid architecture with a mixture of P-cores (power) and E-cores (efficiency).
+// When benchmarking we use `taskset` to restrict execution to P-cores. Why?
+// 1. The instruction count info for E-cores is often incomplete, and a substantial chunk of events
+//    is lost.
+// 2. The performance characteristics of E-cores are less reliable, so excluding them from the
+//    benchmark makes things easier.
+// 3. An unpredictable mix of P-core and E-core execution can give inconsistent results.
+//
+// If a hybrid architecture is detected, this type is used to hold information about the P-cores.
+// The detection method used varies across platforms.
+#[derive(Debug)]
+struct PCores {
+    /// The number of P-cores.
+    len: usize,
+    /// The list of P-cores, in a form suitable for passing to `taskset`.
+    list: String,
 }
 
-#[cfg(not(target_os = "linux"))]
-// Modify this stub if you want to add support for P-/E-cores on more OSs
-fn performance_cores() -> Option<&'static String> {
-    None
-}
+static P_CORES: LazyLock<Option<PCores>> = LazyLock::new(p_cores);
+
+cfg_if::cfg_if! {
+    if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] {
+        // On x86-64/Linux we look for the presence of `/sys/devices/cpu_core/` which indicates a
+        // hybrid architecture.
+        fn p_cores() -> Option<PCores> {
+            if std::fs::exists("/sys/devices/cpu").unwrap() {
+                // `/sys/devices/cpu` exists: this is not a hybrid CPU.
+                None
+            } else if std::fs::exists("/sys/devices/cpu_core").unwrap() {
+                // `/sys/devices/cpu_core/` exists: this is a hybrid CPU, and the `cpus` file
+                // within contains the list of P-cores. (`sys/devices/cpu_atom/cpus` contains
+                // the list of E-cores).
+                let list =
+                    std::fs::read_to_string("/sys/devices/cpu_core/cpus")
+                        .unwrap()
+                        .trim()
+                        .to_string();
+                eprintln!(
+                    "WARNING: hybrid Intel CPU detected; test suite will only use P-cores: {list}"
+                );
+                // Parse CPU list to extract the number of P-cores. This assumes the P-core ids are
+                // continuous, in format `m-n`.
+                let (first, last) = list
+                    .split_once("-")
+                    .unwrap_or_else(|| panic!("unsupported P-core list format: {list:?}."));
+                let first = first
+                    .parse::<usize>()
+                    .expect("expected a number at the start of the P-core list");
+                let last = last
+                    .parse::<usize>()
+                    .expect("expected a number at the end of the P-core list");
+                let len = last - first + 1; // e.g. "0-3" is four cores: [0, 1, 2, 3]
+                Some(PCores { len, list })
+            } else {
+                // Neither dir exists: something is wrong, because `/sys/devices/cpu` has been
+                // in Linux (on x86-64, at least) for over a decade.
+                eprintln!(
+                    "WARNING: `/sys/devices/{{cpu,cpu_core}}` not found; \
+                    unable to determine if CPU has a hybrid architecture"
+                );
+                None
+            }
+        }
+    } else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] {
+        // On ARM64/Linux there is no definitive way to distinguish P-cores from E-cores, so we
+        // must use a heuristic.
+        //
+        // Each core has a listed "capacity", a performance estimate relative to the most powerful
+        // core in the system (scaled 0-1024). For example, an ASUS GX10 Ascent has a Cortex-X925
+        // with 10 P-cores and a Cortex-A725 with 10 E-cores. The reported capacities are:
+        // * Cores  0- 4:  718  (E-cores in cluster 1 with 8MiB L3 cache)
+        // * Cores  5- 9:  997  (P-cores in cluster 1 with 8MiB L3 cache)
+        // * Cores 10-14:  731  (E-cores in cluster 2 with 16MiB L3 cache)
+        // * Cores 15-18: 1017  (P-cores in cluster 2 with 16MiB L3 cache)
+        // * Core     19: 1024  (P-core in cluster 2 with 16MiB L3 cache))
+        //
+        // The heuristic is that any core with a capacity at least 90% of the maximum capacity is
+        // considered a P-core, and any other core is considered an E-core. (The 718/731 and
+        // 997/1017 differences are presumably due to the L3 cache size. The reason for the
+        // 1017/1024 difference is unclear. Even though the P-cores are not all identical, they are
+        // close enough for our purposes.)
+        fn p_cores() -> Option<PCores> {
+            let mut caps = vec![];
+            for i in 0.. {
+                let path = format!("/sys/devices/system/cpu/cpu{i}/cpu_capacity");
+                if !std::fs::exists(&path).unwrap() {
+                    break;
+                }
+                let cap = std::fs::read_to_string(&path).unwrap().trim().parse::<usize>().unwrap();
+                caps.push((i, cap));
+            }
 
-#[cfg(target_os = "linux")]
-/// Makes the benchmark run only on Performance cores.
-fn run_on_p_cores(path: &Path, cpu_list: &str) -> Command {
-    // Parse CPU list to extract the number of P-cores!
-    // This assumes the P-core id's are continuous, in format `first_id-last_id`
-    let (core_start, core_end) = cpu_list
-        .split_once("-")
-        .unwrap_or_else(|| panic!("Unsupported P-core list format: {cpu_list:?}."));
-    let core_start: u32 = core_start
-        .parse()
-        .expect("Expected a number when parsing the start of the P-core list!");
-    let core_end: u32 = core_end
-        .parse()
-        .expect("Expected a number when parsing the end of the P-core list!");
-    let core_count = core_end - core_start + 1; // e.g. "0-3" is four cores: [0, 1, 2, 3]
-
-    let mut cmd = Command::new("taskset");
-    // Set job count to P-core count. This is done for 3 reasons:
-    // 1. The instruction count info for E-cores is often incomplete, and a substantial chunk of
-    //    events is lost.
-    // 2. The performance characteristics of E-cores are less reliable, so excluding them from the
-    //    benchmark makes things easier.
-    // 3. An unpredictable mix of P-core and E-core execution will give inconsistent results.
-    cmd.env("CARGO_BUILD_JOBS", format!("{core_count}"));
-    // Pass the P-core list to taskset to pin task to the P-core.
-    cmd.arg("--cpu-list");
-    cmd.arg(cpu_list);
-    cmd.arg(path);
-    cmd
-}
+            if let Some(max_cap) = caps.iter().map(|(_, cap)| cap).max() {
+                // Filter out cores that fail the 90% capacity check.
+                let cap_threshold = *max_cap as f64 * 0.9;
+                let p_cores: Vec<_> = caps.iter().filter_map(|(i, cap)| {
+                    if *cap as f64 >= cap_threshold {
+                        Some(i.to_string())
+                    } else {
+                        None
+                    }
+                }).collect();
 
-#[cfg(not(target_os = "linux"))]
-// Modify this stub if you want to add support for P-cores/E-cores on more OSs.
-fn run_on_p_cores(_path: &Path, _cpu_list: &str) -> Command {
-    todo!("Can't run commands on the P-cores on this platform");
+                if p_cores.len() == caps.len() {
+                    // All cores have roughly the same capacity; this is not a hybrid CPU.
+                    None
+                } else {
+                    let list = p_cores.join(",");
+                    eprintln!(
+                        "WARNING: hybrid ARM CPU detected; test suite will only use P-cores: {list}"
+                    );
+                    Some(PCores {
+                        len: p_cores.len(),
+                        list,
+                    })
+                }
+            } else {
+                eprintln!(
+                    "WARNING: `/sys/devices/system/cpu/cpu*/cpu_capacity` not found; \
+                    unable to determine if CPU has a hybrid architecture"
+                );
+                None
+            }
+        }
+    } else {
+        // Modify this stub if you want to add support for hybrid architectures on more platforms.
+        fn p_cores() -> Option<PCores> {
+            None
+        }
+    }
 }
 
 impl<'a> CargoProcess<'a> {
@@ -230,11 +285,17 @@ impl<'a> CargoProcess<'a> {
     }
 
     fn base_command(&self, cwd: &Path, subcommand: &str) -> Command {
-        // Processors with P-core and E-cores require special handling.
-        let mut cmd = if let Some(p_cores) = performance_cores() {
-            run_on_p_cores(Path::new(&self.toolchain.components.cargo), p_cores)
+        let cargo_path = Path::new(&self.toolchain.components.cargo);
+        let mut cmd = if let Some(p_cores) = (*P_CORES).as_ref() {
+            // Processors with P-cores and E-cores require special handling.
+            let mut cmd = Command::new("taskset");
+            cmd.env("CARGO_BUILD_JOBS", p_cores.len.to_string());
+            cmd.arg("--cpu-list");
+            cmd.arg(&p_cores.list);
+            cmd.arg(cargo_path);
+            cmd
         } else {
-            Command::new(Path::new(&self.toolchain.components.cargo))
+            Command::new(cargo_path)
         };
         cmd
             // Not all cargo invocations (e.g. `cargo clean`) need all of these
@@ -620,6 +681,11 @@ fn process_stat_output(
     let stdout = String::from_utf8(output.stdout.clone()).expect("utf8 output");
     let mut stats = Stats::new();
 
+    // ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
+    // `armv8_pmuv3_1/branche-misses/`.
+    #[cfg(all(target_os = "linux", target_arch = "aarch64"))]
+    let arm_p_core_events_re = regex::Regex::new(r"armv[0-9]_pmuv[0-9]_[0-9]/([^/]*)/").unwrap();
+
     let mut self_profile_dir: Option<PathBuf> = None;
     let mut self_profile_crate: Option<String> = None;
     for line in stdout.lines() {
@@ -670,24 +736,43 @@ fn process_stat_output(
                 }
             };
         }
+
         let mut parts = line.split(';').map(|s| s.trim());
         let cnt = get!(parts.next());
+        if cnt == "<not supported>" || cnt == "<not counted>" || cnt.is_empty() {
+            continue;
+        }
+
         let _unit = get!(parts.next());
-        let mut name = get!(parts.next());
-        // Map P-core events to normal events
-        if name == "cpu_core/instructions:u/" {
-            name = "instructions:u";
+
+        #[allow(unused_mut)]
+        let mut name = get!(parts.next()).to_string();
+        // Map P-core event name to normal event names.
+        cfg_if::cfg_if! {
+            if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] {
+                if name == "cpu_core/instructions:u/" {
+                    name = "instructions:u".to_string();
+                }
+            } else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] {
+                // ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
+                // `armv8_pmuv3_1/branche-misses/`.
+                if let Some(event) = arm_p_core_events_re.captures(&name) {
+                    name = event[1].to_string();
+                }
+            }
         }
+
         let _time = get!(parts.next());
+
         let pct = get!(parts.next());
-        if cnt == "<not supported>" || cnt == "<not counted>" || cnt.is_empty() {
-            continue;
-        }
         if !pct.starts_with("100.") {
+            // If this fails, it's probably because the CPU has a hybrid architecture and the
+            // metric is split across P-cores and E-cores. See `PCores`.
             panic!("measurement of `{name}` only active for {pct}% of the time");
         }
+
         stats.insert(
-            name.to_owned(),
+            name,
             cnt.parse()
                 .map_err(|e| DeserializeStatError::ParseError(cnt.to_string(), e))?,
         );