Skip to content

Commit 759a79d

Browse files
authored
Merge pull request #2356 from nnethercote/arm-hybrid-support
ARM hybrid CPU support
2 parents f8d2bd3 + 2902893 commit 759a79d

File tree

5 files changed

+172
-72
lines changed

5 files changed

+172
-72
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

collector/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ serde = { workspace = true, features = ["derive"] }
1818
serde_json = { workspace = true }
1919
tokio = { workspace = true, features = ["rt", "process"] }
2020

21+
cfg-if = "1"
2122
thiserror = "2"
2223
tempfile = "3"
2324
libc = "0.2"

collector/src/compile/benchmark/target.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::{fmt, str::FromStr};
55
/// https://doc.rust-lang.org/nightly/rustc/platform-support.html
66
///
77
/// Presently we only support x86_64
8+
/// FIXME: we actually support Windows and aarch64, but that isn't captured here.
89
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, serde::Deserialize)]
910
pub enum Target {
1011
/// `x86_64-unknown-linux-gnu`

collector/src/compile/execute/mod.rs

Lines changed: 169 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -134,66 +134,137 @@ pub struct CargoProcess<'a> {
134134
pub target: Target,
135135
pub workspace_package: Option<String>,
136136
}
137-
/// Returns an optional list of Performance CPU cores, if the system has P and E cores.
138-
/// This list *should* be in a format suitable for the `taskset` command.
139-
#[cfg(target_os = "linux")]
140-
fn performance_cores() -> Option<&'static String> {
141-
use std::sync::LazyLock;
142-
static PERFORMANCE_CORES: LazyLock<Option<String>> = LazyLock::new(|| {
143-
if std::fs::exists("/sys/devices/cpu").expect("Could not check the CPU architecture details: could not check if `/sys/devices/cpu` exists!") {
144-
// If /sys/devices/cpu exists, then this is not a "Performance-hybrid" CPU.
145-
None
146-
}
147-
else if std::fs::exists("/sys/devices/cpu_core").expect("Could not check the CPU architecture detali: could not check if `/sys/devices/cpu_core` exists!") {
148-
// If /sys/devices/cpu_core exists, then this is a "Performance-hybrid" CPU.
149-
eprintln!("WARNING: Performance-Hybrid CPU detected. `rustc-perf` can't run properly on Efficency cores: test suite will only use Performance cores!");
150-
Some(std::fs::read_to_string("/sys/devices/cpu_core/cpus").unwrap().trim().to_string())
151-
} else {
152-
// If neither dir exists, then something is wrong - `/sys/devices/cpu` has been in Linux for over a decade.
153-
eprintln!("WARNING: neither `/sys/devices/cpu` nor `/sys/devices/cpu_core` present, unable to determine if this CPU has a Performance-Hybrid architecture.");
154-
None
155-
}
156-
});
157-
(*PERFORMANCE_CORES).as_ref()
158-
}
159137

160-
#[cfg(not(target_os = "linux"))]
161-
// Modify this stub if you want to add support for P/E cores on more OSs
162-
fn performance_cores() -> Option<&'static String> {
163-
None
138+
// Some CPUs have a hybrid architecture with a mixture of P-cores (performance) and E-cores (efficiency).
139+
// When benchmarking we use `taskset` to restrict execution to P-cores. Why?
140+
// 1. The instruction count info for E-cores is often incomplete, and a substantial chunk of events
141+
// is lost.
142+
// 2. The performance characteristics of E-cores are less reliable, so excluding them from the
143+
// benchmark makes things easier.
144+
// 3. An unpredictable mix of P-core and E-core execution can give inconsistent results.
145+
//
146+
// If a hybrid architecture is detected, this type is used to hold information about the P-cores.
147+
// The detection method used varies across platforms.
148+
#[derive(Debug)]
149+
struct PCores {
150+
/// The number of P-cores.
151+
len: usize,
152+
/// The list of P-cores, in a form suitable for passing to `taskset`.
153+
list: String,
164154
}
165155

166-
#[cfg(target_os = "linux")]
167-
/// Makes the benchmark run only on Performance cores.
168-
fn run_on_p_cores(path: &Path, cpu_list: &str) -> Command {
169-
// Parse CPU list to extract the number of P cores!
170-
// This assumes the P core id's are countinus, in format `fisrt_id-last_id`
171-
let (core_start, core_end) = cpu_list
172-
.split_once("-")
173-
.unwrap_or_else(|| panic!("Unsuported P core list format: {cpu_list:?}."));
174-
let core_start: u32 = core_start
175-
.parse()
176-
.expect("Expected a number when parsing the start of the P core list!");
177-
let core_end: u32 = core_end
178-
.parse()
179-
.expect("Expected a number when parsing the end of the P core list!");
180-
let core_count = core_end - core_start;
181-
let mut cmd = Command::new("taskset");
182-
// Set job count to P core count - this is done for 2 reasons:
183-
// 1. The instruction count info for E core is often very incompleate - a substantial chunk of events is lost.
184-
// 2. The performance charcteristics of E cores are less reliable, so excluding them from the benchmark makes things easier.
185-
cmd.env("CARGO_BUILD_JOBS", format!("{core_count}"));
186-
// pass the P core list to taskset to pin task to the P core.
187-
cmd.arg("--cpu-list");
188-
cmd.arg(cpu_list);
189-
cmd.arg(path);
190-
cmd
191-
}
156+
static P_CORES: LazyLock<Option<PCores>> = LazyLock::new(p_cores);
157+
158+
cfg_if::cfg_if! {
159+
if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] {
160+
// On x86-64/Linux we look for the presence of `/sys/devices/cpu_core/` which indicates a
161+
// hybrid architecture.
162+
fn p_cores() -> Option<PCores> {
163+
if std::fs::exists("/sys/devices/cpu").unwrap() {
164+
// `/sys/devices/cpu` exists: this is not a hybrid CPU.
165+
None
166+
} else if std::fs::exists("/sys/devices/cpu_core").unwrap() {
167+
// `/sys/devices/cpu_core/` exists: this is a hybrid CPU, and the `cpus` file
168+
// within contains the list of P-cores. (`/sys/devices/cpu_atom/cpus` contains
169+
// the list of E-cores).
170+
let list =
171+
std::fs::read_to_string("/sys/devices/cpu_core/cpus")
172+
.unwrap()
173+
.trim()
174+
.to_string();
175+
eprintln!(
176+
"WARNING: hybrid Intel CPU detected; test suite will only use P-cores: {list}"
177+
);
178+
// Parse CPU list to extract the number of P-cores. This assumes the P-core ids are
179+
// contiguous, in format `m-n`.
180+
let (first, last) = list
181+
.split_once("-")
182+
.unwrap_or_else(|| panic!("unsupported P-core list format: {list:?}."));
183+
let first = first
184+
.parse::<usize>()
185+
.expect("expected a number at the start of the P-core list");
186+
let last = last
187+
.parse::<usize>()
188+
.expect("expected a number at the end of the P-core list");
189+
let len = last - first + 1; // e.g. "0-3" is four cores: [0, 1, 2, 3]
190+
Some(PCores { len, list })
191+
} else {
192+
// Neither dir exists: something is wrong, because `/sys/devices/cpu` has been
193+
// in Linux (on x86-64, at least) for over a decade.
194+
eprintln!(
195+
"WARNING: `/sys/devices/{{cpu,cpu_core}}` not found; \
196+
unable to determine if CPU has a hybrid architecture"
197+
);
198+
None
199+
}
200+
}
201+
} else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] {
202+
// On ARM64/Linux there is no definitive way to distinguish P-cores from E-cores, so we
203+
// must use a heuristic.
204+
//
205+
// Each core has a listed "capacity", a performance estimate relative to the most powerful
206+
// core in the system (scaled 0-1024). For example, an ASUS GX10 Ascent has a Cortex-X925
207+
// with 10 P-cores and a Cortex-A725 with 10 E-cores. The reported capacities are:
208+
// * Cores 0- 4: 718 (E-cores in cluster 1 with 8MiB L3 cache)
209+
// * Cores 5- 9: 997 (P-cores in cluster 1 with 8MiB L3 cache)
210+
// * Cores 10-14: 731 (E-cores in cluster 2 with 16MiB L3 cache)
211+
// * Cores 15-18: 1017 (P-cores in cluster 2 with 16MiB L3 cache)
212+
// * Core 19: 1024 (P-core in cluster 2 with 16MiB L3 cache)
213+
//
214+
// The heuristic is that any core with a capacity at least 90% of the maximum capacity is
215+
// considered a P-core, and any other core is considered an E-core. (The 718/731 and
216+
// 997/1017 differences are presumably due to the L3 cache size. The reason for the
217+
// 1017/1024 difference is unclear. Even though the P-cores are not all identical, they are
218+
// close enough for our purposes.)
219+
fn p_cores() -> Option<PCores> {
220+
let mut caps = vec![];
221+
for i in 0.. {
222+
let path = format!("/sys/devices/system/cpu/cpu{i}/cpu_capacity");
223+
if !std::fs::exists(&path).unwrap() {
224+
break;
225+
}
226+
let cap = std::fs::read_to_string(&path).unwrap().trim().parse::<usize>().unwrap();
227+
caps.push((i, cap));
228+
}
229+
230+
if let Some(max_cap) = caps.iter().map(|(_, cap)| cap).max() {
231+
// Filter out cores that fail the 90% capacity check.
232+
let cap_threshold = *max_cap as f64 * 0.9;
233+
let p_cores: Vec<_> = caps.iter().filter_map(|(i, cap)| {
234+
if *cap as f64 >= cap_threshold {
235+
Some(i.to_string())
236+
} else {
237+
None
238+
}
239+
}).collect();
192240

193-
#[cfg(not(target_os = "linux"))]
194-
// Modify this stub if you want to add support for P/E cores on more OSs
195-
fn run_on_p_cores(_path: &Path, _cpu_list: &str) -> Command {
196-
todo!("Can't run commands on the P cores on this platform");
241+
if p_cores.len() == caps.len() {
242+
// All cores have roughly the same capacity; this is not a hybrid CPU.
243+
None
244+
} else {
245+
let list = p_cores.join(",");
246+
eprintln!(
247+
"WARNING: hybrid ARM CPU detected; test suite will only use P-cores: {list}"
248+
);
249+
Some(PCores {
250+
len: p_cores.len(),
251+
list,
252+
})
253+
}
254+
} else {
255+
eprintln!(
256+
"WARNING: `/sys/devices/system/cpu/cpu*/cpu_capacity` not found; \
257+
unable to determine if CPU has a hybrid architecture"
258+
);
259+
None
260+
}
261+
}
262+
} else {
263+
// Modify this stub if you want to add support for hybrid architectures on more platforms.
264+
fn p_cores() -> Option<PCores> {
265+
None
266+
}
267+
}
197268
}
198269

199270
impl<'a> CargoProcess<'a> {
@@ -214,11 +285,17 @@ impl<'a> CargoProcess<'a> {
214285
}
215286

216287
fn base_command(&self, cwd: &Path, subcommand: &str) -> Command {
217-
// Processors with P and E cores require special handling
218-
let mut cmd = if let Some(p_cores) = performance_cores() {
219-
run_on_p_cores(Path::new(&self.toolchain.components.cargo), p_cores)
288+
let cargo_path = Path::new(&self.toolchain.components.cargo);
289+
let mut cmd = if let Some(p_cores) = (*P_CORES).as_ref() {
290+
// Processors with P-cores and E-cores require special handling.
291+
let mut cmd = Command::new("taskset");
292+
cmd.env("CARGO_BUILD_JOBS", p_cores.len.to_string());
293+
cmd.arg("--cpu-list");
294+
cmd.arg(&p_cores.list);
295+
cmd.arg(cargo_path);
296+
cmd
220297
} else {
221-
Command::new(Path::new(&self.toolchain.components.cargo))
298+
Command::new(cargo_path)
222299
};
223300
cmd
224301
// Not all cargo invocations (e.g. `cargo clean`) need all of these
@@ -604,6 +681,11 @@ fn process_stat_output(
604681
let stdout = String::from_utf8(output.stdout.clone()).expect("utf8 output");
605682
let mut stats = Stats::new();
606683

684+
// ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
685+
// `armv8_pmuv3_1/branch-misses/`.
686+
#[cfg(all(target_os = "linux", target_arch = "aarch64"))]
687+
let arm_p_core_events_re = regex::Regex::new(r"armv[0-9]_pmuv[0-9]_[0-9]/([^/]*)/").unwrap();
688+
607689
let mut self_profile_dir: Option<PathBuf> = None;
608690
let mut self_profile_crate: Option<String> = None;
609691
for line in stdout.lines() {
@@ -654,24 +736,43 @@ fn process_stat_output(
654736
}
655737
};
656738
}
739+
657740
let mut parts = line.split(';').map(|s| s.trim());
658741
let cnt = get!(parts.next());
742+
if cnt == "<not supported>" || cnt == "<not counted>" || cnt.is_empty() {
743+
continue;
744+
}
745+
659746
let _unit = get!(parts.next());
660-
let mut name = get!(parts.next());
661-
// Map P-core events to normal events
662-
if name == "cpu_core/instructions:u/" {
663-
name = "instructions:u";
747+
748+
#[allow(unused_mut)]
749+
let mut name = get!(parts.next()).to_string();
750+
// Map P-core event names to normal event names.
751+
cfg_if::cfg_if! {
752+
if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] {
753+
if name == "cpu_core/instructions:u/" {
754+
name = "instructions:u".to_string();
755+
}
756+
} else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] {
757+
// ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
758+
// `armv8_pmuv3_1/branch-misses/`.
759+
if let Some(event) = arm_p_core_events_re.captures(&name) {
760+
name = event[1].to_string();
761+
}
762+
}
664763
}
764+
665765
let _time = get!(parts.next());
766+
666767
let pct = get!(parts.next());
667-
if cnt == "<not supported>" || cnt == "<not counted>" || cnt.is_empty() {
668-
continue;
669-
}
670768
if !pct.starts_with("100.") {
769+
// If this fails, it's probably because the CPU has a hybrid architecture and the
770+
// metric is split across P-cores and E-cores. See `PCores`.
671771
panic!("measurement of `{name}` only active for {pct}% of the time");
672772
}
773+
673774
stats.insert(
674-
name.to_owned(),
775+
name,
675776
cnt.parse()
676777
.map_err(|e| DeserializeStatError::ParseError(cnt.to_string(), e))?,
677778
);

site/frontend/package-lock.json

Lines changed: 0 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)