From 3cd87f89b6b7dfadb8d2fc6a71eda6a697257938 Mon Sep 17 00:00:00 2001 From: "Tom D." Date: Sat, 27 Dec 2025 22:45:33 +0100 Subject: [PATCH 1/2] perf(tsort): avoid reading the whole input into memory and intern strings --- Cargo.lock | 12 ++ Cargo.toml | 5 +- src/uu/tsort/Cargo.toml | 5 +- src/uu/tsort/src/tsort.rs | 359 ++++++++++++++++++++++---------------- 4 files changed, 231 insertions(+), 150 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bf51e63c144..870a612840b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2711,6 +2711,16 @@ dependencies = [ "num-traits", ] +[[package]] +name = "string-interner" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23de088478b31c349c9ba67816fa55d9355232d63c3afea8bf513e31f0f1d2c0" +dependencies = [ + "hashbrown 0.15.4", + "serde", +] + [[package]] name = "strsim" version = "0.11.1" @@ -4043,6 +4053,8 @@ dependencies = [ "clap", "codspeed-divan-compat", "fluent", + "nix", + "string-interner", "tempfile", "thiserror 2.0.17", "uucore", diff --git a/Cargo.toml b/Cargo.toml index 80af38b85c6..632aa3d1b71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ # coreutils (uutils) # * see the repository LICENSE, README, and CONTRIBUTING files for more information -# spell-checker:ignore (libs) bigdecimal datetime serde bincode gethostid kqueue libselinux mangen memmap uuhelp startswith constness expl unnested logind cfgs +# spell-checker:ignore (libs) bigdecimal datetime serde bincode gethostid kqueue libselinux mangen memmap uuhelp startswith constness expl unnested logind cfgs interner [package] name = "coreutils" @@ -368,7 +368,8 @@ rust-ini = "0.21.0" same-file = "1.0.6" self_cell = "1.0.4" # FIXME we use the exact version because the new 0.5.3 requires an MSRV of 1.88 -selinux = "=0.5.2" +selinux = "0.5.2" +string-interner = "0.19.0" signal-hook = "0.4.1" tempfile = "3.15.0" terminal_size = "0.4.0" diff --git a/src/uu/tsort/Cargo.toml b/src/uu/tsort/Cargo.toml index 94b170223c1..72559199c8c 100644 --- a/src/uu/tsort/Cargo.toml +++ b/src/uu/tsort/Cargo.toml @@ -1,3 +1,4 @@ +#spell-checker:ignore (libs) interner [package] name = "uu_tsort" description = "tsort ~ (uutils) topologically sort input (partially ordered) pairs" @@ -19,9 +20,11 @@ path = "src/tsort.rs" [dependencies] clap = { workspace = true } +fluent = { workspace = true } +string-interner = { workspace = true } thiserror = { workspace = true } +nix = { workspace = true, features = ["fs"] } uucore = { workspace = true } -fluent = { workspace = true } [[bin]] name = "tsort" diff --git a/src/uu/tsort/src/tsort.rs b/src/uu/tsort/src/tsort.rs index 26c6f8ffc99..8eab8ab21c6 100644 --- a/src/uu/tsort/src/tsort.rs +++ b/src/uu/tsort/src/tsort.rs @@ -2,108 +2,95 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -//spell-checker:ignore TAOCP indegree +//spell-checker:ignore TAOCP indegree fadvise FADV +//spell-checker:ignore (libs) interner uclibc use clap::{Arg, ArgAction, Command}; use std::collections::hash_map::Entry; use std::collections::{HashMap, VecDeque}; use std::ffi::OsString; +use std::fs::File; +use std::io::{self, BufRead, BufReader}; use std::path::Path; +use string_interner::StringInterner; +use string_interner::backend::StringBackend; use thiserror::Error; use uucore::display::Quotable; use uucore::error::{UError, UResult, USimpleError}; -use uucore::{format_usage, show}; +use uucore::{format_usage, show, translate}; -use uucore::translate; +// short types for switching interning behavior on the fly. +type Sym = string_interner::symbol::SymbolU32; +type Interner = StringInterner>; mod options { pub const FILE: &str = "file"; } -#[derive(Debug, Error)] -enum TsortError { - /// The input file is actually a directory. - #[error("{input}: {message}", input = .0.maybe_quote(), message = translate!("tsort-error-is-dir"))] - IsDir(OsString), - - /// The number of tokens in the input data is odd. - /// - /// The list of edges must be even because each edge has two - /// components: a source node and a target node. - #[error("{input}: {message}", input = .0.maybe_quote(), message = translate!("tsort-error-odd"))] - NumTokensOdd(OsString), - - /// The graph contains a cycle. - #[error("{input}: {message}", input = .0.maybe_quote(), message = translate!("tsort-error-loop"))] - Loop(OsString), -} - -// Auxiliary struct, just for printing loop nodes via show! macro -#[derive(Debug, Error)] -#[error("{0}")] -struct LoopNode<'a>(&'a str); - -impl UError for TsortError {} -impl UError for LoopNode<'_> {} - #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; - let mut inputs: Vec = matches + let mut inputs = matches .get_many::(options::FILE) - .map(|vals| vals.cloned().collect()) - .unwrap_or_default(); - - if inputs.is_empty() { - inputs.push(OsString::from("-")); - } - - if inputs.len() > 1 { - return Err(USimpleError::new( - 1, - translate!( - "tsort-error-extra-operand", - "operand" => inputs[1].quote(), - "util" => uucore::util_name() - ), - )); - } - - let input = inputs .into_iter() - .next() - .expect(translate!("tsort-error-at-least-one-input").as_str()); + .flatten(); + + let input = match (inputs.next(), inputs.next()) { + (None, _) => { + return Err(USimpleError::new( + 1, + translate!("tsort-error-at-least-one-input"), + )); + } + (Some(input), None) => input, + (Some(_), Some(extra)) => { + return Err(USimpleError::new( + 1, + translate!( + "tsort-error-extra-operand", + "operand" => extra.quote(), + "util" => uucore::util_name() + ), + )); + } + }; - let data = if input == "-" { - let stdin = std::io::stdin(); - std::io::read_to_string(stdin)? + // Create the directed graph from pairs of tokens in the input data. + let mut g = Graph::new(input.to_string_lossy().to_string()); + if input == "-" { + process_input(io::stdin().lock(), &mut g)?; } else { let path = Path::new(&input); if path.is_dir() { - return Err(TsortError::IsDir(input.clone()).into()); + return Err(TsortError::IsDir(input.to_string_lossy().to_string()).into()); } - std::fs::read_to_string(path)? - }; - // Create the directed graph from pairs of tokens in the input data. - let mut g = Graph::new(input.clone()); - // Input is considered to be in the format - // From1 To1 From2 To2 ... - // with tokens separated by whitespaces - let mut edge_tokens = data.split_whitespace(); - // Note: this is equivalent to iterating over edge_tokens.chunks(2) - // but chunks() exists only for slices and would require unnecessary Vec allocation. - // Itertools::chunks() is not used due to unnecessary overhead for internal RefCells - loop { - // Try take next pair of tokens - let Some(from) = edge_tokens.next() else { - // no more tokens -> end of input. Graph constructed - break; - }; - let Some(to) = edge_tokens.next() else { - return Err(TsortError::NumTokensOdd(input.clone()).into()); - }; - g.add_edge(from, to); + let file = File::open(path)?; + + // advise the OS we will access the data sequentially if available. + #[cfg(any( + target_os = "linux", + target_os = "android", + target_os = "fuchsia", + target_os = "wasi", + target_env = "uclibc", + target_os = "freebsd", + ))] + { + use nix::fcntl::{PosixFadviseAdvice, posix_fadvise}; + use std::os::unix::io::AsFd; + + posix_fadvise( + file.as_fd(), // file descriptor + 0, // start of the file + 0, // length 0 = all + PosixFadviseAdvice::POSIX_FADV_SEQUENTIAL, + ) + .ok(); + } + + let reader = BufReader::new(file); + process_input(reader, &mut g)?; } g.run_tsort(); @@ -117,6 +104,7 @@ pub fn uu_app() -> Command { .override_usage(format_usage(&translate!("tsort-usage"))) .about(translate!("tsort-about")) .infer_long_args(true) + // no-op flag, needed for POSIX compatibility. .arg( Arg::new("warn") .short('w') @@ -128,11 +116,69 @@ pub fn uu_app() -> Command { .hide(true) .value_parser(clap::value_parser!(OsString)) .value_hint(clap::ValueHint::FilePath) - .num_args(0..) + .default_value("-") + .num_args(1..) .action(ArgAction::Append), ) } +#[derive(Debug, Error)] +enum TsortError { + /// The input file is actually a directory. + #[error("{input}: {message}", input = .0.maybe_quote(), message = translate!("tsort-error-is-dir"))] + IsDir(String), + + /// The number of tokens in the input data is odd. + /// + /// The length of the list of edges must be even because each edge has two + /// components: a source node and a target node. + #[error("{input}: {message}", input = .0.maybe_quote(), message = translate!("tsort-error-odd"))] + NumTokensOdd(String), + + /// The graph contains a cycle. + #[error("{input}: {message}", input = .0, message = translate!("tsort-error-loop"))] + Loop(String), + + /// Wrapper for bubbling up IO errors + #[error("{0}")] + IO(#[from] std::io::Error), +} + +// Auxiliary struct, just for printing loop nodes via show! macro +#[derive(Debug, Error)] +#[error("{0}")] +struct LoopNode<'a>(&'a str); + +impl UError for TsortError {} +impl UError for LoopNode<'_> {} + +fn process_input(reader: R, graph: &mut Graph) -> Result<(), TsortError> { + let mut pending: Option = None; + + // Input is considered to be in the format + // From1 To1 From2 To2 ... + // with tokens separated by whitespaces + + for line in reader.lines() { + let line = line?; + for token in line.split_whitespace() { + // Intern the token and get a Sym + let token_sym = graph.interner.get_or_intern(token); + + if let Some(from) = pending.take() { + graph.add_edge(from, token_sym); + } else { + pending = Some(token_sym); + } + } + } + if pending.is_some() { + return Err(TsortError::NumTokensOdd(graph.name())); + } + + Ok(()) +} + /// Find the element `x` in `vec` and remove it, returning its index. fn remove(vec: &mut Vec, x: T) -> Option where @@ -143,40 +189,54 @@ where }) } -// We use String as a representation of node here -// but using integer may improve performance. +#[derive(Clone, Copy, PartialEq, Eq)] +enum VisitedState { + Opened, + Closed, +} + #[derive(Default)] -struct Node<'input> { - successor_names: Vec<&'input str>, +struct Node { + successor_tokens: Vec, predecessor_count: usize, } -impl<'input> Node<'input> { - fn add_successor(&mut self, successor_name: &'input str) { - self.successor_names.push(successor_name); +impl Node { + fn add_successor(&mut self, successor_name: Sym) { + self.successor_tokens.push(successor_name); } } -struct Graph<'input> { - name: OsString, - nodes: HashMap<&'input str, Node<'input>>, +struct Graph { + name_sym: Sym, + nodes: HashMap, + interner: Interner, } -#[derive(Clone, Copy, PartialEq, Eq)] -enum VisitedState { - Opened, - Closed, -} - -impl<'input> Graph<'input> { - fn new(name: OsString) -> Self { +impl Graph { + fn new(name: String) -> Self { + let mut interner = Interner::new(); + let name_sym = interner.get_or_intern(name); Self { - name, + name_sym, + interner, nodes: HashMap::default(), } } - fn add_edge(&mut self, from: &'input str, to: &'input str) { + fn name(&self) -> String { + //SAFETY: the name is interned during graph creation and stored as name_sym. + // gives much better performance on lookup. + unsafe { self.interner.resolve_unchecked(self.name_sym).to_owned() } + } + fn get_node_name(&self, node_sym: Sym) -> &str { + //SAFETY: the only way to get a Sym is by manipulating an interned string. + // gives much better performance on lookup. + + unsafe { self.interner.resolve_unchecked(node_sym) } + } + + fn add_edge(&mut self, from: Sym, to: Sym) { let from_node = self.nodes.entry(from).or_default(); if from != to { from_node.add_successor(to); @@ -185,71 +245,76 @@ impl<'input> Graph<'input> { } } - fn remove_edge(&mut self, u: &'input str, v: &'input str) { - remove(&mut self.nodes.get_mut(u).unwrap().successor_names, v); - self.nodes.get_mut(v).unwrap().predecessor_count -= 1; + fn remove_edge(&mut self, u: Sym, v: Sym) { + remove( + &mut self + .nodes + .get_mut(&u) + .expect("node is part of the graph") + .successor_tokens, + v, + ); + self.nodes + .get_mut(&v) + .expect("node is part of the graph") + .predecessor_count -= 1; } /// Implementation of algorithm T from TAOCP (Don. Knuth), vol. 1. fn run_tsort(&mut self) { - // First, we find nodes that have no prerequisites (independent nodes). - // If no such node exists, then there is a cycle. - let mut independent_nodes_queue: VecDeque<&'input str> = self + let mut independent_nodes_queue: VecDeque = self .nodes .iter() - .filter_map(|(&name, node)| { + .filter_map(|(&sym, node)| { if node.predecessor_count == 0 { - Some(name) + Some(sym) } else { None } }) .collect(); - // To make sure the resulting ordering is deterministic we - // need to order independent nodes. - // - // FIXME: this doesn't comply entirely with the GNU coreutils - // implementation. - independent_nodes_queue.make_contiguous().sort_unstable(); + // Sort by resolved string for deterministic output + independent_nodes_queue + .make_contiguous() + .sort_unstable_by(|a, b| self.get_node_name(*a).cmp(self.get_node_name(*b))); while !self.nodes.is_empty() { - // Get the next node (breaking any cycles necessary to do so). let v = self.find_next_node(&mut independent_nodes_queue); - println!("{v}"); - if let Some(node_to_process) = self.nodes.remove(v) { - for successor_name in node_to_process.successor_names.into_iter().rev() { - let successor_node = self.nodes.get_mut(successor_name).unwrap(); + println!("{}", self.get_node_name(v)); + if let Some(node_to_process) = self.nodes.remove(&v) { + for successor_name in node_to_process.successor_tokens.into_iter().rev() { + // we reverse to match GNU tsort order + let successor_node = self + .nodes + .get_mut(&successor_name) + .expect("node is part of the graph"); successor_node.predecessor_count -= 1; if successor_node.predecessor_count == 0 { - // If we find nodes without any other prerequisites, we add them to the queue. independent_nodes_queue.push_back(successor_name); } } } } } - - /// Get the in-degree of the node with the given name. - fn indegree(&self, name: &str) -> Option { - self.nodes.get(name).map(|data| data.predecessor_count) + pub fn indegree(&self, sym: Sym) -> Option { + self.nodes.get(&sym).map(|data| data.predecessor_count) } - // Pre-condition: self.nodes is non-empty. - fn find_next_node(&mut self, frontier: &mut VecDeque<&'input str>) -> &'input str { + fn find_next_node(&mut self, frontier: &mut VecDeque) -> Sym { // If there are no nodes of in-degree zero but there are still // un-visited nodes in the graph, then there must be a cycle. - // We need to find the cycle, display it, and then break the - // cycle. + // We need to find the cycle, display it on stderr, and break it to go on. // // A cycle is guaranteed to be of length at least two. We break // the cycle by deleting an arbitrary edge (the first). That is // not necessarily the optimal thing, but it should be enough to - // continue making progress in the graph traversal. + // continue making progress in the graph traversal, and matches GNU tsort behavior. // // It is possible that deleting the edge does not actually // result in the target node having in-degree zero, so we repeat // the process until such a node appears. + loop { match frontier.pop_front() { None => self.find_and_break_cycle(frontier), @@ -258,27 +323,28 @@ impl<'input> Graph<'input> { } } - fn find_and_break_cycle(&mut self, frontier: &mut VecDeque<&'input str>) { + fn find_and_break_cycle(&mut self, frontier: &mut VecDeque) { let cycle = self.detect_cycle(); - show!(TsortError::Loop(self.name.clone())); - for &node in &cycle { - show!(LoopNode(node)); + show!(TsortError::Loop(self.name())); + for &sym in &cycle { + show!(LoopNode(self.get_node_name(sym))); } let u = *cycle.last().expect("cycle must be non-empty"); let v = cycle[0]; self.remove_edge(u, v); - if self.indegree(v).unwrap() == 0 { + if self.indegree(v).expect("node is part of the graph") == 0 { frontier.push_back(v); } } - fn detect_cycle(&self) -> Vec<&'input str> { - let mut nodes: Vec<_> = self.nodes.keys().collect(); - nodes.sort_unstable(); + fn detect_cycle(&self) -> Vec { + // Sort by resolved string for deterministic output + let mut nodes: Vec<_> = self.nodes.keys().copied().collect(); + nodes.sort_unstable_by(|a, b| self.get_node_name(*a).cmp(self.get_node_name(*b))); let mut visited = HashMap::new(); let mut stack = Vec::with_capacity(self.nodes.len()); - for node in nodes { + for &node in &nodes { if self.dfs(node, &mut visited, &mut stack) { let (loop_entry, _) = stack.pop().expect("loop is not empty"); @@ -294,13 +360,15 @@ impl<'input> Graph<'input> { fn dfs<'a>( &'a self, - node: &'input str, - visited: &mut HashMap<&'input str, VisitedState>, - stack: &mut Vec<(&'input str, &'a [&'input str])>, + node: Sym, + visited: &mut HashMap, + stack: &mut Vec<(Sym, &'a [Sym])>, ) -> bool { stack.push(( node, - self.nodes.get(node).map_or(&[], |n| &n.successor_names), + self.nodes + .get(&node) + .map_or(&[], |n: &Node| &n.successor_tokens), )); let state = *visited.entry(node).or_insert(VisitedState::Opened); @@ -320,22 +388,19 @@ impl<'input> Graph<'input> { match visited.entry(next_node) { Entry::Vacant(v) => { - // It's a first time we enter this node + // first visit of the node v.insert(VisitedState::Opened); stack.push(( next_node, self.nodes - .get(next_node) - .map_or(&[], |n| &n.successor_names), + .get(&next_node) + .map_or(&[], |n| &n.successor_tokens), )); } Entry::Occupied(o) => { if *o.get() == VisitedState::Opened { - // we are entering the same opened node again -> loop found - // stack contains it - // - // But part of the stack may not be belonging to this loop - // push found node to the stack to be able to trace the beginning of the loop + // We have found a node that was already visited by another iteration => loop completed + // the stack may contain unrelated nodes. This allows narrowing the loop down. stack.push((next_node, &[])); return true; } From b9604fed9d6ff4460ab43cfadb5f0fcbbc33e6f0 Mon Sep 17 00:00:00 2001 From: "Tom D." Date: Sat, 27 Dec 2025 22:45:33 +0100 Subject: [PATCH 2/2] perf(tsort): avoid redundant check on input --- src/uu/tsort/src/tsort.rs | 43 ++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/uu/tsort/src/tsort.rs b/src/uu/tsort/src/tsort.rs index 8eab8ab21c6..70c7c54c7a0 100644 --- a/src/uu/tsort/src/tsort.rs +++ b/src/uu/tsort/src/tsort.rs @@ -10,7 +10,6 @@ use std::collections::{HashMap, VecDeque}; use std::ffi::OsString; use std::fs::File; use std::io::{self, BufRead, BufReader}; -use std::path::Path; use string_interner::StringInterner; use string_interner::backend::StringBackend; use thiserror::Error; @@ -60,14 +59,33 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { if input == "-" { process_input(io::stdin().lock(), &mut g)?; } else { + let file = open_file(input)?; + + let reader = BufReader::new(file); + process_input(reader, &mut g)?; + } + + g.run_tsort(); + Ok(()) +} + +fn open_file(input: &OsString) -> Result { + // Windows throws a permission denied error when trying to read a directory, + // so we need to check manually beforehand. + #[cfg(windows)] + { + use std::path::Path; let path = Path::new(&input); if path.is_dir() { - return Err(TsortError::IsDir(input.to_string_lossy().to_string()).into()); + return Err(TsortError::IsDir(input.to_string_lossy().to_string())); } - let file = File::open(path)?; - - // advise the OS we will access the data sequentially if available. + Ok(File::open(path)?) + } + // advise the OS that we are going to read the file sequentially, if available. + #[cfg(not(windows))] + { + let file = File::open(input)?; #[cfg(any( target_os = "linux", target_os = "android", @@ -88,13 +106,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { ) .ok(); } - - let reader = BufReader::new(file); - process_input(reader, &mut g)?; + Ok(file) } - - g.run_tsort(); - Ok(()) } pub fn uu_app() -> Command { @@ -160,7 +173,13 @@ fn process_input(reader: R, graph: &mut Graph) -> Result<(), TsortEr // with tokens separated by whitespaces for line in reader.lines() { - let line = line?; + let line = line.map_err(|e| { + if e.kind() == io::ErrorKind::IsADirectory { + TsortError::IsDir(graph.name()) + } else { + e.into() + } + })?; for token in line.split_whitespace() { // Intern the token and get a Sym let token_sym = graph.interner.get_or_intern(token);