diff --git a/libsufr/src/suffix_array.rs b/libsufr/src/suffix_array.rs index 0faa9df..4662350 100644 --- a/libsufr/src/suffix_array.rs +++ b/libsufr/src/suffix_array.rs @@ -3,7 +3,7 @@ use crate::{ sufr_builder::SufrBuilder, sufr_file::SufrFile, types::{ - CountOptions, CountResult, ExtractOptions, ExtractResult, ListOptions, + BisectOptions, BisectResult, CountOptions, CountResult, ExtractOptions, ExtractResult, ListOptions, LocateOptions, LocateResult, SufrBuilderArgs, SufrMetadata, }, }; @@ -17,6 +17,7 @@ pub(crate) trait SuffixArrayTrait: Send + Sync { fn locate(&mut self, args: LocateOptions) -> Result>; fn metadata(&self) -> Result; fn string_at(&mut self, pos: usize, len: Option) -> Result; + fn bisect(&mut self, args: BisectOptions) -> Result>; } // -------------------------------------------------- @@ -48,6 +49,10 @@ impl SuffixArrayTrait for SuffixArray32 { fn string_at(&mut self, pos: usize, len: Option) -> Result { self.inner.string_at(pos, len) } + + fn bisect(&mut self, args: BisectOptions) -> Result> { + self.inner.bisect(args) + } } pub(crate) struct SuffixArray64 { @@ -79,6 +84,10 @@ impl SuffixArrayTrait for SuffixArray64 { fn string_at(&mut self, pos: usize, len: Option) -> Result { self.inner.string_at(pos, len) } + + fn bisect(&mut self, args: BisectOptions) -> Result> { + self.inner.bisect(args) + } } // -------------------------------------------------- @@ -137,6 +146,58 @@ impl SuffixArray { Self::read(&path, low_memory) } + + // -------------------------------------------------- + /// Bisect the index range of occurrences of queries. 
+ /// If the index range of a prefix is already known, + /// or if it is desirable to avoid enumerating every match, + /// this method can be used as a faster stand-in for `count` + /// ``` + /// use anyhow::Result; + /// use libsufr::{types::{BisectOptions, BisectResult}, suffix_array::SuffixArray}; + /// + /// fn main() -> Result<()> { + /// let mut suffix_array = SuffixArray::read("../data/inputs/1.sufr", false)?; + /// let opts_without_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: None, + /// }; + /// let result_without_prefix = suffix_array.bisect(opts_without_prefix)?; + /// assert_eq!( + /// result_without_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 2, first_position: 3, last_position: 4 }] + /// ); + /// let prefix_opts = BisectOptions { + /// queries: vec!["A".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: None, + /// }; + /// let prefix_result = suffix_array.bisect(prefix_opts)?[0].clone(); + /// let opts_with_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: Some(prefix_result), + /// }; + /// let result_with_prefix = suffix_array.bisect(opts_with_prefix)?; + /// assert_eq!( + /// result_with_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 0, first_position: 0, last_position: 0 }] + /// ); + /// Ok(()) + /// } + /// ``` + pub fn bisect(&mut self, args: BisectOptions) -> Result> { + self.inner.bisect(args) + } + // -------------------------------------------------- /// Count instances of queries /// diff --git 
a/libsufr/src/sufr_file.rs b/libsufr/src/sufr_file.rs index 5834a8c..e89aeb1 100644 --- a/libsufr/src/sufr_file.rs +++ b/libsufr/src/sufr_file.rs @@ -7,7 +7,7 @@ use crate::{ file_access::FileAccess, sufr_search::{SufrSearch, SufrSearchArgs}, types::{ - CountOptions, CountResult, ExtractOptions, ExtractResult, ExtractSequence, + BisectOptions, BisectResult, CountOptions, CountResult, ExtractOptions, ExtractResult, ExtractSequence, FromUsize, Int, ListOptions, LocateOptions, LocatePosition, LocateResult, SearchOptions, SearchResult, SeedMask, SuffixSortType, SufrMetadata, }, @@ -667,6 +667,124 @@ where Ok(()) } + // -------------------------------------------------- + /// Bisect the index range of occurrences of queries. + /// If the index range of a prefix is already known, + /// or if it is desirable to avoid enumerating every match, + /// this method can be used as a faster stand-in for `count` + /// ``` + /// use anyhow::Result; + /// use libsufr::{types::{BisectOptions, BisectResult}, sufr_file::SufrFile}; + /// + /// fn main() -> Result<()> { + /// let mut sufr = SufrFile::::read("../data/inputs/1.sufr", false)?; + /// // bisect without a prefix result, searching the whole suffix array: + /// let opts_without_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: None, + /// }; + /// let result_without_prefix = sufr.bisect(opts_without_prefix)?; + /// // ... 
both queries appear in the suffix array + /// assert_eq!( + /// result_without_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 2, first_position: 3, last_position: 4 }] + /// ); + /// // bisect within the range of a prefix result: + /// let prefix_opts = BisectOptions { + /// queries: vec!["A".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: None, + /// }; + /// let prefix_result = sufr.bisect(prefix_opts)?[0].clone(); + /// let opts_with_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: Some(prefix_result), + /// }; + /// let result_with_prefix = sufr.bisect(opts_with_prefix)?; + /// // ... the query AC is found within the range of the prefix result for A, but CG is not. + /// assert_eq!( + /// result_with_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 0, first_position: 0, last_position: 0 }] + /// ); + /// Ok(()) + /// } + /// ``` + pub fn bisect(&mut self, args: BisectOptions) -> Result> { + // Set memory mode + self.query_low_memory = args.low_memory; + if !self.query_low_memory { + self.set_suffix_array_mem(args.max_query_len)?; + } + + // Construct SufrSearch factory + let now = Instant::now(); + let new_search = || -> Result>> { + let suffix_array_file: FileAccess = FileAccess::new( + &self.filename, + self.suffix_array_pos as u64, + self.len_suffixes.to_usize(), + )?; + let text_file: FileAccess = FileAccess::new( + &self.filename, + self.text_pos as u64, + self.text_len.to_usize(), + )?; + let search_args = SufrSearchArgs { + text: &self.text, + text_len: self.text_len.to_usize(), + text_file, + file: suffix_array_file, + 
suffix_array: &self.suffix_array_mem, + rank: &self.suffix_array_rank_mem, + len_suffixes: self.len_suffixes.to_usize(), + sort_type: &self.sort_type, + max_query_len: args.max_query_len, + }; + Ok(RefCell::new(SufrSearch::new(search_args))) + }; + + // Retrieve the prefix result's index range. + // If no result was passed, default to the full range of the suffix array. + let n = self.len_suffixes.to_usize() - 1; + let search_range = match args.prefix_result { + Some(result) => (result.first_position, result.last_position), + _ => (0, n), + }; + + // Bisect each query in its own thread + let thread_local_search: ThreadLocal>> = + ThreadLocal::new(); + let mut res: Vec<_> = args + .queries + .clone() + .into_par_iter() + .enumerate() + .flat_map(|(query_num, query)| -> Result { + let mut search = + thread_local_search.get_or_try(new_search)?.borrow_mut(); + search.bisect(query_num, &query, search_range.0, search_range.1) + }) + .collect(); + res.sort_by_key(|r| r.query_num); + + info!( + "Bisection of {} queries finished in {:?}", + args.queries.len(), + now.elapsed() + ); + + Ok(res) + } + // -------------------------------------------------- /// Count the occurrences of queries in a suffix array /// @@ -1176,7 +1294,7 @@ mod test { sufr_file::SufrFile, types::{ ExtractOptions, ExtractResult, ExtractSequence, LocateOptions, - LocatePosition, LocateResult, + LocatePosition, LocateResult, BisectOptions, BisectResult, }, }; use anyhow::Result; @@ -1558,6 +1676,35 @@ mod test { Ok(()) } + // -------------------------------------------------- + #[test] + fn test_bisect() -> Result<()> { + let mut sufr = SufrFile::::read("../data/inputs/3.sufr", false)?; + // bisect "A" + let prefix = vec!["A".to_string()]; + let prefix_result = sufr.bisect(BisectOptions { + queries: prefix, + max_query_len: None, + low_memory: false, + prefix_result: None + })?[0].clone(); + // bisect "AA", "AC", "AG", "AT", "AN" within the range of "A". 
+ let queries = vec!["AA".to_string(), "AC".to_string(), "AG".to_string(), "AT".to_string(), "AN".to_string()]; + let queries_result = sufr.bisect(BisectOptions { + queries: queries, + max_query_len: None, + low_memory: false, + prefix_result: Some(prefix_result.clone()), + })?; + // because we queried all of the possible suffixes to "A", + // the count of "A" should be the sum of counts of queries. + assert_eq!( + prefix_result.count, + queries_result.iter().map(|res| res.count).sum(), + ); + Ok(()) + } + // -------------------------------------------------- // The "compare" function is now deeply nested inside the SuffixSearch // which is created inside the "suffix_search" function and I'm lost diff --git a/libsufr/src/sufr_search.rs b/libsufr/src/sufr_search.rs index e053191..2e2a7d3 100644 --- a/libsufr/src/sufr_search.rs +++ b/libsufr/src/sufr_search.rs @@ -3,7 +3,7 @@ use crate::{ file_access::FileAccess, types::{ - Comparison, FromUsize, Int, SearchResult, SearchResultLocations, SuffixSortType, + Comparison, FromUsize, Int, BisectResult, SearchResult, SearchResultLocations, SuffixSortType, }, util::find_lcp_full_offset, }; @@ -168,6 +168,49 @@ where } } + + // -------------------------------------------------- + /// Find the first and last positions of a query string in a suffix array, + /// given a range of viable positions. 
+ /// Returns a `BisectResult` + /// + /// Args: + /// * `query_num`: ordinal number of the query + /// * `query`: a string to search for + /// * `low`: the lowest position at which the query may occur + /// * `high`: the highest position at which the query may occur + pub fn bisect( + &mut self, + query_num: usize, + query: &str, + low: usize, + high: usize, + ) -> Result { + let qry = query.as_bytes(); + if let Some(start) = self.suffix_search_first(qry, low, high, 0, 0) { + // something was found + let end = self + .suffix_search_last(qry, start, high, high + 1, 0, 0) + .unwrap_or(start); + Ok(BisectResult { + query_num: query_num, + query: query.to_string(), + count: end - start + 1, + first_position: start, + last_position: end, + }) + } else { + // nothing was found + Ok(BisectResult { + query_num: query_num, + query: query.to_string(), + count: 0, + first_position: 0, + last_position: 0, + }) + } + } + // -------------------------------------------------- fn suffix_search_first( &mut self, diff --git a/libsufr/src/types.rs b/libsufr/src/types.rs index fc2b0de..fe14cbb 100644 --- a/libsufr/src/types.rs +++ b/libsufr/src/types.rs @@ -344,6 +344,45 @@ impl FromUsize for u64 { } } +// -------------------------------------------------- +/// Options for bisecting the index ranges of occurrences of query suffixes +#[derive(Debug, Clone)] +pub struct BisectOptions { + /// Vector of query strings + pub queries: Vec, + + /// Maximum query length for search + pub max_query_len: Option, + + /// When `true`, the suffix array will be read from disk. + /// When `false`, the suffix array will be placed into memory. + pub low_memory: bool, + + /// Optional, the bisect result for a query that is the common prefix of queries. + /// If passed, search for query ranges will be restricted to the range defined by prefix_result. 
+ pub prefix_result: Option, +} + +// -------------------------------------------------- +/// A struct representing the index range of occurrences of a suffix +#[derive(Debug, Clone, PartialEq)] +pub struct BisectResult { + /// The ordinal position of the original query + pub query_num: usize, + + /// The query string + pub query: String, + + /// The width of the interval + pub count: usize, + + /// The first index of a suffix matching the query + pub first_position: usize, + + /// The last index of a suffix matching the query + pub last_position: usize, +} + // -------------------------------------------------- /// Options for counting the occurrences of suffixes #[derive(Debug, Clone)]