From 583e392ae101d04e889dfab315a85d7fcf940a45 Mon Sep 17 00:00:00 2001 From: George Glidden-Handgis <47768122+georgeglidden@users.noreply.github.com> Date: Tue, 29 Apr 2025 11:09:57 -0700 Subject: [PATCH 1/6] first draft for fast bisect operation, implementing BisectOptions and BisectResult in types, bisect in suffix_array, suffix_file suffix_search. supports bisecting with or without a prefix result, which specifies the range of suffixes to search. if a prefix result is not passed, the full range of suffixes is used. --- libsufr/src/suffix_array.rs | 37 ++++++++++++++++++- libsufr/src/sufr_file.rs | 72 ++++++++++++++++++++++++++++++++++++- libsufr/src/sufr_search.rs | 32 ++++++++++++++++- libsufr/src/types.rs | 37 +++++++++++++++++++ 4 files changed, 175 insertions(+), 3 deletions(-) diff --git a/libsufr/src/suffix_array.rs b/libsufr/src/suffix_array.rs index 0faa9df..bb06bfe 100644 --- a/libsufr/src/suffix_array.rs +++ b/libsufr/src/suffix_array.rs @@ -3,7 +3,7 @@ use crate::{ sufr_builder::SufrBuilder, sufr_file::SufrFile, types::{ - CountOptions, CountResult, ExtractOptions, ExtractResult, ListOptions, + BisectOptions, BisectResult, CountOptions, CountResult, ExtractOptions, ExtractResult, ListOptions, LocateOptions, LocateResult, SufrBuilderArgs, SufrMetadata, }, }; @@ -17,6 +17,7 @@ pub(crate) trait SuffixArrayTrait: Send + Sync { fn locate(&mut self, args: LocateOptions) -> Result>; fn metadata(&self) -> Result; fn string_at(&mut self, pos: usize, len: Option) -> Result; + fn bisect(&mut self, args: BisectOptions) -> Result>; } // -------------------------------------------------- @@ -48,6 +49,10 @@ impl SuffixArrayTrait for SuffixArray32 { fn string_at(&mut self, pos: usize, len: Option) -> Result { self.inner.string_at(pos, len) } + + fn bisect(&mut self, args: BisectOptions) -> Result> { + self.inner.bisect(args) + } } pub(crate) struct SuffixArray64 { @@ -79,6 +84,10 @@ impl SuffixArrayTrait for SuffixArray64 { fn string_at(&mut self, pos: usize, len: 
Option) -> Result { self.inner.string_at(pos, len) } + + fn bisect(&mut self, args: BisectOptions) -> Result> { + self.inner.bisect(args) + } } // -------------------------------------------------- @@ -137,6 +146,32 @@ impl SuffixArray { Self::read(&path, low_memory) } + + // -------------------------------------------------- + /// Bisect the index range of occurences of queries. + /// If the index range of a prefix is already known, + /// or if it is desirable to avoid enumerating every match, + // this method can be used as a faster stand-in for `count` + /// ``` + /// use anyhow::Result; + /// use libsufr::{types::{BisectOptions, BisectResult}, sufr_file::SufrFile}; + /// + /// fn main() -> Result<()> { + /// let mut sufr = SufrFile::::read("../data/inputs/1.sufr", false)?; + /// let opts = BisectOptions { + /// queries: vec!["AC".to_string(), "AG".to_string(), "GT".to_string()], + /// max_query_len: None, + /// low_memory: true, + /// prefix_result: None, + /// }; + /// let res = sufr.bisect(opts)?; + /// Ok(()) + /// } + /// ``` + pub fn bisect(&mut self, args: BisectOptions) -> Result> { + self.inner.bisect(args) + } + // -------------------------------------------------- /// Count instances of queries /// diff --git a/libsufr/src/sufr_file.rs b/libsufr/src/sufr_file.rs index 5834a8c..d4aba1e 100644 --- a/libsufr/src/sufr_file.rs +++ b/libsufr/src/sufr_file.rs @@ -7,7 +7,7 @@ use crate::{ file_access::FileAccess, sufr_search::{SufrSearch, SufrSearchArgs}, types::{ - CountOptions, CountResult, ExtractOptions, ExtractResult, ExtractSequence, + BisectOptions, BisectResult, CountOptions, CountResult, ExtractOptions, ExtractResult, ExtractSequence, FromUsize, Int, ListOptions, LocateOptions, LocatePosition, LocateResult, SearchOptions, SearchResult, SeedMask, SuffixSortType, SufrMetadata, }, @@ -667,6 +667,76 @@ where Ok(()) } + // -------------------------------------------------- + /// Bisect the index range of occurences of queries. 
+ /// If the index range of a prefix is already known, + /// or if it is desirable to avoid enumerating every match, + // this method can be used as a faster stand-in for `count` + /// ``` + /// use anyhow::Result; + /// use libsufr::{types::{BisectOptions, BisectResult}, sufr_file::SufrFile}; + /// + /// fn main() -> Result<()> { + /// let mut sufr = SufrFile::::read("../data/inputs/1.sufr", false)?; + /// let opts = BisectOptions { + /// queries: vec!["AC".to_string(), "AG".to_string(), "GT".to_string()], + /// max_query_len: None, + /// low_memory: true, + /// prefix_result: None, + /// }; + /// let res = sufr.bisect(opts)?; + /// Ok(()) + /// } + /// ``` + pub fn bisect(&mut self, args: BisectOptions) -> Result> { + // 1. retrieve the prefix result's index range. + // if no result was passed, deafult to the full range of the suffix array. + let n = self.len_suffixes.to_usize() - 1; + let search_range = match args.prefix_result { + Some(result) => (result.first_position, result.last_position), + _ => (0, n), + }; + + // 2. create a SufrSearch struct + let suffix_array_file: FileAccess = FileAccess::new( + &self.filename, + self.suffix_array_pos as u64, + self.len_suffixes.to_usize(), + )?; + let text_file: FileAccess = FileAccess::new( + &self.filename, + self.text_pos as u64, + self.text_len.to_usize(), + )?; + let search_args = SufrSearchArgs { + text: &self.text, + text_len: self.text_len.to_usize(), + text_file, + file: suffix_array_file, + suffix_array: &self.suffix_array_mem, + rank: &self.suffix_array_rank_mem, + len_suffixes: self.len_suffixes.to_usize(), + sort_type: &self.sort_type, + max_query_len: args.max_query_len, + }; + let mut search = SufrSearch::new(search_args); + + // 3. bisect each query + let bisects = args + .queries + .clone() + .iter() + .enumerate() + .map(|(query_num, query)| -> BisectResult { + search.bisect(query_num, &query, search_range.0, search_range.1).unwrap() + }) + .collect(); + + // TODO: multithreading. 
will need to rework 2 and 3 to resemble suffix_search. + + Ok(bisects) + } + // -------------------------------------------------- /// Count the occurrences of queries in a suffix array /// diff --git a/libsufr/src/sufr_search.rs b/libsufr/src/sufr_search.rs index e053191..4235ff7 100644 --- a/libsufr/src/sufr_search.rs +++ b/libsufr/src/sufr_search.rs @@ -3,7 +3,7 @@ use crate::{ file_access::FileAccess, types::{ - Comparison, FromUsize, Int, SearchResult, SearchResultLocations, SuffixSortType, + Comparison, FromUsize, Int, BisectResult, SearchResult, SearchResultLocations, SuffixSortType, }, util::find_lcp_full_offset, }; @@ -168,6 +168,36 @@ where } } + pub fn bisect( + &mut self, + query_num: usize, + query: &str, + low: usize, + high: usize, + ) -> Result { + let qry = query.as_bytes(); + if let Some(start) = self.suffix_search_first(qry, low, high, 0, 0) { + // something was found + let end = self + .suffix_search_last(qry, start, high, high, 0, 0) + .unwrap_or(start); + Ok(BisectResult { + query_num: query_num, + query: query.to_string(), + first_position: start, + last_position: end, + }) + } else { + // nothing was found + Ok(BisectResult { + query_num: query_num, + query: query.to_string(), + first_position: self.len_suffixes, + last_position: 0, + }) + } + } + // -------------------------------------------------- fn suffix_search_first( &mut self, diff --git a/libsufr/src/types.rs b/libsufr/src/types.rs index fc2b0de..b88ff6c 100644 --- a/libsufr/src/types.rs +++ b/libsufr/src/types.rs @@ -344,6 +344,43 @@ impl FromUsize for u64 { } } +// -------------------------------------------------- +/// Options for bisecting the index ranges of occurrences of query suffixes +#[derive(Debug, Clone)] +pub struct BisectOptions { + /// Vector of query strings + pub queries: Vec, + + /// Maximum query length for search + pub max_query_len: Option, + + /// When `true`, the suffix array will be read from disk.
+ /// When `false`, the suffix array will be placed into memory. + pub low_memory: bool, + + /// Optional, the bisect result for a query that is the common prefix of queries. + /// If passed, search for query ranges will be restricted to the range defined by prefix_result. + pub prefix_result: Option, +} + +// -------------------------------------------------- +/// A struct representing the index range of occurrences of a suffix +/// +#[derive(Debug, Clone, PartialEq)] +pub struct BisectResult { + /// The ordinal position of the original query + pub query_num: usize, + + /// The query string + pub query: String, + + /// The first index of a suffix matching the query + pub first_position: usize, + + /// The last index of a suffix matching the query + pub last_position: usize, +} + // -------------------------------------------------- /// Options for counting the occurrences of suffixes #[derive(Debug, Clone)] From bf01dd2ffb1a0ca455ccf9edd0a10318fb864bd7 Mon Sep 17 00:00:00 2001 From: George Glidden-Handgis <47768122+georgeglidden@users.noreply.github.com> Date: Tue, 29 Apr 2025 13:04:30 -0700 Subject: [PATCH 2/6] fix an off-by-one error when calling suffix_search_last; the parameter n is the upper end of an open interval, not a closed one, so passing `n := high` would prevent `high` from being searched. avoided by passing `n := high + 1`.
--- libsufr/src/sufr_search.rs | 6 ++++-- libsufr/src/types.rs | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libsufr/src/sufr_search.rs b/libsufr/src/sufr_search.rs index 4235ff7..31a9304 100644 --- a/libsufr/src/sufr_search.rs +++ b/libsufr/src/sufr_search.rs @@ -179,11 +179,12 @@ where if let Some(start) = self.suffix_search_first(qry, low, high, 0, 0) { // something was found let end = self - .suffix_search_last(qry, start, high, high, 0, 0) + .suffix_search_last(qry, start, high, high + 1, 0, 0) .unwrap_or(start); Ok(BisectResult { query_num: query_num, query: query.to_string(), + count: end - start + 1, first_position: start, last_position: end, }) @@ -192,7 +193,8 @@ where Ok(BisectResult { query_num: query_num, query: query.to_string(), - first_position: self.len_suffixes, + count: 0, + first_position: 0, last_position: 0, }) } diff --git a/libsufr/src/types.rs b/libsufr/src/types.rs index b88ff6c..fe14cbb 100644 --- a/libsufr/src/types.rs +++ b/libsufr/src/types.rs @@ -365,7 +365,6 @@ pub struct BisectOptions { // -------------------------------------------------- /// A struct representing the index range of occurrences of a suffix -/// #[derive(Debug, Clone, PartialEq)] pub struct BisectResult { /// The ordinal position of the original query @@ -374,6 +373,9 @@ pub struct BisectResult { /// The query string pub query: String, + /// The width of the interval + pub count: usize, + /// The first index of a suffix matching the query pub first_position: usize, From 31b9abcaff8671741d4ea97fc05f1435552698db Mon Sep 17 00:00:00 2001 From: George Glidden-Handgis <47768122+georgeglidden@users.noreply.github.com> Date: Tue, 29 Apr 2025 13:29:27 -0700 Subject: [PATCH 3/6] better docstrings, passing tests. 
--- libsufr/src/suffix_array.rs | 75 ++++++++++++++++++++++++++++++++++--- libsufr/src/sufr_file.rs | 69 ++++++++++++++++++++++++++++++++-- libsufr/src/sufr_search.rs | 11 ++++++ 3 files changed, 146 insertions(+), 9 deletions(-) diff --git a/libsufr/src/suffix_array.rs b/libsufr/src/suffix_array.rs index bb06bfe..1e07779 100644 --- a/libsufr/src/suffix_array.rs +++ b/libsufr/src/suffix_array.rs @@ -151,20 +151,83 @@ impl SuffixArray { /// Bisect the index range of occurences of queries. /// If the index range of a prefix is already known, /// or if it is desirable to avoid enumerating every match, - // this method can be used as a faster stand-in for `count` + /// this method can be used as a faster stand-in for `count` /// ``` /// use anyhow::Result; - /// use libsufr::{types::{BisectOptions, BisectResult}, sufr_file::SufrFile}; + /// use libsufr::{types::{BisectOptions, BisectResult}, suffix_array::SuffixArray}; /// /// fn main() -> Result<()> { - /// let mut sufr = SufrFile::::read("../data/inputs/1.sufr", false)?; + /// let mut suffix_array = SuffixArray::read("../data/inputs/1.sufr", false)?; /// let opts = BisectOptions { - /// queries: vec!["AC".to_string(), "AG".to_string(), "GT".to_string()], + /// queries: vec!["A".to_string(), "AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], /// max_query_len: None, - /// low_memory: true, + /// low_memory: false, /// prefix_result: None, /// }; - /// let res = sufr.bisect(opts)?; + /// let res = suffix_array.bisect(opts)?; + /// let expected = vec![ + /// BisectResult { + /// query_num: 0, + /// query: "A".to_string(), + /// count: 2, + /// first_position: 1, + /// last_position: 2 + /// }, + /// BisectResult { + /// query_num: 1, + /// query: "AC".to_string(), + /// count: 2, + /// first_position: 1, + /// last_position: 2 + /// }, + /// BisectResult { + /// query_num: 2, + /// query: "ACA".to_string(), + /// count: 0, + /// first_position: 0, + /// last_position: 0 + /// 
}, + /// BisectResult { + /// query_num: 3, + /// query: "ACG".to_string(), + /// count: 2, + /// first_position: 1, + /// last_position: 2 + /// }, + /// BisectResult { + /// query_num: 4, + /// query: "ACT".to_string(), + /// count: 0, + /// first_position: 0, + /// last_position: 0 + /// }, + /// BisectResult { + /// query_num: 5, + /// query: "ACC".to_string(), + /// count: 0, + /// first_position: 0, + /// last_position: 0 + /// } + /// ]; + /// assert_eq!(res, expected); + /// + /// let mut suffix_array = SuffixArray::read("../data/inputs/3.sufr", false)?; + /// let opts1 = BisectOptions { + /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: None, + /// }; + /// let res1 = suffix_array.bisect(opts1)?; + /// let opts2 = BisectOptions { + /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: Some(res1[0].clone()), + /// }; + /// let res2 = suffix_array.bisect(opts2)?; + /// assert_eq!(res1, res2); + /// /// Ok(()) /// } /// ``` diff --git a/libsufr/src/sufr_file.rs b/libsufr/src/sufr_file.rs index d4aba1e..6bf7c91 100644 --- a/libsufr/src/sufr_file.rs +++ b/libsufr/src/sufr_file.rs @@ -671,7 +671,7 @@ where /// Bisect the index range of occurences of queries. 
/// If the index range of a prefix is already known, /// or if it is desirable to avoid enumerating every match, - // this method can be used as a faster stand-in for `count` + /// this method can be used as a faster stand-in for `count` /// ``` /// use anyhow::Result; /// use libsufr::{types::{BisectOptions, BisectResult}, sufr_file::SufrFile}; @@ -679,12 +679,75 @@ where /// fn main() -> Result<()> { /// let mut sufr = SufrFile::::read("../data/inputs/1.sufr", false)?; /// let opts = BisectOptions { - /// queries: vec!["AC".to_string(), "AG".to_string(), "GT".to_string()], + /// queries: vec!["A".to_string(), "AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], /// max_query_len: None, - /// low_memory: true, + /// low_memory: false, /// prefix_result: None, /// }; /// let res = sufr.bisect(opts)?; + /// let expected = vec![ + /// BisectResult { + /// query_num: 0, + /// query: "A".to_string(), + /// count: 2, + /// first_position: 1, + /// last_position: 2 + /// }, + /// BisectResult { + /// query_num: 1, + /// query: "AC".to_string(), + /// count: 2, + /// first_position: 1, + /// last_position: 2 + /// }, + /// BisectResult { + /// query_num: 2, + /// query: "ACA".to_string(), + /// count: 0, + /// first_position: 0, + /// last_position: 0 + /// }, + /// BisectResult { + /// query_num: 3, + /// query: "ACG".to_string(), + /// count: 2, + /// first_position: 1, + /// last_position: 2 + /// }, + /// BisectResult { + /// query_num: 4, + /// query: "ACT".to_string(), + /// count: 0, + /// first_position: 0, + /// last_position: 0 + /// }, + /// BisectResult { + /// query_num: 5, + /// query: "ACC".to_string(), + /// count: 0, + /// first_position: 0, + /// last_position: 0 + /// } + /// ]; + /// assert_eq!(res, expected); + /// + /// let mut sufr = SufrFile::::read("../data/inputs/3.sufr", false)?; + /// let opts1 = BisectOptions { + /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), 
"ACT".to_string(), "ACC".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: None, + /// }; + /// let res1 = sufr.bisect(opts1)?; + /// let opts2 = BisectOptions { + /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// max_query_len: None, + /// low_memory: false, + /// prefix_result: Some(res1[0].clone()), + /// }; + /// let res2 = sufr.bisect(opts2)?; + /// assert_eq!(res1, res2); + /// /// Ok(()) /// } /// ``` diff --git a/libsufr/src/sufr_search.rs b/libsufr/src/sufr_search.rs index 31a9304..2e2a7d3 100644 --- a/libsufr/src/sufr_search.rs +++ b/libsufr/src/sufr_search.rs @@ -168,6 +168,17 @@ where } } + + // -------------------------------------------------- + /// Find the first and last positions of a query string in a suffix array, + /// given a range of viable positions. + /// Returns a `BisectResult` + /// + /// Args: + /// * `query_num`: ordinal number of the query + /// * `query`: a string to search for + /// * `low`: the lowest position at which the query may occur + /// * `high`: the highest position at which the query may occur pub fn bisect( &mut self, query_num: usize, From 256bbf7d4b4139dfd91f5821171a7423368950a4 Mon Sep 17 00:00:00 2001 From: George Glidden-Handgis <47768122+georgeglidden@users.noreply.github.com> Date: Wed, 30 Apr 2025 09:53:22 -0700 Subject: [PATCH 4/6] implement multithreading, logging for bisect. --- libsufr/src/sufr_file.rs | 85 +++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 32 deletions(-) diff --git a/libsufr/src/sufr_file.rs b/libsufr/src/sufr_file.rs index 6bf7c91..7dcd562 100644 --- a/libsufr/src/sufr_file.rs +++ b/libsufr/src/sufr_file.rs @@ -752,52 +752,73 @@ where /// } /// ``` pub fn bisect(&mut self, args: BisectOptions) -> Result> { - // 1. retrieve the prefix result's index range. 
+ // set memory mode + self.query_low_memory = args.low_memory; + + if !self.query_low_memory { + self.set_suffix_array_mem(args.max_query_len)?; + } + + // construct SufrSearch factory + let now = Instant::now(); + let new_search = || -> Result>> { + let suffix_array_file: FileAccess = FileAccess::new( + &self.filename, + self.suffix_array_pos as u64, + self.len_suffixes.to_usize(), + )?; + let text_file: FileAccess = FileAccess::new( + &self.filename, + self.text_pos as u64, + self.text_len.to_usize(), + )?; + let search_args = SufrSearchArgs { + text: &self.text, + text_len: self.text_len.to_usize(), + text_file, + file: suffix_array_file, + suffix_array: &self.suffix_array_mem, + rank: &self.suffix_array_rank_mem, + len_suffixes: self.len_suffixes.to_usize(), + sort_type: &self.sort_type, + max_query_len: args.max_query_len, + }; + Ok(RefCell::new(SufrSearch::new(search_args))) + }; + + // retrieve the prefix result's index range. // if no result was passed, deafult to the full range of the suffix array. let n = self.len_suffixes.to_usize() - 1; let search_range = match args.prefix_result { Some(result) => (result.first_position, result.last_position), _ => (0, n), }; - - // 2. create a SufrSearch struct - let suffix_array_file: FileAccess = FileAccess::new( - &self.filename, - self.suffix_array_pos as u64, - self.len_suffixes.to_usize(), - )?; - let text_file: FileAccess = FileAccess::new( - &self.filename, - self.text_pos as u64, - self.text_len.to_usize(), - )?; - let search_args = SufrSearchArgs { - text: &self.text, - text_len: self.text_len.to_usize(), - text_file, - file: suffix_array_file, - suffix_array: &self.suffix_array_mem, - rank: &self.suffix_array_rank_mem, - len_suffixes: self.len_suffixes.to_usize(), - sort_type: &self.sort_type, - max_query_len: args.max_query_len, - }; - let mut search = SufrSearch::new(search_args); - // 3. 
bisect each query - let bisects = args + // bisect each query + let thread_local_search: ThreadLocal>> = + ThreadLocal::new(); + + + let mut res: Vec<_> = args .queries .clone() - .iter() + .into_par_iter() .enumerate() - .map(|(query_num, query)| -> BisectResult { - search.bisect(query_num, &query, search_range.0, search_range.1).unwrap() + .flat_map(|(query_num, query)| -> Result { + let mut search = + thread_local_search.get_or_try(new_search)?.borrow_mut(); + search.bisect(query_num, &query, search_range.0, search_range.1) }) .collect(); + res.sort_by_key(|r| r.query_num); - // TODO: multithreading. will need to rework 2 and 3 to resemble suffix_search. + info!( + "Bisection of {} queries finished in {:?}", + args.queries.len(), + now.elapsed() + ); - Ok(bisects) + Ok(res) } // -------------------------------------------------- From e7dc2cb4df85beefac728d7bd2a6868c16bcdc18 Mon Sep 17 00:00:00 2001 From: George Glidden-Handgis <47768122+georgeglidden@users.noreply.github.com> Date: Fri, 2 May 2025 13:59:11 -0700 Subject: [PATCH 5/6] clean up doc tests, add unit test. 
--- libsufr/src/suffix_array.rs | 83 +++++++---------------- libsufr/src/sufr_file.rs | 131 +++++++++++++++++------------------- 2 files changed, 85 insertions(+), 129 deletions(-) diff --git a/libsufr/src/suffix_array.rs b/libsufr/src/suffix_array.rs index 1e07779..4662350 100644 --- a/libsufr/src/suffix_array.rs +++ b/libsufr/src/suffix_array.rs @@ -155,79 +155,42 @@ impl SuffixArray { /// ``` /// use anyhow::Result; /// use libsufr::{types::{BisectOptions, BisectResult}, suffix_array::SuffixArray}; - /// + /// /// fn main() -> Result<()> { /// let mut suffix_array = SuffixArray::read("../data/inputs/1.sufr", false)?; - /// let opts = BisectOptions { - /// queries: vec!["A".to_string(), "AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// let opts_without_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], /// max_query_len: None, /// low_memory: false, /// prefix_result: None, /// }; - /// let res = suffix_array.bisect(opts)?; - /// let expected = vec![ - /// BisectResult { - /// query_num: 0, - /// query: "A".to_string(), - /// count: 2, - /// first_position: 1, - /// last_position: 2 - /// }, - /// BisectResult { - /// query_num: 1, - /// query: "AC".to_string(), - /// count: 2, - /// first_position: 1, - /// last_position: 2 - /// }, - /// BisectResult { - /// query_num: 2, - /// query: "ACA".to_string(), - /// count: 0, - /// first_position: 0, - /// last_position: 0 - /// }, - /// BisectResult { - /// query_num: 3, - /// query: "ACG".to_string(), - /// count: 2, - /// first_position: 1, - /// last_position: 2 - /// }, - /// BisectResult { - /// query_num: 4, - /// query: "ACT".to_string(), - /// count: 0, - /// first_position: 0, - /// last_position: 0 - /// }, - /// BisectResult { - /// query_num: 5, - /// query: "ACC".to_string(), - /// count: 0, - /// first_position: 0, - /// last_position: 0 - /// } - /// ]; - /// assert_eq!(res, expected); - /// - /// let mut suffix_array = 
SuffixArray::read("../data/inputs/3.sufr", false)?; - /// let opts1 = BisectOptions { - /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// let result_without_prefix = suffix_array.bisect(opts_without_prefix)?; + /// assert_eq!( + /// result_without_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 2, first_position: 3, last_position: 4 }] + /// ); + /// let prefix_opts = BisectOptions { + /// queries: vec!["A".to_string()], /// max_query_len: None, /// low_memory: false, /// prefix_result: None, /// }; - /// let res1 = suffix_array.bisect(opts1)?; - /// let opts2 = BisectOptions { - /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// let prefix_result = suffix_array.bisect(prefix_opts)?[0].clone(); + /// let opts_with_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], /// max_query_len: None, /// low_memory: false, - /// prefix_result: Some(res1[0].clone()), + /// prefix_result: Some(prefix_result), /// }; - /// let res2 = suffix_array.bisect(opts2)?; - /// assert_eq!(res1, res2); - /// + /// let result_with_prefix = suffix_array.bisect(opts_with_prefix)?; + /// assert_eq!( + /// result_with_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 0, first_position: 0, last_position: 0 }] + /// ); /// Ok(()) /// } /// ``` diff --git a/libsufr/src/sufr_file.rs b/libsufr/src/sufr_file.rs index 7dcd562..017afe1 100644 --- a/libsufr/src/sufr_file.rs +++ b/libsufr/src/sufr_file.rs @@ -675,91 +675,57 @@ where /// ``` /// use anyhow::Result; /// use libsufr::{types::{BisectOptions, BisectResult}, sufr_file::SufrFile}; - /// + /// 
/// fn main() -> Result<()> { /// let mut sufr = SufrFile::::read("../data/inputs/1.sufr", false)?; - /// let opts = BisectOptions { - /// queries: vec!["A".to_string(), "AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// // bisect without a prefix result, searching the whole suffix array: + /// let opts_without_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], /// max_query_len: None, /// low_memory: false, /// prefix_result: None, /// }; - /// let res = sufr.bisect(opts)?; - /// let expected = vec![ - /// BisectResult { - /// query_num: 0, - /// query: "A".to_string(), - /// count: 2, - /// first_position: 1, - /// last_position: 2 - /// }, - /// BisectResult { - /// query_num: 1, - /// query: "AC".to_string(), - /// count: 2, - /// first_position: 1, - /// last_position: 2 - /// }, - /// BisectResult { - /// query_num: 2, - /// query: "ACA".to_string(), - /// count: 0, - /// first_position: 0, - /// last_position: 0 - /// }, - /// BisectResult { - /// query_num: 3, - /// query: "ACG".to_string(), - /// count: 2, - /// first_position: 1, - /// last_position: 2 - /// }, - /// BisectResult { - /// query_num: 4, - /// query: "ACT".to_string(), - /// count: 0, - /// first_position: 0, - /// last_position: 0 - /// }, - /// BisectResult { - /// query_num: 5, - /// query: "ACC".to_string(), - /// count: 0, - /// first_position: 0, - /// last_position: 0 - /// } - /// ]; - /// assert_eq!(res, expected); - /// - /// let mut sufr = SufrFile::::read("../data/inputs/3.sufr", false)?; - /// let opts1 = BisectOptions { - /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// let result_without_prefix = sufr.bisect(opts_without_prefix)?; + /// // ... 
both queries appear in the suffix array + /// assert_eq!( + /// result_without_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 2, first_position: 3, last_position: 4 }] + /// ); + /// // bisect within the range of a prefix result: + /// let prefix_opts = BisectOptions { + /// queries: vec!["A".to_string()], /// max_query_len: None, /// low_memory: false, /// prefix_result: None, /// }; - /// let res1 = sufr.bisect(opts1)?; - /// let opts2 = BisectOptions { - /// queries: vec!["AC".to_string(), "ACA".to_string(), "ACG".to_string(), "ACT".to_string(), "ACC".to_string()], + /// let prefix_result = sufr.bisect(prefix_opts)?[0].clone(); + /// let opts_with_prefix = BisectOptions { + /// queries: vec!["AC".to_string(), "CG".to_string()], /// max_query_len: None, /// low_memory: false, - /// prefix_result: Some(res1[0].clone()), + /// prefix_result: Some(prefix_result), /// }; - /// let res2 = sufr.bisect(opts2)?; - /// assert_eq!(res1, res2); - /// + /// let result_with_prefix = sufr.bisect(opts_with_prefix)?; + /// // ... the query AC is found within the range of the prefix result for A, but CG is not. 
+ /// assert_eq!( + /// result_with_prefix, + /// vec![ + /// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 }, + /// BisectResult { query_num: 1, query: "CG".to_string(), count: 0, first_position: 0, last_position: 0 }] + /// ); /// Ok(()) /// } /// ``` pub fn bisect(&mut self, args: BisectOptions) -> Result> { - // set memory mode + // Set memory mode self.query_low_memory = args.low_memory; - if !self.query_low_memory { self.set_suffix_array_mem(args.max_query_len)?; } - // construct SufrSearch factory + // Construct SufrSearch factory let now = Instant::now(); let new_search = || -> Result>> { let suffix_array_file: FileAccess = FileAccess::new( @@ -786,19 +752,17 @@ where Ok(RefCell::new(SufrSearch::new(search_args))) }; - // retrieve the prefix result's index range. - // if no result was passed, deafult to the full range of the suffix array. + // Retrieve the prefix result's index range. + // If no result was passed, default to the full range of the suffix array.
let n = self.len_suffixes.to_usize() - 1; let search_range = match args.prefix_result { Some(result) => (result.first_position, result.last_position), _ => (0, n), }; - // bisect each query + // Bisect each query in its own thread let thread_local_search: ThreadLocal>> = ThreadLocal::new(); - - let mut res: Vec<_> = args .queries .clone() @@ -1330,7 +1294,7 @@ mod test { sufr_file::SufrFile, types::{ ExtractOptions, ExtractResult, ExtractSequence, LocateOptions, - LocatePosition, LocateResult, + LocatePosition, LocateResult, BisectOptions, BisectResult, }, }; use anyhow::Result; @@ -1712,6 +1676,35 @@ mod test { Ok(()) } + // -------------------------------------------------- + #[test] + fn test_bisect() -> Result<()> { + let mut sufr = SufrFile::::read("data/inputs/3.sufr", false)?; + // bisect "A" + let prefix = vec!["A".to_string()]; + let prefix_result = sufr.bisect(BisectOptions { + queries: prefix, + max_query_len: None, + low_memory: false, + prefix_result: None + })?[0].clone(); + // bisect "AA", "AC", "AG", "AT", "AN" within the range of "A". + let queries = vec!["AA".to_string(), "AC".to_string(), "AG".to_string(), "AT".to_string(), "AN".to_string()]; + let queries_result = sufr.bisect(BisectOptions { + queries: queries, + max_query_len: None, + low_memory: false, + prefix_result: Some(prefix_result.clone()), + })?; + // because we queried all of the possible suffixes to "A", + // the count of "A" should be the sum of counts of queries. 
+ assert_eq!( + prefix_result.count, + queries_result.iter().map(|res| res.count).sum(), + ); + Ok(()) + } + // -------------------------------------------------- // The "compare" function is now deeply nested inside the SuffixSearch // which is created inside the "suffix_search" function and I'm lost From b145101b0e026e7aac102d7191c97fa85ad3219a Mon Sep 17 00:00:00 2001 From: George Glidden-Handgis <47768122+georgeglidden@users.noreply.github.com> Date: Fri, 2 May 2025 14:00:43 -0700 Subject: [PATCH 6/6] fix path to input data --- libsufr/src/sufr_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libsufr/src/sufr_file.rs b/libsufr/src/sufr_file.rs index 017afe1..e89aeb1 100644 --- a/libsufr/src/sufr_file.rs +++ b/libsufr/src/sufr_file.rs @@ -1679,7 +1679,7 @@ mod test { // -------------------------------------------------- #[test] fn test_bisect() -> Result<()> { - let mut sufr = SufrFile::::read("data/inputs/3.sufr", false)?; + let mut sufr = SufrFile::::read("../data/inputs/3.sufr", false)?; // bisect "A" let prefix = vec!["A".to_string()]; let prefix_result = sufr.bisect(BisectOptions {