Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion libsufr/src/suffix_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::{
sufr_builder::SufrBuilder,
sufr_file::SufrFile,
types::{
CountOptions, CountResult, ExtractOptions, ExtractResult, ListOptions,
BisectOptions, BisectResult, CountOptions, CountResult, ExtractOptions, ExtractResult, ListOptions,
LocateOptions, LocateResult, SufrBuilderArgs, SufrMetadata,
},
};
Expand All @@ -17,6 +17,7 @@ pub(crate) trait SuffixArrayTrait: Send + Sync {
fn locate(&mut self, args: LocateOptions) -> Result<Vec<LocateResult>>;
fn metadata(&self) -> Result<SufrMetadata>;
fn string_at(&mut self, pos: usize, len: Option<usize>) -> Result<String>;
fn bisect(&mut self, args: BisectOptions) -> Result<Vec<BisectResult>>;
}

// --------------------------------------------------
Expand Down Expand Up @@ -48,6 +49,10 @@ impl SuffixArrayTrait for SuffixArray32 {
fn string_at(&mut self, pos: usize, len: Option<usize>) -> Result<String> {
self.inner.string_at(pos, len)
}

// Forwards `args` unchanged to the `bisect` method of the wrapped `inner`
// value and returns its result as-is.
fn bisect(&mut self, args: BisectOptions) -> Result<Vec<BisectResult>> {
self.inner.bisect(args)
}
}

pub(crate) struct SuffixArray64 {
Expand Down Expand Up @@ -79,6 +84,10 @@ impl SuffixArrayTrait for SuffixArray64 {
fn string_at(&mut self, pos: usize, len: Option<usize>) -> Result<String> {
self.inner.string_at(pos, len)
}

// Forwards `args` unchanged to the `bisect` method of the wrapped `inner`
// value and returns its result as-is.
fn bisect(&mut self, args: BisectOptions) -> Result<Vec<BisectResult>> {
self.inner.bisect(args)
}
}

// --------------------------------------------------
Expand Down Expand Up @@ -137,6 +146,58 @@ impl SuffixArray {
Self::read(&path, low_memory)
}


// --------------------------------------------------
/// Bisect the index range of occurrences of queries.
///
/// If the index range of a prefix is already known,
/// or if it is desirable to avoid enumerating every match,
/// this method can be used as a faster stand-in for `count`.
///
/// This is a thin wrapper: `args` is handed unchanged to the `bisect`
/// method of the wrapped `inner` suffix array and its result is returned
/// as-is, including any error.
///
/// # Examples
///
/// ```
/// use anyhow::Result;
/// use libsufr::{types::{BisectOptions, BisectResult}, suffix_array::SuffixArray};
///
/// fn main() -> Result<()> {
///     let mut suffix_array = SuffixArray::read("../data/inputs/1.sufr", false)?;
///     let opts_without_prefix = BisectOptions {
///         queries: vec!["AC".to_string(), "CG".to_string()],
///         max_query_len: None,
///         low_memory: false,
///         prefix_result: None,
///     };
///     let result_without_prefix = suffix_array.bisect(opts_without_prefix)?;
///     assert_eq!(
///         result_without_prefix,
///         vec![
///             BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 },
///             BisectResult { query_num: 1, query: "CG".to_string(), count: 2, first_position: 3, last_position: 4 }]
///     );
///     let prefix_opts = BisectOptions {
///         queries: vec!["A".to_string()],
///         max_query_len: None,
///         low_memory: false,
///         prefix_result: None,
///     };
///     let prefix_result = suffix_array.bisect(prefix_opts)?[0].clone();
///     let opts_with_prefix = BisectOptions {
///         queries: vec!["AC".to_string(), "CG".to_string()],
///         max_query_len: None,
///         low_memory: false,
///         prefix_result: Some(prefix_result),
///     };
///     let result_with_prefix = suffix_array.bisect(opts_with_prefix)?;
///     assert_eq!(
///         result_with_prefix,
///         vec![
///             BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 },
///             BisectResult { query_num: 1, query: "CG".to_string(), count: 0, first_position: 0, last_position: 0 }]
///     );
///     Ok(())
/// }
/// ```
pub fn bisect(&mut self, args: BisectOptions) -> Result<Vec<BisectResult>> {
self.inner.bisect(args)
}

// --------------------------------------------------
/// Count instances of queries
///
Expand Down
151 changes: 149 additions & 2 deletions libsufr/src/sufr_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::{
file_access::FileAccess,
sufr_search::{SufrSearch, SufrSearchArgs},
types::{
CountOptions, CountResult, ExtractOptions, ExtractResult, ExtractSequence,
BisectOptions, BisectResult, CountOptions, CountResult, ExtractOptions, ExtractResult, ExtractSequence,
FromUsize, Int, ListOptions, LocateOptions, LocatePosition, LocateResult,
SearchOptions, SearchResult, SeedMask, SuffixSortType, SufrMetadata,
},
Expand Down Expand Up @@ -667,6 +667,124 @@ where
Ok(())
}

// --------------------------------------------------
/// Bisect the index range of occurences of queries.
/// If the index range of a prefix is already known,
/// or if it is desirable to avoid enumerating every match,
/// this method can be used as a faster stand-in for `count`
/// ```
/// use anyhow::Result;
/// use libsufr::{types::{BisectOptions, BisectResult}, sufr_file::SufrFile};
///
/// fn main() -> Result<()> {
/// let mut sufr = SufrFile::<u32>::read("../data/inputs/1.sufr", false)?;
/// // bisect without a prefix result, searching the whole suffix array:
/// let opts_without_prefix = BisectOptions {
/// queries: vec!["AC".to_string(), "CG".to_string()],
/// max_query_len: None,
/// low_memory: false,
/// prefix_result: None,
/// };
/// let result_without_prefix = sufr.bisect(opts_without_prefix)?;
/// // ... both queries appear in the suffix array
/// assert_eq!(
/// result_without_prefix,
/// vec![
/// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 },
/// BisectResult { query_num: 1, query: "CG".to_string(), count: 2, first_position: 3, last_position: 4 }]
/// );
/// // bisect within the range of a prefix result:
/// let prefix_opts = BisectOptions {
/// queries: vec!["A".to_string()],
/// max_query_len: None,
/// low_memory: false,
/// prefix_result: None,
/// };
/// let prefix_result = sufr.bisect(prefix_opts)?[0].clone();
/// let opts_with_prefix = BisectOptions {
/// queries: vec!["AC".to_string(), "CG".to_string()],
/// max_query_len: None,
/// low_memory: false,
/// prefix_result: Some(prefix_result),
/// };
/// let result_with_prefix = sufr.bisect(opts_with_prefix)?;
/// // ... the query AC is found within the range of the prefix result for A, but CG is not.
/// assert_eq!(
/// result_with_prefix,
/// vec![
/// BisectResult { query_num: 0, query: "AC".to_string(), count: 2, first_position: 1, last_position: 2 },
/// BisectResult { query_num: 1, query: "CG".to_string(), count: 0, first_position: 0, last_position: 0 }]
/// );
/// Ok(())
/// }
/// ```
pub fn bisect(&mut self, args: BisectOptions) -> Result<Vec<BisectResult>> {
// Set memory mode
self.query_low_memory = args.low_memory;
if !self.query_low_memory {
self.set_suffix_array_mem(args.max_query_len)?;
}

// Construct SufrSearch factory
let now = Instant::now();
let new_search = || -> Result<RefCell<SufrSearch<T>>> {
let suffix_array_file: FileAccess<T> = FileAccess::new(
&self.filename,
self.suffix_array_pos as u64,
self.len_suffixes.to_usize(),
)?;
let text_file: FileAccess<u8> = FileAccess::new(
&self.filename,
self.text_pos as u64,
self.text_len.to_usize(),
)?;
let search_args = SufrSearchArgs {
text: &self.text,
text_len: self.text_len.to_usize(),
text_file,
file: suffix_array_file,
suffix_array: &self.suffix_array_mem,
rank: &self.suffix_array_rank_mem,
len_suffixes: self.len_suffixes.to_usize(),
sort_type: &self.sort_type,
max_query_len: args.max_query_len,
};
Ok(RefCell::new(SufrSearch::new(search_args)))
};

// Retrieve the prefix result's index range.
// If no result was passed, deafult to the full range of the suffix array.
let n = self.len_suffixes.to_usize() - 1;
let search_range = match args.prefix_result {
Some(result) => (result.first_position, result.last_position),
_ => (0, n),
};

// Bisect each query in its own thread
let thread_local_search: ThreadLocal<RefCell<SufrSearch<T>>> =
ThreadLocal::new();
let mut res: Vec<_> = args
.queries
.clone()
.into_par_iter()
.enumerate()
.flat_map(|(query_num, query)| -> Result<BisectResult> {
let mut search =
thread_local_search.get_or_try(new_search)?.borrow_mut();
search.bisect(query_num, &query, search_range.0, search_range.1)
})
.collect();
res.sort_by_key(|r| r.query_num);

info!(
"Bisection of {} queries finished in {:?}",
args.queries.len(),
now.elapsed()
);

Ok(res)
}

// --------------------------------------------------
/// Count the occurrences of queries in a suffix array
///
Expand Down Expand Up @@ -1176,7 +1294,7 @@ mod test {
sufr_file::SufrFile,
types::{
ExtractOptions, ExtractResult, ExtractSequence, LocateOptions,
LocatePosition, LocateResult,
LocatePosition, LocateResult, BisectOptions, BisectResult,
},
};
use anyhow::Result;
Expand Down Expand Up @@ -1558,6 +1676,35 @@ mod test {
Ok(())
}

// --------------------------------------------------
#[test]
fn test_bisect() -> Result<()> {
    let mut sufr = SufrFile::<u32>::read("../data/inputs/3.sufr", false)?;

    // First find the index range of every suffix that starts with "A".
    let a_range = sufr
        .bisect(BisectOptions {
            queries: vec!["A".to_string()],
            max_query_len: None,
            low_memory: false,
            prefix_result: None,
        })?
        .remove(0);

    // Then bisect each two-letter extension of "A" inside that range only.
    let extensions: Vec<String> = ["AA", "AC", "AG", "AT", "AN"]
        .iter()
        .map(|ext| ext.to_string())
        .collect();
    let extension_results = sufr.bisect(BisectOptions {
        queries: extensions,
        max_query_len: None,
        low_memory: false,
        prefix_result: Some(a_range.clone()),
    })?;

    // The extensions are expected to cover every suffix in the "A" range,
    // so their counts must add up to the count for "A" itself.
    let extension_total: usize =
        extension_results.iter().map(|res| res.count).sum();
    assert_eq!(a_range.count, extension_total);
    Ok(())
}

// --------------------------------------------------
// The "compare" function is now deeply nested inside the SuffixSearch
// which is created inside the "suffix_search" function and I'm lost
Expand Down
45 changes: 44 additions & 1 deletion libsufr/src/sufr_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use crate::{
file_access::FileAccess,
types::{
Comparison, FromUsize, Int, SearchResult, SearchResultLocations, SuffixSortType,
Comparison, FromUsize, Int, BisectResult, SearchResult, SearchResultLocations, SuffixSortType,
},
util::find_lcp_full_offset,
};
Expand Down Expand Up @@ -168,6 +168,49 @@ where
}
}


// --------------------------------------------------
/// Locate the first and last suffix array positions matching a query,
/// looking only inside a caller-supplied window of positions.
///
/// The result's `count` is `last_position - first_position + 1` when the
/// query is found. When it is not found, `count`, `first_position` and
/// `last_position` are all zero.
///
/// Args:
/// * `query_num`: ordinal number of the query
/// * `query`: a string to search for
/// * `low`: the lowest position at which the query may occur
/// * `high`: the highest position at which the query may occur
pub fn bisect(
    &mut self,
    query_num: usize,
    query: &str,
    low: usize,
    high: usize,
) -> Result<BisectResult> {
    let pattern = query.as_bytes();

    // Resolve the (count, first, last) triple once, then build one result.
    let (count, first_position, last_position) =
        match self.suffix_search_first(pattern, low, high, 0, 0) {
            Some(first) => {
                // Fall back to a single-element range at `first` when the
                // search for the last match yields nothing.
                let last = self
                    .suffix_search_last(pattern, first, high, high + 1, 0, 0)
                    .unwrap_or(first);
                (last - first + 1, first, last)
            }
            None => (0, 0, 0),
        };

    Ok(BisectResult {
        query_num,
        query: query.to_string(),
        count,
        first_position,
        last_position,
    })
}

// --------------------------------------------------
fn suffix_search_first(
&mut self,
Expand Down
39 changes: 39 additions & 0 deletions libsufr/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,45 @@ impl FromUsize<u64> for u64 {
}
}

// --------------------------------------------------
/// Options for bisecting the index ranges of occurrences of query suffixes
#[derive(Debug, Clone)]
pub struct BisectOptions {
/// Vector of query strings
pub queries: Vec<String>,

/// Maximum query length for search
pub max_query_len: Option<usize>,

/// When `false`, `SufrFile::bisect` calls `set_suffix_array_mem` before
/// searching (presumably loading the suffix array into memory — confirm
/// against that method). When `true`, that step is skipped.
pub low_memory: bool,

/// Optional, the bisect result for a query that is the common prefix of queries.
/// When `Some`, each query is searched only between the `first_position` and
/// `last_position` of this result. When `None`, the whole suffix array is searched.
pub prefix_result: Option<BisectResult>,
}

// --------------------------------------------------
/// The index range of suffix array entries that match a query
///
/// `first_position` and `last_position` are both inclusive. When the query
/// is not found, `count`, `first_position` and `last_position` are all zero,
/// so check `count` to tell "not found" apart from a single match at index 0.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BisectResult {
    /// The ordinal position of the original query
    pub query_num: usize,

    /// The query string
    pub query: String,

    /// The number of matching suffixes, i.e. the width of the interval
    /// (`last_position - first_position + 1`), or zero when nothing matched
    pub count: usize,

    /// The first index of a suffix matching the query
    pub first_position: usize,

    /// The last index of a suffix matching the query
    pub last_position: usize,
}

// --------------------------------------------------
/// Options for counting the occurrences of suffixes
#[derive(Debug, Clone)]
Expand Down