diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 714cee3ec..4ef8ac658 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -54,7 +54,7 @@ rayon = "1.10"
 rayon-cond = "0.4"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-unicode-normalization-alignments = "0.1"
+unicode-normalization = "^0.1.25"
 unicode_categories = "0.1"
 unicode-segmentation = "1.11"
 indicatif = {version = "0.18", optional = true}
@@ -74,6 +74,7 @@ monostate = "0.1.12"
 ahash = { version = "0.8.11", features = ["serde"] }
 dary_heap = { version = "0.3.6", features = ["serde"] }
 compact_str = { version = "0.9", features = ["serde"] }
+smallvec = "1"
 
 [features]
 default = ["progressbar", "onig", "esaxx_fast"]
diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 7841314d0..0db40ee93 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -145,6 +145,8 @@ pub mod pre_tokenizers;
 pub mod processors;
 pub mod tokenizer;
 
+pub(crate) mod unicode_normalization_alignments;
+
 // Re-export from tokenizer
 pub use tokenizer::*;
 
diff --git a/tokenizers/src/normalizers/strip.rs b/tokenizers/src/normalizers/strip.rs
index 19f5ff314..77bf2a8f7 100644
--- a/tokenizers/src/normalizers/strip.rs
+++ b/tokenizers/src/normalizers/strip.rs
@@ -1,7 +1,7 @@
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
+use crate::unicode_normalization_alignments::char::is_combining_mark;
 use crate::utils::macro_rules_attribute;
 use serde::{Deserialize, Serialize};
-use unicode_normalization_alignments::char::is_combining_mark;
 
 #[derive(Copy, Clone, Debug, Deserialize, Serialize)]
 #[serde(tag = "type")]
@@ -61,7 +61,7 @@ mod tests {
     use crate::normalizer::NormalizedString;
     use crate::normalizers::Lowercase;
     use crate::normalizers::NFKD;
-    use unicode_normalization_alignments::UnicodeNormalization;
+    use crate::unicode_normalization_alignments::UnicodeNormalization;
 
     #[test]
     fn test_strip_accents() {
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 0b8c519ea..68d77c132 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -1,7 +1,7 @@
 use crate::pattern::Pattern;
+use crate::unicode_normalization_alignments::UnicodeNormalization;
 use crate::{Offsets, Result};
 use std::ops::{Bound, RangeBounds};
-use unicode_normalization_alignments::UnicodeNormalization;
 
 use serde::{Deserialize, Serialize};
 
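Unlike the plain `unicode-normalization` iterators, the vendored module yields `(char, isize)` pairs: the `isize` is the size delta that `NormalizedString::transform` expects (`0` for a char replacing an existing one, `1` for an added char, negative when chars are merged). A minimal sketch of that convention, written as a hypothetical crate-internal test that is not part of this patch:

```rust
#[cfg(test)]
mod alignment_convention {
    use crate::unicode_normalization_alignments::UnicodeNormalization;

    #[test]
    fn deltas_track_added_and_removed_chars() {
        // NFD splits U+00E9 into 'e' + U+0301; the second char is new (+1).
        assert_eq!(
            "\u{e9}".nfd().collect::<Vec<_>>(),
            vec![('e', 0), ('\u{301}', 1)]
        );
        // NFC merges 'e' + U+0301 into U+00E9; one char disappears (-1).
        assert_eq!(
            "e\u{301}".nfc().collect::<Vec<_>>(),
            vec![('\u{e9}', -1)]
        );
    }
}
```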
diff --git a/tokenizers/src/unicode_normalization_alignments/decompose.rs b/tokenizers/src/unicode_normalization_alignments/decompose.rs
new file mode 100644
index 000000000..a4ee6f0de
--- /dev/null
+++ b/tokenizers/src/unicode_normalization_alignments/decompose.rs
@@ -0,0 +1,161 @@
+use smallvec::SmallVec;
+use std::fmt::{self, Write};
+use std::iter::Fuse;
+use std::ops::Range;
+use unicode_normalization::char::{
+    canonical_combining_class, decompose_canonical, decompose_compatible,
+};
+
+#[derive(Clone)]
+enum DecompositionType {
+    Canonical,
+    Compatible,
+}
+
+/// External iterator for a string decomposition's characters.
+#[derive(Clone)]
+pub struct Decompositions<I> {
+    kind: DecompositionType,
+    iter: Fuse<I>,
+
+    // This buffer stores pairs of (canonical combining class, character),
+    // pushed onto the end in text order.
+    //
+    // It's divided into up to three sections:
+    // 1) A prefix that is free space;
+    // 2) "Ready" characters which are sorted and ready to emit on demand;
+    // 3) A "pending" block which still needs more characters for us to be able
+    //    to sort in canonical order and is not safe to emit.
+    buffer: SmallVec<[(u8, char, isize); 4]>,
+    ready: Range<usize>,
+}
+
+#[inline]
+pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
+    Decompositions {
+        kind: self::DecompositionType::Canonical,
+        iter: iter.fuse(),
+        buffer: SmallVec::new(),
+        ready: 0..0,
+    }
+}
+
+#[inline]
+pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
+    Decompositions {
+        kind: self::DecompositionType::Compatible,
+        iter: iter.fuse(),
+        buffer: SmallVec::new(),
+        ready: 0..0,
+    }
+}
+
+impl<I> Decompositions<I> {
+    #[inline]
+    fn push_back(&mut self, ch: char, first: bool) {
+        let class = canonical_combining_class(ch);
+
+        if class == 0 {
+            self.sort_pending();
+        }
+
+        self.buffer.push((class, ch, if first { 0 } else { 1 }));
+    }
+
+    #[inline]
+    fn sort_pending(&mut self) {
+        // NB: `sort_by_key` is stable, so it will preserve the original text's
+        // order within a combining class.
+        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
+        self.ready.end = self.buffer.len();
+    }
+
+    #[inline]
+    fn reset_buffer(&mut self) {
+        // Equivalent to `self.buffer.drain(0..self.ready.end)` (if SmallVec
+        // supported this API)
+        let pending = self.buffer.len() - self.ready.end;
+        for i in 0..pending {
+            self.buffer[i] = self.buffer[i + self.ready.end];
+        }
+        self.buffer.truncate(pending);
+        self.ready = 0..0;
+    }
+
+    #[inline]
+    fn increment_next_ready(&mut self) {
+        let next = self.ready.start + 1;
+        if next == self.ready.end {
+            self.reset_buffer();
+        } else {
+            self.ready.start = next;
+        }
+    }
+}
+
+impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
+    type Item = (char, isize);
+
+    #[inline]
+    fn next(&mut self) -> Option<(char, isize)> {
+        while self.ready.end == 0 {
+            match (self.iter.next(), &self.kind) {
+                (Some(ch), &DecompositionType::Canonical) => {
+                    let mut first = true;
+                    decompose_canonical(ch, |d| {
+                        self.push_back(d, first);
+                        first = false;
+                    });
+                }
+                (Some(ch), &DecompositionType::Compatible) => {
+                    let mut first = true;
+                    decompose_compatible(ch, |d| {
+                        self.push_back(d, first);
+                        first = false;
+                    });
+                }
+                (None, _) => {
+                    if self.buffer.is_empty() {
+                        return None;
+                    } else {
+                        self.sort_pending();
+
+                        // This implementation means that we can call `next`
+                        // on an exhausted iterator; the last outer `next` call
+                        // will result in an inner `next` call. To make this
+                        // safe, we use `fuse`.
+                        break;
+                    }
+                }
+            }
+        }
+
+        // We can assume here that, if `self.ready.end` is greater than zero,
+        // it's also greater than `self.ready.start`. That's because we only
+        // increment `self.ready.start` inside `increment_next_ready`, and
+        // whenever it reaches equality with `self.ready.end`, we reset both
+        // to zero, maintaining the invariant that:
+        //     self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
+        //
+        // This less-than-obviously-safe implementation is chosen for
+        // performance, minimizing the number & complexity of branches in
+        // `next` in the common case of buffering then unbuffering a single
+        // character with each call.
+        let (_, ch, size) = self.buffer[self.ready.start];
+        self.increment_next_ready();
+        Some((ch, size))
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (lower, _) = self.iter.size_hint();
+        (lower, None)
+    }
+}
+
+impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for c in self.clone() {
+            f.write_char(c.0)?;
+        }
+        Ok(())
+    }
+}
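The ready/pending split in `Decompositions` is what implements the Canonical Ordering Algorithm: combining marks accumulate as "pending" until the next starter (combining class 0) arrives or the input ends, and only then get a stable sort by combining class. A hypothetical test of that reordering (not part of this patch; assumes the crate-internal trait import):

```rust
#[test]
fn pending_marks_are_sorted_by_combining_class() {
    use crate::unicode_normalization_alignments::UnicodeNormalization;

    // U+0301 (ccc 230) arrives before U+0316 (ccc 220), so the pending
    // block must swap them; the starter 'a' (ccc 0) stays first. Nothing
    // is added or removed, so every size delta is 0.
    let out: Vec<(char, isize)> = "a\u{301}\u{316}".nfd().collect();
    assert_eq!(out, vec![('a', 0), ('\u{316}', 0), ('\u{301}', 0)]);
}
```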
diff --git a/tokenizers/src/unicode_normalization_alignments/mod.rs b/tokenizers/src/unicode_normalization_alignments/mod.rs
new file mode 100644
index 000000000..c5e12735f
--- /dev/null
+++ b/tokenizers/src/unicode_normalization_alignments/mod.rs
@@ -0,0 +1,73 @@
+pub use unicode_normalization::char;
+
+pub mod decompose;
+pub mod recompose;
+
+pub use decompose::Decompositions;
+pub use recompose::Recompositions;
+use std::str::Chars;
+
+/// Methods for iterating over strings while applying Unicode normalizations
+/// as described in
+/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
+pub trait UnicodeNormalization<I: Iterator<Item = char>> {
+    /// Returns an iterator over the string in Unicode Normalization Form D
+    /// (canonical decomposition).
+    fn nfd(self) -> Decompositions<I>;
+
+    /// Returns an iterator over the string in Unicode Normalization Form KD
+    /// (compatibility decomposition).
+    fn nfkd(self) -> Decompositions<I>;
+
+    /// Returns an iterator over the string in Unicode Normalization Form C
+    /// (canonical decomposition followed by canonical composition).
+    fn nfc(self) -> Recompositions<I>;
+
+    /// Returns an iterator over the string in Unicode Normalization Form KC
+    /// (compatibility decomposition followed by canonical composition).
+    fn nfkc(self) -> Recompositions<I>;
+}
+
+impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
+    #[inline]
+    fn nfd(self) -> Decompositions<Chars<'a>> {
+        decompose::new_canonical(self.chars())
+    }
+
+    #[inline]
+    fn nfkd(self) -> Decompositions<Chars<'a>> {
+        decompose::new_compatible(self.chars())
+    }
+
+    #[inline]
+    fn nfc(self) -> Recompositions<Chars<'a>> {
+        recompose::new_canonical(self.chars())
+    }
+
+    #[inline]
+    fn nfkc(self) -> Recompositions<Chars<'a>> {
+        recompose::new_compatible(self.chars())
+    }
+}
+
+impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
+    #[inline]
+    fn nfd(self) -> Decompositions<I> {
+        decompose::new_canonical(self)
+    }
+
+    #[inline]
+    fn nfkd(self) -> Decompositions<I> {
+        decompose::new_compatible(self)
+    }
+
+    #[inline]
+    fn nfc(self) -> Recompositions<I> {
+        recompose::new_canonical(self)
+    }
+
+    #[inline]
+    fn nfkc(self) -> Recompositions<I> {
+        recompose::new_compatible(self)
+    }
+}
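The trait mirrors the upstream `unicode_normalization::UnicodeNormalization` API, which is why call sites such as `normalizer.rs` and `strip.rs` only have to swap an import. A short sketch exercising both impls (hypothetical test, not part of this patch):

```rust
#[test]
fn trait_covers_str_and_char_iterators() {
    use crate::unicode_normalization_alignments::UnicodeNormalization;

    // &str impl: NFKD splits the ligature U+FB01 into 'f' + 'i', and the
    // extra character carries a +1 delta.
    assert_eq!(
        "\u{fb01}".nfkd().collect::<Vec<_>>(),
        vec![('f', 0), ('i', 1)]
    );

    // Blanket impl: any Iterator<Item = char> gets the same methods.
    assert_eq!(
        "A\u{30a}".chars().nfc().collect::<Vec<_>>(),
        vec![('\u{c5}', -1)]
    );
}
```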
diff --git a/tokenizers/src/unicode_normalization_alignments/recompose.rs b/tokenizers/src/unicode_normalization_alignments/recompose.rs
new file mode 100644
index 000000000..f60112e20
--- /dev/null
+++ b/tokenizers/src/unicode_normalization_alignments/recompose.rs
@@ -0,0 +1,145 @@
+use crate::unicode_normalization_alignments::decompose::Decompositions;
+use smallvec::SmallVec;
+use std::fmt::{self, Write};
+use unicode_normalization::char::{canonical_combining_class, compose};
+
+#[derive(Clone)]
+enum RecompositionState {
+    Composing,
+    Purging(usize),
+    Finished(usize),
+}
+
+/// External iterator for a string recomposition's characters.
+#[derive(Clone)]
+pub struct Recompositions<I> {
+    iter: Decompositions<I>,
+    state: RecompositionState,
+    buffer: SmallVec<[(char, isize); 4]>,
+    composee: Option<(char, isize)>,
+    last_ccc: Option<u8>,
+}
+
+#[inline]
+pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
+    Recompositions {
+        iter: super::decompose::new_canonical(iter),
+        state: self::RecompositionState::Composing,
+        buffer: SmallVec::new(),
+        composee: None,
+        last_ccc: None,
+    }
+}
+
+#[inline]
+pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
+    Recompositions {
+        iter: super::decompose::new_compatible(iter),
+        state: self::RecompositionState::Composing,
+        buffer: SmallVec::new(),
+        composee: None,
+        last_ccc: None,
+    }
+}
+
+impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
+    type Item = (char, isize);
+
+    #[inline]
+    fn next(&mut self) -> Option<(char, isize)> {
+        use self::RecompositionState::*;
+
+        loop {
+            match self.state {
+                Composing => {
+                    for (ch, change) in self.iter.by_ref() {
+                        let ch_class = canonical_combining_class(ch);
+                        let k = match self.composee {
+                            None => {
+                                if ch_class != 0 {
+                                    return Some((ch, change));
+                                }
+                                self.composee = Some((ch, change));
+                                continue;
+                            }
+                            Some(k) => k,
+                        };
+                        match self.last_ccc {
+                            None => match compose(k.0, ch) {
+                                Some(r) => {
+                                    self.composee = Some((r, k.1 + change - 1));
+                                    continue;
+                                }
+                                None => {
+                                    if ch_class == 0 {
+                                        self.composee = Some((ch, change));
+                                        return Some(k);
+                                    }
+                                    self.buffer.push((ch, change));
+                                    self.last_ccc = Some(ch_class);
+                                }
+                            },
+                            Some(l_class) => {
+                                if l_class >= ch_class {
+                                    // `ch` is blocked from `composee`
+                                    if ch_class == 0 {
+                                        self.composee = Some((ch, change));
+                                        self.last_ccc = None;
+                                        self.state = Purging(0);
+                                        return Some(k);
+                                    }
+                                    self.buffer.push((ch, change));
+                                    self.last_ccc = Some(ch_class);
+                                    continue;
+                                }
+                                match compose(k.0, ch) {
+                                    Some(r) => {
+                                        self.composee = Some((r, k.1 + change - 1));
+                                        continue;
+                                    }
+                                    None => {
+                                        self.buffer.push((ch, change));
+                                        self.last_ccc = Some(ch_class);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    self.state = Finished(0);
+                    if self.composee.is_some() {
+                        return self.composee.take();
+                    }
+                }
+                Purging(next) => match self.buffer.get(next).cloned() {
+                    None => {
+                        self.buffer.clear();
+                        self.state = Composing;
+                    }
+                    s => {
+                        self.state = Purging(next + 1);
+                        return s;
+                    }
+                },
+                Finished(next) => match self.buffer.get(next).cloned() {
+                    None => {
+                        self.buffer.clear();
+                        return self.composee.take();
+                    }
+                    s => {
+                        self.state = Finished(next + 1);
+                        return s;
+                    }
+                },
+            }
+        }
+    }
+}
+
+impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for c in self.clone() {
+            f.write_char(c.0)?;
+        }
+        Ok(())
+    }
+}
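The `last_ccc` bookkeeping implements UAX #15 blocking: a mark may only compose with the current `composee` if no mark of equal or higher combining class sits between them, so a lower-class mark does not block a later one. A hypothetical test pinning that down (not part of this patch):

```rust
#[test]
fn lower_class_marks_do_not_block_composition() {
    use crate::unicode_normalization_alignments::UnicodeNormalization;

    // U+0316 (ccc 220) sits between 'e' and U+0301 (ccc 230); since
    // 220 < 230 it does not block composition, so 'e' + U+0301 fuse
    // into U+00E9 (delta -1) and U+0316 is emitted afterwards.
    assert_eq!(
        "e\u{316}\u{301}".nfc().collect::<Vec<_>>(),
        vec![('\u{e9}', -1), ('\u{316}', 0)]
    );
}
```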