tokenizers/Cargo.toml: 2 additions, 1 deletion
@@ -54,7 +54,7 @@ rayon = "1.10"
rayon-cond = "0.4"
serde = { version = "1.0", features = [ "derive" ] }
serde_json = "1.0"
-unicode-normalization-alignments = "0.1"
+unicode-normalization = "^0.1.25"
unicode_categories = "0.1"
unicode-segmentation = "1.11"
indicatif = {version = "0.18", optional = true}
@@ -74,6 +74,7 @@ monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
dary_heap = { version = "0.3.6", features = ["serde"] }
compact_str = { version = "0.9", features = ["serde"] }
+smallvec = "1"

[features]
default = ["progressbar", "onig", "esaxx_fast"]
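For reference, the new `unicode-normalization` dependency supplies the low-level primitives (canonical combining classes and decomposition callbacks) that the vendored module below builds on, while `smallvec` provides its inline buffer type. A minimal standalone sketch of those primitives (an illustration, not code from this PR):

use unicode_normalization::char::{canonical_combining_class, decompose_canonical};

fn main() {
    // 'é' (U+00E9) decomposes canonically into 'e' plus U+0301 (combining acute).
    let mut parts = Vec::new();
    decompose_canonical('é', |c| parts.push(c));
    assert_eq!(parts, vec!['e', '\u{0301}']);

    // A combining class of 0 marks a "starter"; nonzero classes mark
    // combining characters that may need reordering into canonical order.
    assert_eq!(canonical_combining_class('e'), 0);
    assert_eq!(canonical_combining_class('\u{0301}'), 230);
}
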
tokenizers/src/lib.rs: 2 additions, 0 deletions
@@ -145,6 +145,8 @@ pub mod pre_tokenizers;
pub mod processors;
pub mod tokenizer;

+pub(crate) mod unicode_normalization_alignments;
+
// Re-export from tokenizer
pub use tokenizer::*;

tokenizers/src/normalizers/strip.rs: 2 additions, 2 deletions
@@ -1,7 +1,7 @@
use crate::tokenizer::{NormalizedString, Normalizer, Result};
+use crate::unicode_normalization_alignments::char::is_combining_mark;
use crate::utils::macro_rules_attribute;
use serde::{Deserialize, Serialize};
-use unicode_normalization_alignments::char::is_combining_mark;

#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
@@ -61,7 +61,7 @@ mod tests {
use crate::normalizer::NormalizedString;
use crate::normalizers::Lowercase;
use crate::normalizers::NFKD;
-use unicode_normalization_alignments::UnicodeNormalization;
+use crate::unicode_normalization_alignments::UnicodeNormalization;

#[test]
fn test_strip_accents() {
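The `is_combining_mark` import swapped above powers accent stripping: decompose, then drop the combining marks. A standalone sketch of that idea (simplified, without the offset tracking `NormalizedString` performs):

use unicode_normalization::char::is_combining_mark;
use unicode_normalization::UnicodeNormalization;

fn strip_accents(s: &str) -> String {
    // NFD splits 'é' into 'e' + U+0301; the filter then drops the mark.
    s.nfd().filter(|c| !is_combining_mark(*c)).collect()
}

fn main() {
    assert_eq!(strip_accents("Héllo wörld"), "Hello world");
}
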
tokenizers/src/tokenizer/normalizer.rs: 1 addition, 1 deletion
@@ -1,7 +1,7 @@
use crate::pattern::Pattern;
+use crate::unicode_normalization_alignments::UnicodeNormalization;
use crate::{Offsets, Result};
use std::ops::{Bound, RangeBounds};
-use unicode_normalization_alignments::UnicodeNormalization;

use serde::{Deserialize, Serialize};

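This import swap matters here because `NormalizedString` relies on the `(char, isize)` items to keep normalized offsets aligned with the original text. A rough sketch of the observable behavior, assuming the `NormalizedString` API (`from`, `nfd`, `get`) as exercised by the crate's tests:

use tokenizers::tokenizer::NormalizedString;

fn main() {
    let mut n = NormalizedString::from("é");
    n.nfd(); // internally walks the vendored (char, isize) iterator
    // One input char became two, and the alignments map both back to 'é'.
    assert_eq!(n.get(), "e\u{0301}");
}
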
tokenizers/src/unicode_normalization_alignments/decompose.rs: 161 additions (new file)
@@ -0,0 +1,161 @@
use smallvec::SmallVec;
use std::fmt::{self, Write};
use std::iter::Fuse;
use std::ops::Range;
use unicode_normalization::char::{
    canonical_combining_class, decompose_canonical, decompose_compatible,
};

#[derive(Clone)]
enum DecompositionType {
    Canonical,
    Compatible,
}

/// External iterator for a string decomposition's characters.
#[derive(Clone)]
pub struct Decompositions<I> {
    kind: DecompositionType,
    iter: Fuse<I>,

    // This buffer stores triples of (canonical combining class, character,
    // alignment flag), pushed onto the end in text order.
    //
    // It's divided into up to three sections:
    // 1) A prefix that is free space;
    // 2) "Ready" characters which are sorted and ready to emit on demand;
    // 3) A "pending" block which still needs more characters before we can
    //    sort it into canonical order, so it is not yet safe to emit.
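    //
    // For example, after reading "é" then "b" ('é' decomposes to 'e' plus
    // U+0301), the starter 'b' triggers a sort of everything before it:
    //   buffer: [(0, 'e', 0), (230, '\u{0301}', 1), (0, 'b', 0)]
    //   ready:  0..2        // 'b' itself is still pending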
    buffer: SmallVec<[(u8, char, isize); 4]>,
    ready: Range<usize>,
}

#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Canonical,
        iter: iter.fuse(),
        buffer: SmallVec::new(),
        ready: 0..0,
    }
}

#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Compatible,
        iter: iter.fuse(),
        buffer: SmallVec::new(),
        ready: 0..0,
    }
}

impl<I> Decompositions<I> {
    #[inline]
    fn push_back(&mut self, ch: char, first: bool) {
        let class = canonical_combining_class(ch);

        if class == 0 {
            self.sort_pending();
        }

        self.buffer.push((class, ch, if first { 0 } else { 1 }));
    }

    #[inline]
    fn sort_pending(&mut self) {
        // NB: `sort_by_key` is stable, so it will preserve the original text's
        // order within a combining class.
        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
        self.ready.end = self.buffer.len();
    }

    #[inline]
    fn reset_buffer(&mut self) {
        // Equivalent to `self.buffer.drain(0..self.ready.end)` (if SmallVec
        // supported this API)
        let pending = self.buffer.len() - self.ready.end;
        for i in 0..pending {
            self.buffer[i] = self.buffer[i + self.ready.end];
        }
        self.buffer.truncate(pending);
        self.ready = 0..0;
    }

    #[inline]
    fn increment_next_ready(&mut self) {
        let next = self.ready.start + 1;
        if next == self.ready.end {
            self.reset_buffer();
        } else {
            self.ready.start = next;
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = (char, isize);

    #[inline]
    fn next(&mut self) -> Option<(char, isize)> {
        while self.ready.end == 0 {
            match (self.iter.next(), &self.kind) {
                (Some(ch), &DecompositionType::Canonical) => {
                    let mut first = true;
                    decompose_canonical(ch, |d| {
                        self.push_back(d, first);
                        first = false;
                    });
                }
                (Some(ch), &DecompositionType::Compatible) => {
                    let mut first = true;
                    decompose_compatible(ch, |d| {
                        self.push_back(d, first);
                        first = false;
                    });
                }
                (None, _) => {
                    if self.buffer.is_empty() {
                        return None;
                    } else {
                        self.sort_pending();

                        // This implementation means that we can call `next`
                        // on an exhausted iterator; the last outer `next` call
                        // will result in an inner `next` call. To make this
                        // safe, we use `fuse`.
                        break;
                    }
                }
            }
        }

        // We can assume here that, if `self.ready.end` is greater than zero,
        // it's also greater than `self.ready.start`. That's because we only
        // increment `self.ready.start` inside `increment_next_ready`, and
        // whenever it reaches equality with `self.ready.end`, we reset both
        // to zero, maintaining the invariant that:
        //   self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
        //
        // This less-than-obviously-safe implementation is chosen for performance,
        // minimizing the number & complexity of branches in `next` in the common
        // case of buffering then unbuffering a single character with each call.
        let (_, ch, size) = self.buffer[self.ready.start];
        self.increment_next_ready();
        Some((ch, size))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c.0)?;
        }
        Ok(())
    }
}
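
A sketch of a unit test that could sit in this module (hypothetical, not part of the PR), pinning down the alignment semantics of the `isize`: 0 for the first char emitted from an input char, 1 for each extra char, so callers can realign offsets:

#[cfg(test)]
mod alignment_sketch {
    use super::new_canonical;

    #[test]
    fn nfd_reports_alignment() {
        // One input char 'é' expands to two output chars.
        let out: Vec<(char, isize)> = new_canonical("é".chars()).collect();
        assert_eq!(out, vec![('e', 0), ('\u{0301}', 1)]);
    }
}
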
tokenizers/src/unicode_normalization_alignments/mod.rs: 73 additions (new file)
@@ -0,0 +1,73 @@
pub use unicode_normalization::char;

pub mod decompose;
pub mod recompose;

pub use decompose::Decompositions;
pub use recompose::Recompositions;
use std::str::Chars;

/// Methods for iterating over strings while applying Unicode normalizations
/// as described in
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
    /// Returns an iterator over the string in Unicode Normalization Form D
    /// (canonical decomposition).
    fn nfd(self) -> Decompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form KD
    /// (compatibility decomposition).
    fn nfkd(self) -> Decompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form C
    /// (canonical decomposition followed by canonical composition).
    fn nfc(self) -> Recompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form KC
    /// (compatibility decomposition followed by canonical composition).
    fn nfkc(self) -> Recompositions<I>;
}

impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
    #[inline]
    fn nfd(self) -> Decompositions<Chars<'a>> {
        decompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkd(self) -> Decompositions<Chars<'a>> {
        decompose::new_compatible(self.chars())
    }

    #[inline]
    fn nfc(self) -> Recompositions<Chars<'a>> {
        recompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkc(self) -> Recompositions<Chars<'a>> {
        recompose::new_compatible(self.chars())
    }
}

impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
    #[inline]
    fn nfd(self) -> Decompositions<I> {
        decompose::new_canonical(self)
    }

    #[inline]
    fn nfkd(self) -> Decompositions<I> {
        decompose::new_compatible(self)
    }

    #[inline]
    fn nfc(self) -> Recompositions<I> {
        recompose::new_canonical(self)
    }

    #[inline]
    fn nfkc(self) -> Recompositions<I> {
        recompose::new_compatible(self)
    }
}
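
For crate-internal callers the trait reads just like the external crate it replaces. A small sketch covering both impls (hypothetical demo function; only compilable inside the tokenizers crate, since the module is `pub(crate)`):

use crate::unicode_normalization_alignments::UnicodeNormalization;

fn demo() {
    // &str impl:
    let a: Vec<(char, isize)> = "é".nfd().collect();
    assert_eq!(a, vec![('e', 0), ('\u{0301}', 1)]);

    // Blanket impl over any char iterator:
    let b: Vec<(char, isize)> = "é".chars().nfd().collect();
    assert_eq!(a, b);
}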