Skip to content

Commit 462e889

Browse files
committed
Track position where normalization failed
1 parent ec98c83 commit 462e889

File tree

2 files changed

+148
-67
lines changed

2 files changed

+148
-67
lines changed

src/lib.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,10 @@ extern crate tinyvec;
5454

5555
pub use crate::decompose::Decompositions;
5656
pub use crate::quick_check::{
57-
is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
58-
is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
59-
IsNormalized,
57+
check_nfc, check_nfc_quick, check_nfc_stream_safe, check_nfc_stream_safe_quick, check_nfd,
58+
check_nfd_quick, check_nfd_stream_safe, check_nfd_stream_safe_quick, check_nfkc,
59+
check_nfkc_quick, check_nfkd, check_nfkd_quick, is_nfc, is_nfc_stream_safe, is_nfd,
60+
is_nfd_stream_safe, is_nfkc, is_nfkd, FullResult, NormalizationError, QuickCheck, QuickResult,
6061
};
6162
pub use crate::recompose::Recompositions;
6263
pub use crate::replace::Replacements;

src/quick_check.rs

Lines changed: 144 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,69 @@ use crate::stream_safe;
33
use crate::tables;
44
use crate::UnicodeNormalization;
55

6-
/// QuickCheck quickly determines if a string is normalized, it can return
7-
/// `Maybe`
6+
use core::error::Error;
7+
use core::fmt;
8+
9+
/// Error returned when a string is not properly normalized.
10+
#[derive(Clone, Debug)]
11+
pub struct NormalizationError {
12+
/// Byte offset up to which the string was normalized.
13+
normal_up_to: usize,
14+
}
15+
impl NormalizationError {
16+
/// Returns the byte index in the given string up to which it was properly normalized.
17+
pub const fn normal_up_to(&self) -> usize {
18+
self.normal_up_to
19+
}
20+
}
21+
impl fmt::Display for NormalizationError {
22+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23+
write!(
24+
f,
25+
"string was not normalized at position {}",
26+
self.normal_up_to
27+
)
28+
}
29+
}
30+
impl Error for NormalizationError {}
31+
32+
/// Whether additional checking is necessary to verify normalization.
833
///
934
/// The QuickCheck algorithm can quickly determine if a text is or isn't
1035
/// normalized without any allocations in many cases, but it has to be able to
1136
/// return `Maybe` when a full decomposition and recomposition is necessary.
1237
#[derive(Debug, Eq, PartialEq)]
13-
pub enum IsNormalized {
38+
pub enum QuickCheck {
1439
/// The text is definitely normalized.
1540
Yes,
16-
/// The text is definitely not normalized.
17-
No,
1841
/// The text may be normalized.
1942
Maybe,
2043
}
2144

45+
/// Result of quickly checking if a string is normalized.
46+
pub type QuickResult = Result<QuickCheck, NormalizationError>;
47+
48+
/// Result of authoritatively checking if a string is normalized.
49+
pub type FullResult = Result<(), NormalizationError>;
50+
51+
/// Normalization status of a single character.
52+
pub(crate) enum IsNormalized {
53+
Yes,
54+
No,
55+
Maybe,
56+
}
57+
2258
// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
2359
#[inline]
24-
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
60+
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> QuickResult
2561
where
26-
I: Iterator<Item = char>,
62+
I: Iterator<Item = (usize, char)>,
2763
F: Fn(char) -> IsNormalized,
2864
{
2965
let mut last_cc = 0u8;
3066
let mut nonstarter_count = 0;
31-
let mut result = IsNormalized::Yes;
32-
for ch in s {
67+
let mut result = QuickCheck::Yes;
68+
for (idx, ch) in s {
3369
// For ASCII we know it's always allowed and a starter
3470
if ch <= '\x7f' {
3571
last_cc = 0;
@@ -40,13 +76,13 @@ where
4076
// Otherwise, lookup the combining class and QC property
4177
let cc = canonical_combining_class(ch);
4278
if last_cc > cc && cc != 0 {
43-
return IsNormalized::No;
79+
return Err(NormalizationError { normal_up_to: idx });
4480
}
4581
match is_allowed(ch) {
4682
IsNormalized::Yes => (),
47-
IsNormalized::No => return IsNormalized::No,
83+
IsNormalized::No => return Err(NormalizationError { normal_up_to: idx }),
4884
IsNormalized::Maybe => {
49-
result = IsNormalized::Maybe;
85+
result = QuickCheck::Maybe;
5086
}
5187
}
5288
if stream_safe {
@@ -55,7 +91,7 @@ where
5591
// If we're above `MAX_NONSTARTERS`, we're definitely *not*
5692
// stream-safe normalized.
5793
if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS {
58-
return IsNormalized::No;
94+
return Err(NormalizationError { normal_up_to: idx });
5995
}
6096
if decomp.leading_nonstarters == decomp.decomposition_len {
6197
nonstarter_count += decomp.decomposition_len;
@@ -65,126 +101,170 @@ where
65101
}
66102
last_cc = cc;
67103
}
68-
result
104+
Ok(result)
105+
}
106+
107+
fn full_check<I: Iterator<Item = (usize, char)>, J: Iterator<Item = char>>(
108+
check: I,
109+
normalized: J,
110+
) -> FullResult {
111+
check.zip(normalized).try_for_each(|((idx, lhs), rhs)| {
112+
if lhs == rhs {
113+
Ok(())
114+
} else {
115+
Err(NormalizationError { normal_up_to: idx })
116+
}
117+
})
69118
}
70119

71-
/// Quickly check if a string is in NFC, potentially returning
72-
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
73-
/// like `s.chars().nfc().eq(s.chars())` should suffice.
120+
/// Quickly check if a string is in NFC.
74121
#[inline]
75-
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
76-
quick_check(s, tables::qc_nfc, false)
122+
pub fn check_nfc_quick(s: &str) -> QuickResult {
123+
quick_check(s.char_indices(), tables::qc_nfc, false)
77124
}
78125

79126
/// Quickly check if a string is in NFKC.
80127
#[inline]
81-
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
82-
quick_check(s, tables::qc_nfkc, false)
128+
pub fn check_nfkc_quick(s: &str) -> QuickResult {
129+
quick_check(s.char_indices(), tables::qc_nfkc, false)
83130
}
84131

85132
/// Quickly check if a string is in NFD.
86133
#[inline]
87-
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
88-
quick_check(s, tables::qc_nfd, false)
134+
pub fn check_nfd_quick(s: &str) -> QuickResult {
135+
quick_check(s.char_indices(), tables::qc_nfd, false)
89136
}
90137

91138
/// Quickly check if a string is in NFKD.
92139
#[inline]
93-
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
94-
quick_check(s, tables::qc_nfkd, false)
140+
pub fn check_nfkd_quick(s: &str) -> QuickResult {
141+
quick_check(s.char_indices(), tables::qc_nfkd, false)
95142
}
96143

97144
/// Quickly check if a string is Stream-Safe NFC.
98145
#[inline]
99-
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
100-
quick_check(s, tables::qc_nfc, true)
146+
pub fn check_nfc_stream_safe_quick(s: &str) -> QuickResult {
147+
quick_check(s.char_indices(), tables::qc_nfc, true)
101148
}
102149

103150
/// Quickly check if a string is Stream-Safe NFD.
104151
#[inline]
105-
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
106-
quick_check(s, tables::qc_nfd, true)
152+
pub fn check_nfd_stream_safe_quick(s: &str) -> QuickResult {
153+
quick_check(s.char_indices(), tables::qc_nfd, true)
107154
}
108155

109156
/// Authoritatively check if a string is in NFC.
110157
#[inline]
111-
pub fn is_nfc(s: &str) -> bool {
112-
match is_nfc_quick(s.chars()) {
113-
IsNormalized::Yes => true,
114-
IsNormalized::No => false,
115-
IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
158+
pub fn check_nfc(s: &str) -> FullResult {
159+
match check_nfc_quick(s)? {
160+
QuickCheck::Yes => Ok(()),
161+
QuickCheck::Maybe => full_check(s.char_indices(), s.chars().nfc()),
116162
}
117163
}
118164

165+
/// Return whether a string is in NFC.
166+
#[inline]
167+
pub fn is_nfc(s: &str) -> bool {
168+
check_nfc(s).is_ok()
169+
}
170+
119171
/// Authoritatively check if a string is in NFKC.
120172
#[inline]
121-
pub fn is_nfkc(s: &str) -> bool {
122-
match is_nfkc_quick(s.chars()) {
123-
IsNormalized::Yes => true,
124-
IsNormalized::No => false,
125-
IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()),
173+
pub fn check_nfkc(s: &str) -> FullResult {
174+
match check_nfkc_quick(s)? {
175+
QuickCheck::Yes => Ok(()),
176+
QuickCheck::Maybe => full_check(s.char_indices(), s.chars().nfkc()),
126177
}
127178
}
128179

180+
/// Return whether a string is in NFKC.
181+
#[inline]
182+
pub fn is_nfkc(s: &str) -> bool {
183+
check_nfkc(s).is_ok()
184+
}
185+
129186
/// Authoritatively check if a string is in NFD.
130187
#[inline]
131-
pub fn is_nfd(s: &str) -> bool {
132-
match is_nfd_quick(s.chars()) {
133-
IsNormalized::Yes => true,
134-
IsNormalized::No => false,
135-
IsNormalized::Maybe => s.chars().eq(s.chars().nfd()),
188+
pub fn check_nfd(s: &str) -> FullResult {
189+
match check_nfd_quick(s)? {
190+
QuickCheck::Yes => Ok(()),
191+
QuickCheck::Maybe => full_check(s.char_indices(), s.chars().nfd()),
136192
}
137193
}
138194

195+
/// Return whether a string is in NFD.
196+
#[inline]
197+
pub fn is_nfd(s: &str) -> bool {
198+
check_nfd(s).is_ok()
199+
}
200+
139201
/// Authoritatively check if a string is in NFKD.
140202
#[inline]
141-
pub fn is_nfkd(s: &str) -> bool {
142-
match is_nfkd_quick(s.chars()) {
143-
IsNormalized::Yes => true,
144-
IsNormalized::No => false,
145-
IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()),
203+
pub fn check_nfkd(s: &str) -> FullResult {
204+
match check_nfkd_quick(s)? {
205+
QuickCheck::Yes => Ok(()),
206+
QuickCheck::Maybe => full_check(s.char_indices(), s.chars().nfkd()),
146207
}
147208
}
148209

210+
/// Return whether a string is in NFKD.
211+
#[inline]
212+
pub fn is_nfkd(s: &str) -> bool {
213+
check_nfkd(s).is_ok()
214+
}
215+
149216
/// Authoritatively check if a string is Stream-Safe NFC.
150217
#[inline]
151-
pub fn is_nfc_stream_safe(s: &str) -> bool {
152-
match is_nfc_stream_safe_quick(s.chars()) {
153-
IsNormalized::Yes => true,
154-
IsNormalized::No => false,
155-
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()),
218+
pub fn check_nfc_stream_safe(s: &str) -> FullResult {
219+
match check_nfc_stream_safe_quick(s)? {
220+
QuickCheck::Yes => Ok(()),
221+
QuickCheck::Maybe => full_check(s.char_indices(), s.chars().stream_safe().nfc()),
156222
}
157223
}
158224

225+
/// Return whether a string is Stream-Safe NFC.
226+
#[inline]
227+
pub fn is_nfc_stream_safe(s: &str) -> bool {
228+
check_nfc_stream_safe(s).is_ok()
229+
}
230+
159231
/// Authoritatively check if a string is Stream-Safe NFD.
160232
#[inline]
161-
pub fn is_nfd_stream_safe(s: &str) -> bool {
162-
match is_nfd_stream_safe_quick(s.chars()) {
163-
IsNormalized::Yes => true,
164-
IsNormalized::No => false,
165-
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()),
233+
pub fn check_nfd_stream_safe(s: &str) -> FullResult {
234+
match check_nfd_stream_safe_quick(s)? {
235+
QuickCheck::Yes => Ok(()),
236+
QuickCheck::Maybe => full_check(s.char_indices(), s.chars().stream_safe().nfd()),
166237
}
167238
}
168239

240+
/// Return whether a string is Stream-Safe NFD.
241+
#[inline]
242+
pub fn is_nfd_stream_safe(s: &str) -> bool {
243+
check_nfd_stream_safe(s).is_ok()
244+
}
245+
169246
#[cfg(test)]
170247
mod tests {
171-
use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};
248+
use super::{check_nfc_stream_safe_quick, check_nfd_stream_safe_quick, QuickCheck};
172249

173250
#[test]
174251
fn test_stream_safe_nfd() {
175252
let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
176-
assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);
253+
assert_eq!(check_nfd_stream_safe_quick(okay).unwrap(), QuickCheck::Yes);
177254

178255
let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
179-
assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
256+
assert!(check_nfd_stream_safe_quick(too_much).is_err());
180257
}
181258

182259
#[test]
183260
fn test_stream_safe_nfc() {
184261
let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
185-
assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);
262+
assert_eq!(
263+
check_nfc_stream_safe_quick(okay).unwrap(),
264+
QuickCheck::Maybe
265+
);
186266

187267
let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
188-
assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
268+
assert!(check_nfc_stream_safe_quick(too_much).is_err());
189269
}
190270
}

0 commit comments

Comments
 (0)