@@ -3,33 +3,69 @@ use crate::stream_safe;
33use crate :: tables;
44use crate :: UnicodeNormalization ;
55
6- /// QuickCheck quickly determines if a string is normalized, it can return
7- /// `Maybe`
6+ use core:: error:: Error ;
7+ use core:: fmt;
8+
/// Error returned when a string is not properly normalized.
///
/// The only information carried is the position up to which the checked
/// string was still normalized; retrieve it with
/// [`NormalizationError::normal_up_to`].
///
/// `PartialEq`/`Eq` are derived so that whole `Result`s carrying this error
/// can be compared directly (for example with `assert_eq!`).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct NormalizationError {
    /// String was normal up to this position.
    normal_up_to: usize,
}

impl NormalizationError {
    /// Returns the index in the given string up to which it was properly normalized.
    ///
    /// The checks in this module walk the input with `str::char_indices`, so
    /// for errors produced by them this is a byte offset into the checked
    /// `&str`.
    pub const fn normal_up_to(&self) -> usize {
        self.normal_up_to
    }
}

impl fmt::Display for NormalizationError {
    /// Writes a one-line, human-readable description including the position.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "string was not normalized at position {}",
            self.normal_up_to
        )
    }
}

impl Error for NormalizationError {}
31+
/// Whether additional checking is necessary to verify normalization.
///
/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
///
/// A text that is definitely *not* normalized is not represented here; the
/// quick checks report that case through the `Err` side of their result.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum QuickCheck {
    /// The text is definitely normalized.
    Yes,
    /// The text may be normalized.
    Maybe,
}
2144
45+ /// Result of quickly checking if a string is normalized.
46+ pub type QuickResult = Result < QuickCheck , NormalizationError > ;
47+
48+ /// Result of authoritatively checking if a string is normalized.
49+ pub type FullResult = Result < ( ) , NormalizationError > ;
50+
/// Normalization status of single character.
///
/// This is the per-character answer handed to `quick_check` by its
/// `is_allowed` callback.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum IsNormalized {
    /// The character is allowed; the scan simply continues.
    Yes,
    /// The character is not allowed; the scan fails at this character.
    No,
    /// The character alone cannot settle the question; the overall quick
    /// result is downgraded to "maybe".
    Maybe,
}
57+
2258// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
2359#[ inline]
24- fn quick_check < F , I > ( s : I , is_allowed : F , stream_safe : bool ) -> IsNormalized
60+ fn quick_check < F , I > ( s : I , is_allowed : F , stream_safe : bool ) -> QuickResult
2561where
26- I : Iterator < Item = char > ,
62+ I : Iterator < Item = ( usize , char ) > ,
2763 F : Fn ( char ) -> IsNormalized ,
2864{
2965 let mut last_cc = 0u8 ;
3066 let mut nonstarter_count = 0 ;
31- let mut result = IsNormalized :: Yes ;
32- for ch in s {
67+ let mut result = QuickCheck :: Yes ;
68+ for ( idx , ch ) in s {
3369 // For ASCII we know it's always allowed and a starter
3470 if ch <= '\x7f' {
3571 last_cc = 0 ;
@@ -40,13 +76,13 @@ where
4076 // Otherwise, lookup the combining class and QC property
4177 let cc = canonical_combining_class ( ch) ;
4278 if last_cc > cc && cc != 0 {
43- return IsNormalized :: No ;
79+ return Err ( NormalizationError { normal_up_to : idx } ) ;
4480 }
4581 match is_allowed ( ch) {
4682 IsNormalized :: Yes => ( ) ,
47- IsNormalized :: No => return IsNormalized :: No ,
83+ IsNormalized :: No => return Err ( NormalizationError { normal_up_to : idx } ) ,
4884 IsNormalized :: Maybe => {
49- result = IsNormalized :: Maybe ;
85+ result = QuickCheck :: Maybe ;
5086 }
5187 }
5288 if stream_safe {
5591 // If we're above `MAX_NONSTARTERS`, we're definitely *not*
5692 // stream-safe normalized.
5793 if nonstarter_count + decomp. leading_nonstarters > stream_safe:: MAX_NONSTARTERS {
58- return IsNormalized :: No ;
94+ return Err ( NormalizationError { normal_up_to : idx } ) ;
5995 }
6096 if decomp. leading_nonstarters == decomp. decomposition_len {
6197 nonstarter_count += decomp. decomposition_len ;
@@ -65,126 +101,170 @@ where
65101 }
66102 last_cc = cc;
67103 }
68- result
104+ Ok ( result)
105+ }
106+
107+ fn full_check < I : Iterator < Item = ( usize , char ) > , J : Iterator < Item = char > > (
108+ check : I ,
109+ normalized : J ,
110+ ) -> FullResult {
111+ check. zip ( normalized) . try_for_each ( |( ( idx, lhs) , rhs) | {
112+ if lhs == rhs {
113+ Ok ( ( ) )
114+ } else {
115+ Err ( NormalizationError { normal_up_to : idx } )
116+ }
117+ } )
69118}
70119
71- /// Quickly check if a string is in NFC, potentially returning
72- /// `IsNormalized::Maybe` if further checks are necessary. In this case a check
73- /// like `s.chars().nfc().eq(s.chars())` should suffice.
120+ /// Quickly check if a string is in NFC.
74121#[ inline]
75- pub fn is_nfc_quick < I : Iterator < Item = char > > ( s : I ) -> IsNormalized {
76- quick_check ( s, tables:: qc_nfc, false )
122+ pub fn check_nfc_quick ( s : & str ) -> QuickResult {
123+ quick_check ( s. char_indices ( ) , tables:: qc_nfc, false )
77124}
78125
79126/// Quickly check if a string is in NFKC.
80127#[ inline]
81- pub fn is_nfkc_quick < I : Iterator < Item = char > > ( s : I ) -> IsNormalized {
82- quick_check ( s, tables:: qc_nfkc, false )
128+ pub fn check_nfkc_quick ( s : & str ) -> QuickResult {
129+ quick_check ( s. char_indices ( ) , tables:: qc_nfkc, false )
83130}
84131
85132/// Quickly check if a string is in NFD.
86133#[ inline]
87- pub fn is_nfd_quick < I : Iterator < Item = char > > ( s : I ) -> IsNormalized {
88- quick_check ( s, tables:: qc_nfd, false )
134+ pub fn check_nfd_quick ( s : & str ) -> QuickResult {
135+ quick_check ( s. char_indices ( ) , tables:: qc_nfd, false )
89136}
90137
91138/// Quickly check if a string is in NFKD.
92139#[ inline]
93- pub fn is_nfkd_quick < I : Iterator < Item = char > > ( s : I ) -> IsNormalized {
94- quick_check ( s, tables:: qc_nfkd, false )
140+ pub fn check_nfkd_quick ( s : & str ) -> QuickResult {
141+ quick_check ( s. char_indices ( ) , tables:: qc_nfkd, false )
95142}
96143
97144/// Quickly check if a string is Stream-Safe NFC.
98145#[ inline]
99- pub fn is_nfc_stream_safe_quick < I : Iterator < Item = char > > ( s : I ) -> IsNormalized {
100- quick_check ( s, tables:: qc_nfc, true )
146+ pub fn check_nfc_stream_safe_quick ( s : & str ) -> QuickResult {
147+ quick_check ( s. char_indices ( ) , tables:: qc_nfc, true )
101148}
102149
103150/// Quickly check if a string is Stream-Safe NFD.
104151#[ inline]
105- pub fn is_nfd_stream_safe_quick < I : Iterator < Item = char > > ( s : I ) -> IsNormalized {
106- quick_check ( s, tables:: qc_nfd, true )
152+ pub fn check_nfd_stream_safe_quick ( s : & str ) -> QuickResult {
153+ quick_check ( s. char_indices ( ) , tables:: qc_nfd, true )
107154}
108155
109156/// Authoritatively check if a string is in NFC.
110157#[ inline]
111- pub fn is_nfc ( s : & str ) -> bool {
112- match is_nfc_quick ( s. chars ( ) ) {
113- IsNormalized :: Yes => true ,
114- IsNormalized :: No => false ,
115- IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . nfc ( ) ) ,
158+ pub fn check_nfc ( s : & str ) -> FullResult {
159+ match check_nfc_quick ( s) ? {
160+ QuickCheck :: Yes => Ok ( ( ) ) ,
161+ QuickCheck :: Maybe => full_check ( s. char_indices ( ) , s. chars ( ) . nfc ( ) ) ,
116162 }
117163}
118164
165+ /// Return whether a string is in NFC.
166+ #[ inline]
167+ pub fn is_nfc ( s : & str ) -> bool {
168+ check_nfc ( s) . is_ok ( )
169+ }
170+
119171/// Authoritatively check if a string is in NFKC.
120172#[ inline]
121- pub fn is_nfkc ( s : & str ) -> bool {
122- match is_nfkc_quick ( s. chars ( ) ) {
123- IsNormalized :: Yes => true ,
124- IsNormalized :: No => false ,
125- IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . nfkc ( ) ) ,
173+ pub fn check_nfkc ( s : & str ) -> FullResult {
174+ match check_nfkc_quick ( s) ? {
175+ QuickCheck :: Yes => Ok ( ( ) ) ,
176+ QuickCheck :: Maybe => full_check ( s. char_indices ( ) , s. chars ( ) . nfkc ( ) ) ,
126177 }
127178}
128179
180+ /// Return whether a string is in NFKC.
181+ #[ inline]
182+ pub fn is_nfkc ( s : & str ) -> bool {
183+ check_nfkc ( s) . is_ok ( )
184+ }
185+
129186/// Authoritatively check if a string is in NFD.
130187#[ inline]
131- pub fn is_nfd ( s : & str ) -> bool {
132- match is_nfd_quick ( s. chars ( ) ) {
133- IsNormalized :: Yes => true ,
134- IsNormalized :: No => false ,
135- IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . nfd ( ) ) ,
188+ pub fn check_nfd ( s : & str ) -> FullResult {
189+ match check_nfd_quick ( s) ? {
190+ QuickCheck :: Yes => Ok ( ( ) ) ,
191+ QuickCheck :: Maybe => full_check ( s. char_indices ( ) , s. chars ( ) . nfd ( ) ) ,
136192 }
137193}
138194
195+ /// Return whether a string is in NFD.
196+ #[ inline]
197+ pub fn is_nfd ( s : & str ) -> bool {
198+ check_nfd ( s) . is_ok ( )
199+ }
200+
139201/// Authoritatively check if a string is in NFKD.
140202#[ inline]
141- pub fn is_nfkd ( s : & str ) -> bool {
142- match is_nfkd_quick ( s. chars ( ) ) {
143- IsNormalized :: Yes => true ,
144- IsNormalized :: No => false ,
145- IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . nfkd ( ) ) ,
203+ pub fn check_nfkd ( s : & str ) -> FullResult {
204+ match check_nfkd_quick ( s) ? {
205+ QuickCheck :: Yes => Ok ( ( ) ) ,
206+ QuickCheck :: Maybe => full_check ( s. char_indices ( ) , s. chars ( ) . nfkd ( ) ) ,
146207 }
147208}
148209
210+ /// Return whether a string is in NFKD.
211+ #[ inline]
212+ pub fn is_nfkd ( s : & str ) -> bool {
213+ check_nfkd ( s) . is_ok ( )
214+ }
215+
149216/// Authoritatively check if a string is Stream-Safe NFC.
150217#[ inline]
151- pub fn is_nfc_stream_safe ( s : & str ) -> bool {
152- match is_nfc_stream_safe_quick ( s. chars ( ) ) {
153- IsNormalized :: Yes => true ,
154- IsNormalized :: No => false ,
155- IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . stream_safe ( ) . nfc ( ) ) ,
218+ pub fn check_nfc_stream_safe ( s : & str ) -> FullResult {
219+ match check_nfc_stream_safe_quick ( s) ? {
220+ QuickCheck :: Yes => Ok ( ( ) ) ,
221+ QuickCheck :: Maybe => full_check ( s. char_indices ( ) , s. chars ( ) . stream_safe ( ) . nfc ( ) ) ,
156222 }
157223}
158224
225+ /// Return whether a string is Stream-Safe NFC.
226+ #[ inline]
227+ pub fn is_nfc_stream_safe ( s : & str ) -> bool {
228+ check_nfc_stream_safe ( s) . is_ok ( )
229+ }
230+
159231/// Authoritatively check if a string is Stream-Safe NFD.
160232#[ inline]
161- pub fn is_nfd_stream_safe ( s : & str ) -> bool {
162- match is_nfd_stream_safe_quick ( s. chars ( ) ) {
163- IsNormalized :: Yes => true ,
164- IsNormalized :: No => false ,
165- IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . stream_safe ( ) . nfd ( ) ) ,
233+ pub fn check_nfd_stream_safe ( s : & str ) -> FullResult {
234+ match check_nfd_stream_safe_quick ( s) ? {
235+ QuickCheck :: Yes => Ok ( ( ) ) ,
236+ QuickCheck :: Maybe => full_check ( s. char_indices ( ) , s. chars ( ) . stream_safe ( ) . nfd ( ) ) ,
166237 }
167238}
168239
240+ /// Return whether a string is Stream-Safe NFD.
241+ #[ inline]
242+ pub fn is_nfd_stream_safe ( s : & str ) -> bool {
243+ check_nfd_stream_safe ( s) . is_ok ( )
244+ }
245+
// Unit tests for the stream-safe quick checks: each test feeds one string that
// is expected to pass and one that is expected to be rejected.
// NOTE(review): in this view the `\u{..}` escapes inside the literals are
// separated by spaces. An ASCII space takes the ASCII fast path in
// `quick_check` (treated as an allowed starter), so if those spaces are really
// part of the literals the `too_much` cases may not exercise the non-starter
// limit at all — confirm against the checked-in file.
169246#[ cfg( test) ]
170247mod tests {
171- use super :: { is_nfc_stream_safe_quick , is_nfd_stream_safe_quick , IsNormalized } ;
248+ use super :: { check_nfc_stream_safe_quick , check_nfd_stream_safe_quick , QuickCheck } ;
172249
// Stream-Safe NFD: `okay` must yield `QuickCheck::Yes`; `too_much` is the same
// text with one extra combining mark (U+031E) and must yield an error.
173250 #[ test]
174251 fn test_stream_safe_nfd ( ) {
175252 let okay = "Da\u{031b} \u{0316} \u{0317} \u{0318} \u{0319} \u{031c} \u{031d} \u{0300} \u{0301} \u{0302} \u{0303} \u{0304} \u{0305} \u{0306} \u{0307} \u{0308} \u{0309} \u{030a} \u{030b} \u{030c} \u{030d} \u{030e} \u{030f} \u{0310} \u{0311} \u{0312} \u{0313} \u{0314} \u{0315} \u{031a} ngerzone" ;
176- assert_eq ! ( is_nfd_stream_safe_quick ( okay. chars ( ) ) , IsNormalized :: Yes ) ;
253+ assert_eq ! ( check_nfd_stream_safe_quick ( okay) . unwrap ( ) , QuickCheck :: Yes ) ;
177254
178255 let too_much = "Da\u{031b} \u{0316} \u{0317} \u{0318} \u{0319} \u{031c} \u{031d} \u{031e} \u{0300} \u{0301} \u{0302} \u{0303} \u{0304} \u{0305} \u{0306} \u{0307} \u{0308} \u{0309} \u{030a} \u{030b} \u{030c} \u{030d} \u{030e} \u{030f} \u{0310} \u{0311} \u{0312} \u{0313} \u{0314} \u{0315} \u{031a} ngerzone" ;
179- assert_eq ! ( is_nfd_stream_safe_quick ( too_much. chars ( ) ) , IsNormalized :: No ) ;
256+ assert ! ( check_nfd_stream_safe_quick ( too_much) . is_err ( ) ) ;
180257 }
181258
// Stream-Safe NFC: `okay` must yield `QuickCheck::Maybe` (the quick check is
// inconclusive, not failing); `too_much` adds one combining mark (U+031E) and
// must yield an error.
182259 #[ test]
183260 fn test_stream_safe_nfc ( ) {
184261 let okay = "ok\u{e0} \u{031b} \u{0316} \u{0317} \u{0318} \u{0319} \u{031c} \u{031d} \u{0301} \u{0302} \u{0303} \u{0304} \u{0305} \u{0306} \u{0307} \u{0308} \u{0309} \u{030a} \u{030b} \u{030c} \u{030d} \u{030e} \u{030f} \u{0310} \u{0311} \u{0312} \u{0313} \u{0314} \u{0315} \u{031a} y" ;
185- assert_eq ! ( is_nfc_stream_safe_quick( okay. chars( ) ) , IsNormalized :: Maybe ) ;
262+ assert_eq ! (
263+ check_nfc_stream_safe_quick( okay) . unwrap( ) ,
264+ QuickCheck :: Maybe
265+ ) ;
186266
187267 let too_much = "not ok\u{e0} \u{031b} \u{0316} \u{0317} \u{0318} \u{0319} \u{031c} \u{031d} \u{031e} \u{0301} \u{0302} \u{0303} \u{0304} \u{0305} \u{0306} \u{0307} \u{0308} \u{0309} \u{030a} \u{030b} \u{030c} \u{030d} \u{030e} \u{030f} \u{0310} \u{0311} \u{0312} \u{0313} \u{0314} \u{0315} \u{031a} y" ;
188- assert_eq ! ( is_nfc_stream_safe_quick ( too_much. chars ( ) ) , IsNormalized :: No ) ;
268+ assert ! ( check_nfc_stream_safe_quick ( too_much) . is_err ( ) ) ;
189269 }
190270}
0 commit comments