@@ -799,6 +799,8 @@ pub fn is_utf8(v: &[u8]) -> bool {
799799 // first C2 80 last DF BF
800800 // 3-byte encoding is for codepoints \u0800 to \uffff
801801 // first E0 A0 80 last EF BF BF
802+ // excluding surrogate codepoints \ud800 to \udfff
803+ // ED A0 80 to ED BF BF
802804 // 4-byte encoding is for codepoints \u10000 to \u10ffff
803805 // first F0 90 80 80 last F4 8F BF BF
804806 //
@@ -812,8 +814,6 @@ pub fn is_utf8(v: &[u8]) -> bool {
812814 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
813815 // %xF4 %x80-8F 2( UTF8-tail )
814816 // UTF8-tail = %x80-BF
815- // --
816- // This code allows surrogate pairs: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
817817 match w {
818818 2 => if unsafe_get ( v, i + 1 ) & 192u8 != TAG_CONT_U8 {
819819 return false
@@ -822,7 +822,9 @@ pub fn is_utf8(v: &[u8]) -> bool {
822822 unsafe_get ( v, i + 1 ) ,
823823 unsafe_get ( v, i + 2 ) & 192u8 ) {
824824 ( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
825- ( 0xE1 .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
825+ ( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
826+ ( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) => ( ) ,
827+ ( 0xEE .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
826828 _ => return false ,
827829 } ,
828830 _ => match ( v_i,
@@ -3012,6 +3014,7 @@ mod tests {
30123014
30133015 #[test]
30143016 fn test_is_utf8() {
3017+ // deny overlong encodings
30153018 assert!(!is_utf8([0xc0, 0x80]));
30163019 assert!(!is_utf8([0xc0, 0xae]));
30173020 assert!(!is_utf8([0xe0, 0x80, 0x80]));
@@ -3020,9 +3023,15 @@ mod tests {
30203023 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
30213024 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
30223025
3026+ // deny surrogates
3027+ assert!(!is_utf8([0xED, 0xA0, 0x80]));
3028+ assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3029+
30233030 assert!(is_utf8([0xC2, 0x80]));
30243031 assert!(is_utf8([0xDF, 0xBF]));
30253032 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3033+ assert!(is_utf8([0xED, 0x9F, 0xBF]));
3034+ assert!(is_utf8([0xEE, 0x80, 0x80]));
30263035 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
30273036 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
30283037 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
0 commit comments