This bumps IDNA with a recent fix to to_ascii. (#351)

lemire · web-flow · commit 49fef7ffa641 · 2023-04-26T14:36:45.000-04:00
diff --git a/include/ada/ada_idna.h b/include/ada/ada_idna.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2023-03-28 11:03:13 -0400. Do not edit! */
+/* auto-generated on 2023-04-26 14:14:42 -0400. Do not edit! */
 /* begin file include/idna.h */
 #ifndef ADA_IDNA_H
 #define ADA_IDNA_H
@@ -30,6 +30,7 @@ size_t utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output);
 
 #include <string>
 #include <string_view>
+
 namespace ada::idna {
 
 // If the input is ascii, then the mapping is just -> lower case.
@@ -49,6 +50,7 @@ std::u32string map(std::u32string_view input);
 
 #include <string>
 #include <string_view>
+
 namespace ada::idna {
 
 // Normalize the characters according to IDNA (Unicode Normalization Form C).
@@ -63,6 +65,7 @@ void normalize(std::u32string& input);
 
 #include <string>
 #include <string_view>
+
 namespace ada::idna {
 
 bool punycode_to_utf32(std::string_view input, std::u32string& out);
@@ -99,23 +102,31 @@ bool is_label_valid(const std::u32string_view label);
 #include <string_view>
 
 namespace ada::idna {
+
 // Converts a domain (e.g., www.google.com) possibly containing international
 // characters to an ascii domain (with punycode). It will not do percent
 // decoding: percent decoding should be done prior to calling this function. We
 // do not remove tabs and spaces, they should have been removed prior to calling
 // this function. We also do not trim control characters. We also assume that
-// the input is not empty. We return "" on error. For now.
+// the input is not empty. We return "" on error.
+//
+// Example: "www.öbb.at" -> "www.xn--bb-eka.at"
+//
+// This function may accept or even produce invalid domains.
 std::string to_ascii(std::string_view ut8_string);
 
+// Returns true if the string contains a forbidden code point according to the
+// WHATWG URL specification:
+// https://url.spec.whatwg.org/#forbidden-domain-code-point
+bool contains_forbidden_domain_code_point(std::string_view ascii_string);
+
 bool constexpr begins_with(std::u32string_view view,
                            std::u32string_view prefix);
 bool constexpr begins_with(std::string_view view, std::string_view prefix);
 
 bool constexpr is_ascii(std::u32string_view view);
 bool constexpr is_ascii(std::string_view view);
 
-std::string from_ascii_to_ascii(std::string_view ut8_string);
-
 }  // namespace ada::idna
 
 #endif  // ADA_IDNA_TO_ASCII_H
@@ -125,8 +136,12 @@ std::string from_ascii_to_ascii(std::string_view ut8_string);
 #ifndef ADA_IDNA_TO_UNICODE_H
 #define ADA_IDNA_TO_UNICODE_H
 
+#include <string_view>
+
 namespace ada::idna {
+
 std::string to_unicode(std::string_view input);
+
 }  // namespace ada::idna
 
 #endif  // ADA_IDNA_TO_UNICODE_H
diff --git a/src/ada_idna.cpp b/src/ada_idna.cpp
@@ -1,9 +1,10 @@
-/* auto-generated on 2023-03-28 11:03:13 -0400. Do not edit! */
+/* auto-generated on 2023-04-26 14:14:42 -0400. Do not edit! */
 /* begin file src/idna.cpp */
 /* begin file src/unicode_transcoding.cpp */
 
 #include <cstdint>
 #include <cstring>
+
 namespace ada::idna {
 
 size_t utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
@@ -7885,9 +7886,10 @@ const char32_t uninorms::decomposition_data[] = {
 namespace ada::idna {
 
 void normalize(std::u32string& input) {
-  //    [Normalize](https://www.unicode.org/reports/tr46/#ProcessingStepNormalize).
-  //    Normalize
-  //     the domain_name string to Unicode Normalization Form C.
+  /**
+   * Normalize the domain_name string to Unicode Normalization Form C.
+   * @see https://www.unicode.org/reports/tr46/#ProcessingStepNormalize
+   */
   ufal::unilib::uninorms::nfc(input);
 }
 
@@ -8115,7 +8117,6 @@ bool utf32_to_punycode(std::u32string_view input, std::string &out) {
 }  // namespace ada::idna
 /* end file src/punycode.cpp */
 /* begin file src/validity.cpp */
-
 #include <algorithm>
 #include <string_view>
 
@@ -9503,18 +9504,18 @@ constexpr static uint8_t is_forbidden_domain_code_point_table[] = {
 
 static_assert(sizeof(is_forbidden_domain_code_point_table) == 256);
 
-inline constexpr bool is_forbidden_domain_code_point(const char c) noexcept {
+inline bool is_forbidden_domain_code_point(const char c) noexcept {
   return is_forbidden_domain_code_point_table[uint8_t(c)];
 }
 
-// We return "" on error. For now.
-std::string from_ascii_to_ascii(std::string_view ut8_string) {
-  static const std::string error = "";
-  if (std::any_of(ut8_string.begin(), ut8_string.end(),
-                  is_forbidden_domain_code_point)) {
-    return error;
-  }
+bool contains_forbidden_domain_code_point(std::string_view view) {
+  return (
+      std::any_of(view.begin(), view.end(), is_forbidden_domain_code_point));
+}
 
+// We return "" on error.
+static std::string from_ascii_to_ascii(std::string_view ut8_string) {
+  static const std::string error = "";
   // copy and map
   // we could be more efficient by avoiding the copy when unnecessary.
   std::string mapped_string = std::string(ut8_string);
@@ -9568,7 +9569,7 @@ std::string from_ascii_to_ascii(std::string_view ut8_string) {
   return out;
 }
 
-// We return "" on error. For now.
+// We return "" on error.
 std::string to_ascii(std::string_view ut8_string) {
   if (is_ascii(ut8_string)) {
     return from_ascii_to_ascii(ut8_string);
@@ -9655,11 +9656,6 @@ std::string to_ascii(std::string_view ut8_string) {
       out.push_back('.');
     }
   }
-
-  if (std::any_of(out.begin(), out.end(), is_forbidden_domain_code_point)) {
-    return error;
-  }
-
   return out;
 }
 }  // namespace ada::idna
diff --git a/src/unicode.cpp b/src/unicode.cpp
@@ -426,7 +426,8 @@ bool to_ascii(std::optional<std::string>& out, const std::string_view plain,
   }
   // input is a non-empty UTF-8 string, must be percent decoded
   std::string idna_ascii = ada::idna::to_ascii(input);
-  if (idna_ascii.empty()) {
+  if (idna_ascii.empty() || contains_forbidden_domain_code_point(
+                                idna_ascii.data(), idna_ascii.size())) {
     return false;
   }
   out = std::move(idna_ascii);

Original file line number	Diff line number	Diff line change
`@@ -426,7 +426,8 @@ bool to_ascii(std::optional<std::string>& out, const std::string_view plain,`
`426`	`426`	`}`
`427`	`427`	`// input is a non-empty UTF-8 string, must be percent decoded`
`428`	`428`	`std::string idna_ascii = ada::idna::to_ascii(input);`
`429`		`- if (idna_ascii.empty()) {`
	`429`	`+ if (idna_ascii.empty() \|\| contains_forbidden_domain_code_point(`
	`430`	`+ idna_ascii.data(), idna_ascii.size())) {`
`430`	`431`	`return false;`
`431`	`432`	`}`
`432`	`433`	`out = std::move(idna_ascii);`