Add idna_is_valid_u_label(), a per-label U-label validity check that follows
the RFC 5891 §4.2.3 criteria cited in the code comments below.

This patch adds the implementation, its exported declaration and usage
example in the public header, a new GoogleTest source file, and registers
that file in the IDNA test target.

diff --git a/src/core/idna/idna.cc b/src/core/idna/idna.cc
index 7d8fdb2b3..685c91f33 100644
--- a/src/core/idna/idna.cc
+++ b/src/core/idna/idna.cc
@@ -147,6 +147,52 @@ auto idna_passes_contextj(const std::u32string_view label,
   return true;
 }
 
+// TODO: Reject labels that are not NFC-normalized
+auto idna_is_valid_u_label(const std::u32string_view label) noexcept -> bool {
+  if (label.empty()) {
+    return false;
+  }
+
+  // RFC 5891 §4.2.3.1: must not start or end with a hyphen, and must not
+  // have a hyphen in both positions 3 and 4 (the IDNA A-label prefix
+  // shape "xn--" must not appear in U-labels).
+  if (label.front() == U'-' || label.back() == U'-') {
+    return false;
+  }
+  if (label.size() >= 4 && label[2] == U'-' && label[3] == U'-') {
+    return false;
+  }
+
+  // RFC 5891 §4.2.3.2: must not start with a combining mark.
+  if (is_combining_mark(label.front())) {
+    return false;
+  }
+
+  // RFC 5891 §4.2.3.3: every codepoint must be PVALID or satisfy its
+  // CONTEXTJ / CONTEXTO contextual rule. DISALLOWED and UNASSIGNED reject.
+  for (std::size_t position = 0; position < label.size(); ++position) {
+    switch (idna_property(label[position])) {
+      case IDNAProperty::PValid:
+        break;
+      case IDNAProperty::ContextJ:
+        if (!idna_passes_contextj(label, position)) {
+          return false;
+        }
+        break;
+      case IDNAProperty::ContextO:
+        if (!idna_passes_contexto(label, position)) {
+          return false;
+        }
+        break;
+      case IDNAProperty::Disallowed:
+      case IDNAProperty::Unassigned:
+        return false;
+    }
+  }
+
+  return true;
+}
+
 auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool {
   if (label.empty()) {
     return false;
diff --git a/src/core/idna/include/sourcemeta/core/idna.h b/src/core/idna/include/sourcemeta/core/idna.h
index a3ff5edaf..198312bf3 100644
--- a/src/core/idna/include/sourcemeta/core/idna.h
+++ b/src/core/idna/include/sourcemeta/core/idna.h
@@ -98,6 +98,23 @@ auto idna_passes_contextj(const std::u32string_view label,
 SOURCEMETA_CORE_IDNA_EXPORT
 auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool;
 
+/// @ingroup idna
+/// Return whether the given label is a valid U-label per RFC 5891 §4. See
+/// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria.
+/// The Bidi rule is not checked here because Bidi domain detection is a
+/// property of the whole domain, not of a single label. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/idna.h>
+/// #include <cassert>
+///
+/// assert(sourcemeta::core::idna_is_valid_u_label(U"d\u00E9j\u00E0"));
+/// assert(!sourcemeta::core::idna_is_valid_u_label(U"ab--cd"));
+/// assert(!sourcemeta::core::idna_is_valid_u_label(U"-abc"));
+/// ```
+SOURCEMETA_CORE_IDNA_EXPORT
+auto idna_is_valid_u_label(const std::u32string_view label) noexcept -> bool;
+
 } // namespace sourcemeta::core
 
 #endif
diff --git a/test/idna/CMakeLists.txt b/test/idna/CMakeLists.txt
index 17f651661..e22d99397 100644
--- a/test/idna/CMakeLists.txt
+++ b/test/idna/CMakeLists.txt
@@ -3,7 +3,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME idna
     idna_property_test.cc
     idna_passes_contexto_test.cc
     idna_passes_contextj_test.cc
-    idna_passes_bidi_rule_test.cc)
+    idna_passes_bidi_rule_test.cc
+    idna_is_valid_u_label_test.cc)
 
 target_link_libraries(sourcemeta_core_idna_unit
   PRIVATE sourcemeta::core::idna)
diff --git a/test/idna/idna_is_valid_u_label_test.cc b/test/idna/idna_is_valid_u_label_test.cc
new file mode 100644
index 000000000..0c8739872
--- /dev/null
+++ b/test/idna/idna_is_valid_u_label_test.cc
@@ -0,0 +1,145 @@
+#include <gtest/gtest.h>
+
+#include <sourcemeta/core/idna.h>
+
+TEST(IDNA_is_valid_u_label, ascii_letters) {
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"abc"));
+}
+
+TEST(IDNA_is_valid_u_label, ascii_letters_and_digits) {
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"abc123"));
+}
+
+TEST(IDNA_is_valid_u_label, ascii_with_internal_hyphen) {
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"ab-cd"));
+}
+
+TEST(IDNA_is_valid_u_label, latin_with_diacritic) {
+  // d, e+acute, j, a+grave
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"d\u00E9j\u00E0"));
+}
+
+TEST(IDNA_is_valid_u_label, hebrew_label) {
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u05D0\u05D1\u05D2"));
+}
+
+TEST(IDNA_is_valid_u_label, arabic_label) {
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u0627\u0628\u062A"));
+}
+
+TEST(IDNA_is_valid_u_label, devanagari_label) {
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u0905\u0915\u092E"));
+}
+
+TEST(IDNA_is_valid_u_label, devanagari_with_virama_zwj) {
+  // KA + VIRAMA + ZWJ + KA forms a valid ContextJ sequence
+  EXPECT_TRUE(
+      sourcemeta::core::idna_is_valid_u_label(U"\u0915\u094D\u200D\u0915"));
+}
+
+TEST(IDNA_is_valid_u_label, arabic_with_zwnj_context) {
+  // Arabic BEH (D) + ZWNJ + Arabic ALEF (R) satisfies ContextJ A.1
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u0628\u200C\u0627"));
+}
+
+TEST(IDNA_is_valid_u_label, catalan_middle_dot) {
+  // l + U+00B7 + l satisfies ContextO A.3
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"l\u00B7l"));
+}
+
+TEST(IDNA_is_valid_u_label, single_letter) {
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"a"));
+}
+
+TEST(IDNA_is_valid_u_label, empty_label) {
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U""));
+}
+
+TEST(IDNA_is_valid_u_label, leading_hyphen) {
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"-abc"));
+}
+
+TEST(IDNA_is_valid_u_label, trailing_hyphen) {
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"abc-"));
+}
+
+TEST(IDNA_is_valid_u_label, double_hyphen_at_positions_3_and_4) {
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"ab--cd"));
+}
+
+TEST(IDNA_is_valid_u_label, xn_prefix_pattern) {
+  // The "xn--" pattern is the A-label prefix and must not appear in U-labels
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"xn--abc"));
+}
+
+TEST(IDNA_is_valid_u_label, exactly_four_chars_with_double_hyphen) {
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"ab--"));
+}
+
+TEST(IDNA_is_valid_u_label, leading_nonspacing_combining_mark) {
+  // U+0301 combining acute is general category Mn
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"\u0301abc"));
+}
+
+TEST(IDNA_is_valid_u_label, leading_spacing_combining_mark) {
+  // U+093E Devanagari vowel sign Aa is general category Mc
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"\u093E\u0915"));
+}
+
+TEST(IDNA_is_valid_u_label, contains_uppercase_letter) {
+  // Uppercase ASCII is DISALLOWED in IDNA2008
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"abcD"));
+}
+
+TEST(IDNA_is_valid_u_label, contains_full_stop) {
+  // ASCII full stop is DISALLOWED in a label
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"abc.def"));
+}
+
+TEST(IDNA_is_valid_u_label, contains_disallowed_control) {
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u0007b"));
+}
+
+TEST(IDNA_is_valid_u_label, contains_disallowed_bidi_mark) {
+  // U+200E LEFT-TO-RIGHT MARK is DISALLOWED
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u200Eb"));
+}
+
+TEST(IDNA_is_valid_u_label, contains_unassigned_codepoint) {
+  // U+0E80 is unassigned in the Lao block
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u0E80b"));
+}
+
+TEST(IDNA_is_valid_u_label, zwj_without_virama_context_rejected) {
+  // ZWJ between two non-virama letters fails ContextJ A.2
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u200Db"));
+}
+
+TEST(IDNA_is_valid_u_label, zwnj_with_no_joining_context_rejected) {
+  // ZWNJ between two ASCII letters fails ContextJ A.1
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u200Cb"));
+}
+
+TEST(IDNA_is_valid_u_label, middle_dot_not_flanked_by_l_rejected) {
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u00B7b"));
+}
+
+TEST(IDNA_is_valid_u_label, greek_keraia_followed_by_non_greek_rejected) {
+  // U+0375 must be followed by Greek script
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u0375b"));
+}
+
+TEST(IDNA_is_valid_u_label, arabic_indic_digit_mixed_with_extended_rejected) {
+  // U+0660 and U+06F0 must not appear together
+  EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"\u0627\u0660\u06F0"));
+}
+
+TEST(IDNA_is_valid_u_label, single_hyphen_in_position_3_only) {
+  // Only position 3 is hyphen, position 4 is not, so allowed
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"ab-cdef"));
+}
+
+TEST(IDNA_is_valid_u_label, single_hyphen_in_position_4_only) {
+  // Only position 4 is hyphen, position 3 is not, so allowed
+  EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"abc-def"));
+}