Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions src/core/idna/idna.cc
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,52 @@ auto idna_passes_contextj(const std::u32string_view label,
return true;
}

// TODO: Reject labels that are not NFC-normalized
// Decide whether `label` passes the U-label checks implemented here: it must
// be non-empty, obey the hyphen restrictions, not begin with a combining
// mark, and consist only of code points whose IDNA property permits them at
// their position. Returns false as soon as any check fails.
auto idna_is_valid_u_label(const std::u32string_view label) noexcept -> bool {
  // Nothing to validate. This also keeps front()/back() below well-defined.
  if (label.empty()) {
    return false;
  }

  constexpr char32_t hyphen{U'-'};

  // RFC 5891 §4.2.3.1 (hyphen restrictions): no hyphen as first or last
  // code point, and no "--" in the third and fourth positions, which is the
  // shape reserved for prefixes such as the A-label "xn--".
  const bool hyphen_on_edge{label.front() == hyphen || label.back() == hyphen};
  const bool hyphens_in_third_and_fourth{
      label.size() >= 4 && label[2] == hyphen && label[3] == hyphen};
  if (hyphen_on_edge || hyphens_in_third_and_fourth) {
    return false;
  }

  // RFC 5891 §4.2.3.2: a label must not begin with a combining mark.
  if (is_combining_mark(label.front())) {
    return false;
  }

  // RFC 5891 §4.2.2 / §4.2.3.3: PVALID code points are always fine, CONTEXTJ
  // and CONTEXTO code points are fine only if their contextual rule holds at
  // that position, and DISALLOWED / UNASSIGNED code points never are.
  const auto codepoint_is_permitted =
      [label](const std::size_t index) -> bool {
    switch (idna_property(label[index])) {
      case IDNAProperty::ContextJ:
        return idna_passes_contextj(label, index);
      case IDNAProperty::ContextO:
        return idna_passes_contexto(label, index);
      case IDNAProperty::Disallowed:
      case IDNAProperty::Unassigned:
        return false;
      case IDNAProperty::PValid:
        break;
    }
    return true;
  };

  for (std::size_t index = 0; index < label.size(); ++index) {
    if (!codepoint_is_permitted(index)) {
      return false;
    }
  }

  return true;
}

auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool {
if (label.empty()) {
return false;
Expand Down
17 changes: 17 additions & 0 deletions src/core/idna/include/sourcemeta/core/idna.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,23 @@ auto idna_passes_contextj(const std::u32string_view label,
SOURCEMETA_CORE_IDNA_EXPORT
auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool;

/// @ingroup idna
/// Return whether the given label is a valid U-label per RFC 5891 §4. See
/// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria.
/// The checks performed are: the label is not empty, it does not start or
/// end with a hyphen, it does not have hyphens in both the third and fourth
/// positions, it does not start with a combining mark, and every code point
/// is either PVALID or a CONTEXTJ / CONTEXTO code point whose contextual rule
/// is satisfied. NFC normalization of the label is not verified yet.
/// The Bidi rule is not checked here because Bidi domain detection is a
/// property of the whole domain, not of a single label. For example:
///
/// ```cpp
/// #include <sourcemeta/core/idna.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::idna_is_valid_u_label(U"d\u00E9j\u00E0"));
/// assert(!sourcemeta::core::idna_is_valid_u_label(U"ab--cd"));
/// assert(!sourcemeta::core::idna_is_valid_u_label(U"-abc"));
/// ```
SOURCEMETA_CORE_IDNA_EXPORT
auto idna_is_valid_u_label(const std::u32string_view label) noexcept -> bool;

} // namespace sourcemeta::core

#endif
3 changes: 2 additions & 1 deletion test/idna/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME idna
idna_property_test.cc
idna_passes_contexto_test.cc
idna_passes_contextj_test.cc
idna_passes_bidi_rule_test.cc)
idna_passes_bidi_rule_test.cc
idna_is_valid_u_label_test.cc)

# Link the IDNA unit test target against the IDNA library under test
# (the target is presumably created by sourcemeta_googletest above)
target_link_libraries(sourcemeta_core_idna_unit
PRIVATE sourcemeta::core::idna)
145 changes: 145 additions & 0 deletions test/idna/idna_is_valid_u_label_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#include <gtest/gtest.h>

#include <sourcemeta/core/idna.h>

TEST(IDNA_is_valid_u_label, ascii_letters) {
// A label made only of lowercase ASCII letters is accepted
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"abc"));
}

TEST(IDNA_is_valid_u_label, ascii_letters_and_digits) {
// Lowercase ASCII letters followed by ASCII digits are accepted
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"abc123"));
}

TEST(IDNA_is_valid_u_label, ascii_with_internal_hyphen) {
// A single hyphen that is neither the first nor the last codepoint is
// accepted
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"ab-cd"));
}

TEST(IDNA_is_valid_u_label, latin_with_diacritic) {
// d, U+00E9 (e with acute), j, U+00E0 (a with grave): precomposed lowercase
// Latin letters are accepted
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"d\u00E9j\u00E0"));
}

TEST(IDNA_is_valid_u_label, hebrew_label) {
// U+05D0 ALEF, U+05D1 BET, U+05D2 GIMEL: a label of Hebrew letters is
// accepted
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u05D0\u05D1\u05D2"));
}

TEST(IDNA_is_valid_u_label, arabic_label) {
// U+0627 ALEF, U+0628 BEH, U+062A TEH: a label of Arabic letters is accepted
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u0627\u0628\u062A"));
}

TEST(IDNA_is_valid_u_label, devanagari_label) {
// U+0905 A, U+0915 KA, U+092E MA: a label of Devanagari letters is accepted
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u0905\u0915\u092E"));
}

TEST(IDNA_is_valid_u_label, devanagari_with_virama_zwj) {
// U+0915 KA + U+094D VIRAMA + U+200D ZWJ + U+0915 KA forms a valid ContextJ
// sequence because the ZWJ directly follows a virama
EXPECT_TRUE(
sourcemeta::core::idna_is_valid_u_label(U"\u0915\u094D\u200D\u0915"));
}

TEST(IDNA_is_valid_u_label, arabic_with_zwnj_context) {
// U+0628 Arabic BEH (joining type D) + U+200C ZWNJ + U+0627 Arabic ALEF
// (joining type R) satisfies ContextJ A.1
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\u0628\u200C\u0627"));
}

TEST(IDNA_is_valid_u_label, catalan_middle_dot) {
// l + U+00B7 MIDDLE DOT + l satisfies ContextO A.3 (the middle dot must sit
// between two 'l' letters)
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"l\u00B7l"));
}

TEST(IDNA_is_valid_u_label, single_letter) {
// Shortest non-empty input: the first and last codepoint are the same one
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"a"));
}

TEST(IDNA_is_valid_u_label, empty_label) {
// The empty label is never a valid U-label
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U""));
}

TEST(IDNA_is_valid_u_label, leading_hyphen) {
// A label must not start with a hyphen
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"-abc"));
}

TEST(IDNA_is_valid_u_label, trailing_hyphen) {
// A label must not end with a hyphen
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"abc-"));
}

TEST(IDNA_is_valid_u_label, double_hyphen_at_positions_3_and_4) {
// Hyphens in both the third and fourth positions (zero-based indices 2 and
// 3) are rejected even when the label neither starts nor ends with a hyphen
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"ab--cd"));
}

TEST(IDNA_is_valid_u_label, xn_prefix_pattern) {
// The "xn--" pattern is the A-label prefix and must not appear at the start
// of a U-label; it is caught by the hyphens in the third and fourth positions
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"xn--abc"));
}

TEST(IDNA_is_valid_u_label, exactly_four_chars_with_double_hyphen) {
// Smallest label that can carry hyphens in the third and fourth positions.
// Note that such a label necessarily also ends with a hyphen, so this input
// violates the trailing-hyphen restriction as well
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"ab--"));
}

TEST(IDNA_is_valid_u_label, leading_nonspacing_combining_mark) {
// U+0301 COMBINING ACUTE ACCENT is general category Mn (nonspacing mark), and
// a label must not start with a combining mark
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"\u0301abc"));
}

TEST(IDNA_is_valid_u_label, leading_spacing_combining_mark) {
// U+093E DEVANAGARI VOWEL SIGN AA is general category Mc (spacing mark), and
// a label must not start with a combining mark
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"\u093E\u0915"));
}

TEST(IDNA_is_valid_u_label, contains_uppercase_letter) {
// Uppercase ASCII is DISALLOWED in IDNA2008, so the trailing 'D' makes the
// whole label invalid
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"abcD"));
}

TEST(IDNA_is_valid_u_label, contains_full_stop) {
// ASCII full stop is DISALLOWED in a label: the function validates a single
// label, not a dot-separated domain name
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"abc.def"));
}

TEST(IDNA_is_valid_u_label, contains_disallowed_control) {
// U+0007 (BEL) is a control character and must be rejected
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u0007b"));
}

TEST(IDNA_is_valid_u_label, contains_disallowed_bidi_mark) {
// U+200E LEFT-TO-RIGHT MARK is DISALLOWED, even between otherwise valid
// letters
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u200Eb"));
}

TEST(IDNA_is_valid_u_label, contains_unassigned_codepoint) {
// U+0E80 is unassigned in the Lao block, and UNASSIGNED codepoints are
// rejected just like DISALLOWED ones
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u0E80b"));
}

TEST(IDNA_is_valid_u_label, zwj_without_virama_context_rejected) {
// U+200D ZWJ between two ASCII letters is not preceded by a virama, so it
// fails ContextJ A.2
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u200Db"));
}

TEST(IDNA_is_valid_u_label, zwnj_with_no_joining_context_rejected) {
// U+200C ZWNJ between two ASCII letters has neither a preceding virama nor
// a joining context around it, so it fails ContextJ A.1
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u200Cb"));
}

TEST(IDNA_is_valid_u_label, middle_dot_not_flanked_by_l_rejected) {
// U+00B7 MIDDLE DOT between 'a' and 'b' instead of between two 'l' letters
// fails its ContextO rule
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u00B7b"));
}

TEST(IDNA_is_valid_u_label, greek_keraia_followed_by_non_greek_rejected) {
// U+0375 GREEK LOWER NUMERAL SIGN (KERAIA) must be followed by a Greek script
// codepoint; here it is followed by ASCII 'b'
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"a\u0375b"));
}

TEST(IDNA_is_valid_u_label, arabic_indic_digit_mixed_with_extended_rejected) {
// U+0660 ARABIC-INDIC DIGIT ZERO and U+06F0 EXTENDED ARABIC-INDIC DIGIT ZERO
// must not appear together in the same label
EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(U"\u0627\u0660\u06F0"));
}

TEST(IDNA_is_valid_u_label, single_hyphen_in_position_3_only) {
// Only position 3 (zero-based index 2) is a hyphen, position 4 is not, so
// the double-hyphen restriction does not apply and the label is allowed
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"ab-cdef"));
}

TEST(IDNA_is_valid_u_label, single_hyphen_in_position_4_only) {
// Only position 4 (zero-based index 3) is a hyphen, position 3 is not, so
// the double-hyphen restriction does not apply and the label is allowed
EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"abc-def"));
}
Loading