diff --git a/src/core/unicode/CMakeLists.txt b/src/core/unicode/CMakeLists.txt index c5f15abd1..f93f8d90d 100644 --- a/src/core/unicode/CMakeLists.txt +++ b/src/core/unicode/CMakeLists.txt @@ -17,6 +17,7 @@ add_custom_command( "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt" + "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt" DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt" @@ -25,6 +26,7 @@ add_custom_command( "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt" + "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt" COMMENT "Generating Unicode property tables" VERBATIM) diff --git a/src/core/unicode/codegen.py b/src/core/unicode/codegen.py index 90b0ba77b..45550c50a 100644 --- a/src/core/unicode/codegen.py +++ b/src/core/unicode/codegen.py @@ -4,6 +4,15 @@ import sys LINE = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)") +MULTI_PROPERTY_LINE = re.compile( + r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*;\s*(\S+)" +) +# Boolean-property rows in multi-property files use a two-field shape, +# with no value column. Matched explicitly so that parse_file can skip them on +# purpose for unrelated properties while other unparseable rows still raise. 
+BOOLEAN_PROPERTY_LINE = re.compile( + r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*$" +) MISSING_PREFIX = re.compile(r"^#\s*@missing:\s*") TOTAL_CODEPOINTS = 0x110000 @@ -24,6 +33,8 @@ "LRI", "RLI", "FSI", "PDI", ] +NFC_QUICK_CHECK_ORDER = ["Y", "N", "M"] + UNICODE_SCRIPT_ORDER = [ "Adlam", "Ahom", "Anatolian_Hieroglyphs", "Arabic", "Armenian", "Avestan", "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali", @@ -121,10 +132,16 @@ def build_value_map(aliases_path, property_short, canonical_order=None): return result -def parse_file(path, value_map): +def parse_file(path, value_map, property_filter=None): """Read a UCD file and return a list of (first, last, value) entries with @missing defaults first and data ranges second, so callers can - apply them in order regardless of where @missing appears in the file.""" + apply them in order regardless of where @missing appears in the file. + + With property_filter set, lines have shape `codepoint; property; value` + (as in DerivedNormalizationProps.txt) and only rows whose property + name matches are returned. Without it, lines have shape + `codepoint; value` and every row contributes.""" + line_re = MULTI_PROPERTY_LINE if property_filter is not None else LINE missing = [] data = [] with open(path) as source: @@ -139,14 +156,25 @@ def parse_file(path, value_map): continue stripped = stripped[prefix.end():] target = missing - match = LINE.match(stripped) + match = line_re.match(stripped) if not match: + # Recognise the boolean-property shape used in multi-property + # files, but only for properties other than the one we are + # filtering for. A boolean-shape row that names our target + # property would be malformed data and must raise. 
+ data_only = stripped.split("#", 1)[0].strip() + if property_filter is not None: + boolean = BOOLEAN_PROPERTY_LINE.fullmatch(data_only) + if boolean and boolean.group(3) != property_filter: + continue raise ValueError( f"{path}:{line_number}: unparseable line: {stripped!r}" ) + if property_filter is not None and match.group(3) != property_filter: + continue first = int(match.group(1), 16) last = int(match.group(2), 16) if match.group(2) else first - raw_value = match.group(3) + raw_value = match.group(4 if property_filter is not None else 3) try: value = value_map[raw_value] except KeyError as error: @@ -196,7 +224,7 @@ def emit_property(output, prefix, stage1, unique_pages): def main(): - if len(sys.argv) != 8: + if len(sys.argv) != 9: print( f"Usage: {sys.argv[0]} " " " @@ -205,7 +233,8 @@ def main(): " " " " " " - "", + " " + "", file=sys.stderr, ) sys.exit(1) @@ -214,23 +243,27 @@ def main(): aliases_path = sys.argv[2] properties = [ - ("COMBINING_CLASS", sys.argv[3], + ("COMBINING_CLASS", sys.argv[3], None, build_value_map(aliases_path, "ccc")), - ("JOINING_TYPE", sys.argv[4], + ("JOINING_TYPE", sys.argv[4], None, build_value_map(aliases_path, "jt", JOINING_TYPE_ORDER)), - ("BIDI_CLASS", sys.argv[5], + ("BIDI_CLASS", sys.argv[5], None, build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)), - ("UNICODE_SCRIPT", sys.argv[6], + ("UNICODE_SCRIPT", sys.argv[6], None, build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)), - ("IS_COMBINING_MARK", sys.argv[7], + ("IS_COMBINING_MARK", sys.argv[7], None, build_combining_mark_value_map(aliases_path)), + ("NFC_QUICK_CHECK", sys.argv[8], "NFC_QC", + build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)), ] with open(output_path, "w") as output: output.write("#include \n\n") output.write("namespace {\n\n") - for prefix, input_path, value_map in properties: - stage1, pages = build_pages(parse_file(input_path, value_map)) + for prefix, input_path, property_filter, value_map in properties: + stage1, pages = 
+ build_pages( + parse_file(input_path, value_map, property_filter) + ) emit_property(output, prefix, stage1, pages) output.write("} // namespace\n") diff --git a/src/core/unicode/include/sourcemeta/core/unicode.h b/src/core/unicode/include/sourcemeta/core/unicode.h index f39706ff8..aef5897a4 100644 --- a/src/core/unicode/include/sourcemeta/core/unicode.h +++ b/src/core/unicode/include/sourcemeta/core/unicode.h @@ -308,6 +308,25 @@ auto script(const char32_t codepoint) noexcept -> UnicodeScript; SOURCEMETA_CORE_UNICODE_EXPORT auto is_combining_mark(const char32_t codepoint) noexcept -> bool; +/// @ingroup unicode +/// Return the NFC quick-check property of a Unicode codepoint per UAX #15, +/// or Yes for values above U+10FFFF. See https://www.unicode.org/reports/tr15/ +/// For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::nfc_quick_check(U'A') == +/// sourcemeta::core::NFCQuickCheck::Yes); +/// assert(sourcemeta::core::nfc_quick_check(U'\u2126') == +/// sourcemeta::core::NFCQuickCheck::No); +/// assert(sourcemeta::core::nfc_quick_check(U'\u0300') == +/// sourcemeta::core::NFCQuickCheck::Maybe); +/// ``` +SOURCEMETA_CORE_UNICODE_EXPORT +auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck; + /// @ingroup unicode /// Determine the byte length of the valid UTF-8 codepoint starting at the /// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a diff --git a/src/core/unicode/include/sourcemeta/core/unicode_ucd.h b/src/core/unicode/include/sourcemeta/core/unicode_ucd.h index 7741eaf62..536054c68 100644 --- a/src/core/unicode/include/sourcemeta/core/unicode_ucd.h +++ b/src/core/unicode/include/sourcemeta/core/unicode_ucd.h @@ -230,6 +230,15 @@ enum class UnicodeScript : std::uint8_t { KatakanaOrHiragana = 175, }; +/// @ingroup unicode +/// The NFC quick-check result for a Unicode codepoint per UAX #15. +/// See https://www.unicode.org/reports/tr15/ for the property's definition. 
+enum class NFCQuickCheck : std::uint8_t { + Yes = 0, + No = 1, + Maybe = 2, +}; + } // namespace sourcemeta::core #endif diff --git a/src/core/unicode/unicode.cc b/src/core/unicode/unicode.cc index c6bfb9d98..e30e1b47d 100644 --- a/src/core/unicode/unicode.cc +++ b/src/core/unicode/unicode.cc @@ -154,4 +154,13 @@ auto is_combining_mark(const char32_t codepoint) noexcept -> bool { return IS_COMBINING_MARK_STAGE2[(page << 10U) | (codepoint & 0x3FFU)] != 0; } +auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck { + if (codepoint > 0x10FFFF) { + return NFCQuickCheck::Yes; + } + const std::size_t page{NFC_QUICK_CHECK_STAGE1[codepoint >> 10U]}; + return static_cast( + NFC_QUICK_CHECK_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]); +} + } // namespace sourcemeta::core diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt index 368060e4d..9bc885fd1 100644 --- a/test/unicode/CMakeLists.txt +++ b/test/unicode/CMakeLists.txt @@ -12,7 +12,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode unicode_joining_type_test.cc unicode_bidi_class_test.cc unicode_script_test.cc - unicode_is_combining_mark_test.cc) + unicode_is_combining_mark_test.cc + unicode_nfc_quick_check_test.cc) target_link_libraries(sourcemeta_core_unicode_unit PRIVATE sourcemeta::core::unicode) diff --git a/test/unicode/unicode_nfc_quick_check_test.cc b/test/unicode/unicode_nfc_quick_check_test.cc new file mode 100644 index 000000000..cdcac7bd0 --- /dev/null +++ b/test/unicode/unicode_nfc_quick_check_test.cc @@ -0,0 +1,109 @@ +#include + +#include + +TEST(Unicode_nfc_quick_check, ascii_letter) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'A'), + sourcemeta::core::NFCQuickCheck::Yes); +} + +TEST(Unicode_nfc_quick_check, ascii_digit) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'0'), + sourcemeta::core::NFCQuickCheck::Yes); +} + +TEST(Unicode_nfc_quick_check, ascii_space) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U' '), 
+ sourcemeta::core::NFCQuickCheck::Yes); +} + +TEST(Unicode_nfc_quick_check, null) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(0x0000), + sourcemeta::core::NFCQuickCheck::Yes); +} + +// Latin-1 precomposed character that IS its own NFC form +TEST(Unicode_nfc_quick_check, latin_a_with_grave) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u00C0'), + sourcemeta::core::NFCQuickCheck::Yes); +} + +TEST(Unicode_nfc_quick_check, latin_small_u_with_diaeresis) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u00FC'), + sourcemeta::core::NFCQuickCheck::Yes); +} + +// U+0300 COMBINING GRAVE ACCENT may compose with a preceding starter +TEST(Unicode_nfc_quick_check, combining_grave_accent_is_maybe) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0300'), + sourcemeta::core::NFCQuickCheck::Maybe); +} + +// U+0301 COMBINING ACUTE ACCENT +TEST(Unicode_nfc_quick_check, combining_acute_accent_is_maybe) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0301'), + sourcemeta::core::NFCQuickCheck::Maybe); +} + +// U+2126 OHM SIGN has a singleton canonical decomposition to U+03A9 GREEK +// CAPITAL LETTER OMEGA, so it never appears in NFC output +TEST(Unicode_nfc_quick_check, ohm_sign_is_no) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u2126'), + sourcemeta::core::NFCQuickCheck::No); +} + +// U+0958 DEVANAGARI LETTER QA decomposes to U+0915 U+093C in NFC +TEST(Unicode_nfc_quick_check, devanagari_qa_is_no) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0958'), + sourcemeta::core::NFCQuickCheck::No); +} + +// U+FB1D HEBREW LETTER YOD WITH HIRIQ decomposes in NFC +TEST(Unicode_nfc_quick_check, hebrew_yod_with_hiriq_is_no) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\uFB1D'), + sourcemeta::core::NFCQuickCheck::No); +} + +// U+2000 EN QUAD has a singleton canonical decomposition to U+2002 EN SPACE +TEST(Unicode_nfc_quick_check, en_quad_is_no) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u2000'), + sourcemeta::core::NFCQuickCheck::No); +} + +// Hangul precomposed syllable, allowed in 
NFC +TEST(Unicode_nfc_quick_check, hangul_syllable_is_yes) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\uAC00'), + sourcemeta::core::NFCQuickCheck::Yes); +} + +// Hangul L jamo (leading consonant): it only ever composes with the jamos +// that follow it, never with a preceding character, so it is Yes itself +TEST(Unicode_nfc_quick_check, hangul_l_jamo_is_yes) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u1100'), + sourcemeta::core::NFCQuickCheck::Yes); +} + +// Hangul V jamo (vowel): may compose with a preceding L jamo into a LV +// syllable, so the quick check is Maybe +TEST(Unicode_nfc_quick_check, hangul_v_jamo_is_maybe) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u1161'), + sourcemeta::core::NFCQuickCheck::Maybe); +} + +// Hangul T jamo (trailing consonant): may compose with a preceding LV +// syllable into an LVT syllable +TEST(Unicode_nfc_quick_check, hangul_t_jamo_is_maybe) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u11A8'), + sourcemeta::core::NFCQuickCheck::Maybe); +} + +// Past the Unicode maximum: out-of-range input is reported as Yes +TEST(Unicode_nfc_quick_check, beyond_max_codepoint) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(0x110000), + sourcemeta::core::NFCQuickCheck::Yes); +} + +TEST(Unicode_nfc_quick_check, beyond_max_codepoint_high) { + EXPECT_EQ(sourcemeta::core::nfc_quick_check(0xFFFFFFFF), + sourcemeta::core::NFCQuickCheck::Yes); +}