diff --git a/src/core/unicode/CMakeLists.txt b/src/core/unicode/CMakeLists.txt index f93f8d90d..3049e2cba 100644 --- a/src/core/unicode/CMakeLists.txt +++ b/src/core/unicode/CMakeLists.txt @@ -18,6 +18,7 @@ add_custom_command( "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt" + "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/UnicodeData.txt" DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt" @@ -27,6 +28,7 @@ add_custom_command( "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt" + "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/UnicodeData.txt" COMMENT "Generating Unicode property tables" VERBATIM) diff --git a/src/core/unicode/codegen.py b/src/core/unicode/codegen.py index 45550c50a..4c9a35864 100644 --- a/src/core/unicode/codegen.py +++ b/src/core/unicode/codegen.py @@ -185,6 +185,107 @@ def parse_file(path, value_map, property_filter=None): return missing + data +def parse_canonical_decompositions(path): + """Read UnicodeData.txt and return {codepoint: [decomposition codepoints]} + for canonical decompositions only. Compatibility decompositions (those + whose field 5 starts with a `<tag>` prefix per UAX #44) are excluded. 
+ + Raises if any canonical decomposition has more than two codepoints, which + would indicate a format change in UnicodeData.txt.""" + result = {} + with open(path) as source: + for line_number, line in enumerate(source, start=1): + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + fields = stripped.split(";") + if len(fields) < 6: + raise ValueError( + f"{path}:{line_number}: too few fields: {stripped!r}" + ) + decomp_field = fields[5].strip() + if not decomp_field or decomp_field.startswith("<"): + continue + try: + codepoint = int(fields[0], 16) + except ValueError as error: + raise ValueError( + f"{path}:{line_number}: invalid codepoint: {fields[0]!r}" + ) from error + decomposition = [int(token, 16) for token in decomp_field.split()] + if len(decomposition) > 2: + raise ValueError( + f"{path}:{line_number}: canonical decomposition of " + f"U+{codepoint:04X} has {len(decomposition)} codepoints, " + f"expected 1 or 2" + ) + result[codepoint] = decomposition + return result + + +# Packed per-codepoint entry: (length << OFFSET_BITS) | offset. A zero entry +# means no decomposition. Length 1 / 2 covers the entire canonical space. 
+DECOMPOSITION_OFFSET_BITS = 14 +DECOMPOSITION_OFFSET_MASK = (1 << DECOMPOSITION_OFFSET_BITS) - 1 + + +def build_canonical_decomposition_pages(decompositions): + """Build the flat blob plus per-codepoint packed entries, then run the + standard two-stage page-table dedup on top of the packed array.""" + blob = [] + packed = [0] * TOTAL_CODEPOINTS + for codepoint in sorted(decompositions): + decomposition = decompositions[codepoint] + offset = len(blob) + if offset > DECOMPOSITION_OFFSET_MASK: + raise ValueError( + f"canonical decomposition blob exceeds " + f"{DECOMPOSITION_OFFSET_BITS}-bit offset cap at " + f"U+{codepoint:04X}" + ) + blob.extend(decomposition) + packed[codepoint] = (len(decomposition) << DECOMPOSITION_OFFSET_BITS) | offset + + page_to_id = {} + unique_pages = [] + stage1 = [] + for page_index in range(NUM_PAGES): + start = page_index * PAGE_SIZE + page = tuple(packed[start : start + PAGE_SIZE]) + if page not in page_to_id: + page_to_id[page] = len(unique_pages) + unique_pages.append(page) + stage1.append(page_to_id[page]) + return blob, stage1, unique_pages + + +def emit_canonical_decomposition(output, blob, stage1, unique_pages): + output.write( + f"constexpr char32_t CANONICAL_DECOMPOSITION_BLOB[{len(blob)}] = {{\n" + ) + for offset in range(0, len(blob), 8): + chunk = blob[offset : offset + 8] + output.write( + " " + ", ".join(f"0x{value:X}" for value in chunk) + ",\n" + ) + output.write("};\n\n") + + output.write( + f"constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE1" + f"[{len(stage1)}] = {{\n" + ) + emit_row(output, stage1) + output.write("};\n\n") + stage2_size = len(unique_pages) * PAGE_SIZE + output.write( + f"constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE2" + f"[{stage2_size}] = {{\n" + ) + for page in unique_pages: + emit_row(output, list(page)) + output.write("};\n\n") + + def build_pages(entries): values = [0] * TOTAL_CODEPOINTS for first, last, value in entries: @@ -224,7 +325,7 @@ def emit_property(output, prefix, stage1, 
unique_pages): def main(): - if len(sys.argv) != 9: + if len(sys.argv) != 10: print( f"Usage: {sys.argv[0]} " " " @@ -234,7 +335,8 @@ def main(): " " " " " " - "", + " " + "", file=sys.stderr, ) sys.exit(1) @@ -257,6 +359,8 @@ def main(): build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)), ] + unicode_data_path = sys.argv[9] + with open(output_path, "w") as output: output.write("#include \n\n") output.write("namespace {\n\n") @@ -265,6 +369,10 @@ def main(): parse_file(input_path, value_map, property_filter) ) emit_property(output, prefix, stage1, pages) + blob, stage1, pages = build_canonical_decomposition_pages( + parse_canonical_decompositions(unicode_data_path) + ) + emit_canonical_decomposition(output, blob, stage1, pages) output.write("} // namespace\n") diff --git a/src/core/unicode/include/sourcemeta/core/unicode.h b/src/core/unicode/include/sourcemeta/core/unicode.h index aef5897a4..0def97c8b 100644 --- a/src/core/unicode/include/sourcemeta/core/unicode.h +++ b/src/core/unicode/include/sourcemeta/core/unicode.h @@ -327,6 +327,27 @@ auto is_combining_mark(const char32_t codepoint) noexcept -> bool; SOURCEMETA_CORE_UNICODE_EXPORT auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck; +/// @ingroup unicode +/// Return the non-recursive canonical decomposition of a Unicode codepoint +/// per UAX #15. The view points into static data and remains valid for the +/// program's lifetime. An empty view means the codepoint has no canonical +/// decomposition. Hangul precomposed syllables decompose algorithmically +/// per UAX #15 and are reported as empty here. 
For example: +/// +/// ```cpp +/// #include <sourcemeta/core/unicode.h> +/// #include <cassert> +/// +/// assert(sourcemeta::core::canonical_decomposition(U'A').empty()); +/// assert(sourcemeta::core::canonical_decomposition(U'\u00FC') == +/// std::u32string_view{U"u\u0308"}); +/// assert(sourcemeta::core::canonical_decomposition(U'\u2126') == +/// std::u32string_view{U"\u03A9"}); +/// ``` +SOURCEMETA_CORE_UNICODE_EXPORT +auto canonical_decomposition(const char32_t codepoint) noexcept + -> std::u32string_view; + /// @ingroup unicode /// Determine the byte length of the valid UTF-8 codepoint starting at the /// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a diff --git a/src/core/unicode/unicode.cc b/src/core/unicode/unicode.cc index e30e1b47d..82aaa0173 100644 --- a/src/core/unicode/unicode.cc +++ b/src/core/unicode/unicode.cc @@ -163,4 +163,17 @@ auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck { NFC_QUICK_CHECK_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]); } +auto canonical_decomposition(const char32_t codepoint) noexcept + -> std::u32string_view { + if (codepoint > 0x10FFFF) { + return {}; + } + const std::size_t page{CANONICAL_DECOMPOSITION_STAGE1[codepoint >> 10U]}; + const std::uint16_t packed{ + CANONICAL_DECOMPOSITION_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]}; + const auto length{static_cast(packed >> 14U)}; + const auto offset{static_cast(packed & 0x3FFFU)}; + return std::u32string_view{CANONICAL_DECOMPOSITION_BLOB + offset, length}; +} + } // namespace sourcemeta::core diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt index 9bc885fd1..daaa2963a 100644 --- a/test/unicode/CMakeLists.txt +++ b/test/unicode/CMakeLists.txt @@ -13,7 +13,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode unicode_bidi_class_test.cc unicode_script_test.cc unicode_is_combining_mark_test.cc - unicode_nfc_quick_check_test.cc) + unicode_nfc_quick_check_test.cc + unicode_canonical_decomposition_test.cc) 
target_link_libraries(sourcemeta_core_unicode_unit PRIVATE sourcemeta::core::unicode) diff --git a/test/unicode/unicode_canonical_decomposition_test.cc b/test/unicode/unicode_canonical_decomposition_test.cc new file mode 100644 index 000000000..758072722 --- /dev/null +++ b/test/unicode/unicode_canonical_decomposition_test.cc @@ -0,0 +1,114 @@ +#include + +#include + +#include + +TEST(Unicode_canonical_decomposition, ascii_letter_has_no_decomposition) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'A').empty()); +} + +TEST(Unicode_canonical_decomposition, ascii_digit_has_no_decomposition) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'0').empty()); +} + +TEST(Unicode_canonical_decomposition, null_has_no_decomposition) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(0x0000).empty()); +} + +// U+00C0 LATIN CAPITAL LETTER A WITH GRAVE decomposes to U+0041 U+0300 +TEST(Unicode_canonical_decomposition, latin_a_with_grave) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u00C0'), + std::u32string_view{U"A\u0300"}); +} + +// U+00C1 LATIN CAPITAL LETTER A WITH ACUTE decomposes to U+0041 U+0301 +TEST(Unicode_canonical_decomposition, latin_a_with_acute) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u00C1'), + std::u32string_view{U"A\u0301"}); +} + +// U+00FC LATIN SMALL LETTER U WITH DIAERESIS decomposes to U+0075 U+0308 +TEST(Unicode_canonical_decomposition, latin_u_with_diaeresis) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u00FC'), + std::u32string_view{U"u\u0308"}); +} + +// U+2126 OHM SIGN: singleton decomposition to U+03A9 GREEK CAPITAL OMEGA +TEST(Unicode_canonical_decomposition, ohm_sign_singleton) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u2126'), + std::u32string_view{U"\u03A9"}); +} + +// U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE: decomposes one step to +// U+017F U+0307. 
Full recursive decomposition is the algorithm's job +TEST(Unicode_canonical_decomposition, long_s_with_dot_above_non_recursive) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u1E9B'), + std::u32string_view{U"\u017F\u0307"}); +} + +// Combining marks themselves have no decomposition +TEST(Unicode_canonical_decomposition, combining_grave_no_decomposition) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'\u0300').empty()); +} + +TEST(Unicode_canonical_decomposition, combining_acute_no_decomposition) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'\u0301').empty()); +} + +// U+212B ANGSTROM SIGN: singleton decomposition to U+00C5 LATIN A WITH RING +TEST(Unicode_canonical_decomposition, angstrom_sign_singleton) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u212B'), + std::u32string_view{U"\u00C5"}); +} + +// U+0958 DEVANAGARI LETTER QA decomposes to U+0915 U+093C +TEST(Unicode_canonical_decomposition, devanagari_qa) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u0958'), + std::u32string_view{U"\u0915\u093C"}); +} + +// U+212C SCRIPT CAPITAL B has decomposition `<font> 0042` (compatibility) +TEST(Unicode_canonical_decomposition, + compatibility_decomposition_font_excluded) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'\u212C').empty()); +} + +// U+FF21 FULLWIDTH LATIN CAPITAL LETTER A has decomposition `<wide> 0041` +TEST(Unicode_canonical_decomposition, + compatibility_decomposition_wide_excluded) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'\uFF21').empty()); +} + +// U+212A KELVIN SIGN, by contrast, has an untagged singleton canonical +// decomposition to U+004B (ASCII K), so it must still be present +TEST(Unicode_canonical_decomposition, kelvin_sign_canonical_singleton) { + EXPECT_EQ(sourcemeta::core::canonical_decomposition(U'\u212A'), + std::u32string_view{U"K"}); +} + +// Hangul precomposed syllables have no entry in UnicodeData.txt. 
Their +// algorithmic decomposition is the caller's job +TEST(Unicode_canonical_decomposition, hangul_syllable_first) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'\uAC00').empty()); +} + +TEST(Unicode_canonical_decomposition, hangul_syllable_last) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(U'\uD7A3').empty()); +} + +// Past the Unicode maximum: empty by definition +TEST(Unicode_canonical_decomposition, beyond_max_codepoint) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(0x110000).empty()); +} + +TEST(Unicode_canonical_decomposition, beyond_max_codepoint_high) { + EXPECT_TRUE(sourcemeta::core::canonical_decomposition(0xFFFFFFFF).empty()); +} + +TEST(Unicode_canonical_decomposition, view_outlives_call) { + const auto first{sourcemeta::core::canonical_decomposition(U'\u00C0')}; + const auto second{sourcemeta::core::canonical_decomposition(U'\u00C0')}; + EXPECT_EQ(first.data(), second.data()); + EXPECT_EQ(first.size(), 2u); +}