Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/core/unicode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_custom_command(
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/UnicodeData.txt"
DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt"
Expand All @@ -27,6 +28,7 @@ add_custom_command(
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/UnicodeData.txt"
COMMENT "Generating Unicode property tables"
VERBATIM)

Expand Down
112 changes: 110 additions & 2 deletions src/core/unicode/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,107 @@ def parse_file(path, value_map, property_filter=None):
return missing + data


def parse_canonical_decompositions(path):
    """Read UnicodeData.txt and return {codepoint: [decomposition codepoints]}
    for canonical decompositions only. Compatibility decompositions (those
    whose field 5 starts with a `<tag>` prefix per UAX #44) are excluded.

    Blank lines and lines starting with `#` are ignored, as are records whose
    decomposition field is empty.

    Raises ValueError, always prefixed with `path:line_number`, if a record
    has fewer than six fields, if the codepoint or any decomposition token is
    not valid hexadecimal, or if a canonical decomposition has more than two
    codepoints, which would indicate a format change in UnicodeData.txt."""
    result = {}
    # Pin the encoding so parsing does not depend on the locale of the
    # machine running the build
    with open(path, encoding="utf-8") as source:
        for line_number, line in enumerate(source, start=1):
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            fields = stripped.split(";")
            if len(fields) < 6:
                raise ValueError(
                    f"{path}:{line_number}: too few fields: {stripped!r}"
                )
            decomp_field = fields[5].strip()
            # An empty field means no decomposition at all, and a leading
            # `<tag>` marks a compatibility mapping. Neither is canonical
            if not decomp_field or decomp_field.startswith("<"):
                continue
            try:
                codepoint = int(fields[0], 16)
            except ValueError as error:
                raise ValueError(
                    f"{path}:{line_number}: invalid codepoint: {fields[0]!r}"
                ) from error
            # Report malformed mapping tokens with the same file and line
            # context as every other parse error in this function
            try:
                decomposition = [
                    int(token, 16) for token in decomp_field.split()
                ]
            except ValueError as error:
                raise ValueError(
                    f"{path}:{line_number}: invalid decomposition: "
                    f"{decomp_field!r}"
                ) from error
            if len(decomposition) > 2:
                raise ValueError(
                    f"{path}:{line_number}: canonical decomposition of "
                    f"U+{codepoint:04X} has {len(decomposition)} codepoints, "
                    f"expected 1 or 2"
                )
            result[codepoint] = decomposition
    return result


# Packed per-codepoint entry: (length << OFFSET_BITS) | offset, where offset
# is the index of the first decomposition codepoint inside the flat blob. A
# zero entry means no decomposition. Length 1 / 2 covers the entire canonical
# space.
# NOTE: canonical_decomposition() in unicode.cc unpacks entries with the
# literals 14U and 0x3FFFU, so changing these values means updating it too.
DECOMPOSITION_OFFSET_BITS = 14
DECOMPOSITION_OFFSET_MASK = (1 << DECOMPOSITION_OFFSET_BITS) - 1


def build_canonical_decomposition_pages(decompositions):
    """Build the flat blob plus per-codepoint packed entries, then run the
    standard two-stage page-table dedup on top of the packed array.

    `decompositions` maps a codepoint to the list of codepoints it decomposes
    to. Returns `(blob, stage1, unique_pages)`: `blob` is every decomposition
    concatenated in ascending codepoint order, `stage1` holds one index into
    `unique_pages` per page, and `unique_pages` is the list of distinct pages
    (tuples of packed entries) in order of first appearance.

    Raises ValueError if a decomposition would start past the largest offset
    a packed entry can hold, or if it is too long for the bits left over for
    the length."""
    # Packed entries are emitted as 16-bit integers, so the length only has
    # the bits that the offset leaves unused. Anything longer would be
    # silently truncated in the generated table
    packed_entry_bits = 16
    max_length = (1 << (packed_entry_bits - DECOMPOSITION_OFFSET_BITS)) - 1

    blob = []
    packed = [0] * TOTAL_CODEPOINTS
    # Sorted iteration keeps the blob layout deterministic across runs
    for codepoint in sorted(decompositions):
        decomposition = decompositions[codepoint]
        if len(decomposition) > max_length:
            raise ValueError(
                f"canonical decomposition of U+{codepoint:04X} has "
                f"{len(decomposition)} codepoints, at most {max_length} "
                f"fit in a packed entry"
            )
        offset = len(blob)
        if offset > DECOMPOSITION_OFFSET_MASK:
            raise ValueError(
                f"canonical decomposition blob exceeds "
                f"{DECOMPOSITION_OFFSET_BITS}-bit offset cap at "
                f"U+{codepoint:04X}"
            )
        blob.extend(decomposition)
        packed[codepoint] = (
            len(decomposition) << DECOMPOSITION_OFFSET_BITS
        ) | offset

    # Identical pages are stored once and stage 1 refers to them by index
    page_to_id = {}
    unique_pages = []
    stage1 = []
    for page_index in range(NUM_PAGES):
        start = page_index * PAGE_SIZE
        page = tuple(packed[start : start + PAGE_SIZE])
        if page not in page_to_id:
            page_to_id[page] = len(unique_pages)
            unique_pages.append(page)
        stage1.append(page_to_id[page])
    return blob, stage1, unique_pages


def emit_canonical_decomposition(output, blob, stage1, unique_pages):
    """Write the canonical decomposition tables to `output` as C++ constexpr
    arrays.

    Three arrays are written in order: the flat `char32_t` blob (eight
    uppercase hexadecimal values per row), the stage-1 page index, and the
    stage-2 packed entries. The last two are `std::uint16_t` arrays whose
    rows are produced by `emit_row`."""
    write = output.write

    # Flat blob of decomposition codepoints
    write(f"constexpr char32_t CANONICAL_DECOMPOSITION_BLOB[{len(blob)}] = {{\n")
    for start in range(0, len(blob), 8):
        row = ", ".join(f"0x{value:X}" for value in blob[start : start + 8])
        write(" " + row + ",\n")
    write("};\n\n")

    # Stage 1: one page identifier per page of the codepoint space
    write(
        "constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE1"
        f"[{len(stage1)}] = {{\n"
    )
    emit_row(output, stage1)
    write("};\n\n")

    # Stage 2: the deduplicated pages laid out back to back
    write(
        "constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE2"
        f"[{len(unique_pages) * PAGE_SIZE}] = {{\n"
    )
    for page in unique_pages:
        emit_row(output, list(page))
    write("};\n\n")


def build_pages(entries):
values = [0] * TOTAL_CODEPOINTS
for first, last, value in entries:
Expand Down Expand Up @@ -224,7 +325,7 @@ def emit_property(output, prefix, stage1, unique_pages):


def main():
if len(sys.argv) != 9:
if len(sys.argv) != 10:
print(
f"Usage: {sys.argv[0]} "
"<output.h> "
Expand All @@ -234,7 +335,8 @@ def main():
"<DerivedBidiClass.txt> "
"<Scripts.txt> "
"<DerivedGeneralCategory.txt> "
"<DerivedNormalizationProps.txt>",
"<DerivedNormalizationProps.txt> "
"<UnicodeData.txt>",
file=sys.stderr,
)
sys.exit(1)
Expand All @@ -257,6 +359,8 @@ def main():
build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)),
]

unicode_data_path = sys.argv[9]

with open(output_path, "w") as output:
output.write("#include <cstdint>\n\n")
output.write("namespace {\n\n")
Expand All @@ -265,6 +369,10 @@ def main():
parse_file(input_path, value_map, property_filter)
)
emit_property(output, prefix, stage1, pages)
blob, stage1, pages = build_canonical_decomposition_pages(
parse_canonical_decompositions(unicode_data_path)
)
emit_canonical_decomposition(output, blob, stage1, pages)
output.write("} // namespace\n")


Expand Down
21 changes: 21 additions & 0 deletions src/core/unicode/include/sourcemeta/core/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,27 @@ auto is_combining_mark(const char32_t codepoint) noexcept -> bool;
SOURCEMETA_CORE_UNICODE_EXPORT
auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck;

/// @ingroup unicode
/// Return the non-recursive canonical decomposition of a Unicode codepoint
/// per UAX #15. The view points into static data and remains valid for the
/// program's lifetime. The view is a slice of a larger packed table and is
/// not null-terminated, so always honour its size. An empty view means the
/// codepoint has no canonical decomposition. Values above U+10FFFF also
/// yield an empty view. Hangul precomposed syllables decompose
/// algorithmically per UAX #15 and are reported as empty here. For example:
///
/// ```cpp
/// #include <sourcemeta/core/unicode.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::canonical_decomposition(U'A').empty());
/// assert(sourcemeta::core::canonical_decomposition(U'\u00FC') ==
///        std::u32string_view{U"u\u0308"});
/// assert(sourcemeta::core::canonical_decomposition(U'\u2126') ==
///        std::u32string_view{U"\u03A9"});
/// ```
SOURCEMETA_CORE_UNICODE_EXPORT
auto canonical_decomposition(const char32_t codepoint) noexcept
    -> std::u32string_view;

/// @ingroup unicode
/// Determine the byte length of the valid UTF-8 codepoint starting at the
/// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a
Expand Down
13 changes: 13 additions & 0 deletions src/core/unicode/unicode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,17 @@ auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck {
NFC_QUICK_CHECK_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]);
}

auto canonical_decomposition(const char32_t codepoint) noexcept
    -> std::u32string_view {
  // Nothing past the last Unicode codepoint is covered by the tables
  if (codepoint > 0x10FFFF) {
    return {};
  }

  // Stage 1 picks the deduplicated page, stage 2 holds the packed entry for
  // the codepoint's position within that page
  const std::size_t page{CANONICAL_DECOMPOSITION_STAGE1[codepoint >> 10U]};
  const std::uint16_t entry{
      CANONICAL_DECOMPOSITION_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]};

  // The low 14 bits are the blob offset and the remaining high bits are the
  // length. An all-zero entry therefore produces an empty view
  const auto *const first{CANONICAL_DECOMPOSITION_BLOB +
                          static_cast<std::size_t>(entry & 0x3FFFU)};
  const auto count{static_cast<std::size_t>(entry >> 14U)};
  return std::u32string_view{first, count};
}

} // namespace sourcemeta::core
3 changes: 2 additions & 1 deletion test/unicode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode
unicode_bidi_class_test.cc
unicode_script_test.cc
unicode_is_combining_mark_test.cc
unicode_nfc_quick_check_test.cc)
unicode_nfc_quick_check_test.cc
unicode_canonical_decomposition_test.cc)

target_link_libraries(sourcemeta_core_unicode_unit
PRIVATE sourcemeta::core::unicode)
114 changes: 114 additions & 0 deletions test/unicode/unicode_canonical_decomposition_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#include <gtest/gtest.h>

#include <sourcemeta/core/unicode.h>

#include <string_view>

// Plain ASCII letters are reported as having no canonical decomposition
TEST(Unicode_canonical_decomposition, ascii_letter_has_no_decomposition) {
  const auto decomposition{sourcemeta::core::canonical_decomposition(U'A')};
  EXPECT_TRUE(decomposition.empty());
}

// ASCII digits are reported as having no canonical decomposition
TEST(Unicode_canonical_decomposition, ascii_digit_has_no_decomposition) {
  const auto decomposition{sourcemeta::core::canonical_decomposition(U'0')};
  EXPECT_TRUE(decomposition.empty());
}

// U+0000 sits at the very start of the tables and must come back empty
TEST(Unicode_canonical_decomposition, null_has_no_decomposition) {
  const auto decomposition{sourcemeta::core::canonical_decomposition(0x0000)};
  EXPECT_TRUE(decomposition.empty());
}

// U+00C0 LATIN CAPITAL LETTER A WITH GRAVE: base letter U+0041 followed by
// the combining grave accent U+0300
TEST(Unicode_canonical_decomposition, latin_a_with_grave) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u00C0')};
  const std::u32string_view expected{U"A\u0300"};
  EXPECT_EQ(decomposition, expected);
}

// U+00C1 LATIN CAPITAL LETTER A WITH ACUTE: base letter U+0041 followed by
// the combining acute accent U+0301
TEST(Unicode_canonical_decomposition, latin_a_with_acute) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u00C1')};
  const std::u32string_view expected{U"A\u0301"};
  EXPECT_EQ(decomposition, expected);
}

// U+00FC LATIN SMALL LETTER U WITH DIAERESIS: base letter U+0075 followed by
// the combining diaeresis U+0308
TEST(Unicode_canonical_decomposition, latin_u_with_diaeresis) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u00FC')};
  const std::u32string_view expected{U"u\u0308"};
  EXPECT_EQ(decomposition, expected);
}

// U+2126 OHM SIGN: singleton decomposition to U+03A9 GREEK CAPITAL OMEGA
Comment thread
jviotti marked this conversation as resolved.
TEST(Unicode_canonical_decomposition, ohm_sign_singleton) {
  // A singleton mapping: the view holds exactly one codepoint, U+03A9
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u2126')};
  const std::u32string_view expected{U"\u03A9"};
  EXPECT_EQ(decomposition, expected);
}

// U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE: only a single step is
// returned, namely U+017F U+0307. Expanding further is left to the
// normalization algorithm built on top of this lookup
TEST(Unicode_canonical_decomposition, long_s_with_dot_above_non_recursive) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u1E9B')};
  const std::u32string_view expected{U"\u017F\u0307"};
  EXPECT_EQ(decomposition, expected);
}

// A combining mark on its own is reported as having no decomposition
TEST(Unicode_canonical_decomposition, combining_grave_no_decomposition) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u0300')};
  EXPECT_TRUE(decomposition.empty());
}

// U+0301 COMBINING ACUTE ACCENT likewise has nothing to decompose into
TEST(Unicode_canonical_decomposition, combining_acute_no_decomposition) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u0301')};
  EXPECT_TRUE(decomposition.empty());
}

// U+212B ANGSTROM SIGN: a singleton mapping to U+00C5 LATIN A WITH RING,
// returned as is without expanding U+00C5 any further
TEST(Unicode_canonical_decomposition, angstrom_sign_singleton) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u212B')};
  const std::u32string_view expected{U"\u00C5"};
  EXPECT_EQ(decomposition, expected);
}

// U+0958 DEVANAGARI LETTER QA: a pair outside the Latin range, U+0915
// followed by U+093C
TEST(Unicode_canonical_decomposition, devanagari_qa) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u0958')};
  const std::u32string_view expected{U"\u0915\u093C"};
  EXPECT_EQ(decomposition, expected);
}

// U+212C SCRIPT CAPITAL B carries the tagged mapping `<font> 0042`, which is
// a compatibility decomposition and so must not show up here
TEST(Unicode_canonical_decomposition,
     compatibility_decomposition_font_excluded) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u212C')};
  EXPECT_TRUE(decomposition.empty());
}

// U+FF21 FULLWIDTH LATIN CAPITAL LETTER A carries the tagged mapping
// `<wide> 0041`, which is a compatibility decomposition and so is excluded
TEST(Unicode_canonical_decomposition,
     compatibility_decomposition_wide_excluded) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\uFF21')};
  EXPECT_TRUE(decomposition.empty());
}

// U+212A KELVIN SIGN is the counterpart to the tagged cases: its mapping to
// U+004B (ASCII K) has no tag, so it is canonical and must be reported
TEST(Unicode_canonical_decomposition, kelvin_sign_canonical_singleton) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\u212A')};
  const std::u32string_view expected{U"K"};
  EXPECT_EQ(decomposition, expected);
}

// Hangul precomposed syllables have no entry in UnicodeData.txt, so the
// lookup is empty and the algorithmic decomposition is left to the caller.
// U+AC00 is the first syllable of the block
TEST(Unicode_canonical_decomposition, hangul_syllable_first) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\uAC00')};
  EXPECT_TRUE(decomposition.empty());
}

// U+D7A3 is the last precomposed Hangul syllable and is empty as well
TEST(Unicode_canonical_decomposition, hangul_syllable_last) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(U'\uD7A3')};
  EXPECT_TRUE(decomposition.empty());
}

// The first value past U+10FFFF is outside Unicode and must come back empty
TEST(Unicode_canonical_decomposition, beyond_max_codepoint) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(0x110000)};
  EXPECT_TRUE(decomposition.empty());
}

// The largest 32-bit value is far outside Unicode and must come back empty
TEST(Unicode_canonical_decomposition, beyond_max_codepoint_high) {
  const auto decomposition{
      sourcemeta::core::canonical_decomposition(0xFFFFFFFF)};
  EXPECT_TRUE(decomposition.empty());
}

// Two lookups of the same codepoint must hand back the same storage, showing
// that the view refers to shared data rather than a per-call temporary
TEST(Unicode_canonical_decomposition, view_outlives_call) {
  const auto initial{sourcemeta::core::canonical_decomposition(U'\u00C0')};
  const auto repeated{sourcemeta::core::canonical_decomposition(U'\u00C0')};
  EXPECT_EQ(initial.data(), repeated.data());
  EXPECT_EQ(initial.size(), 2u);
}
Loading