Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/core/unicode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ add_custom_command(
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt"
DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt"
Expand All @@ -25,6 +26,7 @@ add_custom_command(
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt"
COMMENT "Generating Unicode property tables"
VERBATIM)

Expand Down
59 changes: 46 additions & 13 deletions src/core/unicode/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@
import sys

# Single-property row: `first[..last] ; value`.
# Groups: 1 = first codepoint (hex), 2 = optional range end (hex),
# 3 = value (only the first whitespace-delimited token is captured).
LINE = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)")
# Multi-property row: `first[..last] ; property ; value`, the shape used by
# DerivedNormalizationProps.txt. Groups 1 and 2 are as in LINE, 3 is the
# property name and 4 is the first whitespace-delimited token of the value.
MULTI_PROPERTY_LINE = re.compile(
r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*;\s*(\S+)"
)
# Boolean-property rows in multi-property files use a two-field shape,
# with no value column, so MULTI_PROPERTY_LINE does not match them.
# parse_file uses this pattern to recognise such rows and skip them on
# purpose when they name a property other than the one being filtered
# for, rather than raising on them as unparseable. The pattern is
# anchored at the end, so it is applied to the text that precedes any
# trailing `#` comment.
BOOLEAN_PROPERTY_LINE = re.compile(
r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*$"
)
# Leading marker of `# @missing:` default lines. The visible part of
# parse_file slices the matched prefix off and parses the remainder as an
# ordinary row that is collected with the @missing defaults.
MISSING_PREFIX = re.compile(r"^#\s*@missing:\s*")

# Size of the Unicode codespace: U+0000 through U+10FFFF inclusive.
TOTAL_CODEPOINTS = 0x110000
Expand All @@ -24,6 +33,8 @@
"LRI", "RLI", "FSI", "PDI",
]

# Canonical value order passed to build_value_map for the NFC_QC property
# (see main). The positions mirror the NFCQuickCheck enum in unicode_ucd.h
# (Yes = 0, No = 1, Maybe = 2) -- presumably the list index becomes the
# numeric value written to the generated table; confirm in build_value_map.
NFC_QUICK_CHECK_ORDER = ["Y", "N", "M"]

UNICODE_SCRIPT_ORDER = [
"Adlam", "Ahom", "Anatolian_Hieroglyphs", "Arabic", "Armenian",
"Avestan", "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali",
Expand Down Expand Up @@ -121,10 +132,16 @@ def build_value_map(aliases_path, property_short, canonical_order=None):
return result


def parse_file(path, value_map):
def parse_file(path, value_map, property_filter=None):
"""Read a UCD file and return a list of (first, last, value) entries
with @missing defaults first and data ranges second, so callers can
apply them in order regardless of where @missing appears in the file."""
apply them in order regardless of where @missing appears in the file.

With property_filter set, lines have shape `codepoint; property; value`
(as in DerivedNormalizationProps.txt) and only rows whose property
name matches are returned. Without it, lines have shape
`codepoint; value` and every row contributes."""
line_re = MULTI_PROPERTY_LINE if property_filter is not None else LINE
missing = []
data = []
with open(path) as source:
Expand All @@ -139,14 +156,25 @@ def parse_file(path, value_map):
continue
stripped = stripped[prefix.end():]
target = missing
match = LINE.match(stripped)
match = line_re.match(stripped)
if not match:
# Recognise the boolean-property shape used in multi-property
# files, but only for properties other than the one we are
# filtering for. A boolean-shape row that names our target
# property would be malformed data and must raise.
data_only = stripped.split("#", 1)[0].strip()
if property_filter is not None:
boolean = BOOLEAN_PROPERTY_LINE.fullmatch(data_only)
if boolean and boolean.group(3) != property_filter:
continue
raise ValueError(
f"{path}:{line_number}: unparseable line: {stripped!r}"
)
if property_filter is not None and match.group(3) != property_filter:
continue
first = int(match.group(1), 16)
last = int(match.group(2), 16) if match.group(2) else first
raw_value = match.group(3)
raw_value = match.group(4 if property_filter is not None else 3)
try:
value = value_map[raw_value]
except KeyError as error:
Expand Down Expand Up @@ -196,7 +224,7 @@ def emit_property(output, prefix, stage1, unique_pages):


def main():
if len(sys.argv) != 8:
if len(sys.argv) != 9:
print(
f"Usage: {sys.argv[0]} "
"<output.h> "
Expand All @@ -205,7 +233,8 @@ def main():
"<DerivedJoiningType.txt> "
"<DerivedBidiClass.txt> "
"<Scripts.txt> "
"<DerivedGeneralCategory.txt>",
"<DerivedGeneralCategory.txt> "
"<DerivedNormalizationProps.txt>",
file=sys.stderr,
)
sys.exit(1)
Expand All @@ -214,23 +243,27 @@ def main():
aliases_path = sys.argv[2]

properties = [
("COMBINING_CLASS", sys.argv[3],
("COMBINING_CLASS", sys.argv[3], None,
build_value_map(aliases_path, "ccc")),
("JOINING_TYPE", sys.argv[4],
("JOINING_TYPE", sys.argv[4], None,
build_value_map(aliases_path, "jt", JOINING_TYPE_ORDER)),
("BIDI_CLASS", sys.argv[5],
("BIDI_CLASS", sys.argv[5], None,
build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)),
("UNICODE_SCRIPT", sys.argv[6],
("UNICODE_SCRIPT", sys.argv[6], None,
build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)),
("IS_COMBINING_MARK", sys.argv[7],
("IS_COMBINING_MARK", sys.argv[7], None,
build_combining_mark_value_map(aliases_path)),
("NFC_QUICK_CHECK", sys.argv[8], "NFC_QC",
build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)),
]

with open(output_path, "w") as output:
output.write("#include <cstdint>\n\n")
output.write("namespace {\n\n")
for prefix, input_path, value_map in properties:
stage1, pages = build_pages(parse_file(input_path, value_map))
for prefix, input_path, property_filter, value_map in properties:
stage1, pages = build_pages(
parse_file(input_path, value_map, property_filter)
)
emit_property(output, prefix, stage1, pages)
output.write("} // namespace\n")

Expand Down
19 changes: 19 additions & 0 deletions src/core/unicode/include/sourcemeta/core/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,25 @@ auto script(const char32_t codepoint) noexcept -> UnicodeScript;
SOURCEMETA_CORE_UNICODE_EXPORT
auto is_combining_mark(const char32_t codepoint) noexcept -> bool;

/// @ingroup unicode
/// Return the NFC quick-check property of a Unicode codepoint per UAX #15.
/// See https://www.unicode.org/reports/tr15/ for the property's definition.
/// Values above U+10FFFF are not Unicode codepoints and yield
/// NFCQuickCheck::Yes.
/// For example:
///
/// ```cpp
/// #include <sourcemeta/core/unicode.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::nfc_quick_check(U'A') ==
/// sourcemeta::core::NFCQuickCheck::Yes);
/// assert(sourcemeta::core::nfc_quick_check(U'\u2126') ==
/// sourcemeta::core::NFCQuickCheck::No);
Comment thread
jviotti marked this conversation as resolved.
/// assert(sourcemeta::core::nfc_quick_check(U'\u0300') ==
/// sourcemeta::core::NFCQuickCheck::Maybe);
/// ```
SOURCEMETA_CORE_UNICODE_EXPORT
auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck;

/// @ingroup unicode
/// Determine the byte length of the valid UTF-8 codepoint starting at the
/// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a
Expand Down
9 changes: 9 additions & 0 deletions src/core/unicode/include/sourcemeta/core/unicode_ucd.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,15 @@ enum class UnicodeScript : std::uint8_t {
KatakanaOrHiragana = 175,
};

/// @ingroup unicode
/// The NFC quick-check result for a Unicode codepoint per UAX #15.
/// See https://www.unicode.org/reports/tr15/ for the property's definition.
/// nfc_quick_check produces these values by casting entries of the
/// generated lookup table directly, so the numbering presumably has to stay
/// in step with what codegen.py emits (its NFC_QUICK_CHECK_ORDER is Y, N, M).
enum class NFCQuickCheck : std::uint8_t {
/// The codepoint is allowed in NFC text
Yes = 0,
/// The codepoint never appears in NFC text
No = 1,
/// The codepoint may compose with a preceding character, so the
/// surrounding text has to be inspected to decide
Maybe = 2,
};

} // namespace sourcemeta::core

#endif
9 changes: 9 additions & 0 deletions src/core/unicode/unicode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,4 +154,13 @@ auto is_combining_mark(const char32_t codepoint) noexcept -> bool {
return IS_COMBINING_MARK_STAGE2[(page << 10U) | (codepoint & 0x3FFU)] != 0;
}

auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck {
  // Anything past U+10FFFF is not a Unicode codepoint and has no entry in
  // the generated tables, so answer Yes without touching them
  constexpr char32_t last_codepoint{0x10FFFF};
  if (codepoint > last_codepoint) {
    return NFCQuickCheck::Yes;
  }

  // Two-stage lookup: the bits above the low 10 select a page through
  // stage 1, and the low 10 bits select the entry inside that 1024-entry
  // page of stage 2
  const std::size_t page{NFC_QUICK_CHECK_STAGE1[codepoint >> 10U]};
  const std::size_t index{(page << 10U) | (codepoint & 0x3FFU)};
  return static_cast<NFCQuickCheck>(NFC_QUICK_CHECK_STAGE2[index]);
}

} // namespace sourcemeta::core
3 changes: 2 additions & 1 deletion test/unicode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode
unicode_joining_type_test.cc
unicode_bidi_class_test.cc
unicode_script_test.cc
unicode_is_combining_mark_test.cc)
unicode_is_combining_mark_test.cc
unicode_nfc_quick_check_test.cc)

target_link_libraries(sourcemeta_core_unicode_unit
PRIVATE sourcemeta::core::unicode)
109 changes: 109 additions & 0 deletions test/unicode/unicode_nfc_quick_check_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#include <gtest/gtest.h>

#include <sourcemeta/core/unicode.h>

// ASCII characters are expected to report Yes
TEST(Unicode_nfc_quick_check, ascii_letter) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'A'),
sourcemeta::core::NFCQuickCheck::Yes);
}

TEST(Unicode_nfc_quick_check, ascii_digit) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'0'),
sourcemeta::core::NFCQuickCheck::Yes);
}

TEST(Unicode_nfc_quick_check, ascii_space) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U' '),
sourcemeta::core::NFCQuickCheck::Yes);
}

// U+0000 NULL, the lowest codepoint
TEST(Unicode_nfc_quick_check, null) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(0x0000),
sourcemeta::core::NFCQuickCheck::Yes);
}

// U+00C0 LATIN CAPITAL LETTER A WITH GRAVE: a Latin-1 precomposed character
// that IS its own NFC form
TEST(Unicode_nfc_quick_check, latin_a_with_grave) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u00C0'),
sourcemeta::core::NFCQuickCheck::Yes);
}

// U+00FC LATIN SMALL LETTER U WITH DIAERESIS: likewise precomposed and
// expected to be Yes
TEST(Unicode_nfc_quick_check, latin_small_u_with_diaeresis) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u00FC'),
sourcemeta::core::NFCQuickCheck::Yes);
}

// U+0300 COMBINING GRAVE ACCENT may compose with a preceding starter
TEST(Unicode_nfc_quick_check, combining_grave_accent_is_maybe) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0300'),
sourcemeta::core::NFCQuickCheck::Maybe);
}

// U+0301 COMBINING ACUTE ACCENT, expected to be Maybe like U+0300
TEST(Unicode_nfc_quick_check, combining_acute_accent_is_maybe) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0301'),
sourcemeta::core::NFCQuickCheck::Maybe);
}

// U+2126 OHM SIGN has a singleton decomposition to U+03A9 GREEK CAPITAL
// LETTER OMEGA, so it never appears in NFC output
TEST(Unicode_nfc_quick_check, ohm_sign_is_no) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u2126'),
sourcemeta::core::NFCQuickCheck::No);
}

// U+0958 DEVANAGARI LETTER QA decomposes to U+0915 U+093C in NFC
TEST(Unicode_nfc_quick_check, devanagari_qa_is_no) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0958'),
sourcemeta::core::NFCQuickCheck::No);
}

// U+FB1D HEBREW LETTER YOD WITH HIRIQ decomposes in NFC
TEST(Unicode_nfc_quick_check, hebrew_yod_with_hiriq_is_no) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\uFB1D'),
sourcemeta::core::NFCQuickCheck::No);
}

// U+2000 EN QUAD has a singleton canonical decomposition to U+2002 EN SPACE,
// so it never appears in NFC output
TEST(Unicode_nfc_quick_check, en_quad_is_no) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u2000'),
sourcemeta::core::NFCQuickCheck::No);
}

// Hangul precomposed syllable, allowed in NFC
TEST(Unicode_nfc_quick_check, hangul_syllable_is_yes) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\uAC00'),
sourcemeta::core::NFCQuickCheck::Yes);
}

// Hangul L jamo (leading consonant): it only ever composes with a V jamo
// that follows it, never with a preceding character, so on its own it is Yes
TEST(Unicode_nfc_quick_check, hangul_l_jamo_is_yes) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u1100'),
sourcemeta::core::NFCQuickCheck::Yes);
}

// Hangul V jamo (vowel): may compose with a preceding L jamo into a LV
// syllable, so the quick check is Maybe
TEST(Unicode_nfc_quick_check, hangul_v_jamo_is_maybe) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u1161'),
sourcemeta::core::NFCQuickCheck::Maybe);
}

// Hangul T jamo (trailing consonant): may compose with a preceding LV
// syllable into an LVT syllable
TEST(Unicode_nfc_quick_check, hangul_t_jamo_is_maybe) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u11A8'),
sourcemeta::core::NFCQuickCheck::Maybe);
}

// Past the Unicode maximum (U+10FFFF) there is no table entry:
// nfc_quick_check is expected to return Yes, the same value as the
// @missing default
TEST(Unicode_nfc_quick_check, beyond_max_codepoint) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(0x110000),
sourcemeta::core::NFCQuickCheck::Yes);
}

// 0xFFFFFFFF, far outside the Unicode range, is expected to behave the same
TEST(Unicode_nfc_quick_check, beyond_max_codepoint_high) {
EXPECT_EQ(sourcemeta::core::nfc_quick_check(0xFFFFFFFF),
sourcemeta::core::NFCQuickCheck::Yes);
}
Loading