sourcemeta · jviotti · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/src/core/unicode/CMakeLists.txt b/src/core/unicode/CMakeLists.txt
@@ -17,6 +17,7 @@ add_custom_command(
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
+    "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt"
   DEPENDS
     "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt"
@@ -25,6 +26,7 @@ add_custom_command(
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
+    "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/DerivedNormalizationProps.txt"
   COMMENT "Generating Unicode property tables"
   VERBATIM)
 

diff --git a/src/core/unicode/codegen.py b/src/core/unicode/codegen.py
@@ -4,6 +4,15 @@
 import sys
 
 LINE = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)")
+MULTI_PROPERTY_LINE = re.compile(
+    r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*;\s*(\S+)"
+)
+# Boolean-property rows in multi-property files use a two-field shape,
+# with no value column. Matching them explicitly lets the parser skip
+# only well-formed rows and still raise on anything else.
+BOOLEAN_PROPERTY_LINE = re.compile(
+    r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*$"
+)
 MISSING_PREFIX = re.compile(r"^#\s*@missing:\s*")
 
 TOTAL_CODEPOINTS = 0x110000
@@ -24,6 +33,8 @@
     "LRI", "RLI", "FSI", "PDI",
 ]
 
+NFC_QUICK_CHECK_ORDER = ["Y", "N", "M"]
+
 UNICODE_SCRIPT_ORDER = [
     "Adlam", "Ahom", "Anatolian_Hieroglyphs", "Arabic", "Armenian",
     "Avestan", "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali",
@@ -121,10 +132,16 @@ def build_value_map(aliases_path, property_short, canonical_order=None):
     return result
 
 
-def parse_file(path, value_map):
+def parse_file(path, value_map, property_filter=None):
     """Read a UCD file and return a list of (first, last, value) entries
     with @missing defaults first and data ranges second, so callers can
-    apply them in order regardless of where @missing appears in the file."""
+    apply them in order regardless of where @missing appears in the file.
+
+    With property_filter set, lines have shape `codepoint; property; value`
+    (as in DerivedNormalizationProps.txt) and only rows whose property
+    name matches are returned. Without it, lines have shape
+    `codepoint; value` and every row contributes."""
+    line_re = MULTI_PROPERTY_LINE if property_filter is not None else LINE
     missing = []
     data = []
     with open(path) as source:
@@ -139,14 +156,25 @@ def parse_file(path, value_map):
                     continue
                 stripped = stripped[prefix.end():]
                 target = missing
-            match = LINE.match(stripped)
+            match = line_re.match(stripped)
             if not match:
+                # Recognise the boolean-property shape used in multi-property
+                # files, but only for properties other than the one we are
+                # filtering for. A boolean-shape row that names our target
+                # property would be malformed data and must raise.
+                data_only = stripped.split("#", 1)[0].strip()
+                if property_filter is not None:
+                    boolean = BOOLEAN_PROPERTY_LINE.fullmatch(data_only)
+                    if boolean and boolean.group(3) != property_filter:
+                        continue
                 raise ValueError(
                     f"{path}:{line_number}: unparseable line: {stripped!r}"
                 )
+            if property_filter is not None and match.group(3) != property_filter:
+                continue
             first = int(match.group(1), 16)
             last = int(match.group(2), 16) if match.group(2) else first
-            raw_value = match.group(3)
+            raw_value = match.group(4 if property_filter is not None else 3)
             try:
                 value = value_map[raw_value]
             except KeyError as error:
@@ -196,7 +224,7 @@ def emit_property(output, prefix, stage1, unique_pages):
 
 
 def main():
-    if len(sys.argv) != 8:
+    if len(sys.argv) != 9:
         print(
             f"Usage: {sys.argv[0]} "
             "<output.h> "
@@ -205,7 +233,8 @@ def main():
             "<DerivedJoiningType.txt> "
             "<DerivedBidiClass.txt> "
             "<Scripts.txt> "
-            "<DerivedGeneralCategory.txt>",
+            "<DerivedGeneralCategory.txt> "
+            "<DerivedNormalizationProps.txt>",
             file=sys.stderr,
         )
         sys.exit(1)
@@ -214,23 +243,27 @@ def main():
     aliases_path = sys.argv[2]
 
     properties = [
-        ("COMBINING_CLASS", sys.argv[3],
+        ("COMBINING_CLASS", sys.argv[3], None,
          build_value_map(aliases_path, "ccc")),
-        ("JOINING_TYPE", sys.argv[4],
+        ("JOINING_TYPE", sys.argv[4], None,
          build_value_map(aliases_path, "jt", JOINING_TYPE_ORDER)),
-        ("BIDI_CLASS", sys.argv[5],
+        ("BIDI_CLASS", sys.argv[5], None,
          build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)),
-        ("UNICODE_SCRIPT", sys.argv[6],
+        ("UNICODE_SCRIPT", sys.argv[6], None,
          build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)),
-        ("IS_COMBINING_MARK", sys.argv[7],
+        ("IS_COMBINING_MARK", sys.argv[7], None,
          build_combining_mark_value_map(aliases_path)),
+        ("NFC_QUICK_CHECK", sys.argv[8], "NFC_QC",
+         build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)),
     ]
 
     with open(output_path, "w") as output:
         output.write("#include <cstdint>\n\n")
         output.write("namespace {\n\n")
-        for prefix, input_path, value_map in properties:
-            stage1, pages = build_pages(parse_file(input_path, value_map))
+        for prefix, input_path, property_filter, value_map in properties:
+            stage1, pages = build_pages(
+                parse_file(input_path, value_map, property_filter)
+            )
             emit_property(output, prefix, stage1, pages)
         output.write("} // namespace\n")
 

diff --git a/src/core/unicode/include/sourcemeta/core/unicode.h b/src/core/unicode/include/sourcemeta/core/unicode.h
@@ -308,6 +308,25 @@ auto script(const char32_t codepoint) noexcept -> UnicodeScript;
 SOURCEMETA_CORE_UNICODE_EXPORT
 auto is_combining_mark(const char32_t codepoint) noexcept -> bool;
 
+/// @ingroup unicode
+/// Return the NFC quick-check property of a Unicode codepoint per UAX #15.
+/// See https://www.unicode.org/reports/tr15/ for the property's definition.
+/// For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/unicode.h>
+/// #include <cassert>
+///
+/// assert(sourcemeta::core::nfc_quick_check(U'A') ==
+///        sourcemeta::core::NFCQuickCheck::Yes);
+/// assert(sourcemeta::core::nfc_quick_check(U'\u2126') ==
+///        sourcemeta::core::NFCQuickCheck::No);
+/// assert(sourcemeta::core::nfc_quick_check(U'\u0300') ==
+///        sourcemeta::core::NFCQuickCheck::Maybe);
+/// ```
+SOURCEMETA_CORE_UNICODE_EXPORT
+auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck;
+
 /// @ingroup unicode
 /// Determine the byte length of the valid UTF-8 codepoint starting at the
 /// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a

diff --git a/src/core/unicode/include/sourcemeta/core/unicode_ucd.h b/src/core/unicode/include/sourcemeta/core/unicode_ucd.h
@@ -230,6 +230,15 @@ enum class UnicodeScript : std::uint8_t {
   KatakanaOrHiragana = 175,
 };
 
+/// @ingroup unicode
+/// The NFC quick-check result for a Unicode codepoint per UAX #15.
+/// See https://www.unicode.org/reports/tr15/ for the property's definition.
+enum class NFCQuickCheck : std::uint8_t {
+  Yes = 0,
+  No = 1,
+  Maybe = 2,
+};
+
 } // namespace sourcemeta::core
 
 #endif
diff --git a/src/core/unicode/unicode.cc b/src/core/unicode/unicode.cc
@@ -154,4 +154,13 @@ auto is_combining_mark(const char32_t codepoint) noexcept -> bool {
   return IS_COMBINING_MARK_STAGE2[(page << 10U) | (codepoint & 0x3FFU)] != 0;
 }
 
+auto nfc_quick_check(const char32_t codepoint) noexcept -> NFCQuickCheck {
+  if (codepoint > 0x10FFFF) {
+    return NFCQuickCheck::Yes;
+  }
+  const std::size_t page{NFC_QUICK_CHECK_STAGE1[codepoint >> 10U]};
+  return static_cast<NFCQuickCheck>(
+      NFC_QUICK_CHECK_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]);
+}
+
 } // namespace sourcemeta::core
diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt
@@ -12,7 +12,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode
     unicode_joining_type_test.cc
     unicode_bidi_class_test.cc
     unicode_script_test.cc
-    unicode_is_combining_mark_test.cc)
+    unicode_is_combining_mark_test.cc
+    unicode_nfc_quick_check_test.cc)
 
 target_link_libraries(sourcemeta_core_unicode_unit
   PRIVATE sourcemeta::core::unicode)
diff --git a/test/unicode/unicode_nfc_quick_check_test.cc b/test/unicode/unicode_nfc_quick_check_test.cc
@@ -0,0 +1,109 @@
+#include <gtest/gtest.h>
+
+#include <sourcemeta/core/unicode.h>
+
+TEST(Unicode_nfc_quick_check, ascii_letter) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'A'),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+TEST(Unicode_nfc_quick_check, ascii_digit) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'0'),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+TEST(Unicode_nfc_quick_check, ascii_space) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U' '),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+TEST(Unicode_nfc_quick_check, null) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(0x0000),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+// Latin-1 precomposed character that IS its own NFC form
+TEST(Unicode_nfc_quick_check, latin_a_with_grave) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u00C0'),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+TEST(Unicode_nfc_quick_check, latin_small_u_with_diaeresis) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u00FC'),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+// U+0300 COMBINING GRAVE ACCENT may compose with a preceding starter
+TEST(Unicode_nfc_quick_check, combining_grave_accent_is_maybe) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0300'),
+            sourcemeta::core::NFCQuickCheck::Maybe);
+}
+
+// U+0301 COMBINING ACUTE ACCENT
+TEST(Unicode_nfc_quick_check, combining_acute_accent_is_maybe) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0301'),
+            sourcemeta::core::NFCQuickCheck::Maybe);
+}
+
+// U+2126 OHM SIGN has a singleton decomposition to U+03A9 GREEK CAPITAL
+// LETTER OMEGA, so it never appears in NFC output
+TEST(Unicode_nfc_quick_check, ohm_sign_is_no) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u2126'),
+            sourcemeta::core::NFCQuickCheck::No);
+}
+
+// U+0958 DEVANAGARI LETTER QA decomposes to U+0915 U+093C in NFC
+TEST(Unicode_nfc_quick_check, devanagari_qa_is_no) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u0958'),
+            sourcemeta::core::NFCQuickCheck::No);
+}
+
+// U+FB1D HEBREW LETTER YOD WITH HIRIQ decomposes in NFC
+TEST(Unicode_nfc_quick_check, hebrew_yod_with_hiriq_is_no) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\uFB1D'),
+            sourcemeta::core::NFCQuickCheck::No);
+}
+
+// U+2000 EN QUAD has a singleton decomposition to U+2002 EN SPACE
+TEST(Unicode_nfc_quick_check, en_quad_is_no) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u2000'),
+            sourcemeta::core::NFCQuickCheck::No);
+}
+
+// Hangul precomposed syllable, allowed in NFC
+TEST(Unicode_nfc_quick_check, hangul_syllable_is_yes) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\uAC00'),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+// Hangul L jamo (leading consonant): it never composes with a preceding
+// character, only with the jamos that follow it, so it is Yes itself
+TEST(Unicode_nfc_quick_check, hangul_l_jamo_is_yes) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u1100'),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+// Hangul V jamo (vowel): may compose with a preceding L jamo into a LV
+// syllable, so the quick check is Maybe
+TEST(Unicode_nfc_quick_check, hangul_v_jamo_is_maybe) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u1161'),
+            sourcemeta::core::NFCQuickCheck::Maybe);
+}
+
+// Hangul T jamo (trailing consonant): may compose with a preceding LV
+// syllable into an LVT syllable
+TEST(Unicode_nfc_quick_check, hangul_t_jamo_is_maybe) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(U'\u11A8'),
+            sourcemeta::core::NFCQuickCheck::Maybe);
+}
+
+// Past the Unicode maximum: the range guard returns Yes, the @missing default
+TEST(Unicode_nfc_quick_check, beyond_max_codepoint) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(0x110000),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}
+
+TEST(Unicode_nfc_quick_check, beyond_max_codepoint_high) {
+  EXPECT_EQ(sourcemeta::core::nfc_quick_check(0xFFFFFFFF),
+            sourcemeta::core::NFCQuickCheck::Yes);
+}