sourcemeta · jviotti · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/src/core/unicode/CMakeLists.txt b/src/core/unicode/CMakeLists.txt
@@ -16,13 +16,15 @@ add_custom_command(
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedJoiningType.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
+    "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
   DEPENDS
     "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedCombiningClass.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedJoiningType.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
+    "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
   COMMENT "Generating Unicode property tables"
   VERBATIM)
 

diff --git a/src/core/unicode/codegen.py b/src/core/unicode/codegen.py
@@ -76,6 +76,19 @@ def parse_alias_lines(aliases_path, property_short):
     return rows
 
 
+def build_combining_mark_value_map(aliases_path):
+    """Build {form: int} from PropertyValueAliases.txt mapping each
+    General_Category alias to 1 if it is a combining mark (Mn, Mc, Me,
+    or the supergroup M / Mark / Combining_Mark) and to 0 otherwise."""
+    combining = {"M", "Mn", "Mc", "Me"}
+    result = {}
+    for row in parse_alias_lines(aliases_path, "gc"):
+        value = 1 if any(field in combining for field in row) else 0
+        for field in row:
+            result[field] = value
+    return result
+
+
 def build_value_map(aliases_path, property_short, canonical_order=None):
     """Build {form: int} for a property. With canonical_order, each row's
     integer is its canonical's position in that list; without, the row's
@@ -183,15 +196,16 @@ def emit_property(output, prefix, stage1, unique_pages):
 
 
 def main():
-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
         print(
             f"Usage: {sys.argv[0]} "
             "<output.h> "
             "<PropertyValueAliases.txt> "
             "<DerivedCombiningClass.txt> "
             "<DerivedJoiningType.txt> "
             "<DerivedBidiClass.txt> "
-            "<Scripts.txt>",
+            "<Scripts.txt> "
+            "<DerivedGeneralCategory.txt>",
             file=sys.stderr,
         )
         sys.exit(1)
@@ -208,6 +222,8 @@ def main():
          build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)),
         ("UNICODE_SCRIPT", sys.argv[6],
          build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)),
+        ("IS_COMBINING_MARK", sys.argv[7],
+         build_combining_mark_value_map(aliases_path)),
     ]
 
     with open(output_path, "w") as output:

diff --git a/src/core/unicode/include/sourcemeta/core/unicode.h b/src/core/unicode/include/sourcemeta/core/unicode.h
@@ -291,6 +291,23 @@ auto bidi_class(const char32_t codepoint) noexcept -> BidiClass;
 SOURCEMETA_CORE_UNICODE_EXPORT
 auto script(const char32_t codepoint) noexcept -> UnicodeScript;
 
+/// @ingroup unicode
+/// Check whether a Unicode codepoint is a combining mark, meaning that its
+/// UAX #44 General_Category is Mn (Nonspacing_Mark), Mc (Spacing_Mark), or
+/// Me (Enclosing_Mark). Any value above U+10FFFF yields false. See
+/// https://www.unicode.org/reports/tr44/ for the definition. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/unicode.h>
+/// #include <cassert>
+///
+/// assert(sourcemeta::core::is_combining_mark(U'\u0301'));
+/// assert(sourcemeta::core::is_combining_mark(U'\u094D'));
+/// assert(!sourcemeta::core::is_combining_mark(U'A'));
+/// ```
+SOURCEMETA_CORE_UNICODE_EXPORT
+auto is_combining_mark(const char32_t codepoint) noexcept -> bool;
+
 /// @ingroup unicode
 /// Determine the byte length of the valid UTF-8 codepoint starting at the
 /// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a

diff --git a/src/core/unicode/unicode.cc b/src/core/unicode/unicode.cc
@@ -146,4 +146,12 @@ auto script(const char32_t codepoint) noexcept -> UnicodeScript {
       UNICODE_SCRIPT_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]);
 }
 
+auto is_combining_mark(const char32_t codepoint) noexcept -> bool {
+  if (codepoint <= 0x10FFFF) { // beyond this there is no table entry
+    const std::size_t block{IS_COMBINING_MARK_STAGE1[codepoint >> 10U]};
+    return IS_COMBINING_MARK_STAGE2[(block << 10U) | (codepoint & 0x3FFU)] != 0;
+  }
+  return false;
+}
+
 } // namespace sourcemeta::core
diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt
@@ -11,7 +11,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode
     unicode_combining_class_test.cc
     unicode_joining_type_test.cc
     unicode_bidi_class_test.cc
-    unicode_script_test.cc)
+    unicode_script_test.cc
+    unicode_is_combining_mark_test.cc)
 
 target_link_libraries(sourcemeta_core_unicode_unit
   PRIVATE sourcemeta::core::unicode)
diff --git a/test/unicode/unicode_is_combining_mark_test.cc b/test/unicode/unicode_is_combining_mark_test.cc
@@ -0,0 +1,85 @@
+#include <gtest/gtest.h>
+
+#include <sourcemeta/core/unicode.h>
+
+TEST(Unicode_is_combining_mark, ascii_letter) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'A'));
+}
+
+TEST(Unicode_is_combining_mark, ascii_digit) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'0'));
+}
+
+TEST(Unicode_is_combining_mark, null) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x0000));
+}
+
+TEST(Unicode_is_combining_mark, hebrew_letter_alef) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\u05D0'));
+}
+
+TEST(Unicode_is_combining_mark, combining_acute_accent_nonspacing) {
+  // U+0301 Mn (Nonspacing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0301'));
+}
+
+TEST(Unicode_is_combining_mark, combining_grave_accent_below_nonspacing) {
+  // U+0316 Mn (Nonspacing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0316'));
+}
+
+TEST(Unicode_is_combining_mark, devanagari_virama_nonspacing) {
+  // U+094D Mn (Nonspacing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u094D'));
+}
+
+TEST(Unicode_is_combining_mark, devanagari_vowel_sign_aa_spacing) {
+  // U+093E Mc (Spacing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u093E'));
+}
+
+TEST(Unicode_is_combining_mark, devanagari_sign_visarga_spacing) {
+  // U+0903 Mc (Spacing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0903'));
+}
+
+TEST(Unicode_is_combining_mark, tamil_vowel_sign_e_spacing) {
+  // U+0BC6 Mc (Spacing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0BC6'));
+}
+
+TEST(Unicode_is_combining_mark, combining_enclosing_circle_enclosing) {
+  // U+20DD Me (Enclosing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u20DD'));
+}
+
+TEST(Unicode_is_combining_mark, combining_enclosing_square_enclosing) {
+  // U+20DE Me (Enclosing_Mark)
+  EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u20DE'));
+}
+
+TEST(Unicode_is_combining_mark, zero_width_joiner_not_mark) {
+  // U+200D is bidi class BN but general category Cf (Format), not a mark
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\u200D'));
+}
+
+TEST(Unicode_is_combining_mark, hangul_letter_not_mark) {
+  // U+1100 is Lo (Other_Letter), not a mark
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\u1100'));
+}
+
+TEST(Unicode_is_combining_mark, emoji_grinning_face_not_mark) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\U0001F600'));
+}
+
+TEST(Unicode_is_combining_mark, unassigned_codepoint) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x0E80));
+}
+
+TEST(Unicode_is_combining_mark, max_codepoint) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x10FFFF));
+}
+
+TEST(Unicode_is_combining_mark, above_max_codepoint) {
+  EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x110000));
+}
diff --git a/vendor/unicodetools.mask b/vendor/unicodetools.mask