Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/core/unicode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ add_custom_command(
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedJoiningType.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedCombiningClass.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedJoiningType.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedBidiClass.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/Scripts.txt"
"${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedGeneralCategory.txt"
COMMENT "Generating Unicode property tables"
VERBATIM)

Expand Down
20 changes: 18 additions & 2 deletions src/core/unicode/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,19 @@ def parse_alias_lines(aliases_path, property_short):
return rows


def build_combining_mark_value_map(aliases_path):
    """Return a {form: int} lookup covering every General_Category alias
    listed in PropertyValueAliases.txt. Forms that sit on a combining
    mark row (Mn, Mc, Me, or the supergroup M / Mark / Combining_Mark)
    map to 1; every other form maps to 0."""
    mark_short_names = {"M", "Mn", "Mc", "Me"}
    flags = {}
    for aliases in parse_alias_lines(aliases_path, "gc"):
        # One matching spelling is enough to classify the whole row
        is_mark = 0
        for alias in aliases:
            if alias in mark_short_names:
                is_mark = 1
                break
        # Every spelling on the row shares the row's flag
        flags.update((alias, is_mark) for alias in aliases)
    return flags


def build_value_map(aliases_path, property_short, canonical_order=None):
"""Build {form: int} for a property. With canonical_order, each row's
integer is its canonical's position in that list; without, the row's
Expand Down Expand Up @@ -183,15 +196,16 @@ def emit_property(output, prefix, stage1, unique_pages):


def main():
if len(sys.argv) != 7:
if len(sys.argv) != 8:
print(
f"Usage: {sys.argv[0]} "
"<output.h> "
"<PropertyValueAliases.txt> "
"<DerivedCombiningClass.txt> "
"<DerivedJoiningType.txt> "
"<DerivedBidiClass.txt> "
"<Scripts.txt>",
"<Scripts.txt> "
"<DerivedGeneralCategory.txt>",
file=sys.stderr,
)
sys.exit(1)
Expand All @@ -208,6 +222,8 @@ def main():
build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)),
("UNICODE_SCRIPT", sys.argv[6],
build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)),
("IS_COMBINING_MARK", sys.argv[7],
build_combining_mark_value_map(aliases_path)),
]

with open(output_path, "w") as output:
Expand Down
17 changes: 17 additions & 0 deletions src/core/unicode/include/sourcemeta/core/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,23 @@ auto bidi_class(const char32_t codepoint) noexcept -> BidiClass;
SOURCEMETA_CORE_UNICODE_EXPORT
auto script(const char32_t codepoint) noexcept -> UnicodeScript;

/// @ingroup unicode
/// Return whether a Unicode codepoint is a combining mark, in the sense
/// of UAX #44 general category Mn (Nonspacing_Mark), Mc (Spacing_Mark),
/// or Me (Enclosing_Mark). See https://www.unicode.org/reports/tr44/ for
/// the property's definition. Values greater than U+10FFFF are never
/// reported as combining marks. For example:
///
/// ```cpp
/// #include <sourcemeta/core/unicode.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::is_combining_mark(U'\u0301'));
/// assert(sourcemeta::core::is_combining_mark(U'\u094D'));
/// assert(!sourcemeta::core::is_combining_mark(U'A'));
/// assert(!sourcemeta::core::is_combining_mark(0x110000));
/// ```
SOURCEMETA_CORE_UNICODE_EXPORT
auto is_combining_mark(const char32_t codepoint) noexcept -> bool;

/// @ingroup unicode
/// Determine the byte length of the valid UTF-8 codepoint starting at the
/// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a
Expand Down
8 changes: 8 additions & 0 deletions src/core/unicode/unicode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,12 @@ auto script(const char32_t codepoint) noexcept -> UnicodeScript {
UNICODE_SCRIPT_STAGE2[(page << 10U) | (codepoint & 0x3FFU)]);
}

// Two-stage table lookup: the bits above the low 10 select a page through
// the stage 1 table, and the low 10 bits select the entry within that page
// of the stage 2 table. A non-zero entry means "combining mark".
auto is_combining_mark(const char32_t codepoint) noexcept -> bool {
  // Only values up to U+10FFFF are looked up; anything larger is rejected
  // before either table is indexed
  if (codepoint <= 0x10FFFF) {
    const std::size_t block{IS_COMBINING_MARK_STAGE1[codepoint >> 10U]};
    const auto offset{static_cast<std::size_t>(codepoint & 0x3FFU)};
    return IS_COMBINING_MARK_STAGE2[(block << 10U) | offset] != 0;
  }

  return false;
}

} // namespace sourcemeta::core
3 changes: 2 additions & 1 deletion test/unicode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode
unicode_combining_class_test.cc
unicode_joining_type_test.cc
unicode_bidi_class_test.cc
unicode_script_test.cc)
unicode_script_test.cc
unicode_is_combining_mark_test.cc)

# Link the sourcemeta_core_unicode_unit target privately against the
# unicode library (presumably the test executable declared just above)
target_link_libraries(sourcemeta_core_unicode_unit
PRIVATE sourcemeta::core::unicode)
85 changes: 85 additions & 0 deletions test/unicode/unicode_is_combining_mark_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#include <gtest/gtest.h>

#include <sourcemeta/core/unicode.h>

TEST(Unicode_is_combining_mark, ascii_letter) {
// An ASCII letter must not be classified as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'A'));
}

TEST(Unicode_is_combining_mark, ascii_digit) {
// An ASCII digit must not be classified as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'0'));
}

TEST(Unicode_is_combining_mark, null) {
// Lowest possible input, passed as an integer literal rather than a
// character literal; it must not be classified as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x0000));
}

TEST(Unicode_is_combining_mark, hebrew_letter_alef) {
// U+05D0 is a non-ASCII letter (Hebrew alef, per the test name); it must
// not be classified as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\u05D0'));
}

TEST(Unicode_is_combining_mark, combining_acute_accent_nonspacing) {
// U+0301 has general category Mn (Nonspacing_Mark), so it must be
// classified as a combining mark
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0301'));
}

TEST(Unicode_is_combining_mark, combining_grave_accent_below_nonspacing) {
// U+0316 has general category Mn (Nonspacing_Mark), so it must be
// classified as a combining mark
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0316'));
}

TEST(Unicode_is_combining_mark, devanagari_virama_nonspacing) {
// U+094D has general category Mn (Nonspacing_Mark); covers a nonspacing
// mark outside the combining diacritics used by the earlier cases
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u094D'));
}

TEST(Unicode_is_combining_mark, devanagari_vowel_sign_aa_spacing) {
// U+093E has general category Mc (Spacing_Mark), so it must be
// classified as a combining mark
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u093E'));
}

TEST(Unicode_is_combining_mark, devanagari_sign_visarga_spacing) {
// U+0903 has general category Mc (Spacing_Mark), so it must be
// classified as a combining mark
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0903'));
}

TEST(Unicode_is_combining_mark, tamil_vowel_sign_e_spacing) {
// U+0BC6 has general category Mc (Spacing_Mark), so it must be
// classified as a combining mark
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u0BC6'));
}

TEST(Unicode_is_combining_mark, combining_enclosing_circle_enclosing) {
// U+20DD has general category Me (Enclosing_Mark), so it must be
// classified as a combining mark
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u20DD'));
}

TEST(Unicode_is_combining_mark, combining_enclosing_square_enclosing) {
// U+20DE has general category Me (Enclosing_Mark), so it must be
// classified as a combining mark
EXPECT_TRUE(sourcemeta::core::is_combining_mark(U'\u20DE'));
}

TEST(Unicode_is_combining_mark, zero_width_joiner_not_mark) {
// U+200D is bidi class BN but general category Cf (Format), not a mark.
// Guards against classifying by anything other than general category.
EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\u200D'));
}

TEST(Unicode_is_combining_mark, hangul_letter_not_mark) {
// U+1100 is Lo (Other_Letter), not a mark, so it must not be classified
// as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\u1100'));
}

TEST(Unicode_is_combining_mark, emoji_grinning_face_not_mark) {
// U+1F600 exercises a codepoint above U+FFFF; it must not be classified
// as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(U'\U0001F600'));
}

TEST(Unicode_is_combining_mark, unassigned_codepoint) {
// 0x0E80 stands in for an unassigned codepoint (per the test name); it
// must not be classified as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x0E80));
}

TEST(Unicode_is_combining_mark, max_codepoint) {
// 0x10FFFF is the largest value that still passes the range check and is
// looked up in the tables; it must not be classified as a combining mark
EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x10FFFF));
}

TEST(Unicode_is_combining_mark, above_max_codepoint) {
// 0x110000 is one past 0x10FFFF, so it is rejected by the range check
// without any table lookup and must yield false
EXPECT_FALSE(sourcemeta::core::is_combining_mark(0x110000));
}
1 change: 0 additions & 1 deletion vendor/unicodetools.mask

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading