Skip to content

Commit 629531f

Browse files
Generalize code for Hangul syllables and CJK and Tangut ideographs.
1 parent c09400c commit 629531f

3 files changed

Lines changed: 91 additions & 85 deletions

File tree

Modules/unicodedata.c

Lines changed: 33 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,31 +1052,18 @@ static const char * const hangul_syllables[][3] = {
10521052
{ 0, 0, "H" }
10531053
};
10541054

1055-
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10561055
static int
1057-
is_cjk_unified_ideograph(Py_UCS4 code)
1058-
{
1059-
return
1060-
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1061-
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1062-
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1063-
(0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */
1064-
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1065-
(0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */
1066-
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1067-
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
1068-
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
1069-
(0x31350 <= code && code <= 0x323AF) || /* CJK Ideograph Extension H */
1070-
(0x323B0 <= code && code <= 0x33479); /* CJK Ideograph Extension J */
1071-
}
1072-
1073-
/* These ranges need to match makeunicodedata.py:tangut_ranges. */
1074-
static int
1075-
is_tangut_ideograph(Py_UCS4 code)
1056+
find_prefix_id(Py_UCS4 code)
10761057
{
1077-
return
1078-
(0x17000 <= code && code <= 0x187FF) || /* Tangut */
1079-
(0x18D00 <= code && code <= 0x18D1E); /* Tangut Supplement */
1058+
for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) {
1059+
if (code < derived_name_ranges[i].first) {
1060+
return -1;
1061+
}
1062+
if (code <= derived_name_ranges[i].last) {
1063+
return derived_name_ranges[i].prefixid;
1064+
}
1065+
}
1066+
return -1;
10801067
}
10811068

10821069
/* macros used to determine if the given code point is in the PUA range that
@@ -1354,7 +1341,9 @@ _getucname(PyObject *self,
13541341
}
13551342
}
13561343

1357-
if (SBase <= code && code < SBase+SCount) {
1344+
int prefixid = find_prefix_id(code);
1345+
if (prefixid == 0) {
1346+
assert(SBase <= code && code < SBase+SCount);
13581347
/* Hangul syllable. */
13591348
int SIndex = code - SBase;
13601349
int L = SIndex / NCount;
@@ -1376,19 +1365,11 @@ _getucname(PyObject *self,
13761365
return 1;
13771366
}
13781367

1379-
if (is_cjk_unified_ideograph(code)) {
1380-
if (buflen < 28)
1381-
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1382-
return 0;
1383-
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1384-
return 1;
1385-
}
1386-
1387-
if (is_tangut_ideograph(code)) {
1388-
if (buflen < 23)
1389-
/* Worst case: TANGUT IDEOGRAPH-18D08 */
1368+
if (prefixid > 0) {
1369+
const char *prefix = derived_name_prefixes[prefixid];
1370+
if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) {
13901371
return 0;
1391-
sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
1372+
}
13921373
return 1;
13931374
}
13941375

@@ -1482,8 +1463,19 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14821463
* Named aliases are not resolved, they are returned as a code point in the
14831464
* PUA */
14841465

1485-
/* Check for hangul syllables. */
1486-
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1466+
int i = 0;
1467+
size_t prefixlen;
1468+
for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) {
1469+
const char *prefix = derived_name_prefixes[i];
1470+
prefixlen = strlen(derived_name_prefixes[i]);
1471+
if (PyOS_strnicmp(name, prefix, prefixlen) == 0) {
1472+
break;
1473+
}
1474+
}
1475+
1476+
if (i == 0) {
1477+
/* Hangul syllables. */
1478+
assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0);
14871479
int len, L = -1, V = -1, T = -1;
14881480
const char *pos = name + 16;
14891481
find_syllable(pos, &len, &L, LCount, 0);
@@ -1500,22 +1492,9 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
15001492
return 0;
15011493
}
15021494

1503-
/* Check for CJK unified ideographs. */
1504-
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1505-
/* Four or five hexdigits must follow. */
1506-
Py_UCS4 v = parse_hex_code(name + 22, namelen - 22);
1507-
if (!is_cjk_unified_ideograph(v)) {
1508-
return 0;
1509-
}
1510-
*code = v;
1511-
return 1;
1512-
}
1513-
1514-
/* Check for Tangut ideographs. */
1515-
if (PyOS_strnicmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
1516-
/* Five hexdigits must follow. */
1517-
Py_UCS4 v = parse_hex_code(name + 17, namelen - 17);
1518-
if (!is_tangut_ideograph(v)) {
1495+
if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
1496+
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);
1497+
if (find_prefix_id(v) != i) {
15191498
return 0;
15201499
}
15211500
*code = v;

Modules/unicodename_db.h

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Tools/unicode/makeunicodedata.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -109,25 +109,13 @@
109109
CASED_MASK = 0x2000
110110
EXTENDED_CASE_MASK = 0x4000
111111

112-
# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
113-
cjk_ranges = [
114-
('3400', '4DBF'), # CJK Ideograph Extension A CJK
115-
('4E00', '9FFF'), # CJK Ideograph
116-
('20000', '2A6DF'), # CJK Ideograph Extension B
117-
('2A700', '2B73F'), # CJK Ideograph Extension C
118-
('2B740', '2B81D'), # CJK Ideograph Extension D
119-
('2B820', '2CEAD'), # CJK Ideograph Extension E
120-
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
121-
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
122-
('30000', '3134A'), # CJK Ideograph Extension G
123-
('31350', '323AF'), # CJK Ideograph Extension H
124-
('323B0', '33479'), # CJK Ideograph Extension J
125-
]
126-
127-
# these ranges need to match unicodedata.c:is_tangut_ideograph
128-
tangut_ranges = [
129-
('17000', '187FF'),
130-
('18D00', '18D1E')
112+
# Maps the range names in UnicodeData.txt to prefixes for
113+
# derived names specified by rule NR2.
114+
# Hangul should always be at index 0, since it uses a special format.
115+
derived_name_range_names = [
116+
("Hangul Syllable", "HANGUL SYLLABLE "),
117+
("CJK Ideograph", "CJK UNIFIED IDEOGRAPH-"),
118+
("Tangut Ideograph", "TANGUT IDEOGRAPH-"),
131119
]
132120

133121

@@ -737,6 +725,23 @@ def makeunicodename(unicode, trace):
737725
fprint(' {%d, {%s}},' % (len(sequence), seq_str))
738726
fprint('};')
739727

728+
fprint(dedent("""
729+
typedef struct {
730+
Py_UCS4 first;
731+
Py_UCS4 last;
732+
int prefixid;
733+
} derived_name_range;
734+
"""))
735+
736+
fprint('static const derived_name_range derived_name_ranges[] = {')
737+
for name_range in unicode.derived_name_ranges:
738+
fprint(' {0x%s, 0x%s, %d},' % name_range)
739+
fprint('};')
740+
741+
fprint('static const char * const derived_name_prefixes[] = {')
742+
for _, prefix in derived_name_range_names:
743+
fprint(' "%s",' % prefix)
744+
fprint('};')
740745

741746
def merge_old_version(version, new, old):
742747
# Changes to exclusion file not implemented yet
@@ -959,8 +964,7 @@ def __init__(self, version, ideograph_check=True):
959964
char = int(s[0], 16)
960965
table[char] = from_row(s)
961966

962-
cjk_ranges_found = []
963-
tangut_ranges_found = []
967+
self.derived_name_ranges = []
964968

965969
# expand first-last ranges
966970
field = None
@@ -974,20 +978,15 @@ def __init__(self, version, ideograph_check=True):
974978
s.name = ""
975979
field = dataclasses.astuple(s)[:15]
976980
elif s.name[-5:] == "Last>":
977-
if s.name.startswith("<CJK Ideograph"):
978-
cjk_ranges_found.append((field[0],
979-
s.codepoint))
980-
elif s.name.startswith("<Tangut Ideograph"):
981-
tangut_ranges_found.append((field[0],
982-
s.codepoint))
981+
for j, (rangename, _) in enumerate(derived_name_range_names):
982+
if s.name.startswith("<" + rangename):
983+
self.derived_name_ranges.append(
984+
(field[0], s.codepoint, j))
985+
break
983986
s.name = ""
984987
field = None
985988
elif field:
986989
table[i] = from_row(('%X' % i,) + field[1:])
987-
if ideograph_check and cjk_ranges != cjk_ranges_found:
988-
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
989-
if ideograph_check and tangut_ranges != tangut_ranges_found:
990-
raise ValueError("Tangut ranges deviate: have %r" % tangut_ranges_found)
991990

992991
# public attributes
993992
self.filename = UNICODE_DATA % ''

0 commit comments

Comments
 (0)