Skip to content

Commit 8a8ba5b

Browse files
committed
unicodedata: Fix Tangut Ideograph names
1 parent 90d85a9 commit 8a8ba5b

File tree

3 files changed, with 61 additions and 6 deletions.

Lib/test/test_unicodedata.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
7272

7373
# Update this if the database changes. Make sure to do a full rebuild
7474
# (e.g. 'make distclean && make') to get the correct checksum.
75-
expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
75+
expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'
7676

7777
@requires_resource('cpu')
7878
def test_function_checksum(self):

Modules/unicodedata.c

Lines changed: 47 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1031,7 +1031,7 @@ static const char * const hangul_syllables[][3] = {
10311031

10321032
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10331033
static int
1034-
is_unified_ideograph(Py_UCS4 code)
1034+
is_cjk_unified_ideograph(Py_UCS4 code)
10351035
{
10361036
return
10371037
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
@@ -1045,6 +1045,15 @@ is_unified_ideograph(Py_UCS4 code)
10451045
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
10461046
}
10471047

1048+
/* These ranges need to match makeunicodedata.py:tangut_ranges. */
1049+
static int
1050+
is_tangut_ideograph(Py_UCS4 code)
1051+
{
1052+
return
1053+
(0x17000 <= code && code <= 0x187F7) || /* Tangut */
1054+
(0x18D00 <= code && code <= 0x18D08); /* Tangut Supplement */
1055+
}
1056+
10481057
/* macros used to determine if the given code point is in the PUA range that
10491058
* we are using to store aliases and named sequences */
10501059
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
@@ -1104,14 +1113,22 @@ _getucname(PyObject *self,
11041113
return 1;
11051114
}
11061115

1107-
if (is_unified_ideograph(code)) {
1116+
if (is_cjk_unified_ideograph(code)) {
11081117
if (buflen < 28)
11091118
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
11101119
return 0;
11111120
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
11121121
return 1;
11131122
}
11141123

1124+
if (is_tangut_ideograph(code)) {
1125+
if (buflen < 23)
1126+
/* Worst case: TANGUT IDEOGRAPH-18D08 */
1127+
return 0;
1128+
sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
1129+
return 1;
1130+
}
1131+
11151132
/* get offset into phrasebook */
11161133
offset = phrasebook_offset1[(code>>phrasebook_shift)];
11171134
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@@ -1242,7 +1259,7 @@ _getcode(PyObject* self,
12421259
return 0;
12431260
}
12441261

1245-
/* Check for unified ideographs. */
1262+
/* Check for CJK unified ideographs. */
12461263
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
12471264
/* Four or five hexdigits must follow. */
12481265
v = 0;
@@ -1260,12 +1277,38 @@ _getcode(PyObject* self,
12601277
return 0;
12611278
name++;
12621279
}
1263-
if (!is_unified_ideograph(v))
1280+
if (!is_cjk_unified_ideograph(v))
1281+
return 0;
1282+
*code = v;
1283+
return 1;
1284+
}
1285+
1286+
1287+
/* Check for Tangut ideographs. */
1288+
if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
1289+
/* Five hexdigits must follow. */
1290+
v = 0;
1291+
name += 17;
1292+
namelen -= 17;
1293+
if (namelen != 5)
1294+
return 0;
1295+
while (namelen--) {
1296+
v *= 16;
1297+
if (*name >= '0' && *name <= '9')
1298+
v += *name - '0';
1299+
else if (*name >= 'A' && *name <= 'F')
1300+
v += *name - 'A' + 10;
1301+
else
1302+
return 0;
1303+
name++;
1304+
}
1305+
if (!is_tangut_ideograph(v))
12641306
return 0;
12651307
*code = v;
12661308
return 1;
12671309
}
12681310

1311+
12691312
/* the following is the same as python's dictionary lookup, with
12701313
only minor changes. see the makeunicodedata script for more
12711314
details */

Tools/unicode/makeunicodedata.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@
9999
CASED_MASK = 0x2000
100100
EXTENDED_CASE_MASK = 0x4000
101101

102-
# these ranges need to match unicodedata.c:is_unified_ideograph
102+
# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
103103
cjk_ranges = [
104104
('3400', '4DBF'),
105105
('4E00', '9FFF'),
@@ -112,6 +112,12 @@
112112
('31350', '323AF'),
113113
]
114114

115+
# these ranges need to match unicodedata.c:is_tangut_ideograph
116+
tangut_ranges = [
117+
('17000', '187F7'),
118+
('18D00', '18D08')
119+
]
120+
115121

116122
def maketables(trace=0):
117123

@@ -1028,6 +1034,7 @@ def __init__(self, version, cjk_check=True):
10281034
table[char] = from_row(s)
10291035

10301036
cjk_ranges_found = []
1037+
tangut_ranges_found = []
10311038

10321039
# expand first-last ranges
10331040
field = None
@@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True):
10441051
if s.name.startswith("<CJK Ideograph"):
10451052
cjk_ranges_found.append((field[0],
10461053
s.codepoint))
1054+
elif s.name.startswith("<Tangut Ideograph"):
1055+
tangut_ranges_found.append((field[0],
1056+
s.codepoint))
10471057
s.name = ""
10481058
field = None
10491059
elif field:
10501060
table[i] = from_row(('%X' % i,) + field[1:])
10511061
if cjk_check and cjk_ranges != cjk_ranges_found:
10521062
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
1063+
if cjk_check and tangut_ranges != tangut_ranges_found:
1064+
raise ValueError("Tangut ranges deviate: have %r" % tangut_ranges_found)
10531065

10541066
# public attributes
10551067
self.filename = UNICODE_DATA % ''

0 commit comments

Comments
 (0)