diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 3dc0790ca15b41..9a38ca8d89f488 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
+    expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'
 
     @requires_resource('cpu')
     def test_function_checksum(self):
@@ -97,6 +97,50 @@ def test_function_checksum(self):
         result = h.hexdigest()
         self.assertEqual(result, self.expectedchecksum)
 
+    @requires_resource('network')
+    def test_name(self):
+        TESTBASEURL = "https://www.unicode.org/Public"
+        TESTDATAFILE = "extracted/DerivedName.txt"
+        TESTDATAURL = f"{TESTBASEURL}/{unicodedata.unidata_version}/ucd/{TESTDATAFILE}"
+
+        # Hit the exception early
+        try:
+            testdata = open_urlresource(TESTDATAURL, encoding="utf-8")
+        except PermissionError:
+            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
+                          f"into the test data directory")
+        except (OSError, HTTPException) as exc:
+            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")
+
+        with testdata:
+            self.run_name_tests(testdata)
+
+    def run_name_tests(self, testdata):
+        names_ref = {}
+
+        def parse_cp(s):
+            return int(s, 16)
+
+        # Parse data
+        for line in testdata:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            raw_cp, name = line.split("; ")
+            # Check for a range
+            if ".." in raw_cp:
+                cp1, cp2 = map(parse_cp, raw_cp.split(".."))
+                # remove ‘*’ at the end
+                name = name[:-1]
+                for cp in range(cp1, cp2 + 1):
+                    names_ref[cp] = f"{name}{cp:0>4X}"
+            else:
+                cp = parse_cp(raw_cp)
+                names_ref[cp] = name
+
+        for cp in range(0, sys.maxunicode + 1):
+            self.assertEqual(self.db.name(chr(cp), None), names_ref.get(cp))
+
     @requires_resource('cpu')
     def test_name_inverse_lookup(self):
         for i in range(sys.maxunicode + 1):
@@ -104,6 +148,7 @@ def test_name_inverse_lookup(self):
             if looked_name := self.db.name(char, None):
                 self.assertEqual(self.db.lookup(looked_name), char)
 
+
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)
         self.assertEqual(self.db.digit('9'), 9)
diff --git a/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst b/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst
new file mode 100644
index 00000000000000..bbdcb4ffa0998b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst
@@ -0,0 +1 @@
+unicodedata: Fix missing Tangut Ideographs names.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index c1e22f3868931f..7359a2740615e0 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1025,7 +1025,7 @@ static const char * const hangul_syllables[][3] = {
 
 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
 static int
-is_unified_ideograph(Py_UCS4 code)
+is_cjk_unified_ideograph(Py_UCS4 code)
 {
     return
         (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
@@ -1039,6 +1039,15 @@ is_unified_ideograph(Py_UCS4 code)
         (0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
 }
 
+/* These ranges need to match makeunicodedata.py:tangut_ranges. */
+static int
+is_tangut_ideograph(Py_UCS4 code)
+{
+    return
+        (0x17000 <= code && code <= 0x187F7) || /* Tangut */
+        (0x18D00 <= code && code <= 0x18D08); /* Tangut Supplement */
+}
+
 /* macros used to determine if the given code point is in the PUA range that
  * we are using to store aliases and named sequences */
 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
@@ -1098,7 +1107,7 @@ _getucname(PyObject *self,
         return 1;
     }
 
-    if (is_unified_ideograph(code)) {
+    if (is_cjk_unified_ideograph(code)) {
         if (buflen < 28)
             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
             return 0;
@@ -1106,6 +1115,14 @@ _getucname(PyObject *self,
         return 1;
     }
 
+    if (is_tangut_ideograph(code)) {
+        if (buflen < 23)
+            /* Worst case: TANGUT IDEOGRAPH-18D08 */
+            return 0;
+        sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
+        return 1;
+    }
+
     /* get offset into phrasebook */
     offset = phrasebook_offset1[(code>>phrasebook_shift)];
     offset = phrasebook_offset2[(offset<= '0' && *name <= '9')
+                v += *name - '0';
+            else if (*name >= 'A' && *name <= 'F')
+                v += *name - 'A' + 10;
+            else
+                return 0;
+            name++;
+        }
+        if (!is_tangut_ideograph(v))
             return 0;
         *code = v;
         return 1;
     }
+
     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 034642db06e48b..6c69ba2b946709 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -99,7 +99,7 @@
 CASED_MASK = 0x2000
 EXTENDED_CASE_MASK = 0x4000
 
-# these ranges need to match unicodedata.c:is_unified_ideograph
+# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
 cjk_ranges = [
     ('3400', '4DBF'),
     ('4E00', '9FFF'),
@@ -112,6 +112,12 @@
     ('31350', '323AF'),
 ]
 
+# these ranges need to match unicodedata.c:is_tangut_ideograph
+tangut_ranges = [
+    ('17000', '187F7'),
+    ('18D00', '18D08')
+]
+
 
 def maketables(trace=0):
 
@@ -123,7 +129,7 @@ def maketables(trace=0):
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(version, cjk_check=False)
+        old_unicode = UnicodeData(version, ideograph_check=False)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
 
@@ -1020,7 +1026,7 @@ def from_row(row: List[str]) -> UcdRecord:
 class UnicodeData:
     # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned
 
-    def __init__(self, version, cjk_check=True):
+    def __init__(self, version, ideograph_check=True):
         self.changed = []
         table = [None] * 0x110000
         for s in UcdFile(UNICODE_DATA, version):
@@ -1028,6 +1034,7 @@ def __init__(self, version, cjk_check=True):
             table[char] = from_row(s)
 
         cjk_ranges_found = []
+        tangut_ranges_found = []
 
         # expand first-last ranges
         field = None
@@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True):
 if s.name.startswith("