unicodedata: Fix Tangut Ideograph names

wismill · wismill · commit 8a8ba5b5011a · 2023-02-05T19:46:33.000+01:00
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -72,7 +72,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
+    expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'
 
     @requires_resource('cpu')
     def test_function_checksum(self):
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
@@ -1031,7 +1031,7 @@ static const char * const hangul_syllables[][3] = {
 
 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
 static int
-is_unified_ideograph(Py_UCS4 code)
+is_cjk_unified_ideograph(Py_UCS4 code)
 {
     return
         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
@@ -1045,6 +1045,15 @@ is_unified_ideograph(Py_UCS4 code)
         (0x31350 <= code && code <= 0x323AF);   /* CJK Ideograph Extension H */
 }
 
+/* Is code a Tangut ideograph?  Must match makeunicodedata.py:tangut_ranges. */
+static int
+is_tangut_ideograph(Py_UCS4 code)
+{
+    return
+        (0x17000 <= code && code <= 0x187F7) || /* Tangut */
+        (0x18D00 <= code && code <= 0x18D08);   /* Tangut Supplement */
+}
+
 /* macros used to determine if the given code point is in the PUA range that
  * we are using to store aliases and named sequences */
 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
@@ -1104,14 +1113,22 @@ _getucname(PyObject *self,
         return 1;
     }
 
-    if (is_unified_ideograph(code)) {
+    if (is_cjk_unified_ideograph(code)) {
         if (buflen < 28)
             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
             return 0;
         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
         return 1;
     }
 
+    if (is_tangut_ideograph(code)) {
+        if (buflen < 23)
+            /* Worst case: TANGUT IDEOGRAPH-18D08 */
+            return 0;
+        sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
+        return 1;
+    }
+
     /* get offset into phrasebook */
     offset = phrasebook_offset1[(code>>phrasebook_shift)];
     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@@ -1242,7 +1259,7 @@ _getcode(PyObject* self,
         return 0;
     }
 
-    /* Check for unified ideographs. */
+    /* Check for CJK unified ideographs. */
     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
         /* Four or five hexdigits must follow. */
         v = 0;
@@ -1260,12 +1277,38 @@ _getcode(PyObject* self,
                 return 0;
             name++;
         }
-        if (!is_unified_ideograph(v))
+        if (!is_cjk_unified_ideograph(v))
+            return 0;
+        *code = v;
+        return 1;
+    }
+
+
+    /* Check for Tangut ideographs. */
+    if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
+        /* Five hexdigits must follow. */
+        v = 0;
+        name += 17;
+        namelen -= 17;
+        if (namelen != 5)
+            return 0;
+        while (namelen--) {
+            v *= 16;
+            if (*name >= '0' && *name <= '9')
+                v += *name - '0';
+            else if (*name >= 'A' && *name <= 'F')
+                v += *name - 'A' + 10;
+            else
+                return 0;
+            name++;
+        }
+        if (!is_tangut_ideograph(v))
             return 0;
         *code = v;
         return 1;
     }
 
+
     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
@@ -99,7 +99,7 @@
 CASED_MASK = 0x2000
 EXTENDED_CASE_MASK = 0x4000
 
-# these ranges need to match unicodedata.c:is_unified_ideograph
+# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
 cjk_ranges = [
     ('3400', '4DBF'),
     ('4E00', '9FFF'),
@@ -112,6 +112,12 @@
     ('31350', '323AF'),
 ]
 
+# these ranges need to match unicodedata.c:is_tangut_ideograph
+tangut_ranges = [
+    ('17000', '187F7'),
+    ('18D00', '18D08'),
+]
+
 
 def maketables(trace=0):
 
@@ -1028,6 +1034,7 @@ def __init__(self, version, cjk_check=True):
             table[char] = from_row(s)
 
         cjk_ranges_found = []
+        tangut_ranges_found = []
 
         # expand first-last ranges
         field = None
@@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True):
                     if s.name.startswith("<CJK Ideograph"):
                         cjk_ranges_found.append((field[0],
                                                  s.codepoint))
+                    elif s.name.startswith("<Tangut Ideograph"):
+                        tangut_ranges_found.append((field[0],
+                                                    s.codepoint))
                     s.name = ""
                     field = None
             elif field:
                 table[i] = from_row(('%X' % i,) + field[1:])
         if cjk_check and cjk_ranges != cjk_ranges_found:
             raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
+        if cjk_check and tangut_ranges != tangut_ranges_found:
+            raise ValueError("Tangut ranges deviate: have %r" % tangut_ranges_found)
 
         # public attributes
         self.filename = UNICODE_DATA % ''