Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,12 @@ def test_decomposition(self):
# New in 17.0.0
self.assertEqual(self.db.decomposition('\uA7F1'), '' if self.old else '<super> 0053')

# Hangul characters
self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
self.assertEqual(self.db.decomposition('\uAC01'), '1100 1161 11A8')
self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')

self.assertRaises(TypeError, self.db.decomposition)
self.assertRaises(TypeError, self.db.decomposition, 'xx')

Expand Down Expand Up @@ -687,9 +693,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = ('668dbbea1136e69d4f00677a5988b23bc78aefc6'
expectedchecksum = ('00b13fa975a60b1d3f490f1fc8c126ab24990c75'
if quicktest else
'b869af769bd8fe352c04622ab90533dc54df5cf3')
'ebfc9dd281c2226998fd435744dd2e9321899beb')

@requires_resource('network')
def test_all_names(self):
Expand Down Expand Up @@ -977,9 +983,9 @@ def graphemes(*args):
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
old = True
expectedchecksum = ('2164a66700e03cba9c9f5ed9e9a8d594d2da136a'
expectedchecksum = ('cb5bbbd1f55b67371e18222b90a8e21c87f16b72'
if quicktest else
'a8276cec9b6991779c5bdaa46c1ae7cc50bc2403')
'74936dffe949d99203a47e6a66565b2fc337bae7')


class UnicodeMiscTest(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix :func:`unicodedata.decomposition` for precomposed Hangul syllables: it now returns their algorithmic decomposition into conjoining jamo.
41 changes: 30 additions & 11 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,17 @@
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

// Constants for the arithmetic Hangul syllable decomposition.
// A precomposed syllable S is split into a leading consonant (L), a vowel (V)
// and an optional trailing consonant (T), all derived from S - SBase.
// Shared by the decomposition lookup and by NFD/NFKD normalization.

// First precomposed Hangul syllable; the block covers [SBase, SBase + SCount).
#define SBase 0xAC00
// First leading-consonant jamo.
#define LBase 0x1100
// First vowel jamo.
#define VBase 0x1161
// One below the first trailing-consonant jamo, so that a trailing index of 0
// (T == TBase) means "no trailing consonant".
#define TBase 0x11A7
// Number of leading consonants.
#define LCount 19
// Number of vowels.
#define VCount 21
// Number of trailing-consonant slots, including the "none" slot at index 0.
#define TCount 28
// Syllables per leading consonant (vowel x trailing combinations).
#define NCount (VCount*TCount)
// Total number of precomposed Hangul syllables.
#define SCount (LCount*NCount)

/*[clinic input]
@permit_long_summary
unicodedata.UCD.decomposition
Expand Down Expand Up @@ -460,6 +471,23 @@
return Py_GetConstant(Py_CONSTANT_EMPTY_STR); /* unassigned */
}

// Hangul Decomposition.
// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
if (SBase <= code && code < (SBase + SCount)) {
int SIndex = code - SBase;
int L = LBase + SIndex / NCount;
int V = VBase + (SIndex % NCount) / TCount;
int T = TBase + SIndex % TCount;
if (T != TBase) {
PyOS_snprintf(decomp, sizeof(decomp),
"%04X %04X %04X", L, V, T);
} else {
PyOS_snprintf(decomp, sizeof(decomp),
"%04X %04X", L, V);
}
return PyUnicode_FromString(decomp);
}

if (code < 0 || code >= 0x110000)
index = 0;
else {
Expand Down Expand Up @@ -522,16 +550,6 @@
(*index)++;
}

#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
Expand Down Expand Up @@ -585,7 +603,8 @@
}
output = new_output;
}
/* Hangul Decomposition. */
// Hangul Decomposition.
// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
if (SBase <= code && code < (SBase+SCount)) {
int SIndex = code - SBase;
int L = LBase + SIndex / NCount;
Expand Down Expand Up @@ -1493,7 +1512,7 @@
}

if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);

Check warning on line 1515 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / Build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1515 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / Build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1515 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / Build and test (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [C:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1515 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / Build and test (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [C:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]
if (find_prefix_id(v) != i) {
return 0;
}
Expand Down
Loading