Skip to content

Commit 46b00c3

Browse files
committed
Add PyUnstable APIs
1 parent 77eebaf commit 46b00c3

File tree

6 files changed

+117
-55
lines changed

6 files changed

+117
-55
lines changed

Doc/c-api/unicode.rst

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,7 @@ These APIs can be used for fast direct character conversions:
328328
possible. This function does not raise exceptions.
329329
330330
331-
.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
331+
.. c:function:: Py_ssize_t PyUnstable_UCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
332332
333333
Convert *ch* to lower case, store result in *buffer*, which should be
334334
able to hold as many characters as needed for *ch* to be lower cased, and
@@ -341,7 +341,7 @@ These APIs can be used for fast direct character conversions:
341341
.. versionadded:: next
342342
343343
344-
.. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
344+
.. c:function:: Py_ssize_t PyUnstable_UCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
345345
346346
Convert *ch* to upper case, store result in *buffer*, which should be
347347
able to hold as many characters as needed for *ch* to be upper cased, and
@@ -354,7 +354,7 @@ These APIs can be used for fast direct character conversions:
354354
.. versionadded:: next
355355
356356
357-
.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
357+
.. c:function:: Py_ssize_t PyUnstable_UCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
358358
359359
Convert *ch* to title case, store result in *buffer*, which should be
360360
able to hold as many characters as needed for *ch* to be title cased, and
@@ -367,7 +367,7 @@ These APIs can be used for fast direct character conversions:
367367
.. versionadded:: next
368368
369369
370-
.. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
370+
.. c:function:: Py_ssize_t PyUnstable_UCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
371371
372372
Foldcase *ch*, store result in *buffer*, which should be
373373
able to hold as many characters as needed for *ch* to be foldcased, and
@@ -379,11 +379,12 @@ These APIs can be used for fast direct character conversions:
379379
380380
.. versionadded:: next
381381
382-
.. c:macro:: PyUCS4_CASE_CONVERSION_BUFFER_SIZE
382+
.. c:macro:: PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE
383383
384-
The minimum buffer size needed for any call to :c:func:`PyUCS4_ToLower`,
385-
:c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle`, or
386-
:c:func:`PyUCS4_ToFolded`. That is, ``3`` for Unicode 16.0.
384+
The minimum buffer size needed for any call to
385+
:c:func:`PyUnstable_UCS4_ToLower`, :c:func:`PyUnstable_UCS4_ToUpper`,
386+
:c:func:`PyUnstable_UCS4_ToTitle`, or :c:func:`PyUnstable_UCS4_ToFolded`.
387+
That is, ``3`` for Unicode 16.0.
387388
388389
.. versionadded:: next
389390

Include/cpython/unicodeobject.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -732,25 +732,25 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
732732
Py_UCS4 ch /* Unicode character */
733733
);
734734

735-
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower(
735+
PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToLower(
736736
Py_UCS4 ch, /* Unicode character */
737737
Py_UCS4 *res, /* Output buffer */
738738
Py_ssize_t size /* Buffer size */
739739
);
740740

741-
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper(
741+
PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToUpper(
742742
Py_UCS4 ch, /* Unicode character */
743743
Py_UCS4 *res, /* Output buffer */
744744
Py_ssize_t size /* Buffer size */
745745
);
746746

747-
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle(
747+
PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToTitle(
748748
Py_UCS4 ch, /* Unicode character */
749749
Py_UCS4 *res, /* Output buffer */
750750
Py_ssize_t size /* Buffer size */
751751
);
752752

753-
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded(
753+
PyAPI_FUNC(Py_ssize_t) PyUnstable_UCS4_ToFolded(
754754
Py_UCS4 ch, /* Unicode character */
755755
Py_UCS4 *res, /* Output buffer */
756756
Py_ssize_t size /* Buffer size */
@@ -791,7 +791,7 @@ static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
791791

792792
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
793793

794-
#define PyUCS4_CASE_CONVERSION_BUFFER_SIZE 3
794+
#define PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE 3
795795

796796
static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
797797
return (Py_UNICODE_ISALPHA(ch)

Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst

Lines changed: 0 additions & 1 deletion
This file was deleted.

Modules/_testcapi/unicode.c

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -244,45 +244,48 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *,
244244
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars);
245245
}
246246

247-
/* Test PyUCS4_ToLower() */
247+
/* Test PyUnstable_UCS4_ToLower() */
248248
static PyObject *
249249
unicode_tolower(PyObject *self, PyObject *arg)
250250
{
251-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
252-
return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
251+
Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE];
252+
return unicode_case_operation(arg, PyUnstable_UCS4_ToLower,
253+
buf, PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE);
253254
}
254255

255256

256-
/* Test PyUCS4_ToUpper() */
257+
/* Test PyUnstable_UCS4_ToUpper() */
257258
static PyObject *
258259
unicode_toupper(PyObject *self, PyObject *arg)
259260
{
260-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
261-
return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
261+
Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE];
262+
return unicode_case_operation(arg, PyUnstable_UCS4_ToUpper,
263+
buf, PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE);
262264
}
263265

264-
/* Test PyUCS4_ToUpper() with a small buffer */
266+
/* Test PyUnstable_UCS4_ToUpper() with a small buffer */
265267
static PyObject *
266268
unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg)
267269
{
268270
Py_UCS4 buf;
269-
return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1);
271+
return unicode_case_operation(arg, PyUnstable_UCS4_ToUpper, &buf, 1);
270272
}
271273

272-
/* Test PyUCS4_ToLower() */
274+
/* Test PyUnstable_UCS4_ToTitle() */
273275
static PyObject *
274276
unicode_totitle(PyObject *self, PyObject *arg)
275277
{
276-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
277-
return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
278+
Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE];
279+
return unicode_case_operation(arg, PyUnstable_UCS4_ToTitle,
280+
buf, PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE);
278281
}
279282

280-
/* Test PyUCS4_ToLower() */
283+
/* Test PyUnstable_UCS4_ToFolded() */
281284
static PyObject *
282285
unicode_tofolded(PyObject *self, PyObject *arg)
283286
{
284-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
285-
return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
287+
Py_UCS4 buf[PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE];
288+
return unicode_case_operation(arg, PyUnstable_UCS4_ToFolded, buf, PyUnstable_UCS4_CASE_CONVERSION_BUFFER_SIZE);
286289
}
287290

288291

Objects/unicodectype.c

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
199199
return ch + ctype->lower;
200200
}
201201

202-
Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
202+
Py_ssize_t PyUnstable_UCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
203203
{
204204
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
205205

@@ -225,7 +225,23 @@ Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
225225
return 1;
226226
}
227227

228-
Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
228+
/* Store the lower-case mapping of ch in res and return the number of
   code points written.  Unlike PyUnstable_UCS4_ToLower() this variant takes
   no buffer size: nothing here checks the capacity of res, so the caller
   must supply a buffer large enough for the whole mapping. */
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
229+
{
230+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
231+
232+
/* Extended case: ctype->lower is a packed descriptor, not a delta. */
if (ctype->flags & EXTENDED_CASE_MASK) {
233+
/* Low 16 bits: start offset into _PyUnicode_ExtendedCase. */
int index = ctype->lower & 0xFFFF;
234+
/* Bits 24 and up: number of code points to copy. */
int n = ctype->lower >> 24;
235+
int i;
236+
for (i = 0; i < n; i++)
237+
res[i] = _PyUnicode_ExtendedCase[index + i];
238+
return n;
239+
}
240+
/* Simple case: ctype->lower is an offset added to ch. */
res[0] = ch + ctype->lower;
241+
return 1;
242+
}
243+
244+
Py_ssize_t PyUnstable_UCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
229245
{
230246
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
231247

@@ -251,7 +267,23 @@ Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
251267
return 1;
252268
}
253269

254-
Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
270+
/* Store the title-case mapping of ch in res and return the number of
   code points written.  No buffer size is passed in and none is checked, so
   the caller must supply a buffer large enough for the whole mapping. */
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
271+
{
272+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
273+
274+
/* Extended case: ctype->title is a packed descriptor, not a delta. */
if (ctype->flags & EXTENDED_CASE_MASK) {
275+
/* Low 16 bits: start offset into _PyUnicode_ExtendedCase. */
int index = ctype->title & 0xFFFF;
276+
/* Bits 24 and up: number of code points to copy. */
int n = ctype->title >> 24;
277+
int i;
278+
for (i = 0; i < n; i++)
279+
res[i] = _PyUnicode_ExtendedCase[index + i];
280+
return n;
281+
}
282+
/* Simple case: ctype->title is an offset added to ch. */
res[0] = ch + ctype->title;
283+
return 1;
284+
}
285+
286+
Py_ssize_t PyUnstable_UCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
255287
{
256288
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
257289

@@ -277,7 +309,23 @@ Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
277309
return 1;
278310
}
279311

280-
Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
312+
/* Store the upper-case mapping of ch in res and return the number of
   code points written.  No buffer size is passed in and none is checked, so
   the caller must supply a buffer large enough for the whole mapping. */
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
313+
{
314+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
315+
316+
/* Extended case: ctype->upper is a packed descriptor, not a delta. */
if (ctype->flags & EXTENDED_CASE_MASK) {
317+
/* Low 16 bits: start offset into _PyUnicode_ExtendedCase. */
int index = ctype->upper & 0xFFFF;
318+
/* Bits 24 and up: number of code points to copy. */
int n = ctype->upper >> 24;
319+
int i;
320+
for (i = 0; i < n; i++)
321+
res[i] = _PyUnicode_ExtendedCase[index + i];
322+
return n;
323+
}
324+
/* Simple case: ctype->upper is an offset added to ch. */
res[0] = ch + ctype->upper;
325+
return 1;
326+
}
327+
328+
Py_ssize_t PyUnstable_UCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
281329
{
282330
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
283331

@@ -295,7 +343,22 @@ Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size)
295343
return n;
296344
}
297345

298-
return PyUCS4_ToLower(ch, res, size);
346+
return PyUnstable_UCS4_ToLower(ch, res, size);
347+
}
348+
349+
/* Store the case-folded form of ch in res and return the number of code
   points written.  When the type record carries no separate folding entry,
   the result is whatever _PyUnicode_ToLowerFull() produces.  No buffer size
   is passed in and none is checked, so the caller must supply a buffer large
   enough for the whole mapping. */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
350+
{
351+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
352+
353+
/* Parses as (flags & EXTENDED_CASE_MASK) && ((lower >> 20) & 7): the
   3-bit field at bits 20-22 of ctype->lower is the folded length, and a
   value of zero means there is no dedicated folding entry. */
if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
354+
/* Start offset = lower-case offset (low 16 bits) plus the lower-case
   length (bits 24 and up). */
int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
355+
int n = (ctype->lower >> 20) & 7;
356+
int i;
357+
for (i = 0; i < n; i++)
358+
res[i] = _PyUnicode_ExtendedCase[index + i];
359+
return n;
360+
}
361+
/* No dedicated folding entry: fall back to the lower-case mapping. */
return _PyUnicode_ToLowerFull(ch, res);
299362
}
300363

301364
int _PyUnicode_IsCased(Py_UCS4 ch)

Objects/unicodeobject.c

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9766,35 +9766,34 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i
97669766
return (final_sigma) ? 0x3C2 : 0x3C3;
97679767
}
97689768

9769-
static Py_ssize_t
9769+
static int
97709770
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9771-
Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size)
9771+
Py_UCS4 c, Py_UCS4 *mapped)
97729772
{
97739773
/* Obscure special case. */
97749774
if (c == 0x3A3) {
97759775
mapped[0] = handle_capital_sigma(kind, data, length, i);
97769776
return 1;
97779777
}
9778-
return PyUCS4_ToLower(c, mapped, mapped_size);
9778+
return _PyUnicode_ToLowerFull(c, mapped);
97799779
}
97809780

97819781
static Py_ssize_t
97829782
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
97839783
{
9784-
Py_ssize_t i, k = 0, n_res, j;
9784+
Py_ssize_t i, k = 0;
9785+
int n_res, j;
97859786
Py_UCS4 c, mapped[3];
97869787

97879788
c = PyUnicode_READ(kind, data, 0);
9788-
n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped));
9789-
assert(n_res >= 1);
9789+
n_res = _PyUnicode_ToTitleFull(c, mapped);
97909790
for (j = 0; j < n_res; j++) {
97919791
*maxchar = Py_MAX(*maxchar, mapped[j]);
97929792
res[k++] = mapped[j];
97939793
}
97949794
for (i = 1; i < length; i++) {
97959795
c = PyUnicode_READ(kind, data, i);
9796-
n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped));
9797-
assert(n_res >= 1);
9796+
n_res = lower_ucs4(kind, data, length, i, c, mapped);
97989797
for (j = 0; j < n_res; j++) {
97999798
*maxchar = Py_MAX(*maxchar, mapped[j]);
98009799
res[k++] = mapped[j];
@@ -9809,18 +9808,17 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
98099808

98109809
for (i = 0; i < length; i++) {
98119810
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9812-
Py_ssize_t n_res, j;
9811+
int n_res, j;
98139812
if (Py_UNICODE_ISUPPER(c)) {
9814-
n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped));
9813+
n_res = lower_ucs4(kind, data, length, i, c, mapped);
98159814
}
98169815
else if (Py_UNICODE_ISLOWER(c)) {
9817-
n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped));
9816+
n_res = _PyUnicode_ToUpperFull(c, mapped);
98189817
}
98199818
else {
98209819
n_res = 1;
98219820
mapped[0] = c;
98229821
}
9823-
assert(n_res >= 1);
98249822
for (j = 0; j < n_res; j++) {
98259823
*maxchar = Py_MAX(*maxchar, mapped[j]);
98269824
res[k++] = mapped[j];
@@ -9837,12 +9835,11 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
98379835

98389836
for (i = 0; i < length; i++) {
98399837
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9840-
Py_ssize_t n_res, j;
9838+
int n_res, j;
98419839
if (lower)
9842-
n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped));
9840+
n_res = lower_ucs4(kind, data, length, i, c, mapped);
98439841
else
9844-
n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped));
9845-
assert(n_res >= 1);
9842+
n_res = _PyUnicode_ToUpperFull(c, mapped);
98469843
for (j = 0; j < n_res; j++) {
98479844
*maxchar = Py_MAX(*maxchar, mapped[j]);
98489845
res[k++] = mapped[j];
@@ -9871,8 +9868,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
98719868
for (i = 0; i < length; i++) {
98729869
Py_UCS4 c = PyUnicode_READ(kind, data, i);
98739870
Py_UCS4 mapped[3];
9874-
Py_ssize_t j, n_res = PyUCS4_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped));
9875-
assert(n_res >= 1);
9871+
int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
98769872
for (j = 0; j < n_res; j++) {
98779873
*maxchar = Py_MAX(*maxchar, mapped[j]);
98789874
res[k++] = mapped[j];
@@ -9891,13 +9887,13 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m
98919887
for (i = 0; i < length; i++) {
98929888
const Py_UCS4 c = PyUnicode_READ(kind, data, i);
98939889
Py_UCS4 mapped[3];
9894-
Py_ssize_t n_res, j;
9890+
int n_res, j;
98959891

98969892
if (previous_is_cased)
9897-
n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped));
9893+
n_res = lower_ucs4(kind, data, length, i, c, mapped);
98989894
else
9899-
n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped));
9900-
assert(n_res >= 1);
9895+
n_res = _PyUnicode_ToTitleFull(c, mapped);
9896+
99019897
for (j = 0; j < n_res; j++) {
99029898
*maxchar = Py_MAX(*maxchar, mapped[j]);
99039899
res[k++] = mapped[j];

0 commit comments

Comments
 (0)