From 8b0557ab495908cbae7a3e280945318966d4d12d Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Fri, 8 May 2026 17:00:03 +0300 Subject: [PATCH 1/6] [mypyc] Add `librt.strings.isspace` codepoint primitive Adds a codepoint-taking `librt.strings.isspace(c: i32) -> bool` that wraps `Py_UNICODE_ISSPACE`. Combined with the existing `ord(s[i])` specialization (#20578), this lets per-character hot loops avoid the 1-character `PyUnicode` materialization that `s[i].isspace()` forces. Microbenchmark (counting whitespace in a 12 KB SQL fragment, 5000 iterations): mypyc-compiled `s[i].isspace()` takes 0.075 ms; the codepoint path `c: i32 = i32(ord(s[i])); isspace(c)` takes 0.034 ms, roughly 2.2x faster. Wins compound for tokenizer-shaped workloads mixing classification and literal compares. --- mypy/typeshed/stubs/librt/librt/strings.pyi | 4 +++ mypyc/ir/deps.py | 1 + mypyc/lib-rt/codepoint_extra_ops.h | 16 +++++++++ mypyc/lib-rt/strings/librt_strings.c | 38 +++++++++++++++++++++ mypyc/primitives/librt_strings_ops.py | 19 ++++++++++- mypyc/test-data/irbuild-librt-strings.test | 14 ++++++++ mypyc/test-data/run-librt-strings.test | 17 +++++++++ 7 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 mypyc/lib-rt/codepoint_extra_ops.h diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi index 711e52c2e3700..46e9eac68b24f 100644 --- a/mypy/typeshed/stubs/librt/librt/strings.pyi +++ b/mypy/typeshed/stubs/librt/librt/strings.pyi @@ -40,3 +40,7 @@ def write_f64_le(b: BytesWriter, n: float, /) -> None: ... def write_f64_be(b: BytesWriter, n: float, /) -> None: ... def read_f64_le(b: bytes, index: i64, /) -> float: ... def read_f64_be(b: bytes, index: i64, /) -> float: ... + +# Codepoint classification helpers operating on i32 codepoints (typically +# obtained via ord(s[i])). Negative inputs return False. +def isspace(c: i32, /) -> bool: ... 
diff --git a/mypyc/ir/deps.py b/mypyc/ir/deps.py index 751845d3a324c..be79bb222f833 100644 --- a/mypyc/ir/deps.py +++ b/mypyc/ir/deps.py @@ -116,4 +116,5 @@ def get_header(self) -> str: STRING_WRITER_EXTRA_OPS: Final = SourceDep("stringwriter_extra_ops.c") BYTEARRAY_EXTRA_OPS: Final = SourceDep("bytearray_extra_ops.c") STR_EXTRA_OPS: Final = SourceDep("str_extra_ops.c") +CODEPOINT_EXTRA_OPS: Final = HeaderDep("codepoint_extra_ops.h") VECS_EXTRA_OPS: Final = SourceDep("vecs_extra_ops.c") diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h new file mode 100644 index 0000000000000..5633efb0987ee --- /dev/null +++ b/mypyc/lib-rt/codepoint_extra_ops.h @@ -0,0 +1,16 @@ +#ifndef MYPYC_CODEPOINT_EXTRA_OPS_H +#define MYPYC_CODEPOINT_EXTRA_OPS_H + +#include <Python.h> +#include <stdbool.h> +#include <stdint.h> + +// Codepoint helpers for librt.strings. +// Inputs are signed int32_t for compatibility with mypyc's i32 type. +// Negative values are treated as non-codepoints and return false. + +static inline bool LibRTStrings_IsSpace(int32_t c) { + return c >= 0 && Py_UNICODE_ISSPACE((Py_UCS4)c); +} + +#endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c index 3f08b5ef43766..8158d6460dbd4 100644 --- a/mypyc/lib-rt/strings/librt_strings.c +++ b/mypyc/lib-rt/strings/librt_strings.c @@ -4,6 +4,7 @@ #include #include #include "CPy.h" +#include "codepoint_extra_ops.h" #include "librt_strings.h" #define CPY_BOOL_ERROR 2 @@ -1153,6 +1154,40 @@ read_f64_be(PyObject *module, PyObject *const *args, size_t nargs) { return PyFloat_FromDouble(CPyBytes_ReadF64BEUnsafe(data + index)); } +// Codepoint classification helpers exposed to interpreted callers. +// The C-side names are prefixed `cp_` to avoid colliding with libc's +// isspace / isdigit / etc. Compiled callers go through the +// LibRTStrings_* static inlines in codepoint_extra_ops.h instead. 
+// +// All wrappers parse a single int argument as i32 (codepoint) and +// dispatch to the corresponding LibRTStrings_* function. The parse +// step accepts any int but rejects values outside the i32 range with +// OverflowError, matching the input domain of the compiled fast path. + +#define CP_PARSE_I32(arg, var) \ + int32_t var; \ + do { \ + int _overflow; \ + long _c = PyLong_AsLongAndOverflow((arg), &_overflow); \ + if (_c == -1 && PyErr_Occurred()) \ + return NULL; \ + if (_overflow != 0 || _c < INT32_MIN || _c > INT32_MAX) { \ + PyErr_SetString(PyExc_OverflowError, \ + "codepoint out of i32 range"); \ + return NULL; \ + } \ + (var) = (int32_t)_c; \ + } while (0) + +#define DEFINE_CP_BOOL_WRAPPER(name, fn) \ + static PyObject* \ + cp_##name(PyObject *module, PyObject *arg) { \ + CP_PARSE_I32(arg, c); \ + return PyBool_FromLong(fn(c)); \ + } + +DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace) + static PyMethodDef librt_strings_module_methods[] = { {"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL, PyDoc_STR("Write a 16-bit signed integer to BytesWriter in little-endian format") @@ -1214,6 +1249,9 @@ static PyMethodDef librt_strings_module_methods[] = { {"read_f64_be", (PyCFunction) read_f64_be, METH_FASTCALL, PyDoc_STR("Read a 64-bit float from bytes in big-endian format") }, + {"isspace", cp_isspace, METH_O, + PyDoc_STR("Test whether a codepoint (i32) is Unicode whitespace.") + }, {NULL, NULL, 0, NULL} }; diff --git a/mypyc/primitives/librt_strings_ops.py b/mypyc/primitives/librt_strings_ops.py index 502ab8269e8c4..0a4b2515c1ea8 100644 --- a/mypyc/primitives/librt_strings_ops.py +++ b/mypyc/primitives/librt_strings_ops.py @@ -1,4 +1,9 @@ -from mypyc.ir.deps import BYTES_WRITER_EXTRA_OPS, LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS +from mypyc.ir.deps import ( + BYTES_WRITER_EXTRA_OPS, + CODEPOINT_EXTRA_OPS, + LIBRT_STRINGS, + STRING_WRITER_EXTRA_OPS, +) from mypyc.ir.ops import ERR_MAGIC, ERR_MAGIC_OVERLAPPING, ERR_NEVER from mypyc.ir.rtypes import 
( bool_rprimitive, @@ -387,3 +392,15 @@ error_kind=ERR_NEVER, dependencies=[LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS], ) + + +# Codepoint classification helpers operating on i32 codepoints +# (typically obtained via ord(s[i])). Negative inputs return False. +function_op( + name="librt.strings.isspace", + arg_types=[int32_rprimitive], + return_type=bool_rprimitive, + c_function_name="LibRTStrings_IsSpace", + error_kind=ERR_NEVER, + dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], +) diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index 460a109d1d5ac..989f15802954f 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -270,3 +270,17 @@ L1: L2: r3 = CPyStringWriter_GetItem(s, r0) return r3 + +[case testLibrtStringsIsSpaceIR] +from librt.strings import isspace +from mypy_extensions import i32 + +def is_ws(c: i32) -> bool: + return isspace(c) +[out] +def is_ws(c): + c :: i32 + r0 :: bool +L0: + r0 = LibRTStrings_IsSpace(c) + return r0 diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 909766d5c8e74..940b6be1d1a2d 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -1439,3 +1439,20 @@ def test_new_without_init_is_usable() -> None: assert sw.getvalue() == "" sw.write("hello") assert sw.getvalue() == "hello" + +[case testLibrtStringsIsSpace] +from typing import Any +from mypy_extensions import i32 +from librt.strings import isspace + + +def test_isspace() -> None: + assert not isspace(i32(-1)) + assert not isspace(i32(-113)) + # Verify our codepoint primitive agrees with str.isspace() across all + # Unicode codepoints, including the ord(chr(i)) round-trip. Any + # forces generic dispatch on the str side. 
+ for i in range(0x110000): + c = chr(i) + a: Any = c + assert isspace(ord(c)) == isspace(i) == a.isspace() From b731d010e3f1d6e79ca81242ff4cd89a63ab8165 Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Mon, 11 May 2026 15:48:34 +0300 Subject: [PATCH 2/6] Fix CI --- mypyc/ir/deps.py | 2 +- mypyc/lib-rt/codepoint_extra_ops.c | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 mypyc/lib-rt/codepoint_extra_ops.c diff --git a/mypyc/ir/deps.py b/mypyc/ir/deps.py index be79bb222f833..0cf58c83c27bf 100644 --- a/mypyc/ir/deps.py +++ b/mypyc/ir/deps.py @@ -116,5 +116,5 @@ def get_header(self) -> str: STRING_WRITER_EXTRA_OPS: Final = SourceDep("stringwriter_extra_ops.c") BYTEARRAY_EXTRA_OPS: Final = SourceDep("bytearray_extra_ops.c") STR_EXTRA_OPS: Final = SourceDep("str_extra_ops.c") -CODEPOINT_EXTRA_OPS: Final = HeaderDep("codepoint_extra_ops.h") +CODEPOINT_EXTRA_OPS: Final = SourceDep("codepoint_extra_ops.c") VECS_EXTRA_OPS: Final = SourceDep("vecs_extra_ops.c") diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c new file mode 100644 index 0000000000000..ca03eba4e6f51 --- /dev/null +++ b/mypyc/lib-rt/codepoint_extra_ops.c @@ -0,0 +1,8 @@ +#include "codepoint_extra_ops.h" + +// Out-of-line bodies for codepoint helpers that are too large to inline. +// The classification helpers and the ASCII fast paths for case conversion +// stay inline in codepoint_extra_ops.h; this file holds the slow paths +// that round-trip through PyUnicode_FromOrdinal and CPython's Unicode +// machinery. Currently empty; populated as later commits add +// isidentifier, toupper, and tolower. From a2071d22f89c33066d7c4f6c8e3f867912d86edf Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Mon, 11 May 2026 17:17:59 +0300 Subject: [PATCH 3/6] Tag isspace test with _librt so cached librt is used Without the _librt suffix, has_test_name_tag returns False and the test imports the installed PyPI librt 0.11.0, which lacks isspace. 
--- mypyc/test-data/run-librt-strings.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index 940b6be1d1a2d..3c1f686867fbd 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -1440,7 +1440,7 @@ def test_new_without_init_is_usable() -> None: sw.write("hello") assert sw.getvalue() == "hello" -[case testLibrtStringsIsSpace] +[case testLibrtStringsIsSpace_librt] from typing import Any from mypy_extensions import i32 from librt.strings import isspace From f080e8b095ac56d52034305a52683a746ea12ece Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Thu, 14 May 2026 12:21:04 +0300 Subject: [PATCH 4/6] Address PR review: convert CP_PARSE_I32 macro to function, extend IR test - Replace CP_PARSE_I32 macro with cp_parse_i32 static function since arg is always PyObject* and the macro added no value. - Add isspace(ord(s[i])) case to testLibrtStringsIsSpaceIR to cover the realistic str-index -> ord -> isspace lowering chain. --- mypyc/lib-rt/strings/librt_strings.c | 34 +++++++------ mypyc/test-data/irbuild-librt-strings.test | 58 ++++++++++++++++++++++ 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c index 8158d6460dbd4..ecde8c527f9d4 100644 --- a/mypyc/lib-rt/strings/librt_strings.c +++ b/mypyc/lib-rt/strings/librt_strings.c @@ -1164,25 +1164,29 @@ read_f64_be(PyObject *module, PyObject *const *args, size_t nargs) { // step accepts any int but rejects values outside the i32 range with // OverflowError, matching the input domain of the compiled fast path. 
-#define CP_PARSE_I32(arg, var) \ - int32_t var; \ - do { \ - int _overflow; \ - long _c = PyLong_AsLongAndOverflow((arg), &_overflow); \ - if (_c == -1 && PyErr_Occurred()) \ - return NULL; \ - if (_overflow != 0 || _c < INT32_MIN || _c > INT32_MAX) { \ - PyErr_SetString(PyExc_OverflowError, \ - "codepoint out of i32 range"); \ - return NULL; \ - } \ - (var) = (int32_t)_c; \ - } while (0) +// Parse a Python int as i32 codepoint. Returns 0 on success and writes +// the value to *out; returns -1 on error with a Python exception set. +static int +cp_parse_i32(PyObject *arg, int32_t *out) { + int overflow; + long c = PyLong_AsLongAndOverflow(arg, &overflow); + if (c == -1 && PyErr_Occurred()) + return -1; + if (overflow != 0 || c < INT32_MIN || c > INT32_MAX) { + PyErr_SetString(PyExc_OverflowError, + "codepoint out of i32 range"); + return -1; + } + *out = (int32_t)c; + return 0; +} #define DEFINE_CP_BOOL_WRAPPER(name, fn) \ static PyObject* \ cp_##name(PyObject *module, PyObject *arg) { \ - CP_PARSE_I32(arg, c); \ + int32_t c; \ + if (cp_parse_i32(arg, &c) < 0) \ + return NULL; \ return PyBool_FromLong(fn(c)); \ } diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index 989f15802954f..794be378a15f4 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -277,6 +277,9 @@ from mypy_extensions import i32 def is_ws(c: i32) -> bool: return isspace(c) + +def is_ws_at(s: str, i: int) -> bool: + return isspace(ord(s[i])) [out] def is_ws(c): c :: i32 @@ -284,3 +287,58 @@ def is_ws(c): L0: r0 = LibRTStrings_IsSpace(c) return r0 +def is_ws_at(s, i): + s :: str + i :: int + r0 :: native_int + r1 :: bit + r2, r3 :: i64 + r4 :: ptr + r5 :: c_ptr + r6, r7 :: i64 + r8, r9 :: bool + r10 :: short_int + r11, r12 :: bit + r13 :: native_int + r14, r15 :: i32 + r16 :: bool +L0: + r0 = i & 1 + r1 = r0 == 0 + if r1 goto L1 else goto L2 :: bool +L1: + r2 = i >> 1 + r3 = r2 + goto L3 +L2: + 
r4 = i ^ 1 + r5 = r4 + r6 = CPyLong_AsInt64(r5) + r3 = r6 + keep_alive i +L3: + r7 = CPyStr_AdjustIndex(s, r3) + r8 = CPyStr_RangeCheck(s, r7) + if r8 goto L5 else goto L4 :: bool +L4: + r9 = raise IndexError('index out of range') + unreachable +L5: + r10 = CPyStr_GetItemUnsafeAsInt(s, r7) + r11 = r10 < 4294967296 :: signed + if r11 goto L6 else goto L8 :: bool +L6: + r12 = r10 >= -4294967296 :: signed + if r12 goto L7 else goto L8 :: bool +L7: + r13 = r10 >> 1 + r14 = truncate r13: native_int to i32 + r15 = r14 + goto L9 +L8: + CPyInt32_Overflow() + unreachable +L9: + r16 = LibRTStrings_IsSpace(r15) + return r16 + From 07cd55737d93a0e17385804f484419cf827c748d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 May 2026 09:32:48 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mypyc/test-data/irbuild-librt-strings.test | 1 - 1 file changed, 1 deletion(-) diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index 794be378a15f4..6d2cf93cd1918 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -341,4 +341,3 @@ L8: L9: r16 = LibRTStrings_IsSpace(r15) return r16 - From 7e17358e8b21aad494b33c7a72c84823dc8de4a9 Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Thu, 14 May 2026 13:27:56 +0300 Subject: [PATCH 6/6] Split isspace IR test: 64-bit-only case for str-index path The is_ws_at(s, i: int) IR differs between 32-bit and 64-bit (short int tag width). Move that case into its own _64bit-suffixed test so the 32-bit CI job skips it, while is_ws(c: i32) still runs everywhere. 
--- mypyc/test-data/irbuild-librt-strings.test | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index 6d2cf93cd1918..9bb2312b0d88a 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -277,9 +277,6 @@ from mypy_extensions import i32 def is_ws(c: i32) -> bool: return isspace(c) - -def is_ws_at(s: str, i: int) -> bool: - return isspace(ord(s[i])) [out] def is_ws(c): c :: i32 @@ -287,6 +284,13 @@ def is_ws(c): L0: r0 = LibRTStrings_IsSpace(c) return r0 + +[case testLibrtStringsIsSpaceFromStrIndexIR_64bit] +from librt.strings import isspace + +def is_ws_at(s: str, i: int) -> bool: + return isspace(ord(s[i])) +[out] def is_ws_at(s, i): s :: str i :: int