diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi
index 711e52c2e370..46e9eac68b24 100644
--- a/mypy/typeshed/stubs/librt/librt/strings.pyi
+++ b/mypy/typeshed/stubs/librt/librt/strings.pyi
@@ -40,3 +40,7 @@ def write_f64_le(b: BytesWriter, n: float, /) -> None: ...
 def write_f64_be(b: BytesWriter, n: float, /) -> None: ...
 def read_f64_le(b: bytes, index: i64, /) -> float: ...
 def read_f64_be(b: bytes, index: i64, /) -> float: ...
+
+# Codepoint classification helpers operating on i32 codepoints (typically
+# obtained via ord(s[i])). Negative inputs return False.
+def isspace(c: i32, /) -> bool: ...
diff --git a/mypyc/ir/deps.py b/mypyc/ir/deps.py
index 751845d3a324..0cf58c83c27b 100644
--- a/mypyc/ir/deps.py
+++ b/mypyc/ir/deps.py
@@ -116,4 +116,5 @@ def get_header(self) -> str:
 STRING_WRITER_EXTRA_OPS: Final = SourceDep("stringwriter_extra_ops.c")
 BYTEARRAY_EXTRA_OPS: Final = SourceDep("bytearray_extra_ops.c")
 STR_EXTRA_OPS: Final = SourceDep("str_extra_ops.c")
+CODEPOINT_EXTRA_OPS: Final = SourceDep("codepoint_extra_ops.c")
 VECS_EXTRA_OPS: Final = SourceDep("vecs_extra_ops.c")
diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c
new file mode 100644
index 000000000000..ca03eba4e6f5
--- /dev/null
+++ b/mypyc/lib-rt/codepoint_extra_ops.c
@@ -0,0 +1,8 @@
+#include "codepoint_extra_ops.h"
+
+// Out-of-line bodies for codepoint helpers that are too large to inline.
+// The classification helpers and the ASCII fast paths for case conversion
+// stay inline in codepoint_extra_ops.h; this file holds the slow paths
+// that round-trip through PyUnicode_FromOrdinal and CPython's Unicode
+// machinery. Currently empty; populated as later commits add
+// isidentifier, toupper, and tolower.
diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h
new file mode 100644
index 000000000000..5633efb0987e
--- /dev/null
+++ b/mypyc/lib-rt/codepoint_extra_ops.h
@@ -0,0 +1,16 @@
+#ifndef MYPYC_CODEPOINT_EXTRA_OPS_H
+#define MYPYC_CODEPOINT_EXTRA_OPS_H
+
+#include <Python.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+// Codepoint helpers for librt.strings.
+// Inputs are signed int32_t for compatibility with mypyc's i32 type.
+// Negative values are treated as non-codepoints and return false.
+
+static inline bool LibRTStrings_IsSpace(int32_t c) {
+    return c >= 0 && Py_UNICODE_ISSPACE((Py_UCS4)c);
+}
+
+#endif // MYPYC_CODEPOINT_EXTRA_OPS_H
diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c
index 3f08b5ef4376..ecde8c527f9d 100644
--- a/mypyc/lib-rt/strings/librt_strings.c
+++ b/mypyc/lib-rt/strings/librt_strings.c
@@ -4,6 +4,7 @@
 #include
 #include
 #include "CPy.h"
+#include "codepoint_extra_ops.h"
 #include "librt_strings.h"
 
 #define CPY_BOOL_ERROR 2
@@ -1153,6 +1154,44 @@ read_f64_be(PyObject *module, PyObject *const *args, size_t nargs) {
     return PyFloat_FromDouble(CPyBytes_ReadF64BEUnsafe(data + index));
 }
 
+// Codepoint classification helpers exposed to interpreted callers.
+// The C-side names are prefixed `cp_` to avoid colliding with libc's
+// isspace / isdigit / etc. Compiled callers go through the
+// LibRTStrings_* static inlines in codepoint_extra_ops.h instead.
+//
+// All wrappers parse a single int argument as i32 (codepoint) and
+// dispatch to the corresponding LibRTStrings_* function. The parse
+// step accepts any int but rejects values outside the i32 range with
+// OverflowError, matching the input domain of the compiled fast path.
+
+// Parse a Python int as i32 codepoint. Returns 0 on success and writes
+// the value to *out; returns -1 on error with a Python exception set.
+static int
+cp_parse_i32(PyObject *arg, int32_t *out) {
+    int overflow;
+    long c = PyLong_AsLongAndOverflow(arg, &overflow);
+    if (c == -1 && PyErr_Occurred())
+        return -1;
+    if (overflow != 0 || c < INT32_MIN || c > INT32_MAX) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "codepoint out of i32 range");
+        return -1;
+    }
+    *out = (int32_t)c;
+    return 0;
+}
+
+#define DEFINE_CP_BOOL_WRAPPER(name, fn) \
+    static PyObject* \
+    cp_##name(PyObject *module, PyObject *arg) { \
+        int32_t c; \
+        if (cp_parse_i32(arg, &c) < 0) \
+            return NULL; \
+        return PyBool_FromLong(fn(c)); \
+    }
+
+DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace)
+
 static PyMethodDef librt_strings_module_methods[] = {
     {"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL,
      PyDoc_STR("Write a 16-bit signed integer to BytesWriter in little-endian format")
@@ -1214,6 +1253,9 @@ static PyMethodDef librt_strings_module_methods[] = {
     {"read_f64_be", (PyCFunction) read_f64_be, METH_FASTCALL,
      PyDoc_STR("Read a 64-bit float from bytes in big-endian format")
     },
+    {"isspace", cp_isspace, METH_O,
+     PyDoc_STR("Test whether a codepoint (i32) is Unicode whitespace.")
+    },
     {NULL, NULL, 0, NULL}
 };
 
diff --git a/mypyc/primitives/librt_strings_ops.py b/mypyc/primitives/librt_strings_ops.py
index 502ab8269e8c..0a4b2515c1ea 100644
--- a/mypyc/primitives/librt_strings_ops.py
+++ b/mypyc/primitives/librt_strings_ops.py
@@ -1,4 +1,9 @@
-from mypyc.ir.deps import BYTES_WRITER_EXTRA_OPS, LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS
+from mypyc.ir.deps import (
+    BYTES_WRITER_EXTRA_OPS,
+    CODEPOINT_EXTRA_OPS,
+    LIBRT_STRINGS,
+    STRING_WRITER_EXTRA_OPS,
+)
 from mypyc.ir.ops import ERR_MAGIC, ERR_MAGIC_OVERLAPPING, ERR_NEVER
 from mypyc.ir.rtypes import (
     bool_rprimitive,
@@ -387,3 +392,15 @@
     error_kind=ERR_NEVER,
     dependencies=[LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS],
 )
+
+
+# Codepoint classification helpers operating on i32 codepoints
+# (typically obtained via ord(s[i])). Negative inputs return False.
+function_op(
+    name="librt.strings.isspace",
+    arg_types=[int32_rprimitive],
+    return_type=bool_rprimitive,
+    c_function_name="LibRTStrings_IsSpace",
+    error_kind=ERR_NEVER,
+    dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
+)
diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test
index 460a109d1d5a..9bb2312b0d88 100644
--- a/mypyc/test-data/irbuild-librt-strings.test
+++ b/mypyc/test-data/irbuild-librt-strings.test
@@ -270,3 +270,78 @@ L1:
 L2:
     r3 = CPyStringWriter_GetItem(s, r0)
     return r3
+
+[case testLibrtStringsIsSpaceIR]
+from librt.strings import isspace
+from mypy_extensions import i32
+
+def is_ws(c: i32) -> bool:
+    return isspace(c)
+[out]
+def is_ws(c):
+    c :: i32
+    r0 :: bool
+L0:
+    r0 = LibRTStrings_IsSpace(c)
+    return r0
+
+[case testLibrtStringsIsSpaceFromStrIndexIR_64bit]
+from librt.strings import isspace
+
+def is_ws_at(s: str, i: int) -> bool:
+    return isspace(ord(s[i]))
+[out]
+def is_ws_at(s, i):
+    s :: str
+    i :: int
+    r0 :: native_int
+    r1 :: bit
+    r2, r3 :: i64
+    r4 :: ptr
+    r5 :: c_ptr
+    r6, r7 :: i64
+    r8, r9 :: bool
+    r10 :: short_int
+    r11, r12 :: bit
+    r13 :: native_int
+    r14, r15 :: i32
+    r16 :: bool
+L0:
+    r0 = i & 1
+    r1 = r0 == 0
+    if r1 goto L1 else goto L2 :: bool
+L1:
+    r2 = i >> 1
+    r3 = r2
+    goto L3
+L2:
+    r4 = i ^ 1
+    r5 = r4
+    r6 = CPyLong_AsInt64(r5)
+    r3 = r6
+    keep_alive i
+L3:
+    r7 = CPyStr_AdjustIndex(s, r3)
+    r8 = CPyStr_RangeCheck(s, r7)
+    if r8 goto L5 else goto L4 :: bool
+L4:
+    r9 = raise IndexError('index out of range')
+    unreachable
+L5:
+    r10 = CPyStr_GetItemUnsafeAsInt(s, r7)
+    r11 = r10 < 4294967296 :: signed
+    if r11 goto L6 else goto L8 :: bool
+L6:
+    r12 = r10 >= -4294967296 :: signed
+    if r12 goto L7 else goto L8 :: bool
+L7:
+    r13 = r10 >> 1
+    r14 = truncate r13: native_int to i32
+    r15 = r14
+    goto L9
+L8:
+    CPyInt32_Overflow()
+    unreachable
+L9:
+    r16 = LibRTStrings_IsSpace(r15)
+    return r16
diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test
index 909766d5c8e7..3c1f686867fb 100644
--- a/mypyc/test-data/run-librt-strings.test
+++ b/mypyc/test-data/run-librt-strings.test
@@ -1439,3 +1439,20 @@ def test_new_without_init_is_usable() -> None:
     assert sw.getvalue() == ""
     sw.write("hello")
     assert sw.getvalue() == "hello"
+
+[case testLibrtStringsIsSpace_librt]
+from typing import Any
+from mypy_extensions import i32
+from librt.strings import isspace
+
+
+def test_isspace() -> None:
+    assert not isspace(i32(-1))
+    assert not isspace(i32(-113))
+    # Verify our codepoint primitive agrees with str.isspace() across all
+    # Unicode codepoints, including the ord(chr(i)) round-trip. Any
+    # forces generic dispatch on the str side.
+    for i in range(0x110000):
+        c = chr(i)
+        a: Any = c
+        assert isspace(ord(c)) == isspace(i) == a.isspace()