diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi index 01aee3ff758d..7a028f9e7859 100644 --- a/mypy/typeshed/stubs/librt/librt/strings.pyi +++ b/mypy/typeshed/stubs/librt/librt/strings.pyi @@ -47,3 +47,4 @@ def isspace(c: i32, /) -> bool: ... def isdigit(c: i32, /) -> bool: ... def isalnum(c: i32, /) -> bool: ... def isalpha(c: i32, /) -> bool: ... +def isidentifier(c: i32, /) -> bool: ... diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c index ca03eba4e6f5..3eba41727d25 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.c +++ b/mypyc/lib-rt/codepoint_extra_ops.c @@ -1,8 +1,6 @@ +// All codepoint helper bodies live in codepoint_extra_ops.h as static +// inline. This translation unit exists so the header is pulled into +// mypyc-compiled extensions via SourceDep("codepoint_extra_ops.c") in +// mypyc/ir/deps.py (which, in include_runtime_files mode, emits +// an `#include` of this .c file into the generated __native.c). #include "codepoint_extra_ops.h" - -// Out-of-line bodies for codepoint helpers that are too large to inline. -// The classification helpers and the ASCII fast paths for case conversion -// stay inline in codepoint_extra_ops.h; this file holds the slow paths -// that round-trip through PyUnicode_FromOrdinal and CPython's Unicode -// machinery. Currently empty; populated as later commits add -// isidentifier, toupper, and tolower. diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h index bb83f92e4b87..8d7201fdd70a 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.h +++ b/mypyc/lib-rt/codepoint_extra_ops.h @@ -4,6 +4,7 @@ #include #include #include +#include "CPy.h" // Codepoint helpers for librt.strings. // Inputs are signed int32_t for compatibility with mypyc's i32 type. 
@@ -25,4 +26,26 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) { return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); } +// True if c could start a valid identifier (XID_Start or `_`, i.e. what +// str.isidentifier reports for a 1-character string). Values outside +// [0, 0x10FFFF] return false up front: PyUnicode_FromOrdinal rejects them +// with ValueError, which must not be mistaken for OOM. ASCII is decided +// inline; non-ASCII delegates to PyUnicode_IsIdentifier (PEP 3131). +// Allocation failure aborts via CPyError_OutOfMemory, keeping ERR_NEVER. +static inline bool LibRTStrings_IsIdentifier(int32_t c) { + if (c < 0 || c > 0x10FFFF) return false; + if (c < 128) { + return (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_'; + } + PyObject *s = PyUnicode_FromOrdinal((int)c); + if (s == NULL) { + CPyError_OutOfMemory(); + } + int r = PyUnicode_IsIdentifier(s); + Py_DECREF(s); + return r == 1; +} + #endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c index cbc3e5f753fa..62b4edffcd7f 100644 --- a/mypyc/lib-rt/strings/librt_strings.c +++ b/mypyc/lib-rt/strings/librt_strings.c @@ -1194,6 +1194,7 @@ DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace) DEFINE_CP_BOOL_WRAPPER(isdigit, LibRTStrings_IsDigit) DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum) DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha) +DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier) static PyMethodDef librt_strings_module_methods[] = { {"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL, @@ -1268,6 +1269,9 @@ static PyMethodDef librt_strings_module_methods[] = { {"isalpha", cp_isalpha, METH_O, PyDoc_STR("Test whether a codepoint (i32) is a Unicode letter.") }, + {"isidentifier", cp_isidentifier, METH_O, + PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).") + }, {NULL, NULL, 0, NULL} }; diff --git a/mypyc/primitives/librt_strings_ops.py b/mypyc/primitives/librt_strings_ops.py index 93fa717cf529..312d5a16195b 100644 --- 
a/mypyc/primitives/librt_strings_ops.py +++ b/mypyc/primitives/librt_strings_ops.py @@ -431,3 +431,15 @@ error_kind=ERR_NEVER, dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], ) + +# isidentifier checks XID_Start semantics for a single codepoint, matching +# str.isidentifier() on a 1-character string. The non-ASCII path allocates; +# on OOM the C helper aborts via CPyError_OutOfMemory, so it stays ERR_NEVER. +function_op( + name="librt.strings.isidentifier", + arg_types=[int32_rprimitive], + return_type=bool_rprimitive, + c_function_name="LibRTStrings_IsIdentifier", + error_kind=ERR_NEVER, + dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], +) diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index e5d18b6eb852..e3aaa49bd6f9 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -387,3 +387,17 @@ def is_a(c): L0: r0 = LibRTStrings_IsAlpha(c) return r0 + +[case testLibrtStringsIsIdentifierIR] +from librt.strings import isidentifier +from mypy_extensions import i32 + +def is_id(c: i32) -> bool: + return isidentifier(c) +[out] +def is_id(c): + c :: i32 + r0 :: bool +L0: + r0 = LibRTStrings_IsIdentifier(c) + return r0 diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index aa38c713d384..0a3320ff6522 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -1443,7 +1443,7 @@ def test_new_without_init_is_usable() -> None: [case testLibrtStringsCodepointClassifiers_librt] from typing import Any from mypy_extensions import i32 -from librt.strings import isspace, isdigit, isalnum, isalpha +from librt.strings import isspace, isdigit, isalnum, isalpha, isidentifier from testutil import assertRaises @@ -1455,6 +1455,7 @@ def test_codepoint_classifiers() -> None: assert not isdigit(bad) assert not isalnum(bad) assert not isalpha(bad) + assert not isidentifier(bad) # Verify each codepoint primitive 
agrees with the matching str method # across all Unicode codepoints, including the ord(chr(i)) round-trip. # Any forces generic dispatch on the str side. @@ -1466,6 +1467,7 @@ def test_codepoint_classifiers() -> None: assert isdigit(o) == isdigit(i) == a.isdigit() assert isalnum(o) == isalnum(i) == a.isalnum() assert isalpha(o) == isalpha(i) == a.isalpha() + assert isidentifier(o) == isidentifier(i) == a.isidentifier() def test_codepoint_classifiers_via_any() -> None: @@ -1476,6 +1478,7 @@ def test_codepoint_classifiers_via_any() -> None: (isdigit, "5", "a"), (isalnum, "A", " "), (isalpha, "A", " "), + (isidentifier, "A", "0"), ): f: Any = fn assert f(ord(true_input)) is True