Browse Source
bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)
pull/20057/head
Serhiy Storchaka
6 years ago
committed by
GitHub
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with
31 additions and
4 deletions
-
Lib/test/test_unicode.py
-
Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst
-
Objects/unicodeobject.c
|
|
|
@ -720,6 +720,13 @@ class UnicodeTest(string_tests.CommonTest, |
|
|
|
self.assertFalse("©".isidentifier()) |
|
|
|
self.assertFalse("0".isidentifier()) |
|
|
|
|
|
|
|
@support.cpython_only
def test_isidentifier_legacy(self):
    """Check str.isidentifier() on a non-BMP identifier in both forms.

    The same text is tested twice: first as an ordinary str literal,
    then as the object returned by _testcapi.unicode_legacy_string().
    Both must be reported as valid identifiers.
    """
    import _testcapi

    # Every character here lies outside the Basic Multilingual Plane.
    identifier = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
    self.assertTrue(identifier.isidentifier())

    # Build the legacy copy only after the plain check, as the original did.
    legacy_identifier = _testcapi.unicode_legacy_string(identifier)
    self.assertTrue(legacy_identifier.isidentifier())
|
|
|
|
|
|
|
def test_isprintable(self): |
|
|
|
self.assertTrue("".isprintable()) |
|
|
|
self.assertTrue(" ".isprintable()) |
|
|
|
|
|
|
|
@ -0,0 +1,2 @@ |
|
|
|
Fixed :meth:`str.isidentifier` for non-canonicalized strings containing |
|
|
|
non-BMP characters on Windows. |
|
|
|
@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self) |
|
|
|
return len && i == len; |
|
|
|
} |
|
|
|
else { |
|
|
|
Py_ssize_t i, len = PyUnicode_GET_SIZE(self); |
|
|
|
Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self); |
|
|
|
if (len == 0) { |
|
|
|
/* an empty string is not a valid identifier */ |
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
const wchar_t *wstr = _PyUnicode_WSTR(self); |
|
|
|
Py_UCS4 ch = wstr[0]; |
|
|
|
Py_UCS4 ch = wstr[i++]; |
|
|
|
#if SIZEOF_WCHAR_T == 2 |
|
|
|
if (Py_UNICODE_IS_HIGH_SURROGATE(ch) |
|
|
|
&& i < len |
|
|
|
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) |
|
|
|
{ |
|
|
|
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); |
|
|
|
i++; |
|
|
|
} |
|
|
|
#endif |
|
|
|
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { |
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
for (i = 1; i < len; i++) { |
|
|
|
ch = wstr[i]; |
|
|
|
while (i < len) { |
|
|
|
ch = wstr[i++]; |
|
|
|
#if SIZEOF_WCHAR_T == 2 |
|
|
|
if (Py_UNICODE_IS_HIGH_SURROGATE(ch) |
|
|
|
&& i < len |
|
|
|
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) |
|
|
|
{ |
|
|
|
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); |
|
|
|
i++; |
|
|
|
} |
|
|
|
#endif |
|
|
|
if (!_PyUnicode_IsXidContinue(ch)) { |
|
|
|
return 0; |
|
|
|
} |
|
|
|
|