Browse Source

bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.
pull/1468/head
Serhiy Storchaka 9 years ago
committed by GitHub
parent
commit
898ff03e1e
  1. 5
      Doc/library/re.rst
  2. 12
      Lib/re.py
  3. 24
      Lib/sre_compile.py
  4. 10
      Lib/sre_constants.py
  5. 32
      Lib/test/test_re.py
  6. 4
      Misc/NEWS
  7. 3
      Modules/_sre.c
  8. 5
      Modules/sre_constants.h
  9. 69
      Modules/sre_lib.h

5
Doc/library/re.rst

@ -559,6 +559,11 @@ form.
:const:`re.LOCALE` can be used only with bytes patterns and is
not compatible with :const:`re.ASCII`.
.. versionchanged:: 3.7
Compiled regular expression objects with the :const:`re.LOCALE` flag no
longer depend on the locale at compile time. Only the locale at
matching time affects the result of matching.
.. data:: M
MULTILINE

12
Lib/re.py

@ -268,9 +268,7 @@ _MAXCACHE = 512
def _compile(pattern, flags):
# internal: compile pattern
try:
p, loc = _cache[type(pattern), pattern, flags]
if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
return p
return _cache[type(pattern), pattern, flags]
except KeyError:
pass
if isinstance(pattern, _pattern_type):
@ -284,13 +282,7 @@ def _compile(pattern, flags):
if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE:
_cache.clear()
if p.flags & LOCALE:
if not _locale:
return p
loc = _locale.setlocale(_locale.LC_CTYPE)
else:
loc = None
_cache[type(pattern), pattern, flags] = p, loc
_cache[type(pattern), pattern, flags] = p
return p
@functools.lru_cache(_MAXCACHE)

24
Lib/sre_compile.py

@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
fixes = None
for op, av in pattern:
if op in LITERAL_CODES:
if flags & SRE_FLAG_IGNORECASE:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
emit(av)
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOC_IGNORE[op])
emit(av)
else:
lo = _sre.getlower(av, flags)
if fixes and lo in fixes:
emit(IN_IGNORE)
@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
else:
emit(OP_IGNORE[op])
emit(lo)
else:
emit(op)
emit(av)
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
emit(OP_IGNORE[op])
def fixup(literal, flags=flags):
return _sre.getlower(literal, flags)
else:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
fixup = None
elif flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
fixup = None
else:
emit(IN_IGNORE)
def fixup(literal, flags=flags):
return _sre.getlower(literal, flags)
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup, fixes)
code[skip] = _len(code) - skip

10
Lib/sre_constants.py

@ -13,7 +13,7 @@
# update when constants are added or removed
MAGIC = 20140917
MAGIC = 20170530
from _sre import MAXREPEAT, MAXGROUPS
@ -87,6 +87,9 @@ OPCODES = _makecodes("""
SUBPATTERN
MIN_REPEAT_ONE
RANGE_IGNORE
LITERAL_LOC_IGNORE
NOT_LITERAL_LOC_IGNORE
IN_LOC_IGNORE
MIN_REPEAT MAX_REPEAT
""")
@ -124,6 +127,11 @@ OP_IGNORE = {
RANGE: RANGE_IGNORE,
}
# Maps the plain literal opcodes to their *_LOC_IGNORE counterparts.
# The compiler looks an opcode up here when both SRE_FLAG_IGNORECASE and
# SRE_FLAG_LOCALE are set, instead of lowercasing the literal at compile time.
OP_LOC_IGNORE = {
LITERAL: LITERAL_LOC_IGNORE,
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE

32
Lib/test/test_re.py

@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
# Compiles bytes patterns with re.L|re.I while LC_CTYPE is en_US.iso88591,
# then matches the same compiled objects under en_US.iso88591 and under
# en_US.utf8, expecting different results in the two locales.
def test_locale_compiled(self):
# Remember the current LC_CTYPE setting and register its restoration as cleanup.
oldlocale = locale.setlocale(locale.LC_CTYPE)
self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
# Try to activate each locale used below; skip the test if one cannot be set.
for loc in 'en_US.iso88591', 'en_US.utf8':
try:
locale.setlocale(locale.LC_CTYPE, loc)
except locale.Error:
# Unsupported locale on this system
self.skipTest('test needs %s locale' % loc)
# All four patterns are compiled while en_US.iso88591 is the active locale.
locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
p1 = re.compile(b'\xc5\xe5', re.L|re.I)
p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
# Under en_US.iso88591 the bytes \xc5 and \xe5 are expected to match each
# other in p1-p3, whichever of the two appears in the input.
for p in p1, p2, p3:
self.assertTrue(p.match(b'\xc5\xe5'))
self.assertTrue(p.match(b'\xe5\xe5'))
self.assertTrue(p.match(b'\xc5\xc5'))
# The negated sets in p4 are expected to reject all three inputs here.
self.assertIsNone(p4.match(b'\xe5\xc5'))
self.assertIsNone(p4.match(b'\xe5\xe5'))
self.assertIsNone(p4.match(b'\xc5\xc5'))
# Switch the locale without recompiling: the same pattern objects are now
# expected to treat \xc5 and \xe5 as distinct bytes.
locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
for p in p1, p2, p3:
self.assertTrue(p.match(b'\xc5\xe5'))
self.assertIsNone(p.match(b'\xe5\xe5'))
self.assertIsNone(p.match(b'\xc5\xc5'))
# With the bytes distinct, p4 accepts only the input whose bytes both
# differ from the excluded ones at their positions.
self.assertTrue(p4.match(b'\xe5\xc5'))
self.assertIsNone(p4.match(b'\xe5\xe5'))
self.assertIsNone(p4.match(b'\xc5\xc5'))
def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')

4
Misc/NEWS

@ -317,6 +317,10 @@ Extension Modules
Library
-------
- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no
longer depend on the locale at compile time. Only the locale at matching
time affects the result of matching.
- bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process
when Ctrl-C is received.

3
Modules/_sre.c

@ -1588,6 +1588,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_NOT_LITERAL:
case SRE_OP_LITERAL_IGNORE:
case SRE_OP_NOT_LITERAL_IGNORE:
case SRE_OP_LITERAL_LOC_IGNORE:
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
GET_ARG;
/* The arg is just a character, nothing to check */
break;
@ -1625,6 +1627,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_IN:
case SRE_OP_IN_IGNORE:
case SRE_OP_IN_LOC_IGNORE:
GET_SKIP;
/* Stop 1 before the end; we check the FAILURE below */
if (!_validate_charset(code, code+skip-2))

5
Modules/sre_constants.h

@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/
#define SRE_MAGIC 20140917
#define SRE_MAGIC 20170530
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@ -45,6 +45,9 @@
#define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_OP_RANGE_IGNORE 32
#define SRE_OP_LITERAL_LOC_IGNORE 33
#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
#define SRE_OP_IN_LOC_IGNORE 35
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2

69
Modules/sre_lib.h

@ -100,6 +100,14 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
return 0;
}
/* Compare the subject character ch against the pattern character while
   ignoring case through the match state's lower/upper callbacks.
   Returns 1 when ch equals pattern as-is, or when state->lower(ch) or
   state->upper(ch) equals pattern; returns 0 otherwise. */
LOCAL(int)
SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
{
    /* Exact hit: no case mapping needed. */
    if (ch == pattern)
        return 1;
    /* Try the lowercased form before the uppercased one, as the callbacks
       are only invoked when the cheaper checks have failed. */
    if ((SRE_CODE) state->lower(ch) == pattern)
        return 1;
    return (SRE_CODE) state->upper(ch) == pattern;
}
LOCAL(int)
SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
@ -187,6 +195,18 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
}
}
/* Test whether ch is a member of the character set when case is ignored
   through the match state's lower/upper callbacks.  The lowercased form
   is looked up first; the uppercased form is looked up only when it
   differs from the lowercased one.  Returns 1 on a hit, 0 otherwise. */
LOCAL(int)
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
{
    SRE_CODE lower_ch, upper_ch;

    lower_ch = state->lower(ch);
    if (SRE(charset)(state, set, lower_ch))
        return 1;

    upper_ch = state->upper(ch);
    /* Same code as the lowercased form: the set was already searched. */
    if (upper_ch == lower_ch)
        return 0;
    return SRE(charset)(state, set, upper_ch) ? 1 : 0;
}
LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
LOCAL(Py_ssize_t)
@ -247,6 +267,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;
case SRE_OP_LITERAL_LOC_IGNORE:
/* repeated literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
ptr++;
break;
case SRE_OP_NOT_LITERAL:
/* repeated non-literal */
chr = pattern[1];
@ -269,6 +297,14 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
/* repeated non-literal */
chr = pattern[1];
TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
ptr++;
break;
default:
/* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
@ -651,7 +687,17 @@ entrance:
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end ||
state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
state->lower(*ctx->ptr) != *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;
case SRE_OP_LITERAL_LOC_IGNORE:
TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, ctx->pattern[0]));
if (ctx->ptr >= end
|| !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@ -661,7 +707,17 @@ entrance:
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end ||
state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
state->lower(*ctx->ptr) == *ctx->pattern)
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;
case SRE_OP_NOT_LITERAL_LOC_IGNORE:
TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
ctx->pattern, ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end
|| SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
@ -677,6 +733,15 @@ entrance:
ctx->ptr++;
break;
case SRE_OP_IN_LOC_IGNORE:
TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
if (ctx->ptr >= end
|| !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
RETURN_FAILURE;
ctx->pattern += ctx->pattern[0];
ctx->ptr++;
break;
case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */

Loading…
Cancel
Save