Browse Source
bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603)
The bug occurred when an encoded surrogate character was passed
to the incremental decoder in two chunks.
pull/12630/head
Serhiy Storchaka
7 years ago
committed by
GitHub
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with
14 additions and
0 deletions
-
Lib/test/test_codecs.py
-
Misc/NEWS.d/next/Core and Builtins/2019-03-28-15-22-45.bpo-24214.tZ6lYU.rst
-
Objects/unicodeobject.c
|
|
|
@@ -406,6 +406,15 @@ class ReadTest(MixInCheckStateHandling): |
|
|
|
self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"), |
|
|
|
before + backslashreplace + after) |
|
|
|
|
|
|
|
def test_incremental_surrogatepass(self): |
|
|
|
# Test incremental decoder for surrogatepass handler: |
|
|
|
# see issue #24214 |
|
|
|
data = '\uD901'.encode(self.encoding, 'surrogatepass') |
|
|
|
for i in range(1, len(data)): |
|
|
|
dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass') |
|
|
|
self.assertEqual(dec.decode(data[:i]), '') |
|
|
|
self.assertEqual(dec.decode(data[i:], True), '\uD901') |
|
|
|
|
|
|
|
|
|
|
|
class UTF32Test(ReadTest, unittest.TestCase): |
|
|
|
encoding = "utf-32" |
|
|
|
|
|
|
|
@@ -0,0 +1,2 @@ |
|
|
|
Fixed support of the surrogatepass error handler in the UTF-8 incremental |
|
|
|
decoder. |
|
|
|
@@ -4883,6 +4883,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s, |
|
|
|
case 2: |
|
|
|
case 3: |
|
|
|
case 4: |
|
|
|
if (s == end || consumed) { |
|
|
|
goto End; |
|
|
|
} |
|
|
|
errmsg = "invalid continuation byte"; |
|
|
|
startinpos = s - starts; |
|
|
|
endinpos = startinpos + ch - 1; |
|
|
|
|