bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603)

The bug occurred when the encoded surrogate character was passed to the incremental decoder in two chunks.
7 years ago · 7a465cb5ee
3 changed files with 14 additions and 0 deletions
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -406,6 +406,15 @@ class ReadTest(MixInCheckStateHandling):
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

+    def test_incremental_surrogatepass(self):
+        # Test incremental decoder for surrogatepass handler:
+        # see issue #24214
+        data = '\uD901'.encode(self.encoding, 'surrogatepass')
+        for i in range(1, len(data)):
+            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
+            self.assertEqual(dec.decode(data[:i]), '')
+            self.assertEqual(dec.decode(data[i:], True), '\uD901')
+

 class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
--- a/Builtins/2019-03-28-15-22-45.bpo-24214.tZ6lYU.rst
+++ b/Builtins/2019-03-28-15-22-45.bpo-24214.tZ6lYU.rst
@@ -0,0 +1,2 @@
+Fixed support of the surrogatepass error handler in the UTF-8 incremental
+decoder.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4883,6 +4883,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
        case 2:
        case 3:
        case 4:
+            if (s == end || consumed) {
+                goto End;
+            }
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;