Check the consistency of newly created strings using _PyUnicode_CheckConsistency(str, 1)

* In debug mode, fill the string data with invalid characters
* Also simplify reference counting in PyCodec_BackslashReplaceErrors() and PyCodec_XMLCharRefReplaceErrors()
14 years ago · 8f825060f1
10 changed files with 31 additions and 14 deletions
--- a/Modules/_json.c
+++ b/Modules/_json.c
@ -246,6 +246,7 @@ ascii_escape_unicode(PyObject *pystr)
        }
    }
    output[chars++] = '"';
+    assert(_PyUnicode_CheckConsistency(rval, 1));
    return rval;
 }

--- a/Modules/md5module.c
+++ b/Modules/md5module.c
@ -397,6 +397,7 @@ MD5_hexdigest(MD5object *self, PyObject *unused)
        c = (digest[i] & 0xf);
        hex_digest[j++] = Py_hexdigits[c];
    }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
    return retval;
 }

--- a/Modules/sha1module.c
+++ b/Modules/sha1module.c
@ -373,6 +373,7 @@ SHA1_hexdigest(SHA1object *self, PyObject *unused)
        c = (digest[i] & 0xf);
        hex_digest[j++] = Py_hexdigits[c];
    }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
    return retval;
 }

--- a/Modules/sha256module.c
+++ b/Modules/sha256module.c
@ -466,6 +466,7 @@ SHA256_hexdigest(SHAobject *self, PyObject *unused)
        c = (digest[i] & 0xf);
        hex_digest[j++] = Py_hexdigits[c];
    }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
    return retval;
 }

--- a/Modules/sha512module.c
+++ b/Modules/sha512module.c
@ -532,6 +532,7 @@ SHA512_hexdigest(SHAobject *self, PyObject *unused)
        c = (digest[i] & 0xf);
        hex_digest[j++] = Py_hexdigits[c];
    }
+    assert(_PyUnicode_CheckConsistency(retval, 1));
    return retval;
 }

--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@ -626,6 +626,7 @@ PyBytes_Repr(PyObject *obj, int smartquotes)
            *p++ = c;
    }
    *p++ = quote;
+    assert(_PyUnicode_CheckConsistency(v, 1));
    return v;
 }

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -967,7 +967,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
-    int kind_state;
+    enum PyUnicode_Kind kind;
    int is_sharing, is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
@ -986,17 +986,17 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    if (maxchar < 128) {
-        kind_state = PyUnicode_1BYTE_KIND;
+        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
-        kind_state = PyUnicode_1BYTE_KIND;
+        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
-        kind_state = PyUnicode_2BYTE_KIND;
+        kind = PyUnicode_2BYTE_KIND;
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            is_sharing = 1;
@ -1007,7 +1007,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
                            "invalid maximum character passed to PyUnicode_New");
            return NULL;
        }
-        kind_state = PyUnicode_4BYTE_KIND;
+        kind = PyUnicode_4BYTE_KIND;
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            is_sharing = 1;
@ -1041,7 +1041,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
-    _PyUnicode_STATE(unicode).kind = kind_state;
+    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ready = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
@ -1049,19 +1049,19 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
    }
-    else if (kind_state == PyUnicode_1BYTE_KIND) {
+    else if (kind == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
-        }
+    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
-        if (kind_state == PyUnicode_2BYTE_KIND)
+        if (kind == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
-        else /* kind_state == PyUnicode_4BYTE_KIND */
+        else /* kind == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
        if (is_sharing) {
            _PyUnicode_WSTR_LENGTH(unicode) = size;
@ -1072,6 +1072,13 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
            _PyUnicode_WSTR(unicode) = NULL;
        }
    }
+#ifdef Py_DEBUG
+    /* Fill the data with invalid characters to detect bugs earlier.
+       _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
+       at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
+       and U+FFFFFFFF is an invalid character in Unicode 6.0. */
+    memset(data, 0xff, size * kind);
+#endif
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
 }
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -534,6 +534,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
        data = PyUnicode_DATA(res);
        for (i = 0; i < len; ++i)
            PyUnicode_WRITE(kind, data, i, '?');
+        assert(_PyUnicode_CheckConsistency(res, 1));
        return Py_BuildValue("(Nn)", res, end);
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
@ -559,6 +560,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
        data = PyUnicode_DATA(res);
        for (i=0; i < len; i++)
            PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
+        assert(_PyUnicode_CheckConsistency(res, 1));
        return Py_BuildValue("(Nn)", res, end);
    }
    else {
@ -652,8 +654,8 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
            }
            *outp++ = ';';
        }
-        restuple = Py_BuildValue("(On)", res, end);
-        Py_DECREF(res);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
@ -720,8 +722,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
            *outp++ = Py_hexdigits[c&0xf];
        }

-        restuple = Py_BuildValue("(On)", res, end);
-        Py_DECREF(res);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
--- a/Python/compile.c
+++ b/Python/compile.c
@ -263,6 +263,7 @@ _Py_Mangle(PyObject *privateobj, PyObject *ident)
        Py_DECREF(result);
        return NULL;
    }
+    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
 }

--- a/Python/import.c
+++ b/Python/import.c
@ -992,6 +992,7 @@ make_source_pathname(PyObject *path)
                             (j = dot0-right));
    PyUnicode_WRITE(kind, data, i+j,   'p');
    PyUnicode_WRITE(kind, data, i+j+1, 'y');
+    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
 }