@ -40,6 +40,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# define PY_SSIZE_T_CLEAN
# include "Python.h"
# include "pycore_coreconfig.h"
# include "pycore_fileutils.h"
# include "pycore_object.h"
# include "pycore_pylifecycle.h"
@ -264,6 +265,13 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
/* Forward declarations of static helpers that are used in this file before
   the point where they are defined. */

/* Inline writer helper; declared here so earlier code can call it. */
static inline int
_PyUnicodeWriter_WriteCharInline ( _PyUnicodeWriter * writer , Py_UCS4 ch ) ;
/* UTF-8 encoder taking an already resolved error handler code in addition
   to the handler name. Called by PyUnicode_EncodeFSDefault() ahead of its
   definition. */
static PyObject *
unicode_encode_utf8 ( PyObject * unicode , _Py_error_handler error_handler ,
const char * errors ) ;
/* UTF-8 decoder taking an already resolved error handler code in addition
   to the handler name. Called by PyUnicode_DecodeFSDefaultAndSize() ahead of
   its definition. */
static PyObject *
unicode_decode_utf8 ( const char * s , Py_ssize_t size ,
_Py_error_handler error_handler , const char * errors ,
Py_ssize_t * consumed ) ;
/* List of static strings. */
static _Py_Identifier * static_strings = NULL ;
@ -388,6 +396,35 @@ _Py_GetErrorHandler(const char *errors)
return _Py_ERROR_OTHER ;
}
/* Map a wide-character error handler name to its _Py_error_handler code.

   errors: NUL-terminated wide string naming the handler, or NULL.

   Return _Py_ERROR_STRICT if errors is NULL or L"strict", the matching
   constant for every other name listed in the table below, and
   _Py_ERROR_OTHER for a name that is not in the table.
   This function never returns _Py_ERROR_UNKNOWN. */
static _Py_error_handler
get_error_handler_wide(const wchar_t *errors)
{
    static const struct {
        const wchar_t *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {L"strict", _Py_ERROR_STRICT},
        {L"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {L"replace", _Py_ERROR_REPLACE},
        {L"ignore", _Py_ERROR_IGNORE},
        {L"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {L"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {L"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    const size_t count = sizeof(known_handlers) / sizeof(known_handlers[0]);

    /* A missing name means "strict". */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }

    for (size_t i = 0; i < count; i++) {
        if (wcscmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }

    /* Not one of the built-in names. */
    return _Py_ERROR_OTHER;
}
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
This function is kept for backward compatibility with the old API . */
Py_UNICODE
@ -3445,11 +3482,9 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
static PyObject *
unicode_encode_locale ( PyObject * unicode , const char * errors ,
unicode_encode_locale ( PyObject * unicode , _Py_error_handler error_handler ,
int current_locale )
{
_Py_error_handler error_handler = _Py_GetErrorHandler ( errors ) ;
Py_ssize_t wlen ;
wchar_t * wstr = PyUnicode_AsWideCharString ( unicode , & wlen ) ;
if ( wstr = = NULL ) {
@ -3499,30 +3534,44 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
/* Encode a str object with the locale encoding.

   errors: error handler name; it is converted to a _Py_error_handler code
   with _Py_GetErrorHandler() before being handed to unicode_encode_locale().

   The work is delegated to unicode_encode_locale() with current_locale=1.
   Return whatever that helper returns. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    /* unicode_encode_locale() takes a resolved handler code, not the
       handler name. */
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
    return unicode_encode_locale(unicode, error_handler, 1);
}
/* Encode a str object with the filesystem encoding and error handler.

   Two states are handled:

   - interp->fs_codec.encoding is set (done by init_fs_codec()): use the
     encoding and error handler cached in interp->fs_codec.
   - otherwise: the cache is not filled yet, so the error handler is resolved
     from the wide string core_config.filesystem_errors and a C
     implementation is used (UTF-8 or the locale codec, depending on
     _Py_FORCE_UTF8_FS_ENCODING). */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    if (interp->fs_codec.encoding) {
        return unicode_encode_utf8(unicode,
                                   interp->fs_codec.error_handler,
                                   interp->fs_codec.errors);
    }
    else {
        /* filesystem_errors is a wide string: it must go through
           get_error_handler_wide(), it cannot be passed as a char* name. */
        const _PyCoreConfig *config = &interp->core_config;
        _Py_error_handler errors;
        errors = get_error_handler_wide(config->filesystem_errors);
        assert(errors != _Py_ERROR_UNKNOWN);
        return unicode_encode_utf8(unicode, errors, NULL);
    }
#else
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded.
       Loading the Python codec requires encoding at least its own filename.
       Use the C implementation of the locale codec until the codec registry
       is initialized and the Python codec is loaded.
       See init_fs_codec(). */
    if (interp->fs_codec.encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         interp->fs_codec.encoding,
                                         interp->fs_codec.errors);
    }
    else {
        const _PyCoreConfig *config = &interp->core_config;
        _Py_error_handler errors;
        errors = get_error_handler_wide(config->filesystem_errors);
        assert(errors != _Py_ERROR_UNKNOWN);
        return unicode_encode_locale(unicode, errors, 0);
    }
#endif
}
@ -3663,11 +3712,9 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
}
static PyObject *
unicode_decode_locale ( const char * str , Py_ssize_t len , const char * errors ,
int current_locale )
unicode_decode_locale ( const char * str , Py_ssize_t len ,
_Py_error_handler errors , int current_locale )
{
_Py_error_handler error_handler = _Py_GetErrorHandler ( errors ) ;
if ( str [ len ] ! = ' \0 ' | | ( size_t ) len ! = strlen ( str ) ) {
PyErr_SetString ( PyExc_ValueError , " embedded null byte " ) ;
return NULL ;
@ -3677,7 +3724,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
size_t wlen ;
const char * reason ;
int res = _Py_DecodeLocaleEx ( str , & wstr , & wlen , & reason ,
current_locale , error_handler ) ;
current_locale , errors ) ;
if ( res ! = 0 ) {
if ( res = = - 2 ) {
PyObject * exc ;
@ -3709,14 +3756,16 @@ PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
{
    /* Decode the len bytes at str with the locale encoding.
       The error handler name is converted to a _Py_error_handler code with
       _Py_GetErrorHandler(), because unicode_decode_locale() takes the code
       rather than the name. The helper is called with current_locale=1. */
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
    return unicode_decode_locale(str, len, error_handler, 1);
}
/* Decode a NUL-terminated byte string with the locale encoding.

   str: NUL-terminated input; its length is computed with strlen(), so it
   must not be NULL.
   errors: error handler name, converted to a _Py_error_handler code with
   _Py_GetErrorHandler().

   The work is delegated to unicode_decode_locale() with current_locale=1.
   Return whatever that helper returns. */
PyObject *
PyUnicode_DecodeLocale(const char *str, const char *errors)
{
    Py_ssize_t size = (Py_ssize_t)strlen(str);
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
    return unicode_decode_locale(str, size, error_handler, 1);
}
@ -3730,23 +3779,36 @@ PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
{
    /* Decode size bytes at s with the filesystem encoding and error handler.

       - interp->fs_codec.encoding is set (done by init_fs_codec()): use the
         encoding and error handler cached in interp->fs_codec.
       - otherwise: the cache is not filled yet, so the error handler is
         resolved from the wide string core_config.filesystem_errors and a C
         implementation is used (UTF-8 or the locale codec, depending on
         _Py_FORCE_UTF8_FS_ENCODING). */
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    if (interp->fs_codec.encoding) {
        return unicode_decode_utf8(s, size,
                                   interp->fs_codec.error_handler,
                                   interp->fs_codec.errors,
                                   NULL);
    }
    else {
        /* filesystem_errors is a wide string: it must go through
           get_error_handler_wide(), it cannot be passed as a char* name. */
        const _PyCoreConfig *config = &interp->core_config;
        _Py_error_handler errors;
        errors = get_error_handler_wide(config->filesystem_errors);
        assert(errors != _Py_ERROR_UNKNOWN);
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
    }
#else
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded.
       Loading the Python codec requires encoding at least its own filename.
       Use the C implementation of the locale codec until the codec registry
       is initialized and the Python codec is loaded.
       See init_fs_codec(). */
    if (interp->fs_codec.encoding) {
        return PyUnicode_Decode(s, size,
                                interp->fs_codec.encoding,
                                interp->fs_codec.errors);
    }
    else {
        const _PyCoreConfig *config = &interp->core_config;
        _Py_error_handler errors;
        errors = get_error_handler_wide(config->filesystem_errors);
        /* Same sanity check as the three sibling bootstrap branches of the
           FSDefault encode/decode functions. */
        assert(errors != _Py_ERROR_UNKNOWN);
        return unicode_decode_locale(s, size, errors, 0);
    }
#endif
}
@ -4810,11 +4872,10 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
return p - start ;
}
PyObject *
PyUnicode_DecodeUTF8Stateful ( const char * s ,
Py_ssize_t size ,
const char * errors ,
Py_ssize_t * consumed )
static PyObject *
unicode_decode_utf8 ( const char * s , Py_ssize_t size ,
_Py_error_handler error_handler , const char * errors ,
Py_ssize_t * consumed )
{
_PyUnicodeWriter writer ;
const char * starts = s ;
@ -4825,7 +4886,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
const char * errmsg = " " ;
PyObject * error_handler_obj = NULL ;
PyObject * exc = NULL ;
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN ;
if ( size = = 0 ) {
if ( consumed )
@ -4948,6 +5008,16 @@ onError:
}
/* Public entry point of the stateful UTF-8 decoder.

   Thin wrapper around unicode_decode_utf8(): the caller only supplies the
   error handler name, so _Py_ERROR_UNKNOWN is passed as the handler code.
   NOTE(review): presumably the helper resolves the code from `errors` when
   needed -- confirm in unicode_decode_utf8().

   s, size: bytes to decode.
   errors: error handler name, forwarded unchanged.
   consumed: forwarded unchanged to the helper. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const _Py_error_handler unresolved = _Py_ERROR_UNKNOWN;
    PyObject *decoded;

    decoded = unicode_decode_utf8(s, size, unresolved, errors, consumed);
    return decoded;
}
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
non - zero , use strict error handler otherwise .
@ -5231,8 +5301,9 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
maximum possible needed ( 4 result bytes per Unicode character ) , and return
the excess memory at the end .
*/
PyObject *
_PyUnicode_AsUTF8String ( PyObject * unicode , const char * errors )
static PyObject *
unicode_encode_utf8 ( PyObject * unicode , _Py_error_handler error_handler ,
const char * errors )
{
enum PyUnicode_Kind kind ;
void * data ;
@ -5260,14 +5331,21 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
case PyUnicode_1BYTE_KIND :
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert ( ! PyUnicode_IS_ASCII ( unicode ) ) ;
return ucs1lib_utf8_encoder ( unicode , data , size , errors ) ;
return ucs1lib_utf8_encoder ( unicode , data , size , error_handler , error s ) ;
case PyUnicode_2BYTE_KIND :
return ucs2lib_utf8_encoder ( unicode , data , size , errors ) ;
return ucs2lib_utf8_encoder ( unicode , data , size , error_handler , error s ) ;
case PyUnicode_4BYTE_KIND :
return ucs4lib_utf8_encoder ( unicode , data , size , errors ) ;
return ucs4lib_utf8_encoder ( unicode , data , size , error_handler , error s ) ;
}
}
/* Encode a str object to UTF-8 given only an error handler name.

   Thin wrapper around unicode_encode_utf8(): _Py_ERROR_UNKNOWN is passed as
   the handler code and `errors` is forwarded unchanged.
   NOTE(review): presumably the helper resolves the code from `errors` when
   needed -- confirm in unicode_encode_utf8(). */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    const _Py_error_handler unresolved = _Py_ERROR_UNKNOWN;
    PyObject *encoded;

    encoded = unicode_encode_utf8(unicode, unresolved, errors);
    return encoded;
}
PyObject *
PyUnicode_EncodeUTF8 ( const Py_UNICODE * s ,
Py_ssize_t size ,
@ -15575,12 +15653,35 @@ PyUnicode_AsUnicodeCopy(PyObject *unicode)
}
static char *
get_codec_name ( const char * encoding )
/* Encode the wide string wstr to UTF-8 with the strict error handler.

   wstr: NUL-terminated wide string to encode.
   str:  receives the encoded string on success; the callers in this file
         release it with PyMem_RawFree().
   name: label of the value being encoded, only used in the error message.

   Return 0 on success. On failure set an exception and return -1:
   RuntimeWarning if _Py_EncodeUTF8Ex() reports -2, MemoryError for any
   other negative result. */
static int
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
{
    int res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);

    if (res == -2) {
        /* This helper encodes, so report an encoding failure. */
        PyErr_Format(PyExc_RuntimeWarning, "cannot encode %s", name);
        return -1;
    }
    if (res < 0) {
        PyErr_NoMemory();
        return -1;
    }
    return 0;
}
static int
config_get_codec_name ( wchar_t * * config_encoding )
{
char * encoding ;
if ( encode_wstr_utf8 ( * config_encoding , & encoding , " stdio_encoding " ) < 0 ) {
return - 1 ;
}
PyObject * name_obj = NULL ;
PyObject * codec = _PyCodec_Lookup ( encoding ) ;
PyMem_RawFree ( encoding ) ;
codec = _PyCodec_Lookup ( encoding ) ;
if ( ! codec )
goto error ;
@ -15590,71 +15691,107 @@ get_codec_name(const char *encoding)
goto error ;
}
const char * name_utf8 = PyUnicode_AsUTF8 ( name_obj ) ;
if ( name_utf8 = = NULL ) {
wchar_t * wname = PyUnicode_AsWideCharString ( name_obj , NULL ) ;
Py_DECREF ( name_obj ) ;
if ( wname = = NULL ) {
goto error ;
}
char * name = _PyMem_RawStr dup( name_utf8 ) ;
Py_DECREF ( name_obj ) ;
if ( name = = NULL ) {
wchar_t * raw_wname = _PyMem_RawWcs dup( w name) ;
if ( raw_wname = = NULL ) {
PyMem_Free ( wname ) ;
PyErr_NoMemory ( ) ;
return NULL ;
goto error ;
}
return name ;
PyMem_RawFree ( * config_encoding ) ;
* config_encoding = raw_wname ;
PyMem_Free ( wname ) ;
return 0 ;
error :
Py_XDECREF ( codec ) ;
Py_XDECREF ( name_obj ) ;
return NULL ;
return - 1 ;
}
/* Replace core_config.stdio_encoding with the normalized Python codec name.

   config_get_codec_name() updates the configuration field in place (it
   frees the previous string and stores the new one), so nothing has to be
   freed or assigned here.

   Return _Py_INIT_OK() on success, or an init error if the codec name cannot
   be obtained. */
static _PyInitError
init_stdio_encoding(PyInterpreterState *interp)
{
    /* Update the stdio encoding to the normalized Python codec name. */
    _PyCoreConfig *config = &interp->core_config;
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
        return _Py_INIT_ERR("failed to get the Python codec name "
                            "of the stdio encoding");
    }
    return _Py_INIT_OK();
}
static _PyInitError
init_fs_encoding ( PyInterpreterState * interp )
/* Fill interp->fs_codec from core_config.filesystem_encoding and
   core_config.filesystem_errors.

   The two wide configuration strings are encoded to UTF-8 and stored in
   interp->fs_codec.encoding and interp->fs_codec.errors (the previous values
   are released with PyMem_RawFree()); the resolved error handler code is
   stored in interp->fs_codec.error_handler. The same strings are then passed
   to _Py_SetFileSystemEncoding().

   Return 0 on success, -1 with an exception set on failure. */
static int
init_fs_codec(PyInterpreterState *interp)
{
    _PyCoreConfig *config = &interp->core_config;

    _Py_error_handler error_handler;
    error_handler = get_error_handler_wide(config->filesystem_errors);
    /* NOTE(review): get_error_handler_wide() as written in this file returns
       _Py_ERROR_OTHER, not _Py_ERROR_UNKNOWN, for unrecognized names, so this
       check looks unreachable -- confirm the intended behavior. */
    if (error_handler == _Py_ERROR_UNKNOWN) {
        PyErr_SetString(PyExc_RuntimeError,
                        "unknown filesystem error handler");
        return -1;
    }

    char *encoding, *errors;
    if (encode_wstr_utf8(config->filesystem_encoding,
                         &encoding,
                         "filesystem_encoding") < 0) {
        return -1;
    }

    if (encode_wstr_utf8(config->filesystem_errors,
                         &errors,
                         "filesystem_errors") < 0) {
        /* Do not leak the string produced by the first call. */
        PyMem_RawFree(encoding);
        return -1;
    }

    /* Both conversions succeeded: replace the cached values. */
    PyMem_RawFree(interp->fs_codec.encoding);
    interp->fs_codec.encoding = encoding;
    PyMem_RawFree(interp->fs_codec.errors);
    interp->fs_codec.errors = errors;
    interp->fs_codec.error_handler = error_handler;

    /* At this point, PyUnicode_EncodeFSDefault() and
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
       the C implementation of the filesystem encoding. */

    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
       global configuration variables. */
    if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
                                  interp->fs_codec.errors) < 0) {
        PyErr_NoMemory();
        return -1;
    }
    return 0;
}
/* Normalize core_config.filesystem_encoding and initialize the filesystem
   codec state of the interpreter.

   Step 1: config_get_codec_name() replaces the configured encoding with the
   normalized Python codec name.
   Step 2: init_fs_codec() fills interp->fs_codec; once fs_codec.encoding is
   set, PyUnicode_EncodeFSDefault() and PyUnicode_DecodeFSDefaultAndSize()
   take their "codec initialized" branch.

   Return _Py_INIT_OK() on success, or an init error describing the step
   that failed. */
static _PyInitError
init_fs_encoding(PyInterpreterState *interp)
{
    /* Update the filesystem encoding to the normalized Python codec name.
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
       (Python codec name). */
    _PyCoreConfig *config = &interp->core_config;
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
        return _Py_INIT_ERR("failed to get the Python codec "
                            "of the filesystem encoding");
    }

    if (init_fs_codec(interp) < 0) {
        return _Py_INIT_ERR("cannot initialize filesystem codec");
    }
    return _Py_INIT_OK();
}
@ -15671,6 +15808,33 @@ _PyUnicode_InitEncodings(PyInterpreterState *interp)
}
# ifdef MS_WINDOWS
int
_PyUnicode_EnableLegacyWindowsFSEncoding ( void )
{
PyInterpreterState * interp = _PyInterpreterState_GET_UNSAFE ( ) ;
_PyCoreConfig * config = & interp - > core_config ;
/* Set the filesystem encoding to mbcs/replace (PEP 529) */
wchar_t * encoding = _PyMem_RawWcsdup ( L " mbcs " ) ;
wchar_t * errors = _PyMem_RawWcsdup ( L " replace " ) ;
if ( encoding = = NULL | | errors = = NULL ) {
PyMem_RawFree ( encoding ) ;
PyMem_RawFree ( errors ) ;
PyErr_NoMemory ( ) ;
return - 1 ;
}
PyMem_RawFree ( config - > filesystem_encoding ) ;
config - > filesystem_encoding = encoding ;
PyMem_RawFree ( config - > filesystem_errors ) ;
config - > filesystem_errors = errors ;
return init_fs_codec ( interp ) ;
}
# endif
void
_PyUnicode_Fini ( void )
{
@ -15694,6 +15858,12 @@ _PyUnicode_Fini(void)
}
_PyUnicode_ClearStaticStrings ( ) ;
( void ) PyUnicode_ClearFreeList ( ) ;
PyInterpreterState * interp = _PyInterpreterState_GET_UNSAFE ( ) ;
PyMem_RawFree ( interp - > fs_codec . encoding ) ;
interp - > fs_codec . encoding = NULL ;
PyMem_RawFree ( interp - > fs_codec . errors ) ;
interp - > fs_codec . errors = NULL ;
}