Move utf8_encode and utf8_decode to ext/standard

9 years ago · 1a512eed44
11 changed files with 110 additions and 90 deletions
--- a/ext/standard/basic_functions.c
+++ b/ext/standard/basic_functions.c
@@ -2465,6 +2465,14 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_substr_compare, 0, 0, 3)
 	ZEND_ARG_INFO(0, length)
 	ZEND_ARG_INFO(0, case_sensitivity)
 ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_INFO_EX(arginfo_utf8_encode, 0, 0, 1) /* arg info paired with utf8_encode in basic_functions[]; the trailing 1 is presumably the required-argument count -- confirm in zend_API.h */
+	ZEND_ARG_INFO(0, data) /* the one declared parameter, "data" */
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_INFO_EX(arginfo_utf8_decode, 0, 0, 1) /* arg info paired with utf8_decode in basic_functions[]; the trailing 1 is presumably the required-argument count -- confirm in zend_API.h */
+	ZEND_ARG_INFO(0, data) /* the one declared parameter, "data" */
+ZEND_END_ARG_INFO()
 /* }}} */
 /* {{{ syslog.c */
 #ifdef HAVE_SYSLOG_H
@@ -2764,6 +2772,8 @@ const zend_function_entry basic_functions[] = { /* {{{ */
 	PHP_FE(str_split,														arginfo_str_split)
 	PHP_FE(strpbrk,															arginfo_strpbrk)
 	PHP_FE(substr_compare,													arginfo_substr_compare)
+	PHP_FE(utf8_encode, 													arginfo_utf8_encode)
+	PHP_FE(utf8_decode, 													arginfo_utf8_decode)

 #ifdef HAVE_STRCOLL
 	PHP_FE(strcoll,															arginfo_strcoll)
--- a/ext/standard/php_string.h
+++ b/ext/standard/php_string.h
@@ -93,6 +93,8 @@ PHP_FUNCTION(str_word_count);
 PHP_FUNCTION(str_split);
 PHP_FUNCTION(strpbrk);
 PHP_FUNCTION(substr_compare);
+PHP_FUNCTION(utf8_encode);
+PHP_FUNCTION(utf8_decode);
 #ifdef HAVE_STRCOLL
 PHP_FUNCTION(strcoll);
 #endif
--- a/ext/standard/string.c
+++ b/ext/standard/string.c
@@ -64,6 +64,8 @@

 /* For str_getcsv() support */
 #include "ext/standard/file.h"
+/* For php_next_utf8_char() */
+#include "ext/standard/html.h"

 #define STR_PAD_LEFT			0
 #define STR_PAD_RIGHT			1
@@ -5653,6 +5655,98 @@ PHP_FUNCTION(substr_compare)
 }
 /* }}} */

+/* {{{ php_utf8_encode: returns a new zend_string holding the len bytes at s re-encoded as UTF-8, one codepoint per input byte */
+static zend_string *php_utf8_encode(const char *s, size_t len)
+{
+	size_t pos = len; /* input bytes still to be consumed; counts down to 0 */
+	zend_string *str;
+	unsigned char c;
+
+	str = zend_string_safe_alloc(len, 2, 0, 0); /* presumably sized for len * 2 bytes (no input byte produces more than two) -- confirm against zend_string.h */
+	ZSTR_LEN(str) = 0; /* the length field doubles as the write cursor in the loop below */
+	while (pos > 0) {
+		/* The lower 256 codepoints of Unicode are identical to Latin-1,
+		 * so we don't need to do any mapping here. */
+		c = (unsigned char)(*s);
+		if (c < 0x80) { /* below 0x80: copied through unchanged as a single byte */
+			ZSTR_VAL(str)[ZSTR_LEN(str)++] = (char) c;
+		/* We only account for the single-byte and two-byte cases because
+		 * we're only dealing with the first 256 Unicode codepoints. */
+		} else {
+			ZSTR_VAL(str)[ZSTR_LEN(str)++] = (0xc0 | (c >> 6)); /* lead byte: 0xc0 combined with the top two bits of c */
+			ZSTR_VAL(str)[ZSTR_LEN(str)++] = (0x80 | (c & 0x3f)); /* continuation byte: 0x80 combined with the low six bits of c */
+		}
+		pos--;
+		s++;
+	}
+	ZSTR_VAL(str)[ZSTR_LEN(str)] = '\0'; /* terminate right after the last byte written */
+	str = zend_string_truncate(str, ZSTR_LEN(str), 0); /* presumably trims the buffer to the bytes actually written -- confirm against zend_string.h */
+	return str;
+}
+/* }}} */
+
+/* {{{ php_utf8_decode: returns a new zend_string with one byte per character decoded from s; '?' replaces decode failures and codepoints above 0xFF */
+static zend_string *php_utf8_decode(const char *s, size_t len)
+{
+	size_t pos = 0; /* read offset into s; handed by address to php_next_utf8_char, which is expected to advance it */
+	unsigned int c;
+	zend_string *str;
+
+	str = zend_string_alloc(len, 0); /* one output byte per decoder call; assumes each call consumes at least one input byte, so len suffices -- confirm in html.c */
+	ZSTR_LEN(str) = 0; /* the length field doubles as the write cursor in the loop below */
+	while (pos < len) {
+		int status = FAILURE; /* re-initialised before every decoder call */
+		c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status);
+
+		/* The lower 256 codepoints of Unicode are identical to Latin-1,
+		 * so we don't need to do any mapping here beyond replacing non-Latin-1
+		 * characters. */
+		if (status == FAILURE || c > 0xFFU) {
+			c = '?';
+		}
+
+		ZSTR_VAL(str)[ZSTR_LEN(str)++] = c; /* c is at most 0xFF here, so it fits in a single byte */
+	}
+	ZSTR_VAL(str)[ZSTR_LEN(str)] = '\0'; /* terminate right after the last byte written */
+	if (ZSTR_LEN(str) < len) { /* fewer bytes were written than were allocated */
+		str = zend_string_truncate(str, ZSTR_LEN(str), 0);
+	}
+
+	return str;
+}
+/* }}} */
+
+
+/* {{{ proto string utf8_encode(string data)
+   Encodes an ISO-8859-1 string to UTF-8 (parses one string argument and returns php_utf8_encode()'s result) */
+PHP_FUNCTION(utf8_encode)
+{
+	char *arg; /* filled in by zend_parse_parameters; passed to the encoder as the input bytes */
+	size_t arg_len; /* filled in alongside arg; passed to the encoder as the input length */
+
+	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE) {
+		return; /* argument parsing failed: leave without calling the encoder */
+	}
+
+	RETURN_STR(php_utf8_encode(arg, arg_len)); /* the encoder's zend_string becomes the function result */
+}
+/* }}} */
+
+/* {{{ proto string utf8_decode(string data)
+   Converts a UTF-8 encoded string to ISO-8859-1 (parses one string argument and returns php_utf8_decode()'s result) */
+PHP_FUNCTION(utf8_decode)
+{
+	char *arg; /* filled in by zend_parse_parameters; passed to the decoder as the input bytes */
+	size_t arg_len; /* filled in alongside arg; passed to the decoder as the input length */
+
+	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE) {
+		return; /* argument parsing failed: leave without calling the decoder */
+	}
+
+	RETURN_STR(php_utf8_decode(arg, arg_len)); /* the decoder's zend_string becomes the function result */
+}
+/* }}} */
+
 /*
 * Local variables:
 * tab-width: 4
--- a/ext/standard/tests/strings/bug43957.phpt
+++ b/ext/standard/tests/strings/bug43957.phpt
@@ -1,10 +1,5 @@
 --TEST--
 Bug #43957 (utf8_decode() bogus conversion on multibyte indicator near end of string)
--SKIPIF--
-<?php
-require_once("skipif.inc");
-if (!extension_loaded('xml')) die ("skip xml extension not available");
-?>
 --FILE--
 <?php
  echo utf8_decode('abc'.chr(0xe0));
--- a/ext/standard/tests/strings/bug49687.phpt
+++ b/ext/standard/tests/strings/bug49687.phpt
@@ -1,10 +1,5 @@
 --TEST--
 Bug #49687 Several utf8_decode deficiencies and vulnerabilities
--SKIPIF--
-<?php
-require_once("skipif.inc");
-if (!extension_loaded('xml')) die ("skip xml extension not available");
-?>
 --FILE--
 <?php

--- a/ext/standard/tests/strings/utf8.phpt
+++ b/ext/standard/tests/strings/utf8.phpt
@@ -1,7 +1,5 @@
 --TEST--
 UTF-8<->ISO Latin 1 encoding/decoding test
--SKIPIF--
-<?php include("skipif.inc"); ?>
 --FILE--
 <?php
 printf("%s -> %s\n", urlencode("æ"), urlencode(utf8_encode("æ")));
--- a/ext/standard/tests/strings/utf8_decode_error.phpt
+++ b/ext/standard/tests/strings/utf8_decode_error.phpt
@@ -1,16 +1,10 @@
 --TEST--
 Test utf8_decode() function : error conditions 
--SKIPIF--
-<?php 
-if (!extension_loaded("xml")) {
-	print "skip - XML extension not loaded"; 
-}	 
-?>
 --FILE--
 <?php
 /* Prototype  : proto string utf8_decode(string data)
 * Description: Converts a UTF-8 encoded string to ISO-8859-1 
- * Source code: ext/xml/xml.c
+ * Source code: ext/standard/string.c
 * Alias to functions: 
 */

--- a/ext/standard/tests/strings/utf8_decode_variation1.phpt
+++ b/ext/standard/tests/strings/utf8_decode_variation1.phpt
@@ -1,16 +1,10 @@
 --TEST--
 Test utf8_decode() function : usage variations  - different types for data
--SKIPIF--
-<?php 
-if (!extension_loaded("xml")) {
-	print "skip - XML extension not loaded"; 
-}	 
-?>
 --FILE--
 <?php
 /* Prototype  : proto string utf8_decode(string data)
 * Description: Converts a UTF-8 encoded string to ISO-8859-1 
- * Source code: ext/xml/xml.c
+ * Source code: ext/standard/string.c
 * Alias to functions: 
 */

--- a/ext/standard/tests/strings/utf8_encode_error.phpt
+++ b/ext/standard/tests/strings/utf8_encode_error.phpt
@@ -1,16 +1,10 @@
 --TEST--
 Test utf8_encode() function : error conditions
--SKIPIF--
-<?php 
-if (!extension_loaded("xml")) {
-	print "skip - XML extension not loaded"; 
-}	 
-?>
 --FILE--
 <?php
 /* Prototype  : proto string utf8_encode(string data)
 * Description: Encodes an ISO-8859-1 string to UTF-8 
- * Source code: ext/xml/xml.c
+ * Source code: ext/standard/string.c
 * Alias to functions: 
 */

--- a/ext/standard/tests/strings/utf8_encode_variation1.phpt
+++ b/ext/standard/tests/strings/utf8_encode_variation1.phpt
@@ -1,16 +1,10 @@
 --TEST--
 Test utf8_encode() function : usage variations  - <type here specifics of this variation>
--SKIPIF--
-<?php 
-if (!extension_loaded("xml")) {
-	print "skip - XML extension not loaded"; 
-}	 
-?>
 --FILE--
 <?php
 /* Prototype  : proto string utf8_encode(string data)
 * Description: Encodes an ISO-8859-1 string to UTF-8 
- * Source code: ext/xml/xml.c
+ * Source code: ext/standard/string.c
 * Alias to functions: 
 */

--- a/ext/xml/xml.c
+++ b/ext/xml/xml.c
@@ -212,14 +212,6 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_xml_parser_get_option, 0, 0, 2)
 	ZEND_ARG_INFO(0, option)
 ZEND_END_ARG_INFO()

-ZEND_BEGIN_ARG_INFO_EX(arginfo_utf8_encode, 0, 0, 1)
-	ZEND_ARG_INFO(0, data)
-ZEND_END_ARG_INFO()
-
-ZEND_BEGIN_ARG_INFO_EX(arginfo_utf8_decode, 0, 0, 1)
-	ZEND_ARG_INFO(0, data)
-ZEND_END_ARG_INFO()
-
 const zend_function_entry xml_functions[] = {
 	PHP_FE(xml_parser_create,					arginfo_xml_parser_create)
 	PHP_FE(xml_parser_create_ns,				arginfo_xml_parser_create_ns)
@@ -243,8 +235,6 @@ const zend_function_entry xml_functions[] = {
 	PHP_FE(xml_parser_free, 					arginfo_xml_parser_free)
 	PHP_FE(xml_parser_set_option, 				arginfo_xml_parser_set_option)
 	PHP_FE(xml_parser_get_option,				arginfo_xml_parser_get_option)
-	PHP_FE(utf8_encode, 						arginfo_utf8_encode)
-	PHP_FE(utf8_decode, 						arginfo_utf8_decode)
 	PHP_FE_END
 };

@@ -1667,46 +1657,6 @@ PHP_FUNCTION(xml_parser_get_option)
 }
 /* }}} */

-/* {{{ proto string utf8_encode(string data) 
-   Encodes an ISO-8859-1 string to UTF-8 */
-PHP_FUNCTION(utf8_encode)
-{
-	char *arg;
-	size_t arg_len;
-	zend_string *encoded;
-
-	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE) {
-		return;
-	}
-
-	encoded = xml_utf8_encode(arg, arg_len, (XML_Char*)"ISO-8859-1");
-	if (encoded == NULL) {
-		RETURN_FALSE;
-	}
-	RETURN_STR(encoded);
-}
-/* }}} */
-
-/* {{{ proto string utf8_decode(string data) 
-   Converts a UTF-8 encoded string to ISO-8859-1 */
-PHP_FUNCTION(utf8_decode)
-{
-	char *arg;
-	size_t arg_len;
-	zend_string *decoded;
-
-	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE) {
-		return;
-	}
-
-	decoded = xml_utf8_decode((XML_Char*)arg, arg_len, (XML_Char*)"ISO-8859-1");
-	if (decoded == NULL) {
-		RETURN_FALSE;
-	}
-	RETURN_STR(decoded);
-}
-/* }}} */
-
 #endif

 /*