|
|
@ -163,7 +163,7 @@ static inline unsigned int get_next_char( |
|
|
else |
|
|
else |
|
|
MB_FAILURE(pos, 4); |
|
|
MB_FAILURE(pos, 4); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); |
|
|
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); |
|
|
if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */ |
|
|
if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */ |
|
|
MB_FAILURE(pos, 4); |
|
|
MB_FAILURE(pos, 4); |
|
|
@ -437,7 +437,7 @@ det_charset: |
|
|
|
|
|
|
|
|
if (charset_hint) { |
|
|
if (charset_hint) { |
|
|
int found = 0; |
|
|
int found = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* now walk the charset map and look for the codeset */ |
|
|
/* now walk the charset map and look for the codeset */ |
|
|
for (i = 0; charset_map[i].codeset; i++) { |
|
|
for (i = 0; charset_map[i].codeset; i++) { |
|
|
if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { |
|
|
if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { |
|
|
@ -545,7 +545,7 @@ static inline unsigned char unimap_bsearch(const uni_to_enc *table, unsigned cod |
|
|
return 0; |
|
|
return 0; |
|
|
|
|
|
|
|
|
code_key = (unsigned short) code_key_a; |
|
|
code_key = (unsigned short) code_key_a; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while (l <= h) { |
|
|
while (l <= h) { |
|
|
m = l + (h - l) / 2; |
|
|
m = l + (h - l) / 2; |
|
|
if (code_key < m->un_code_point) |
|
|
if (code_key < m->un_code_point) |
|
|
@ -571,7 +571,7 @@ static inline int map_from_unicode(unsigned code, enum entity_charset charset, u |
|
|
/* identity mapping of code points to unicode */ |
|
|
/* identity mapping of code points to unicode */ |
|
|
if (code > 0xFF) { |
|
|
if (code > 0xFF) { |
|
|
return FAILURE; |
|
|
return FAILURE; |
|
|
} |
|
|
|
|
|
|
|
|
} |
|
|
*res = code; |
|
|
*res = code; |
|
|
break; |
|
|
break; |
|
|
|
|
|
|
|
|
@ -590,7 +590,7 @@ static inline int map_from_unicode(unsigned code, enum entity_charset charset, u |
|
|
return FAILURE; |
|
|
return FAILURE; |
|
|
} |
|
|
} |
|
|
break; |
|
|
break; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
case cs_8859_15: |
|
|
case cs_8859_15: |
|
|
if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) { |
|
|
if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) { |
|
|
*res = code; |
|
|
*res = code; |
|
|
@ -634,7 +634,7 @@ static inline int map_from_unicode(unsigned code, enum entity_charset charset, u |
|
|
case cs_cp866: |
|
|
case cs_cp866: |
|
|
table = unimap_cp866; |
|
|
table = unimap_cp866; |
|
|
table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866); |
|
|
table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
table_over_7F: |
|
|
table_over_7F: |
|
|
if (code <= 0x7F) { |
|
|
if (code <= 0x7F) { |
|
|
*res = code; |
|
|
*res = code; |
|
|
@ -710,7 +710,7 @@ static inline int unicode_cp_is_allowed(unsigned uni_cp, int document_type) |
|
|
* Not sure this is the relevant part for HTML 5, though. I opted to |
|
|
* Not sure this is the relevant part for HTML 5, though. I opted to |
|
|
* disallow the characters that would result in a parse error when |
|
|
* disallow the characters that would result in a parse error when |
|
|
* preprocessing of the input stream. See also section 8.1.3. |
|
|
* preprocessing of the input stream. See also section 8.1.3. |
|
|
* |
|
|
|
|
|
|
|
|
* |
|
|
* It's unclear if XHTML 1.0 allows C1 characters. I'll opt to apply to |
|
|
* It's unclear if XHTML 1.0 allows C1 characters. I'll opt to apply to |
|
|
* XHTML 1.0 the same rules as for XML 1.0. |
|
|
* XHTML 1.0 the same rules as for XML 1.0. |
|
|
* See <http://cmsmcq.com/2007/C1.xml>. |
|
|
* See <http://cmsmcq.com/2007/C1.xml>. |
|
|
@ -774,7 +774,7 @@ static inline int numeric_entity_is_allowed(unsigned uni_cp, int document_type) |
|
|
/* {{{ process_numeric_entity |
|
|
/* {{{ process_numeric_entity |
|
|
* Auxiliary function to traverse_for_entities. |
|
|
* Auxiliary function to traverse_for_entities. |
|
|
* On input, *buf should point to the first character after # and on output, it's the last |
|
|
* On input, *buf should point to the first character after # and on output, it's the last |
|
|
* byte read, no matter if there was success or insuccess. |
|
|
|
|
|
|
|
|
* byte read, no matter if there was success or insuccess. |
|
|
*/ |
|
|
*/ |
|
|
static inline int process_numeric_entity(const char **buf, unsigned *code_point) |
|
|
static inline int process_numeric_entity(const char **buf, unsigned *code_point) |
|
|
{ |
|
|
{ |
|
|
@ -784,7 +784,7 @@ static inline int process_numeric_entity(const char **buf, unsigned *code_point) |
|
|
|
|
|
|
|
|
if (hexadecimal && (**buf != '\0')) |
|
|
if (hexadecimal && (**buf != '\0')) |
|
|
(*buf)++; |
|
|
(*buf)++; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* strtol allows whitespace and other stuff in the beginning |
|
|
/* strtol allows whitespace and other stuff in the beginning |
|
|
* we're not interested */ |
|
|
* we're not interested */ |
|
|
if ((hexadecimal && !isxdigit(**buf)) || |
|
|
if ((hexadecimal && !isxdigit(**buf)) || |
|
|
@ -969,7 +969,7 @@ static void traverse_for_entities( |
|
|
goto invalid_code; |
|
|
goto invalid_code; |
|
|
|
|
|
|
|
|
/* are we allowed to decode this entity in this document type? |
|
|
/* are we allowed to decode this entity in this document type? |
|
|
* HTML 5 is the only that has a character that cannot be used in |
|
|
|
|
|
|
|
|
* HTML 5 is the only that has a character that cannot be used in |
|
|
* a numeric entity but is allowed literally (U+000D). The |
|
|
* a numeric entity but is allowed literally (U+000D). The |
|
|
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */ |
|
|
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */ |
|
|
if (!unicode_cp_is_allowed(code, doctype) || |
|
|
if (!unicode_cp_is_allowed(code, doctype) || |
|
|
@ -996,9 +996,9 @@ static void traverse_for_entities( |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assert(*next == ';'); |
|
|
assert(*next == ';'); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) || |
|
|
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) || |
|
|
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))) |
|
|
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))) |
|
|
/* && code2 == '\0' always true for current maps */) |
|
|
/* && code2 == '\0' always true for current maps */) |
|
|
@ -1026,7 +1026,7 @@ invalid_code: |
|
|
*(q++) = *p; |
|
|
*(q++) = *p; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*q = '\0'; |
|
|
*q = '\0'; |
|
|
*retlen = (size_t)(q - ret); |
|
|
*retlen = (size_t)(q - ret); |
|
|
} |
|
|
} |
|
|
@ -1066,7 +1066,7 @@ static entity_table_opt determine_entity_table(int all, int doctype) |
|
|
entity_table_opt retval = {NULL}; |
|
|
entity_table_opt retval = {NULL}; |
|
|
|
|
|
|
|
|
assert(!(doctype == ENT_HTML_DOC_XML1 && all)); |
|
|
assert(!(doctype == ENT_HTML_DOC_XML1 && all)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (all) { |
|
|
if (all) { |
|
|
retval.ms_table = (doctype == ENT_HTML_DOC_HTML5) ? |
|
|
retval.ms_table = (doctype == ENT_HTML_DOC_HTML5) ? |
|
|
entity_ms_table_html5 : entity_ms_table_html4; |
|
|
entity_ms_table_html5 : entity_ms_table_html4; |
|
|
@ -1111,13 +1111,13 @@ PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_ |
|
|
if (retlen == 0) { |
|
|
if (retlen == 0) { |
|
|
goto empty_source; |
|
|
goto empty_source; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inverse_map = unescape_inverse_map(all, flags); |
|
|
inverse_map = unescape_inverse_map(all, flags); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* replace numeric entities */ |
|
|
/* replace numeric entities */ |
|
|
traverse_for_entities(old, oldlen, ret, &retlen, all, flags, inverse_map, charset); |
|
|
traverse_for_entities(old, oldlen, ret, &retlen, all, flags, inverse_map, charset); |
|
|
|
|
|
|
|
|
empty_source: |
|
|
|
|
|
|
|
|
empty_source: |
|
|
*newlen = retlen; |
|
|
*newlen = retlen; |
|
|
return ret; |
|
|
return ret; |
|
|
} |
|
|
} |
|
|
@ -1141,7 +1141,7 @@ static inline void find_entity_for_char( |
|
|
{ |
|
|
{ |
|
|
unsigned stage1_idx = ENT_STAGE1_INDEX(k); |
|
|
unsigned stage1_idx = ENT_STAGE1_INDEX(k); |
|
|
const entity_stage3_row *c; |
|
|
const entity_stage3_row *c; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (stage1_idx > 0x1D) { |
|
|
if (stage1_idx > 0x1D) { |
|
|
*entity = NULL; |
|
|
*entity = NULL; |
|
|
*entity_len = 0; |
|
|
*entity_len = 0; |
|
|
@ -1162,7 +1162,7 @@ static inline void find_entity_for_char( |
|
|
if (!(*cursor < oldlen)) |
|
|
if (!(*cursor < oldlen)) |
|
|
goto no_suitable_2nd; |
|
|
goto no_suitable_2nd; |
|
|
|
|
|
|
|
|
next_char = get_next_char(charset, old, oldlen, cursor, &status); |
|
|
|
|
|
|
|
|
next_char = get_next_char(charset, old, oldlen, cursor, &status); |
|
|
|
|
|
|
|
|
if (status == FAILURE) |
|
|
if (status == FAILURE) |
|
|
goto no_suitable_2nd; |
|
|
goto no_suitable_2nd; |
|
|
@ -1187,7 +1187,7 @@ no_suitable_2nd: |
|
|
*entity = (const unsigned char *) |
|
|
*entity = (const unsigned char *) |
|
|
c->data.multicodepoint_table[0].leading_entry.default_entity; |
|
|
c->data.multicodepoint_table[0].leading_entry.default_entity; |
|
|
*entity_len = c->data.multicodepoint_table[0].leading_entry.default_entity_len; |
|
|
*entity_len = c->data.multicodepoint_table[0].leading_entry.default_entity_len; |
|
|
} |
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
/* }}} */ |
|
|
/* }}} */ |
|
|
|
|
|
|
|
|
@ -1255,7 +1255,7 @@ PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size |
|
|
|
|
|
|
|
|
/* initial estimate */ |
|
|
/* initial estimate */ |
|
|
if (oldlen < 64) { |
|
|
if (oldlen < 64) { |
|
|
maxlen = 128; |
|
|
|
|
|
|
|
|
maxlen = 128; |
|
|
} else { |
|
|
} else { |
|
|
maxlen = 2 * oldlen; |
|
|
maxlen = 2 * oldlen; |
|
|
if (maxlen < oldlen) { |
|
|
if (maxlen < oldlen) { |
|
|
@ -1444,6 +1444,10 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
replaced = php_escape_html_entities_ex(str, str_len, &new_len, all, (int) flags, hint_charset, double_encode TSRMLS_CC); |
|
|
replaced = php_escape_html_entities_ex(str, str_len, &new_len, all, (int) flags, hint_charset, double_encode TSRMLS_CC); |
|
|
|
|
|
if (new_len > INT_MAX) { |
|
|
|
|
|
efree(replaced); |
|
|
|
|
|
RETURN_FALSE; |
|
|
|
|
|
} |
|
|
RETVAL_STRINGL(replaced, (int)new_len, 0); |
|
|
RETVAL_STRINGL(replaced, (int)new_len, 0); |
|
|
} |
|
|
} |
|
|
/* }}} */ |
|
|
/* }}} */ |
|
|
@ -1577,7 +1581,7 @@ static inline void write_s3row_data( |
|
|
} else { |
|
|
} else { |
|
|
spe_cp = uni_cp; |
|
|
spe_cp = uni_cp; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
written_k2 = write_octet_sequence(&key[written_k1], charset, spe_cp); |
|
|
written_k2 = write_octet_sequence(&key[written_k1], charset, spe_cp); |
|
|
memcpy(&entity[1], mcpr[i].normal_entry.entity, l); |
|
|
memcpy(&entity[1], mcpr[i].normal_entry.entity, l); |
|
|
entity[l + 1] = ';'; |
|
|
entity[l + 1] = ';'; |
|
|
@ -1615,7 +1619,7 @@ PHP_FUNCTION(get_html_translation_table) |
|
|
LIMIT_ALL(all, doctype, charset); |
|
|
LIMIT_ALL(all, doctype, charset); |
|
|
|
|
|
|
|
|
array_init(return_value); |
|
|
array_init(return_value); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
entity_table = determine_entity_table(all, doctype); |
|
|
entity_table = determine_entity_table(all, doctype); |
|
|
if (all && !CHARSET_UNICODE_COMPAT(charset)) { |
|
|
if (all && !CHARSET_UNICODE_COMPAT(charset)) { |
|
|
to_uni_table = enc_to_uni_index[charset]; |
|
|
to_uni_table = enc_to_uni_index[charset]; |
|
|
|