From 2f1ddff2a5f8d877537218c19137a4e8a022120a Mon Sep 17 00:00:00 2001 From: Xinchen Hui Date: Mon, 12 Jan 2015 17:24:37 +0800 Subject: [PATCH] Faster strrpos implementation --- UPGRADING | 3 +- Zend/zend_operators.c | 68 ++++++++++++++++---- Zend/zend_operators.h | 45 +++++++++++-- ext/standard/string.c | 142 +++++++++++++++++++----------------------- 4 files changed, 162 insertions(+), 96 deletions(-) diff --git a/UPGRADING b/UPGRADING index 4e1baaca5be..0a707e5b286 100644 --- a/UPGRADING +++ b/UPGRADING @@ -54,7 +54,8 @@ PHP X.Y UPGRADE NOTES . zend_function.common.num_args don't include the variadic argument anymore. . ob_start() no longer issues an E_ERROR, but instead an E_RECOVERABLE_ERROR in case an output buffer is created in an output buffer handler. - . Add zend_memnstr_ex, which is based on string matching sunday algo. + . Added zend_memnstr_ex, which is based on string matching sunday algo. + . Added zend_memnrstr, zend_memnrstr_ex. - DBA . dba_delete() now returns false if the key was not found for the inifile diff --git a/Zend/zend_operators.c b/Zend/zend_operators.c index 052623b97e4..3bbe0ce924c 100644 --- a/Zend/zend_operators.c +++ b/Zend/zend_operators.c @@ -2763,54 +2763,96 @@ process_double: } /* }}} */ -static zend_always_inline void zend_memstr_ex_pre(unsigned int td[], const char *needle, size_t needle_len) /* {{{ */ { +/* + * String matching - Sunday algorithm + * http://www.iti.fh-flensburg.de/lang/algorithmen/pattern/sundayen.htm + */ +static zend_always_inline void zend_memnstr_ex_pre(unsigned int td[], const char *needle, size_t needle_len, int reverse) /* {{{ */ { int i; for (i = 0; i < 256; i++) { td[i] = needle_len + 1; } - for (i = 0; i < needle_len; i++) { - td[(unsigned char)needle[i]] = (int)needle_len - i; + if (reverse) { + for (i = needle_len - 1; i >= 0; i--) { + td[(unsigned char)needle[i]] = i + 1; + } + } else { + for (i = 0; i < needle_len; i++) { + td[(unsigned char)needle[i]] = (int)needle_len - i; + } } } /* }}} */ -/* - * String matching - Sunday algorithm - * http://www.iti.fh-flensburg.de/lang/algorithmen/pattern/sundayen.htm - */ ZEND_API const char* zend_memnstr_ex(const char *haystack, const char *needle, size_t needle_len, char *end) /* {{{ */ { unsigned int td[256]; register size_t i; - const unsigned register char *p; + register const char *p; if (needle_len == 0 || (end - haystack) == 0) { return NULL; } - zend_memstr_ex_pre(td, needle, needle_len); + zend_memnstr_ex_pre(td, needle, needle_len, 0); - p = (const unsigned char *)haystack; + p = haystack; end -= needle_len; - while (p <= (unsigned char *)end) { + while (p <= end) { for (i = 0; i < needle_len; i++) { if (needle[i] != p[i]) { break; } } if (i == needle_len) { - return (const char *)p; + return p; } - p += td[p[needle_len]]; + p += td[(unsigned char)(p[needle_len])]; } return NULL; } /* }}} */ +ZEND_API const char* zend_memnrstr_ex(const char *haystack, const char *needle, size_t needle_len, char *end) /* {{{ */ +{ + unsigned int td[256]; + register size_t i; + register const char *p; + + if (needle_len == 0 || (end - haystack) == 0) { + return NULL; + } + + zend_memnstr_ex_pre(td, needle, needle_len, 1); + + p = end; + p -= needle_len; + + while (p >= haystack) { + for (i = 0; i < needle_len; i++) { + if (needle[i] != p[i]) { + break; + } + } + + if (i == needle_len) { + return (const char *)p; + } + + if (p == haystack) { + return NULL; + } + + p -= td[(unsigned char)(p[-1])]; + } + + return NULL; +} +/* }}} */ /* * Local variables: diff --git a/Zend/zend_operators.h b/Zend/zend_operators.h index ccbadc6f230..d57c4f59b1e 100644 --- a/Zend/zend_operators.h +++ b/Zend/zend_operators.h @@ -88,6 +88,7 @@ ZEND_API zend_bool instanceof_function(const zend_class_entry *instance_ce, cons ZEND_API zend_uchar _is_numeric_string_ex(const char *str, size_t length, zend_long *lval, double *dval, int allow_errors, int *oflow_info); ZEND_API const char* zend_memnstr_ex(const char *haystack, const char *needle, size_t needle_len, char *end); +ZEND_API const char* zend_memnrstr_ex(const char *haystack, const char *needle, size_t needle_len, char *end); END_EXTERN_C() @@ -174,11 +175,12 @@ zend_memnstr(const char *haystack, const char *needle, size_t needle_len, char * size_t off_s; if (needle_len == 1) { - return (char *)memchr(p, *needle, (end-p)); + return (const char *)memchr(p, *needle, (end-p)); } off_p = end - haystack; off_s = (off_p > 0) ? (size_t)off_p : 0; + if (needle_len > off_s) { return NULL; } @@ -187,7 +189,7 @@ zend_memnstr(const char *haystack, const char *needle, size_t needle_len, char * end -= needle_len; while (p <= end) { - if ((p = (char *)memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) { + if ((p = (const char *)memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) { if (!memcmp(needle, p, needle_len-1)) { return p; } @@ -209,7 +211,6 @@ zend_memnstr(const char *haystack, const char *needle, size_t needle_len, char * static zend_always_inline const void *zend_memrchr(const void *s, int c, size_t n) { register const unsigned char *e; - if (n <= 0) { return NULL; } @@ -219,10 +220,46 @@ static zend_always_inline const void *zend_memrchr(const void *s, int c, size_t return (const void *)e; } } - return NULL; } + +static zend_always_inline const char * +zend_memnrstr(const char *haystack, const char *needle, size_t needle_len, char *end) +{ + const char *p = end; + const char ne = needle[needle_len-1]; + ptrdiff_t off_p; + size_t off_s; + + if (needle_len == 1) { + return (const char *)zend_memrchr(haystack, *needle, (p - haystack)); + } + + off_p = end - haystack; + off_s = (off_p > 0) ? (size_t)off_p : 0; + + if (needle_len > off_s) { + return NULL; + } + + if (EXPECTED(off_s < 1024 || needle_len < 3)) { + p -= needle_len; + + do { + if ((p = (const char *)zend_memrchr(haystack, *needle, (p - haystack) + 1)) && ne == p[needle_len-1]) { + if (!memcmp(needle, p, needle_len - 1)) { + return p; + } + } + } while (p-- >= haystack); + + return NULL; + } else { + return zend_memnrstr_ex(haystack, needle, needle_len, end); + } +} + BEGIN_EXTERN_C() ZEND_API int increment_function(zval *op1); ZEND_API int decrement_function(zval *op2); diff --git a/ext/standard/string.c b/ext/standard/string.c index 197c363ce32..d76855f758f 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -1929,9 +1929,9 @@ PHP_FUNCTION(stripos) char *found = NULL; zend_string *haystack; zend_long offset = 0; - char *needle_dup = NULL, *haystack_dup; char needle_char[2]; zval *needle; + zend_string *needle_dup = NULL, *haystack_dup; if (zend_parse_parameters(ZEND_NUM_ARGS(), "Sz|l", &haystack, &needle, &offset) == FAILURE) { return; @@ -1946,40 +1946,38 @@ PHP_FUNCTION(stripos) RETURN_FALSE; } - haystack_dup = estrndup(haystack->val, haystack->len); - php_strtolower(haystack_dup, haystack->len); - if (Z_TYPE_P(needle) == IS_STRING) { if (Z_STRLEN_P(needle) == 0 || Z_STRLEN_P(needle) > haystack->len) { - efree(haystack_dup); RETURN_FALSE; } - needle_dup = estrndup(Z_STRVAL_P(needle), Z_STRLEN_P(needle)); - php_strtolower(needle_dup, Z_STRLEN_P(needle)); - found = (char*)php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack->len); + haystack_dup = php_string_tolower(haystack); + needle_dup = php_string_tolower(Z_STR_P(needle)); + found = (char*)php_memnstr(haystack_dup->val + offset, + needle_dup->val, needle_dup->len, haystack_dup->val + haystack->len); } else { if (php_needle_char(needle, needle_char) != SUCCESS) { - efree(haystack_dup); RETURN_FALSE; } + haystack_dup = php_string_tolower(haystack); needle_char[0] = tolower(needle_char[0]); needle_char[1] = '\0'; - found = (char*)php_memnstr(haystack_dup + offset, + found = (char*)php_memnstr(haystack_dup->val + offset, needle_char, sizeof(needle_char) - 1, - haystack_dup + haystack->len); + haystack_dup->val + haystack->len); } - efree(haystack_dup); - if (needle_dup) { - efree(needle_dup); - } if (found) { - RETURN_LONG(found - haystack_dup); + RETVAL_LONG(found - haystack_dup->val); } else { - RETURN_FALSE; + RETVAL_FALSE; + } + + zend_string_release(haystack_dup); + if (needle_dup) { + zend_string_release(needle_dup); } } /* }}} */ @@ -1994,6 +1992,7 @@ PHP_FUNCTION(strrpos) size_t needle_len; zend_long offset = 0; char *p, *e, ord_needle[2]; + char *found; #ifndef FAST_ZPP if (zend_parse_parameters(ZEND_NUM_ARGS(), "Sz|l", &haystack, &zneedle, &offset) == FAILURE) { @@ -2030,37 +2029,22 @@ PHP_FUNCTION(strrpos) RETURN_FALSE; } p = haystack->val + (size_t)offset; - e = haystack->val + haystack->len - needle_len; + e = haystack->val + haystack->len; } else { if (offset < -INT_MAX || (size_t)(-offset) > haystack->len) { php_error_docref(NULL, E_WARNING, "Offset is greater than the length of haystack string"); RETURN_FALSE; } - p = haystack->val; - if (needle_len > (size_t)(-offset)) { - e = haystack->val + haystack->len - needle_len; + if (haystack->len + (size_t)offset >= needle_len) { + e = haystack->val + haystack->len + (size_t)offset + needle_len; } else { - e = haystack->val + haystack->len + offset; - } - } - - if (needle_len == 1) { - /* Single character search can shortcut memcmps */ - while (e >= p) { - if (*e == *needle) { - RETURN_LONG(e - p + (offset > 0 ? offset : 0)); - } - e--; + e = haystack->val + haystack->len; } - RETURN_FALSE; } - while (e >= p) { - if (memcmp(e, needle, needle_len) == 0) { - RETURN_LONG(e - p + (offset > 0 ? offset : 0)); - } - e--; + if ((found = (char *)zend_memnrstr(p, needle, needle_len, e))) { + RETURN_LONG(found - haystack->val); } RETURN_FALSE; @@ -2072,103 +2056,105 @@ PHP_FUNCTION(strrpos) PHP_FUNCTION(strripos) { zval *zneedle; - char *needle; + zend_string *needle; zend_string *haystack; size_t needle_len; zend_long offset = 0; - char *p, *e, ord_needle[2]; - char *needle_dup, *haystack_dup; + char *p, *e; + char *found; + zend_string *needle_dup, *haystack_dup, *ord_needle = NULL; + ALLOCA_FLAG(use_heap); + if (zend_parse_parameters(ZEND_NUM_ARGS(), "Sz|l", &haystack, &zneedle, &offset) == FAILURE) { RETURN_FALSE; } + STR_ALLOCA_ALLOC(ord_needle, 1, use_heap); if (Z_TYPE_P(zneedle) == IS_STRING) { - needle = Z_STRVAL_P(zneedle); - needle_len = Z_STRLEN_P(zneedle); + needle = Z_STR_P(zneedle); } else { - if (php_needle_char(zneedle, ord_needle) != SUCCESS) { + if (php_needle_char(zneedle, ord_needle->val) != SUCCESS) { RETURN_FALSE; } - ord_needle[1] = '\0'; + ord_needle->val[1] = '\0'; needle = ord_needle; - needle_len = 1; } - if ((haystack->len == 0) || (needle_len == 0)) { + if ((haystack->len == 0) || (needle->len == 0)) { RETURN_FALSE; } - if (needle_len == 1) { + if (needle->len == 1) { /* Single character search can shortcut memcmps Can also avoid tolower emallocs */ if (offset >= 0) { if ((size_t)offset > haystack->len) { + STR_ALLOCA_FREE(ord_needle, use_heap); php_error_docref(NULL, E_WARNING, "Offset is greater than the length of haystack string"); RETURN_FALSE; } - p = haystack->val + offset; + p = haystack->val + (size_t)offset; e = haystack->val + haystack->len - 1; } else { p = haystack->val; if (offset < -INT_MAX || (size_t)(-offset) > haystack->len) { + STR_ALLOCA_FREE(ord_needle, use_heap); php_error_docref(NULL, E_WARNING, "Offset is greater than the length of haystack string"); RETURN_FALSE; } - e = haystack->val + haystack->len + offset; + e = haystack->val + haystack->len + (size_t)offset; } /* Borrow that ord_needle buffer to avoid repeatedly tolower()ing needle */ - *ord_needle = tolower(*needle); + *ord_needle->val = tolower(*needle->val); while (e >= p) { - if (tolower(*e) == *ord_needle) { + if (tolower(*e) == *ord_needle->val) { + STR_ALLOCA_FREE(ord_needle, use_heap); RETURN_LONG(e - p + (offset > 0 ? offset : 0)); } e--; } + STR_ALLOCA_FREE(ord_needle, use_heap); RETURN_FALSE; } - needle_dup = estrndup(needle, needle_len); - php_strtolower(needle_dup, needle_len); - haystack_dup = estrndup(haystack->val, haystack->len); - php_strtolower(haystack_dup, haystack->len); - + haystack_dup = php_string_tolower(haystack); if (offset >= 0) { if ((size_t)offset > haystack->len) { - efree(needle_dup); - efree(haystack_dup); + zend_string_release(haystack_dup); + STR_ALLOCA_FREE(ord_needle, use_heap); php_error_docref(NULL, E_WARNING, "Offset is greater than the length of haystack string"); RETURN_FALSE; } - p = haystack_dup + offset; - e = haystack_dup + haystack->len - needle_len; + p = haystack_dup->val + offset; + e = haystack_dup->val + haystack->len; } else { if (offset < -INT_MAX || (size_t)(-offset) > haystack->len) { - efree(needle_dup); - efree(haystack_dup); + zend_string_release(haystack_dup); + STR_ALLOCA_FREE(ord_needle, use_heap); php_error_docref(NULL, E_WARNING, "Offset is greater than the length of haystack string"); RETURN_FALSE; } - p = haystack_dup; - if (needle_len > (size_t)(-offset)) { - e = haystack_dup + haystack->len - needle_len; + p = haystack_dup->val; + if (haystack->len + (size_t)offset >= needle->len) { + e = haystack_dup->val + haystack->len + (size_t)offset + needle->len; } else { - e = haystack_dup + haystack->len + offset; + e = haystack_dup->val + haystack->len; } } - while (e >= p) { - if (memcmp(e, needle_dup, needle_len) == 0) { - efree(haystack_dup); - efree(needle_dup); - RETURN_LONG(e - p + (offset > 0 ? offset : 0)); - } - e--; + needle_dup = php_string_tolower(needle); + if ((found = (char *)zend_memnrstr(p, needle_dup->val, needle_dup->len, e))) { + RETVAL_LONG(found - haystack_dup->val); + zend_string_release(needle_dup); + zend_string_release(haystack_dup); + STR_ALLOCA_FREE(ord_needle, use_heap); + } else { + zend_string_release(needle_dup); + zend_string_release(haystack_dup); + STR_ALLOCA_FREE(ord_needle, use_heap); + RETURN_FALSE; } - - efree(haystack_dup); - efree(needle_dup); - RETURN_FALSE; } /* }}} */