diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index 79020fb4442..1cce58dd9c5 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -1200,7 +1200,6 @@ fts_tokenizer_word_get( /* If it is a stopword, do not index it */ if (!fts_check_token(text, cache->stopword_info.cached_stopword, - index_cache->index->is_ngram, index_cache->charset)) { return(NULL); @@ -3241,7 +3240,6 @@ fts_query_expansion_fetch_doc( } doc.charset = doc_charset; - doc.is_ngram = result_doc->is_ngram; if (dfield_is_ext(dfield)) { /* We ignore columns that are stored externally, this @@ -3347,7 +3345,6 @@ fts_fetch_doc_from_rec( doc->found = TRUE; doc->charset = get_doc->index_cache->charset; - doc->is_ngram = index->is_ngram; /* Null Field */ if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) { @@ -4379,13 +4376,10 @@ fts_sync_table( return(err); } -/** Check fts token -1. for ngram token, check whether the token contains any words in stopwords -2. for non-ngram token, check if it's stopword or less than fts_min_token_size +/** Check if a fts token is a stopword or less than fts_min_token_size or greater than fts_max_token_size. @param[in] token token string @param[in] stopwords stopwords rb tree -@param[in] is_ngram is ngram parser @param[in] cs token charset @retval true if it is not stopword and length in range @retval false if it is stopword or lenght not in range */ @@ -4393,96 +4387,16 @@ bool fts_check_token( const fts_string_t* token, const ib_rbt_t* stopwords, - bool is_ngram, const CHARSET_INFO* cs) { ut_ad(cs != NULL || stopwords == NULL); - if (!is_ngram) { - ib_rbt_bound_t parent; + ib_rbt_bound_t parent; - if (token->f_n_char < fts_min_token_size - || token->f_n_char > fts_max_token_size - || (stopwords != NULL - && rbt_search(stopwords, &parent, token) == 0)) { - return(false); - } else { - return(true); - } - } - - /* Check token for ngram. */ - DBUG_EXECUTE_IF( - "fts_instrument_ignore_ngram_check", - return(true); - ); - - /* We ignore fts_min_token_size when ngram */ - ut_ad(token->f_n_char > 0 - && token->f_n_char <= fts_max_token_size); - - if (stopwords == NULL) { - return(true); - } - - /*Ngram checks whether the token contains any words in stopwords. - We can't simply use CONTAIN to search in stopwords, because it's - built on COMPARE. So we need to tokenize the token into words - from unigram to f_n_char, and check them separately. */ - for (ulint ngram_token_size = 1; ngram_token_size <= token->f_n_char; - ngram_token_size ++) { - const char* start; - const char* next; - const char* end; - ulint char_len; - ulint n_chars; - - start = reinterpret_cast(token->f_str); - next = start; - end = start + token->f_len; - n_chars = 0; - - while (next < end) { - char_len = my_charlen(cs, next, end); - - if (next + char_len > end || char_len == 0) { - break; - } else { - /* Skip SPACE */ - if (char_len == 1 && *next == ' ') { - start = next + 1; - next = start; - n_chars = 0; - - continue; - } - - next += char_len; - n_chars++; - } - - if (n_chars == ngram_token_size) { - fts_string_t ngram_token; - ngram_token.f_str = - reinterpret_cast( - const_cast(start)); - ngram_token.f_len = next - start; - ngram_token.f_n_char = ngram_token_size; - - ib_rbt_bound_t parent; - if (rbt_search(stopwords, &parent, - &ngram_token) == 0) { - return(false); - } - - /* Move a char forward */ - start += my_charlen(cs, start, end); - n_chars = ngram_token_size - 1; - } - } - } - - return(true); + return(token->f_n_char >= fts_min_token_size + && token->f_n_char <= fts_max_token_size + && (stopwords == NULL + || rbt_search(stopwords, &parent, token) != 0)); } /** Add the token and its start position to the token's list of positions. @@ -4499,8 +4413,7 @@ fts_add_token( /* Ignore string whose character number is less than "fts_min_token_size" or more than "fts_max_token_size" */ - if (fts_check_token(&str, NULL, result_doc->is_ngram, - result_doc->charset)) { + if (fts_check_token(&str, NULL, result_doc->charset)) { mem_heap_t* heap; fts_string_t t_str; @@ -7487,7 +7400,6 @@ fts_init_recover_doc( } doc.charset = get_doc->index_cache->charset; - doc.is_ngram = get_doc->index_cache->index->is_ngram; if (dfield_is_ext(dfield)) { dict_table_t* table = cache->sync->table; diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index fc80c843412..594f337c978 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -2693,7 +2693,6 @@ fts_query_phrase_split( if (fts_check_token( &result_str, cache->stopword_info.cached_stopword, - query->index->is_ngram, query->fts_index_table.charset)) { /* Add the word to the RB tree so that we can calculate it's frequencey within a document. */ @@ -4278,7 +4277,6 @@ fts_expand_query( result_doc.charset = index_cache->charset; result_doc.parser = index_cache->index->parser; - result_doc.is_ngram = index_cache->index->is_ngram; query->total_size += SIZEOF_RBT_CREATE; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 66b099ac348..c162c1f9f3f 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -6915,11 +6915,6 @@ ha_innobase::open( static_cast( plugin_decl(parser)->info); - index->is_ngram = strncmp( - plugin_name(parser)->str, - FTS_NGRAM_PARSER_NAME, - plugin_name(parser)->length) == 0; - DBUG_EXECUTE_IF("fts_instrument_use_default_parser", index->parser = &fts_default_parser;); } diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 8412f5ab53d..a9f063dc224 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -619,8 +619,6 @@ extern "C" void wsrep_thd_set_wsrep_last_query_id(THD *thd, query_id_t id); extern const struct _ft_vft ft_vft_result; -#define FTS_NGRAM_PARSER_NAME "ngram" - /** Structure Returned by ha_innobase::ft_init_ext() */ typedef struct new_ft_info { diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index e88a3554074..5cc3347ebbe 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -2203,7 +2203,6 @@ innobase_create_index_def( memset(index->fields, 0, n_fields * sizeof *index->fields); index->parser = NULL; - index->is_ngram = false; index->key_number = key_number; index->n_fields = n_fields; index->name = mem_heap_strdup(heap, key->name); @@ -2237,12 +2236,6 @@ innobase_create_index_def( static_cast( plugin_decl(parser)->info); - index->is_ngram = strncmp( - plugin_name(parser)->str, - FTS_NGRAM_PARSER_NAME, - plugin_name(parser)->length) - == 0; - break; } } diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index f94d5f2b1ca..87f415c8a04 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -918,8 +918,6 @@ struct dict_index_t{ dict_field_t* fields; /*!< array of field descriptions */ st_mysql_ftparser* parser; /*!< fulltext parser plugin */ - bool is_ngram; - /*!< true if it's ngram parser */ bool has_new_v_col; /*!< whether it has a newly added virtual column in ALTER */ diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h index 80ebcf09d6d..f9d5d07a44c 100644 --- a/storage/innobase/include/fts0priv.h +++ b/storage/innobase/include/fts0priv.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -215,13 +216,10 @@ fts_write_node( fts_node_t* node) /*!< in: node columns */ MY_ATTRIBUTE((warn_unused_result)); -/** Check fts token -1. for ngram token, check whether the token contains any words in stopwords -2. for non-ngram token, check if it's stopword or less than fts_min_token_size +/** Check if a fts token is a stopword or less than fts_min_token_size or greater than fts_max_token_size. @param[in] token token string @param[in] stopwords stopwords rb tree -@param[in] is_ngram is ngram parser @param[in] cs token charset @retval true if it is not stopword and length in range @retval false if it is stopword or length not in range */ @@ -229,7 +227,6 @@ bool fts_check_token( const fts_string_t* token, const ib_rbt_t* stopwords, - bool is_ngram, const CHARSET_INFO* cs); /******************************************************************//** diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h index c1db160602f..55a698e8b66 100644 --- a/storage/innobase/include/fts0types.h +++ b/storage/innobase/include/fts0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -277,8 +277,6 @@ struct fts_doc_t { st_mysql_ftparser* parser; /*!< fts plugin parser */ - bool is_ngram; /*!< Whether it is a ngram parser */ - ib_rbt_t* stopwords; /*!< Stopwords */ }; diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index 47b5e32d3de..50c3361a3f9 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -129,7 +129,6 @@ struct index_def_t { index_field_t* fields; /*!< field definitions */ st_mysql_ftparser* parser; /*!< fulltext parser plugin */ - bool is_ngram; /*!< true if it's ngram parser */ }; /** Structure for reporting duplicate records. */ diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index fca6ae5a1bf..321b55e9894 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -94,7 +94,6 @@ row_merge_create_fts_sort_index( new_index->n_def = FTS_NUM_FIELDS_SORT; new_index->cached = TRUE; new_index->parser = index->parser; - new_index->is_ngram = index->is_ngram; idx_field = dict_index_get_nth_field(index, 0); charset = fts_index_get_charset(index); @@ -515,7 +514,6 @@ row_merge_fts_doc_tokenize( ulint data_size[FTS_NUM_AUX_INDEX]; ulint n_tuple[FTS_NUM_AUX_INDEX]; st_mysql_ftparser* parser; - bool is_ngram; t_str.f_n_char = 0; t_ctx->buf_used = 0; @@ -524,7 +522,6 @@ row_merge_fts_doc_tokenize( memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint)); parser = sort_buf[0]->index->parser; - is_ngram = sort_buf[0]->index->is_ngram; /* Tokenize the data and add each word string, its corresponding doc id and position to sort buffer */ @@ -570,7 +567,7 @@ row_merge_fts_doc_tokenize( /* Ignore string whose character number is less than "fts_min_token_size" or more than "fts_max_token_size" */ - if (!fts_check_token(&str, NULL, is_ngram, NULL)) { + if (!fts_check_token(&str, NULL, NULL)) { if (parser != NULL) { UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); ut_free(fts_token); @@ -589,7 +586,7 @@ row_merge_fts_doc_tokenize( /* if "cached_stopword" is defined, ignore words in the stopword list */ - if (!fts_check_token(&str, t_ctx->cached_stopword, is_ngram, + if (!fts_check_token(&str, t_ctx->cached_stopword, doc->charset)) { if (parser != NULL) { UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token); diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index eb0a58f4c28..ea332adfdc3 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -4504,7 +4504,6 @@ row_merge_create_index( ut_a(index); index->parser = index_def->parser; - index->is_ngram = index_def->is_ngram; index->has_new_v_col = has_new_v_col; /* Note the id of the transaction that created this