Browse Source

Remove dict_index_t::is_ngram

When MySQL 5.7 introduced fulltext parser plugins to InnoDB,
it hard-coded the plugin name "ngram" to mean something special.
Because -fsanitize=undefined was issuing warnings for the
assignment in row_merge_create_index() that the value is out of
range for Boolean, we remove this code that was not intended to
be used in MariaDB 10.2.

fts_check_token(): Remove the special logic for N-gram tokens.
pull/399/head
Marko Mäkelä 9 years ago
parent
commit
73deafbc17
  1. 102
      storage/innobase/fts/fts0fts.cc
  2. 2
      storage/innobase/fts/fts0que.cc
  3. 5
      storage/innobase/handler/ha_innodb.cc
  4. 2
      storage/innobase/handler/ha_innodb.h
  5. 7
      storage/innobase/handler/handler0alter.cc
  6. 2
      storage/innobase/include/dict0mem.h
  7. 7
      storage/innobase/include/fts0priv.h
  8. 4
      storage/innobase/include/fts0types.h
  9. 1
      storage/innobase/include/row0merge.h
  10. 7
      storage/innobase/row/row0ftsort.cc
  11. 1
      storage/innobase/row/row0merge.cc

102
storage/innobase/fts/fts0fts.cc

@ -1200,7 +1200,6 @@ fts_tokenizer_word_get(
/* If it is a stopword, do not index it */
if (!fts_check_token(text,
cache->stopword_info.cached_stopword,
index_cache->index->is_ngram,
index_cache->charset)) {
return(NULL);
@ -3241,7 +3240,6 @@ fts_query_expansion_fetch_doc(
}
doc.charset = doc_charset;
doc.is_ngram = result_doc->is_ngram;
if (dfield_is_ext(dfield)) {
/* We ignore columns that are stored externally, this
@ -3347,7 +3345,6 @@ fts_fetch_doc_from_rec(
doc->found = TRUE;
doc->charset = get_doc->index_cache->charset;
doc->is_ngram = index->is_ngram;
/* Null Field */
if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) {
@ -4379,13 +4376,10 @@ fts_sync_table(
return(err);
}
/** Check fts token
1. for ngram token, check whether the token contains any words in stopwords
2. for non-ngram token, check if it's stopword or less than fts_min_token_size
/** Check if a fts token is a stopword or less than fts_min_token_size
or greater than fts_max_token_size.
@param[in] token token string
@param[in] stopwords stopwords rb tree
@param[in] is_ngram is ngram parser
@param[in] cs token charset
@retval true if it is not stopword and length in range
@retval false if it is stopword or length not in range */
@ -4393,96 +4387,16 @@ bool
fts_check_token(
const fts_string_t* token,
const ib_rbt_t* stopwords,
bool is_ngram,
const CHARSET_INFO* cs)
{
ut_ad(cs != NULL || stopwords == NULL);
if (!is_ngram) {
ib_rbt_bound_t parent;
ib_rbt_bound_t parent;
if (token->f_n_char < fts_min_token_size
|| token->f_n_char > fts_max_token_size
|| (stopwords != NULL
&& rbt_search(stopwords, &parent, token) == 0)) {
return(false);
} else {
return(true);
}
}
/* Check token for ngram. */
DBUG_EXECUTE_IF(
"fts_instrument_ignore_ngram_check",
return(true);
);
/* We ignore fts_min_token_size when ngram */
ut_ad(token->f_n_char > 0
&& token->f_n_char <= fts_max_token_size);
if (stopwords == NULL) {
return(true);
}
/*Ngram checks whether the token contains any words in stopwords.
We can't simply use CONTAIN to search in stopwords, because it's
built on COMPARE. So we need to tokenize the token into words
from unigram to f_n_char, and check them separately. */
for (ulint ngram_token_size = 1; ngram_token_size <= token->f_n_char;
ngram_token_size ++) {
const char* start;
const char* next;
const char* end;
ulint char_len;
ulint n_chars;
start = reinterpret_cast<char*>(token->f_str);
next = start;
end = start + token->f_len;
n_chars = 0;
while (next < end) {
char_len = my_charlen(cs, next, end);
if (next + char_len > end || char_len == 0) {
break;
} else {
/* Skip SPACE */
if (char_len == 1 && *next == ' ') {
start = next + 1;
next = start;
n_chars = 0;
continue;
}
next += char_len;
n_chars++;
}
if (n_chars == ngram_token_size) {
fts_string_t ngram_token;
ngram_token.f_str =
reinterpret_cast<byte*>(
const_cast<char*>(start));
ngram_token.f_len = next - start;
ngram_token.f_n_char = ngram_token_size;
ib_rbt_bound_t parent;
if (rbt_search(stopwords, &parent,
&ngram_token) == 0) {
return(false);
}
/* Move a char forward */
start += my_charlen(cs, start, end);
n_chars = ngram_token_size - 1;
}
}
}
return(true);
return(token->f_n_char >= fts_min_token_size
&& token->f_n_char <= fts_max_token_size
&& (stopwords == NULL
|| rbt_search(stopwords, &parent, token) != 0));
}
/** Add the token and its start position to the token's list of positions.
@ -4499,8 +4413,7 @@ fts_add_token(
/* Ignore string whose character number is less than
"fts_min_token_size" or more than "fts_max_token_size" */
if (fts_check_token(&str, NULL, result_doc->is_ngram,
result_doc->charset)) {
if (fts_check_token(&str, NULL, result_doc->charset)) {
mem_heap_t* heap;
fts_string_t t_str;
@ -7487,7 +7400,6 @@ fts_init_recover_doc(
}
doc.charset = get_doc->index_cache->charset;
doc.is_ngram = get_doc->index_cache->index->is_ngram;
if (dfield_is_ext(dfield)) {
dict_table_t* table = cache->sync->table;

2
storage/innobase/fts/fts0que.cc

@ -2693,7 +2693,6 @@ fts_query_phrase_split(
if (fts_check_token(
&result_str,
cache->stopword_info.cached_stopword,
query->index->is_ngram,
query->fts_index_table.charset)) {
/* Add the word to the RB tree so that we can
calculate its frequency within a document. */
@ -4278,7 +4277,6 @@ fts_expand_query(
result_doc.charset = index_cache->charset;
result_doc.parser = index_cache->index->parser;
result_doc.is_ngram = index_cache->index->is_ngram;
query->total_size += SIZEOF_RBT_CREATE;

5
storage/innobase/handler/ha_innodb.cc

@ -6915,11 +6915,6 @@ ha_innobase::open(
static_cast<st_mysql_ftparser *>(
plugin_decl(parser)->info);
index->is_ngram = strncmp(
plugin_name(parser)->str,
FTS_NGRAM_PARSER_NAME,
plugin_name(parser)->length) == 0;
DBUG_EXECUTE_IF("fts_instrument_use_default_parser",
index->parser = &fts_default_parser;);
}

2
storage/innobase/handler/ha_innodb.h

@ -619,8 +619,6 @@ extern "C" void wsrep_thd_set_wsrep_last_query_id(THD *thd, query_id_t id);
extern const struct _ft_vft ft_vft_result;
#define FTS_NGRAM_PARSER_NAME "ngram"
/** Structure Returned by ha_innobase::ft_init_ext() */
typedef struct new_ft_info
{

7
storage/innobase/handler/handler0alter.cc

@ -2203,7 +2203,6 @@ innobase_create_index_def(
memset(index->fields, 0, n_fields * sizeof *index->fields);
index->parser = NULL;
index->is_ngram = false;
index->key_number = key_number;
index->n_fields = n_fields;
index->name = mem_heap_strdup(heap, key->name);
@ -2237,12 +2236,6 @@ innobase_create_index_def(
static_cast<st_mysql_ftparser*>(
plugin_decl(parser)->info);
index->is_ngram = strncmp(
plugin_name(parser)->str,
FTS_NGRAM_PARSER_NAME,
plugin_name(parser)->length)
== 0;
break;
}
}

2
storage/innobase/include/dict0mem.h

@ -918,8 +918,6 @@ struct dict_index_t{
dict_field_t* fields; /*!< array of field descriptions */
st_mysql_ftparser*
parser; /*!< fulltext parser plugin */
bool is_ngram;
/*!< true if it's ngram parser */
bool has_new_v_col;
/*!< whether it has a newly added virtual
column in ALTER */

7
storage/innobase/include/fts0priv.h

@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -215,13 +216,10 @@ fts_write_node(
fts_node_t* node) /*!< in: node columns */
MY_ATTRIBUTE((warn_unused_result));
/** Check fts token
1. for ngram token, check whether the token contains any words in stopwords
2. for non-ngram token, check if it's stopword or less than fts_min_token_size
/** Check if a fts token is a stopword or less than fts_min_token_size
or greater than fts_max_token_size.
@param[in] token token string
@param[in] stopwords stopwords rb tree
@param[in] is_ngram is ngram parser
@param[in] cs token charset
@retval true if it is not stopword and length in range
@retval false if it is stopword or length not in range */
@ -229,7 +227,6 @@ bool
fts_check_token(
const fts_string_t* token,
const ib_rbt_t* stopwords,
bool is_ngram,
const CHARSET_INFO* cs);
/******************************************************************//**

4
storage/innobase/include/fts0types.h

@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, MariaDB Corporation. All Rights Reserved.
Copyright (c) 2017, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -277,8 +277,6 @@ struct fts_doc_t {
st_mysql_ftparser* parser; /*!< fts plugin parser */
bool is_ngram; /*!< Whether it is a ngram parser */
ib_rbt_t* stopwords; /*!< Stopwords */
};

1
storage/innobase/include/row0merge.h

@ -129,7 +129,6 @@ struct index_def_t {
index_field_t* fields; /*!< field definitions */
st_mysql_ftparser*
parser; /*!< fulltext parser plugin */
bool is_ngram; /*!< true if it's ngram parser */
};
/** Structure for reporting duplicate records. */

7
storage/innobase/row/row0ftsort.cc

@ -94,7 +94,6 @@ row_merge_create_fts_sort_index(
new_index->n_def = FTS_NUM_FIELDS_SORT;
new_index->cached = TRUE;
new_index->parser = index->parser;
new_index->is_ngram = index->is_ngram;
idx_field = dict_index_get_nth_field(index, 0);
charset = fts_index_get_charset(index);
@ -515,7 +514,6 @@ row_merge_fts_doc_tokenize(
ulint data_size[FTS_NUM_AUX_INDEX];
ulint n_tuple[FTS_NUM_AUX_INDEX];
st_mysql_ftparser* parser;
bool is_ngram;
t_str.f_n_char = 0;
t_ctx->buf_used = 0;
@ -524,7 +522,6 @@ row_merge_fts_doc_tokenize(
memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
parser = sort_buf[0]->index->parser;
is_ngram = sort_buf[0]->index->is_ngram;
/* Tokenize the data and add each word string, its corresponding
doc id and position to sort buffer */
@ -570,7 +567,7 @@ row_merge_fts_doc_tokenize(
/* Ignore string whose character number is less than
"fts_min_token_size" or more than "fts_max_token_size" */
if (!fts_check_token(&str, NULL, is_ngram, NULL)) {
if (!fts_check_token(&str, NULL, NULL)) {
if (parser != NULL) {
UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
ut_free(fts_token);
@ -589,7 +586,7 @@ row_merge_fts_doc_tokenize(
/* if "cached_stopword" is defined, ignore words in the
stopword list */
if (!fts_check_token(&str, t_ctx->cached_stopword, is_ngram,
if (!fts_check_token(&str, t_ctx->cached_stopword,
doc->charset)) {
if (parser != NULL) {
UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);

1
storage/innobase/row/row0merge.cc

@ -4504,7 +4504,6 @@ row_merge_create_index(
ut_a(index);
index->parser = index_def->parser;
index->is_ngram = index_def->is_ngram;
index->has_new_v_col = has_new_v_col;
/* Note the id of the transaction that created this

Loading…
Cancel
Save