Remove dict_index_t::is_ngram

When MySQL 5.7 introduced fulltext parser plugins to InnoDB, it hard-coded the plugin name "ngram" to mean something special. Because -fsanitize=undefined was warning that the value assigned in row_merge_create_index() is out of range for a Boolean, we remove this code, which was not intended to be used in MariaDB 10.2.

fts_check_token(): Remove the special logic for N-gram tokens.
9 years ago · 73deafbc17
11 changed files with 12 additions and 128 deletions
--- a/storage/innobase/fts/fts0fts.cc
+++ b/storage/innobase/fts/fts0fts.cc
@ -1200,7 +1200,6 @@ fts_tokenizer_word_get(
 	/* If it is a stopword, do not index it */
 	if (!fts_check_token(text,
 		    cache->stopword_info.cached_stopword,
-		    index_cache->index->is_ngram,
 		    index_cache->charset)) {

 		return(NULL);
@ -3241,7 +3240,6 @@ fts_query_expansion_fetch_doc(
 		}

 		doc.charset = doc_charset;
-		doc.is_ngram = result_doc->is_ngram;

 		if (dfield_is_ext(dfield)) {
 			/* We ignore columns that are stored externally, this
@ -3347,7 +3345,6 @@ fts_fetch_doc_from_rec(

 		doc->found = TRUE;
 		doc->charset = get_doc->index_cache->charset;
-		doc->is_ngram = index->is_ngram;

 		/* Null Field */
 		if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) {
@ -4379,13 +4376,10 @@ fts_sync_table(
 	return(err);
 }

-/** Check fts token
-1. for ngram token, check whether the token contains any words in stopwords
-2. for non-ngram token, check if it's stopword or less than fts_min_token_size
+/** Check if a fts token is a stopword or less than fts_min_token_size
 or greater than fts_max_token_size.
@param[in]	token		token string
@param[in]	stopwords	stopwords rb tree
-@param[in]	is_ngram	is ngram parser
@param[in]	cs		token charset
@retval	true	if it is not stopword and length in range
@retval	false	if it is stopword or length not in range */
@ -4393,96 +4387,16 @@ bool
 fts_check_token(
 	const fts_string_t*		token,
 	const ib_rbt_t*			stopwords,
-	bool				is_ngram,
 	const CHARSET_INFO*		cs)
 {
 	ut_ad(cs != NULL || stopwords == NULL);

-	if (!is_ngram) {
-		ib_rbt_bound_t  parent;
+	ib_rbt_bound_t  parent;

-		if (token->f_n_char < fts_min_token_size
-		    || token->f_n_char > fts_max_token_size
-		    || (stopwords != NULL
-			&& rbt_search(stopwords, &parent, token) == 0)) {
-			return(false);
-		} else {
-			return(true);
-		}
-	}
-
-	/* Check token for ngram. */
-	DBUG_EXECUTE_IF(
-		"fts_instrument_ignore_ngram_check",
-		return(true);
-	);
-
-	/* We ignore fts_min_token_size when ngram */
-	ut_ad(token->f_n_char > 0
-	      && token->f_n_char <= fts_max_token_size);
-
-	if (stopwords == NULL) {
-		return(true);
-	}
-
-	/*Ngram checks whether the token contains any words in stopwords.
-	We can't simply use CONTAIN to search in stopwords, because it's
-	built on COMPARE. So we need to tokenize the token into words
-	from unigram to f_n_char, and check them separately. */
-	for (ulint ngram_token_size = 1; ngram_token_size <= token->f_n_char;
-	     ngram_token_size ++) {
-		const char*	start;
-		const char*	next;
-		const char*	end;
-		ulint		char_len;
-		ulint		n_chars;
-
-		start = reinterpret_cast<char*>(token->f_str);
-		next = start;
-		end = start + token->f_len;
-		n_chars = 0;
-
-		while (next < end) {
-			char_len = my_charlen(cs, next, end);
-
-			if (next + char_len > end || char_len == 0) {
-				break;
-			} else {
-				/* Skip SPACE */
-				if (char_len == 1 && *next == ' ') {
-					start = next + 1;
-					next = start;
-					n_chars = 0;
-
-					continue;
-				}
-
-				next += char_len;
-				n_chars++;
-			}
-
-			if (n_chars == ngram_token_size) {
-				fts_string_t	ngram_token;
-				ngram_token.f_str =
-					reinterpret_cast<byte*>(
-					const_cast<char*>(start));
-				ngram_token.f_len = next - start;
-				ngram_token.f_n_char = ngram_token_size;
-
-				ib_rbt_bound_t  parent;
-				if (rbt_search(stopwords, &parent,
-					       &ngram_token) == 0) {
-					return(false);
-				}
-
-				/* Move a char forward */
-				start += my_charlen(cs, start, end);
-				n_chars = ngram_token_size - 1;
-			}
-		}
-	}
-
-	return(true);
+	return(token->f_n_char >= fts_min_token_size
+	       && token->f_n_char <= fts_max_token_size
+	       && (stopwords == NULL
+		   || rbt_search(stopwords, &parent, token) != 0));
 }

 /** Add the token and its start position to the token's list of positions.
@ -4499,8 +4413,7 @@ fts_add_token(
 	/* Ignore string whose character number is less than
 	"fts_min_token_size" or more than "fts_max_token_size" */

-	if (fts_check_token(&str, NULL, result_doc->is_ngram,
-			    result_doc->charset)) {
+	if (fts_check_token(&str, NULL, result_doc->charset)) {

 		mem_heap_t*	heap;
 		fts_string_t	t_str;
@ -7487,7 +7400,6 @@ fts_init_recover_doc(
 		}

 		doc.charset = get_doc->index_cache->charset;
-		doc.is_ngram = get_doc->index_cache->index->is_ngram;

 		if (dfield_is_ext(dfield)) {
 			dict_table_t*	table = cache->sync->table;
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@ -2693,7 +2693,6 @@ fts_query_phrase_split(
 		if (fts_check_token(
 			   &result_str,
 			   cache->stopword_info.cached_stopword,
-			   query->index->is_ngram,
 			   query->fts_index_table.charset)) {
 			/* Add the word to the RB tree so that we can
 			calculate its frequency within a document. */
@ -4278,7 +4277,6 @@ fts_expand_query(

 	result_doc.charset = index_cache->charset;
 	result_doc.parser = index_cache->index->parser;
-	result_doc.is_ngram = index_cache->index->is_ngram;

 	query->total_size += SIZEOF_RBT_CREATE;

--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@ -6915,11 +6915,6 @@ ha_innobase::open(
 				static_cast<st_mysql_ftparser *>(
 					plugin_decl(parser)->info);

-			index->is_ngram = strncmp(
-				plugin_name(parser)->str,
-				FTS_NGRAM_PARSER_NAME,
-				plugin_name(parser)->length) == 0;
-
 			DBUG_EXECUTE_IF("fts_instrument_use_default_parser",
 				index->parser = &fts_default_parser;);
 		}
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@ -619,8 +619,6 @@ extern "C" void wsrep_thd_set_wsrep_last_query_id(THD *thd, query_id_t id);

 extern const struct _ft_vft ft_vft_result;

-#define FTS_NGRAM_PARSER_NAME "ngram"
-
 /** Structure Returned by ha_innobase::ft_init_ext() */
 typedef struct new_ft_info
 {
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@ -2203,7 +2203,6 @@ innobase_create_index_def(
 	memset(index->fields, 0, n_fields * sizeof *index->fields);

 	index->parser = NULL;
-	index->is_ngram = false;
 	index->key_number = key_number;
 	index->n_fields = n_fields;
 	index->name = mem_heap_strdup(heap, key->name);
@ -2237,12 +2236,6 @@ innobase_create_index_def(
 						static_cast<st_mysql_ftparser*>(
 						plugin_decl(parser)->info);

-					index->is_ngram = strncmp(
-						plugin_name(parser)->str,
-						FTS_NGRAM_PARSER_NAME,
-						plugin_name(parser)->length)
-						 == 0;
-
 					break;
 				}
 			}
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@ -918,8 +918,6 @@ struct dict_index_t{
 	dict_field_t*	fields;	/*!< array of field descriptions */
 	st_mysql_ftparser*
 			parser;	/*!< fulltext parser plugin */
-	bool		is_ngram;
-				/*!< true if it's ngram parser */
 	bool		has_new_v_col;
 				/*!< whether it has a newly added virtual
 				column in ALTER */
--- a/storage/innobase/include/fts0priv.h
+++ b/storage/innobase/include/fts0priv.h
@ -1,6 +1,7 @@
 /*****************************************************************************

 Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@ -215,13 +216,10 @@ fts_write_node(
 	fts_node_t*	node)		/*!< in: node columns */
 	MY_ATTRIBUTE((warn_unused_result));

-/** Check fts token
-1. for ngram token, check whether the token contains any words in stopwords
-2. for non-ngram token, check if it's stopword or less than fts_min_token_size
+/** Check if a fts token is a stopword or less than fts_min_token_size
 or greater than fts_max_token_size.
@param[in]	token		token string
@param[in]	stopwords	stopwords rb tree
-@param[in]	is_ngram	is ngram parser
@param[in]	cs		token charset
@retval true	if it is not stopword and length in range
@retval false	if it is stopword or length not in range */
@ -229,7 +227,6 @@ bool
 fts_check_token(
 	const fts_string_t*	token,
 	const ib_rbt_t*		stopwords,
-	bool			is_ngram,
 	const CHARSET_INFO*	cs);

 /******************************************************************//**
--- a/storage/innobase/include/fts0types.h
+++ b/storage/innobase/include/fts0types.h
@ -1,7 +1,7 @@
 /*****************************************************************************

 Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@ -277,8 +277,6 @@ struct fts_doc_t {

 	st_mysql_ftparser* parser;	/*!< fts plugin parser */

-	bool		is_ngram;	/*!< Whether it is a ngram parser */
-
 	ib_rbt_t*	stopwords;	/*!< Stopwords */
 };

--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@ -129,7 +129,6 @@ struct index_def_t {
 	index_field_t*	fields;		/*!< field definitions */
 	st_mysql_ftparser*
 			parser;		/*!< fulltext parser plugin */
-	bool		is_ngram;	/*!< true if it's ngram parser */
 };

 /** Structure for reporting duplicate records. */
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@ -94,7 +94,6 @@ row_merge_create_fts_sort_index(
 	new_index->n_def = FTS_NUM_FIELDS_SORT;
 	new_index->cached = TRUE;
 	new_index->parser = index->parser;
-	new_index->is_ngram = index->is_ngram;

 	idx_field = dict_index_get_nth_field(index, 0);
 	charset = fts_index_get_charset(index);
@ -515,7 +514,6 @@ row_merge_fts_doc_tokenize(
 	ulint		data_size[FTS_NUM_AUX_INDEX];
 	ulint		n_tuple[FTS_NUM_AUX_INDEX];
 	st_mysql_ftparser*	parser;
-	bool			is_ngram;

 	t_str.f_n_char = 0;
 	t_ctx->buf_used = 0;
@ -524,7 +522,6 @@ row_merge_fts_doc_tokenize(
 	memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));

 	parser = sort_buf[0]->index->parser;
-	is_ngram = sort_buf[0]->index->is_ngram;

 	/* Tokenize the data and add each word string, its corresponding
 	doc id and position to sort buffer */
@ -570,7 +567,7 @@ row_merge_fts_doc_tokenize(

 		/* Ignore string whose character number is less than
 		"fts_min_token_size" or more than "fts_max_token_size" */
-		if (!fts_check_token(&str, NULL, is_ngram, NULL)) {
+		if (!fts_check_token(&str, NULL, NULL)) {
 			if (parser != NULL) {
 				UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
 				ut_free(fts_token);
@ -589,7 +586,7 @@ row_merge_fts_doc_tokenize(

 		/* if "cached_stopword" is defined, ignore words in the
 		stopword list */
-		if (!fts_check_token(&str, t_ctx->cached_stopword, is_ngram,
+		if (!fts_check_token(&str, t_ctx->cached_stopword,
 				     doc->charset)) {
 			if (parser != NULL) {
 				UT_LIST_REMOVE(t_ctx->fts_token_list, fts_token);
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@ -4504,7 +4504,6 @@ row_merge_create_index(
 		ut_a(index);

 		index->parser = index_def->parser;
-		index->is_ngram = index_def->is_ngram;
 		index->has_new_v_col = has_new_v_col;

 		/* Note the id of the transaction that created this