From d081cd67323e0b9f25f9fd5e6e69b9f33c5c8452 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Wed, 29 May 2013 13:35:50 -0400 Subject: [PATCH] strip carriage returns --- storage/tokudb/hatoku_cmp.cc | 6312 ++++++++++++++++----------------- storage/tokudb/hatoku_hton.cc | 4996 +++++++++++++------------- 2 files changed, 5654 insertions(+), 5654 deletions(-) diff --git a/storage/tokudb/hatoku_cmp.cc b/storage/tokudb/hatoku_cmp.cc index 18e43e301b5..dd41a93d5a0 100644 --- a/storage/tokudb/hatoku_cmp.cc +++ b/storage/tokudb/hatoku_cmp.cc @@ -87,3159 +87,3159 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include "hatoku_cmp.h" - -#ifdef WORDS_BIGENDIAN -#error "WORDS_BIGENDIAN not supported" -#endif - -void get_var_field_info( - uint32_t* field_len, // output: length of field - uint32_t* start_offset, // output, length of offset where data starts - uint32_t var_field_index, //input, index of var field we want info on - const uchar* var_field_offset_ptr, //input, pointer to where offset information for all var fields begins - uint32_t num_offset_bytes //input, number of bytes used to store offsets starting at var_field_offset_ptr - ) -{ - uint32_t data_start_offset = 0; - uint32_t data_end_offset = 0; - switch (num_offset_bytes) { - case (1): - data_end_offset = (var_field_offset_ptr + var_field_index)[0]; - break; - case (2): - data_end_offset = uint2korr(var_field_offset_ptr + 2*var_field_index); - break; - default: - assert(false); - break; - } - - if (var_field_index) { - switch (num_offset_bytes) { - case (1): - data_start_offset = (var_field_offset_ptr + var_field_index - 1)[0]; - break; - case (2): - 
data_start_offset = uint2korr(var_field_offset_ptr + 2*(var_field_index-1)); - break; - default: - assert(false); - break; - } - } - else { - data_start_offset = 0; - } - - *start_offset = data_start_offset; - assert(data_end_offset >= data_start_offset); - *field_len = data_end_offset - data_start_offset; -} - -void get_blob_field_info( - uint32_t* start_offset, - uint32_t len_of_offsets, - const uchar* var_field_data_ptr, - uint32_t num_offset_bytes - ) -{ - uint32_t data_end_offset; - // - // need to set var_field_data_ptr to point to beginning of blobs, which - // is at the end of the var stuff (if they exist), if var stuff does not exist - // then the bottom variable will be 0, and var_field_data_ptr is already - // set correctly - // - if (len_of_offsets) { - switch (num_offset_bytes) { - case (1): - data_end_offset = (var_field_data_ptr - 1)[0]; - break; - case (2): - data_end_offset = uint2korr(var_field_data_ptr - 2); - break; - default: - assert(false); - break; - } - } - else { - data_end_offset = 0; - } - *start_offset = data_end_offset; -} - - -// this function is pattern matched from -// InnoDB's get_innobase_type_from_mysql_type -TOKU_TYPE mysql_to_toku_type (Field* field) { - TOKU_TYPE ret_val = toku_type_unknown; - enum_field_types mysql_type = field->real_type(); - switch (mysql_type) { - case MYSQL_TYPE_LONG: - case MYSQL_TYPE_LONGLONG: - case MYSQL_TYPE_TINY: - case MYSQL_TYPE_SHORT: - case MYSQL_TYPE_INT24: - case MYSQL_TYPE_DATE: - case MYSQL_TYPE_YEAR: - case MYSQL_TYPE_NEWDATE: - case MYSQL_TYPE_ENUM: - case MYSQL_TYPE_SET: - ret_val = toku_type_int; - goto exit; - case MYSQL_TYPE_TIME: - case MYSQL_TYPE_DATETIME: - case MYSQL_TYPE_TIMESTAMP: -#ifdef MARIADB_BASE_VERSION - // case to handle fractional seconds in MariaDB - // - if (field->key_type() == HA_KEYTYPE_BINARY) { - ret_val = toku_type_fixbinary; - goto exit; - } -#endif - ret_val = toku_type_int; - goto exit; - case MYSQL_TYPE_DOUBLE: - ret_val = toku_type_double; - goto exit; - 
case MYSQL_TYPE_FLOAT: - ret_val = toku_type_float; - goto exit; -#if 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699 - case MYSQL_TYPE_DATETIME2: - case MYSQL_TYPE_TIMESTAMP2: - case MYSQL_TYPE_TIME2: -#endif - case MYSQL_TYPE_NEWDECIMAL: - case MYSQL_TYPE_BIT: - ret_val = toku_type_fixbinary; - goto exit; - case MYSQL_TYPE_STRING: - if (field->binary()) { - ret_val = toku_type_fixbinary; - } - else { - ret_val = toku_type_fixstring; - } - goto exit; - case MYSQL_TYPE_VARCHAR: - if (field->binary()) { - ret_val = toku_type_varbinary; - } - else { - ret_val = toku_type_varstring; - } - goto exit; - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - ret_val = toku_type_blob; - goto exit; - // - // I believe these are old types that are no longer - // in any 5.1 tables, so tokudb does not need - // to worry about them - // Putting in this assert in case I am wrong. - // Do not support geometry yet. - // - case MYSQL_TYPE_GEOMETRY: - case MYSQL_TYPE_DECIMAL: - case MYSQL_TYPE_VAR_STRING: - case MYSQL_TYPE_NULL: - assert(false); - } -exit: - return ret_val; -} - - -static inline CHARSET_INFO* get_charset_from_num (uint32_t charset_number) { - // - // patternmatched off of InnoDB, due to MySQL bug 42649 - // - if (charset_number == default_charset_info->number) { - return default_charset_info; - } - else if (charset_number == my_charset_latin1.number) { - return &my_charset_latin1; - } - else { - return get_charset(charset_number, MYF(MY_WME)); - } -} - - - -// -// used to read the length of a variable sized field in a tokudb key (buf). 
-// -static inline uint32_t get_length_from_var_tokudata (uchar* buf, uint32_t length_bytes) { - uint32_t length = (uint32_t)(buf[0]); - if (length_bytes == 2) { - uint32_t rest_of_length = (uint32_t)buf[1]; - length += rest_of_length<<8; - } - return length; -} - -// -// used to deduce the number of bytes used to store the length of a varstring/varbinary -// in a key field stored in tokudb -// -static inline uint32_t get_length_bytes_from_max(uint32_t max_num_bytes) { - return (max_num_bytes > 255) ? 2 : 1; -} - - - -// -// assuming MySQL in little endian, and we are storing in little endian -// -static inline uchar* pack_toku_int (uchar* to_tokudb, uchar* from_mysql, uint32_t num_bytes) { - switch (num_bytes) { - case (1): - memcpy(to_tokudb, from_mysql, 1); - break; - case (2): - memcpy(to_tokudb, from_mysql, 2); - break; - case (3): - memcpy(to_tokudb, from_mysql, 3); - break; - case (4): - memcpy(to_tokudb, from_mysql, 4); - break; - case (8): - memcpy(to_tokudb, from_mysql, 8); - break; - default: - assert(false); - } - return to_tokudb+num_bytes; -} - -// -// assuming MySQL in little endian, and we are unpacking to little endian -// -static inline uchar* unpack_toku_int(uchar* to_mysql, uchar* from_tokudb, uint32_t num_bytes) { - switch (num_bytes) { - case (1): - memcpy(to_mysql, from_tokudb, 1); - break; - case (2): - memcpy(to_mysql, from_tokudb, 2); - break; - case (3): - memcpy(to_mysql, from_tokudb, 3); - break; - case (4): - memcpy(to_mysql, from_tokudb, 4); - break; - case (8): - memcpy(to_mysql, from_tokudb, 8); - break; - default: - assert(false); - } - return from_tokudb+num_bytes; -} - -static inline int cmp_toku_int (uchar* a_buf, uchar* b_buf, bool is_unsigned, uint32_t num_bytes) { - int ret_val = 0; - // - // case for unsigned integers - // - if (is_unsigned) { - uint32_t a_num, b_num = 0; - uint64_t a_big_num, b_big_num = 0; - switch (num_bytes) { - case (1): - a_num = *a_buf; - b_num = *b_buf; - ret_val = a_num-b_num; - goto exit; - case 
(2): - a_num = uint2korr(a_buf); - b_num = uint2korr(b_buf); - ret_val = a_num-b_num; - goto exit; - case (3): - a_num = uint3korr(a_buf); - b_num = uint3korr(b_buf); - ret_val = a_num-b_num; - goto exit; - case (4): - a_num = uint4korr(a_buf); - b_num = uint4korr(b_buf); - if (a_num < b_num) { - ret_val = -1; goto exit; - } - if (a_num > b_num) { - ret_val = 1; goto exit; - } - ret_val = 0; - goto exit; - case (8): - a_big_num = uint8korr(a_buf); - b_big_num = uint8korr(b_buf); - if (a_big_num < b_big_num) { - ret_val = -1; goto exit; - } - else if (a_big_num > b_big_num) { - ret_val = 1; goto exit; - } - ret_val = 0; - goto exit; - default: - assert(false); - } - } - // - // case for signed integers - // - else { - int32_t a_num, b_num = 0; - int64_t a_big_num, b_big_num = 0; - switch (num_bytes) { - case (1): - a_num = *(signed char *)a_buf; - b_num = *(signed char *)b_buf; - ret_val = a_num-b_num; - goto exit; - case (2): - a_num = sint2korr(a_buf); - b_num = sint2korr(b_buf); - ret_val = a_num-b_num; - goto exit; - case (3): - a_num = sint3korr(a_buf); - b_num = sint3korr(b_buf); - ret_val = a_num - b_num; - goto exit; - case (4): - a_num = sint4korr(a_buf); - b_num = sint4korr(b_buf); - if (a_num < b_num) { - ret_val = -1; goto exit; - } - if (a_num > b_num) { - ret_val = 1; goto exit; - } - ret_val = 0; - goto exit; - case (8): - a_big_num = sint8korr(a_buf); - b_big_num = sint8korr(b_buf); - if (a_big_num < b_big_num) { - ret_val = -1; goto exit; - } - else if (a_big_num > b_big_num) { - ret_val = 1; goto exit; - } - ret_val = 0; - goto exit; - default: - assert(false); - } - } - // - // if this is hit, indicates bug in writing of this function - // - assert(false); -exit: - return ret_val; -} - -static inline uchar* pack_toku_double (uchar* to_tokudb, uchar* from_mysql) { - memcpy(to_tokudb, from_mysql, sizeof(double)); - return to_tokudb + sizeof(double); -} - - -static inline uchar* unpack_toku_double(uchar* to_mysql, uchar* from_tokudb) { - 
memcpy(to_mysql, from_tokudb, sizeof(double)); - return from_tokudb + sizeof(double); -} - -static inline int cmp_toku_double(uchar* a_buf, uchar* b_buf) { - int ret_val; - double a_num; - double b_num; - doubleget(a_num, a_buf); - doubleget(b_num, b_buf); - if (a_num < b_num) { - ret_val = -1; - goto exit; - } - else if (a_num > b_num) { - ret_val = 1; - goto exit; - } - ret_val = 0; -exit: - return ret_val; -} - - -static inline uchar* pack_toku_float (uchar* to_tokudb, uchar* from_mysql) { - memcpy(to_tokudb, from_mysql, sizeof(float)); - return to_tokudb + sizeof(float); -} - - -static inline uchar* unpack_toku_float(uchar* to_mysql, uchar* from_tokudb) { - memcpy(to_mysql, from_tokudb, sizeof(float)); - return from_tokudb + sizeof(float); -} - -static inline int cmp_toku_float(uchar* a_buf, uchar* b_buf) { - int ret_val; - float a_num; - float b_num; - // - // This is the way Field_float::cmp gets the floats from the buffers - // - memcpy(&a_num, a_buf, sizeof(float)); - memcpy(&b_num, b_buf, sizeof(float)); - if (a_num < b_num) { - ret_val = -1; - goto exit; - } - else if (a_num > b_num) { - ret_val = 1; - goto exit; - } - ret_val = 0; -exit: - return ret_val; -} - - -static inline uchar* pack_toku_binary(uchar* to_tokudb, uchar* from_mysql, uint32_t num_bytes) { - memcpy(to_tokudb, from_mysql, num_bytes); - return to_tokudb + num_bytes; -} - -static inline uchar* unpack_toku_binary(uchar* to_mysql, uchar* from_tokudb, uint32_t num_bytes) { - memcpy(to_mysql, from_tokudb, num_bytes); - return from_tokudb + num_bytes; -} - - -static inline int cmp_toku_binary( - uchar* a_buf, - uint32_t a_num_bytes, - uchar* b_buf, - uint32_t b_num_bytes - ) -{ - int ret_val = 0; - uint32_t num_bytes_to_cmp = (a_num_bytes < b_num_bytes) ? 
a_num_bytes : b_num_bytes; - ret_val = memcmp(a_buf, b_buf, num_bytes_to_cmp); - if ((ret_val != 0) || (a_num_bytes == b_num_bytes)) { - goto exit; - } - if (a_num_bytes < b_num_bytes) { - ret_val = -1; - goto exit; - } - else { - ret_val = 1; - goto exit; - } -exit: - return ret_val; -} - -// -// partially copied from below -// -uchar* pack_toku_varbinary_from_desc( - uchar* to_tokudb, - const uchar* from_desc, - uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb - uint32_t field_length //length of field - ) -{ - uint32_t length_bytes_in_tokudb = get_length_bytes_from_max(key_part_length); - uint32_t length = field_length; - set_if_smaller(length, key_part_length); - - // - // copy the length bytes, assuming both are in little endian - // - to_tokudb[0] = (uchar)length & 255; - if (length_bytes_in_tokudb > 1) { - to_tokudb[1] = (uchar) (length >> 8); - } - // - // copy the string - // - memcpy(to_tokudb + length_bytes_in_tokudb, from_desc, length); - return to_tokudb + length + length_bytes_in_tokudb; -} - -static inline uchar* pack_toku_varbinary( - uchar* to_tokudb, - uchar* from_mysql, - uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql - uint32_t max_num_bytes - ) -{ - uint32_t length = 0; - uint32_t length_bytes_in_tokudb; - switch (length_bytes_in_mysql) { - case (0): - length = max_num_bytes; - break; - case (1): - length = (uint32_t)(*from_mysql); - break; - case (2): - length = uint2korr(from_mysql); - break; - case (3): - length = uint3korr(from_mysql); - break; - case (4): - length = uint4korr(from_mysql); - break; - } - - // - // from this point on, functionality equivalent to pack_toku_varbinary_from_desc - // - set_if_smaller(length,max_num_bytes); - - length_bytes_in_tokudb = get_length_bytes_from_max(max_num_bytes); - // - // copy the length bytes, assuming both are in little endian - // - to_tokudb[0] = (uchar)length & 255; - if (length_bytes_in_tokudb > 1) { - to_tokudb[1] 
= (uchar) (length >> 8); - } - // - // copy the string - // - memcpy(to_tokudb + length_bytes_in_tokudb, from_mysql + length_bytes_in_mysql, length); - return to_tokudb + length + length_bytes_in_tokudb; -} - -static inline uchar* unpack_toku_varbinary( - uchar* to_mysql, - uchar* from_tokudb, - uint32_t length_bytes_in_tokudb, // number of bytes used to encode length in from_tokudb - uint32_t length_bytes_in_mysql // number of bytes used to encode length in to_mysql - ) -{ - uint32_t length = get_length_from_var_tokudata(from_tokudb, length_bytes_in_tokudb); - - // - // copy the length into the mysql buffer - // - switch (length_bytes_in_mysql) { - case (0): - break; - case (1): - *to_mysql = (uchar) length; - break; - case (2): - int2store(to_mysql, length); - break; - case (3): - int3store(to_mysql, length); - break; - case (4): - int4store(to_mysql, length); - break; - default: - assert(false); - } - // - // copy the binary data - // - memcpy(to_mysql + length_bytes_in_mysql, from_tokudb + length_bytes_in_tokudb, length); - return from_tokudb + length_bytes_in_tokudb+ length; -} - -static inline int cmp_toku_varbinary( - uchar* a_buf, - uchar* b_buf, - uint32_t length_bytes, //number of bytes used to encode length in a_buf and b_buf - uint32_t* a_bytes_read, - uint32_t* b_bytes_read - ) -{ - int ret_val = 0; - uint32_t a_len = get_length_from_var_tokudata(a_buf, length_bytes); - uint32_t b_len = get_length_from_var_tokudata(b_buf, length_bytes); - ret_val = cmp_toku_binary( - a_buf + length_bytes, - a_len, - b_buf + length_bytes, - b_len - ); - *a_bytes_read = a_len + length_bytes; - *b_bytes_read = b_len + length_bytes; - return ret_val; -} - -static inline uchar* pack_toku_blob( - uchar* to_tokudb, - uchar* from_mysql, - uint32_t length_bytes_in_tokudb, //number of bytes to use to encode the length in to_tokudb - uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql - uint32_t max_num_bytes, -#if MYSQL_VERSION_ID >= 50600 - 
const CHARSET_INFO* charset -#else - CHARSET_INFO* charset -#endif - ) -{ - uint32_t length = 0; - uint32_t local_char_length = 0; - uchar* blob_buf = NULL; - - switch (length_bytes_in_mysql) { - case (0): - length = max_num_bytes; - break; - case (1): - length = (uint32_t)(*from_mysql); - break; - case (2): - length = uint2korr(from_mysql); - break; - case (3): - length = uint3korr(from_mysql); - break; - case (4): - length = uint4korr(from_mysql); - break; - } - set_if_smaller(length,max_num_bytes); - - memcpy(&blob_buf,from_mysql+length_bytes_in_mysql,sizeof(uchar *)); - - local_char_length= ((charset->mbmaxlen > 1) ? - max_num_bytes/charset->mbmaxlen : max_num_bytes); - if (length > local_char_length) - { - local_char_length= my_charpos( - charset, - blob_buf, - blob_buf+length, - local_char_length - ); - set_if_smaller(length, local_char_length); - } - - - // - // copy the length bytes, assuming both are in little endian - // - to_tokudb[0] = (uchar)length & 255; - if (length_bytes_in_tokudb > 1) { - to_tokudb[1] = (uchar) (length >> 8); - } - // - // copy the string - // - memcpy(to_tokudb + length_bytes_in_tokudb, blob_buf, length); - return to_tokudb + length + length_bytes_in_tokudb; -} - - -static inline uchar* unpack_toku_blob( - uchar* to_mysql, - uchar* from_tokudb, - uint32_t length_bytes_in_tokudb, // number of bytes used to encode length in from_tokudb - uint32_t length_bytes_in_mysql // number of bytes used to encode length in to_mysql - ) -{ - uint32_t length = get_length_from_var_tokudata(from_tokudb, length_bytes_in_tokudb); - uchar* blob_pos = NULL; - // - // copy the length into the mysql buffer - // - switch (length_bytes_in_mysql) { - case (0): - break; - case (1): - *to_mysql = (uchar) length; - break; - case (2): - int2store(to_mysql, length); - break; - case (3): - int3store(to_mysql, length); - break; - case (4): - int4store(to_mysql, length); - break; - default: - assert(false); - } - // - // copy the binary data - // - blob_pos = 
from_tokudb + length_bytes_in_tokudb; - memcpy(to_mysql + length_bytes_in_mysql, &blob_pos, sizeof(uchar *)); - return from_tokudb + length_bytes_in_tokudb+ length; -} - - -// -// partially copied from below -// -uchar* pack_toku_varstring_from_desc( - uchar* to_tokudb, - const uchar* from_desc, - uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb - uint32_t field_length, - uint32_t charset_num//length of field - ) -{ - CHARSET_INFO* charset = NULL; - uint32_t length_bytes_in_tokudb = get_length_bytes_from_max(key_part_length); - uint32_t length = field_length; - uint32_t local_char_length = 0; - set_if_smaller(length, key_part_length); - - charset = get_charset_from_num(charset_num); - - // - // copy the string - // - local_char_length= ((charset->mbmaxlen > 1) ? - key_part_length/charset->mbmaxlen : key_part_length); - if (length > local_char_length) - { - local_char_length= my_charpos( - charset, - from_desc, - from_desc+length, - local_char_length - ); - set_if_smaller(length, local_char_length); - } - - - // - // copy the length bytes, assuming both are in little endian - // - to_tokudb[0] = (uchar)length & 255; - if (length_bytes_in_tokudb > 1) { - to_tokudb[1] = (uchar) (length >> 8); - } - // - // copy the string - // - memcpy(to_tokudb + length_bytes_in_tokudb, from_desc, length); - return to_tokudb + length + length_bytes_in_tokudb; -} - -static inline uchar* pack_toku_varstring( - uchar* to_tokudb, - uchar* from_mysql, - uint32_t length_bytes_in_tokudb, //number of bytes to use to encode the length in to_tokudb - uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql - uint32_t max_num_bytes, -#if MYSQL_VERSION_ID >= 50600 - const CHARSET_INFO *charset -#else - CHARSET_INFO* charset -#endif - ) -{ - uint32_t length = 0; - uint32_t local_char_length = 0; - - switch (length_bytes_in_mysql) { - case (0): - length = max_num_bytes; - break; - case (1): - length = (uint32_t)(*from_mysql); - 
break; - case (2): - length = uint2korr(from_mysql); - break; - case (3): - length = uint3korr(from_mysql); - break; - case (4): - length = uint4korr(from_mysql); - break; - } - set_if_smaller(length,max_num_bytes); - - local_char_length= ((charset->mbmaxlen > 1) ? - max_num_bytes/charset->mbmaxlen : max_num_bytes); - if (length > local_char_length) - { - local_char_length= my_charpos( - charset, - from_mysql+length_bytes_in_mysql, - from_mysql+length_bytes_in_mysql+length, - local_char_length - ); - set_if_smaller(length, local_char_length); - } - - - // - // copy the length bytes, assuming both are in little endian - // - to_tokudb[0] = (uchar)length & 255; - if (length_bytes_in_tokudb > 1) { - to_tokudb[1] = (uchar) (length >> 8); - } - // - // copy the string - // - memcpy(to_tokudb + length_bytes_in_tokudb, from_mysql + length_bytes_in_mysql, length); - return to_tokudb + length + length_bytes_in_tokudb; -} - -static inline int cmp_toku_string( - uchar* a_buf, - uint32_t a_num_bytes, - uchar* b_buf, - uint32_t b_num_bytes, - uint32_t charset_number - ) -{ - int ret_val = 0; - CHARSET_INFO* charset = NULL; - - charset = get_charset_from_num(charset_number); - - ret_val = charset->coll->strnncollsp( - charset, - a_buf, - a_num_bytes, - b_buf, - b_num_bytes, - 0 - ); - return ret_val; -} - -static inline int cmp_toku_varstring( - uchar* a_buf, - uchar* b_buf, - uint32_t length_bytes, //number of bytes used to encode length in a_buf and b_buf - uint32_t charset_num, - uint32_t* a_bytes_read, - uint32_t* b_bytes_read - ) -{ - int ret_val = 0; - uint32_t a_len = get_length_from_var_tokudata(a_buf, length_bytes); - uint32_t b_len = get_length_from_var_tokudata(b_buf, length_bytes); - ret_val = cmp_toku_string( - a_buf + length_bytes, - a_len, - b_buf + length_bytes, - b_len, - charset_num - ); - *a_bytes_read = a_len + length_bytes; - *b_bytes_read = b_len + length_bytes; - return ret_val; -} - -static inline int tokudb_compare_two_hidden_keys( - const void* 
new_key_data, - const uint32_t new_key_size, - const void* saved_key_data, - const uint32_t saved_key_size - ) { - assert( (new_key_size >= TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH) && (saved_key_size >= TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH) ); - ulonglong a = hpk_char_to_num((uchar *) new_key_data); - ulonglong b = hpk_char_to_num((uchar *) saved_key_data); - return a < b ? -1 : (a > b ? 1 : 0); -} - -// -// Returns number of bytes used for a given TOKU_TYPE -// in a key descriptor. The number of bytes returned -// here MUST match the number of bytes used for the encoding -// in create_toku_key_descriptor_for_key -// Parameters: -// [in] row_desc - buffer that contains portion of descriptor -// created in create_toku_key_descriptor_for_key. The first -// byte points to the TOKU_TYPE. -// -uint32_t skip_field_in_descriptor(uchar* row_desc) { - uchar* row_desc_pos = row_desc; - TOKU_TYPE toku_type = (TOKU_TYPE)row_desc_pos[0]; - row_desc_pos++; - - switch (toku_type) { - case (toku_type_hpk): - case (toku_type_double): - case (toku_type_float): - break; - case (toku_type_int): - row_desc_pos += 2; - break; - case (toku_type_fixbinary): - case (toku_type_varbinary): - row_desc_pos++; - break; - case (toku_type_fixstring): - case (toku_type_varstring): - case (toku_type_blob): - row_desc_pos++; - row_desc_pos += sizeof(uint32_t); - break; - default: - assert(false); - break; - } - return (uint32_t)(row_desc_pos - row_desc); -} - -// -// outputs a descriptor for key into buf. Returns number of bytes used in buf -// to store the descriptor. 
Number of bytes used MUST match number of bytes -// we would skip in skip_field_in_descriptor -// -int create_toku_key_descriptor_for_key(KEY* key, uchar* buf) { - uchar* pos = buf; - uint32_t num_bytes_in_field = 0; - uint32_t charset_num = 0; - for (uint i = 0; i < get_key_parts(key); i++){ - Field* field = key->key_part[i].field; - // - // The first byte states if there is a null byte - // 0 means no null byte, non-zer means there - // is one - // - *pos = field->null_bit; - pos++; - - // - // The second byte for each field is the type - // - TOKU_TYPE type = mysql_to_toku_type(field); - assert (type < 256); - *pos = (uchar)(type & 255); - pos++; - - // - // based on the type, extra data follows afterwards - // - switch (type) { - // - // two bytes follow for ints, first one states how many - // bytes the int is (1 , 2, 3, 4 or 8) - // next one states if it is signed or not - // - case (toku_type_int): - num_bytes_in_field = field->pack_length(); - assert (num_bytes_in_field < 256); - *pos = (uchar)(num_bytes_in_field & 255); - pos++; - *pos = (field->flags & UNSIGNED_FLAG) ? 
1 : 0; - pos++; - break; - // - // nothing follows floats and doubles - // - case (toku_type_double): - case (toku_type_float): - break; - // - // one byte follow stating the length of the field - // - case (toku_type_fixbinary): - num_bytes_in_field = field->pack_length(); - set_if_smaller(num_bytes_in_field, key->key_part[i].length); - assert(num_bytes_in_field < 256); - pos[0] = (uchar)(num_bytes_in_field & 255); - pos++; - break; - // - // one byte follows: the number of bytes used to encode the length - // - case (toku_type_varbinary): - *pos = (uchar)(get_length_bytes_from_max(key->key_part[i].length) & 255); - pos++; - break; - // - // five bytes follow: one for the number of bytes to encode the length, - // four for the charset number - // - case (toku_type_fixstring): - case (toku_type_varstring): - case (toku_type_blob): - *pos = (uchar)(get_length_bytes_from_max(key->key_part[i].length) & 255); - pos++; - charset_num = field->charset()->number; - pos[0] = (uchar)(charset_num & 255); - pos[1] = (uchar)((charset_num >> 8) & 255); - pos[2] = (uchar)((charset_num >> 16) & 255); - pos[3] = (uchar)((charset_num >> 24) & 255); - pos += 4; - break; - default: - assert(false); - - } - } - return pos - buf; -} - - -// -// Creates a descriptor for a DB. That contains all information necessary -// to do both key comparisons and data comparisons (for dup-sort databases). -// -// There are two types of descriptors we care about: -// 1) Primary key, (in a no-dup database) -// 2) secondary keys, which are a secondary key followed by a primary key, -// but in a no-dup database. -// -// I realize this may be confusing, but here is how it works. -// All DB's have a key compare. -// The format of the descriptor must be able to handle both. -// -// The first four bytes store an offset into the descriptor to the second piece -// used for data comparisons. So, if in the future we want to append something -// to the descriptor, we can. 
-// -// -int create_toku_key_descriptor( - uchar* buf, - bool is_first_hpk, - KEY* first_key, - bool is_second_hpk, - KEY* second_key - ) -{ - // - // The first four bytes always contain the offset of where the first key - // ends. - // - uchar* pos = buf + 4; - uint32_t num_bytes = 0; - uint32_t offset = 0; - - - if (is_first_hpk) { - pos[0] = 0; //say there is NO infinity byte - pos[1] = 0; //field cannot be NULL, stating it - pos[2] = toku_type_hpk; - pos += 3; - } - else { - // - // first key is NOT a hidden primary key, so we now pack first_key - // - pos[0] = 1; //say there is an infinity byte - pos++; - num_bytes = create_toku_key_descriptor_for_key(first_key, pos); - pos += num_bytes; - } - - // - // if we do not have a second key, we can jump to exit right now - // we do not have a second key if it is not a hidden primary key - // and if second_key is NULL - // - if (is_first_hpk || (!is_second_hpk && (second_key == NULL)) ) { - goto exit; - } - - // - // if we have a second key, and it is an hpk, we need to pack it, and - // write in the offset to this position in the first four bytes - // - if (is_second_hpk) { - pos[0] = 0; //field cannot be NULL, stating it - pos[1] = toku_type_hpk; - pos += 2; - } - else { - // - // second key is NOT a hidden primary key, so we now pack second_key - // - num_bytes = create_toku_key_descriptor_for_key(second_key, pos); - pos += num_bytes; - } - - -exit: - offset = pos - buf; - buf[0] = (uchar)(offset & 255); - buf[1] = (uchar)((offset >> 8) & 255); - buf[2] = (uchar)((offset >> 16) & 255); - buf[3] = (uchar)((offset >> 24) & 255); - - return pos - buf; -} - - -static inline int compare_toku_field( - uchar* a_buf, - uchar* b_buf, - uchar* row_desc, - uint32_t* a_bytes_read, - uint32_t* b_bytes_read, - uint32_t* row_desc_bytes_read - ) -{ - int ret_val = 0; - uchar* row_desc_pos = row_desc; - uint32_t num_bytes = 0; - uint32_t length_bytes = 0; - uint32_t charset_num = 0; - bool is_unsigned = false; - - TOKU_TYPE 
toku_type = (TOKU_TYPE)row_desc_pos[0]; - row_desc_pos++; - - switch (toku_type) { - case (toku_type_hpk): - ret_val = tokudb_compare_two_hidden_keys( - a_buf, - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH, - b_buf, - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH - ); - *a_bytes_read = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH; - *b_bytes_read = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH; - break; - case (toku_type_int): - num_bytes = row_desc_pos[0]; - is_unsigned = row_desc_pos[1]; - ret_val = cmp_toku_int( - a_buf, - b_buf, - is_unsigned, - num_bytes - ); - *a_bytes_read = num_bytes; - *b_bytes_read = num_bytes; - row_desc_pos += 2; - break; - case (toku_type_double): - ret_val = cmp_toku_double(a_buf, b_buf); - *a_bytes_read = sizeof(double); - *b_bytes_read = sizeof(double); - break; - case (toku_type_float): - ret_val = cmp_toku_float(a_buf, b_buf); - *a_bytes_read = sizeof(float); - *b_bytes_read = sizeof(float); - break; - case (toku_type_fixbinary): - num_bytes = row_desc_pos[0]; - ret_val = cmp_toku_binary(a_buf, num_bytes, b_buf,num_bytes); - *a_bytes_read = num_bytes; - *b_bytes_read = num_bytes; - row_desc_pos++; - break; - case (toku_type_varbinary): - length_bytes = row_desc_pos[0]; - ret_val = cmp_toku_varbinary( - a_buf, - b_buf, - length_bytes, - a_bytes_read, - b_bytes_read - ); - row_desc_pos++; - break; - case (toku_type_fixstring): - case (toku_type_varstring): - case (toku_type_blob): - length_bytes = row_desc_pos[0]; - row_desc_pos++; - // - // not sure we want to read charset_num like this - // - charset_num = *(uint32_t *)row_desc_pos; - row_desc_pos += sizeof(uint32_t); - ret_val = cmp_toku_varstring( - a_buf, - b_buf, - length_bytes, - charset_num, - a_bytes_read, - b_bytes_read - ); - break; - default: - assert(false); - break; - } - - *row_desc_bytes_read = row_desc_pos - row_desc; - return ret_val; -} - -// -// packs a field from a MySQL buffer into a tokudb buffer. 
-// Used for inserts/updates -// -uchar* pack_toku_key_field( - uchar* to_tokudb, - uchar* from_mysql, - Field* field, - uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff - ) -{ - uchar* new_pos = NULL; - uint32_t num_bytes = 0; - TOKU_TYPE toku_type = mysql_to_toku_type(field); - switch(toku_type) { - case (toku_type_int): - assert(key_part_length == field->pack_length()); - new_pos = pack_toku_int( - to_tokudb, - from_mysql, - field->pack_length() - ); - goto exit; - case (toku_type_double): - assert(field->pack_length() == sizeof(double)); - assert(key_part_length == sizeof(double)); - new_pos = pack_toku_double(to_tokudb, from_mysql); - goto exit; - case (toku_type_float): - assert(field->pack_length() == sizeof(float)); - assert(key_part_length == sizeof(float)); - new_pos = pack_toku_float(to_tokudb, from_mysql); - goto exit; - case (toku_type_fixbinary): - num_bytes = field->pack_length(); - set_if_smaller(num_bytes, key_part_length); - new_pos = pack_toku_binary( - to_tokudb, - from_mysql, - num_bytes - ); - goto exit; - case (toku_type_fixstring): - num_bytes = field->pack_length(); - set_if_smaller(num_bytes, key_part_length); - new_pos = pack_toku_varstring( - to_tokudb, - from_mysql, - get_length_bytes_from_max(key_part_length), - 0, - num_bytes, - field->charset() - ); - goto exit; - case (toku_type_varbinary): - new_pos = pack_toku_varbinary( - to_tokudb, - from_mysql, - ((Field_varstring *)field)->length_bytes, - key_part_length - ); - goto exit; - case (toku_type_varstring): - new_pos = pack_toku_varstring( - to_tokudb, - from_mysql, - get_length_bytes_from_max(key_part_length), - ((Field_varstring *)field)->length_bytes, - key_part_length, - field->charset() - ); - goto exit; - case (toku_type_blob): - new_pos = pack_toku_blob( - to_tokudb, - from_mysql, - get_length_bytes_from_max(key_part_length), - ((Field_blob *)field)->row_pack_length(), //only calling this because packlength is returned - 
key_part_length, - field->charset() - ); - goto exit; - default: - assert(false); - } - assert(false); -exit: - return new_pos; -} - -// -// packs a field from a MySQL buffer into a tokudb buffer. -// Used for queries. The only difference between this function -// and pack_toku_key_field is that all variable sized columns -// use 2 bytes to encode the length, regardless of the field -// So varchar(4) will still use 2 bytes to encode the field -// -uchar* pack_key_toku_key_field( - uchar* to_tokudb, - uchar* from_mysql, - Field* field, - uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff - ) -{ - uchar* new_pos = NULL; - TOKU_TYPE toku_type = mysql_to_toku_type(field); - switch(toku_type) { - case (toku_type_int): - case (toku_type_double): - case (toku_type_float): - case (toku_type_fixbinary): - case (toku_type_fixstring): - new_pos = pack_toku_key_field(to_tokudb, from_mysql, field, key_part_length); - goto exit; - case (toku_type_varbinary): - new_pos = pack_toku_varbinary( - to_tokudb, - from_mysql, - 2, // for some idiotic reason, 2 bytes are always used here, regardless of length of field - key_part_length - ); - goto exit; - case (toku_type_varstring): - case (toku_type_blob): - new_pos = pack_toku_varstring( - to_tokudb, - from_mysql, - get_length_bytes_from_max(key_part_length), - 2, // for some idiotic reason, 2 bytes are always used here, regardless of length of field - key_part_length, - field->charset() - ); - goto exit; - default: - assert(false); - } - - assert(false); -exit: - return new_pos; -} - - -uchar* unpack_toku_key_field( - uchar* to_mysql, - uchar* from_tokudb, - Field* field, - uint32_t key_part_length - ) -{ - uchar* new_pos = NULL; - uint32_t num_bytes = 0; - uint32_t num_bytes_copied; - TOKU_TYPE toku_type = mysql_to_toku_type(field); - switch(toku_type) { - case (toku_type_int): - assert(key_part_length == field->pack_length()); - new_pos = unpack_toku_int( - to_mysql, - from_tokudb, - 
field->pack_length() - ); - goto exit; - case (toku_type_double): - assert(field->pack_length() == sizeof(double)); - assert(key_part_length == sizeof(double)); - new_pos = unpack_toku_double(to_mysql, from_tokudb); - goto exit; - case (toku_type_float): - assert(field->pack_length() == sizeof(float)); - assert(key_part_length == sizeof(float)); - new_pos = unpack_toku_float(to_mysql, from_tokudb); - goto exit; - case (toku_type_fixbinary): - num_bytes = field->pack_length(); - set_if_smaller(num_bytes, key_part_length); - new_pos = unpack_toku_binary( - to_mysql, - from_tokudb, - num_bytes - ); - goto exit; - case (toku_type_fixstring): - num_bytes = field->pack_length(); - new_pos = unpack_toku_varbinary( - to_mysql, - from_tokudb, - get_length_bytes_from_max(key_part_length), - 0 - ); - num_bytes_copied = new_pos - (from_tokudb + get_length_bytes_from_max(key_part_length)); - assert(num_bytes_copied <= num_bytes); - memset(to_mysql+num_bytes_copied, field->charset()->pad_char, num_bytes - num_bytes_copied); - goto exit; - case (toku_type_varbinary): - case (toku_type_varstring): - new_pos = unpack_toku_varbinary( - to_mysql, - from_tokudb, - get_length_bytes_from_max(key_part_length), - ((Field_varstring *)field)->length_bytes - ); - goto exit; - case (toku_type_blob): - new_pos = unpack_toku_blob( - to_mysql, - from_tokudb, - get_length_bytes_from_max(key_part_length), - ((Field_blob *)field)->row_pack_length() //only calling this because packlength is returned - ); - goto exit; - default: - assert(false); - } - assert(false); -exit: - return new_pos; -} - - -int tokudb_compare_two_keys( - const void* new_key_data, - const uint32_t new_key_size, - const void* saved_key_data, - const uint32_t saved_key_size, - const void* row_desc, - const uint32_t row_desc_size, - bool cmp_prefix - ) -{ - int ret_val = 0; - int8_t new_key_inf_val = COL_NEG_INF; - int8_t saved_key_inf_val = COL_NEG_INF; - - uchar* row_desc_ptr = (uchar *)row_desc; - uchar *new_key_ptr = (uchar 
*)new_key_data; - uchar *saved_key_ptr = (uchar *)saved_key_data; - - uint32_t new_key_bytes_left = new_key_size; - uint32_t saved_key_bytes_left = saved_key_size; - - // - // if the keys have an infinity byte, set it - // - if (row_desc_ptr[0]) { - new_key_inf_val = (int8_t)new_key_ptr[0]; - saved_key_inf_val = (int8_t)saved_key_ptr[0]; - new_key_ptr++; - saved_key_ptr++; - } - row_desc_ptr++; - - while ( (uint32_t)(new_key_ptr - (uchar *)new_key_data) < new_key_size && - (uint32_t)(saved_key_ptr - (uchar *)saved_key_data) < saved_key_size && - (uint32_t)(row_desc_ptr - (uchar *)row_desc) < row_desc_size - ) - { - uint32_t new_key_field_length; - uint32_t saved_key_field_length; - uint32_t row_desc_field_length; - // - // if there is a null byte at this point in the key - // - if (row_desc_ptr[0]) { - // - // compare null bytes. If different, return - // - if (new_key_ptr[0] != saved_key_ptr[0]) { - ret_val = ((int) *new_key_ptr - (int) *saved_key_ptr); - goto exit; - } - saved_key_ptr++; - // - // in case we just read the fact that new_key_ptr and saved_key_ptr - // have NULL as their next field - // - if (!*new_key_ptr++) { - // - // skip row_desc_ptr[0] read in if clause - // - row_desc_ptr++; - // - // skip data that describes rest of field - // - row_desc_ptr += skip_field_in_descriptor(row_desc_ptr); - continue; - } - } - row_desc_ptr++; - - ret_val = compare_toku_field( - new_key_ptr, - saved_key_ptr, - row_desc_ptr, - &new_key_field_length, - &saved_key_field_length, - &row_desc_field_length - ); - new_key_ptr += new_key_field_length; - saved_key_ptr += saved_key_field_length; - row_desc_ptr += row_desc_field_length; - if (ret_val) { - goto exit; - } - - assert((uint32_t)(new_key_ptr - (uchar *)new_key_data) <= new_key_size); - assert((uint32_t)(saved_key_ptr - (uchar *)saved_key_data) <= saved_key_size); - assert((uint32_t)(row_desc_ptr - (uchar *)row_desc) <= row_desc_size); - } - new_key_bytes_left = new_key_size - ((uint32_t)(new_key_ptr - (uchar 
*)new_key_data)); - saved_key_bytes_left = saved_key_size - ((uint32_t)(saved_key_ptr - (uchar *)saved_key_data)); - if (cmp_prefix) { - ret_val = 0; - } - // - // in this case, read both keys to completion, now read infinity byte - // - else if (new_key_bytes_left== 0 && saved_key_bytes_left== 0) { - ret_val = new_key_inf_val - saved_key_inf_val; - } - // - // at this point, one SHOULD be 0 - // - else if (new_key_bytes_left == 0 && saved_key_bytes_left > 0) { - ret_val = (new_key_inf_val == COL_POS_INF ) ? 1 : -1; - } - else if (new_key_bytes_left > 0 && saved_key_bytes_left == 0) { - ret_val = (saved_key_inf_val == COL_POS_INF ) ? -1 : 1; - } - // - // this should never happen, perhaps we should assert(false) - // - else { - assert(false); - ret_val = new_key_bytes_left - saved_key_bytes_left; - } -exit: - return ret_val; -} - -int tokudb_cmp_dbt_key(DB* file, const DBT *keya, const DBT *keyb) { - int cmp; - if (file->cmp_descriptor->dbt.size == 0) { - int num_bytes_cmp = keya->size < keyb->size ? - keya->size : keyb->size; - cmp = memcmp(keya->data,keyb->data,num_bytes_cmp); - if (cmp == 0 && (keya->size != keyb->size)) { - cmp = keya->size < keyb->size ? -1 : 1; - } - } - else { - cmp = tokudb_compare_two_keys( - keya->data, - keya->size, - keyb->data, - keyb->size, - (uchar *)file->cmp_descriptor->dbt.data + 4, - (*(uint32_t *)file->cmp_descriptor->dbt.data) - 4, - false - ); - } - return cmp; -} - -//TODO: QQQ Only do one direction for prefix. 
-int tokudb_prefix_cmp_dbt_key(DB *file, const DBT *keya, const DBT *keyb) { - int cmp = tokudb_compare_two_keys( - keya->data, - keya->size, - keyb->data, - keyb->size, - (uchar *)file->cmp_descriptor->dbt.data + 4, - *(uint32_t *)file->cmp_descriptor->dbt.data - 4, - true - ); - return cmp; -} - -static int tokudb_compare_two_key_parts( - const void* new_key_data, - const uint32_t new_key_size, - const void* saved_key_data, - const uint32_t saved_key_size, - const void* row_desc, - const uint32_t row_desc_size, - uint max_parts - ) -{ - int ret_val = 0; - - uchar* row_desc_ptr = (uchar *)row_desc; - uchar *new_key_ptr = (uchar *)new_key_data; - uchar *saved_key_ptr = (uchar *)saved_key_data; - - // - // if the keys have an infinity byte, set it - // - if (row_desc_ptr[0]) { - // new_key_inf_val = (int8_t)new_key_ptr[0]; - // saved_key_inf_val = (int8_t)saved_key_ptr[0]; - new_key_ptr++; - saved_key_ptr++; - } - row_desc_ptr++; - - for (uint i = 0; i < max_parts; i++) { - if (!((uint32_t)(new_key_ptr - (uchar *)new_key_data) < new_key_size && - (uint32_t)(saved_key_ptr - (uchar *)saved_key_data) < saved_key_size && - (uint32_t)(row_desc_ptr - (uchar *)row_desc) < row_desc_size)) - break; - uint32_t new_key_field_length; - uint32_t saved_key_field_length; - uint32_t row_desc_field_length; - // - // if there is a null byte at this point in the key - // - if (row_desc_ptr[0]) { - // - // compare null bytes. 
If different, return - // - if (new_key_ptr[0] != saved_key_ptr[0]) { - ret_val = ((int) *new_key_ptr - (int) *saved_key_ptr); - goto exit; - } - saved_key_ptr++; - // - // in case we just read the fact that new_key_ptr and saved_key_ptr - // have NULL as their next field - // - if (!*new_key_ptr++) { - // - // skip row_desc_ptr[0] read in if clause - // - row_desc_ptr++; - // - // skip data that describes rest of field - // - row_desc_ptr += skip_field_in_descriptor(row_desc_ptr); - continue; - } - } - row_desc_ptr++; - - ret_val = compare_toku_field( - new_key_ptr, - saved_key_ptr, - row_desc_ptr, - &new_key_field_length, - &saved_key_field_length, - &row_desc_field_length - ); - new_key_ptr += new_key_field_length; - saved_key_ptr += saved_key_field_length; - row_desc_ptr += row_desc_field_length; - if (ret_val) { - goto exit; - } - - assert((uint32_t)(new_key_ptr - (uchar *)new_key_data) <= new_key_size); - assert((uint32_t)(saved_key_ptr - (uchar *)saved_key_data) <= saved_key_size); - assert((uint32_t)(row_desc_ptr - (uchar *)row_desc) <= row_desc_size); - } - - ret_val = 0; -exit: - return ret_val; -} - -static int tokudb_cmp_dbt_key_parts(DB *file, const DBT *keya, const DBT *keyb, uint max_parts) { - assert(file->cmp_descriptor->dbt.size); - return tokudb_compare_two_key_parts( - keya->data, - keya->size, - keyb->data, - keyb->size, - (uchar *)file->cmp_descriptor->dbt.data + 4, - (*(uint32_t *)file->cmp_descriptor->dbt.data) - 4, - max_parts); -} - -uint32_t create_toku_main_key_pack_descriptor ( - uchar* buf - ) -{ - // - // The first four bytes always contain the offset of where the first key - // ends. 
- // - uchar* pos = buf + 4; - uint32_t offset = 0; - // - // one byte states if this is the main dictionary - // - pos[0] = 1; - pos++; - goto exit; - - -exit: - offset = pos - buf; - buf[0] = (uchar)(offset & 255); - buf[1] = (uchar)((offset >> 8) & 255); - buf[2] = (uchar)((offset >> 16) & 255); - buf[3] = (uchar)((offset >> 24) & 255); - - return pos - buf; -} - -#define COL_FIX_FIELD 0x11 -#define COL_VAR_FIELD 0x22 -#define COL_BLOB_FIELD 0x33 - -#define COL_HAS_NO_CHARSET 0x44 -#define COL_HAS_CHARSET 0x55 - -#define COL_FIX_PK_OFFSET 0x66 -#define COL_VAR_PK_OFFSET 0x77 - -#define CK_FIX_RANGE 0x88 -#define CK_VAR_RANGE 0x99 - -#define COPY_OFFSET_TO_BUF memcpy ( \ - pos, \ - &kc_info->cp_info[pk_index][field_index].col_pack_val, \ - sizeof(uint32_t) \ - ); \ - pos += sizeof(uint32_t); - - -uint32_t pack_desc_pk_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { - uchar* pos = buf; - uint16 field_index = key_part->field->field_index; - Field* field = table_share->field[field_index]; - TOKU_TYPE toku_type = mysql_to_toku_type(field); - uint32_t key_part_length = key_part->length; - uint32_t field_length; - uchar len_bytes = 0; - - switch(toku_type) { - case (toku_type_int): - case (toku_type_double): - case (toku_type_float): - pos[0] = COL_FIX_FIELD; - pos++; - assert(kc_info->field_lengths[field_index] < 256); - pos[0] = kc_info->field_lengths[field_index]; - pos++; - break; - case (toku_type_fixbinary): - pos[0] = COL_FIX_FIELD; - pos++; - field_length = field->pack_length(); - set_if_smaller(key_part_length, field_length); - assert(key_part_length < 256); - pos[0] = (uchar)key_part_length; - pos++; - break; - case (toku_type_fixstring): - case (toku_type_varbinary): - case (toku_type_varstring): - case (toku_type_blob): - pos[0] = COL_VAR_FIELD; - pos++; - len_bytes = (key_part_length > 255) ? 
2 : 1; - pos[0] = len_bytes; - pos++; - break; - default: - assert(false); - } - - return pos - buf; -} - -uint32_t pack_desc_pk_offset_info( - uchar* buf, - KEY_AND_COL_INFO* kc_info, - TABLE_SHARE* table_share, - KEY_PART_INFO* key_part, - KEY* prim_key, - uchar* pk_info - ) -{ - uchar* pos = buf; - uint16 field_index = key_part->field->field_index; - bool found_col_in_pk = false; - uint32_t index_in_pk; - - bool is_constant_offset = true; - uint32_t offset = 0; - for (uint i = 0; i < get_key_parts(prim_key); i++) { - KEY_PART_INFO curr = prim_key->key_part[i]; - uint16 curr_field_index = curr.field->field_index; - - if (pk_info[2*i] == COL_VAR_FIELD) { - is_constant_offset = false; - } - - if (curr_field_index == field_index) { - found_col_in_pk = true; - index_in_pk = i; - break; - } - offset += pk_info[2*i + 1]; - } - assert(found_col_in_pk); - if (is_constant_offset) { - pos[0] = COL_FIX_PK_OFFSET; - pos++; - - memcpy (pos, &offset, sizeof(offset)); - pos += sizeof(offset); - } - else { - pos[0] = COL_VAR_PK_OFFSET; - pos++; - - memcpy(pos, &index_in_pk, sizeof(index_in_pk)); - pos += sizeof(index_in_pk); - } - return pos - buf; -} - -uint32_t pack_desc_offset_info(uchar* buf, KEY_AND_COL_INFO* kc_info, uint pk_index, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { - uchar* pos = buf; - uint16 field_index = key_part->field->field_index; - Field* field = table_share->field[field_index]; - TOKU_TYPE toku_type = mysql_to_toku_type(field); - bool found_index = false; - - switch(toku_type) { - case (toku_type_int): - case (toku_type_double): - case (toku_type_float): - case (toku_type_fixbinary): - case (toku_type_fixstring): - pos[0] = COL_FIX_FIELD; - pos++; - - // copy the offset - COPY_OFFSET_TO_BUF; - break; - case (toku_type_varbinary): - case (toku_type_varstring): - pos[0] = COL_VAR_FIELD; - pos++; - - // copy the offset - COPY_OFFSET_TO_BUF; - break; - case (toku_type_blob): - pos[0] = COL_BLOB_FIELD; - pos++; - for (uint32_t i = 0; i < 
kc_info->num_blobs; i++) { - uint32_t blob_index = kc_info->blob_fields[i]; - if (blob_index == field_index) { - uint32_t val = i; - memcpy(pos, &val, sizeof(uint32_t)); - pos += sizeof(uint32_t); - found_index = true; - break; - } - } - assert(found_index); - break; - default: - assert(false); - } - - return pos - buf; -} - -uint32_t pack_desc_key_length_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { - uchar* pos = buf; - uint16 field_index = key_part->field->field_index; - Field* field = table_share->field[field_index]; - TOKU_TYPE toku_type = mysql_to_toku_type(field); - uint32_t key_part_length = key_part->length; - uint32_t field_length; - - switch(toku_type) { - case (toku_type_int): - case (toku_type_double): - case (toku_type_float): - // copy the key_part length - field_length = kc_info->field_lengths[field_index]; - memcpy(pos, &field_length, sizeof(field_length)); - pos += sizeof(key_part_length); - break; - case (toku_type_fixbinary): - case (toku_type_fixstring): - field_length = field->pack_length(); - set_if_smaller(key_part_length, field_length); - case (toku_type_varbinary): - case (toku_type_varstring): - case (toku_type_blob): - // copy the key_part length - memcpy(pos, &key_part_length, sizeof(key_part_length)); - pos += sizeof(key_part_length); - break; - default: - assert(false); - } - - return pos - buf; -} - -uint32_t pack_desc_char_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { - uchar* pos = buf; - uint16 field_index = key_part->field->field_index; - Field* field = table_share->field[field_index]; - TOKU_TYPE toku_type = mysql_to_toku_type(field); - uint32_t charset_num = 0; - - switch(toku_type) { - case (toku_type_int): - case (toku_type_double): - case (toku_type_float): - case (toku_type_fixbinary): - case (toku_type_varbinary): - pos[0] = COL_HAS_NO_CHARSET; - pos++; - break; - case (toku_type_fixstring): - case (toku_type_varstring): - 
case (toku_type_blob): - pos[0] = COL_HAS_CHARSET; - pos++; - - // copy the charset - charset_num = field->charset()->number; - pos[0] = (uchar)(charset_num & 255); - pos[1] = (uchar)((charset_num >> 8) & 255); - pos[2] = (uchar)((charset_num >> 16) & 255); - pos[3] = (uchar)((charset_num >> 24) & 255); - pos += 4; - break; - default: - assert(false); - } - - return pos - buf; -} - -uint32_t pack_some_row_info ( - uchar* buf, - uint pk_index, - TABLE_SHARE* table_share, - KEY_AND_COL_INFO* kc_info - ) -{ - uchar* pos = buf; - uint32_t num_null_bytes = 0; - // - // four bytes stating number of null bytes - // - num_null_bytes = table_share->null_bytes; - memcpy(pos, &num_null_bytes, sizeof(num_null_bytes)); - pos += sizeof(num_null_bytes); - // - // eight bytes stating mcp_info - // - memcpy(pos, &kc_info->mcp_info[pk_index], sizeof(MULTI_COL_PACK_INFO)); - pos += sizeof(MULTI_COL_PACK_INFO); - // - // one byte for the number of offset bytes - // - pos[0] = (uchar)kc_info->num_offset_bytes; - pos++; - - return pos - buf; -} - -uint32_t get_max_clustering_val_pack_desc_size( - TABLE_SHARE* table_share - ) -{ - uint32_t ret_val = 0; - // - // the fixed stuff: - // first the things in pack_some_row_info - // second another mcp_info - // third a byte that states if blobs exist - ret_val += sizeof(uint32_t) + sizeof(MULTI_COL_PACK_INFO) + 1; - ret_val += sizeof(MULTI_COL_PACK_INFO); - ret_val++; - // - // now the variable stuff - // an upper bound is, for each field, byte stating if it is fixed or var, followed - // by 8 bytes for endpoints - // - ret_val += (table_share->fields)*(1 + 2*sizeof(uint32_t)); - // - // four bytes storing the length of this portion - // - ret_val += 4; - - return ret_val; -} - -uint32_t create_toku_clustering_val_pack_descriptor ( - uchar* buf, - uint pk_index, - TABLE_SHARE* table_share, - KEY_AND_COL_INFO* kc_info, - uint32_t keynr, - bool is_clustering - ) -{ - uchar* pos = buf + 4; - uint32_t offset = 0; - bool start_range_set = false; - 
uint32_t last_col = 0; - // - // do not need to write anything if the key is not clustering - // - if (!is_clustering) { - goto exit; - } - - pos += pack_some_row_info( - pos, - pk_index, - table_share, - kc_info - ); - - // - // eight bytes stating mcp_info of clustering key - // - memcpy(pos, &kc_info->mcp_info[keynr], sizeof(MULTI_COL_PACK_INFO)); - pos += sizeof(MULTI_COL_PACK_INFO); - - // - // store bit that states if blobs exist - // - pos[0] = (kc_info->num_blobs) ? 1 : 0; - pos++; - - // - // descriptor assumes that all fields filtered from pk are - // also filtered from clustering key val. Doing check here to - // make sure something unexpected does not happen - // - for (uint i = 0; i < table_share->fields; i++) { - bool col_filtered = bitmap_is_set(&kc_info->key_filters[keynr],i); - bool col_filtered_in_pk = bitmap_is_set(&kc_info->key_filters[pk_index],i); - if (col_filtered_in_pk) { - assert(col_filtered); - } - } - - // - // first handle the fixed fields - // - start_range_set = false; - last_col = 0; - for (uint i = 0; i < table_share->fields; i++) { - bool col_filtered = bitmap_is_set(&kc_info->key_filters[keynr],i); - if (kc_info->field_lengths[i] == 0) { - // - // not a fixed field, continue - // - continue; - } - if (col_filtered && start_range_set) { - // - // need to set the end range - // - start_range_set = false; - uint32_t end_offset = kc_info->cp_info[pk_index][last_col].col_pack_val + kc_info->field_lengths[last_col]; - memcpy(pos, &end_offset, sizeof(end_offset)); - pos += sizeof(end_offset); - } - else if (!col_filtered) { - if (!start_range_set) { - pos[0] = CK_FIX_RANGE; - pos++; - start_range_set = true; - uint32_t start_offset = kc_info->cp_info[pk_index][i].col_pack_val; - memcpy(pos, &start_offset , sizeof(start_offset)); - pos += sizeof(start_offset); - } - last_col = i; - } - else { - continue; - } - } - if (start_range_set) { - // - // need to set the end range - // - start_range_set = false; - uint32_t end_offset = 
kc_info->cp_info[pk_index][last_col].col_pack_val+ kc_info->field_lengths[last_col]; - memcpy(pos, &end_offset, sizeof(end_offset)); - pos += sizeof(end_offset); - } - - // - // now handle the var fields - // - start_range_set = false; - last_col = 0; - for (uint i = 0; i < table_share->fields; i++) { - bool col_filtered = bitmap_is_set(&kc_info->key_filters[keynr],i); - if (kc_info->length_bytes[i] == 0) { - // - // not a var field, continue - // - continue; - } - if (col_filtered && start_range_set) { - // - // need to set the end range - // - start_range_set = false; - uint32_t end_offset = kc_info->cp_info[pk_index][last_col].col_pack_val; - memcpy(pos, &end_offset, sizeof(end_offset)); - pos += sizeof(end_offset); - } - else if (!col_filtered) { - if (!start_range_set) { - pos[0] = CK_VAR_RANGE; - pos++; - - start_range_set = true; - uint32_t start_offset = kc_info->cp_info[pk_index][i].col_pack_val; - memcpy(pos, &start_offset , sizeof(start_offset)); - pos += sizeof(start_offset); - } - last_col = i; - } - else { - continue; - } - } - if (start_range_set) { - start_range_set = false; - uint32_t end_offset = kc_info->cp_info[pk_index][last_col].col_pack_val; - memcpy(pos, &end_offset, sizeof(end_offset)); - pos += sizeof(end_offset); - } - -exit: - offset = pos - buf; - buf[0] = (uchar)(offset & 255); - buf[1] = (uchar)((offset >> 8) & 255); - buf[2] = (uchar)((offset >> 16) & 255); - buf[3] = (uchar)((offset >> 24) & 255); - - return pos - buf; -} - -uint32_t pack_clustering_val_from_desc( - uchar* buf, - void* row_desc, - uint32_t row_desc_size, - const DBT* pk_val - ) -{ - uchar* null_bytes_src_ptr = NULL; - uchar* fixed_src_ptr = NULL; - uchar* var_src_offset_ptr = NULL; - uchar* var_src_data_ptr = NULL; - uchar* fixed_dest_ptr = NULL; - uchar* var_dest_offset_ptr = NULL; - uchar* var_dest_data_ptr = NULL; - uchar* orig_var_dest_data_ptr = NULL; - uchar* desc_pos = (uchar *)row_desc; - uint32_t num_null_bytes = 0; - uint32_t num_offset_bytes; - 
MULTI_COL_PACK_INFO src_mcp_info, dest_mcp_info; - uchar has_blobs; - - memcpy(&num_null_bytes, desc_pos, sizeof(num_null_bytes)); - desc_pos += sizeof(num_null_bytes); - - memcpy(&src_mcp_info, desc_pos, sizeof(src_mcp_info)); - desc_pos += sizeof(src_mcp_info); - - num_offset_bytes = desc_pos[0]; - desc_pos++; - - memcpy(&dest_mcp_info, desc_pos, sizeof(dest_mcp_info)); - desc_pos += sizeof(dest_mcp_info); - - has_blobs = desc_pos[0]; - desc_pos++; - - // - //set the variables - // - null_bytes_src_ptr = (uchar *)pk_val->data; - fixed_src_ptr = null_bytes_src_ptr + num_null_bytes; - var_src_offset_ptr = fixed_src_ptr + src_mcp_info.fixed_field_size; - var_src_data_ptr = var_src_offset_ptr + src_mcp_info.len_of_offsets; - - fixed_dest_ptr = buf + num_null_bytes; - var_dest_offset_ptr = fixed_dest_ptr + dest_mcp_info.fixed_field_size; - var_dest_data_ptr = var_dest_offset_ptr + dest_mcp_info.len_of_offsets; - orig_var_dest_data_ptr = var_dest_data_ptr; - - // - // copy the null bytes - // - memcpy(buf, null_bytes_src_ptr, num_null_bytes); - while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) { - uint32_t start, end, length; - uchar curr = desc_pos[0]; - desc_pos++; - - memcpy(&start, desc_pos, sizeof(start)); - desc_pos += sizeof(start); - - memcpy(&end, desc_pos, sizeof(end)); - desc_pos += sizeof(end); - - assert (start <= end); - - if (curr == CK_FIX_RANGE) { - length = end - start; - - memcpy(fixed_dest_ptr, fixed_src_ptr + start, length); - fixed_dest_ptr += length; - } - else if (curr == CK_VAR_RANGE) { - uint32_t start_data_size; - uint32_t start_data_offset; - uint32_t end_data_size; - uint32_t end_data_offset; - uint32_t offset_diffs; - - get_var_field_info( - &start_data_size, - &start_data_offset, - start, - var_src_offset_ptr, - num_offset_bytes - ); - get_var_field_info( - &end_data_size, - &end_data_offset, - end, - var_src_offset_ptr, - num_offset_bytes - ); - length = end_data_offset + end_data_size - start_data_offset; - // - // copy 
the data - // - memcpy( - var_dest_data_ptr, - var_src_data_ptr + start_data_offset, - length - ); - var_dest_data_ptr += length; - - // - // put in offset info - // - offset_diffs = (end_data_offset + end_data_size) - (uint32_t)(var_dest_data_ptr - orig_var_dest_data_ptr); - for (uint32_t i = start; i <= end; i++) { - if ( num_offset_bytes == 1 ) { - assert(offset_diffs < 256); - var_dest_offset_ptr[0] = var_src_offset_ptr[i] - (uchar)offset_diffs; - var_dest_offset_ptr++; - } - else if ( num_offset_bytes == 2 ) { - uint32_t tmp = uint2korr(var_src_offset_ptr + 2*i); - uint32_t new_offset = tmp - offset_diffs; - assert(new_offset < 1<<16); - int2store(var_dest_offset_ptr,new_offset); - var_dest_offset_ptr += 2; - } - else { - assert(false); - } - } - } - else { - assert(false); - } - } - // - // copy blobs - // at this point, var_dest_data_ptr is pointing to the end, where blobs should be located - // so, we put the blobs at var_dest_data_ptr - // - if (has_blobs) { - uint32_t num_blob_bytes; - uint32_t start_offset; - uchar* src_blob_ptr = NULL; - get_blob_field_info( - &start_offset, - src_mcp_info.len_of_offsets, - var_src_data_ptr, - num_offset_bytes - ); - src_blob_ptr = var_src_data_ptr + start_offset; - num_blob_bytes = pk_val->size - (start_offset + (var_src_data_ptr - null_bytes_src_ptr)); - memcpy(var_dest_data_ptr, src_blob_ptr, num_blob_bytes); - var_dest_data_ptr += num_blob_bytes; - } - return var_dest_data_ptr - buf; -} - - -uint32_t get_max_secondary_key_pack_desc_size( - KEY_AND_COL_INFO* kc_info - ) -{ - uint32_t ret_val = 0; - // - // the fixed stuff: - // byte that states if main dictionary - // byte that states if hpk - // the things in pack_some_row_info - ret_val++; - ret_val++; - ret_val += sizeof(uint32_t) + sizeof(MULTI_COL_PACK_INFO) + 1; - // - // now variable sized stuff - // - - // first the blobs - ret_val += sizeof(kc_info->num_blobs); - ret_val+= kc_info->num_blobs; - - // then the pk - // one byte for num key parts - // two bytes 
for each key part - ret_val++; - ret_val += MAX_REF_PARTS*2; - - // then the key - // null bit, then null byte, - // then 1 byte stating what it is, then 4 for offset, 4 for key length, - // 1 for if charset exists, and 4 for charset - ret_val += MAX_REF_PARTS*(1 + sizeof(uint32_t) + 1 + 3*sizeof(uint32_t) + 1); - // - // four bytes storing the length of this portion - // - ret_val += 4; - return ret_val; -} - -uint32_t create_toku_secondary_key_pack_descriptor ( - uchar* buf, - bool has_hpk, - uint pk_index, - TABLE_SHARE* table_share, - TABLE* table, - KEY_AND_COL_INFO* kc_info, - KEY* key_info, - KEY* prim_key - ) -{ - // - // The first four bytes always contain the offset of where the first key - // ends. - // - uchar* pk_info = NULL; - uchar* pos = buf + 4; - uint32_t offset = 0; - - // - // first byte states that it is NOT main dictionary - // - pos[0] = 0; - pos++; - - // - // one byte states if main dictionary has an hpk or not - // - if (has_hpk) { - pos[0] = 1; - } - else { - pos[0] = 0; - } - pos++; - - pos += pack_some_row_info( - pos, - pk_index, - table_share, - kc_info - ); - - // - // store blob information - // - memcpy(pos, &kc_info->num_blobs, sizeof(kc_info->num_blobs)); - pos += sizeof(uint32_t); - for (uint32_t i = 0; i < kc_info->num_blobs; i++) { - // - // store length bytes for each blob - // - Field* field = table_share->field[kc_info->blob_fields[i]]; - pos[0] = (uchar)field->row_pack_length(); - pos++; - } - - // - // store the pk information - // - if (has_hpk) { - pos[0] = 0; - pos++; - } - else { - // - // store number of parts - // - assert(get_key_parts(prim_key) < 128); - pos[0] = 2 * get_key_parts(prim_key); - pos++; - // - // for each part, store if it is a fixed field or var field - // if fixed, store number of bytes, if var, store - // number of length bytes - // total should be two bytes per key part stored - // - pk_info = pos; - uchar* tmp = pos; - for (uint i = 0; i < get_key_parts(prim_key); i++) { - tmp += 
pack_desc_pk_info( - tmp, - kc_info, - table_share, - &prim_key->key_part[i] - ); - } - // - // asserting that we moved forward as much as we think we have - // - assert(tmp - pos == (2 * get_key_parts(prim_key))); - pos = tmp; - } - - for (uint i = 0; i < get_key_parts(key_info); i++) { - KEY_PART_INFO curr_kpi = key_info->key_part[i]; - uint16 field_index = curr_kpi.field->field_index; - Field* field = table_share->field[field_index]; - bool is_col_in_pk = false; - - if (bitmap_is_set(&kc_info->key_filters[pk_index],field_index)) { - assert(!has_hpk && prim_key != NULL); - is_col_in_pk = true; - } - else { - is_col_in_pk = false; - } - - pos[0] = field->null_bit; - pos++; - - if (is_col_in_pk) { - // - // assert that columns in pk do not have a null bit - // because in MySQL, pk columns cannot be null - // - assert(!field->null_bit); - } - - if (field->null_bit) { - uint32_t null_offset = get_null_offset(table,table->field[field_index]); - memcpy(pos, &null_offset, sizeof(uint32_t)); - pos += sizeof(uint32_t); - } - if (is_col_in_pk) { - pos += pack_desc_pk_offset_info( - pos, - kc_info, - table_share, - &curr_kpi, - prim_key, - pk_info - ); - } - else { - pos += pack_desc_offset_info( - pos, - kc_info, - pk_index, - table_share, - &curr_kpi - ); - } - pos += pack_desc_key_length_info( - pos, - kc_info, - table_share, - &curr_kpi - ); - pos += pack_desc_char_info( - pos, - kc_info, - table_share, - &curr_kpi - ); - } - - offset = pos - buf; - buf[0] = (uchar)(offset & 255); - buf[1] = (uchar)((offset >> 8) & 255); - buf[2] = (uchar)((offset >> 16) & 255); - buf[3] = (uchar)((offset >> 24) & 255); - - return pos - buf; -} - -uint32_t skip_key_in_desc( - uchar* row_desc - ) -{ - uchar* pos = row_desc; - uchar col_bin_or_char; - // - // skip the byte that states if it is a fix field or var field, we do not care - // - pos++; - - // - // skip the offset information - // - pos += sizeof(uint32_t); - - // - // skip the key_part_length info - // - pos += 
sizeof(uint32_t); - col_bin_or_char = pos[0]; - pos++; - if (col_bin_or_char == COL_HAS_NO_CHARSET) { - goto exit; - } - // - // skip the charset info - // - pos += 4; - - -exit: - return (uint32_t)(pos-row_desc); -} - - -uint32_t max_key_size_from_desc( - void* row_desc, - uint32_t row_desc_size - ) -{ - uchar* desc_pos = (uchar *)row_desc; - uint32_t num_blobs; - uint32_t num_pk_columns; - // - // start at 1 for the infinity byte - // - uint32_t max_size = 1; - - // skip byte that states if main dictionary - bool is_main_dictionary = desc_pos[0]; - desc_pos++; - assert(!is_main_dictionary); - - // skip hpk byte - desc_pos++; - - // skip num_null_bytes - desc_pos += sizeof(uint32_t); - - // skip mcp_info - desc_pos += sizeof(MULTI_COL_PACK_INFO); - - // skip offset_bytes - desc_pos++; - - // skip over blobs - memcpy(&num_blobs, desc_pos, sizeof(num_blobs)); - desc_pos += sizeof(num_blobs); - desc_pos += num_blobs; - - // skip over pk info - num_pk_columns = desc_pos[0]/2; - desc_pos++; - desc_pos += 2*num_pk_columns; - - while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) { - uchar has_charset; - uint32_t key_length = 0; - - uchar null_bit = desc_pos[0]; - desc_pos++; - - if (null_bit) { - // - // column is NULLable, skip null_offset, and add a null byte - // - max_size++; - desc_pos += sizeof(uint32_t); - } - // - // skip over byte that states if fix or var - // - desc_pos++; - - // skip over offset - desc_pos += sizeof(uint32_t); - - // - // get the key length and add it to return value - // - memcpy(&key_length, desc_pos, sizeof(key_length)); - desc_pos += sizeof(key_length); - max_size += key_length; - max_size += 2; // 2 bytes for a potential length bytes, we are upperbounding, does not need to be super tight - - has_charset = desc_pos[0]; - desc_pos++; - - uint32_t charset_num; - if (has_charset == COL_HAS_CHARSET) { - // skip over charsent num - desc_pos += sizeof(charset_num); - } - else { - assert(has_charset == COL_HAS_NO_CHARSET); - } - } 
- return max_size; -} - -uint32_t pack_key_from_desc( - uchar* buf, - void* row_desc, - uint32_t row_desc_size, - const DBT* pk_key, - const DBT* pk_val - ) -{ - MULTI_COL_PACK_INFO mcp_info; - uint32_t num_null_bytes; - uint32_t num_blobs; - uint32_t num_pk_columns; - uchar* blob_lengths = NULL; - uchar* pk_info = NULL; - uchar* pk_data_ptr = NULL; - uchar* null_bytes_ptr = NULL; - uchar* fixed_field_ptr = NULL; - uchar* var_field_offset_ptr = NULL; - const uchar* var_field_data_ptr = NULL; - uint32_t num_offset_bytes; - uchar* packed_key_pos = buf; - uchar* desc_pos = (uchar *)row_desc; - - bool is_main_dictionary = desc_pos[0]; - desc_pos++; - assert(!is_main_dictionary); - - // - // get the constant info out of descriptor - // - bool hpk = desc_pos[0]; - desc_pos++; - - memcpy(&num_null_bytes, desc_pos, sizeof(num_null_bytes)); - desc_pos += sizeof(num_null_bytes); - - memcpy(&mcp_info, desc_pos, sizeof(mcp_info)); - desc_pos += sizeof(mcp_info); - - num_offset_bytes = desc_pos[0]; - desc_pos++; - - memcpy(&num_blobs, desc_pos, sizeof(num_blobs)); - desc_pos += sizeof(num_blobs); - - blob_lengths = desc_pos; - desc_pos += num_blobs; - - num_pk_columns = desc_pos[0]/2; - desc_pos++; - pk_info = desc_pos; - desc_pos += 2*num_pk_columns; - - // - // now start packing the key - // - - // - // pack the infinity byte - // - packed_key_pos[0] = COL_ZERO; - packed_key_pos++; - // - // now start packing each column of the key, as described in descriptor - // - if (!hpk) { - // +1 for the infinity byte - pk_data_ptr = (uchar *)pk_key->data + 1; - } - null_bytes_ptr = (uchar *)pk_val->data; - fixed_field_ptr = null_bytes_ptr + num_null_bytes; - var_field_offset_ptr = fixed_field_ptr + mcp_info.fixed_field_size; - var_field_data_ptr = var_field_offset_ptr + mcp_info.len_of_offsets; - while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) { - uchar col_fix_val; - uchar has_charset; - uint32_t col_pack_val = 0; - uint32_t key_length = 0; - - uchar null_bit = 
desc_pos[0]; - desc_pos++; - - if (null_bit) { - // - // column is NULLable, need to check the null bytes to see if it is NULL - // - uint32_t null_offset = 0; - bool is_field_null; - memcpy(&null_offset, desc_pos, sizeof(null_offset)); - desc_pos += sizeof(null_offset); - - is_field_null = (null_bytes_ptr[null_offset] & null_bit) ? true: false; - if (is_field_null) { - packed_key_pos[0] = NULL_COL_VAL; - packed_key_pos++; - desc_pos += skip_key_in_desc(desc_pos); - continue; - } - else { - packed_key_pos[0] = NONNULL_COL_VAL; - packed_key_pos++; - } - } - // - // now pack the column (unless it was NULL, and we continued) - // - col_fix_val = desc_pos[0]; - desc_pos++; - - memcpy(&col_pack_val, desc_pos, sizeof(col_pack_val)); - desc_pos += sizeof(col_pack_val); - - memcpy(&key_length, desc_pos, sizeof(key_length)); - desc_pos += sizeof(key_length); - - has_charset = desc_pos[0]; - desc_pos++; - - uint32_t charset_num = 0; - if (has_charset == COL_HAS_CHARSET) { - memcpy(&charset_num, desc_pos, sizeof(charset_num)); - desc_pos += sizeof(charset_num); - } - else { - assert(has_charset == COL_HAS_NO_CHARSET); - } - // - // case where column is in pk val - // - if (col_fix_val == COL_FIX_FIELD || col_fix_val == COL_VAR_FIELD || col_fix_val == COL_BLOB_FIELD) { - if (col_fix_val == COL_FIX_FIELD && has_charset == COL_HAS_NO_CHARSET) { - memcpy(packed_key_pos, &fixed_field_ptr[col_pack_val], key_length); - packed_key_pos += key_length; - } - else if (col_fix_val == COL_VAR_FIELD && has_charset == COL_HAS_NO_CHARSET) { - uint32_t data_start_offset = 0; - - uint32_t data_size = 0; - get_var_field_info( - &data_size, - &data_start_offset, - col_pack_val, - var_field_offset_ptr, - num_offset_bytes - ); - - // - // length of this field in this row is data_size - // data is located beginning at var_field_data_ptr + data_start_offset - // - packed_key_pos = pack_toku_varbinary_from_desc( - packed_key_pos, - var_field_data_ptr + data_start_offset, - key_length, //number of 
bytes to use to encode the length in to_tokudb - data_size //length of field - ); - } - else { - const uchar* data_start = NULL; - uint32_t data_start_offset = 0; - uint32_t data_size = 0; - - if (col_fix_val == COL_FIX_FIELD) { - data_start_offset = col_pack_val; - data_size = key_length; - data_start = fixed_field_ptr + data_start_offset; - } - else if (col_fix_val == COL_VAR_FIELD){ - get_var_field_info( - &data_size, - &data_start_offset, - col_pack_val, - var_field_offset_ptr, - num_offset_bytes - ); - data_start = var_field_data_ptr + data_start_offset; - } - else if (col_fix_val == COL_BLOB_FIELD) { - uint32_t blob_index = col_pack_val; - uint32_t blob_offset; - const uchar* blob_ptr = NULL; - uint32_t field_len; - uint32_t field_len_bytes = blob_lengths[blob_index]; - get_blob_field_info( - &blob_offset, - mcp_info.len_of_offsets, - var_field_data_ptr, - num_offset_bytes - ); - blob_ptr = var_field_data_ptr + blob_offset; - assert(num_blobs > 0); - // - // skip over other blobs to get to the one we want to make a key out of - // - for (uint32_t i = 0; i < blob_index; i++) { - blob_ptr = unpack_toku_field_blob( - NULL, - blob_ptr, - blob_lengths[i], - true - ); - } - // - // at this point, blob_ptr is pointing to the blob we want to make a key from - // - field_len = get_blob_field_len(blob_ptr, field_len_bytes); - // - // now we set the variables to make the key - // - data_start = blob_ptr + field_len_bytes; - data_size = field_len; - - - } - else { - assert(false); - } - - packed_key_pos = pack_toku_varstring_from_desc( - packed_key_pos, - data_start, - key_length, - data_size, - charset_num - ); - } - } - // - // case where column is in pk key - // - else { - if (col_fix_val == COL_FIX_PK_OFFSET) { - memcpy(packed_key_pos, &pk_data_ptr[col_pack_val], key_length); - packed_key_pos += key_length; - } - else if (col_fix_val == COL_VAR_PK_OFFSET) { - uchar* tmp_pk_data_ptr = pk_data_ptr; - uint32_t index_in_pk = col_pack_val; - // - // skip along in pk to 
the right column - // - for (uint32_t i = 0; i < index_in_pk; i++) { - if (pk_info[2*i] == COL_FIX_FIELD) { - tmp_pk_data_ptr += pk_info[2*i + 1]; - } - else if (pk_info[2*i] == COL_VAR_FIELD) { - uint32_t len_bytes = pk_info[2*i + 1]; - uint32_t len; - if (len_bytes == 1) { - len = tmp_pk_data_ptr[0]; - tmp_pk_data_ptr++; - } - else if (len_bytes == 2) { - len = uint2korr(tmp_pk_data_ptr); - tmp_pk_data_ptr += 2; - } - else { - assert(false); - } - tmp_pk_data_ptr += len; - } - else { - assert(false); - } - } - // - // at this point, tmp_pk_data_ptr is pointing at the column - // - uint32_t is_fix_field = pk_info[2*index_in_pk]; - if (is_fix_field == COL_FIX_FIELD) { - memcpy(packed_key_pos, tmp_pk_data_ptr, key_length); - packed_key_pos += key_length; - } - else if (is_fix_field == COL_VAR_FIELD) { - const uchar* data_start = NULL; - uint32_t data_size = 0; - uint32_t len_bytes = pk_info[2*index_in_pk + 1]; - if (len_bytes == 1) { - data_size = tmp_pk_data_ptr[0]; - tmp_pk_data_ptr++; - } - else if (len_bytes == 2) { - data_size = uint2korr(tmp_pk_data_ptr); - tmp_pk_data_ptr += 2; - } - else { - assert(false); - } - data_start = tmp_pk_data_ptr; - - if (has_charset == COL_HAS_CHARSET) { - packed_key_pos = pack_toku_varstring_from_desc( - packed_key_pos, - data_start, - key_length, - data_size, - charset_num - ); - } - else if (has_charset == COL_HAS_NO_CHARSET) { - packed_key_pos = pack_toku_varbinary_from_desc( - packed_key_pos, - data_start, - key_length, - data_size //length of field - ); - } - else { - assert(false); - } - } - else { - assert(false); - } - } - else { - assert(false); - } - } - - } - assert( (uint32_t)(desc_pos - (uchar *)row_desc) == row_desc_size); - - // - // now append the primary key to the end of the key - // - if (hpk) { - memcpy(packed_key_pos, pk_key->data, pk_key->size); - packed_key_pos += pk_key->size; - } - else { - memcpy(packed_key_pos, (uchar *)pk_key->data + 1, pk_key->size - 1); - packed_key_pos += (pk_key->size - 1); - } - 
- return (uint32_t)(packed_key_pos - buf); // -} - -bool fields_have_same_name( - Field* a, - Field* b - ) -{ - return strcmp(a->field_name, b->field_name) == 0; -} - -bool fields_are_same_type( - Field* a, - Field* b - ) -{ - bool retval = true; - enum_field_types a_mysql_type = a->real_type(); - enum_field_types b_mysql_type = b->real_type(); - TOKU_TYPE a_toku_type = mysql_to_toku_type(a); - TOKU_TYPE b_toku_type = mysql_to_toku_type(b); - // make sure have same names - // make sure have same types - if (a_mysql_type != b_mysql_type) { - retval = false; - goto cleanup; - } - // Thanks to MariaDB 5.5, we can have two fields - // be the same MySQL type but not the same toku type, - // This is an issue introduced with MariaDB's fractional time - // implementation - if (a_toku_type != b_toku_type) { - retval = false; - goto cleanup; - } - // make sure that either both are nullable, or both not nullable - if ((a->null_bit && !b->null_bit) || (!a->null_bit && b->null_bit)) { - retval = false; - goto cleanup; - } - switch (a_mysql_type) { - case MYSQL_TYPE_TINY: - case MYSQL_TYPE_SHORT: - case MYSQL_TYPE_INT24: - case MYSQL_TYPE_LONG: - case MYSQL_TYPE_LONGLONG: - // length, unsigned, auto increment - if (a->pack_length() != b->pack_length() || - (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG) || - (a->flags & AUTO_INCREMENT_FLAG) != (b->flags & AUTO_INCREMENT_FLAG)) { - retval = false; - goto cleanup; - } - break; - case MYSQL_TYPE_DOUBLE: - case MYSQL_TYPE_FLOAT: - // length, unsigned, auto increment - if (a->pack_length() != b->pack_length() || - (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG) || - (a->flags & AUTO_INCREMENT_FLAG) != (b->flags & AUTO_INCREMENT_FLAG)) { - retval = false; - goto cleanup; - } - break; - case MYSQL_TYPE_NEWDECIMAL: - // length, unsigned - if (a->pack_length() != b->pack_length() || - (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG)) { - retval = false; - goto cleanup; - } - break; - case MYSQL_TYPE_ENUM: - case 
MYSQL_TYPE_SET: - case MYSQL_TYPE_BIT: - // length - if (a->pack_length() != b->pack_length()) { - retval = false; - goto cleanup; - } - break; - case MYSQL_TYPE_DATE: - case MYSQL_TYPE_DATETIME: - case MYSQL_TYPE_YEAR: - case MYSQL_TYPE_NEWDATE: - case MYSQL_TYPE_TIME: - case MYSQL_TYPE_TIMESTAMP: -#if 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699 - case MYSQL_TYPE_DATETIME2: - case MYSQL_TYPE_TIMESTAMP2: - case MYSQL_TYPE_TIME2: -#endif - // length - if (a->pack_length() != b->pack_length()) { - retval = false; - goto cleanup; - } - break; - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - // test the charset - if (a->charset()->number != b->charset()->number) { - retval = false; - goto cleanup; - } - if (a->row_pack_length() != b->row_pack_length()) { - retval = false; - goto cleanup; - } - break; - case MYSQL_TYPE_STRING: - if (a->pack_length() != b->pack_length()) { - retval = false; - goto cleanup; - } - // if both are binary, we know have same pack lengths, - // so we can goto end - if (a->binary() && b->binary()) { - // nothing to do, we are good - } - else if (!a->binary() && !b->binary()) { - // test the charset - if (a->charset()->number != b->charset()->number) { - retval = false; - goto cleanup; - } - } - else { - // one is binary and the other is not, so not the same - retval = false; - goto cleanup; - } - break; - case MYSQL_TYPE_VARCHAR: - if (a->field_length != b->field_length) { - retval = false; - goto cleanup; - } - // if both are binary, we know have same pack lengths, - // so we can goto end - if (a->binary() && b->binary()) { - // nothing to do, we are good - } - else if (!a->binary() && !b->binary()) { - // test the charset - if (a->charset()->number != b->charset()->number) { - retval = false; - goto cleanup; - } - } - else { - // one is binary and the other is not, so not the same - retval = false; - goto cleanup; - } - break; - // - // I believe these are old types 
that are no longer - // in any 5.1 tables, so tokudb does not need - // to worry about them - // Putting in this assert in case I am wrong. - // Do not support geometry yet. - // - case MYSQL_TYPE_GEOMETRY: - case MYSQL_TYPE_DECIMAL: - case MYSQL_TYPE_VAR_STRING: - case MYSQL_TYPE_NULL: - assert(false); - } - -cleanup: - return retval; -} - - -bool are_two_fields_same( - Field* a, - Field* b - ) -{ - return fields_have_same_name(a, b) && fields_are_same_type(a, b); -} - - +#include "hatoku_cmp.h" + +#ifdef WORDS_BIGENDIAN +#error "WORDS_BIGENDIAN not supported" +#endif + +void get_var_field_info( + uint32_t* field_len, // output: length of field + uint32_t* start_offset, // output, length of offset where data starts + uint32_t var_field_index, //input, index of var field we want info on + const uchar* var_field_offset_ptr, //input, pointer to where offset information for all var fields begins + uint32_t num_offset_bytes //input, number of bytes used to store offsets starting at var_field_offset_ptr + ) +{ + uint32_t data_start_offset = 0; + uint32_t data_end_offset = 0; + switch (num_offset_bytes) { + case (1): + data_end_offset = (var_field_offset_ptr + var_field_index)[0]; + break; + case (2): + data_end_offset = uint2korr(var_field_offset_ptr + 2*var_field_index); + break; + default: + assert(false); + break; + } + + if (var_field_index) { + switch (num_offset_bytes) { + case (1): + data_start_offset = (var_field_offset_ptr + var_field_index - 1)[0]; + break; + case (2): + data_start_offset = uint2korr(var_field_offset_ptr + 2*(var_field_index-1)); + break; + default: + assert(false); + break; + } + } + else { + data_start_offset = 0; + } + + *start_offset = data_start_offset; + assert(data_end_offset >= data_start_offset); + *field_len = data_end_offset - data_start_offset; +} + +void get_blob_field_info( + uint32_t* start_offset, + uint32_t len_of_offsets, + const uchar* var_field_data_ptr, + uint32_t num_offset_bytes + ) +{ + uint32_t data_end_offset; + // + 
// need to set var_field_data_ptr to point to beginning of blobs, which + // is at the end of the var stuff (if they exist), if var stuff does not exist + // then the bottom variable will be 0, and var_field_data_ptr is already + // set correctly + // + if (len_of_offsets) { + switch (num_offset_bytes) { + case (1): + data_end_offset = (var_field_data_ptr - 1)[0]; + break; + case (2): + data_end_offset = uint2korr(var_field_data_ptr - 2); + break; + default: + assert(false); + break; + } + } + else { + data_end_offset = 0; + } + *start_offset = data_end_offset; +} + + +// this function is pattern matched from +// InnoDB's get_innobase_type_from_mysql_type +TOKU_TYPE mysql_to_toku_type (Field* field) { + TOKU_TYPE ret_val = toku_type_unknown; + enum_field_types mysql_type = field->real_type(); + switch (mysql_type) { + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_LONGLONG: + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDATE: + case MYSQL_TYPE_ENUM: + case MYSQL_TYPE_SET: + ret_val = toku_type_int; + goto exit; + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_TIMESTAMP: +#ifdef MARIADB_BASE_VERSION + // case to handle fractional seconds in MariaDB + // + if (field->key_type() == HA_KEYTYPE_BINARY) { + ret_val = toku_type_fixbinary; + goto exit; + } +#endif + ret_val = toku_type_int; + goto exit; + case MYSQL_TYPE_DOUBLE: + ret_val = toku_type_double; + goto exit; + case MYSQL_TYPE_FLOAT: + ret_val = toku_type_float; + goto exit; +#if 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699 + case MYSQL_TYPE_DATETIME2: + case MYSQL_TYPE_TIMESTAMP2: + case MYSQL_TYPE_TIME2: +#endif + case MYSQL_TYPE_NEWDECIMAL: + case MYSQL_TYPE_BIT: + ret_val = toku_type_fixbinary; + goto exit; + case MYSQL_TYPE_STRING: + if (field->binary()) { + ret_val = toku_type_fixbinary; + } + else { + ret_val = toku_type_fixstring; + } + goto exit; + case MYSQL_TYPE_VARCHAR: + if 
(field->binary()) { + ret_val = toku_type_varbinary; + } + else { + ret_val = toku_type_varstring; + } + goto exit; + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + ret_val = toku_type_blob; + goto exit; + // + // I believe these are old types that are no longer + // in any 5.1 tables, so tokudb does not need + // to worry about them + // Putting in this assert in case I am wrong. + // Do not support geometry yet. + // + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_DECIMAL: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_NULL: + assert(false); + } +exit: + return ret_val; +} + + +static inline CHARSET_INFO* get_charset_from_num (uint32_t charset_number) { + // + // patternmatched off of InnoDB, due to MySQL bug 42649 + // + if (charset_number == default_charset_info->number) { + return default_charset_info; + } + else if (charset_number == my_charset_latin1.number) { + return &my_charset_latin1; + } + else { + return get_charset(charset_number, MYF(MY_WME)); + } +} + + + +// +// used to read the length of a variable sized field in a tokudb key (buf). +// +static inline uint32_t get_length_from_var_tokudata (uchar* buf, uint32_t length_bytes) { + uint32_t length = (uint32_t)(buf[0]); + if (length_bytes == 2) { + uint32_t rest_of_length = (uint32_t)buf[1]; + length += rest_of_length<<8; + } + return length; +} + +// +// used to deduce the number of bytes used to store the length of a varstring/varbinary +// in a key field stored in tokudb +// +static inline uint32_t get_length_bytes_from_max(uint32_t max_num_bytes) { + return (max_num_bytes > 255) ? 
2 : 1; +} + + + +// +// assuming MySQL in little endian, and we are storing in little endian +// +static inline uchar* pack_toku_int (uchar* to_tokudb, uchar* from_mysql, uint32_t num_bytes) { + switch (num_bytes) { + case (1): + memcpy(to_tokudb, from_mysql, 1); + break; + case (2): + memcpy(to_tokudb, from_mysql, 2); + break; + case (3): + memcpy(to_tokudb, from_mysql, 3); + break; + case (4): + memcpy(to_tokudb, from_mysql, 4); + break; + case (8): + memcpy(to_tokudb, from_mysql, 8); + break; + default: + assert(false); + } + return to_tokudb+num_bytes; +} + +// +// assuming MySQL in little endian, and we are unpacking to little endian +// +static inline uchar* unpack_toku_int(uchar* to_mysql, uchar* from_tokudb, uint32_t num_bytes) { + switch (num_bytes) { + case (1): + memcpy(to_mysql, from_tokudb, 1); + break; + case (2): + memcpy(to_mysql, from_tokudb, 2); + break; + case (3): + memcpy(to_mysql, from_tokudb, 3); + break; + case (4): + memcpy(to_mysql, from_tokudb, 4); + break; + case (8): + memcpy(to_mysql, from_tokudb, 8); + break; + default: + assert(false); + } + return from_tokudb+num_bytes; +} + +static inline int cmp_toku_int (uchar* a_buf, uchar* b_buf, bool is_unsigned, uint32_t num_bytes) { + int ret_val = 0; + // + // case for unsigned integers + // + if (is_unsigned) { + uint32_t a_num, b_num = 0; + uint64_t a_big_num, b_big_num = 0; + switch (num_bytes) { + case (1): + a_num = *a_buf; + b_num = *b_buf; + ret_val = a_num-b_num; + goto exit; + case (2): + a_num = uint2korr(a_buf); + b_num = uint2korr(b_buf); + ret_val = a_num-b_num; + goto exit; + case (3): + a_num = uint3korr(a_buf); + b_num = uint3korr(b_buf); + ret_val = a_num-b_num; + goto exit; + case (4): + a_num = uint4korr(a_buf); + b_num = uint4korr(b_buf); + if (a_num < b_num) { + ret_val = -1; goto exit; + } + if (a_num > b_num) { + ret_val = 1; goto exit; + } + ret_val = 0; + goto exit; + case (8): + a_big_num = uint8korr(a_buf); + b_big_num = uint8korr(b_buf); + if (a_big_num < 
b_big_num) { + ret_val = -1; goto exit; + } + else if (a_big_num > b_big_num) { + ret_val = 1; goto exit; + } + ret_val = 0; + goto exit; + default: + assert(false); + } + } + // + // case for signed integers + // + else { + int32_t a_num, b_num = 0; + int64_t a_big_num, b_big_num = 0; + switch (num_bytes) { + case (1): + a_num = *(signed char *)a_buf; + b_num = *(signed char *)b_buf; + ret_val = a_num-b_num; + goto exit; + case (2): + a_num = sint2korr(a_buf); + b_num = sint2korr(b_buf); + ret_val = a_num-b_num; + goto exit; + case (3): + a_num = sint3korr(a_buf); + b_num = sint3korr(b_buf); + ret_val = a_num - b_num; + goto exit; + case (4): + a_num = sint4korr(a_buf); + b_num = sint4korr(b_buf); + if (a_num < b_num) { + ret_val = -1; goto exit; + } + if (a_num > b_num) { + ret_val = 1; goto exit; + } + ret_val = 0; + goto exit; + case (8): + a_big_num = sint8korr(a_buf); + b_big_num = sint8korr(b_buf); + if (a_big_num < b_big_num) { + ret_val = -1; goto exit; + } + else if (a_big_num > b_big_num) { + ret_val = 1; goto exit; + } + ret_val = 0; + goto exit; + default: + assert(false); + } + } + // + // if this is hit, indicates bug in writing of this function + // + assert(false); +exit: + return ret_val; +} + +static inline uchar* pack_toku_double (uchar* to_tokudb, uchar* from_mysql) { + memcpy(to_tokudb, from_mysql, sizeof(double)); + return to_tokudb + sizeof(double); +} + + +static inline uchar* unpack_toku_double(uchar* to_mysql, uchar* from_tokudb) { + memcpy(to_mysql, from_tokudb, sizeof(double)); + return from_tokudb + sizeof(double); +} + +static inline int cmp_toku_double(uchar* a_buf, uchar* b_buf) { + int ret_val; + double a_num; + double b_num; + doubleget(a_num, a_buf); + doubleget(b_num, b_buf); + if (a_num < b_num) { + ret_val = -1; + goto exit; + } + else if (a_num > b_num) { + ret_val = 1; + goto exit; + } + ret_val = 0; +exit: + return ret_val; +} + + +static inline uchar* pack_toku_float (uchar* to_tokudb, uchar* from_mysql) { + 
memcpy(to_tokudb, from_mysql, sizeof(float)); + return to_tokudb + sizeof(float); +} + + +static inline uchar* unpack_toku_float(uchar* to_mysql, uchar* from_tokudb) { + memcpy(to_mysql, from_tokudb, sizeof(float)); + return from_tokudb + sizeof(float); +} + +static inline int cmp_toku_float(uchar* a_buf, uchar* b_buf) { + int ret_val; + float a_num; + float b_num; + // + // This is the way Field_float::cmp gets the floats from the buffers + // + memcpy(&a_num, a_buf, sizeof(float)); + memcpy(&b_num, b_buf, sizeof(float)); + if (a_num < b_num) { + ret_val = -1; + goto exit; + } + else if (a_num > b_num) { + ret_val = 1; + goto exit; + } + ret_val = 0; +exit: + return ret_val; +} + + +static inline uchar* pack_toku_binary(uchar* to_tokudb, uchar* from_mysql, uint32_t num_bytes) { + memcpy(to_tokudb, from_mysql, num_bytes); + return to_tokudb + num_bytes; +} + +static inline uchar* unpack_toku_binary(uchar* to_mysql, uchar* from_tokudb, uint32_t num_bytes) { + memcpy(to_mysql, from_tokudb, num_bytes); + return from_tokudb + num_bytes; +} + + +static inline int cmp_toku_binary( + uchar* a_buf, + uint32_t a_num_bytes, + uchar* b_buf, + uint32_t b_num_bytes + ) +{ + int ret_val = 0; + uint32_t num_bytes_to_cmp = (a_num_bytes < b_num_bytes) ? 
a_num_bytes : b_num_bytes; + ret_val = memcmp(a_buf, b_buf, num_bytes_to_cmp); + if ((ret_val != 0) || (a_num_bytes == b_num_bytes)) { + goto exit; + } + if (a_num_bytes < b_num_bytes) { + ret_val = -1; + goto exit; + } + else { + ret_val = 1; + goto exit; + } +exit: + return ret_val; +} + +// +// partially copied from below +// +uchar* pack_toku_varbinary_from_desc( + uchar* to_tokudb, + const uchar* from_desc, + uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb + uint32_t field_length //length of field + ) +{ + uint32_t length_bytes_in_tokudb = get_length_bytes_from_max(key_part_length); + uint32_t length = field_length; + set_if_smaller(length, key_part_length); + + // + // copy the length bytes, assuming both are in little endian + // + to_tokudb[0] = (uchar)length & 255; + if (length_bytes_in_tokudb > 1) { + to_tokudb[1] = (uchar) (length >> 8); + } + // + // copy the string + // + memcpy(to_tokudb + length_bytes_in_tokudb, from_desc, length); + return to_tokudb + length + length_bytes_in_tokudb; +} + +static inline uchar* pack_toku_varbinary( + uchar* to_tokudb, + uchar* from_mysql, + uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql + uint32_t max_num_bytes + ) +{ + uint32_t length = 0; + uint32_t length_bytes_in_tokudb; + switch (length_bytes_in_mysql) { + case (0): + length = max_num_bytes; + break; + case (1): + length = (uint32_t)(*from_mysql); + break; + case (2): + length = uint2korr(from_mysql); + break; + case (3): + length = uint3korr(from_mysql); + break; + case (4): + length = uint4korr(from_mysql); + break; + } + + // + // from this point on, functionality equivalent to pack_toku_varbinary_from_desc + // + set_if_smaller(length,max_num_bytes); + + length_bytes_in_tokudb = get_length_bytes_from_max(max_num_bytes); + // + // copy the length bytes, assuming both are in little endian + // + to_tokudb[0] = (uchar)length & 255; + if (length_bytes_in_tokudb > 1) { + to_tokudb[1] 
= (uchar) (length >> 8); + } + // + // copy the string + // + memcpy(to_tokudb + length_bytes_in_tokudb, from_mysql + length_bytes_in_mysql, length); + return to_tokudb + length + length_bytes_in_tokudb; +} + +static inline uchar* unpack_toku_varbinary( + uchar* to_mysql, + uchar* from_tokudb, + uint32_t length_bytes_in_tokudb, // number of bytes used to encode length in from_tokudb + uint32_t length_bytes_in_mysql // number of bytes used to encode length in to_mysql + ) +{ + uint32_t length = get_length_from_var_tokudata(from_tokudb, length_bytes_in_tokudb); + + // + // copy the length into the mysql buffer + // + switch (length_bytes_in_mysql) { + case (0): + break; + case (1): + *to_mysql = (uchar) length; + break; + case (2): + int2store(to_mysql, length); + break; + case (3): + int3store(to_mysql, length); + break; + case (4): + int4store(to_mysql, length); + break; + default: + assert(false); + } + // + // copy the binary data + // + memcpy(to_mysql + length_bytes_in_mysql, from_tokudb + length_bytes_in_tokudb, length); + return from_tokudb + length_bytes_in_tokudb+ length; +} + +static inline int cmp_toku_varbinary( + uchar* a_buf, + uchar* b_buf, + uint32_t length_bytes, //number of bytes used to encode length in a_buf and b_buf + uint32_t* a_bytes_read, + uint32_t* b_bytes_read + ) +{ + int ret_val = 0; + uint32_t a_len = get_length_from_var_tokudata(a_buf, length_bytes); + uint32_t b_len = get_length_from_var_tokudata(b_buf, length_bytes); + ret_val = cmp_toku_binary( + a_buf + length_bytes, + a_len, + b_buf + length_bytes, + b_len + ); + *a_bytes_read = a_len + length_bytes; + *b_bytes_read = b_len + length_bytes; + return ret_val; +} + +static inline uchar* pack_toku_blob( + uchar* to_tokudb, + uchar* from_mysql, + uint32_t length_bytes_in_tokudb, //number of bytes to use to encode the length in to_tokudb + uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql + uint32_t max_num_bytes, +#if MYSQL_VERSION_ID >= 50600 + 
const CHARSET_INFO* charset +#else + CHARSET_INFO* charset +#endif + ) +{ + uint32_t length = 0; + uint32_t local_char_length = 0; + uchar* blob_buf = NULL; + + switch (length_bytes_in_mysql) { + case (0): + length = max_num_bytes; + break; + case (1): + length = (uint32_t)(*from_mysql); + break; + case (2): + length = uint2korr(from_mysql); + break; + case (3): + length = uint3korr(from_mysql); + break; + case (4): + length = uint4korr(from_mysql); + break; + } + set_if_smaller(length,max_num_bytes); + + memcpy(&blob_buf,from_mysql+length_bytes_in_mysql,sizeof(uchar *)); + + local_char_length= ((charset->mbmaxlen > 1) ? + max_num_bytes/charset->mbmaxlen : max_num_bytes); + if (length > local_char_length) + { + local_char_length= my_charpos( + charset, + blob_buf, + blob_buf+length, + local_char_length + ); + set_if_smaller(length, local_char_length); + } + + + // + // copy the length bytes, assuming both are in little endian + // + to_tokudb[0] = (uchar)length & 255; + if (length_bytes_in_tokudb > 1) { + to_tokudb[1] = (uchar) (length >> 8); + } + // + // copy the string + // + memcpy(to_tokudb + length_bytes_in_tokudb, blob_buf, length); + return to_tokudb + length + length_bytes_in_tokudb; +} + + +static inline uchar* unpack_toku_blob( + uchar* to_mysql, + uchar* from_tokudb, + uint32_t length_bytes_in_tokudb, // number of bytes used to encode length in from_tokudb + uint32_t length_bytes_in_mysql // number of bytes used to encode length in to_mysql + ) +{ + uint32_t length = get_length_from_var_tokudata(from_tokudb, length_bytes_in_tokudb); + uchar* blob_pos = NULL; + // + // copy the length into the mysql buffer + // + switch (length_bytes_in_mysql) { + case (0): + break; + case (1): + *to_mysql = (uchar) length; + break; + case (2): + int2store(to_mysql, length); + break; + case (3): + int3store(to_mysql, length); + break; + case (4): + int4store(to_mysql, length); + break; + default: + assert(false); + } + // + // copy the binary data + // + blob_pos = 
from_tokudb + length_bytes_in_tokudb; + memcpy(to_mysql + length_bytes_in_mysql, &blob_pos, sizeof(uchar *)); + return from_tokudb + length_bytes_in_tokudb+ length; +} + + +// +// partially copied from below +// +uchar* pack_toku_varstring_from_desc( + uchar* to_tokudb, + const uchar* from_desc, + uint32_t key_part_length, //number of bytes to use to encode the length in to_tokudb + uint32_t field_length, + uint32_t charset_num//length of field + ) +{ + CHARSET_INFO* charset = NULL; + uint32_t length_bytes_in_tokudb = get_length_bytes_from_max(key_part_length); + uint32_t length = field_length; + uint32_t local_char_length = 0; + set_if_smaller(length, key_part_length); + + charset = get_charset_from_num(charset_num); + + // + // copy the string + // + local_char_length= ((charset->mbmaxlen > 1) ? + key_part_length/charset->mbmaxlen : key_part_length); + if (length > local_char_length) + { + local_char_length= my_charpos( + charset, + from_desc, + from_desc+length, + local_char_length + ); + set_if_smaller(length, local_char_length); + } + + + // + // copy the length bytes, assuming both are in little endian + // + to_tokudb[0] = (uchar)length & 255; + if (length_bytes_in_tokudb > 1) { + to_tokudb[1] = (uchar) (length >> 8); + } + // + // copy the string + // + memcpy(to_tokudb + length_bytes_in_tokudb, from_desc, length); + return to_tokudb + length + length_bytes_in_tokudb; +} + +static inline uchar* pack_toku_varstring( + uchar* to_tokudb, + uchar* from_mysql, + uint32_t length_bytes_in_tokudb, //number of bytes to use to encode the length in to_tokudb + uint32_t length_bytes_in_mysql, //number of bytes used to encode the length in from_mysql + uint32_t max_num_bytes, +#if MYSQL_VERSION_ID >= 50600 + const CHARSET_INFO *charset +#else + CHARSET_INFO* charset +#endif + ) +{ + uint32_t length = 0; + uint32_t local_char_length = 0; + + switch (length_bytes_in_mysql) { + case (0): + length = max_num_bytes; + break; + case (1): + length = (uint32_t)(*from_mysql); + 
break; + case (2): + length = uint2korr(from_mysql); + break; + case (3): + length = uint3korr(from_mysql); + break; + case (4): + length = uint4korr(from_mysql); + break; + } + set_if_smaller(length,max_num_bytes); + + local_char_length= ((charset->mbmaxlen > 1) ? + max_num_bytes/charset->mbmaxlen : max_num_bytes); + if (length > local_char_length) + { + local_char_length= my_charpos( + charset, + from_mysql+length_bytes_in_mysql, + from_mysql+length_bytes_in_mysql+length, + local_char_length + ); + set_if_smaller(length, local_char_length); + } + + + // + // copy the length bytes, assuming both are in little endian + // + to_tokudb[0] = (uchar)length & 255; + if (length_bytes_in_tokudb > 1) { + to_tokudb[1] = (uchar) (length >> 8); + } + // + // copy the string + // + memcpy(to_tokudb + length_bytes_in_tokudb, from_mysql + length_bytes_in_mysql, length); + return to_tokudb + length + length_bytes_in_tokudb; +} + +static inline int cmp_toku_string( + uchar* a_buf, + uint32_t a_num_bytes, + uchar* b_buf, + uint32_t b_num_bytes, + uint32_t charset_number + ) +{ + int ret_val = 0; + CHARSET_INFO* charset = NULL; + + charset = get_charset_from_num(charset_number); + + ret_val = charset->coll->strnncollsp( + charset, + a_buf, + a_num_bytes, + b_buf, + b_num_bytes, + 0 + ); + return ret_val; +} + +static inline int cmp_toku_varstring( + uchar* a_buf, + uchar* b_buf, + uint32_t length_bytes, //number of bytes used to encode length in a_buf and b_buf + uint32_t charset_num, + uint32_t* a_bytes_read, + uint32_t* b_bytes_read + ) +{ + int ret_val = 0; + uint32_t a_len = get_length_from_var_tokudata(a_buf, length_bytes); + uint32_t b_len = get_length_from_var_tokudata(b_buf, length_bytes); + ret_val = cmp_toku_string( + a_buf + length_bytes, + a_len, + b_buf + length_bytes, + b_len, + charset_num + ); + *a_bytes_read = a_len + length_bytes; + *b_bytes_read = b_len + length_bytes; + return ret_val; +} + +static inline int tokudb_compare_two_hidden_keys( + const void* 
new_key_data, + const uint32_t new_key_size, + const void* saved_key_data, + const uint32_t saved_key_size + ) { + assert( (new_key_size >= TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH) && (saved_key_size >= TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH) ); + ulonglong a = hpk_char_to_num((uchar *) new_key_data); + ulonglong b = hpk_char_to_num((uchar *) saved_key_data); + return a < b ? -1 : (a > b ? 1 : 0); +} + +// +// Returns number of bytes used for a given TOKU_TYPE +// in a key descriptor. The number of bytes returned +// here MUST match the number of bytes used for the encoding +// in create_toku_key_descriptor_for_key +// Parameters: +// [in] row_desc - buffer that contains portion of descriptor +// created in create_toku_key_descriptor_for_key. The first +// byte points to the TOKU_TYPE. +// +uint32_t skip_field_in_descriptor(uchar* row_desc) { + uchar* row_desc_pos = row_desc; + TOKU_TYPE toku_type = (TOKU_TYPE)row_desc_pos[0]; + row_desc_pos++; + + switch (toku_type) { + case (toku_type_hpk): + case (toku_type_double): + case (toku_type_float): + break; + case (toku_type_int): + row_desc_pos += 2; + break; + case (toku_type_fixbinary): + case (toku_type_varbinary): + row_desc_pos++; + break; + case (toku_type_fixstring): + case (toku_type_varstring): + case (toku_type_blob): + row_desc_pos++; + row_desc_pos += sizeof(uint32_t); + break; + default: + assert(false); + break; + } + return (uint32_t)(row_desc_pos - row_desc); +} + +// +// outputs a descriptor for key into buf. Returns number of bytes used in buf +// to store the descriptor. 
Number of bytes used MUST match number of bytes +// we would skip in skip_field_in_descriptor +// +int create_toku_key_descriptor_for_key(KEY* key, uchar* buf) { + uchar* pos = buf; + uint32_t num_bytes_in_field = 0; + uint32_t charset_num = 0; + for (uint i = 0; i < get_key_parts(key); i++){ + Field* field = key->key_part[i].field; + // + // The first byte states if there is a null byte + // 0 means no null byte, non-zer means there + // is one + // + *pos = field->null_bit; + pos++; + + // + // The second byte for each field is the type + // + TOKU_TYPE type = mysql_to_toku_type(field); + assert (type < 256); + *pos = (uchar)(type & 255); + pos++; + + // + // based on the type, extra data follows afterwards + // + switch (type) { + // + // two bytes follow for ints, first one states how many + // bytes the int is (1 , 2, 3, 4 or 8) + // next one states if it is signed or not + // + case (toku_type_int): + num_bytes_in_field = field->pack_length(); + assert (num_bytes_in_field < 256); + *pos = (uchar)(num_bytes_in_field & 255); + pos++; + *pos = (field->flags & UNSIGNED_FLAG) ? 
1 : 0; + pos++; + break; + // + // nothing follows floats and doubles + // + case (toku_type_double): + case (toku_type_float): + break; + // + // one byte follow stating the length of the field + // + case (toku_type_fixbinary): + num_bytes_in_field = field->pack_length(); + set_if_smaller(num_bytes_in_field, key->key_part[i].length); + assert(num_bytes_in_field < 256); + pos[0] = (uchar)(num_bytes_in_field & 255); + pos++; + break; + // + // one byte follows: the number of bytes used to encode the length + // + case (toku_type_varbinary): + *pos = (uchar)(get_length_bytes_from_max(key->key_part[i].length) & 255); + pos++; + break; + // + // five bytes follow: one for the number of bytes to encode the length, + // four for the charset number + // + case (toku_type_fixstring): + case (toku_type_varstring): + case (toku_type_blob): + *pos = (uchar)(get_length_bytes_from_max(key->key_part[i].length) & 255); + pos++; + charset_num = field->charset()->number; + pos[0] = (uchar)(charset_num & 255); + pos[1] = (uchar)((charset_num >> 8) & 255); + pos[2] = (uchar)((charset_num >> 16) & 255); + pos[3] = (uchar)((charset_num >> 24) & 255); + pos += 4; + break; + default: + assert(false); + + } + } + return pos - buf; +} + + +// +// Creates a descriptor for a DB. That contains all information necessary +// to do both key comparisons and data comparisons (for dup-sort databases). +// +// There are two types of descriptors we care about: +// 1) Primary key, (in a no-dup database) +// 2) secondary keys, which are a secondary key followed by a primary key, +// but in a no-dup database. +// +// I realize this may be confusing, but here is how it works. +// All DB's have a key compare. +// The format of the descriptor must be able to handle both. +// +// The first four bytes store an offset into the descriptor to the second piece +// used for data comparisons. So, if in the future we want to append something +// to the descriptor, we can. 
+// +// +int create_toku_key_descriptor( + uchar* buf, + bool is_first_hpk, + KEY* first_key, + bool is_second_hpk, + KEY* second_key + ) +{ + // + // The first four bytes always contain the offset of where the first key + // ends. + // + uchar* pos = buf + 4; + uint32_t num_bytes = 0; + uint32_t offset = 0; + + + if (is_first_hpk) { + pos[0] = 0; //say there is NO infinity byte + pos[1] = 0; //field cannot be NULL, stating it + pos[2] = toku_type_hpk; + pos += 3; + } + else { + // + // first key is NOT a hidden primary key, so we now pack first_key + // + pos[0] = 1; //say there is an infinity byte + pos++; + num_bytes = create_toku_key_descriptor_for_key(first_key, pos); + pos += num_bytes; + } + + // + // if we do not have a second key, we can jump to exit right now + // we do not have a second key if it is not a hidden primary key + // and if second_key is NULL + // + if (is_first_hpk || (!is_second_hpk && (second_key == NULL)) ) { + goto exit; + } + + // + // if we have a second key, and it is an hpk, we need to pack it, and + // write in the offset to this position in the first four bytes + // + if (is_second_hpk) { + pos[0] = 0; //field cannot be NULL, stating it + pos[1] = toku_type_hpk; + pos += 2; + } + else { + // + // second key is NOT a hidden primary key, so we now pack second_key + // + num_bytes = create_toku_key_descriptor_for_key(second_key, pos); + pos += num_bytes; + } + + +exit: + offset = pos - buf; + buf[0] = (uchar)(offset & 255); + buf[1] = (uchar)((offset >> 8) & 255); + buf[2] = (uchar)((offset >> 16) & 255); + buf[3] = (uchar)((offset >> 24) & 255); + + return pos - buf; +} + + +static inline int compare_toku_field( + uchar* a_buf, + uchar* b_buf, + uchar* row_desc, + uint32_t* a_bytes_read, + uint32_t* b_bytes_read, + uint32_t* row_desc_bytes_read + ) +{ + int ret_val = 0; + uchar* row_desc_pos = row_desc; + uint32_t num_bytes = 0; + uint32_t length_bytes = 0; + uint32_t charset_num = 0; + bool is_unsigned = false; + + TOKU_TYPE 
toku_type = (TOKU_TYPE)row_desc_pos[0]; + row_desc_pos++; + + switch (toku_type) { + case (toku_type_hpk): + ret_val = tokudb_compare_two_hidden_keys( + a_buf, + TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH, + b_buf, + TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + ); + *a_bytes_read = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH; + *b_bytes_read = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH; + break; + case (toku_type_int): + num_bytes = row_desc_pos[0]; + is_unsigned = row_desc_pos[1]; + ret_val = cmp_toku_int( + a_buf, + b_buf, + is_unsigned, + num_bytes + ); + *a_bytes_read = num_bytes; + *b_bytes_read = num_bytes; + row_desc_pos += 2; + break; + case (toku_type_double): + ret_val = cmp_toku_double(a_buf, b_buf); + *a_bytes_read = sizeof(double); + *b_bytes_read = sizeof(double); + break; + case (toku_type_float): + ret_val = cmp_toku_float(a_buf, b_buf); + *a_bytes_read = sizeof(float); + *b_bytes_read = sizeof(float); + break; + case (toku_type_fixbinary): + num_bytes = row_desc_pos[0]; + ret_val = cmp_toku_binary(a_buf, num_bytes, b_buf,num_bytes); + *a_bytes_read = num_bytes; + *b_bytes_read = num_bytes; + row_desc_pos++; + break; + case (toku_type_varbinary): + length_bytes = row_desc_pos[0]; + ret_val = cmp_toku_varbinary( + a_buf, + b_buf, + length_bytes, + a_bytes_read, + b_bytes_read + ); + row_desc_pos++; + break; + case (toku_type_fixstring): + case (toku_type_varstring): + case (toku_type_blob): + length_bytes = row_desc_pos[0]; + row_desc_pos++; + // + // not sure we want to read charset_num like this + // + charset_num = *(uint32_t *)row_desc_pos; + row_desc_pos += sizeof(uint32_t); + ret_val = cmp_toku_varstring( + a_buf, + b_buf, + length_bytes, + charset_num, + a_bytes_read, + b_bytes_read + ); + break; + default: + assert(false); + break; + } + + *row_desc_bytes_read = row_desc_pos - row_desc; + return ret_val; +} + +// +// packs a field from a MySQL buffer into a tokudb buffer. 
+// Used for inserts/updates +// +uchar* pack_toku_key_field( + uchar* to_tokudb, + uchar* from_mysql, + Field* field, + uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff + ) +{ + uchar* new_pos = NULL; + uint32_t num_bytes = 0; + TOKU_TYPE toku_type = mysql_to_toku_type(field); + switch(toku_type) { + case (toku_type_int): + assert(key_part_length == field->pack_length()); + new_pos = pack_toku_int( + to_tokudb, + from_mysql, + field->pack_length() + ); + goto exit; + case (toku_type_double): + assert(field->pack_length() == sizeof(double)); + assert(key_part_length == sizeof(double)); + new_pos = pack_toku_double(to_tokudb, from_mysql); + goto exit; + case (toku_type_float): + assert(field->pack_length() == sizeof(float)); + assert(key_part_length == sizeof(float)); + new_pos = pack_toku_float(to_tokudb, from_mysql); + goto exit; + case (toku_type_fixbinary): + num_bytes = field->pack_length(); + set_if_smaller(num_bytes, key_part_length); + new_pos = pack_toku_binary( + to_tokudb, + from_mysql, + num_bytes + ); + goto exit; + case (toku_type_fixstring): + num_bytes = field->pack_length(); + set_if_smaller(num_bytes, key_part_length); + new_pos = pack_toku_varstring( + to_tokudb, + from_mysql, + get_length_bytes_from_max(key_part_length), + 0, + num_bytes, + field->charset() + ); + goto exit; + case (toku_type_varbinary): + new_pos = pack_toku_varbinary( + to_tokudb, + from_mysql, + ((Field_varstring *)field)->length_bytes, + key_part_length + ); + goto exit; + case (toku_type_varstring): + new_pos = pack_toku_varstring( + to_tokudb, + from_mysql, + get_length_bytes_from_max(key_part_length), + ((Field_varstring *)field)->length_bytes, + key_part_length, + field->charset() + ); + goto exit; + case (toku_type_blob): + new_pos = pack_toku_blob( + to_tokudb, + from_mysql, + get_length_bytes_from_max(key_part_length), + ((Field_blob *)field)->row_pack_length(), //only calling this because packlength is returned + 
key_part_length, + field->charset() + ); + goto exit; + default: + assert(false); + } + assert(false); +exit: + return new_pos; +} + +// +// packs a field from a MySQL buffer into a tokudb buffer. +// Used for queries. The only difference between this function +// and pack_toku_key_field is that all variable sized columns +// use 2 bytes to encode the length, regardless of the field +// So varchar(4) will still use 2 bytes to encode the field +// +uchar* pack_key_toku_key_field( + uchar* to_tokudb, + uchar* from_mysql, + Field* field, + uint32_t key_part_length //I really hope this is temporary as I phase out the pack_cmp stuff + ) +{ + uchar* new_pos = NULL; + TOKU_TYPE toku_type = mysql_to_toku_type(field); + switch(toku_type) { + case (toku_type_int): + case (toku_type_double): + case (toku_type_float): + case (toku_type_fixbinary): + case (toku_type_fixstring): + new_pos = pack_toku_key_field(to_tokudb, from_mysql, field, key_part_length); + goto exit; + case (toku_type_varbinary): + new_pos = pack_toku_varbinary( + to_tokudb, + from_mysql, + 2, // for some idiotic reason, 2 bytes are always used here, regardless of length of field + key_part_length + ); + goto exit; + case (toku_type_varstring): + case (toku_type_blob): + new_pos = pack_toku_varstring( + to_tokudb, + from_mysql, + get_length_bytes_from_max(key_part_length), + 2, // for some idiotic reason, 2 bytes are always used here, regardless of length of field + key_part_length, + field->charset() + ); + goto exit; + default: + assert(false); + } + + assert(false); +exit: + return new_pos; +} + + +uchar* unpack_toku_key_field( + uchar* to_mysql, + uchar* from_tokudb, + Field* field, + uint32_t key_part_length + ) +{ + uchar* new_pos = NULL; + uint32_t num_bytes = 0; + uint32_t num_bytes_copied; + TOKU_TYPE toku_type = mysql_to_toku_type(field); + switch(toku_type) { + case (toku_type_int): + assert(key_part_length == field->pack_length()); + new_pos = unpack_toku_int( + to_mysql, + from_tokudb, + 
field->pack_length() + ); + goto exit; + case (toku_type_double): + assert(field->pack_length() == sizeof(double)); + assert(key_part_length == sizeof(double)); + new_pos = unpack_toku_double(to_mysql, from_tokudb); + goto exit; + case (toku_type_float): + assert(field->pack_length() == sizeof(float)); + assert(key_part_length == sizeof(float)); + new_pos = unpack_toku_float(to_mysql, from_tokudb); + goto exit; + case (toku_type_fixbinary): + num_bytes = field->pack_length(); + set_if_smaller(num_bytes, key_part_length); + new_pos = unpack_toku_binary( + to_mysql, + from_tokudb, + num_bytes + ); + goto exit; + case (toku_type_fixstring): + num_bytes = field->pack_length(); + new_pos = unpack_toku_varbinary( + to_mysql, + from_tokudb, + get_length_bytes_from_max(key_part_length), + 0 + ); + num_bytes_copied = new_pos - (from_tokudb + get_length_bytes_from_max(key_part_length)); + assert(num_bytes_copied <= num_bytes); + memset(to_mysql+num_bytes_copied, field->charset()->pad_char, num_bytes - num_bytes_copied); + goto exit; + case (toku_type_varbinary): + case (toku_type_varstring): + new_pos = unpack_toku_varbinary( + to_mysql, + from_tokudb, + get_length_bytes_from_max(key_part_length), + ((Field_varstring *)field)->length_bytes + ); + goto exit; + case (toku_type_blob): + new_pos = unpack_toku_blob( + to_mysql, + from_tokudb, + get_length_bytes_from_max(key_part_length), + ((Field_blob *)field)->row_pack_length() //only calling this because packlength is returned + ); + goto exit; + default: + assert(false); + } + assert(false); +exit: + return new_pos; +} + + +int tokudb_compare_two_keys( + const void* new_key_data, + const uint32_t new_key_size, + const void* saved_key_data, + const uint32_t saved_key_size, + const void* row_desc, + const uint32_t row_desc_size, + bool cmp_prefix + ) +{ + int ret_val = 0; + int8_t new_key_inf_val = COL_NEG_INF; + int8_t saved_key_inf_val = COL_NEG_INF; + + uchar* row_desc_ptr = (uchar *)row_desc; + uchar *new_key_ptr = (uchar 
*)new_key_data; + uchar *saved_key_ptr = (uchar *)saved_key_data; + + uint32_t new_key_bytes_left = new_key_size; + uint32_t saved_key_bytes_left = saved_key_size; + + // + // if the keys have an infinity byte, set it + // + if (row_desc_ptr[0]) { + new_key_inf_val = (int8_t)new_key_ptr[0]; + saved_key_inf_val = (int8_t)saved_key_ptr[0]; + new_key_ptr++; + saved_key_ptr++; + } + row_desc_ptr++; + + while ( (uint32_t)(new_key_ptr - (uchar *)new_key_data) < new_key_size && + (uint32_t)(saved_key_ptr - (uchar *)saved_key_data) < saved_key_size && + (uint32_t)(row_desc_ptr - (uchar *)row_desc) < row_desc_size + ) + { + uint32_t new_key_field_length; + uint32_t saved_key_field_length; + uint32_t row_desc_field_length; + // + // if there is a null byte at this point in the key + // + if (row_desc_ptr[0]) { + // + // compare null bytes. If different, return + // + if (new_key_ptr[0] != saved_key_ptr[0]) { + ret_val = ((int) *new_key_ptr - (int) *saved_key_ptr); + goto exit; + } + saved_key_ptr++; + // + // in case we just read the fact that new_key_ptr and saved_key_ptr + // have NULL as their next field + // + if (!*new_key_ptr++) { + // + // skip row_desc_ptr[0] read in if clause + // + row_desc_ptr++; + // + // skip data that describes rest of field + // + row_desc_ptr += skip_field_in_descriptor(row_desc_ptr); + continue; + } + } + row_desc_ptr++; + + ret_val = compare_toku_field( + new_key_ptr, + saved_key_ptr, + row_desc_ptr, + &new_key_field_length, + &saved_key_field_length, + &row_desc_field_length + ); + new_key_ptr += new_key_field_length; + saved_key_ptr += saved_key_field_length; + row_desc_ptr += row_desc_field_length; + if (ret_val) { + goto exit; + } + + assert((uint32_t)(new_key_ptr - (uchar *)new_key_data) <= new_key_size); + assert((uint32_t)(saved_key_ptr - (uchar *)saved_key_data) <= saved_key_size); + assert((uint32_t)(row_desc_ptr - (uchar *)row_desc) <= row_desc_size); + } + new_key_bytes_left = new_key_size - ((uint32_t)(new_key_ptr - (uchar 
*)new_key_data)); + saved_key_bytes_left = saved_key_size - ((uint32_t)(saved_key_ptr - (uchar *)saved_key_data)); + if (cmp_prefix) { + ret_val = 0; + } + // + // in this case, read both keys to completion, now read infinity byte + // + else if (new_key_bytes_left== 0 && saved_key_bytes_left== 0) { + ret_val = new_key_inf_val - saved_key_inf_val; + } + // + // at this point, one SHOULD be 0 + // + else if (new_key_bytes_left == 0 && saved_key_bytes_left > 0) { + ret_val = (new_key_inf_val == COL_POS_INF ) ? 1 : -1; + } + else if (new_key_bytes_left > 0 && saved_key_bytes_left == 0) { + ret_val = (saved_key_inf_val == COL_POS_INF ) ? -1 : 1; + } + // + // this should never happen, perhaps we should assert(false) + // + else { + assert(false); + ret_val = new_key_bytes_left - saved_key_bytes_left; + } +exit: + return ret_val; +} + +int tokudb_cmp_dbt_key(DB* file, const DBT *keya, const DBT *keyb) { + int cmp; + if (file->cmp_descriptor->dbt.size == 0) { + int num_bytes_cmp = keya->size < keyb->size ? + keya->size : keyb->size; + cmp = memcmp(keya->data,keyb->data,num_bytes_cmp); + if (cmp == 0 && (keya->size != keyb->size)) { + cmp = keya->size < keyb->size ? -1 : 1; + } + } + else { + cmp = tokudb_compare_two_keys( + keya->data, + keya->size, + keyb->data, + keyb->size, + (uchar *)file->cmp_descriptor->dbt.data + 4, + (*(uint32_t *)file->cmp_descriptor->dbt.data) - 4, + false + ); + } + return cmp; +} + +//TODO: QQQ Only do one direction for prefix. 
+int tokudb_prefix_cmp_dbt_key(DB *file, const DBT *keya, const DBT *keyb) { + int cmp = tokudb_compare_two_keys( + keya->data, + keya->size, + keyb->data, + keyb->size, + (uchar *)file->cmp_descriptor->dbt.data + 4, + *(uint32_t *)file->cmp_descriptor->dbt.data - 4, + true + ); + return cmp; +} + +static int tokudb_compare_two_key_parts( + const void* new_key_data, + const uint32_t new_key_size, + const void* saved_key_data, + const uint32_t saved_key_size, + const void* row_desc, + const uint32_t row_desc_size, + uint max_parts + ) +{ + int ret_val = 0; + + uchar* row_desc_ptr = (uchar *)row_desc; + uchar *new_key_ptr = (uchar *)new_key_data; + uchar *saved_key_ptr = (uchar *)saved_key_data; + + // + // if the keys have an infinity byte, set it + // + if (row_desc_ptr[0]) { + // new_key_inf_val = (int8_t)new_key_ptr[0]; + // saved_key_inf_val = (int8_t)saved_key_ptr[0]; + new_key_ptr++; + saved_key_ptr++; + } + row_desc_ptr++; + + for (uint i = 0; i < max_parts; i++) { + if (!((uint32_t)(new_key_ptr - (uchar *)new_key_data) < new_key_size && + (uint32_t)(saved_key_ptr - (uchar *)saved_key_data) < saved_key_size && + (uint32_t)(row_desc_ptr - (uchar *)row_desc) < row_desc_size)) + break; + uint32_t new_key_field_length; + uint32_t saved_key_field_length; + uint32_t row_desc_field_length; + // + // if there is a null byte at this point in the key + // + if (row_desc_ptr[0]) { + // + // compare null bytes. 
If different, return + // + if (new_key_ptr[0] != saved_key_ptr[0]) { + ret_val = ((int) *new_key_ptr - (int) *saved_key_ptr); + goto exit; + } + saved_key_ptr++; + // + // in case we just read the fact that new_key_ptr and saved_key_ptr + // have NULL as their next field + // + if (!*new_key_ptr++) { + // + // skip row_desc_ptr[0] read in if clause + // + row_desc_ptr++; + // + // skip data that describes rest of field + // + row_desc_ptr += skip_field_in_descriptor(row_desc_ptr); + continue; + } + } + row_desc_ptr++; + + ret_val = compare_toku_field( + new_key_ptr, + saved_key_ptr, + row_desc_ptr, + &new_key_field_length, + &saved_key_field_length, + &row_desc_field_length + ); + new_key_ptr += new_key_field_length; + saved_key_ptr += saved_key_field_length; + row_desc_ptr += row_desc_field_length; + if (ret_val) { + goto exit; + } + + assert((uint32_t)(new_key_ptr - (uchar *)new_key_data) <= new_key_size); + assert((uint32_t)(saved_key_ptr - (uchar *)saved_key_data) <= saved_key_size); + assert((uint32_t)(row_desc_ptr - (uchar *)row_desc) <= row_desc_size); + } + + ret_val = 0; +exit: + return ret_val; +} + +static int tokudb_cmp_dbt_key_parts(DB *file, const DBT *keya, const DBT *keyb, uint max_parts) { + assert(file->cmp_descriptor->dbt.size); + return tokudb_compare_two_key_parts( + keya->data, + keya->size, + keyb->data, + keyb->size, + (uchar *)file->cmp_descriptor->dbt.data + 4, + (*(uint32_t *)file->cmp_descriptor->dbt.data) - 4, + max_parts); +} + +uint32_t create_toku_main_key_pack_descriptor ( + uchar* buf + ) +{ + // + // The first four bytes always contain the offset of where the first key + // ends. 
+ // + uchar* pos = buf + 4; + uint32_t offset = 0; + // + // one byte states if this is the main dictionary + // + pos[0] = 1; + pos++; + goto exit; + + +exit: + offset = pos - buf; + buf[0] = (uchar)(offset & 255); + buf[1] = (uchar)((offset >> 8) & 255); + buf[2] = (uchar)((offset >> 16) & 255); + buf[3] = (uchar)((offset >> 24) & 255); + + return pos - buf; +} + +#define COL_FIX_FIELD 0x11 +#define COL_VAR_FIELD 0x22 +#define COL_BLOB_FIELD 0x33 + +#define COL_HAS_NO_CHARSET 0x44 +#define COL_HAS_CHARSET 0x55 + +#define COL_FIX_PK_OFFSET 0x66 +#define COL_VAR_PK_OFFSET 0x77 + +#define CK_FIX_RANGE 0x88 +#define CK_VAR_RANGE 0x99 + +#define COPY_OFFSET_TO_BUF memcpy ( \ + pos, \ + &kc_info->cp_info[pk_index][field_index].col_pack_val, \ + sizeof(uint32_t) \ + ); \ + pos += sizeof(uint32_t); + + +uint32_t pack_desc_pk_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { + uchar* pos = buf; + uint16 field_index = key_part->field->field_index; + Field* field = table_share->field[field_index]; + TOKU_TYPE toku_type = mysql_to_toku_type(field); + uint32_t key_part_length = key_part->length; + uint32_t field_length; + uchar len_bytes = 0; + + switch(toku_type) { + case (toku_type_int): + case (toku_type_double): + case (toku_type_float): + pos[0] = COL_FIX_FIELD; + pos++; + assert(kc_info->field_lengths[field_index] < 256); + pos[0] = kc_info->field_lengths[field_index]; + pos++; + break; + case (toku_type_fixbinary): + pos[0] = COL_FIX_FIELD; + pos++; + field_length = field->pack_length(); + set_if_smaller(key_part_length, field_length); + assert(key_part_length < 256); + pos[0] = (uchar)key_part_length; + pos++; + break; + case (toku_type_fixstring): + case (toku_type_varbinary): + case (toku_type_varstring): + case (toku_type_blob): + pos[0] = COL_VAR_FIELD; + pos++; + len_bytes = (key_part_length > 255) ? 
2 : 1; + pos[0] = len_bytes; + pos++; + break; + default: + assert(false); + } + + return pos - buf; +} + +uint32_t pack_desc_pk_offset_info( + uchar* buf, + KEY_AND_COL_INFO* kc_info, + TABLE_SHARE* table_share, + KEY_PART_INFO* key_part, + KEY* prim_key, + uchar* pk_info + ) +{ + uchar* pos = buf; + uint16 field_index = key_part->field->field_index; + bool found_col_in_pk = false; + uint32_t index_in_pk; + + bool is_constant_offset = true; + uint32_t offset = 0; + for (uint i = 0; i < get_key_parts(prim_key); i++) { + KEY_PART_INFO curr = prim_key->key_part[i]; + uint16 curr_field_index = curr.field->field_index; + + if (pk_info[2*i] == COL_VAR_FIELD) { + is_constant_offset = false; + } + + if (curr_field_index == field_index) { + found_col_in_pk = true; + index_in_pk = i; + break; + } + offset += pk_info[2*i + 1]; + } + assert(found_col_in_pk); + if (is_constant_offset) { + pos[0] = COL_FIX_PK_OFFSET; + pos++; + + memcpy (pos, &offset, sizeof(offset)); + pos += sizeof(offset); + } + else { + pos[0] = COL_VAR_PK_OFFSET; + pos++; + + memcpy(pos, &index_in_pk, sizeof(index_in_pk)); + pos += sizeof(index_in_pk); + } + return pos - buf; +} + +uint32_t pack_desc_offset_info(uchar* buf, KEY_AND_COL_INFO* kc_info, uint pk_index, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { + uchar* pos = buf; + uint16 field_index = key_part->field->field_index; + Field* field = table_share->field[field_index]; + TOKU_TYPE toku_type = mysql_to_toku_type(field); + bool found_index = false; + + switch(toku_type) { + case (toku_type_int): + case (toku_type_double): + case (toku_type_float): + case (toku_type_fixbinary): + case (toku_type_fixstring): + pos[0] = COL_FIX_FIELD; + pos++; + + // copy the offset + COPY_OFFSET_TO_BUF; + break; + case (toku_type_varbinary): + case (toku_type_varstring): + pos[0] = COL_VAR_FIELD; + pos++; + + // copy the offset + COPY_OFFSET_TO_BUF; + break; + case (toku_type_blob): + pos[0] = COL_BLOB_FIELD; + pos++; + for (uint32_t i = 0; i < 
kc_info->num_blobs; i++) { + uint32_t blob_index = kc_info->blob_fields[i]; + if (blob_index == field_index) { + uint32_t val = i; + memcpy(pos, &val, sizeof(uint32_t)); + pos += sizeof(uint32_t); + found_index = true; + break; + } + } + assert(found_index); + break; + default: + assert(false); + } + + return pos - buf; +} + +uint32_t pack_desc_key_length_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { + uchar* pos = buf; + uint16 field_index = key_part->field->field_index; + Field* field = table_share->field[field_index]; + TOKU_TYPE toku_type = mysql_to_toku_type(field); + uint32_t key_part_length = key_part->length; + uint32_t field_length; + + switch(toku_type) { + case (toku_type_int): + case (toku_type_double): + case (toku_type_float): + // copy the key_part length + field_length = kc_info->field_lengths[field_index]; + memcpy(pos, &field_length, sizeof(field_length)); + pos += sizeof(key_part_length); + break; + case (toku_type_fixbinary): + case (toku_type_fixstring): + field_length = field->pack_length(); + set_if_smaller(key_part_length, field_length); + case (toku_type_varbinary): + case (toku_type_varstring): + case (toku_type_blob): + // copy the key_part length + memcpy(pos, &key_part_length, sizeof(key_part_length)); + pos += sizeof(key_part_length); + break; + default: + assert(false); + } + + return pos - buf; +} + +uint32_t pack_desc_char_info(uchar* buf, KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, KEY_PART_INFO* key_part) { + uchar* pos = buf; + uint16 field_index = key_part->field->field_index; + Field* field = table_share->field[field_index]; + TOKU_TYPE toku_type = mysql_to_toku_type(field); + uint32_t charset_num = 0; + + switch(toku_type) { + case (toku_type_int): + case (toku_type_double): + case (toku_type_float): + case (toku_type_fixbinary): + case (toku_type_varbinary): + pos[0] = COL_HAS_NO_CHARSET; + pos++; + break; + case (toku_type_fixstring): + case (toku_type_varstring): + 
case (toku_type_blob): + pos[0] = COL_HAS_CHARSET; + pos++; + + // copy the charset + charset_num = field->charset()->number; + pos[0] = (uchar)(charset_num & 255); + pos[1] = (uchar)((charset_num >> 8) & 255); + pos[2] = (uchar)((charset_num >> 16) & 255); + pos[3] = (uchar)((charset_num >> 24) & 255); + pos += 4; + break; + default: + assert(false); + } + + return pos - buf; +} + +uint32_t pack_some_row_info ( + uchar* buf, + uint pk_index, + TABLE_SHARE* table_share, + KEY_AND_COL_INFO* kc_info + ) +{ + uchar* pos = buf; + uint32_t num_null_bytes = 0; + // + // four bytes stating number of null bytes + // + num_null_bytes = table_share->null_bytes; + memcpy(pos, &num_null_bytes, sizeof(num_null_bytes)); + pos += sizeof(num_null_bytes); + // + // eight bytes stating mcp_info + // + memcpy(pos, &kc_info->mcp_info[pk_index], sizeof(MULTI_COL_PACK_INFO)); + pos += sizeof(MULTI_COL_PACK_INFO); + // + // one byte for the number of offset bytes + // + pos[0] = (uchar)kc_info->num_offset_bytes; + pos++; + + return pos - buf; +} + +uint32_t get_max_clustering_val_pack_desc_size( + TABLE_SHARE* table_share + ) +{ + uint32_t ret_val = 0; + // + // the fixed stuff: + // first the things in pack_some_row_info + // second another mcp_info + // third a byte that states if blobs exist + ret_val += sizeof(uint32_t) + sizeof(MULTI_COL_PACK_INFO) + 1; + ret_val += sizeof(MULTI_COL_PACK_INFO); + ret_val++; + // + // now the variable stuff + // an upper bound is, for each field, byte stating if it is fixed or var, followed + // by 8 bytes for endpoints + // + ret_val += (table_share->fields)*(1 + 2*sizeof(uint32_t)); + // + // four bytes storing the length of this portion + // + ret_val += 4; + + return ret_val; +} + +uint32_t create_toku_clustering_val_pack_descriptor ( + uchar* buf, + uint pk_index, + TABLE_SHARE* table_share, + KEY_AND_COL_INFO* kc_info, + uint32_t keynr, + bool is_clustering + ) +{ + uchar* pos = buf + 4; + uint32_t offset = 0; + bool start_range_set = false; + 
uint32_t last_col = 0; + // + // do not need to write anything if the key is not clustering + // + if (!is_clustering) { + goto exit; + } + + pos += pack_some_row_info( + pos, + pk_index, + table_share, + kc_info + ); + + // + // eight bytes stating mcp_info of clustering key + // + memcpy(pos, &kc_info->mcp_info[keynr], sizeof(MULTI_COL_PACK_INFO)); + pos += sizeof(MULTI_COL_PACK_INFO); + + // + // store bit that states if blobs exist + // + pos[0] = (kc_info->num_blobs) ? 1 : 0; + pos++; + + // + // descriptor assumes that all fields filtered from pk are + // also filtered from clustering key val. Doing check here to + // make sure something unexpected does not happen + // + for (uint i = 0; i < table_share->fields; i++) { + bool col_filtered = bitmap_is_set(&kc_info->key_filters[keynr],i); + bool col_filtered_in_pk = bitmap_is_set(&kc_info->key_filters[pk_index],i); + if (col_filtered_in_pk) { + assert(col_filtered); + } + } + + // + // first handle the fixed fields + // + start_range_set = false; + last_col = 0; + for (uint i = 0; i < table_share->fields; i++) { + bool col_filtered = bitmap_is_set(&kc_info->key_filters[keynr],i); + if (kc_info->field_lengths[i] == 0) { + // + // not a fixed field, continue + // + continue; + } + if (col_filtered && start_range_set) { + // + // need to set the end range + // + start_range_set = false; + uint32_t end_offset = kc_info->cp_info[pk_index][last_col].col_pack_val + kc_info->field_lengths[last_col]; + memcpy(pos, &end_offset, sizeof(end_offset)); + pos += sizeof(end_offset); + } + else if (!col_filtered) { + if (!start_range_set) { + pos[0] = CK_FIX_RANGE; + pos++; + start_range_set = true; + uint32_t start_offset = kc_info->cp_info[pk_index][i].col_pack_val; + memcpy(pos, &start_offset , sizeof(start_offset)); + pos += sizeof(start_offset); + } + last_col = i; + } + else { + continue; + } + } + if (start_range_set) { + // + // need to set the end range + // + start_range_set = false; + uint32_t end_offset = 
kc_info->cp_info[pk_index][last_col].col_pack_val+ kc_info->field_lengths[last_col]; + memcpy(pos, &end_offset, sizeof(end_offset)); + pos += sizeof(end_offset); + } + + // + // now handle the var fields + // + start_range_set = false; + last_col = 0; + for (uint i = 0; i < table_share->fields; i++) { + bool col_filtered = bitmap_is_set(&kc_info->key_filters[keynr],i); + if (kc_info->length_bytes[i] == 0) { + // + // not a var field, continue + // + continue; + } + if (col_filtered && start_range_set) { + // + // need to set the end range + // + start_range_set = false; + uint32_t end_offset = kc_info->cp_info[pk_index][last_col].col_pack_val; + memcpy(pos, &end_offset, sizeof(end_offset)); + pos += sizeof(end_offset); + } + else if (!col_filtered) { + if (!start_range_set) { + pos[0] = CK_VAR_RANGE; + pos++; + + start_range_set = true; + uint32_t start_offset = kc_info->cp_info[pk_index][i].col_pack_val; + memcpy(pos, &start_offset , sizeof(start_offset)); + pos += sizeof(start_offset); + } + last_col = i; + } + else { + continue; + } + } + if (start_range_set) { + start_range_set = false; + uint32_t end_offset = kc_info->cp_info[pk_index][last_col].col_pack_val; + memcpy(pos, &end_offset, sizeof(end_offset)); + pos += sizeof(end_offset); + } + +exit: + offset = pos - buf; + buf[0] = (uchar)(offset & 255); + buf[1] = (uchar)((offset >> 8) & 255); + buf[2] = (uchar)((offset >> 16) & 255); + buf[3] = (uchar)((offset >> 24) & 255); + + return pos - buf; +} + +uint32_t pack_clustering_val_from_desc( + uchar* buf, + void* row_desc, + uint32_t row_desc_size, + const DBT* pk_val + ) +{ + uchar* null_bytes_src_ptr = NULL; + uchar* fixed_src_ptr = NULL; + uchar* var_src_offset_ptr = NULL; + uchar* var_src_data_ptr = NULL; + uchar* fixed_dest_ptr = NULL; + uchar* var_dest_offset_ptr = NULL; + uchar* var_dest_data_ptr = NULL; + uchar* orig_var_dest_data_ptr = NULL; + uchar* desc_pos = (uchar *)row_desc; + uint32_t num_null_bytes = 0; + uint32_t num_offset_bytes; + 
MULTI_COL_PACK_INFO src_mcp_info, dest_mcp_info; + uchar has_blobs; + + memcpy(&num_null_bytes, desc_pos, sizeof(num_null_bytes)); + desc_pos += sizeof(num_null_bytes); + + memcpy(&src_mcp_info, desc_pos, sizeof(src_mcp_info)); + desc_pos += sizeof(src_mcp_info); + + num_offset_bytes = desc_pos[0]; + desc_pos++; + + memcpy(&dest_mcp_info, desc_pos, sizeof(dest_mcp_info)); + desc_pos += sizeof(dest_mcp_info); + + has_blobs = desc_pos[0]; + desc_pos++; + + // + //set the variables + // + null_bytes_src_ptr = (uchar *)pk_val->data; + fixed_src_ptr = null_bytes_src_ptr + num_null_bytes; + var_src_offset_ptr = fixed_src_ptr + src_mcp_info.fixed_field_size; + var_src_data_ptr = var_src_offset_ptr + src_mcp_info.len_of_offsets; + + fixed_dest_ptr = buf + num_null_bytes; + var_dest_offset_ptr = fixed_dest_ptr + dest_mcp_info.fixed_field_size; + var_dest_data_ptr = var_dest_offset_ptr + dest_mcp_info.len_of_offsets; + orig_var_dest_data_ptr = var_dest_data_ptr; + + // + // copy the null bytes + // + memcpy(buf, null_bytes_src_ptr, num_null_bytes); + while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) { + uint32_t start, end, length; + uchar curr = desc_pos[0]; + desc_pos++; + + memcpy(&start, desc_pos, sizeof(start)); + desc_pos += sizeof(start); + + memcpy(&end, desc_pos, sizeof(end)); + desc_pos += sizeof(end); + + assert (start <= end); + + if (curr == CK_FIX_RANGE) { + length = end - start; + + memcpy(fixed_dest_ptr, fixed_src_ptr + start, length); + fixed_dest_ptr += length; + } + else if (curr == CK_VAR_RANGE) { + uint32_t start_data_size; + uint32_t start_data_offset; + uint32_t end_data_size; + uint32_t end_data_offset; + uint32_t offset_diffs; + + get_var_field_info( + &start_data_size, + &start_data_offset, + start, + var_src_offset_ptr, + num_offset_bytes + ); + get_var_field_info( + &end_data_size, + &end_data_offset, + end, + var_src_offset_ptr, + num_offset_bytes + ); + length = end_data_offset + end_data_size - start_data_offset; + // + // copy 
the data + // + memcpy( + var_dest_data_ptr, + var_src_data_ptr + start_data_offset, + length + ); + var_dest_data_ptr += length; + + // + // put in offset info + // + offset_diffs = (end_data_offset + end_data_size) - (uint32_t)(var_dest_data_ptr - orig_var_dest_data_ptr); + for (uint32_t i = start; i <= end; i++) { + if ( num_offset_bytes == 1 ) { + assert(offset_diffs < 256); + var_dest_offset_ptr[0] = var_src_offset_ptr[i] - (uchar)offset_diffs; + var_dest_offset_ptr++; + } + else if ( num_offset_bytes == 2 ) { + uint32_t tmp = uint2korr(var_src_offset_ptr + 2*i); + uint32_t new_offset = tmp - offset_diffs; + assert(new_offset < 1<<16); + int2store(var_dest_offset_ptr,new_offset); + var_dest_offset_ptr += 2; + } + else { + assert(false); + } + } + } + else { + assert(false); + } + } + // + // copy blobs + // at this point, var_dest_data_ptr is pointing to the end, where blobs should be located + // so, we put the blobs at var_dest_data_ptr + // + if (has_blobs) { + uint32_t num_blob_bytes; + uint32_t start_offset; + uchar* src_blob_ptr = NULL; + get_blob_field_info( + &start_offset, + src_mcp_info.len_of_offsets, + var_src_data_ptr, + num_offset_bytes + ); + src_blob_ptr = var_src_data_ptr + start_offset; + num_blob_bytes = pk_val->size - (start_offset + (var_src_data_ptr - null_bytes_src_ptr)); + memcpy(var_dest_data_ptr, src_blob_ptr, num_blob_bytes); + var_dest_data_ptr += num_blob_bytes; + } + return var_dest_data_ptr - buf; +} + + +uint32_t get_max_secondary_key_pack_desc_size( + KEY_AND_COL_INFO* kc_info + ) +{ + uint32_t ret_val = 0; + // + // the fixed stuff: + // byte that states if main dictionary + // byte that states if hpk + // the things in pack_some_row_info + ret_val++; + ret_val++; + ret_val += sizeof(uint32_t) + sizeof(MULTI_COL_PACK_INFO) + 1; + // + // now variable sized stuff + // + + // first the blobs + ret_val += sizeof(kc_info->num_blobs); + ret_val+= kc_info->num_blobs; + + // then the pk + // one byte for num key parts + // two bytes 
for each key part + ret_val++; + ret_val += MAX_REF_PARTS*2; + + // then the key + // null bit, then null byte, + // then 1 byte stating what it is, then 4 for offset, 4 for key length, + // 1 for if charset exists, and 4 for charset + ret_val += MAX_REF_PARTS*(1 + sizeof(uint32_t) + 1 + 3*sizeof(uint32_t) + 1); + // + // four bytes storing the length of this portion + // + ret_val += 4; + return ret_val; +} + +uint32_t create_toku_secondary_key_pack_descriptor ( + uchar* buf, + bool has_hpk, + uint pk_index, + TABLE_SHARE* table_share, + TABLE* table, + KEY_AND_COL_INFO* kc_info, + KEY* key_info, + KEY* prim_key + ) +{ + // + // The first four bytes always contain the offset of where the first key + // ends. + // + uchar* pk_info = NULL; + uchar* pos = buf + 4; + uint32_t offset = 0; + + // + // first byte states that it is NOT main dictionary + // + pos[0] = 0; + pos++; + + // + // one byte states if main dictionary has an hpk or not + // + if (has_hpk) { + pos[0] = 1; + } + else { + pos[0] = 0; + } + pos++; + + pos += pack_some_row_info( + pos, + pk_index, + table_share, + kc_info + ); + + // + // store blob information + // + memcpy(pos, &kc_info->num_blobs, sizeof(kc_info->num_blobs)); + pos += sizeof(uint32_t); + for (uint32_t i = 0; i < kc_info->num_blobs; i++) { + // + // store length bytes for each blob + // + Field* field = table_share->field[kc_info->blob_fields[i]]; + pos[0] = (uchar)field->row_pack_length(); + pos++; + } + + // + // store the pk information + // + if (has_hpk) { + pos[0] = 0; + pos++; + } + else { + // + // store number of parts + // + assert(get_key_parts(prim_key) < 128); + pos[0] = 2 * get_key_parts(prim_key); + pos++; + // + // for each part, store if it is a fixed field or var field + // if fixed, store number of bytes, if var, store + // number of length bytes + // total should be two bytes per key part stored + // + pk_info = pos; + uchar* tmp = pos; + for (uint i = 0; i < get_key_parts(prim_key); i++) { + tmp += 
pack_desc_pk_info( + tmp, + kc_info, + table_share, + &prim_key->key_part[i] + ); + } + // + // asserting that we moved forward as much as we think we have + // + assert(tmp - pos == (2 * get_key_parts(prim_key))); + pos = tmp; + } + + for (uint i = 0; i < get_key_parts(key_info); i++) { + KEY_PART_INFO curr_kpi = key_info->key_part[i]; + uint16 field_index = curr_kpi.field->field_index; + Field* field = table_share->field[field_index]; + bool is_col_in_pk = false; + + if (bitmap_is_set(&kc_info->key_filters[pk_index],field_index)) { + assert(!has_hpk && prim_key != NULL); + is_col_in_pk = true; + } + else { + is_col_in_pk = false; + } + + pos[0] = field->null_bit; + pos++; + + if (is_col_in_pk) { + // + // assert that columns in pk do not have a null bit + // because in MySQL, pk columns cannot be null + // + assert(!field->null_bit); + } + + if (field->null_bit) { + uint32_t null_offset = get_null_offset(table,table->field[field_index]); + memcpy(pos, &null_offset, sizeof(uint32_t)); + pos += sizeof(uint32_t); + } + if (is_col_in_pk) { + pos += pack_desc_pk_offset_info( + pos, + kc_info, + table_share, + &curr_kpi, + prim_key, + pk_info + ); + } + else { + pos += pack_desc_offset_info( + pos, + kc_info, + pk_index, + table_share, + &curr_kpi + ); + } + pos += pack_desc_key_length_info( + pos, + kc_info, + table_share, + &curr_kpi + ); + pos += pack_desc_char_info( + pos, + kc_info, + table_share, + &curr_kpi + ); + } + + offset = pos - buf; + buf[0] = (uchar)(offset & 255); + buf[1] = (uchar)((offset >> 8) & 255); + buf[2] = (uchar)((offset >> 16) & 255); + buf[3] = (uchar)((offset >> 24) & 255); + + return pos - buf; +} + +uint32_t skip_key_in_desc( + uchar* row_desc + ) +{ + uchar* pos = row_desc; + uchar col_bin_or_char; + // + // skip the byte that states if it is a fix field or var field, we do not care + // + pos++; + + // + // skip the offset information + // + pos += sizeof(uint32_t); + + // + // skip the key_part_length info + // + pos += 
sizeof(uint32_t); + col_bin_or_char = pos[0]; + pos++; + if (col_bin_or_char == COL_HAS_NO_CHARSET) { + goto exit; + } + // + // skip the charset info + // + pos += 4; + + +exit: + return (uint32_t)(pos-row_desc); +} + + +uint32_t max_key_size_from_desc( + void* row_desc, + uint32_t row_desc_size + ) +{ + uchar* desc_pos = (uchar *)row_desc; + uint32_t num_blobs; + uint32_t num_pk_columns; + // + // start at 1 for the infinity byte + // + uint32_t max_size = 1; + + // skip byte that states if main dictionary + bool is_main_dictionary = desc_pos[0]; + desc_pos++; + assert(!is_main_dictionary); + + // skip hpk byte + desc_pos++; + + // skip num_null_bytes + desc_pos += sizeof(uint32_t); + + // skip mcp_info + desc_pos += sizeof(MULTI_COL_PACK_INFO); + + // skip offset_bytes + desc_pos++; + + // skip over blobs + memcpy(&num_blobs, desc_pos, sizeof(num_blobs)); + desc_pos += sizeof(num_blobs); + desc_pos += num_blobs; + + // skip over pk info + num_pk_columns = desc_pos[0]/2; + desc_pos++; + desc_pos += 2*num_pk_columns; + + while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) { + uchar has_charset; + uint32_t key_length = 0; + + uchar null_bit = desc_pos[0]; + desc_pos++; + + if (null_bit) { + // + // column is NULLable, skip null_offset, and add a null byte + // + max_size++; + desc_pos += sizeof(uint32_t); + } + // + // skip over byte that states if fix or var + // + desc_pos++; + + // skip over offset + desc_pos += sizeof(uint32_t); + + // + // get the key length and add it to return value + // + memcpy(&key_length, desc_pos, sizeof(key_length)); + desc_pos += sizeof(key_length); + max_size += key_length; + max_size += 2; // 2 bytes for a potential length bytes, we are upperbounding, does not need to be super tight + + has_charset = desc_pos[0]; + desc_pos++; + + uint32_t charset_num; + if (has_charset == COL_HAS_CHARSET) { + // skip over charsent num + desc_pos += sizeof(charset_num); + } + else { + assert(has_charset == COL_HAS_NO_CHARSET); + } + } 
+ return max_size; +} + +uint32_t pack_key_from_desc( + uchar* buf, + void* row_desc, + uint32_t row_desc_size, + const DBT* pk_key, + const DBT* pk_val + ) +{ + MULTI_COL_PACK_INFO mcp_info; + uint32_t num_null_bytes; + uint32_t num_blobs; + uint32_t num_pk_columns; + uchar* blob_lengths = NULL; + uchar* pk_info = NULL; + uchar* pk_data_ptr = NULL; + uchar* null_bytes_ptr = NULL; + uchar* fixed_field_ptr = NULL; + uchar* var_field_offset_ptr = NULL; + const uchar* var_field_data_ptr = NULL; + uint32_t num_offset_bytes; + uchar* packed_key_pos = buf; + uchar* desc_pos = (uchar *)row_desc; + + bool is_main_dictionary = desc_pos[0]; + desc_pos++; + assert(!is_main_dictionary); + + // + // get the constant info out of descriptor + // + bool hpk = desc_pos[0]; + desc_pos++; + + memcpy(&num_null_bytes, desc_pos, sizeof(num_null_bytes)); + desc_pos += sizeof(num_null_bytes); + + memcpy(&mcp_info, desc_pos, sizeof(mcp_info)); + desc_pos += sizeof(mcp_info); + + num_offset_bytes = desc_pos[0]; + desc_pos++; + + memcpy(&num_blobs, desc_pos, sizeof(num_blobs)); + desc_pos += sizeof(num_blobs); + + blob_lengths = desc_pos; + desc_pos += num_blobs; + + num_pk_columns = desc_pos[0]/2; + desc_pos++; + pk_info = desc_pos; + desc_pos += 2*num_pk_columns; + + // + // now start packing the key + // + + // + // pack the infinity byte + // + packed_key_pos[0] = COL_ZERO; + packed_key_pos++; + // + // now start packing each column of the key, as described in descriptor + // + if (!hpk) { + // +1 for the infinity byte + pk_data_ptr = (uchar *)pk_key->data + 1; + } + null_bytes_ptr = (uchar *)pk_val->data; + fixed_field_ptr = null_bytes_ptr + num_null_bytes; + var_field_offset_ptr = fixed_field_ptr + mcp_info.fixed_field_size; + var_field_data_ptr = var_field_offset_ptr + mcp_info.len_of_offsets; + while ( (uint32_t)(desc_pos - (uchar *)row_desc) < row_desc_size) { + uchar col_fix_val; + uchar has_charset; + uint32_t col_pack_val = 0; + uint32_t key_length = 0; + + uchar null_bit = 
desc_pos[0]; + desc_pos++; + + if (null_bit) { + // + // column is NULLable, need to check the null bytes to see if it is NULL + // + uint32_t null_offset = 0; + bool is_field_null; + memcpy(&null_offset, desc_pos, sizeof(null_offset)); + desc_pos += sizeof(null_offset); + + is_field_null = (null_bytes_ptr[null_offset] & null_bit) ? true: false; + if (is_field_null) { + packed_key_pos[0] = NULL_COL_VAL; + packed_key_pos++; + desc_pos += skip_key_in_desc(desc_pos); + continue; + } + else { + packed_key_pos[0] = NONNULL_COL_VAL; + packed_key_pos++; + } + } + // + // now pack the column (unless it was NULL, and we continued) + // + col_fix_val = desc_pos[0]; + desc_pos++; + + memcpy(&col_pack_val, desc_pos, sizeof(col_pack_val)); + desc_pos += sizeof(col_pack_val); + + memcpy(&key_length, desc_pos, sizeof(key_length)); + desc_pos += sizeof(key_length); + + has_charset = desc_pos[0]; + desc_pos++; + + uint32_t charset_num = 0; + if (has_charset == COL_HAS_CHARSET) { + memcpy(&charset_num, desc_pos, sizeof(charset_num)); + desc_pos += sizeof(charset_num); + } + else { + assert(has_charset == COL_HAS_NO_CHARSET); + } + // + // case where column is in pk val + // + if (col_fix_val == COL_FIX_FIELD || col_fix_val == COL_VAR_FIELD || col_fix_val == COL_BLOB_FIELD) { + if (col_fix_val == COL_FIX_FIELD && has_charset == COL_HAS_NO_CHARSET) { + memcpy(packed_key_pos, &fixed_field_ptr[col_pack_val], key_length); + packed_key_pos += key_length; + } + else if (col_fix_val == COL_VAR_FIELD && has_charset == COL_HAS_NO_CHARSET) { + uint32_t data_start_offset = 0; + + uint32_t data_size = 0; + get_var_field_info( + &data_size, + &data_start_offset, + col_pack_val, + var_field_offset_ptr, + num_offset_bytes + ); + + // + // length of this field in this row is data_size + // data is located beginning at var_field_data_ptr + data_start_offset + // + packed_key_pos = pack_toku_varbinary_from_desc( + packed_key_pos, + var_field_data_ptr + data_start_offset, + key_length, //number of 
bytes to use to encode the length in to_tokudb + data_size //length of field + ); + } + else { + const uchar* data_start = NULL; + uint32_t data_start_offset = 0; + uint32_t data_size = 0; + + if (col_fix_val == COL_FIX_FIELD) { + data_start_offset = col_pack_val; + data_size = key_length; + data_start = fixed_field_ptr + data_start_offset; + } + else if (col_fix_val == COL_VAR_FIELD){ + get_var_field_info( + &data_size, + &data_start_offset, + col_pack_val, + var_field_offset_ptr, + num_offset_bytes + ); + data_start = var_field_data_ptr + data_start_offset; + } + else if (col_fix_val == COL_BLOB_FIELD) { + uint32_t blob_index = col_pack_val; + uint32_t blob_offset; + const uchar* blob_ptr = NULL; + uint32_t field_len; + uint32_t field_len_bytes = blob_lengths[blob_index]; + get_blob_field_info( + &blob_offset, + mcp_info.len_of_offsets, + var_field_data_ptr, + num_offset_bytes + ); + blob_ptr = var_field_data_ptr + blob_offset; + assert(num_blobs > 0); + // + // skip over other blobs to get to the one we want to make a key out of + // + for (uint32_t i = 0; i < blob_index; i++) { + blob_ptr = unpack_toku_field_blob( + NULL, + blob_ptr, + blob_lengths[i], + true + ); + } + // + // at this point, blob_ptr is pointing to the blob we want to make a key from + // + field_len = get_blob_field_len(blob_ptr, field_len_bytes); + // + // now we set the variables to make the key + // + data_start = blob_ptr + field_len_bytes; + data_size = field_len; + + + } + else { + assert(false); + } + + packed_key_pos = pack_toku_varstring_from_desc( + packed_key_pos, + data_start, + key_length, + data_size, + charset_num + ); + } + } + // + // case where column is in pk key + // + else { + if (col_fix_val == COL_FIX_PK_OFFSET) { + memcpy(packed_key_pos, &pk_data_ptr[col_pack_val], key_length); + packed_key_pos += key_length; + } + else if (col_fix_val == COL_VAR_PK_OFFSET) { + uchar* tmp_pk_data_ptr = pk_data_ptr; + uint32_t index_in_pk = col_pack_val; + // + // skip along in pk to 
the right column + // + for (uint32_t i = 0; i < index_in_pk; i++) { + if (pk_info[2*i] == COL_FIX_FIELD) { + tmp_pk_data_ptr += pk_info[2*i + 1]; + } + else if (pk_info[2*i] == COL_VAR_FIELD) { + uint32_t len_bytes = pk_info[2*i + 1]; + uint32_t len; + if (len_bytes == 1) { + len = tmp_pk_data_ptr[0]; + tmp_pk_data_ptr++; + } + else if (len_bytes == 2) { + len = uint2korr(tmp_pk_data_ptr); + tmp_pk_data_ptr += 2; + } + else { + assert(false); + } + tmp_pk_data_ptr += len; + } + else { + assert(false); + } + } + // + // at this point, tmp_pk_data_ptr is pointing at the column + // + uint32_t is_fix_field = pk_info[2*index_in_pk]; + if (is_fix_field == COL_FIX_FIELD) { + memcpy(packed_key_pos, tmp_pk_data_ptr, key_length); + packed_key_pos += key_length; + } + else if (is_fix_field == COL_VAR_FIELD) { + const uchar* data_start = NULL; + uint32_t data_size = 0; + uint32_t len_bytes = pk_info[2*index_in_pk + 1]; + if (len_bytes == 1) { + data_size = tmp_pk_data_ptr[0]; + tmp_pk_data_ptr++; + } + else if (len_bytes == 2) { + data_size = uint2korr(tmp_pk_data_ptr); + tmp_pk_data_ptr += 2; + } + else { + assert(false); + } + data_start = tmp_pk_data_ptr; + + if (has_charset == COL_HAS_CHARSET) { + packed_key_pos = pack_toku_varstring_from_desc( + packed_key_pos, + data_start, + key_length, + data_size, + charset_num + ); + } + else if (has_charset == COL_HAS_NO_CHARSET) { + packed_key_pos = pack_toku_varbinary_from_desc( + packed_key_pos, + data_start, + key_length, + data_size //length of field + ); + } + else { + assert(false); + } + } + else { + assert(false); + } + } + else { + assert(false); + } + } + + } + assert( (uint32_t)(desc_pos - (uchar *)row_desc) == row_desc_size); + + // + // now append the primary key to the end of the key + // + if (hpk) { + memcpy(packed_key_pos, pk_key->data, pk_key->size); + packed_key_pos += pk_key->size; + } + else { + memcpy(packed_key_pos, (uchar *)pk_key->data + 1, pk_key->size - 1); + packed_key_pos += (pk_key->size - 1); + } + 
+ return (uint32_t)(packed_key_pos - buf); // +} + +bool fields_have_same_name( + Field* a, + Field* b + ) +{ + return strcmp(a->field_name, b->field_name) == 0; +} + +bool fields_are_same_type( + Field* a, + Field* b + ) +{ + bool retval = true; + enum_field_types a_mysql_type = a->real_type(); + enum_field_types b_mysql_type = b->real_type(); + TOKU_TYPE a_toku_type = mysql_to_toku_type(a); + TOKU_TYPE b_toku_type = mysql_to_toku_type(b); + // make sure have same names + // make sure have same types + if (a_mysql_type != b_mysql_type) { + retval = false; + goto cleanup; + } + // Thanks to MariaDB 5.5, we can have two fields + // be the same MySQL type but not the same toku type, + // This is an issue introduced with MariaDB's fractional time + // implementation + if (a_toku_type != b_toku_type) { + retval = false; + goto cleanup; + } + // make sure that either both are nullable, or both not nullable + if ((a->null_bit && !b->null_bit) || (!a->null_bit && b->null_bit)) { + retval = false; + goto cleanup; + } + switch (a_mysql_type) { + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_LONGLONG: + // length, unsigned, auto increment + if (a->pack_length() != b->pack_length() || + (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG) || + (a->flags & AUTO_INCREMENT_FLAG) != (b->flags & AUTO_INCREMENT_FLAG)) { + retval = false; + goto cleanup; + } + break; + case MYSQL_TYPE_DOUBLE: + case MYSQL_TYPE_FLOAT: + // length, unsigned, auto increment + if (a->pack_length() != b->pack_length() || + (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG) || + (a->flags & AUTO_INCREMENT_FLAG) != (b->flags & AUTO_INCREMENT_FLAG)) { + retval = false; + goto cleanup; + } + break; + case MYSQL_TYPE_NEWDECIMAL: + // length, unsigned + if (a->pack_length() != b->pack_length() || + (a->flags & UNSIGNED_FLAG) != (b->flags & UNSIGNED_FLAG)) { + retval = false; + goto cleanup; + } + break; + case MYSQL_TYPE_ENUM: + case 
MYSQL_TYPE_SET: + case MYSQL_TYPE_BIT: + // length + if (a->pack_length() != b->pack_length()) { + retval = false; + goto cleanup; + } + break; + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDATE: + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_TIMESTAMP: +#if 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699 + case MYSQL_TYPE_DATETIME2: + case MYSQL_TYPE_TIMESTAMP2: + case MYSQL_TYPE_TIME2: +#endif + // length + if (a->pack_length() != b->pack_length()) { + retval = false; + goto cleanup; + } + break; + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + // test the charset + if (a->charset()->number != b->charset()->number) { + retval = false; + goto cleanup; + } + if (a->row_pack_length() != b->row_pack_length()) { + retval = false; + goto cleanup; + } + break; + case MYSQL_TYPE_STRING: + if (a->pack_length() != b->pack_length()) { + retval = false; + goto cleanup; + } + // if both are binary, we know have same pack lengths, + // so we can goto end + if (a->binary() && b->binary()) { + // nothing to do, we are good + } + else if (!a->binary() && !b->binary()) { + // test the charset + if (a->charset()->number != b->charset()->number) { + retval = false; + goto cleanup; + } + } + else { + // one is binary and the other is not, so not the same + retval = false; + goto cleanup; + } + break; + case MYSQL_TYPE_VARCHAR: + if (a->field_length != b->field_length) { + retval = false; + goto cleanup; + } + // if both are binary, we know have same pack lengths, + // so we can goto end + if (a->binary() && b->binary()) { + // nothing to do, we are good + } + else if (!a->binary() && !b->binary()) { + // test the charset + if (a->charset()->number != b->charset()->number) { + retval = false; + goto cleanup; + } + } + else { + // one is binary and the other is not, so not the same + retval = false; + goto cleanup; + } + break; + // + // I believe these are old types 
that are no longer + // in any 5.1 tables, so tokudb does not need + // to worry about them + // Putting in this assert in case I am wrong. + // Do not support geometry yet. + // + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_DECIMAL: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_NULL: + assert(false); + } + +cleanup: + return retval; +} + + +bool are_two_fields_same( + Field* a, + Field* b + ) +{ + return fields_have_same_name(a, b) && fields_are_same_type(a, b); +} + + diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc index 886141bd5d3..725275cd963 100644 --- a/storage/tokudb/hatoku_hton.cc +++ b/storage/tokudb/hatoku_hton.cc @@ -87,2501 +87,2501 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
-/* -*- mode: C; c-basic-offset: 4 -*- */ -#define MYSQL_SERVER 1 -#include "hatoku_defines.h" -#include - -#include "stdint.h" -#if defined(_WIN32) -#include "misc.h" -#endif -#define __STDC_FORMAT_MACROS -#include -#include "toku_os.h" -#include "toku_time.h" -#include "partitioned_counter.h" - -/* We define DTRACE after mysql_priv.h in case it disabled dtrace in the main server */ -#ifdef HAVE_DTRACE -#define _DTRACE_VERSION 1 -#else -#endif - -#include -#include "hatoku_hton.h" -#include "ha_tokudb.h" - -#undef PACKAGE -#undef VERSION -#undef HAVE_DTRACE -#undef _DTRACE_VERSION - -#define TOKU_METADB_NAME "tokudb_meta" - -typedef struct savepoint_info { - DB_TXN* txn; - tokudb_trx_data* trx; - bool in_sub_stmt; -} *SP_INFO, SP_INFO_T; - -static uchar *tokudb_get_key(TOKUDB_SHARE * share, size_t * length, my_bool not_used __attribute__ ((unused))) { - *length = share->table_name_length; - return (uchar *) share->table_name; -} - -static handler *tokudb_create_handler(handlerton * hton, TABLE_SHARE * table, MEM_ROOT * mem_root); - -static MYSQL_THDVAR_BOOL(commit_sync, - PLUGIN_VAR_THDLOCAL, - "sync on txn commit", - /* check */ NULL, - /* update */ NULL, - /* default*/ true -); -static MYSQL_THDVAR_UINT(pk_insert_mode, - 0, - "set the primary key insert mode", - NULL, - NULL, - 1, // default - 0, // min? 
- 2, // max - 1 // blocksize -); -static MYSQL_THDVAR_BOOL(load_save_space, - 0, - "if on, intial loads are slower but take less space", - NULL, - NULL, - false -); -static MYSQL_THDVAR_BOOL(disable_slow_alter, - 0, - "if on, alter tables that require copy are disabled", - NULL, - NULL, - false -); -static MYSQL_THDVAR_BOOL(disable_hot_alter, - 0, - "if on, hot alter table is disabled", - NULL, - NULL, - false -); -static MYSQL_THDVAR_BOOL(create_index_online, - 0, - "if on, create index done online", - NULL, - NULL, - true -); -static MYSQL_THDVAR_BOOL(disable_prefetching, - 0, - "if on, prefetching disabled", - NULL, - NULL, - false -); -static MYSQL_THDVAR_BOOL(prelock_empty, - 0, - "Tokudb Prelock Empty Table", - NULL, - NULL, - true -); -static MYSQL_THDVAR_BOOL(log_client_errors, - 0, - "Tokudb Log Client Errors", - NULL, - NULL, - false -); -static MYSQL_THDVAR_UINT(block_size, - 0, - "fractal tree block size", - NULL, - NULL, - 4<<20, // default - 4096, // min - ~0U, // max - 1 // blocksize??? -); -static MYSQL_THDVAR_UINT(read_block_size, - 0, - "fractal tree read block size", - NULL, - NULL, - 128*1024, // default - 4096, // min - ~0U, // max - 1 // blocksize??? -); -static MYSQL_THDVAR_UINT(read_buf_size, - 0, - "fractal tree read block size", //TODO: Is this a typo? - NULL, - NULL, - 128*1024, // default - 0, // min - 1*1024*1024, // max - 1 // blocksize??? -); -#if TOKU_INCLUDE_UPSERT -static MYSQL_THDVAR_BOOL(disable_slow_update, - PLUGIN_VAR_THDLOCAL, - "disable slow update", - NULL, // check - NULL, // update - false // default -); -static MYSQL_THDVAR_BOOL(disable_slow_upsert, - PLUGIN_VAR_THDLOCAL, - "disable slow upsert", - NULL, // check - NULL, // update - false // default -); -#endif -static MYSQL_THDVAR_UINT(analyze_time, - 0, - "analyze time", - NULL, - NULL, - 60, // default - 0, // min - ~0U, // max - 1 // blocksize??? 
-); - -static void tokudb_checkpoint_lock(THD * thd); -static void tokudb_checkpoint_unlock(THD * thd); - -static void tokudb_checkpoint_lock_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save) -{ - my_bool* val = (my_bool *) var_ptr; - *val= *(my_bool *) save ? true : false; - if (*val) { - tokudb_checkpoint_lock(thd); - } - else { - tokudb_checkpoint_unlock(thd); - } -} - -static MYSQL_THDVAR_BOOL(checkpoint_lock, - 0, - "Tokudb Checkpoint Lock", - NULL, - tokudb_checkpoint_lock_update, - false -); - -static const char *tokudb_row_format_names[] = { - "tokudb_uncompressed", - "tokudb_zlib", - "tokudb_quicklz", - "tokudb_lzma", - "tokudb_fast", - "tokudb_small", - "tokudb_default", - NullS -}; - -static TYPELIB tokudb_row_format_typelib = { - array_elements(tokudb_row_format_names) - 1, - "tokudb_row_format_typelib", - tokudb_row_format_names, - NULL -}; - -static MYSQL_THDVAR_ENUM(row_format, PLUGIN_VAR_OPCMDARG, - "Specifies the compression method for a table during this session. 
" - "Possible values are TOKUDB_UNCOMPRESSED, TOKUDB_ZLIB, TOKUDB_QUICKLZ, " - "TOKUDB_LZMA, TOKUDB_FAST, TOKUDB_SMALL and TOKUDB_DEFAULT", - NULL, NULL, SRV_ROW_FORMAT_DEFAULT, &tokudb_row_format_typelib); - -srv_row_format_t get_row_format(THD *thd) -{ - return (srv_row_format_t) THDVAR(thd, row_format); -} - -static void tokudb_print_error(const DB_ENV * db_env, const char *db_errpfx, const char *buffer); -static void tokudb_cleanup_log_files(void); -static int tokudb_end(handlerton * hton, ha_panic_function type); -static bool tokudb_flush_logs(handlerton * hton); -static bool tokudb_show_status(handlerton * hton, THD * thd, stat_print_fn * print, enum ha_stat_type); -static int tokudb_close_connection(handlerton * hton, THD * thd); -static int tokudb_commit(handlerton * hton, THD * thd, bool all); -static int tokudb_rollback(handlerton * hton, THD * thd, bool all); -#if TOKU_INCLUDE_XA -static int tokudb_xa_prepare(handlerton* hton, THD* thd, bool all); -static int tokudb_xa_recover(handlerton* hton, XID* xid_list, uint len); -static int tokudb_commit_by_xid(handlerton* hton, XID* xid); -static int tokudb_rollback_by_xid(handlerton* hton, XID* xid); -#endif - -static int tokudb_rollback_to_savepoint(handlerton * hton, THD * thd, void *savepoint); -static int tokudb_savepoint(handlerton * hton, THD * thd, void *savepoint); -static int tokudb_release_savepoint(handlerton * hton, THD * thd, void *savepoint); -static int tokudb_discover(handlerton *hton, THD* thd, const char *db, const char *name, uchar **frmblob, size_t *frmlen); -static int tokudb_discover2(handlerton *hton, THD* thd, const char *db, const char *name, bool translate_name,uchar **frmblob, size_t *frmlen); -handlerton *tokudb_hton; - -const char *ha_tokudb_ext = ".tokudb"; -char *tokudb_data_dir; -ulong tokudb_debug; -DB_ENV *db_env; -DB* metadata_db; -HASH tokudb_open_tables; -pthread_mutex_t tokudb_mutex; -pthread_mutex_t tokudb_meta_mutex; - -static PARTITIONED_COUNTER 
tokudb_primary_key_bytes_inserted; -void toku_hton_update_primary_key_bytes_inserted(uint64_t row_size) { - increment_partitioned_counter(tokudb_primary_key_bytes_inserted, row_size); -} - -static ulonglong tokudb_lock_timeout; -static ulong tokudb_cleaner_period; -static ulong tokudb_cleaner_iterations; - -#define ASSERT_MSGLEN 1024 - -void toku_hton_assert_fail(const char* expr_as_string, const char * fun, const char * file, int line, int caller_errno) { - char msg[ASSERT_MSGLEN]; - if (db_env) { - snprintf(msg, ASSERT_MSGLEN, "Handlerton: %s ", expr_as_string); - db_env->crash(db_env, msg, fun, file, line,caller_errno); - } - else { - snprintf(msg, ASSERT_MSGLEN, "Handlerton assertion failed, no env, %s, %d, %s, %s (errno=%d)\n", file, line, fun, expr_as_string, caller_errno); - perror(msg); - fflush(stderr); - } - abort(); -} - -//my_bool tokudb_shared_data = false; -static uint32_t tokudb_init_flags = - DB_CREATE | DB_THREAD | DB_PRIVATE | - DB_INIT_LOCK | - DB_INIT_MPOOL | - DB_INIT_TXN | - DB_INIT_LOG | - DB_RECOVER; -static uint32_t tokudb_env_flags = 0; -// static uint32_t tokudb_lock_type = DB_LOCK_DEFAULT; -// static ulong tokudb_log_buffer_size = 0; -// static ulong tokudb_log_file_size = 0; -static my_bool tokudb_directio = FALSE; -static my_bool tokudb_checkpoint_on_flush_logs = FALSE; -static ulonglong tokudb_cache_size = 0; -static ulonglong tokudb_max_lock_memory = 0; -static char *tokudb_home; -static char *tokudb_tmp_dir; -static char *tokudb_log_dir; -// static long tokudb_lock_scan_time = 0; -// static ulong tokudb_region_size = 0; -// static ulong tokudb_cache_parts = 1; -const char *tokudb_hton_name = "TokuDB"; -static uint32_t tokudb_checkpointing_period; -static uint32_t tokudb_fsync_log_period; -uint32_t tokudb_write_status_frequency; -uint32_t tokudb_read_status_frequency; -#ifdef TOKUDB_VERSION -char *tokudb_version = (char*) TOKUDB_VERSION; -#else -char *tokudb_version; -#endif -static int tokudb_fs_reserve_percent; // file system 
reserve as a percentage of total disk space - -#if defined(_WIN32) -extern "C" { -#include "ydb.h" -} -#endif - -// A flag set if the handlerton is in an initialized, usable state, -// plus a reader-write lock to protect it without serializing reads. -// Since we don't have static initializers for the opaque rwlock type, -// use constructor and destructor functions to create and destroy -// the lock before and after main(), respectively. -static int tokudb_hton_initialized; -static rw_lock_t tokudb_hton_initialized_lock; - -static void create_tokudb_hton_intialized_lock(void) __attribute__((constructor)); -static void destroy_tokudb_hton_initialized_lock(void) __attribute__((destructor)); - -static void create_tokudb_hton_intialized_lock(void) -{ - my_rwlock_init(&tokudb_hton_initialized_lock, 0); -} - -static void destroy_tokudb_hton_initialized_lock(void) -{ - rwlock_destroy(&tokudb_hton_initialized_lock); -} - -static SHOW_VAR *toku_global_status_variables = NULL; -static uint64_t toku_global_status_max_rows; -static TOKU_ENGINE_STATUS_ROW_S* toku_global_status_rows = NULL; - -static int tokudb_init_func(void *p) { - TOKUDB_DBUG_ENTER("tokudb_init_func"); - int r; -#if defined(_WIN64) - r = toku_ydb_init(); - if (r) { - printf("got error %d\n", r); - goto error; - } -#endif - - // 3938: lock the handlerton's initialized status flag for writing - r = rw_wrlock(&tokudb_hton_initialized_lock); - assert(r == 0); - - db_env = NULL; - metadata_db = NULL; - - tokudb_hton = (handlerton *) p; - - pthread_mutex_init(&tokudb_mutex, MY_MUTEX_INIT_FAST); - pthread_mutex_init(&tokudb_meta_mutex, MY_MUTEX_INIT_FAST); - (void) my_hash_init(&tokudb_open_tables, table_alias_charset, 32, 0, 0, (my_hash_get_key) tokudb_get_key, 0, 0); - - tokudb_hton->state = SHOW_OPTION_YES; - // tokudb_hton->flags= HTON_CAN_RECREATE; // QQQ this came from skeleton - tokudb_hton->flags = HTON_CLOSE_CURSORS_AT_COMMIT; - -#if TOKU_INCLUDE_OTHER_DB_TYPE - // we have historically been a dynamic 
storage engine, so we set db_type according. - // however, extended keys is triggered off of the db_type, so tokudb adds another type so that extended keys works - tokudb_hton->db_type = DB_TYPE_UNKNOWN; - tokudb_hton->other_db_type = DB_TYPE_TOKUDB; -#else - tokudb_hton->db_type = DB_TYPE_TOKUDB; -#endif - - tokudb_hton->create = tokudb_create_handler; - tokudb_hton->close_connection = tokudb_close_connection; - - tokudb_hton->savepoint_offset = sizeof(SP_INFO_T); - tokudb_hton->savepoint_set = tokudb_savepoint; - tokudb_hton->savepoint_rollback = tokudb_rollback_to_savepoint; - tokudb_hton->savepoint_release = tokudb_release_savepoint; - - tokudb_hton->discover = tokudb_discover; -#if defined(MYSQL_HANDLERTON_INCLUDE_DISCOVER2) - tokudb_hton->discover2 = tokudb_discover2; -#endif - tokudb_hton->commit = tokudb_commit; - tokudb_hton->rollback = tokudb_rollback; -#if TOKU_INCLUDE_XA - tokudb_hton->prepare=tokudb_xa_prepare; - tokudb_hton->recover=tokudb_xa_recover; - tokudb_hton->commit_by_xid=tokudb_commit_by_xid; - tokudb_hton->rollback_by_xid=tokudb_rollback_by_xid; -#endif - - tokudb_hton->panic = tokudb_end; - tokudb_hton->flush_logs = tokudb_flush_logs; - tokudb_hton->show_status = tokudb_show_status; - if (!tokudb_home) - tokudb_home = mysql_real_data_home; - DBUG_PRINT("info", ("tokudb_home: %s", tokudb_home)); -#if 0 - if (!tokudb_log_buffer_size) { // QQQ - tokudb_log_buffer_size = max(table_cache_size * 512, 32 * 1024); - DBUG_PRINT("info", ("computing tokudb_log_buffer_size %ld\n", tokudb_log_buffer_size)); - } - tokudb_log_file_size = tokudb_log_buffer_size * 4; - tokudb_log_file_size = MY_ALIGN(tokudb_log_file_size, 1024 * 1024L); - tokudb_log_file_size = max(tokudb_log_file_size, 10 * 1024 * 1024L); - DBUG_PRINT("info", ("computing tokudb_log_file_size: %ld\n", tokudb_log_file_size)); -#endif - if ((r = db_env_create(&db_env, 0))) { - DBUG_PRINT("info", ("db_env_create %d\n", r)); - goto error; - } - - DBUG_PRINT("info", ("tokudb_env_flags: 0x%x\n", 
tokudb_env_flags)); - r = db_env->set_flags(db_env, tokudb_env_flags, 1); - if (r) { // QQQ - if (tokudb_debug & TOKUDB_DEBUG_INIT) - TOKUDB_TRACE("%s:WARNING: flags=%x r=%d\n", __FUNCTION__, tokudb_env_flags, r); - // goto error; - } - - // config error handling - db_env->set_errcall(db_env, tokudb_print_error); - db_env->set_errpfx(db_env, "TokuDB"); - - // - // set default comparison functions - // - r = db_env->set_default_bt_compare(db_env, tokudb_cmp_dbt_key); - if (r) { - DBUG_PRINT("info", ("set_default_bt_compare%d\n", r)); - goto error; - } - - { - char *tmp_dir = tokudb_tmp_dir; - char *data_dir = tokudb_data_dir; - if (data_dir == 0) { - data_dir = mysql_data_home; - } - if (tmp_dir == 0) { - tmp_dir = data_dir; - } - DBUG_PRINT("info", ("tokudb_data_dir: %s\n", data_dir)); - db_env->set_data_dir(db_env, data_dir); - - DBUG_PRINT("info", ("tokudb_tmp_dir: %s\n", tmp_dir)); - db_env->set_tmp_dir(db_env, tmp_dir); - } - - if (tokudb_log_dir) { - DBUG_PRINT("info", ("tokudb_log_dir: %s\n", tokudb_log_dir)); - db_env->set_lg_dir(db_env, tokudb_log_dir); - } - - // config the cache table size to min(1/2 of physical memory, 1/8 of the process address space) - if (tokudb_cache_size == 0) { - uint64_t physmem, maxdata; - physmem = toku_os_get_phys_memory_size(); - tokudb_cache_size = physmem / 2; - r = toku_os_get_max_process_data_size(&maxdata); - if (r == 0) { - if (tokudb_cache_size > maxdata / 8) - tokudb_cache_size = maxdata / 8; - } - } - if (tokudb_cache_size) { - DBUG_PRINT("info", ("tokudb_cache_size: %lld\n", tokudb_cache_size)); - r = db_env->set_cachesize(db_env, (uint32_t)(tokudb_cache_size >> 30), (uint32_t)(tokudb_cache_size % (1024L * 1024L * 1024L)), 1); - if (r) { - DBUG_PRINT("info", ("set_cachesize %d\n", r)); - goto error; - } - } - if (tokudb_max_lock_memory == 0) { - tokudb_max_lock_memory = tokudb_cache_size/8; - } - if (tokudb_max_lock_memory) { - DBUG_PRINT("info", ("tokudb_max_lock_memory: %lld\n", tokudb_max_lock_memory)); - r = 
db_env->set_lk_max_memory(db_env, tokudb_max_lock_memory); - if (r) { - DBUG_PRINT("info", ("set_lk_max_memory %d\n", r)); - goto error; - } - } - - uint32_t gbytes, bytes; int parts; - r = db_env->get_cachesize(db_env, &gbytes, &bytes, &parts); - if (r == 0) - if (tokudb_debug & TOKUDB_DEBUG_INIT) - TOKUDB_TRACE("%s:tokudb_cache_size=%lld\n", __FUNCTION__, ((unsigned long long) gbytes << 30) + bytes); - -#if 0 - // QQQ config the logs - DBUG_PRINT("info", ("tokudb_log_file_size: %ld\n", tokudb_log_file_size)); - db_env->set_lg_max(db_env, tokudb_log_file_size); - DBUG_PRINT("info", ("tokudb_log_buffer_size: %ld\n", tokudb_log_buffer_size)); - db_env->set_lg_bsize(db_env, tokudb_log_buffer_size); - // DBUG_PRINT("info",("tokudb_region_size: %ld\n", tokudb_region_size)); - // db_env->set_lg_regionmax(db_env, tokudb_region_size); -#endif - - if (db_env->set_redzone) { - r = db_env->set_redzone(db_env, tokudb_fs_reserve_percent); - if (r && (tokudb_debug & TOKUDB_DEBUG_INIT)) - TOKUDB_TRACE("%s:%d r=%d\n", __FUNCTION__, __LINE__, r); - } - - if (tokudb_debug & TOKUDB_DEBUG_INIT) TOKUDB_TRACE("%s:env open:flags=%x\n", __FUNCTION__, tokudb_init_flags); - - r = db_env->set_generate_row_callback_for_put(db_env,generate_row_for_put); - assert(r == 0); - r = db_env->set_generate_row_callback_for_del(db_env,generate_row_for_del); - assert(r == 0); - db_env->set_update(db_env, tokudb_update_fun); - db_env_set_direct_io(tokudb_directio == TRUE); - db_env->change_fsync_log_period(db_env, tokudb_fsync_log_period); - r = db_env->open(db_env, tokudb_home, tokudb_init_flags, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH); - - if (tokudb_debug & TOKUDB_DEBUG_INIT) TOKUDB_TRACE("%s:env opened:return=%d\n", __FUNCTION__, r); - - if (r) { - DBUG_PRINT("info", ("env->open %d\n", r)); - goto error; - } - - r = db_env->checkpointing_set_period(db_env, tokudb_checkpointing_period); - assert(r == 0); - r = db_env->cleaner_set_period(db_env, tokudb_cleaner_period); - assert(r == 0); - r = 
db_env->cleaner_set_iterations(db_env, tokudb_cleaner_iterations); - assert(r == 0); - - r = db_env->set_lock_timeout(db_env, tokudb_lock_timeout); - assert(r == 0); - - r = db_env->get_engine_status_num_rows (db_env, &toku_global_status_max_rows); - assert(r == 0); - - { - const myf mem_flags = MY_FAE|MY_WME|MY_ZEROFILL|MY_ALLOW_ZERO_PTR|MY_FREE_ON_ERROR; - toku_global_status_variables = (SHOW_VAR*)my_malloc(sizeof(*toku_global_status_variables)*toku_global_status_max_rows, mem_flags); - toku_global_status_rows = (TOKU_ENGINE_STATUS_ROW_S*)my_malloc(sizeof(*toku_global_status_rows)*toku_global_status_max_rows, mem_flags); - } - - r = db_create(&metadata_db, db_env, 0); - if (r) { - DBUG_PRINT("info", ("failed to create metadata db %d\n", r)); - goto error; - } - - - r= metadata_db->open(metadata_db, NULL, TOKU_METADB_NAME, NULL, DB_BTREE, DB_THREAD, 0); - if (r) { - if (r != ENOENT) { - sql_print_error("Got error %d when trying to open metadata_db", r); - goto error; - } - r = metadata_db->close(metadata_db,0); - assert(r == 0); - r = db_create(&metadata_db, db_env, 0); - if (r) { - DBUG_PRINT("info", ("failed to create metadata db %d\n", r)); - goto error; - } - - r= metadata_db->open(metadata_db, NULL, TOKU_METADB_NAME, NULL, DB_BTREE, DB_THREAD | DB_CREATE | DB_EXCL, my_umask); - if (r) { - goto error; - } - } - - - - tokudb_primary_key_bytes_inserted = create_partitioned_counter(); - - //3938: succeeded, set the init status flag and unlock - tokudb_hton_initialized = 1; - rw_unlock(&tokudb_hton_initialized_lock); - DBUG_RETURN(false); - -error: - if (metadata_db) { - int rr = metadata_db->close(metadata_db, 0); - assert(rr==0); - } - if (db_env) { - int rr= db_env->close(db_env, 0); - assert(rr==0); - db_env = 0; - } - - // 3938: failed to initialized, drop the flag and lock - tokudb_hton_initialized = 0; - rw_unlock(&tokudb_hton_initialized_lock); - DBUG_RETURN(true); -} - -static int tokudb_done_func(void *p) { - TOKUDB_DBUG_ENTER("tokudb_done_func"); - { - 
const myf mem_flags = MY_FAE|MY_WME|MY_ZEROFILL|MY_ALLOW_ZERO_PTR|MY_FREE_ON_ERROR; - my_free(toku_global_status_variables, mem_flags); - my_free(toku_global_status_rows, mem_flags); - toku_global_status_variables = NULL; - toku_global_status_rows = NULL; - } - my_hash_free(&tokudb_open_tables); - pthread_mutex_destroy(&tokudb_mutex); - pthread_mutex_destroy(&tokudb_meta_mutex); -#if defined(_WIN64) - toku_ydb_destroy(); -#endif - TOKUDB_DBUG_RETURN(0); -} - -static handler *tokudb_create_handler(handlerton * hton, TABLE_SHARE * table, MEM_ROOT * mem_root) { - return new(mem_root) ha_tokudb(hton, table); -} - -int tokudb_end(handlerton * hton, ha_panic_function type) { - TOKUDB_DBUG_ENTER("tokudb_end"); - int error = 0; - - // 3938: if we finalize the storage engine plugin, it is no longer - // initialized. grab a writer lock for the duration of the - // call, so we can drop the flag and destroy the mutexes - // in isolation. - rw_wrlock(&tokudb_hton_initialized_lock); - assert(tokudb_hton_initialized); - - if (metadata_db) { - int r = metadata_db->close(metadata_db, 0); - assert(r == 0); - } - if (db_env) { - if (tokudb_init_flags & DB_INIT_LOG) - tokudb_cleanup_log_files(); - error = db_env->close(db_env, 0); // Error is logged - assert(error==0); - db_env = NULL; - } - - // 3938: drop the initialized flag and unlock - tokudb_hton_initialized = 0; - rw_unlock(&tokudb_hton_initialized_lock); - - TOKUDB_DBUG_RETURN(error); -} - -static int tokudb_close_connection(handlerton * hton, THD * thd) { - int error = 0; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); - if (trx && trx->checkpoint_lock_taken) { - error = db_env->checkpointing_resume(db_env); - } - my_free(trx, MYF(0)); - return error; -} - -bool tokudb_flush_logs(handlerton * hton) { - TOKUDB_DBUG_ENTER("tokudb_flush_logs"); - int error; - bool result = 0; - - if (tokudb_checkpoint_on_flush_logs) { - // - // take the checkpoint - // - error = 
db_env->txn_checkpoint(db_env, 0, 0, 0); - if (error) { - my_error(ER_ERROR_DURING_CHECKPOINT, MYF(0), error); - result = 1; - goto exit; - } - } - else { - error = db_env->log_flush(db_env, NULL); - assert(error == 0); - } - - result = 0; -exit: - TOKUDB_DBUG_RETURN(result); -} - -uint get_pk_insert_mode(THD* thd) { - return THDVAR(thd, pk_insert_mode); -} - -bool get_load_save_space(THD* thd) { - return (THDVAR(thd, load_save_space) != 0); -} - -bool get_disable_slow_alter(THD* thd) { - return (THDVAR(thd, disable_slow_alter) != 0); -} - -bool get_disable_hot_alter(THD* thd) { - return THDVAR(thd, disable_hot_alter) != 0; -} - -bool get_create_index_online(THD* thd) { - return (THDVAR(thd, create_index_online) != 0); -} - -bool get_disable_prefetching(THD* thd) { - return (THDVAR(thd, disable_prefetching) != 0); -} - -bool get_prelock_empty(THD* thd) { - return (THDVAR(thd, prelock_empty) != 0); -} - -bool get_log_client_errors(THD* thd) { - return (THDVAR(thd, log_client_errors) != 0); -} - -uint get_tokudb_block_size(THD* thd) { - return THDVAR(thd, block_size); -} - -uint get_tokudb_read_block_size(THD* thd) { - return THDVAR(thd, read_block_size); -} - -uint get_tokudb_read_buf_size(THD* thd) { - return THDVAR(thd, read_buf_size); -} - -#if TOKU_INCLUDE_UPSERT -bool get_disable_slow_update(THD *thd) { - return (THDVAR(thd, disable_slow_update) != 0); -} - -bool get_disable_slow_upsert(THD *thd) { - return (THDVAR(thd, disable_slow_upsert) != 0); -} -#endif -uint get_analyze_time(THD *thd) { - return THDVAR(thd, analyze_time); -} - -typedef struct txn_progress_info { - char status[200]; - THD* thd; -} *TXN_PROGRESS_INFO; - - -void txn_progress_func(TOKU_TXN_PROGRESS progress, void* extra) { - TXN_PROGRESS_INFO progress_info = (TXN_PROGRESS_INFO)extra; - int r; - if (progress->stalled_on_checkpoint) { - if (progress->is_commit) { - r = sprintf( - progress_info->status, - "Writing committed changes to disk, processing commit of transaction, %"PRId64" out of 
%"PRId64, - progress->entries_processed, - progress->entries_total - ); - assert(r >= 0); - } - else { - r = sprintf( - progress_info->status, - "Writing committed changes to disk, processing abort of transaction, %"PRId64" out of %"PRId64, - progress->entries_processed, - progress->entries_total - ); - assert(r >= 0); - } - } - else { - if (progress->is_commit) { - r = sprintf( - progress_info->status, - "processing commit of transaction, %"PRId64" out of %"PRId64, - progress->entries_processed, - progress->entries_total - ); - assert(r >= 0); - } - else { - r = sprintf( - progress_info->status, - "processing abort of transaction, %"PRId64" out of %"PRId64, - progress->entries_processed, - progress->entries_total - ); - assert(r >= 0); - } - } - thd_proc_info(progress_info->thd, progress_info->status); -} - - -static void commit_txn_with_progress(DB_TXN* txn, uint32_t flags, THD* thd) { - int r; - struct txn_progress_info info; - info.thd = thd; - r = txn->commit_with_progress(txn, flags, txn_progress_func, &info); - if (r != 0) { - sql_print_error("tried committing transaction %p and got error code %d", txn, r); - } - assert(r == 0); -} - -static void abort_txn_with_progress(DB_TXN* txn, THD* thd) { - int r; - struct txn_progress_info info; - info.thd = thd; - r = txn->abort_with_progress(txn, txn_progress_func, &info); - if (r != 0) { - sql_print_error("tried aborting transaction %p and got error code %d", txn, r); - } - assert(r == 0); -} - -static int tokudb_commit(handlerton * hton, THD * thd, bool all) { - TOKUDB_DBUG_ENTER("tokudb_commit"); - DBUG_PRINT("trans", ("ending transaction %s", all ? "all" : "stmt")); - uint32_t syncflag = THDVAR(thd, commit_sync) ? 0 : DB_TXN_NOSYNC; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); - DB_TXN **txn = all ? 
&trx->all : &trx->stmt; - if (*txn) { - if (tokudb_debug & TOKUDB_DEBUG_TXN) { - TOKUDB_TRACE("doing txn commit:%d:%p\n", all, *txn); - } - // test hook to induce a crash on a debug build - DBUG_EXECUTE_IF("tokudb_crash_commit_before", DBUG_SUICIDE();); - commit_txn_with_progress(*txn, syncflag, thd); - // test hook to induce a crash on a debug build - DBUG_EXECUTE_IF("tokudb_crash_commit_after", DBUG_SUICIDE();); - if (*txn == trx->sp_level) { - trx->sp_level = 0; - } - *txn = 0; - trx->sub_sp_level = NULL; - } - else if (tokudb_debug & TOKUDB_DEBUG_TXN) { - TOKUDB_TRACE("nothing to commit %d\n", all); - } - reset_stmt_progress(&trx->stmt_progress); - TOKUDB_DBUG_RETURN(0); -} - -static int tokudb_rollback(handlerton * hton, THD * thd, bool all) { - TOKUDB_DBUG_ENTER("tokudb_rollback"); - DBUG_PRINT("trans", ("aborting transaction %s", all ? "all" : "stmt")); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); - DB_TXN **txn = all ? &trx->all : &trx->stmt; - if (*txn) { - if (tokudb_debug & TOKUDB_DEBUG_TXN) { - TOKUDB_TRACE("rollback:%p\n", *txn); - } - abort_txn_with_progress(*txn, thd); - if (*txn == trx->sp_level) { - trx->sp_level = 0; - } - *txn = 0; - trx->sub_sp_level = NULL; - } - else { - if (tokudb_debug & TOKUDB_DEBUG_TXN) { - TOKUDB_TRACE("abort0\n"); - } - } - reset_stmt_progress(&trx->stmt_progress); - TOKUDB_DBUG_RETURN(0); -} - -#if TOKU_INCLUDE_XA - -static int tokudb_xa_prepare(handlerton* hton, THD* thd, bool all) { - TOKUDB_DBUG_ENTER("tokudb_xa_prepare"); - int r = 0; - DBUG_PRINT("trans", ("preparing transaction %s", all ? "all" : "stmt")); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); - DB_TXN* txn = all ? 
trx->all : trx->stmt; - if (txn) { - if (tokudb_debug & TOKUDB_DEBUG_TXN) { - TOKUDB_TRACE("doing txn prepare:%d:%p\n", all, txn); - } - // a TOKU_XA_XID is identical to a MYSQL_XID - TOKU_XA_XID thd_xid; - thd_get_xid(thd, (MYSQL_XID*) &thd_xid); - // test hook to induce a crash on a debug build - DBUG_EXECUTE_IF("tokudb_crash_prepare_before", DBUG_SUICIDE();); - r = txn->xa_prepare(txn, &thd_xid); - // test hook to induce a crash on a debug build - DBUG_EXECUTE_IF("tokudb_crash_prepare_after", DBUG_SUICIDE();); - } - else if (tokudb_debug & TOKUDB_DEBUG_TXN) { - TOKUDB_TRACE("nothing to prepare %d\n", all); - } - TOKUDB_DBUG_RETURN(r); -} - -static int tokudb_xa_recover(handlerton* hton, XID* xid_list, uint len) { - TOKUDB_DBUG_ENTER("tokudb_xa_recover"); - int r = 0; - if (len == 0 || xid_list == NULL) { - TOKUDB_DBUG_RETURN(0); - } - long num_returned = 0; - r = db_env->txn_xa_recover( - db_env, - (TOKU_XA_XID*)xid_list, - len, - &num_returned, - DB_NEXT - ); - assert(r == 0); - TOKUDB_DBUG_RETURN((int)num_returned); -} - -static int tokudb_commit_by_xid(handlerton* hton, XID* xid) { - TOKUDB_DBUG_ENTER("tokudb_commit_by_xid"); - int r = 0; - DB_TXN* txn = NULL; - TOKU_XA_XID* toku_xid = (TOKU_XA_XID*)xid; - - r = db_env->get_txn_from_xid(db_env, toku_xid, &txn); - if (r) { goto cleanup; } - - r = txn->commit(txn, 0); - if (r) { goto cleanup; } - - r = 0; -cleanup: - TOKUDB_DBUG_RETURN(r); -} - -static int tokudb_rollback_by_xid(handlerton* hton, XID* xid) { - TOKUDB_DBUG_ENTER("tokudb_rollback_by_xid"); - int r = 0; - DB_TXN* txn = NULL; - TOKU_XA_XID* toku_xid = (TOKU_XA_XID*)xid; - - r = db_env->get_txn_from_xid(db_env, toku_xid, &txn); - if (r) { goto cleanup; } - - r = txn->abort(txn); - if (r) { goto cleanup; } - - r = 0; -cleanup: - TOKUDB_DBUG_RETURN(r); -} - -#endif - -static int tokudb_savepoint(handlerton * hton, THD * thd, void *savepoint) { - TOKUDB_DBUG_ENTER("tokudb_savepoint"); - int error; - SP_INFO save_info = (SP_INFO)savepoint; - 
tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); - if (thd->in_sub_stmt) { - assert(trx->stmt); - error = db_env->txn_begin(db_env, trx->sub_sp_level, &(save_info->txn), DB_INHERIT_ISOLATION); - if (error) { - goto cleanup; - } - trx->sub_sp_level = save_info->txn; - save_info->in_sub_stmt = true; - } - else { - error = db_env->txn_begin(db_env, trx->sp_level, &(save_info->txn), DB_INHERIT_ISOLATION); - if (error) { - goto cleanup; - } - trx->sp_level = save_info->txn; - save_info->in_sub_stmt = false; - } - save_info->trx = trx; - error = 0; -cleanup: - TOKUDB_DBUG_RETURN(error); -} - -static int tokudb_rollback_to_savepoint(handlerton * hton, THD * thd, void *savepoint) { - TOKUDB_DBUG_ENTER("tokudb_rollback_to_savepoint"); - int error; - SP_INFO save_info = (SP_INFO)savepoint; - DB_TXN* parent = NULL; - DB_TXN* txn_to_rollback = save_info->txn; - - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); - parent = txn_to_rollback->parent; - if (!(error = txn_to_rollback->abort(txn_to_rollback))) { - if (save_info->in_sub_stmt) { - trx->sub_sp_level = parent; - } - else { - trx->sp_level = parent; - } - error = tokudb_savepoint(hton, thd, savepoint); - } - TOKUDB_DBUG_RETURN(error); -} - -static int tokudb_release_savepoint(handlerton * hton, THD * thd, void *savepoint) { - TOKUDB_DBUG_ENTER("tokudb_release_savepoint"); - int error; - - SP_INFO save_info = (SP_INFO)savepoint; - DB_TXN* parent = NULL; - DB_TXN* txn_to_commit = save_info->txn; - - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); - parent = txn_to_commit->parent; - if (!(error = txn_to_commit->commit(txn_to_commit, 0))) { - if (save_info->in_sub_stmt) { - trx->sub_sp_level = parent; - } - else { - trx->sp_level = parent; - } - save_info->txn = NULL; - } - TOKUDB_DBUG_RETURN(error); -} - -static int tokudb_discover(handlerton *hton, THD* thd, const char *db, const char *name, uchar **frmblob, size_t *frmlen) { - return 
tokudb_discover2(hton, thd, db, name, true, frmblob, frmlen); -} - -static int tokudb_discover2(handlerton *hton, THD* thd, const char *db, const char *name, bool translate_name, - uchar **frmblob, size_t *frmlen) { - TOKUDB_DBUG_ENTER("tokudb_discover"); - int error; - DB* status_db = NULL; - DB_TXN* txn = NULL; - char path[FN_REFLEN + 1]; - HA_METADATA_KEY curr_key = hatoku_frm_data; - DBT key, value; - memset(&key, 0, sizeof(key)); - memset(&value, 0, sizeof(&value)); - - error = db_env->txn_begin(db_env, 0, &txn, 0); - if (error) { goto cleanup; } - - build_table_filename(path, sizeof(path) - 1, db, name, "", translate_name ? 0 : FN_IS_TMP); - error = open_status_dictionary(&status_db, path, txn); - if (error) { goto cleanup; } - - key.data = &curr_key; - key.size = sizeof(curr_key); - - error = status_db->getf_set( - status_db, - txn, - 0, - &key, - smart_dbt_callback_verify_frm, - &value - ); - if (error) { - goto cleanup; - } - - *frmblob = (uchar *)value.data; - *frmlen = value.size; - - error = 0; -cleanup: - if (status_db) { - status_db->close(status_db,0); - } - if (txn) { - commit_txn(txn, 0); - } - TOKUDB_DBUG_RETURN(error); -} - -static int store_dbname_tablename_size(TABLE *table, char *name, uint64_t size, THD *thd) { - char *tp = strrchr(name, '/'); - assert(tp); - char *tablename = tp + 1; - size_t tablename_length = strlen(tablename); - - char *dp = strchr(name, '/'); - char *dbname; - size_t dbname_length; - if (dp == tp) { - dbname = name; - dbname_length = tp - dbname; - } else { - dbname = dp + 1; - dbname_length = tp - dbname; - } - - table->field[0]->store(dbname, dbname_length, system_charset_info); - table->field[1]->store(tablename, tablename_length, system_charset_info); - table->field[2]->store(size, false); - int error = schema_table_store_record(thd, table); - return error; -} - -static int tokudb_dictionary_info(TABLE *table, THD *thd) { - int error; - DB_TXN* txn = NULL; - DBC* tmp_cursor = NULL; - DBT curr_key; - DBT curr_val; - 
memset(&curr_key, 0, sizeof curr_key); - memset(&curr_val, 0, sizeof curr_val); - error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); - if (error) { - goto cleanup; - } - error = db_env->get_cursor_for_directory(db_env, txn, &tmp_cursor); - if (error) { - goto cleanup; - } - while (error == 0) { - error = tmp_cursor->c_get( - tmp_cursor, - &curr_key, - &curr_val, - DB_NEXT - ); - if (!error) { - // We store the NULL terminator in the directory so it's included in the size. - // See #5789 - // Recalculate and check just to be safe. - size_t dname_len = strlen((const char *)curr_key.data); - size_t iname_len = strlen((const char *)curr_val.data); - assert(dname_len == curr_key.size - 1); - assert(iname_len == curr_val.size - 1); - table->field[0]->store( - (char *)curr_key.data, - dname_len, - system_charset_info - ); - table->field[1]->store( - (char *)curr_val.data, - iname_len, - system_charset_info - ); - error = schema_table_store_record(thd, table); - } - } - if (error == DB_NOTFOUND) { - error = 0; - } -cleanup: - if (tmp_cursor) { - int r = tmp_cursor->c_close(tmp_cursor); - assert(r == 0); - } - if (txn) { - commit_txn(txn, 0); - } - return error; -} - -static int tokudb_report_fractal_tree_info_for_db(const DBT *dname, const DBT *iname, TABLE *table, THD *thd) { - int error; - DB *db; - uint64_t bt_num_blocks_allocated; - uint64_t bt_num_blocks_in_use; - uint64_t bt_size_allocated; - uint64_t bt_size_in_use; - - error = db_create(&db, db_env, 0); - if (error) { - goto exit; - } - error = db->open(db, NULL, (char *)dname->data, NULL, DB_BTREE, 0, 0666); - if (error) { - goto exit; - } - error = db->get_fractal_tree_info64(db, - &bt_num_blocks_allocated, &bt_num_blocks_in_use, - &bt_size_allocated, &bt_size_in_use); - { - int close_error = db->close(db, 0); - if (!error) { - error = close_error; - } - } - if (error) { - goto exit; - } - - // We store the NULL terminator in the directory so it's included in the size. 
- // See #5789 - // Recalculate and check just to be safe. - { - size_t dname_len = strlen((const char *)dname->data); - size_t iname_len = strlen((const char *)iname->data); - assert(dname_len == dname->size - 1); - assert(iname_len == iname->size - 1); - table->field[0]->store( - (char *)dname->data, - dname_len, - system_charset_info - ); - table->field[1]->store( - (char *)iname->data, - iname_len, - system_charset_info - ); - } - table->field[2]->store(bt_num_blocks_allocated, false); - table->field[3]->store(bt_num_blocks_in_use, false); - table->field[4]->store(bt_size_allocated, false); - table->field[5]->store(bt_size_in_use, false); - - error = schema_table_store_record(thd, table); - -exit: - return error; -} - -static int tokudb_fractal_tree_info(TABLE *table, THD *thd) { - int error; - DB_TXN* txn = NULL; - DBC* tmp_cursor = NULL; - DBT curr_key; - DBT curr_val; - memset(&curr_key, 0, sizeof curr_key); - memset(&curr_val, 0, sizeof curr_val); - error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); - if (error) { - goto cleanup; - } - error = db_env->get_cursor_for_directory(db_env, txn, &tmp_cursor); - if (error) { - goto cleanup; - } - while (error == 0) { - error = tmp_cursor->c_get( - tmp_cursor, - &curr_key, - &curr_val, - DB_NEXT - ); - if (!error) { - error = tokudb_report_fractal_tree_info_for_db(&curr_key, &curr_val, table, thd); - } - } - if (error == DB_NOTFOUND) { - error = 0; - } -cleanup: - if (tmp_cursor) { - int r = tmp_cursor->c_close(tmp_cursor); - assert(r == 0); - } - if (txn) { - commit_txn(txn, 0); - } - return error; -} - -struct tokudb_report_fractal_tree_block_map_iterator_extra { - int64_t num_rows; - int64_t i; - uint64_t *checkpoint_counts; - int64_t *blocknums; - int64_t *diskoffs; - int64_t *sizes; -}; - -// This iterator is called while holding the blocktable lock. We should be as quick as possible. 
-// We don't want to do one call to get the number of rows, release the blocktable lock, and then do another call to get all the rows because the number of rows may change if we don't hold the lock. -// As a compromise, we'll do some mallocs inside the lock on the first call, but everything else should be fast. -static int tokudb_report_fractal_tree_block_map_iterator(uint64_t checkpoint_count, - int64_t num_rows, - int64_t blocknum, - int64_t diskoff, - int64_t size, - void *iter_extra) { - struct tokudb_report_fractal_tree_block_map_iterator_extra *e = static_cast(iter_extra); - - assert(num_rows > 0); - if (e->num_rows == 0) { - e->checkpoint_counts = (uint64_t *) my_malloc(num_rows * (sizeof *e->checkpoint_counts), MYF(MY_WME|MY_ZEROFILL|MY_FAE)); - e->blocknums = (int64_t *) my_malloc(num_rows * (sizeof *e->blocknums), MYF(MY_WME|MY_ZEROFILL|MY_FAE)); - e->diskoffs = (int64_t *) my_malloc(num_rows * (sizeof *e->diskoffs), MYF(MY_WME|MY_ZEROFILL|MY_FAE)); - e->sizes = (int64_t *) my_malloc(num_rows * (sizeof *e->sizes), MYF(MY_WME|MY_ZEROFILL|MY_FAE)); - e->num_rows = num_rows; - } - - e->checkpoint_counts[e->i] = checkpoint_count; - e->blocknums[e->i] = blocknum; - e->diskoffs[e->i] = diskoff; - e->sizes[e->i] = size; - ++(e->i); - - return 0; -} - -static int tokudb_report_fractal_tree_block_map_for_db(const DBT *dname, const DBT *iname, TABLE *table, THD *thd) { - int error; - DB *db; - struct tokudb_report_fractal_tree_block_map_iterator_extra e = {}; // avoid struct initializers so that we can compile with older gcc versions - - error = db_create(&db, db_env, 0); - if (error) { - goto exit; - } - error = db->open(db, NULL, (char *)dname->data, NULL, DB_BTREE, 0, 0666); - if (error) { - goto exit; - } - error = db->iterate_fractal_tree_block_map(db, tokudb_report_fractal_tree_block_map_iterator, &e); - { - int close_error = db->close(db, 0); - if (!error) { - error = close_error; - } - } - if (error) { - goto exit; - } - - // If not, we should have gotten 
an error and skipped this section of code - assert(e.i == e.num_rows); - for (int64_t i = 0; error == 0 && i < e.num_rows; ++i) { - // We store the NULL terminator in the directory so it's included in the size. - // See #5789 - // Recalculate and check just to be safe. - size_t dname_len = strlen((const char *)dname->data); - size_t iname_len = strlen((const char *)iname->data); - assert(dname_len == dname->size - 1); - assert(iname_len == iname->size - 1); - table->field[0]->store( - (char *)dname->data, - dname_len, - system_charset_info - ); - table->field[1]->store( - (char *)iname->data, - iname_len, - system_charset_info - ); - table->field[2]->store(e.checkpoint_counts[i], false); - table->field[3]->store(e.blocknums[i], false); - static const int64_t freelist_null = -1; - static const int64_t diskoff_unused = -2; - if (e.diskoffs[i] == diskoff_unused || e.diskoffs[i] == freelist_null) { - table->field[4]->set_null(); - } else { - table->field[4]->set_notnull(); - table->field[4]->store(e.diskoffs[i], false); - } - static const int64_t size_is_free = -1; - if (e.sizes[i] == size_is_free) { - table->field[5]->set_null(); - } else { - table->field[5]->set_notnull(); - table->field[5]->store(e.sizes[i], false); - } - - error = schema_table_store_record(thd, table); - } - -exit: - if (e.checkpoint_counts != NULL) { - my_free(e.checkpoint_counts, MYF(0)); - e.checkpoint_counts = NULL; - } - if (e.blocknums != NULL) { - my_free(e.blocknums, MYF(0)); - e.blocknums = NULL; - } - if (e.diskoffs != NULL) { - my_free(e.diskoffs, MYF(0)); - e.diskoffs = NULL; - } - if (e.sizes != NULL) { - my_free(e.sizes, MYF(0)); - e.sizes = NULL; - } - return error; -} - -static int tokudb_fractal_tree_block_map(TABLE *table, THD *thd) { - int error; - DB_TXN* txn = NULL; - DBC* tmp_cursor = NULL; - DBT curr_key; - DBT curr_val; - memset(&curr_key, 0, sizeof curr_key); - memset(&curr_val, 0, sizeof curr_val); - error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); - if 
(error) { - goto cleanup; - } - error = db_env->get_cursor_for_directory(db_env, txn, &tmp_cursor); - if (error) { - goto cleanup; - } - while (error == 0) { - error = tmp_cursor->c_get( - tmp_cursor, - &curr_key, - &curr_val, - DB_NEXT - ); - if (!error) { - error = tokudb_report_fractal_tree_block_map_for_db(&curr_key, &curr_val, table, thd); - } - } - if (error == DB_NOTFOUND) { - error = 0; - } -cleanup: - if (tmp_cursor) { - int r = tmp_cursor->c_close(tmp_cursor); - assert(r == 0); - } - if (txn) { - commit_txn(txn, 0); - } - return error; -} - -static int tokudb_get_user_data_size(TABLE *table, THD *thd, bool exact) { - int error; - DB* curr_db = NULL; - DB_TXN* txn = NULL; - DBC* tmp_cursor = NULL; - DBC* tmp_table_cursor = NULL; - DBT curr_key; - DBT curr_val; - DB_TXN* tmp_txn = NULL; - memset(&curr_key, 0, sizeof curr_key); - memset(&curr_val, 0, sizeof curr_val); - pthread_mutex_lock(&tokudb_meta_mutex); - - error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); - if (error) { - goto cleanup; - } - error = metadata_db->cursor(metadata_db, txn, &tmp_cursor, 0); - if (error) { - goto cleanup; - } - while (error == 0) { - tmp_txn = NULL; - // - // here, and in other places, check if process has been killed - // if so, get out of function so user is not stalled - // - if (thd->killed) { - break; - } - error = db_env->txn_begin(db_env, 0, &tmp_txn, DB_READ_UNCOMMITTED); - if (error) { - goto cleanup; - } - - // - // do not need this to be super fast, so use old simple API - // - error = tmp_cursor->c_get( - tmp_cursor, - &curr_key, - &curr_val, - DB_NEXT - ); - if (!error) { - char* name = (char *)curr_key.data; - char* newname; - uint64_t curr_num_bytes = 0; - DB_BTREE_STAT64 dict_stats; - - error = db_create(&curr_db, db_env, 0); - if (error) { goto cleanup; } - - newname = (char *)my_malloc( - get_max_dict_name_path_length(name), - MYF(MY_WME|MY_ZEROFILL|MY_FAE)); - - make_name(newname, name, "main"); - - error = curr_db->open(curr_db, tmp_txn, 
newname, NULL, DB_BTREE, DB_THREAD, 0); - - my_free(newname, MYF(0)); - - if (error == ENOENT) { error = 0; continue; } - if (error) { goto cleanup; } - - if (exact) { - // - // flatten if exact is required - // - uint curr_num_items = 0; - error = curr_db->cursor(curr_db, tmp_txn, &tmp_table_cursor, 0); - if (error) { - tmp_table_cursor = NULL; - goto cleanup; - } - while (error != DB_NOTFOUND) { - error = tmp_table_cursor->c_getf_next(tmp_table_cursor, 0, smart_dbt_do_nothing, NULL); - if (error && error != DB_NOTFOUND) { - goto cleanup; - } - curr_num_items++; - // - // allow early exit if command has been killed - // - if ( (curr_num_items % 1000) == 0 && thd->killed) { - goto cleanup; - } - } - error = tmp_table_cursor->c_close(tmp_table_cursor); - assert(error==0); - tmp_table_cursor = NULL; - } - - error = curr_db->stat64( - curr_db, - tmp_txn, - &dict_stats - ); - if (error) { goto cleanup; } - - curr_num_bytes = dict_stats.bt_dsize; - if (*(uchar *)curr_val.data) { - // - // in this case, we have a hidden primary key, do not - // want to report space taken up by the hidden primary key to the user - // - uint64_t hpk_space = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH*dict_stats.bt_ndata; - curr_num_bytes = (hpk_space > curr_num_bytes) ? 0 : curr_num_bytes - hpk_space; - } - else { - // - // one infinity byte per key needs to be subtracted - // - uint64_t inf_byte_space = dict_stats.bt_ndata; - curr_num_bytes = (inf_byte_space > curr_num_bytes) ? 
0 : curr_num_bytes - inf_byte_space; - } - - error = store_dbname_tablename_size(table, name, curr_num_bytes, thd); - if (error) goto cleanup; - - { - int r = curr_db->close(curr_db, 0); - assert(r == 0); - curr_db = NULL; - } - } - - if (tmp_txn) { - commit_txn(tmp_txn, 0); - tmp_txn = NULL; - } - } - - error = 0; - -cleanup: - if (tmp_cursor) { - int r = tmp_cursor->c_close(tmp_cursor); - assert(r == 0); - } - if (tmp_table_cursor) { - int r = tmp_table_cursor->c_close(tmp_table_cursor); - assert(r == 0); - } - if (curr_db) { - int r = curr_db->close(curr_db, 0); - assert(r == 0); - } - if (tmp_txn) { - commit_txn(tmp_txn, 0); - } - if (txn) { - commit_txn(txn, 0); - } - if (error) { - sql_print_error("got an error %d in show_data_size\n", error); - } - pthread_mutex_unlock(&tokudb_meta_mutex); - return error; -} - -#define STATPRINT(legend, val) if (legend != NULL && val != NULL) stat_print(thd, \ - tokudb_hton_name, \ - strlen(tokudb_hton_name), \ - legend, \ - strlen(legend), \ - val, \ - strlen(val)) - -extern sys_var *intern_find_sys_var(const char *str, uint length, bool no_error); - -static bool tokudb_show_engine_status(THD * thd, stat_print_fn * stat_print) { - TOKUDB_DBUG_ENTER("tokudb_show_engine_status"); - int error; - uint64_t panic; - const int panic_string_len = 1024; - char panic_string[panic_string_len] = {'\0'}; - uint64_t num_rows; - uint64_t max_rows; - fs_redzone_state redzone_state; - const int bufsiz = 1024; - char buf[bufsiz]; - -#if MYSQL_VERSION_ID < 50500 - { - sys_var * version = intern_find_sys_var("version", 0, false); - snprintf(buf, bufsiz, "%s", version->value_ptr(thd, (enum_var_type)0, (LEX_STRING*)NULL)); - STATPRINT("Version", buf); - } -#endif - error = db_env->get_engine_status_num_rows (db_env, &max_rows); - TOKU_ENGINE_STATUS_ROW_S mystat[max_rows]; - error = db_env->get_engine_status (db_env, mystat, max_rows, &num_rows, &redzone_state, &panic, panic_string, panic_string_len, TOKU_ENGINE_STATUS); - - if 
(strlen(panic_string)) { - STATPRINT("Environment panic string", panic_string); - } - if (error == 0) { - if (panic) { - snprintf(buf, bufsiz, "%" PRIu64, panic); - STATPRINT("Environment panic", buf); - } - - if(redzone_state == FS_BLOCKED) { - STATPRINT("*** URGENT WARNING ***", "FILE SYSTEM IS COMPLETELY FULL"); - snprintf(buf, bufsiz, "FILE SYSTEM IS COMPLETELY FULL"); - } - else if (redzone_state == FS_GREEN) { - snprintf(buf, bufsiz, "more than %d percent of total file system space", 2*tokudb_fs_reserve_percent); - } - else if (redzone_state == FS_YELLOW) { - snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING FULL (less than %d percent free)", 2*tokudb_fs_reserve_percent); - } - else if (redzone_state == FS_RED){ - snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING VERY FULL (less than %d percent free): INSERTS ARE PROHIBITED", tokudb_fs_reserve_percent); - } - else { - snprintf(buf, bufsiz, "information unavailable, unknown redzone state %d", redzone_state); - } - STATPRINT ("disk free space", buf); - - for (uint64_t row = 0; row < num_rows; row++) { - switch (mystat[row].type) { - case FS_STATE: - snprintf(buf, bufsiz, "%"PRIu64"", mystat[row].value.num); - break; - case UINT64: - snprintf(buf, bufsiz, "%"PRIu64"", mystat[row].value.num); - break; - case CHARSTR: - snprintf(buf, bufsiz, "%s", mystat[row].value.str); - break; - case UNIXTIME: - { - time_t t = mystat[row].value.num; - char tbuf[26]; - snprintf(buf, bufsiz, "%.24s", ctime_r(&t, tbuf)); - } - break; - case TOKUTIME: - { - double t = tokutime_to_seconds(mystat[row].value.num); - snprintf(buf, bufsiz, "%.6f", t); - } - break; - case PARCOUNT: - { - uint64_t v = read_partitioned_counter(mystat[row].value.parcount); - snprintf(buf, bufsiz, "%" PRIu64, v); - } - break; - default: - snprintf(buf, bufsiz, "UNKNOWN STATUS TYPE: %d", mystat[row].type); - break; - } - STATPRINT(mystat[row].legend, buf); - } - uint64_t bytes_inserted = 
read_partitioned_counter(tokudb_primary_key_bytes_inserted); - snprintf(buf, bufsiz, "%" PRIu64, bytes_inserted); - STATPRINT("handlerton: primary key bytes inserted", buf); - } - if (error) { my_errno = error; } - TOKUDB_DBUG_RETURN(error); -} - -static void tokudb_checkpoint_lock(THD * thd) { - int error; - tokudb_trx_data* trx = NULL; - char status_msg[200]; //buffer of 200 should be a good upper bound. - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); - if (!trx) { - error = create_tokudb_trx_data_instance(&trx); - // - // can only fail due to memory allocation, so ok to assert - // - assert(!error); - thd_data_set(thd, tokudb_hton->slot, trx); - } - - if (trx->checkpoint_lock_taken) { - goto cleanup; - } - // - // This can only fail if environment is not created, which is not possible - // in handlerton - // - sprintf(status_msg, "Trying to grab checkpointing lock."); - thd_proc_info(thd, status_msg); - error = db_env->checkpointing_postpone(db_env); - assert(!error); - - trx->checkpoint_lock_taken = true; -cleanup: - return; -} - -static void tokudb_checkpoint_unlock(THD * thd) { - int error; - char status_msg[200]; //buffer of 200 should be a good upper bound. 
- tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); - if (!trx) { - error = 0; - goto cleanup; - } - if (!trx->checkpoint_lock_taken) { - error = 0; - goto cleanup; - } - // - // at this point, we know the checkpoint lock has been taken - // - sprintf(status_msg, "Trying to release checkpointing lock."); - thd_proc_info(thd, status_msg); - error = db_env->checkpointing_resume(db_env); - assert(!error); - - trx->checkpoint_lock_taken = false; - -cleanup: - return; -} - -static bool tokudb_show_status(handlerton * hton, THD * thd, stat_print_fn * stat_print, enum ha_stat_type stat_type) { - switch (stat_type) { - case HA_ENGINE_STATUS: - return tokudb_show_engine_status(thd, stat_print); - break; - default: - break; - } - return false; -} - -static void tokudb_print_error(const DB_ENV * db_env, const char *db_errpfx, const char *buffer) { - sql_print_error("%s: %s", db_errpfx, buffer); -} - -static void tokudb_cleanup_log_files(void) { - TOKUDB_DBUG_ENTER("tokudb_cleanup_log_files"); - char **names; - int error; - - if ((error = db_env->txn_checkpoint(db_env, 0, 0, 0))) - my_error(ER_ERROR_DURING_CHECKPOINT, MYF(0), error); - - if ((error = db_env->log_archive(db_env, &names, 0)) != 0) { - DBUG_PRINT("error", ("log_archive failed (error %d)", error)); - db_env->err(db_env, error, "log_archive"); - DBUG_VOID_RETURN; - } - - if (names) { - char **np; - for (np = names; *np; ++np) { -#if 1 - if (tokudb_debug) - TOKUDB_TRACE("%s:cleanup:%s\n", __FUNCTION__, *np); -#else - my_delete(*np, MYF(MY_WME)); -#endif - } - - free(names); - } - - DBUG_VOID_RETURN; -} - -// options flags -// PLUGIN_VAR_THDLOCAL Variable is per-connection -// PLUGIN_VAR_READONLY Server variable is read only -// PLUGIN_VAR_NOSYSVAR Not a server variable -// PLUGIN_VAR_NOCMDOPT Not a command line option -// PLUGIN_VAR_NOCMDARG No argument for cmd line -// PLUGIN_VAR_RQCMDARG Argument required for cmd line -// PLUGIN_VAR_OPCMDARG Argument optional for cmd 
line -// PLUGIN_VAR_MEMALLOC String needs memory allocated - - -// system variables - -static void tokudb_lock_timeout_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { - ulonglong *timeout = (ulonglong *) var; - *timeout = *(const ulonglong *) save; - db_env->set_lock_timeout(db_env, *timeout); -} - -#define DEFAULT_LOCK_TIMEOUT_MSEC 4000 - -static MYSQL_SYSVAR_ULONGLONG(lock_timeout, tokudb_lock_timeout, - 0, "TokuDB lock timeout", - NULL, tokudb_lock_timeout_update, DEFAULT_LOCK_TIMEOUT_MSEC, - 0, ~0ULL, 0); - -static void tokudb_cleaner_period_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { - ulong * cleaner_period = (ulong *) var; - *cleaner_period = *(const ulonglong *) save; - int r = db_env->cleaner_set_period(db_env, *cleaner_period); - assert(r == 0); -} - -#define DEFAULT_CLEANER_PERIOD 1 - -static MYSQL_SYSVAR_ULONG(cleaner_period, tokudb_cleaner_period, - 0, "TokuDB cleaner_period", - NULL, tokudb_cleaner_period_update, DEFAULT_CLEANER_PERIOD, - 0, ~0UL, 0); - -static void tokudb_cleaner_iterations_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { - ulong * cleaner_iterations = (ulong *) var; - *cleaner_iterations = *(const ulonglong *) save; - int r = db_env->cleaner_set_iterations(db_env, *cleaner_iterations); - assert(r == 0); -} - -#define DEFAULT_CLEANER_ITERATIONS 5 - -static MYSQL_SYSVAR_ULONG(cleaner_iterations, tokudb_cleaner_iterations, - 0, "TokuDB cleaner_iterations", - NULL, tokudb_cleaner_iterations_update, DEFAULT_CLEANER_ITERATIONS, - 0, ~0UL, 0); - -static void tokudb_checkpointing_period_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { - uint * checkpointing_period = (uint *) var; - *checkpointing_period = *(const ulonglong *) save; - int r = db_env->checkpointing_set_period(db_env, *checkpointing_period); - assert(r == 0); -} - -static MYSQL_SYSVAR_UINT(checkpointing_period, 
tokudb_checkpointing_period, - 0, "TokuDB Checkpointing period", - NULL, tokudb_checkpointing_period_update, 60, - 0, ~0U, 0); - -static MYSQL_SYSVAR_BOOL(directio, tokudb_directio, - PLUGIN_VAR_READONLY, - "TokuDB Enable Direct I/O ", - NULL, NULL, FALSE); -static MYSQL_SYSVAR_BOOL(checkpoint_on_flush_logs, tokudb_checkpoint_on_flush_logs, - 0, - "TokuDB Checkpoint on Flush Logs ", - NULL, NULL, FALSE); - -static MYSQL_SYSVAR_ULONGLONG(cache_size, tokudb_cache_size, - PLUGIN_VAR_READONLY, "TokuDB cache table size", NULL, NULL, 0, - 0, ~0ULL, 0); - -static MYSQL_SYSVAR_ULONGLONG(max_lock_memory, tokudb_max_lock_memory, PLUGIN_VAR_READONLY, "TokuDB max memory for locks", NULL, NULL, 0, 0, ~0ULL, 0); -static MYSQL_SYSVAR_ULONG(debug, tokudb_debug, 0, "TokuDB Debug", NULL, NULL, 0, 0, ~0UL, 0); - -static MYSQL_SYSVAR_STR(log_dir, tokudb_log_dir, PLUGIN_VAR_READONLY, "TokuDB Log Directory", NULL, NULL, NULL); - -static MYSQL_SYSVAR_STR(data_dir, tokudb_data_dir, PLUGIN_VAR_READONLY, "TokuDB Data Directory", NULL, NULL, NULL); - -static MYSQL_SYSVAR_STR(version, tokudb_version, PLUGIN_VAR_READONLY, "TokuDB Version", NULL, NULL, NULL); - -static MYSQL_SYSVAR_UINT(init_flags, tokudb_init_flags, PLUGIN_VAR_READONLY, "Sets TokuDB DB_ENV->open flags", NULL, NULL, tokudb_init_flags, 0, ~0U, 0); - -static MYSQL_SYSVAR_UINT(write_status_frequency, tokudb_write_status_frequency, 0, "TokuDB frequency that show processlist updates status of writes", NULL, NULL, 1000, 0, ~0U, 0); -static MYSQL_SYSVAR_UINT(read_status_frequency, tokudb_read_status_frequency, 0, "TokuDB frequency that show processlist updates status of reads", NULL, NULL, 10000, 0, ~0U, 0); -static MYSQL_SYSVAR_INT(fs_reserve_percent, tokudb_fs_reserve_percent, PLUGIN_VAR_READONLY, "TokuDB file system space reserve (percent free required)", NULL, NULL, 5, 0, 100, 0); -static MYSQL_SYSVAR_STR(tmp_dir, tokudb_tmp_dir, PLUGIN_VAR_READONLY, "Tokudb Tmp Dir", NULL, NULL, NULL); - -static void 
tokudb_fsync_log_period_update(THD *thd, struct st_mysql_sys_var *sys_var, void *var, const void *save) { - uint32 *period = (uint32 *) var; - *period = *(const ulonglong *) save; - db_env->change_fsync_log_period(db_env, *period); -} - -static MYSQL_SYSVAR_UINT(fsync_log_period, tokudb_fsync_log_period, 0, "TokuDB fsync log period", NULL, tokudb_fsync_log_period_update, 0, 0, ~0U, 0); - -static struct st_mysql_sys_var *tokudb_system_variables[] = { - MYSQL_SYSVAR(cache_size), - MYSQL_SYSVAR(max_lock_memory), - MYSQL_SYSVAR(data_dir), - MYSQL_SYSVAR(log_dir), - MYSQL_SYSVAR(debug), - MYSQL_SYSVAR(commit_sync), - MYSQL_SYSVAR(lock_timeout), - MYSQL_SYSVAR(cleaner_period), - MYSQL_SYSVAR(cleaner_iterations), - MYSQL_SYSVAR(pk_insert_mode), - MYSQL_SYSVAR(load_save_space), - MYSQL_SYSVAR(disable_slow_alter), - MYSQL_SYSVAR(disable_hot_alter), - MYSQL_SYSVAR(create_index_online), - MYSQL_SYSVAR(disable_prefetching), - MYSQL_SYSVAR(version), - MYSQL_SYSVAR(init_flags), - MYSQL_SYSVAR(checkpointing_period), - MYSQL_SYSVAR(prelock_empty), - MYSQL_SYSVAR(log_client_errors), - MYSQL_SYSVAR(checkpoint_lock), - MYSQL_SYSVAR(write_status_frequency), - MYSQL_SYSVAR(read_status_frequency), - MYSQL_SYSVAR(fs_reserve_percent), - MYSQL_SYSVAR(tmp_dir), - MYSQL_SYSVAR(block_size), - MYSQL_SYSVAR(read_block_size), - MYSQL_SYSVAR(read_buf_size), - MYSQL_SYSVAR(row_format), - MYSQL_SYSVAR(directio), - MYSQL_SYSVAR(checkpoint_on_flush_logs), -#if TOKU_INCLUDE_UPSERT - MYSQL_SYSVAR(disable_slow_update), - MYSQL_SYSVAR(disable_slow_upsert), -#endif - MYSQL_SYSVAR(analyze_time), - MYSQL_SYSVAR(fsync_log_period), - NULL -}; - -struct st_mysql_storage_engine tokudb_storage_engine = { MYSQL_HANDLERTON_INTERFACE_VERSION }; - -static ST_FIELD_INFO tokudb_user_data_field_info[] = { - {"database_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"data_size", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, 
- {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} -}; - -#if MYSQL_VERSION_ID >= 50600 -static int tokudb_user_data_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { -#else -static int tokudb_user_data_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { -#endif - int error; - TABLE *table = tables->table; - - // 3938: Get a read lock on the status flag, since we must - // read it before safely proceeding - rw_rdlock(&tokudb_hton_initialized_lock); - - if (!tokudb_hton_initialized) { - my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); - error = -1; - } else { - error = tokudb_get_user_data_size(table, thd, false); - } - - // 3938: unlock the status flag lock - rw_unlock(&tokudb_hton_initialized_lock); - return error; -} - -static int tokudb_user_data_init(void *p) { - ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; - schema->fields_info = tokudb_user_data_field_info; - schema->fill_table = tokudb_user_data_fill_table; - return 0; -} - -static int tokudb_user_data_done(void *p) { - return 0; -} - -static struct st_mysql_information_schema tokudb_user_data_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; - -static struct st_mysql_information_schema tokudb_fractal_tree_info_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; - -static struct st_mysql_information_schema tokudb_fractal_tree_block_map_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; - -static ST_FIELD_INFO tokudb_user_data_exact_field_info[] = { - {"database_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"data_size", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, - {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} -}; - -static ST_FIELD_INFO tokudb_dictionary_field_info[] = { - {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE 
}, - {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} -}; - -static ST_FIELD_INFO tokudb_fractal_tree_info_field_info[] = { - {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"bt_num_blocks_allocated", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"bt_num_blocks_in_use", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"bt_size_allocated", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"bt_size_in_use", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, - {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} -}; - -static ST_FIELD_INFO tokudb_fractal_tree_block_map_field_info[] = { - {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"checkpoint_count", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"blocknum", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, - {"offset", 0, MYSQL_TYPE_LONGLONG, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE }, - {"size", 0, MYSQL_TYPE_LONGLONG, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE }, - {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} -}; - - -#if MYSQL_VERSION_ID >= 50600 -static int tokudb_dictionary_info_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { -#else -static int tokudb_dictionary_info_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { -#endif - int error; - TABLE *table = tables->table; - - // 3938: Get a read lock on the status flag, since we must - // read it before safely proceeding - rw_rdlock(&tokudb_hton_initialized_lock); - - if (!tokudb_hton_initialized) { - my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); - error = -1; - } else { - error = tokudb_dictionary_info(table, thd); - } - - //3938: unlock the status flag lock - rw_unlock(&tokudb_hton_initialized_lock); - return error; -} - -#if MYSQL_VERSION_ID >= 50600 
-static int tokudb_user_data_exact_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { -#else -static int tokudb_user_data_exact_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { -#endif - int error; - TABLE *table = tables->table; - - // 3938: Get a read lock on the status flag, since we must - // read it before safely proceeding - rw_rdlock(&tokudb_hton_initialized_lock); - - if (!tokudb_hton_initialized) { - my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); - error = -1; - } else { - error = tokudb_get_user_data_size(table, thd, true); - } - - //3938: unlock the status flag lock - rw_unlock(&tokudb_hton_initialized_lock); - return error; -} - -#if MYSQL_VERSION_ID >= 50600 -static int tokudb_fractal_tree_info_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { -#else -static int tokudb_fractal_tree_info_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { -#endif - int error; - TABLE *table = tables->table; - - // 3938: Get a read lock on the status flag, since we must - // read it before safely proceeding - rw_rdlock(&tokudb_hton_initialized_lock); - - if (!tokudb_hton_initialized) { - my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); - error = -1; - } else { - error = tokudb_fractal_tree_info(table, thd); - } - - //3938: unlock the status flag lock - rw_unlock(&tokudb_hton_initialized_lock); - return error; -} - -#if MYSQL_VERSION_ID >= 50600 -static int tokudb_fractal_tree_block_map_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { -#else -static int tokudb_fractal_tree_block_map_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { -#endif - int error; - TABLE *table = tables->table; - - // 3938: Get a read lock on the status flag, since we must - // read it before safely proceeding - rw_rdlock(&tokudb_hton_initialized_lock); - - if (!tokudb_hton_initialized) { - my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); - error = -1; - } else { - error = tokudb_fractal_tree_block_map(table, thd); - } - - //3938: unlock the status flag 
lock - rw_unlock(&tokudb_hton_initialized_lock); - return error; -} - -static int tokudb_user_data_exact_init(void *p) { - ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; - schema->fields_info = tokudb_user_data_exact_field_info; - schema->fill_table = tokudb_user_data_exact_fill_table; - return 0; -} - -static int tokudb_user_data_exact_done(void *p) { - return 0; -} - -static int tokudb_dictionary_info_init(void *p) { - ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; - schema->fields_info = tokudb_dictionary_field_info; - schema->fill_table = tokudb_dictionary_info_fill_table; - return 0; -} - -static int tokudb_dictionary_info_done(void *p) { - return 0; -} - -static int tokudb_fractal_tree_info_init(void *p) { - ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; - schema->fields_info = tokudb_fractal_tree_info_field_info; - schema->fill_table = tokudb_fractal_tree_info_fill_table; - return 0; -} - -static int tokudb_fractal_tree_info_done(void *p) { - return 0; -} - -static int tokudb_fractal_tree_block_map_init(void *p) { - ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; - schema->fields_info = tokudb_fractal_tree_block_map_field_info; - schema->fill_table = tokudb_fractal_tree_block_map_fill_table; - return 0; -} - -static int tokudb_fractal_tree_block_map_done(void *p) { - return 0; -} - -static struct st_mysql_information_schema tokudb_user_data_exact_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; - -enum { TOKUDB_PLUGIN_VERSION = 0x0400 }; -#define TOKUDB_PLUGIN_VERSION_STR "1024" - -// Retrieves variables for information_schema.global_status. 
-// Names (columnname) are automatically converted to upper case, and prefixed with "TOKUDB_" -static int show_tokudb_vars(THD *thd, SHOW_VAR *var, char *buff) { - TOKUDB_DBUG_ENTER("show_tokudb_vars"); - - int error; - uint64_t panic; - const int panic_string_len = 1024; - char panic_string[panic_string_len] = {'\0'}; - fs_redzone_state redzone_state; - - uint64_t num_rows; - error = db_env->get_engine_status (db_env, toku_global_status_rows, toku_global_status_max_rows, &num_rows, &redzone_state, &panic, panic_string, panic_string_len, TOKU_GLOBAL_STATUS); - //TODO: Maybe do something with the panic output? -#if 0 - if (strlen(panic_string)) { - STATPRINT("Environment panic string", panic_string); - } -#endif - if (error == 0) { - assert(num_rows <= toku_global_status_max_rows); - //TODO: Maybe enable some of the items here: (copied from engine status -#if 0 - if (panic) { - snprintf(buf, bufsiz, "%" PRIu64, panic); - STATPRINT("Environment panic", buf); - } - - if(redzone_state == FS_BLOCKED) { - STATPRINT("*** URGENT WARNING ***", "FILE SYSTEM IS COMPLETELY FULL"); - snprintf(buf, bufsiz, "FILE SYSTEM IS COMPLETELY FULL"); - } - else if (redzone_state == FS_GREEN) { - snprintf(buf, bufsiz, "more than %d percent of total file system space", 2*tokudb_fs_reserve_percent); - } - else if (redzone_state == FS_YELLOW) { - snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING FULL (less than %d percent free)", 2*tokudb_fs_reserve_percent); - } - else if (redzone_state == FS_RED){ - snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING VERY FULL (less than %d percent free): INSERTS ARE PROHIBITED", tokudb_fs_reserve_percent); - } - else { - snprintf(buf, bufsiz, "information unavailable, unknown redzone state %d", redzone_state); - } - STATPRINT ("disk free space", buf); -#endif - - - //TODO: (optionally) add redzone state, panic, panic string, etc. Right now it's being ignored. 
- - for (uint64_t row = 0; row < num_rows; row++) { - SHOW_VAR &status_var = toku_global_status_variables[row]; - TOKU_ENGINE_STATUS_ROW_S &status_row = toku_global_status_rows[row]; - - status_var.name = status_row.columnname; - switch (status_row.type) { - case FS_STATE: - case UINT64: - status_var.type = SHOW_LONGLONG; - status_var.value = (char*)&status_row.value.num; - break; - case CHARSTR: - status_var.type = SHOW_CHAR; - status_var.value = (char*)status_row.value.str; - break; - case UNIXTIME: - { - status_var.type = SHOW_CHAR; - time_t t = status_row.value.num; - char tbuf[26]; - // Reuse the memory in status_row. (It belongs to us). - snprintf(status_row.value.datebuf, sizeof(status_row.value.datebuf), "%.24s", ctime_r(&t, tbuf)); - status_var.value = (char*)&status_row.value.datebuf[0]; - } - break; - case TOKUTIME: - { - status_var.type = SHOW_DOUBLE; - double t = tokutime_to_seconds(status_row.value.num); - // Reuse the memory in status_row. (It belongs to us). - status_row.value.dnum = t; - status_var.value = (char*)&status_row.value.dnum; - } - break; - case PARCOUNT: - { - status_var.type = SHOW_LONGLONG; - uint64_t v = read_partitioned_counter(status_row.value.parcount); - // Reuse the memory in status_row. (It belongs to us). - status_row.value.num = v; - status_var.value = (char*)&status_row.value.num; - } - break; - default: - { - status_var.type = SHOW_CHAR; - // Reuse the memory in status_row.datebuf. (It belongs to us). - // UNKNOWN TYPE: %d fits in 26 bytes (sizeof datebuf) for any integer. - snprintf(status_row.value.datebuf, sizeof(status_row.value.datebuf), "UNKNOWN TYPE: %d", status_row.type); - status_var.value = (char*)&status_row.value.datebuf[0]; - } - break; - } - } - // Sentinel value at end. 
- toku_global_status_variables[num_rows].type = SHOW_LONG; - toku_global_status_variables[num_rows].value = (char*)NullS; - toku_global_status_variables[num_rows].name = (char*)NullS; - - var->type= SHOW_ARRAY; - var->value= (char *) toku_global_status_variables; - } - if (error) { my_errno = error; } - TOKUDB_DBUG_RETURN(error); -} - -static SHOW_VAR toku_global_status_variables_export[]= { - {"Tokudb", (char*)&show_tokudb_vars, SHOW_FUNC}, - {NullS, NullS, SHOW_LONG} -}; - -mysql_declare_plugin(tokudb) -{ - MYSQL_STORAGE_ENGINE_PLUGIN, - &tokudb_storage_engine, - tokudb_hton_name, - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_init_func, /* plugin init */ - tokudb_done_func, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - toku_global_status_variables_export, /* status variables */ - tokudb_system_variables, /* system variables */ - NULL, /* config options */ -#if MYSQL_VERSION_ID >= 50521 - 0, /* flags */ -#endif -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_user_data_information_schema, - "TokuDB_user_data", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_user_data_init, /* plugin init */ - tokudb_user_data_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - NULL, /* config options */ -#if MYSQL_VERSION_ID >= 50521 - 0, /* flags */ -#endif -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_user_data_exact_information_schema, - "TokuDB_user_data_exact", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_user_data_exact_init, /* plugin init */ - tokudb_user_data_exact_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - NULL, /* config options */ -#if MYSQL_VERSION_ID >= 50521 - 0, /* flags */ -#endif 
-}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_user_data_exact_information_schema, - "TokuDB_file_map", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_dictionary_info_init, /* plugin init */ - tokudb_dictionary_info_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - NULL, /* config options */ -#if MYSQL_VERSION_ID >= 50521 - 0, /* flags */ -#endif -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_fractal_tree_info_information_schema, - "TokuDB_fractal_tree_info", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_fractal_tree_info_init, /* plugin init */ - tokudb_fractal_tree_info_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - NULL, /* config options */ -#if MYSQL_VERSION_ID >= 50521 - 0, /* flags */ -#endif -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_fractal_tree_block_map_information_schema, - "TokuDB_fractal_tree_block_map", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_fractal_tree_block_map_init, /* plugin init */ - tokudb_fractal_tree_block_map_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - NULL, /* config options */ -#if MYSQL_VERSION_ID >= 50521 - 0, /* flags */ -#endif -} -mysql_declare_plugin_end; - -#ifdef MARIA_PLUGIN_INTERFACE_VERSION - -maria_declare_plugin(tokudb) -{ - MYSQL_STORAGE_ENGINE_PLUGIN, - &tokudb_storage_engine, - tokudb_hton_name, - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_init_func, /* plugin init */ - tokudb_done_func, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - toku_global_status_variables_export, /* 
status variables */ - tokudb_system_variables, /* system variables */ - TOKUDB_PLUGIN_VERSION_STR, /* string version */ - MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_user_data_information_schema, - "TokuDB_user_data", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_user_data_init, /* plugin init */ - tokudb_user_data_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - TOKUDB_PLUGIN_VERSION_STR, /* string version */ - MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_user_data_exact_information_schema, - "TokuDB_user_data_exact", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_user_data_exact_init, /* plugin init */ - tokudb_user_data_exact_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - TOKUDB_PLUGIN_VERSION_STR, /* string version */ - MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_user_data_exact_information_schema, - "TokuDB_file_map", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_dictionary_info_init, /* plugin init */ - tokudb_dictionary_info_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - TOKUDB_PLUGIN_VERSION_STR, /* string version */ - MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_fractal_tree_info_information_schema, - "TokuDB_fractal_tree_info", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_fractal_tree_info_init, /* plugin init */ - 
tokudb_fractal_tree_info_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - TOKUDB_PLUGIN_VERSION_STR, /* string version */ - MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ -}, -{ - MYSQL_INFORMATION_SCHEMA_PLUGIN, - &tokudb_fractal_tree_block_map_information_schema, - "TokuDB_fractal_tree_block_map", - "Tokutek Inc", - "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology", - PLUGIN_LICENSE_GPL, - tokudb_fractal_tree_block_map_init, /* plugin init */ - tokudb_fractal_tree_block_map_done, /* plugin deinit */ - TOKUDB_PLUGIN_VERSION, /* 4.0.0 */ - NULL, /* status variables */ - NULL, /* system variables */ - TOKUDB_PLUGIN_VERSION_STR, /* string version */ - MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ -} -maria_declare_plugin_end; - -#endif +/* -*- mode: C; c-basic-offset: 4 -*- */ +#define MYSQL_SERVER 1 +#include "hatoku_defines.h" +#include + +#include "stdint.h" +#if defined(_WIN32) +#include "misc.h" +#endif +#define __STDC_FORMAT_MACROS +#include +#include "toku_os.h" +#include "toku_time.h" +#include "partitioned_counter.h" + +/* We define DTRACE after mysql_priv.h in case it disabled dtrace in the main server */ +#ifdef HAVE_DTRACE +#define _DTRACE_VERSION 1 +#else +#endif + +#include +#include "hatoku_hton.h" +#include "ha_tokudb.h" + +#undef PACKAGE +#undef VERSION +#undef HAVE_DTRACE +#undef _DTRACE_VERSION + +#define TOKU_METADB_NAME "tokudb_meta" + +typedef struct savepoint_info { + DB_TXN* txn; + tokudb_trx_data* trx; + bool in_sub_stmt; +} *SP_INFO, SP_INFO_T; + +static uchar *tokudb_get_key(TOKUDB_SHARE * share, size_t * length, my_bool not_used __attribute__ ((unused))) { + *length = share->table_name_length; + return (uchar *) share->table_name; +} + +static handler *tokudb_create_handler(handlerton * hton, TABLE_SHARE * table, MEM_ROOT * mem_root); + +static MYSQL_THDVAR_BOOL(commit_sync, + PLUGIN_VAR_THDLOCAL, + "sync on txn commit", + /* check */ NULL, + /* 
update */ NULL, + /* default*/ true +); +static MYSQL_THDVAR_UINT(pk_insert_mode, + 0, + "set the primary key insert mode", + NULL, + NULL, + 1, // default + 0, // min? + 2, // max + 1 // blocksize +); +static MYSQL_THDVAR_BOOL(load_save_space, + 0, + "if on, intial loads are slower but take less space", + NULL, + NULL, + false +); +static MYSQL_THDVAR_BOOL(disable_slow_alter, + 0, + "if on, alter tables that require copy are disabled", + NULL, + NULL, + false +); +static MYSQL_THDVAR_BOOL(disable_hot_alter, + 0, + "if on, hot alter table is disabled", + NULL, + NULL, + false +); +static MYSQL_THDVAR_BOOL(create_index_online, + 0, + "if on, create index done online", + NULL, + NULL, + true +); +static MYSQL_THDVAR_BOOL(disable_prefetching, + 0, + "if on, prefetching disabled", + NULL, + NULL, + false +); +static MYSQL_THDVAR_BOOL(prelock_empty, + 0, + "Tokudb Prelock Empty Table", + NULL, + NULL, + true +); +static MYSQL_THDVAR_BOOL(log_client_errors, + 0, + "Tokudb Log Client Errors", + NULL, + NULL, + false +); +static MYSQL_THDVAR_UINT(block_size, + 0, + "fractal tree block size", + NULL, + NULL, + 4<<20, // default + 4096, // min + ~0U, // max + 1 // blocksize??? +); +static MYSQL_THDVAR_UINT(read_block_size, + 0, + "fractal tree read block size", + NULL, + NULL, + 128*1024, // default + 4096, // min + ~0U, // max + 1 // blocksize??? +); +static MYSQL_THDVAR_UINT(read_buf_size, + 0, + "fractal tree read block size", //TODO: Is this a typo? + NULL, + NULL, + 128*1024, // default + 0, // min + 1*1024*1024, // max + 1 // blocksize??? 
+); +#if TOKU_INCLUDE_UPSERT +static MYSQL_THDVAR_BOOL(disable_slow_update, + PLUGIN_VAR_THDLOCAL, + "disable slow update", + NULL, // check + NULL, // update + false // default +); +static MYSQL_THDVAR_BOOL(disable_slow_upsert, + PLUGIN_VAR_THDLOCAL, + "disable slow upsert", + NULL, // check + NULL, // update + false // default +); +#endif +static MYSQL_THDVAR_UINT(analyze_time, + 0, + "analyze time", + NULL, + NULL, + 60, // default + 0, // min + ~0U, // max + 1 // blocksize??? +); + +static void tokudb_checkpoint_lock(THD * thd); +static void tokudb_checkpoint_unlock(THD * thd); + +static void tokudb_checkpoint_lock_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + my_bool* val = (my_bool *) var_ptr; + *val= *(my_bool *) save ? true : false; + if (*val) { + tokudb_checkpoint_lock(thd); + } + else { + tokudb_checkpoint_unlock(thd); + } +} + +static MYSQL_THDVAR_BOOL(checkpoint_lock, + 0, + "Tokudb Checkpoint Lock", + NULL, + tokudb_checkpoint_lock_update, + false +); + +static const char *tokudb_row_format_names[] = { + "tokudb_uncompressed", + "tokudb_zlib", + "tokudb_quicklz", + "tokudb_lzma", + "tokudb_fast", + "tokudb_small", + "tokudb_default", + NullS +}; + +static TYPELIB tokudb_row_format_typelib = { + array_elements(tokudb_row_format_names) - 1, + "tokudb_row_format_typelib", + tokudb_row_format_names, + NULL +}; + +static MYSQL_THDVAR_ENUM(row_format, PLUGIN_VAR_OPCMDARG, + "Specifies the compression method for a table during this session. 
" + "Possible values are TOKUDB_UNCOMPRESSED, TOKUDB_ZLIB, TOKUDB_QUICKLZ, " + "TOKUDB_LZMA, TOKUDB_FAST, TOKUDB_SMALL and TOKUDB_DEFAULT", + NULL, NULL, SRV_ROW_FORMAT_DEFAULT, &tokudb_row_format_typelib); + +srv_row_format_t get_row_format(THD *thd) +{ + return (srv_row_format_t) THDVAR(thd, row_format); +} + +static void tokudb_print_error(const DB_ENV * db_env, const char *db_errpfx, const char *buffer); +static void tokudb_cleanup_log_files(void); +static int tokudb_end(handlerton * hton, ha_panic_function type); +static bool tokudb_flush_logs(handlerton * hton); +static bool tokudb_show_status(handlerton * hton, THD * thd, stat_print_fn * print, enum ha_stat_type); +static int tokudb_close_connection(handlerton * hton, THD * thd); +static int tokudb_commit(handlerton * hton, THD * thd, bool all); +static int tokudb_rollback(handlerton * hton, THD * thd, bool all); +#if TOKU_INCLUDE_XA +static int tokudb_xa_prepare(handlerton* hton, THD* thd, bool all); +static int tokudb_xa_recover(handlerton* hton, XID* xid_list, uint len); +static int tokudb_commit_by_xid(handlerton* hton, XID* xid); +static int tokudb_rollback_by_xid(handlerton* hton, XID* xid); +#endif + +static int tokudb_rollback_to_savepoint(handlerton * hton, THD * thd, void *savepoint); +static int tokudb_savepoint(handlerton * hton, THD * thd, void *savepoint); +static int tokudb_release_savepoint(handlerton * hton, THD * thd, void *savepoint); +static int tokudb_discover(handlerton *hton, THD* thd, const char *db, const char *name, uchar **frmblob, size_t *frmlen); +static int tokudb_discover2(handlerton *hton, THD* thd, const char *db, const char *name, bool translate_name,uchar **frmblob, size_t *frmlen); +handlerton *tokudb_hton; + +const char *ha_tokudb_ext = ".tokudb"; +char *tokudb_data_dir; +ulong tokudb_debug; +DB_ENV *db_env; +DB* metadata_db; +HASH tokudb_open_tables; +pthread_mutex_t tokudb_mutex; +pthread_mutex_t tokudb_meta_mutex; + +static PARTITIONED_COUNTER 
tokudb_primary_key_bytes_inserted; +void toku_hton_update_primary_key_bytes_inserted(uint64_t row_size) { + increment_partitioned_counter(tokudb_primary_key_bytes_inserted, row_size); +} + +static ulonglong tokudb_lock_timeout; +static ulong tokudb_cleaner_period; +static ulong tokudb_cleaner_iterations; + +#define ASSERT_MSGLEN 1024 + +void toku_hton_assert_fail(const char* expr_as_string, const char * fun, const char * file, int line, int caller_errno) { + char msg[ASSERT_MSGLEN]; + if (db_env) { + snprintf(msg, ASSERT_MSGLEN, "Handlerton: %s ", expr_as_string); + db_env->crash(db_env, msg, fun, file, line,caller_errno); + } + else { + snprintf(msg, ASSERT_MSGLEN, "Handlerton assertion failed, no env, %s, %d, %s, %s (errno=%d)\n", file, line, fun, expr_as_string, caller_errno); + perror(msg); + fflush(stderr); + } + abort(); +} + +//my_bool tokudb_shared_data = false; +static uint32_t tokudb_init_flags = + DB_CREATE | DB_THREAD | DB_PRIVATE | + DB_INIT_LOCK | + DB_INIT_MPOOL | + DB_INIT_TXN | + DB_INIT_LOG | + DB_RECOVER; +static uint32_t tokudb_env_flags = 0; +// static uint32_t tokudb_lock_type = DB_LOCK_DEFAULT; +// static ulong tokudb_log_buffer_size = 0; +// static ulong tokudb_log_file_size = 0; +static my_bool tokudb_directio = FALSE; +static my_bool tokudb_checkpoint_on_flush_logs = FALSE; +static ulonglong tokudb_cache_size = 0; +static ulonglong tokudb_max_lock_memory = 0; +static char *tokudb_home; +static char *tokudb_tmp_dir; +static char *tokudb_log_dir; +// static long tokudb_lock_scan_time = 0; +// static ulong tokudb_region_size = 0; +// static ulong tokudb_cache_parts = 1; +const char *tokudb_hton_name = "TokuDB"; +static uint32_t tokudb_checkpointing_period; +static uint32_t tokudb_fsync_log_period; +uint32_t tokudb_write_status_frequency; +uint32_t tokudb_read_status_frequency; +#ifdef TOKUDB_VERSION +char *tokudb_version = (char*) TOKUDB_VERSION; +#else +char *tokudb_version; +#endif +static int tokudb_fs_reserve_percent; // file system 
reserve as a percentage of total disk space + +#if defined(_WIN32) +extern "C" { +#include "ydb.h" +} +#endif + +// A flag set if the handlerton is in an initialized, usable state, +// plus a reader-write lock to protect it without serializing reads. +// Since we don't have static initializers for the opaque rwlock type, +// use constructor and destructor functions to create and destroy +// the lock before and after main(), respectively. +static int tokudb_hton_initialized; +static rw_lock_t tokudb_hton_initialized_lock; + +static void create_tokudb_hton_intialized_lock(void) __attribute__((constructor)); +static void destroy_tokudb_hton_initialized_lock(void) __attribute__((destructor)); + +static void create_tokudb_hton_intialized_lock(void) +{ + my_rwlock_init(&tokudb_hton_initialized_lock, 0); +} + +static void destroy_tokudb_hton_initialized_lock(void) +{ + rwlock_destroy(&tokudb_hton_initialized_lock); +} + +static SHOW_VAR *toku_global_status_variables = NULL; +static uint64_t toku_global_status_max_rows; +static TOKU_ENGINE_STATUS_ROW_S* toku_global_status_rows = NULL; + +static int tokudb_init_func(void *p) { + TOKUDB_DBUG_ENTER("tokudb_init_func"); + int r; +#if defined(_WIN64) + r = toku_ydb_init(); + if (r) { + printf("got error %d\n", r); + goto error; + } +#endif + + // 3938: lock the handlerton's initialized status flag for writing + r = rw_wrlock(&tokudb_hton_initialized_lock); + assert(r == 0); + + db_env = NULL; + metadata_db = NULL; + + tokudb_hton = (handlerton *) p; + + pthread_mutex_init(&tokudb_mutex, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&tokudb_meta_mutex, MY_MUTEX_INIT_FAST); + (void) my_hash_init(&tokudb_open_tables, table_alias_charset, 32, 0, 0, (my_hash_get_key) tokudb_get_key, 0, 0); + + tokudb_hton->state = SHOW_OPTION_YES; + // tokudb_hton->flags= HTON_CAN_RECREATE; // QQQ this came from skeleton + tokudb_hton->flags = HTON_CLOSE_CURSORS_AT_COMMIT; + +#if TOKU_INCLUDE_OTHER_DB_TYPE + // we have historically been a dynamic 
storage engine, so we set db_type according. + // however, extended keys is triggered off of the db_type, so tokudb adds another type so that extended keys works + tokudb_hton->db_type = DB_TYPE_UNKNOWN; + tokudb_hton->other_db_type = DB_TYPE_TOKUDB; +#else + tokudb_hton->db_type = DB_TYPE_TOKUDB; +#endif + + tokudb_hton->create = tokudb_create_handler; + tokudb_hton->close_connection = tokudb_close_connection; + + tokudb_hton->savepoint_offset = sizeof(SP_INFO_T); + tokudb_hton->savepoint_set = tokudb_savepoint; + tokudb_hton->savepoint_rollback = tokudb_rollback_to_savepoint; + tokudb_hton->savepoint_release = tokudb_release_savepoint; + + tokudb_hton->discover = tokudb_discover; +#if defined(MYSQL_HANDLERTON_INCLUDE_DISCOVER2) + tokudb_hton->discover2 = tokudb_discover2; +#endif + tokudb_hton->commit = tokudb_commit; + tokudb_hton->rollback = tokudb_rollback; +#if TOKU_INCLUDE_XA + tokudb_hton->prepare=tokudb_xa_prepare; + tokudb_hton->recover=tokudb_xa_recover; + tokudb_hton->commit_by_xid=tokudb_commit_by_xid; + tokudb_hton->rollback_by_xid=tokudb_rollback_by_xid; +#endif + + tokudb_hton->panic = tokudb_end; + tokudb_hton->flush_logs = tokudb_flush_logs; + tokudb_hton->show_status = tokudb_show_status; + if (!tokudb_home) + tokudb_home = mysql_real_data_home; + DBUG_PRINT("info", ("tokudb_home: %s", tokudb_home)); +#if 0 + if (!tokudb_log_buffer_size) { // QQQ + tokudb_log_buffer_size = max(table_cache_size * 512, 32 * 1024); + DBUG_PRINT("info", ("computing tokudb_log_buffer_size %ld\n", tokudb_log_buffer_size)); + } + tokudb_log_file_size = tokudb_log_buffer_size * 4; + tokudb_log_file_size = MY_ALIGN(tokudb_log_file_size, 1024 * 1024L); + tokudb_log_file_size = max(tokudb_log_file_size, 10 * 1024 * 1024L); + DBUG_PRINT("info", ("computing tokudb_log_file_size: %ld\n", tokudb_log_file_size)); +#endif + if ((r = db_env_create(&db_env, 0))) { + DBUG_PRINT("info", ("db_env_create %d\n", r)); + goto error; + } + + DBUG_PRINT("info", ("tokudb_env_flags: 0x%x\n", 
tokudb_env_flags)); + r = db_env->set_flags(db_env, tokudb_env_flags, 1); + if (r) { // QQQ + if (tokudb_debug & TOKUDB_DEBUG_INIT) + TOKUDB_TRACE("%s:WARNING: flags=%x r=%d\n", __FUNCTION__, tokudb_env_flags, r); + // goto error; + } + + // config error handling + db_env->set_errcall(db_env, tokudb_print_error); + db_env->set_errpfx(db_env, "TokuDB"); + + // + // set default comparison functions + // + r = db_env->set_default_bt_compare(db_env, tokudb_cmp_dbt_key); + if (r) { + DBUG_PRINT("info", ("set_default_bt_compare%d\n", r)); + goto error; + } + + { + char *tmp_dir = tokudb_tmp_dir; + char *data_dir = tokudb_data_dir; + if (data_dir == 0) { + data_dir = mysql_data_home; + } + if (tmp_dir == 0) { + tmp_dir = data_dir; + } + DBUG_PRINT("info", ("tokudb_data_dir: %s\n", data_dir)); + db_env->set_data_dir(db_env, data_dir); + + DBUG_PRINT("info", ("tokudb_tmp_dir: %s\n", tmp_dir)); + db_env->set_tmp_dir(db_env, tmp_dir); + } + + if (tokudb_log_dir) { + DBUG_PRINT("info", ("tokudb_log_dir: %s\n", tokudb_log_dir)); + db_env->set_lg_dir(db_env, tokudb_log_dir); + } + + // config the cache table size to min(1/2 of physical memory, 1/8 of the process address space) + if (tokudb_cache_size == 0) { + uint64_t physmem, maxdata; + physmem = toku_os_get_phys_memory_size(); + tokudb_cache_size = physmem / 2; + r = toku_os_get_max_process_data_size(&maxdata); + if (r == 0) { + if (tokudb_cache_size > maxdata / 8) + tokudb_cache_size = maxdata / 8; + } + } + if (tokudb_cache_size) { + DBUG_PRINT("info", ("tokudb_cache_size: %lld\n", tokudb_cache_size)); + r = db_env->set_cachesize(db_env, (uint32_t)(tokudb_cache_size >> 30), (uint32_t)(tokudb_cache_size % (1024L * 1024L * 1024L)), 1); + if (r) { + DBUG_PRINT("info", ("set_cachesize %d\n", r)); + goto error; + } + } + if (tokudb_max_lock_memory == 0) { + tokudb_max_lock_memory = tokudb_cache_size/8; + } + if (tokudb_max_lock_memory) { + DBUG_PRINT("info", ("tokudb_max_lock_memory: %lld\n", tokudb_max_lock_memory)); + r = 
db_env->set_lk_max_memory(db_env, tokudb_max_lock_memory); + if (r) { + DBUG_PRINT("info", ("set_lk_max_memory %d\n", r)); + goto error; + } + } + + uint32_t gbytes, bytes; int parts; + r = db_env->get_cachesize(db_env, &gbytes, &bytes, &parts); + if (r == 0) + if (tokudb_debug & TOKUDB_DEBUG_INIT) + TOKUDB_TRACE("%s:tokudb_cache_size=%lld\n", __FUNCTION__, ((unsigned long long) gbytes << 30) + bytes); + +#if 0 + // QQQ config the logs + DBUG_PRINT("info", ("tokudb_log_file_size: %ld\n", tokudb_log_file_size)); + db_env->set_lg_max(db_env, tokudb_log_file_size); + DBUG_PRINT("info", ("tokudb_log_buffer_size: %ld\n", tokudb_log_buffer_size)); + db_env->set_lg_bsize(db_env, tokudb_log_buffer_size); + // DBUG_PRINT("info",("tokudb_region_size: %ld\n", tokudb_region_size)); + // db_env->set_lg_regionmax(db_env, tokudb_region_size); +#endif + + if (db_env->set_redzone) { + r = db_env->set_redzone(db_env, tokudb_fs_reserve_percent); + if (r && (tokudb_debug & TOKUDB_DEBUG_INIT)) + TOKUDB_TRACE("%s:%d r=%d\n", __FUNCTION__, __LINE__, r); + } + + if (tokudb_debug & TOKUDB_DEBUG_INIT) TOKUDB_TRACE("%s:env open:flags=%x\n", __FUNCTION__, tokudb_init_flags); + + r = db_env->set_generate_row_callback_for_put(db_env,generate_row_for_put); + assert(r == 0); + r = db_env->set_generate_row_callback_for_del(db_env,generate_row_for_del); + assert(r == 0); + db_env->set_update(db_env, tokudb_update_fun); + db_env_set_direct_io(tokudb_directio == TRUE); + db_env->change_fsync_log_period(db_env, tokudb_fsync_log_period); + r = db_env->open(db_env, tokudb_home, tokudb_init_flags, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH); + + if (tokudb_debug & TOKUDB_DEBUG_INIT) TOKUDB_TRACE("%s:env opened:return=%d\n", __FUNCTION__, r); + + if (r) { + DBUG_PRINT("info", ("env->open %d\n", r)); + goto error; + } + + r = db_env->checkpointing_set_period(db_env, tokudb_checkpointing_period); + assert(r == 0); + r = db_env->cleaner_set_period(db_env, tokudb_cleaner_period); + assert(r == 0); + r = 
db_env->cleaner_set_iterations(db_env, tokudb_cleaner_iterations); + assert(r == 0); + + r = db_env->set_lock_timeout(db_env, tokudb_lock_timeout); + assert(r == 0); + + r = db_env->get_engine_status_num_rows (db_env, &toku_global_status_max_rows); + assert(r == 0); + + { + const myf mem_flags = MY_FAE|MY_WME|MY_ZEROFILL|MY_ALLOW_ZERO_PTR|MY_FREE_ON_ERROR; + toku_global_status_variables = (SHOW_VAR*)my_malloc(sizeof(*toku_global_status_variables)*toku_global_status_max_rows, mem_flags); + toku_global_status_rows = (TOKU_ENGINE_STATUS_ROW_S*)my_malloc(sizeof(*toku_global_status_rows)*toku_global_status_max_rows, mem_flags); + } + + r = db_create(&metadata_db, db_env, 0); + if (r) { + DBUG_PRINT("info", ("failed to create metadata db %d\n", r)); + goto error; + } + + + r= metadata_db->open(metadata_db, NULL, TOKU_METADB_NAME, NULL, DB_BTREE, DB_THREAD, 0); + if (r) { + if (r != ENOENT) { + sql_print_error("Got error %d when trying to open metadata_db", r); + goto error; + } + r = metadata_db->close(metadata_db,0); + assert(r == 0); + r = db_create(&metadata_db, db_env, 0); + if (r) { + DBUG_PRINT("info", ("failed to create metadata db %d\n", r)); + goto error; + } + + r= metadata_db->open(metadata_db, NULL, TOKU_METADB_NAME, NULL, DB_BTREE, DB_THREAD | DB_CREATE | DB_EXCL, my_umask); + if (r) { + goto error; + } + } + + + + tokudb_primary_key_bytes_inserted = create_partitioned_counter(); + + //3938: succeeded, set the init status flag and unlock + tokudb_hton_initialized = 1; + rw_unlock(&tokudb_hton_initialized_lock); + DBUG_RETURN(false); + +error: + if (metadata_db) { + int rr = metadata_db->close(metadata_db, 0); + assert(rr==0); + } + if (db_env) { + int rr= db_env->close(db_env, 0); + assert(rr==0); + db_env = 0; + } + + // 3938: failed to initialized, drop the flag and lock + tokudb_hton_initialized = 0; + rw_unlock(&tokudb_hton_initialized_lock); + DBUG_RETURN(true); +} + +static int tokudb_done_func(void *p) { + TOKUDB_DBUG_ENTER("tokudb_done_func"); + { + 
const myf mem_flags = MY_FAE|MY_WME|MY_ZEROFILL|MY_ALLOW_ZERO_PTR|MY_FREE_ON_ERROR; + my_free(toku_global_status_variables, mem_flags); + my_free(toku_global_status_rows, mem_flags); + toku_global_status_variables = NULL; + toku_global_status_rows = NULL; + } + my_hash_free(&tokudb_open_tables); + pthread_mutex_destroy(&tokudb_mutex); + pthread_mutex_destroy(&tokudb_meta_mutex); +#if defined(_WIN64) + toku_ydb_destroy(); +#endif + TOKUDB_DBUG_RETURN(0); +} + +static handler *tokudb_create_handler(handlerton * hton, TABLE_SHARE * table, MEM_ROOT * mem_root) { + return new(mem_root) ha_tokudb(hton, table); +} + +int tokudb_end(handlerton * hton, ha_panic_function type) { + TOKUDB_DBUG_ENTER("tokudb_end"); + int error = 0; + + // 3938: if we finalize the storage engine plugin, it is no longer + // initialized. grab a writer lock for the duration of the + // call, so we can drop the flag and destroy the mutexes + // in isolation. + rw_wrlock(&tokudb_hton_initialized_lock); + assert(tokudb_hton_initialized); + + if (metadata_db) { + int r = metadata_db->close(metadata_db, 0); + assert(r == 0); + } + if (db_env) { + if (tokudb_init_flags & DB_INIT_LOG) + tokudb_cleanup_log_files(); + error = db_env->close(db_env, 0); // Error is logged + assert(error==0); + db_env = NULL; + } + + // 3938: drop the initialized flag and unlock + tokudb_hton_initialized = 0; + rw_unlock(&tokudb_hton_initialized_lock); + + TOKUDB_DBUG_RETURN(error); +} + +static int tokudb_close_connection(handlerton * hton, THD * thd) { + int error = 0; + tokudb_trx_data* trx = NULL; + trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + if (trx && trx->checkpoint_lock_taken) { + error = db_env->checkpointing_resume(db_env); + } + my_free(trx, MYF(0)); + return error; +} + +bool tokudb_flush_logs(handlerton * hton) { + TOKUDB_DBUG_ENTER("tokudb_flush_logs"); + int error; + bool result = 0; + + if (tokudb_checkpoint_on_flush_logs) { + // + // take the checkpoint + // + error = 
db_env->txn_checkpoint(db_env, 0, 0, 0); + if (error) { + my_error(ER_ERROR_DURING_CHECKPOINT, MYF(0), error); + result = 1; + goto exit; + } + } + else { + error = db_env->log_flush(db_env, NULL); + assert(error == 0); + } + + result = 0; +exit: + TOKUDB_DBUG_RETURN(result); +} + +uint get_pk_insert_mode(THD* thd) { + return THDVAR(thd, pk_insert_mode); +} + +bool get_load_save_space(THD* thd) { + return (THDVAR(thd, load_save_space) != 0); +} + +bool get_disable_slow_alter(THD* thd) { + return (THDVAR(thd, disable_slow_alter) != 0); +} + +bool get_disable_hot_alter(THD* thd) { + return THDVAR(thd, disable_hot_alter) != 0; +} + +bool get_create_index_online(THD* thd) { + return (THDVAR(thd, create_index_online) != 0); +} + +bool get_disable_prefetching(THD* thd) { + return (THDVAR(thd, disable_prefetching) != 0); +} + +bool get_prelock_empty(THD* thd) { + return (THDVAR(thd, prelock_empty) != 0); +} + +bool get_log_client_errors(THD* thd) { + return (THDVAR(thd, log_client_errors) != 0); +} + +uint get_tokudb_block_size(THD* thd) { + return THDVAR(thd, block_size); +} + +uint get_tokudb_read_block_size(THD* thd) { + return THDVAR(thd, read_block_size); +} + +uint get_tokudb_read_buf_size(THD* thd) { + return THDVAR(thd, read_buf_size); +} + +#if TOKU_INCLUDE_UPSERT +bool get_disable_slow_update(THD *thd) { + return (THDVAR(thd, disable_slow_update) != 0); +} + +bool get_disable_slow_upsert(THD *thd) { + return (THDVAR(thd, disable_slow_upsert) != 0); +} +#endif +uint get_analyze_time(THD *thd) { + return THDVAR(thd, analyze_time); +} + +typedef struct txn_progress_info { + char status[200]; + THD* thd; +} *TXN_PROGRESS_INFO; + + +void txn_progress_func(TOKU_TXN_PROGRESS progress, void* extra) { + TXN_PROGRESS_INFO progress_info = (TXN_PROGRESS_INFO)extra; + int r; + if (progress->stalled_on_checkpoint) { + if (progress->is_commit) { + r = sprintf( + progress_info->status, + "Writing committed changes to disk, processing commit of transaction, %"PRId64" out of 
%"PRId64, + progress->entries_processed, + progress->entries_total + ); + assert(r >= 0); + } + else { + r = sprintf( + progress_info->status, + "Writing committed changes to disk, processing abort of transaction, %"PRId64" out of %"PRId64, + progress->entries_processed, + progress->entries_total + ); + assert(r >= 0); + } + } + else { + if (progress->is_commit) { + r = sprintf( + progress_info->status, + "processing commit of transaction, %"PRId64" out of %"PRId64, + progress->entries_processed, + progress->entries_total + ); + assert(r >= 0); + } + else { + r = sprintf( + progress_info->status, + "processing abort of transaction, %"PRId64" out of %"PRId64, + progress->entries_processed, + progress->entries_total + ); + assert(r >= 0); + } + } + thd_proc_info(progress_info->thd, progress_info->status); +} + + +static void commit_txn_with_progress(DB_TXN* txn, uint32_t flags, THD* thd) { + int r; + struct txn_progress_info info; + info.thd = thd; + r = txn->commit_with_progress(txn, flags, txn_progress_func, &info); + if (r != 0) { + sql_print_error("tried committing transaction %p and got error code %d", txn, r); + } + assert(r == 0); +} + +static void abort_txn_with_progress(DB_TXN* txn, THD* thd) { + int r; + struct txn_progress_info info; + info.thd = thd; + r = txn->abort_with_progress(txn, txn_progress_func, &info); + if (r != 0) { + sql_print_error("tried aborting transaction %p and got error code %d", txn, r); + } + assert(r == 0); +} + +static int tokudb_commit(handlerton * hton, THD * thd, bool all) { + TOKUDB_DBUG_ENTER("tokudb_commit"); + DBUG_PRINT("trans", ("ending transaction %s", all ? "all" : "stmt")); + uint32_t syncflag = THDVAR(thd, commit_sync) ? 0 : DB_TXN_NOSYNC; + tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + DB_TXN **txn = all ? 
&trx->all : &trx->stmt; + if (*txn) { + if (tokudb_debug & TOKUDB_DEBUG_TXN) { + TOKUDB_TRACE("doing txn commit:%d:%p\n", all, *txn); + } + // test hook to induce a crash on a debug build + DBUG_EXECUTE_IF("tokudb_crash_commit_before", DBUG_SUICIDE();); + commit_txn_with_progress(*txn, syncflag, thd); + // test hook to induce a crash on a debug build + DBUG_EXECUTE_IF("tokudb_crash_commit_after", DBUG_SUICIDE();); + if (*txn == trx->sp_level) { + trx->sp_level = 0; + } + *txn = 0; + trx->sub_sp_level = NULL; + } + else if (tokudb_debug & TOKUDB_DEBUG_TXN) { + TOKUDB_TRACE("nothing to commit %d\n", all); + } + reset_stmt_progress(&trx->stmt_progress); + TOKUDB_DBUG_RETURN(0); +} + +static int tokudb_rollback(handlerton * hton, THD * thd, bool all) { + TOKUDB_DBUG_ENTER("tokudb_rollback"); + DBUG_PRINT("trans", ("aborting transaction %s", all ? "all" : "stmt")); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + DB_TXN **txn = all ? &trx->all : &trx->stmt; + if (*txn) { + if (tokudb_debug & TOKUDB_DEBUG_TXN) { + TOKUDB_TRACE("rollback:%p\n", *txn); + } + abort_txn_with_progress(*txn, thd); + if (*txn == trx->sp_level) { + trx->sp_level = 0; + } + *txn = 0; + trx->sub_sp_level = NULL; + } + else { + if (tokudb_debug & TOKUDB_DEBUG_TXN) { + TOKUDB_TRACE("abort0\n"); + } + } + reset_stmt_progress(&trx->stmt_progress); + TOKUDB_DBUG_RETURN(0); +} + +#if TOKU_INCLUDE_XA + +static int tokudb_xa_prepare(handlerton* hton, THD* thd, bool all) { + TOKUDB_DBUG_ENTER("tokudb_xa_prepare"); + int r = 0; + DBUG_PRINT("trans", ("preparing transaction %s", all ? "all" : "stmt")); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + DB_TXN* txn = all ? 
trx->all : trx->stmt; + if (txn) { + if (tokudb_debug & TOKUDB_DEBUG_TXN) { + TOKUDB_TRACE("doing txn prepare:%d:%p\n", all, txn); + } + // a TOKU_XA_XID is identical to a MYSQL_XID + TOKU_XA_XID thd_xid; + thd_get_xid(thd, (MYSQL_XID*) &thd_xid); + // test hook to induce a crash on a debug build + DBUG_EXECUTE_IF("tokudb_crash_prepare_before", DBUG_SUICIDE();); + r = txn->xa_prepare(txn, &thd_xid); + // test hook to induce a crash on a debug build + DBUG_EXECUTE_IF("tokudb_crash_prepare_after", DBUG_SUICIDE();); + } + else if (tokudb_debug & TOKUDB_DEBUG_TXN) { + TOKUDB_TRACE("nothing to prepare %d\n", all); + } + TOKUDB_DBUG_RETURN(r); +} + +static int tokudb_xa_recover(handlerton* hton, XID* xid_list, uint len) { + TOKUDB_DBUG_ENTER("tokudb_xa_recover"); + int r = 0; + if (len == 0 || xid_list == NULL) { + TOKUDB_DBUG_RETURN(0); + } + long num_returned = 0; + r = db_env->txn_xa_recover( + db_env, + (TOKU_XA_XID*)xid_list, + len, + &num_returned, + DB_NEXT + ); + assert(r == 0); + TOKUDB_DBUG_RETURN((int)num_returned); +} + +static int tokudb_commit_by_xid(handlerton* hton, XID* xid) { + TOKUDB_DBUG_ENTER("tokudb_commit_by_xid"); + int r = 0; + DB_TXN* txn = NULL; + TOKU_XA_XID* toku_xid = (TOKU_XA_XID*)xid; + + r = db_env->get_txn_from_xid(db_env, toku_xid, &txn); + if (r) { goto cleanup; } + + r = txn->commit(txn, 0); + if (r) { goto cleanup; } + + r = 0; +cleanup: + TOKUDB_DBUG_RETURN(r); +} + +static int tokudb_rollback_by_xid(handlerton* hton, XID* xid) { + TOKUDB_DBUG_ENTER("tokudb_rollback_by_xid"); + int r = 0; + DB_TXN* txn = NULL; + TOKU_XA_XID* toku_xid = (TOKU_XA_XID*)xid; + + r = db_env->get_txn_from_xid(db_env, toku_xid, &txn); + if (r) { goto cleanup; } + + r = txn->abort(txn); + if (r) { goto cleanup; } + + r = 0; +cleanup: + TOKUDB_DBUG_RETURN(r); +} + +#endif + +static int tokudb_savepoint(handlerton * hton, THD * thd, void *savepoint) { + TOKUDB_DBUG_ENTER("tokudb_savepoint"); + int error; + SP_INFO save_info = (SP_INFO)savepoint; + 
tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + if (thd->in_sub_stmt) { + assert(trx->stmt); + error = db_env->txn_begin(db_env, trx->sub_sp_level, &(save_info->txn), DB_INHERIT_ISOLATION); + if (error) { + goto cleanup; + } + trx->sub_sp_level = save_info->txn; + save_info->in_sub_stmt = true; + } + else { + error = db_env->txn_begin(db_env, trx->sp_level, &(save_info->txn), DB_INHERIT_ISOLATION); + if (error) { + goto cleanup; + } + trx->sp_level = save_info->txn; + save_info->in_sub_stmt = false; + } + save_info->trx = trx; + error = 0; +cleanup: + TOKUDB_DBUG_RETURN(error); +} + +static int tokudb_rollback_to_savepoint(handlerton * hton, THD * thd, void *savepoint) { + TOKUDB_DBUG_ENTER("tokudb_rollback_to_savepoint"); + int error; + SP_INFO save_info = (SP_INFO)savepoint; + DB_TXN* parent = NULL; + DB_TXN* txn_to_rollback = save_info->txn; + + tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + parent = txn_to_rollback->parent; + if (!(error = txn_to_rollback->abort(txn_to_rollback))) { + if (save_info->in_sub_stmt) { + trx->sub_sp_level = parent; + } + else { + trx->sp_level = parent; + } + error = tokudb_savepoint(hton, thd, savepoint); + } + TOKUDB_DBUG_RETURN(error); +} + +static int tokudb_release_savepoint(handlerton * hton, THD * thd, void *savepoint) { + TOKUDB_DBUG_ENTER("tokudb_release_savepoint"); + int error; + + SP_INFO save_info = (SP_INFO)savepoint; + DB_TXN* parent = NULL; + DB_TXN* txn_to_commit = save_info->txn; + + tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + parent = txn_to_commit->parent; + if (!(error = txn_to_commit->commit(txn_to_commit, 0))) { + if (save_info->in_sub_stmt) { + trx->sub_sp_level = parent; + } + else { + trx->sp_level = parent; + } + save_info->txn = NULL; + } + TOKUDB_DBUG_RETURN(error); +} + +static int tokudb_discover(handlerton *hton, THD* thd, const char *db, const char *name, uchar **frmblob, size_t *frmlen) { + return 
tokudb_discover2(hton, thd, db, name, true, frmblob, frmlen);
+}
+
+// Fetch the hatoku_frm_data entry from the status dictionary of table db/name.
+// A fresh transaction is started, the status dictionary is opened under the path
+// produced by build_table_filename (FN_IS_TMP is passed when translate_name is false),
+// and the entry is read through smart_dbt_callback_verify_frm into a local DBT.
+// On success the DBT's data/size are handed back through frmblob/frmlen and 0 is
+// returned; otherwise the first error encountered is returned.
+static int tokudb_discover2(handlerton *hton, THD* thd, const char *db, const char *name, bool translate_name,
+        uchar **frmblob, size_t *frmlen) {
+    TOKUDB_DBUG_ENTER("tokudb_discover");
+    int error;
+    DB* status_db = NULL;
+    DB_TXN* txn = NULL;
+    char path[FN_REFLEN + 1];
+    HA_METADATA_KEY curr_key = hatoku_frm_data;
+    DBT key, value;
+    memset(&key, 0, sizeof(key));
+    // zero the whole DBT: sizeof(&value) would only clear pointer-size bytes
+    memset(&value, 0, sizeof(value));
+
+    error = db_env->txn_begin(db_env, 0, &txn, 0);
+    if (error) { goto cleanup; }
+
+    build_table_filename(path, sizeof(path) - 1, db, name, "", translate_name ? 0 : FN_IS_TMP);
+    error = open_status_dictionary(&status_db, path, txn);
+    if (error) { goto cleanup; }
+
+    key.data = &curr_key;
+    key.size = sizeof(curr_key);
+
+    error = status_db->getf_set(
+        status_db,
+        txn,
+        0,
+        &key,
+        smart_dbt_callback_verify_frm,
+        &value
+        );
+    if (error) {
+        goto cleanup;
+    }
+
+    *frmblob = (uchar *)value.data;
+    *frmlen = value.size;
+
+    error = 0;
+cleanup:
+    // both resources are released on the success path and on every error path
+    if (status_db) {
+        status_db->close(status_db,0);
+    }
+    if (txn) {
+        commit_txn(txn, 0);
+    }
+    TOKUDB_DBUG_RETURN(error);
+}
+
+// Split name at its last '/': the table name is everything after it. The database
+// name is everything before it when name contains a single '/', otherwise the text
+// between the first and the last '/'. dbname, tablename and size are stored into
+// fields 0, 1 and 2 of table and the row is emitted with schema_table_store_record,
+// whose result is returned. name must contain at least one '/' (asserted).
+static int store_dbname_tablename_size(TABLE *table, char *name, uint64_t size, THD *thd) {
+    char *tp = strrchr(name, '/');
+    assert(tp);
+    char *tablename = tp + 1;
+    size_t tablename_length = strlen(tablename);
+
+    char *dp = strchr(name, '/');
+    char *dbname;
+    size_t dbname_length;
+    if (dp == tp) {
+        dbname = name;
+        dbname_length = tp - dbname;
+    } else {
+        dbname = dp + 1;
+        dbname_length = tp - dbname;
+    }
+
+    table->field[0]->store(dbname, dbname_length, system_charset_info);
+    table->field[1]->store(tablename, tablename_length, system_charset_info);
+    table->field[2]->store(size, false);
+    int error = schema_table_store_record(thd, table);
+    return error;
+}
+
+static int tokudb_dictionary_info(TABLE *table, THD *thd) {
+    int error;
+    DB_TXN* txn = NULL;
+    DBC* tmp_cursor = NULL;
+    DBT curr_key;
+    DBT curr_val;
+
memset(&curr_key, 0, sizeof curr_key); + memset(&curr_val, 0, sizeof curr_val); + error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); + if (error) { + goto cleanup; + } + error = db_env->get_cursor_for_directory(db_env, txn, &tmp_cursor); + if (error) { + goto cleanup; + } + while (error == 0) { + error = tmp_cursor->c_get( + tmp_cursor, + &curr_key, + &curr_val, + DB_NEXT + ); + if (!error) { + // We store the NULL terminator in the directory so it's included in the size. + // See #5789 + // Recalculate and check just to be safe. + size_t dname_len = strlen((const char *)curr_key.data); + size_t iname_len = strlen((const char *)curr_val.data); + assert(dname_len == curr_key.size - 1); + assert(iname_len == curr_val.size - 1); + table->field[0]->store( + (char *)curr_key.data, + dname_len, + system_charset_info + ); + table->field[1]->store( + (char *)curr_val.data, + iname_len, + system_charset_info + ); + error = schema_table_store_record(thd, table); + } + } + if (error == DB_NOTFOUND) { + error = 0; + } +cleanup: + if (tmp_cursor) { + int r = tmp_cursor->c_close(tmp_cursor); + assert(r == 0); + } + if (txn) { + commit_txn(txn, 0); + } + return error; +} + +static int tokudb_report_fractal_tree_info_for_db(const DBT *dname, const DBT *iname, TABLE *table, THD *thd) { + int error; + DB *db; + uint64_t bt_num_blocks_allocated; + uint64_t bt_num_blocks_in_use; + uint64_t bt_size_allocated; + uint64_t bt_size_in_use; + + error = db_create(&db, db_env, 0); + if (error) { + goto exit; + } + error = db->open(db, NULL, (char *)dname->data, NULL, DB_BTREE, 0, 0666); + if (error) { + goto exit; + } + error = db->get_fractal_tree_info64(db, + &bt_num_blocks_allocated, &bt_num_blocks_in_use, + &bt_size_allocated, &bt_size_in_use); + { + int close_error = db->close(db, 0); + if (!error) { + error = close_error; + } + } + if (error) { + goto exit; + } + + // We store the NULL terminator in the directory so it's included in the size. 
+ // See #5789 + // Recalculate and check just to be safe. + { + size_t dname_len = strlen((const char *)dname->data); + size_t iname_len = strlen((const char *)iname->data); + assert(dname_len == dname->size - 1); + assert(iname_len == iname->size - 1); + table->field[0]->store( + (char *)dname->data, + dname_len, + system_charset_info + ); + table->field[1]->store( + (char *)iname->data, + iname_len, + system_charset_info + ); + } + table->field[2]->store(bt_num_blocks_allocated, false); + table->field[3]->store(bt_num_blocks_in_use, false); + table->field[4]->store(bt_size_allocated, false); + table->field[5]->store(bt_size_in_use, false); + + error = schema_table_store_record(thd, table); + +exit: + return error; +} + +static int tokudb_fractal_tree_info(TABLE *table, THD *thd) { + int error; + DB_TXN* txn = NULL; + DBC* tmp_cursor = NULL; + DBT curr_key; + DBT curr_val; + memset(&curr_key, 0, sizeof curr_key); + memset(&curr_val, 0, sizeof curr_val); + error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); + if (error) { + goto cleanup; + } + error = db_env->get_cursor_for_directory(db_env, txn, &tmp_cursor); + if (error) { + goto cleanup; + } + while (error == 0) { + error = tmp_cursor->c_get( + tmp_cursor, + &curr_key, + &curr_val, + DB_NEXT + ); + if (!error) { + error = tokudb_report_fractal_tree_info_for_db(&curr_key, &curr_val, table, thd); + } + } + if (error == DB_NOTFOUND) { + error = 0; + } +cleanup: + if (tmp_cursor) { + int r = tmp_cursor->c_close(tmp_cursor); + assert(r == 0); + } + if (txn) { + commit_txn(txn, 0); + } + return error; +} + +struct tokudb_report_fractal_tree_block_map_iterator_extra { + int64_t num_rows; + int64_t i; + uint64_t *checkpoint_counts; + int64_t *blocknums; + int64_t *diskoffs; + int64_t *sizes; +}; + +// This iterator is called while holding the blocktable lock. We should be as quick as possible. 
+// We don't want to do one call to get the number of rows, release the blocktable lock, and then do another call to get all the rows because the number of rows may change if we don't hold the lock.
+// As a compromise, we'll do some mallocs inside the lock on the first call, but everything else should be fast.
+//
+// iter_extra must point at a zero-initialized
+// struct tokudb_report_fractal_tree_block_map_iterator_extra. On the first call
+// (num_rows still 0 in the struct) the four result arrays are allocated with
+// num_rows entries each; every call then appends one
+// (checkpoint_count, blocknum, diskoff, size) tuple at index i and advances i.
+// Always returns 0.
+static int tokudb_report_fractal_tree_block_map_iterator(uint64_t checkpoint_count,
+                                                         int64_t num_rows,
+                                                         int64_t blocknum,
+                                                         int64_t diskoff,
+                                                         int64_t size,
+                                                         void *iter_extra) {
+    // the cast needs its target type spelled out; it is the type of e itself
+    struct tokudb_report_fractal_tree_block_map_iterator_extra *e = static_cast<struct tokudb_report_fractal_tree_block_map_iterator_extra *>(iter_extra);
+
+    assert(num_rows > 0);
+    if (e->num_rows == 0) {
+        e->checkpoint_counts = (uint64_t *) my_malloc(num_rows * (sizeof *e->checkpoint_counts), MYF(MY_WME|MY_ZEROFILL|MY_FAE));
+        e->blocknums = (int64_t *) my_malloc(num_rows * (sizeof *e->blocknums), MYF(MY_WME|MY_ZEROFILL|MY_FAE));
+        e->diskoffs = (int64_t *) my_malloc(num_rows * (sizeof *e->diskoffs), MYF(MY_WME|MY_ZEROFILL|MY_FAE));
+        e->sizes = (int64_t *) my_malloc(num_rows * (sizeof *e->sizes), MYF(MY_WME|MY_ZEROFILL|MY_FAE));
+        e->num_rows = num_rows;
+    }
+
+    e->checkpoint_counts[e->i] = checkpoint_count;
+    e->blocknums[e->i] = blocknum;
+    e->diskoffs[e->i] = diskoff;
+    e->sizes[e->i] = size;
+    ++(e->i);
+
+    return 0;
+}
+
+static int tokudb_report_fractal_tree_block_map_for_db(const DBT *dname, const DBT *iname, TABLE *table, THD *thd) {
+    int error;
+    DB *db;
+    struct tokudb_report_fractal_tree_block_map_iterator_extra e = {}; // avoid struct initializers so that we can compile with older gcc versions
+
+    error = db_create(&db, db_env, 0);
+    if (error) {
+        goto exit;
+    }
+    error = db->open(db, NULL, (char *)dname->data, NULL, DB_BTREE, 0, 0666);
+    if (error) {
+        // the handle from db_create still has to be released when open fails
+        // (tokudb_init_func does the same for metadata_db); the open error is
+        // the one reported, so the close result is deliberately ignored
+        (void) db->close(db, 0);
+        goto exit;
+    }
+    error = db->iterate_fractal_tree_block_map(db, tokudb_report_fractal_tree_block_map_iterator, &e);
+    {
+        // always close; only surface the close error if the iteration itself succeeded
+        int close_error = db->close(db, 0);
+        if (!error) {
+            error = close_error;
+        }
+    }
+    if (error) {
+        goto exit;
+    }
+
+    // If not, we should have gotten
an error and skipped this section of code + assert(e.i == e.num_rows); + for (int64_t i = 0; error == 0 && i < e.num_rows; ++i) { + // We store the NULL terminator in the directory so it's included in the size. + // See #5789 + // Recalculate and check just to be safe. + size_t dname_len = strlen((const char *)dname->data); + size_t iname_len = strlen((const char *)iname->data); + assert(dname_len == dname->size - 1); + assert(iname_len == iname->size - 1); + table->field[0]->store( + (char *)dname->data, + dname_len, + system_charset_info + ); + table->field[1]->store( + (char *)iname->data, + iname_len, + system_charset_info + ); + table->field[2]->store(e.checkpoint_counts[i], false); + table->field[3]->store(e.blocknums[i], false); + static const int64_t freelist_null = -1; + static const int64_t diskoff_unused = -2; + if (e.diskoffs[i] == diskoff_unused || e.diskoffs[i] == freelist_null) { + table->field[4]->set_null(); + } else { + table->field[4]->set_notnull(); + table->field[4]->store(e.diskoffs[i], false); + } + static const int64_t size_is_free = -1; + if (e.sizes[i] == size_is_free) { + table->field[5]->set_null(); + } else { + table->field[5]->set_notnull(); + table->field[5]->store(e.sizes[i], false); + } + + error = schema_table_store_record(thd, table); + } + +exit: + if (e.checkpoint_counts != NULL) { + my_free(e.checkpoint_counts, MYF(0)); + e.checkpoint_counts = NULL; + } + if (e.blocknums != NULL) { + my_free(e.blocknums, MYF(0)); + e.blocknums = NULL; + } + if (e.diskoffs != NULL) { + my_free(e.diskoffs, MYF(0)); + e.diskoffs = NULL; + } + if (e.sizes != NULL) { + my_free(e.sizes, MYF(0)); + e.sizes = NULL; + } + return error; +} + +static int tokudb_fractal_tree_block_map(TABLE *table, THD *thd) { + int error; + DB_TXN* txn = NULL; + DBC* tmp_cursor = NULL; + DBT curr_key; + DBT curr_val; + memset(&curr_key, 0, sizeof curr_key); + memset(&curr_val, 0, sizeof curr_val); + error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); + if 
(error) { + goto cleanup; + } + error = db_env->get_cursor_for_directory(db_env, txn, &tmp_cursor); + if (error) { + goto cleanup; + } + while (error == 0) { + error = tmp_cursor->c_get( + tmp_cursor, + &curr_key, + &curr_val, + DB_NEXT + ); + if (!error) { + error = tokudb_report_fractal_tree_block_map_for_db(&curr_key, &curr_val, table, thd); + } + } + if (error == DB_NOTFOUND) { + error = 0; + } +cleanup: + if (tmp_cursor) { + int r = tmp_cursor->c_close(tmp_cursor); + assert(r == 0); + } + if (txn) { + commit_txn(txn, 0); + } + return error; +} + +static int tokudb_get_user_data_size(TABLE *table, THD *thd, bool exact) { + int error; + DB* curr_db = NULL; + DB_TXN* txn = NULL; + DBC* tmp_cursor = NULL; + DBC* tmp_table_cursor = NULL; + DBT curr_key; + DBT curr_val; + DB_TXN* tmp_txn = NULL; + memset(&curr_key, 0, sizeof curr_key); + memset(&curr_val, 0, sizeof curr_val); + pthread_mutex_lock(&tokudb_meta_mutex); + + error = db_env->txn_begin(db_env, 0, &txn, DB_READ_UNCOMMITTED); + if (error) { + goto cleanup; + } + error = metadata_db->cursor(metadata_db, txn, &tmp_cursor, 0); + if (error) { + goto cleanup; + } + while (error == 0) { + tmp_txn = NULL; + // + // here, and in other places, check if process has been killed + // if so, get out of function so user is not stalled + // + if (thd->killed) { + break; + } + error = db_env->txn_begin(db_env, 0, &tmp_txn, DB_READ_UNCOMMITTED); + if (error) { + goto cleanup; + } + + // + // do not need this to be super fast, so use old simple API + // + error = tmp_cursor->c_get( + tmp_cursor, + &curr_key, + &curr_val, + DB_NEXT + ); + if (!error) { + char* name = (char *)curr_key.data; + char* newname; + uint64_t curr_num_bytes = 0; + DB_BTREE_STAT64 dict_stats; + + error = db_create(&curr_db, db_env, 0); + if (error) { goto cleanup; } + + newname = (char *)my_malloc( + get_max_dict_name_path_length(name), + MYF(MY_WME|MY_ZEROFILL|MY_FAE)); + + make_name(newname, name, "main"); + + error = curr_db->open(curr_db, tmp_txn, 
newname, NULL, DB_BTREE, DB_THREAD, 0); + + my_free(newname, MYF(0)); + + if (error == ENOENT) { error = 0; continue; } + if (error) { goto cleanup; } + + if (exact) { + // + // flatten if exact is required + // + uint curr_num_items = 0; + error = curr_db->cursor(curr_db, tmp_txn, &tmp_table_cursor, 0); + if (error) { + tmp_table_cursor = NULL; + goto cleanup; + } + while (error != DB_NOTFOUND) { + error = tmp_table_cursor->c_getf_next(tmp_table_cursor, 0, smart_dbt_do_nothing, NULL); + if (error && error != DB_NOTFOUND) { + goto cleanup; + } + curr_num_items++; + // + // allow early exit if command has been killed + // + if ( (curr_num_items % 1000) == 0 && thd->killed) { + goto cleanup; + } + } + error = tmp_table_cursor->c_close(tmp_table_cursor); + assert(error==0); + tmp_table_cursor = NULL; + } + + error = curr_db->stat64( + curr_db, + tmp_txn, + &dict_stats + ); + if (error) { goto cleanup; } + + curr_num_bytes = dict_stats.bt_dsize; + if (*(uchar *)curr_val.data) { + // + // in this case, we have a hidden primary key, do not + // want to report space taken up by the hidden primary key to the user + // + uint64_t hpk_space = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH*dict_stats.bt_ndata; + curr_num_bytes = (hpk_space > curr_num_bytes) ? 0 : curr_num_bytes - hpk_space; + } + else { + // + // one infinity byte per key needs to be subtracted + // + uint64_t inf_byte_space = dict_stats.bt_ndata; + curr_num_bytes = (inf_byte_space > curr_num_bytes) ? 
0 : curr_num_bytes - inf_byte_space; + } + + error = store_dbname_tablename_size(table, name, curr_num_bytes, thd); + if (error) goto cleanup; + + { + int r = curr_db->close(curr_db, 0); + assert(r == 0); + curr_db = NULL; + } + } + + if (tmp_txn) { + commit_txn(tmp_txn, 0); + tmp_txn = NULL; + } + } + + error = 0; + +cleanup: + if (tmp_cursor) { + int r = tmp_cursor->c_close(tmp_cursor); + assert(r == 0); + } + if (tmp_table_cursor) { + int r = tmp_table_cursor->c_close(tmp_table_cursor); + assert(r == 0); + } + if (curr_db) { + int r = curr_db->close(curr_db, 0); + assert(r == 0); + } + if (tmp_txn) { + commit_txn(tmp_txn, 0); + } + if (txn) { + commit_txn(txn, 0); + } + if (error) { + sql_print_error("got an error %d in show_data_size\n", error); + } + pthread_mutex_unlock(&tokudb_meta_mutex); + return error; +} + +#define STATPRINT(legend, val) if (legend != NULL && val != NULL) stat_print(thd, \ + tokudb_hton_name, \ + strlen(tokudb_hton_name), \ + legend, \ + strlen(legend), \ + val, \ + strlen(val)) + +extern sys_var *intern_find_sys_var(const char *str, uint length, bool no_error); + +static bool tokudb_show_engine_status(THD * thd, stat_print_fn * stat_print) { + TOKUDB_DBUG_ENTER("tokudb_show_engine_status"); + int error; + uint64_t panic; + const int panic_string_len = 1024; + char panic_string[panic_string_len] = {'\0'}; + uint64_t num_rows; + uint64_t max_rows; + fs_redzone_state redzone_state; + const int bufsiz = 1024; + char buf[bufsiz]; + +#if MYSQL_VERSION_ID < 50500 + { + sys_var * version = intern_find_sys_var("version", 0, false); + snprintf(buf, bufsiz, "%s", version->value_ptr(thd, (enum_var_type)0, (LEX_STRING*)NULL)); + STATPRINT("Version", buf); + } +#endif + error = db_env->get_engine_status_num_rows (db_env, &max_rows); + TOKU_ENGINE_STATUS_ROW_S mystat[max_rows]; + error = db_env->get_engine_status (db_env, mystat, max_rows, &num_rows, &redzone_state, &panic, panic_string, panic_string_len, TOKU_ENGINE_STATUS); + + if 
(strlen(panic_string)) { + STATPRINT("Environment panic string", panic_string); + } + if (error == 0) { + if (panic) { + snprintf(buf, bufsiz, "%" PRIu64, panic); + STATPRINT("Environment panic", buf); + } + + if(redzone_state == FS_BLOCKED) { + STATPRINT("*** URGENT WARNING ***", "FILE SYSTEM IS COMPLETELY FULL"); + snprintf(buf, bufsiz, "FILE SYSTEM IS COMPLETELY FULL"); + } + else if (redzone_state == FS_GREEN) { + snprintf(buf, bufsiz, "more than %d percent of total file system space", 2*tokudb_fs_reserve_percent); + } + else if (redzone_state == FS_YELLOW) { + snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING FULL (less than %d percent free)", 2*tokudb_fs_reserve_percent); + } + else if (redzone_state == FS_RED){ + snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING VERY FULL (less than %d percent free): INSERTS ARE PROHIBITED", tokudb_fs_reserve_percent); + } + else { + snprintf(buf, bufsiz, "information unavailable, unknown redzone state %d", redzone_state); + } + STATPRINT ("disk free space", buf); + + for (uint64_t row = 0; row < num_rows; row++) { + switch (mystat[row].type) { + case FS_STATE: + snprintf(buf, bufsiz, "%"PRIu64"", mystat[row].value.num); + break; + case UINT64: + snprintf(buf, bufsiz, "%"PRIu64"", mystat[row].value.num); + break; + case CHARSTR: + snprintf(buf, bufsiz, "%s", mystat[row].value.str); + break; + case UNIXTIME: + { + time_t t = mystat[row].value.num; + char tbuf[26]; + snprintf(buf, bufsiz, "%.24s", ctime_r(&t, tbuf)); + } + break; + case TOKUTIME: + { + double t = tokutime_to_seconds(mystat[row].value.num); + snprintf(buf, bufsiz, "%.6f", t); + } + break; + case PARCOUNT: + { + uint64_t v = read_partitioned_counter(mystat[row].value.parcount); + snprintf(buf, bufsiz, "%" PRIu64, v); + } + break; + default: + snprintf(buf, bufsiz, "UNKNOWN STATUS TYPE: %d", mystat[row].type); + break; + } + STATPRINT(mystat[row].legend, buf); + } + uint64_t bytes_inserted = 
read_partitioned_counter(tokudb_primary_key_bytes_inserted); + snprintf(buf, bufsiz, "%" PRIu64, bytes_inserted); + STATPRINT("handlerton: primary key bytes inserted", buf); + } + if (error) { my_errno = error; } + TOKUDB_DBUG_RETURN(error); +} + +static void tokudb_checkpoint_lock(THD * thd) { + int error; + tokudb_trx_data* trx = NULL; + char status_msg[200]; //buffer of 200 should be a good upper bound. + trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + if (!trx) { + error = create_tokudb_trx_data_instance(&trx); + // + // can only fail due to memory allocation, so ok to assert + // + assert(!error); + thd_data_set(thd, tokudb_hton->slot, trx); + } + + if (trx->checkpoint_lock_taken) { + goto cleanup; + } + // + // This can only fail if environment is not created, which is not possible + // in handlerton + // + sprintf(status_msg, "Trying to grab checkpointing lock."); + thd_proc_info(thd, status_msg); + error = db_env->checkpointing_postpone(db_env); + assert(!error); + + trx->checkpoint_lock_taken = true; +cleanup: + return; +} + +static void tokudb_checkpoint_unlock(THD * thd) { + int error; + char status_msg[200]; //buffer of 200 should be a good upper bound. 
+ tokudb_trx_data* trx = NULL; + trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + if (!trx) { + error = 0; + goto cleanup; + } + if (!trx->checkpoint_lock_taken) { + error = 0; + goto cleanup; + } + // + // at this point, we know the checkpoint lock has been taken + // + sprintf(status_msg, "Trying to release checkpointing lock."); + thd_proc_info(thd, status_msg); + error = db_env->checkpointing_resume(db_env); + assert(!error); + + trx->checkpoint_lock_taken = false; + +cleanup: + return; +} + +static bool tokudb_show_status(handlerton * hton, THD * thd, stat_print_fn * stat_print, enum ha_stat_type stat_type) { + switch (stat_type) { + case HA_ENGINE_STATUS: + return tokudb_show_engine_status(thd, stat_print); + break; + default: + break; + } + return false; +} + +static void tokudb_print_error(const DB_ENV * db_env, const char *db_errpfx, const char *buffer) { + sql_print_error("%s: %s", db_errpfx, buffer); +} + +static void tokudb_cleanup_log_files(void) { + TOKUDB_DBUG_ENTER("tokudb_cleanup_log_files"); + char **names; + int error; + + if ((error = db_env->txn_checkpoint(db_env, 0, 0, 0))) + my_error(ER_ERROR_DURING_CHECKPOINT, MYF(0), error); + + if ((error = db_env->log_archive(db_env, &names, 0)) != 0) { + DBUG_PRINT("error", ("log_archive failed (error %d)", error)); + db_env->err(db_env, error, "log_archive"); + DBUG_VOID_RETURN; + } + + if (names) { + char **np; + for (np = names; *np; ++np) { +#if 1 + if (tokudb_debug) + TOKUDB_TRACE("%s:cleanup:%s\n", __FUNCTION__, *np); +#else + my_delete(*np, MYF(MY_WME)); +#endif + } + + free(names); + } + + DBUG_VOID_RETURN; +} + +// options flags +// PLUGIN_VAR_THDLOCAL Variable is per-connection +// PLUGIN_VAR_READONLY Server variable is read only +// PLUGIN_VAR_NOSYSVAR Not a server variable +// PLUGIN_VAR_NOCMDOPT Not a command line option +// PLUGIN_VAR_NOCMDARG No argument for cmd line +// PLUGIN_VAR_RQCMDARG Argument required for cmd line +// PLUGIN_VAR_OPCMDARG Argument optional for cmd 
line +// PLUGIN_VAR_MEMALLOC String needs memory allocated + + +// system variables + +static void tokudb_lock_timeout_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { + ulonglong *timeout = (ulonglong *) var; + *timeout = *(const ulonglong *) save; + db_env->set_lock_timeout(db_env, *timeout); +} + +#define DEFAULT_LOCK_TIMEOUT_MSEC 4000 + +static MYSQL_SYSVAR_ULONGLONG(lock_timeout, tokudb_lock_timeout, + 0, "TokuDB lock timeout", + NULL, tokudb_lock_timeout_update, DEFAULT_LOCK_TIMEOUT_MSEC, + 0, ~0ULL, 0); + +static void tokudb_cleaner_period_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { + ulong * cleaner_period = (ulong *) var; + *cleaner_period = *(const ulonglong *) save; + int r = db_env->cleaner_set_period(db_env, *cleaner_period); + assert(r == 0); +} + +#define DEFAULT_CLEANER_PERIOD 1 + +static MYSQL_SYSVAR_ULONG(cleaner_period, tokudb_cleaner_period, + 0, "TokuDB cleaner_period", + NULL, tokudb_cleaner_period_update, DEFAULT_CLEANER_PERIOD, + 0, ~0UL, 0); + +static void tokudb_cleaner_iterations_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { + ulong * cleaner_iterations = (ulong *) var; + *cleaner_iterations = *(const ulonglong *) save; + int r = db_env->cleaner_set_iterations(db_env, *cleaner_iterations); + assert(r == 0); +} + +#define DEFAULT_CLEANER_ITERATIONS 5 + +static MYSQL_SYSVAR_ULONG(cleaner_iterations, tokudb_cleaner_iterations, + 0, "TokuDB cleaner_iterations", + NULL, tokudb_cleaner_iterations_update, DEFAULT_CLEANER_ITERATIONS, + 0, ~0UL, 0); + +static void tokudb_checkpointing_period_update(THD * thd, struct st_mysql_sys_var * sys_var, void * var, const void * save) { + uint * checkpointing_period = (uint *) var; + *checkpointing_period = *(const ulonglong *) save; + int r = db_env->checkpointing_set_period(db_env, *checkpointing_period); + assert(r == 0); +} + +static MYSQL_SYSVAR_UINT(checkpointing_period, 
tokudb_checkpointing_period, + 0, "TokuDB Checkpointing period", + NULL, tokudb_checkpointing_period_update, 60, + 0, ~0U, 0); + +static MYSQL_SYSVAR_BOOL(directio, tokudb_directio, + PLUGIN_VAR_READONLY, + "TokuDB Enable Direct I/O ", + NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(checkpoint_on_flush_logs, tokudb_checkpoint_on_flush_logs, + 0, + "TokuDB Checkpoint on Flush Logs ", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONGLONG(cache_size, tokudb_cache_size, + PLUGIN_VAR_READONLY, "TokuDB cache table size", NULL, NULL, 0, + 0, ~0ULL, 0); + +static MYSQL_SYSVAR_ULONGLONG(max_lock_memory, tokudb_max_lock_memory, PLUGIN_VAR_READONLY, "TokuDB max memory for locks", NULL, NULL, 0, 0, ~0ULL, 0); +static MYSQL_SYSVAR_ULONG(debug, tokudb_debug, 0, "TokuDB Debug", NULL, NULL, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_STR(log_dir, tokudb_log_dir, PLUGIN_VAR_READONLY, "TokuDB Log Directory", NULL, NULL, NULL); + +static MYSQL_SYSVAR_STR(data_dir, tokudb_data_dir, PLUGIN_VAR_READONLY, "TokuDB Data Directory", NULL, NULL, NULL); + +static MYSQL_SYSVAR_STR(version, tokudb_version, PLUGIN_VAR_READONLY, "TokuDB Version", NULL, NULL, NULL); + +static MYSQL_SYSVAR_UINT(init_flags, tokudb_init_flags, PLUGIN_VAR_READONLY, "Sets TokuDB DB_ENV->open flags", NULL, NULL, tokudb_init_flags, 0, ~0U, 0); + +static MYSQL_SYSVAR_UINT(write_status_frequency, tokudb_write_status_frequency, 0, "TokuDB frequency that show processlist updates status of writes", NULL, NULL, 1000, 0, ~0U, 0); +static MYSQL_SYSVAR_UINT(read_status_frequency, tokudb_read_status_frequency, 0, "TokuDB frequency that show processlist updates status of reads", NULL, NULL, 10000, 0, ~0U, 0); +static MYSQL_SYSVAR_INT(fs_reserve_percent, tokudb_fs_reserve_percent, PLUGIN_VAR_READONLY, "TokuDB file system space reserve (percent free required)", NULL, NULL, 5, 0, 100, 0); +static MYSQL_SYSVAR_STR(tmp_dir, tokudb_tmp_dir, PLUGIN_VAR_READONLY, "Tokudb Tmp Dir", NULL, NULL, NULL); + +static void 
tokudb_fsync_log_period_update(THD *thd, struct st_mysql_sys_var *sys_var, void *var, const void *save) { + uint32 *period = (uint32 *) var; + *period = *(const ulonglong *) save; + db_env->change_fsync_log_period(db_env, *period); +} + +static MYSQL_SYSVAR_UINT(fsync_log_period, tokudb_fsync_log_period, 0, "TokuDB fsync log period", NULL, tokudb_fsync_log_period_update, 0, 0, ~0U, 0); + +static struct st_mysql_sys_var *tokudb_system_variables[] = { + MYSQL_SYSVAR(cache_size), + MYSQL_SYSVAR(max_lock_memory), + MYSQL_SYSVAR(data_dir), + MYSQL_SYSVAR(log_dir), + MYSQL_SYSVAR(debug), + MYSQL_SYSVAR(commit_sync), + MYSQL_SYSVAR(lock_timeout), + MYSQL_SYSVAR(cleaner_period), + MYSQL_SYSVAR(cleaner_iterations), + MYSQL_SYSVAR(pk_insert_mode), + MYSQL_SYSVAR(load_save_space), + MYSQL_SYSVAR(disable_slow_alter), + MYSQL_SYSVAR(disable_hot_alter), + MYSQL_SYSVAR(create_index_online), + MYSQL_SYSVAR(disable_prefetching), + MYSQL_SYSVAR(version), + MYSQL_SYSVAR(init_flags), + MYSQL_SYSVAR(checkpointing_period), + MYSQL_SYSVAR(prelock_empty), + MYSQL_SYSVAR(log_client_errors), + MYSQL_SYSVAR(checkpoint_lock), + MYSQL_SYSVAR(write_status_frequency), + MYSQL_SYSVAR(read_status_frequency), + MYSQL_SYSVAR(fs_reserve_percent), + MYSQL_SYSVAR(tmp_dir), + MYSQL_SYSVAR(block_size), + MYSQL_SYSVAR(read_block_size), + MYSQL_SYSVAR(read_buf_size), + MYSQL_SYSVAR(row_format), + MYSQL_SYSVAR(directio), + MYSQL_SYSVAR(checkpoint_on_flush_logs), +#if TOKU_INCLUDE_UPSERT + MYSQL_SYSVAR(disable_slow_update), + MYSQL_SYSVAR(disable_slow_upsert), +#endif + MYSQL_SYSVAR(analyze_time), + MYSQL_SYSVAR(fsync_log_period), + NULL +}; + +struct st_mysql_storage_engine tokudb_storage_engine = { MYSQL_HANDLERTON_INTERFACE_VERSION }; + +static ST_FIELD_INFO tokudb_user_data_field_info[] = { + {"database_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"data_size", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, 
+ {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} +}; + +#if MYSQL_VERSION_ID >= 50600 +static int tokudb_user_data_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { +#else +static int tokudb_user_data_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { +#endif + int error; + TABLE *table = tables->table; + + // 3938: Get a read lock on the status flag, since we must + // read it before safely proceeding + rw_rdlock(&tokudb_hton_initialized_lock); + + if (!tokudb_hton_initialized) { + my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); + error = -1; + } else { + error = tokudb_get_user_data_size(table, thd, false); + } + + // 3938: unlock the status flag lock + rw_unlock(&tokudb_hton_initialized_lock); + return error; +} + +static int tokudb_user_data_init(void *p) { + ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; + schema->fields_info = tokudb_user_data_field_info; + schema->fill_table = tokudb_user_data_fill_table; + return 0; +} + +static int tokudb_user_data_done(void *p) { + return 0; +} + +static struct st_mysql_information_schema tokudb_user_data_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; + +static struct st_mysql_information_schema tokudb_fractal_tree_info_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; + +static struct st_mysql_information_schema tokudb_fractal_tree_block_map_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; + +static ST_FIELD_INFO tokudb_user_data_exact_field_info[] = { + {"database_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"table_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"data_size", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, + {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} +}; + +static ST_FIELD_INFO tokudb_dictionary_field_info[] = { + {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE 
}, + {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} +}; + +static ST_FIELD_INFO tokudb_fractal_tree_info_field_info[] = { + {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"bt_num_blocks_allocated", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"bt_num_blocks_in_use", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"bt_size_allocated", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"bt_size_in_use", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, + {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} +}; + +static ST_FIELD_INFO tokudb_fractal_tree_block_map_field_info[] = { + {"dictionary_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"internal_file_name", 256, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"checkpoint_count", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"blocknum", 0, MYSQL_TYPE_LONGLONG, 0, 0, NULL, SKIP_OPEN_TABLE }, + {"offset", 0, MYSQL_TYPE_LONGLONG, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE }, + {"size", 0, MYSQL_TYPE_LONGLONG, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE }, + {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} +}; + + +#if MYSQL_VERSION_ID >= 50600 +static int tokudb_dictionary_info_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { +#else +static int tokudb_dictionary_info_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { +#endif + int error; + TABLE *table = tables->table; + + // 3938: Get a read lock on the status flag, since we must + // read it before safely proceeding + rw_rdlock(&tokudb_hton_initialized_lock); + + if (!tokudb_hton_initialized) { + my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); + error = -1; + } else { + error = tokudb_dictionary_info(table, thd); + } + + //3938: unlock the status flag lock + rw_unlock(&tokudb_hton_initialized_lock); + return error; +} + +#if MYSQL_VERSION_ID >= 50600 
+static int tokudb_user_data_exact_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { +#else +static int tokudb_user_data_exact_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { +#endif + int error; + TABLE *table = tables->table; + + // 3938: Get a read lock on the status flag, since we must + // read it before safely proceeding + rw_rdlock(&tokudb_hton_initialized_lock); + + if (!tokudb_hton_initialized) { + my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); + error = -1; + } else { + error = tokudb_get_user_data_size(table, thd, true); + } + + //3938: unlock the status flag lock + rw_unlock(&tokudb_hton_initialized_lock); + return error; +} + +#if MYSQL_VERSION_ID >= 50600 +static int tokudb_fractal_tree_info_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { +#else +static int tokudb_fractal_tree_info_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { +#endif + int error; + TABLE *table = tables->table; + + // 3938: Get a read lock on the status flag, since we must + // read it before safely proceeding + rw_rdlock(&tokudb_hton_initialized_lock); + + if (!tokudb_hton_initialized) { + my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); + error = -1; + } else { + error = tokudb_fractal_tree_info(table, thd); + } + + //3938: unlock the status flag lock + rw_unlock(&tokudb_hton_initialized_lock); + return error; +} + +#if MYSQL_VERSION_ID >= 50600 +static int tokudb_fractal_tree_block_map_fill_table(THD *thd, TABLE_LIST *tables, Item *cond) { +#else +static int tokudb_fractal_tree_block_map_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) { +#endif + int error; + TABLE *table = tables->table; + + // 3938: Get a read lock on the status flag, since we must + // read it before safely proceeding + rw_rdlock(&tokudb_hton_initialized_lock); + + if (!tokudb_hton_initialized) { + my_error(ER_PLUGIN_IS_NOT_LOADED, MYF(0), "TokuDB"); + error = -1; + } else { + error = tokudb_fractal_tree_block_map(table, thd); + } + + //3938: unlock the status flag 
lock + rw_unlock(&tokudb_hton_initialized_lock); + return error; +} + +static int tokudb_user_data_exact_init(void *p) { + ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; + schema->fields_info = tokudb_user_data_exact_field_info; + schema->fill_table = tokudb_user_data_exact_fill_table; + return 0; +} + +static int tokudb_user_data_exact_done(void *p) { + return 0; +} + +static int tokudb_dictionary_info_init(void *p) { + ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; + schema->fields_info = tokudb_dictionary_field_info; + schema->fill_table = tokudb_dictionary_info_fill_table; + return 0; +} + +static int tokudb_dictionary_info_done(void *p) { + return 0; +} + +static int tokudb_fractal_tree_info_init(void *p) { + ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; + schema->fields_info = tokudb_fractal_tree_info_field_info; + schema->fill_table = tokudb_fractal_tree_info_fill_table; + return 0; +} + +static int tokudb_fractal_tree_info_done(void *p) { + return 0; +} + +static int tokudb_fractal_tree_block_map_init(void *p) { + ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p; + schema->fields_info = tokudb_fractal_tree_block_map_field_info; + schema->fill_table = tokudb_fractal_tree_block_map_fill_table; + return 0; +} + +static int tokudb_fractal_tree_block_map_done(void *p) { + return 0; +} + +static struct st_mysql_information_schema tokudb_user_data_exact_information_schema = { MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION }; + +enum { TOKUDB_PLUGIN_VERSION = 0x0400 }; +#define TOKUDB_PLUGIN_VERSION_STR "1024" + +// Retrieves variables for information_schema.global_status. 
+// Names (columnname) are automatically converted to upper case, and prefixed with "TOKUDB_" +static int show_tokudb_vars(THD *thd, SHOW_VAR *var, char *buff) { + TOKUDB_DBUG_ENTER("show_tokudb_vars"); + + int error; + uint64_t panic; + const int panic_string_len = 1024; + char panic_string[panic_string_len] = {'\0'}; + fs_redzone_state redzone_state; + + uint64_t num_rows; + error = db_env->get_engine_status (db_env, toku_global_status_rows, toku_global_status_max_rows, &num_rows, &redzone_state, &panic, panic_string, panic_string_len, TOKU_GLOBAL_STATUS); + //TODO: Maybe do something with the panic output? +#if 0 + if (strlen(panic_string)) { + STATPRINT("Environment panic string", panic_string); + } +#endif + if (error == 0) { + assert(num_rows <= toku_global_status_max_rows); + //TODO: Maybe enable some of the items here: (copied from engine status +#if 0 + if (panic) { + snprintf(buf, bufsiz, "%" PRIu64, panic); + STATPRINT("Environment panic", buf); + } + + if(redzone_state == FS_BLOCKED) { + STATPRINT("*** URGENT WARNING ***", "FILE SYSTEM IS COMPLETELY FULL"); + snprintf(buf, bufsiz, "FILE SYSTEM IS COMPLETELY FULL"); + } + else if (redzone_state == FS_GREEN) { + snprintf(buf, bufsiz, "more than %d percent of total file system space", 2*tokudb_fs_reserve_percent); + } + else if (redzone_state == FS_YELLOW) { + snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING FULL (less than %d percent free)", 2*tokudb_fs_reserve_percent); + } + else if (redzone_state == FS_RED){ + snprintf(buf, bufsiz, "*** WARNING *** FILE SYSTEM IS GETTING VERY FULL (less than %d percent free): INSERTS ARE PROHIBITED", tokudb_fs_reserve_percent); + } + else { + snprintf(buf, bufsiz, "information unavailable, unknown redzone state %d", redzone_state); + } + STATPRINT ("disk free space", buf); +#endif + + + //TODO: (optionally) add redzone state, panic, panic string, etc. Right now it's being ignored. 
+ + for (uint64_t row = 0; row < num_rows; row++) { + SHOW_VAR &status_var = toku_global_status_variables[row]; + TOKU_ENGINE_STATUS_ROW_S &status_row = toku_global_status_rows[row]; + + status_var.name = status_row.columnname; + switch (status_row.type) { + case FS_STATE: + case UINT64: + status_var.type = SHOW_LONGLONG; + status_var.value = (char*)&status_row.value.num; + break; + case CHARSTR: + status_var.type = SHOW_CHAR; + status_var.value = (char*)status_row.value.str; + break; + case UNIXTIME: + { + status_var.type = SHOW_CHAR; + time_t t = status_row.value.num; + char tbuf[26]; + // Reuse the memory in status_row. (It belongs to us). + snprintf(status_row.value.datebuf, sizeof(status_row.value.datebuf), "%.24s", ctime_r(&t, tbuf)); + status_var.value = (char*)&status_row.value.datebuf[0]; + } + break; + case TOKUTIME: + { + status_var.type = SHOW_DOUBLE; + double t = tokutime_to_seconds(status_row.value.num); + // Reuse the memory in status_row. (It belongs to us). + status_row.value.dnum = t; + status_var.value = (char*)&status_row.value.dnum; + } + break; + case PARCOUNT: + { + status_var.type = SHOW_LONGLONG; + uint64_t v = read_partitioned_counter(status_row.value.parcount); + // Reuse the memory in status_row. (It belongs to us). + status_row.value.num = v; + status_var.value = (char*)&status_row.value.num; + } + break; + default: + { + status_var.type = SHOW_CHAR; + // Reuse the memory in status_row.datebuf. (It belongs to us). + // UNKNOWN TYPE: %d fits in 26 bytes (sizeof datebuf) for any integer. + snprintf(status_row.value.datebuf, sizeof(status_row.value.datebuf), "UNKNOWN TYPE: %d", status_row.type); + status_var.value = (char*)&status_row.value.datebuf[0]; + } + break; + } + } + // Sentinel value at end. 
+        toku_global_status_variables[num_rows].type = SHOW_LONG;
+        toku_global_status_variables[num_rows].value = (char*)NullS;
+        toku_global_status_variables[num_rows].name = (char*)NullS;
+
+        var->type= SHOW_ARRAY;
+        var->value= (char *) toku_global_status_variables;
+    }
+    if (error) { my_errno = error; }
+    TOKUDB_DBUG_RETURN(error);
+}
+/*
+ * Status variables exported by the storage-engine plugin entries below (they
+ * pass this array in their "status variables" slot). It holds a single
+ * SHOW_FUNC entry named "Tokudb" that points at show_tokudb_vars, followed by
+ * a {NullS, NullS, SHOW_LONG} entry (presumably the end-of-list sentinel).
+ * The visible tail of show_tokudb_vars sets var->type to SHOW_ARRAY and
+ * var->value to toku_global_status_variables.
+ */
+static SHOW_VAR toku_global_status_variables_export[]= {
+    {"Tokudb", (char*)&show_tokudb_vars, SHOW_FUNC},
+    {NullS, NullS, SHOW_LONG}
+};
+/*
+ * Plugin descriptor table built with mysql_declare_plugin: one
+ * MYSQL_STORAGE_ENGINE_PLUGIN entry for the engine itself, followed by five
+ * MYSQL_INFORMATION_SCHEMA_PLUGIN entries. Initializers are positional, so
+ * their order must not change; the trailing "flags" initializer is only
+ * present when MYSQL_VERSION_ID >= 50521.
+ */
+mysql_declare_plugin(tokudb)
+{ /* storage engine plugin; the only entry that supplies status and system variables */
+    MYSQL_STORAGE_ENGINE_PLUGIN,
+    &tokudb_storage_engine,
+    tokudb_hton_name,
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_init_func, /* plugin init */
+    tokudb_done_func, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    toku_global_status_variables_export, /* status variables */
+    tokudb_system_variables, /* system variables */
+    NULL, /* config options */
+#if MYSQL_VERSION_ID >= 50521
+    0, /* flags */
+#endif
+},
+{ /* information schema plugin "TokuDB_user_data" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_user_data_information_schema,
+    "TokuDB_user_data",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_user_data_init, /* plugin init */
+    tokudb_user_data_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    NULL, /* config options */
+#if MYSQL_VERSION_ID >= 50521
+    0, /* flags */
+#endif
+},
+{ /* information schema plugin "TokuDB_user_data_exact" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_user_data_exact_information_schema,
+    "TokuDB_user_data_exact",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_user_data_exact_init, /* plugin init */
+    tokudb_user_data_exact_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    NULL, /* config options */
+#if MYSQL_VERSION_ID >= 50521
+    0, /* flags */
+#endif
+},
+{ /* information schema plugin "TokuDB_file_map" (init/deinit use the tokudb_dictionary_info_* names) */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_user_data_exact_information_schema, /* NOTE(review): same descriptor as the TokuDB_user_data_exact entry; looks copy-pasted -- confirm whether a dedicated descriptor was intended */
+    "TokuDB_file_map",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_dictionary_info_init, /* plugin init */
+    tokudb_dictionary_info_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    NULL, /* config options */
+#if MYSQL_VERSION_ID >= 50521
+    0, /* flags */
+#endif
+},
+{ /* information schema plugin "TokuDB_fractal_tree_info" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_fractal_tree_info_information_schema,
+    "TokuDB_fractal_tree_info",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_fractal_tree_info_init, /* plugin init */
+    tokudb_fractal_tree_info_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    NULL, /* config options */
+#if MYSQL_VERSION_ID >= 50521
+    0, /* flags */
+#endif
+},
+{ /* information schema plugin "TokuDB_fractal_tree_block_map" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_fractal_tree_block_map_information_schema,
+    "TokuDB_fractal_tree_block_map",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_fractal_tree_block_map_init, /* plugin init */
+    tokudb_fractal_tree_block_map_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    NULL, /* config options */
+#if MYSQL_VERSION_ID >= 50521
+    0, /* flags */
+#endif
+}
+mysql_declare_plugin_end;
+
+#ifdef MARIA_PLUGIN_INTERFACE_VERSION
+/*
+ * The same six plugins declared with maria_declare_plugin; this table is only
+ * compiled when MARIA_PLUGIN_INTERFACE_VERSION is defined. Each entry ends
+ * with a version string and a maturity level in place of the
+ * "config options" / "flags" initializers used in the mysql_declare_plugin
+ * table. Initializers are positional, so their order must not change.
+ */
+maria_declare_plugin(tokudb)
+{ /* storage engine plugin; the only entry that supplies status and system variables */
+    MYSQL_STORAGE_ENGINE_PLUGIN,
+    &tokudb_storage_engine,
+    tokudb_hton_name,
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_init_func, /* plugin init */
+    tokudb_done_func, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    toku_global_status_variables_export, /* status variables */
+    tokudb_system_variables, /* system variables */
+    TOKUDB_PLUGIN_VERSION_STR, /* string version */
+    MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+{ /* information schema plugin "TokuDB_user_data" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_user_data_information_schema,
+    "TokuDB_user_data",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_user_data_init, /* plugin init */
+    tokudb_user_data_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    TOKUDB_PLUGIN_VERSION_STR, /* string version */
+    MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+{ /* information schema plugin "TokuDB_user_data_exact" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_user_data_exact_information_schema,
+    "TokuDB_user_data_exact",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_user_data_exact_init, /* plugin init */
+    tokudb_user_data_exact_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    TOKUDB_PLUGIN_VERSION_STR, /* string version */
+    MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+{ /* information schema plugin "TokuDB_file_map" (init/deinit use the tokudb_dictionary_info_* names) */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_user_data_exact_information_schema, /* NOTE(review): same descriptor as the TokuDB_user_data_exact entry; looks copy-pasted -- confirm whether a dedicated descriptor was intended */
+    "TokuDB_file_map",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_dictionary_info_init, /* plugin init */
+    tokudb_dictionary_info_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    TOKUDB_PLUGIN_VERSION_STR, /* string version */
+    MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+{ /* information schema plugin "TokuDB_fractal_tree_info" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_fractal_tree_info_information_schema,
+    "TokuDB_fractal_tree_info",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_fractal_tree_info_init, /* plugin init */
+    tokudb_fractal_tree_info_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    TOKUDB_PLUGIN_VERSION_STR, /* string version */
+    MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+{ /* information schema plugin "TokuDB_fractal_tree_block_map" */
+    MYSQL_INFORMATION_SCHEMA_PLUGIN,
+    &tokudb_fractal_tree_block_map_information_schema,
+    "TokuDB_fractal_tree_block_map",
+    "Tokutek Inc",
+    "Tokutek TokuDB Storage Engine with Fractal Tree(tm) Technology",
+    PLUGIN_LICENSE_GPL,
+    tokudb_fractal_tree_block_map_init, /* plugin init */
+    tokudb_fractal_tree_block_map_done, /* plugin deinit */
+    TOKUDB_PLUGIN_VERSION, /* 4.0.0 */
+    NULL, /* status variables */
+    NULL, /* system variables */
+    TOKUDB_PLUGIN_VERSION_STR, /* string version */
+    MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
+
+#endif /* MARIA_PLUGIN_INTERFACE_VERSION */