#ifdef USE_PRAGMA_IMPLEMENTATION
#pragma implementation          // gcc: Class implementation
#endif

#define MYSQL_SERVER 1
#include "mysql_priv.h"
#include "hatoku_cmp.h"
extern "C" {
#include "stdint.h"
#if defined(_WIN32)
#include "misc.h"
#endif
}

static inline void *thd_data_get(THD *thd, int slot) {
    return thd->ha_data[slot].ha_ptr;
}

static inline void thd_data_set(THD *thd, int slot, void *data) {
    thd->ha_data[slot].ha_ptr = data;
}

#undef PACKAGE
#undef VERSION
#undef HAVE_DTRACE
#undef _DTRACE_VERSION

//#include "tokudb_config.h"

/* We define DTRACE after mysql_priv.h in case it disabled dtrace in the main server */
#ifdef HAVE_DTRACE
#define _DTRACE_VERSION 1
#else
#endif

#include "hatoku_defines.h"
#include "ha_tokudb.h"
#include "hatoku_hton.h"
#include <mysql/plugin.h>

static const char *ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};

#define lockretryN(N) \
    for (ulonglong lockretrycount=0; lockretrycount<(N/(1<<3) + 1); lockretrycount++)

#define lockretry_wait \
    if (error != DB_LOCK_NOTGRANTED) { \
        break; \
    } \
    if (tokudb_debug & TOKUDB_DEBUG_LOCKRETRY) { \
        TOKUDB_TRACE("%s count=%d\n", __FUNCTION__, (int) lockretrycount); \
    } \
    if (lockretrycount%200 == 0) { \
        if (ha_thd()->killed) { \
            error = DB_LOCK_NOTGRANTED; \
            break; \
        } \
    } \
    usleep((lockretrycount<4 ? (1<<lockretrycount) : (1<<3)) * 1024);
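
//
// Illustrative usage sketch (not part of the original source): lockretryN and
// lockretry_wait are meant to be used together as a bounded retry loop around
// an operation that may fail with DB_LOCK_NOTGRANTED, for example:
//
//     lockretryN(read_lock_wait_time) {
//         error = db->some_locking_operation(...);
//         lockretry_wait;
//     }
//
// The loop exits as soon as the error is anything other than
// DB_LOCK_NOTGRANTED, checks ha_thd()->killed every 200 iterations, and sleeps
// with exponential backoff capped at (1<<3) * 1024 microseconds per retry.
//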

//
// This offset is calculated starting from AFTER the NULL bytes
//
inline u_int32_t get_var_len_offset(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    uint offset = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (kc_info->field_lengths[i] && !bitmap_is_set(&kc_info->key_filters[keynr],i)) {
            offset += kc_info->field_lengths[i];
        }
    }
    return offset;
}

inline u_int32_t get_len_of_offsets(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    uint len = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (kc_info->length_bytes[i] && !bitmap_is_set(&kc_info->key_filters[keynr],i)) {
            len += kc_info->num_offset_bytes;
        }
    }
    return len;
}
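
//
// Illustrative example (not from the original source, layout inferred from
// these helpers and initialize_col_pack_info below): for a row with NULL
// bytes, two unfiltered fixed fields of 4 and 8 bytes, and one unfiltered
// varchar, the packed value for dictionary keynr looks roughly like
//
//     [NULL bytes][4-byte field][8-byte field][offset bytes][var data]
//
// so get_var_len_offset() returns 12 (the fixed-field bytes, counted from
// after the NULL bytes) and get_len_of_offsets() returns num_offset_bytes
// times the number of unfiltered variable-length fields, here 1.
//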

static int allocate_key_and_col_info ( TABLE_SHARE* table_share, KEY_AND_COL_INFO* kc_info) {
    int error;
    //
    // initialize all of the bitmaps
    //
    for (uint i = 0; i < MAX_KEY + 1; i++) {
        error = bitmap_init(
            &kc_info->key_filters[i],
            NULL,
            table_share->fields,
            false
            );
        if (error) {
            goto exit;
        }
    }
    //
    // create the field lengths
    //
    kc_info->field_lengths = (u_int16_t *)my_malloc(table_share->fields*sizeof(u_int16_t), MYF(MY_WME | MY_ZEROFILL));
    kc_info->length_bytes= (uchar *)my_malloc(table_share->fields, MYF(MY_WME | MY_ZEROFILL));
    kc_info->blob_fields= (u_int32_t *)my_malloc(table_share->fields*sizeof(u_int32_t), MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->field_lengths == NULL || 
        kc_info->length_bytes == NULL || 
        kc_info->blob_fields == NULL ) {
        error = ENOMEM;
        goto exit;
    }
exit:
    if (error) {
        // note: loop condition fixed here; it previously read "MAX_KEY + 1"
        // with no comparison, which never terminates
        for (uint i = 0; i < MAX_KEY + 1; i++) {
            bitmap_free(&kc_info->key_filters[i]);
        }
        my_free(kc_info->field_lengths, MYF(MY_ALLOW_ZERO_PTR));
        my_free(kc_info->length_bytes, MYF(MY_ALLOW_ZERO_PTR));
        my_free(kc_info->blob_fields, MYF(MY_ALLOW_ZERO_PTR));
    }
    return error;
}

/** @brief
    Simple lock controls. The "share" it creates is a structure shared by all
    ha_tokudb handler instances that have the same table open. It holds the
    pieces used for locking, so every handler instance needs one to function.
*/
static TOKUDB_SHARE *get_share(const char *table_name, TABLE_SHARE* table_share) {
    TOKUDB_SHARE *share = NULL;
    int error = 0;
    uint length;

    pthread_mutex_lock(&tokudb_mutex);
    length = (uint) strlen(table_name);

    if (!(share = (TOKUDB_SHARE *) my_hash_search(&tokudb_open_tables, (uchar *) table_name, length))) {
        char *tmp_name;
        //
        // create share and fill it with all zeroes
        // hence, all pointers are initialized to NULL
        //
        if (!(share = (TOKUDB_SHARE *) 
            my_multi_malloc(MYF(MY_WME | MY_ZEROFILL), 
                            &share, sizeof(*share),
                            &tmp_name, length + 1, 
                            NullS))) {
            pthread_mutex_unlock(&tokudb_mutex);
            return NULL;
        }
        share->use_count = 0;
        share->table_name_length = length;
        share->table_name = tmp_name;
        strmov(share->table_name, table_name);

        error = allocate_key_and_col_info(table_share, &share->kc_info);
        if (error) {
            goto exit;
        }

        bzero((void *) share->key_file, sizeof(share->key_file));

        error = my_hash_insert(&tokudb_open_tables, (uchar *) share);
        if (error) {
            goto exit;
        }
        thr_lock_init(&share->lock);
        pthread_mutex_init(&share->mutex, MY_MUTEX_INIT_FAST);
    }

exit:
    // the exit label sits before the unlock so that error paths release
    // tokudb_mutex as well; previously "goto exit" skipped the unlock
    if (error) {
        pthread_mutex_destroy(&share->mutex);
        my_free((uchar *) share, MYF(0));
        share = NULL;
    }
    pthread_mutex_unlock(&tokudb_mutex);
    return share;
}

void free_key_and_col_info (KEY_AND_COL_INFO* kc_info) {
    for (uint i = 0; i < MAX_KEY+1; i++) {
        bitmap_free(&kc_info->key_filters[i]);
    }
    for (uint i = 0; i < MAX_KEY+1; i++) {
        my_free(kc_info->cp_info[i], MYF(MY_ALLOW_ZERO_PTR));
    }
    my_free(kc_info->field_lengths, MYF(MY_ALLOW_ZERO_PTR));
    my_free(kc_info->length_bytes, MYF(MY_ALLOW_ZERO_PTR));
    my_free(kc_info->blob_fields, MYF(MY_ALLOW_ZERO_PTR));
}

static int free_share(TOKUDB_SHARE * share, bool mutex_is_locked) {
    int error, result = 0;

    pthread_mutex_lock(&tokudb_mutex);
    if (mutex_is_locked)
        pthread_mutex_unlock(&share->mutex);
    if (!--share->use_count) {
        DBUG_PRINT("info", ("share->use_count %u", share->use_count));
        //
        // number of open DB's may not be equal to number of keys we have because add_index
        // may have added some. So, we loop through entire array and close any non-NULL value
        // It is imperative that we reset a DB to NULL once we are done with it.
        //
        for (uint i = 0; i < sizeof(share->key_file)/sizeof(share->key_file[0]); i++) {
            if (tokudb_debug & TOKUDB_DEBUG_OPEN) {
                TOKUDB_TRACE("dbclose:%p\n", share->key_file[i]);
            }
            if (share->key_file[i]) { 
                error = share->key_file[i]->close(share->key_file[i], 0);
                assert(error == 0);
                if (error) {
                    result = error;
                }
                share->key_file[i] = NULL;
            }
        }

        free_key_and_col_info(&share->kc_info);

        if (share->status_block && (error = share->status_block->close(share->status_block, 0))) {
            assert(error == 0);
            result = error;
        }

        my_hash_delete(&tokudb_open_tables, (uchar *) share);
        thr_lock_delete(&share->lock);
        pthread_mutex_destroy(&share->mutex);
        my_free((uchar *) share, MYF(0));
    }
    pthread_mutex_unlock(&tokudb_mutex);
    return result;
}

#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }
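
//
// Usage sketch (illustrative, not from the original source): cursor functions
// open with HANDLE_INVALID_CURSOR() so that a cursor invalidated by an earlier
// error replays that error instead of dereferencing NULL:
//
//     int error = 0;
//     HANDLE_INVALID_CURSOR();
//     ... operate on cursor ...
// cleanup:
//     return error;
//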

/*
 *  returns a NULL-terminated array of file extension strings
 */
const char **ha_tokudb::bas_ext() const {
    TOKUDB_DBUG_ENTER("ha_tokudb::bas_ext");
    DBUG_RETURN(ha_tokudb_exts);
}

static inline bool is_insert_ignore (THD* thd) {
    //
    // from http://lists.mysql.com/internals/37735
    //
    return thd->lex->ignore && thd->lex->duplicates == DUP_ERROR;
}

static inline bool is_replace_into(THD* thd) {
    return (thd_sql_command(thd) == SQLCOM_REPLACE) ||
        (thd_sql_command(thd) == SQLCOM_REPLACE_SELECT);
}

static inline bool do_ignore_flag_optimization(THD* thd, TABLE* table, bool opt_eligible) {
    uint pk_insert_mode = get_pk_insert_mode(thd);
    return (
        (is_replace_into(thd) || is_insert_ignore(thd)) &&
        opt_eligible &&
        ((!table->triggers && pk_insert_mode < 2) || pk_insert_mode == 0)
        );
}

ulonglong ha_tokudb::table_flags() const {
    return (table && do_ignore_flag_optimization(ha_thd(), table, share->replace_into_fast) ? 
        int_table_flags | HA_BINLOG_STMT_CAPABLE : 
        int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE);
}

//
// Returns a bit mask of capabilities of the key or its part specified by
// the arguments. The capabilities are defined in sql/handler.h.
//
ulong ha_tokudb::index_flags(uint idx, uint part, bool all_parts) const {
    TOKUDB_DBUG_ENTER("ha_tokudb::index_flags");
    assert(table_share);
    ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER | HA_KEYREAD_ONLY | HA_READ_RANGE);
    if (table_share->key_info[idx].flags & HA_CLUSTERING) {
        flags |= HA_CLUSTERED_INDEX;
    }
    DBUG_RETURN(flags);
}

//
// struct that will be used as a context for smart DBT callbacks
// contains parameters needed to complete the smart DBT cursor call
//
typedef struct smart_dbt_info {
    ha_tokudb* ha;   // instance of ha_tokudb needed for reading the row
    uchar* buf;      // output buffer where row will be written
    uint keynr;      // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;
    DBT* orig_key;
} *INDEX_READ_INFO;

typedef struct row_buffers {
    uchar** key_buff;
    uchar** rec_buff;
} *ROW_BUFFERS;

int poll_fun(void *extra, float progress) {
    LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
    if (context->thd->killed) {
        sprintf(context->write_status_msg, "The process has been killed, aborting bulk load.");
        return 1;
    }
    sprintf(context->write_status_msg, "Loading of data about %f done", progress);
    thd_proc_info(context->thd, context->write_status_msg);
    return 0;
}

void loader_ai_err_fun(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra) {
    LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
    assert(context->ha);
    context->ha->set_loader_error(err);
}

void loader_dup_fun(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra) {
    LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
    assert(context->ha);
    context->ha->set_loader_error(err);
    if (err == DB_KEYEXIST) {
        context->ha->set_dup_value_for_pk(key);
    }
}

//
// smart DBT callback function for optimize
// in optimize, we want to flatten the DB by doing a full table scan.
// Therefore, we don't want to actually do anything with the data,
// hence this callback does nothing
//
static int smart_dbt_do_nothing (DBT const *key, DBT const *row, void *context) {
    return 0;
}

static int smart_dbt_metacallback (DBT const *key, DBT const *row, void *context) {
    DBT* val = (DBT *)context;
    val->data = my_malloc(row->size, MYF(MY_WME|MY_ZEROFILL));
    if (val->data == NULL) return ENOMEM;
    memcpy(val->data, row->data, row->size);
    val->size = row->size;
    return 0;
}

static int
smart_dbt_callback_rowread_ptquery (DBT const *key, DBT const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    return info->ha->read_row_callback(info->buf,info->keynr,row,key);
}

//
// Smart DBT callback function in case where we have a covering index
//
static int
smart_dbt_callback_keyread(DBT const *key, DBT const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    info->ha->read_key_only(info->buf,info->keynr,key);
    return 0;
}

//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_rowread(DBT const *key, DBT const *row, void *context) {
    int error = 0;
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    error = info->ha->read_primary_key(info->buf,info->keynr,row,key);
    return error;
}

//
// Smart DBT callback function in case where we have a covering index
//
static int
smart_dbt_callback_ir_keyread(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
}

static int
smart_dbt_callback_lookup(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    return 0;
}

//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_ir_rowread(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
}

//
// macros for Smart DBT callback functions,
// so we do not need to put this long line of code in multiple places
//
#define SMART_DBT_CALLBACK ( this->key_read ? smart_dbt_callback_keyread : smart_dbt_callback_rowread ) 
#define SMART_DBT_IR_CALLBACK ( this->key_read ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread ) 

//
// macro that modifies read flag for cursor operations depending on whether
// we have preacquired lock or not
//
#define SET_READ_FLAG(flg) ((range_lock_grabbed) ? ((flg) | DB_PRELOCKED) : (flg))
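
//
// Illustrative sketch (not from the original source): a cursor read typically
// combines the two macros above, picking the callback based on whether the
// index covers the query, and adding DB_PRELOCKED when a range lock was
// already acquired:
//
//     struct smart_dbt_info info;
//     info.ha = this;
//     info.buf = buf;
//     info.keynr = active_index;
//     error = cursor->c_getf_next(cursor, SET_READ_FLAG(0),
//                                 SMART_DBT_CALLBACK, &info);
//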

//
// This method retrieves the value of the auto increment column of a record in MySQL format
// This was basically taken from MyISAM
// Parameters:
//              type - the type of the auto increment column (e.g. int, float, double...)
//              offset - offset into the record where the auto increment column is stored
//      [in]    record - MySQL row whose auto increment value we want to extract
// Returns:
//      The value of the auto increment column in record
//
ulonglong retrieve_auto_increment(uint16 type, uint32 offset,const uchar *record)
{
    const uchar *key;     /* Key */
    ulonglong   unsigned_autoinc = 0;  /* Unsigned auto-increment */
    longlong      signed_autoinc = 0;  /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;   /* Temporary variable */
    double double_tmp; /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc   = (longlong) *(char*)key;
        autoinc_type     = signed_type;
        break;
    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;
    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc   = (longlong) sint2korr(key);
        autoinc_type     = signed_type;
        break;
    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;
    case HA_KEYTYPE_LONG_INT:
        signed_autoinc   = (longlong) sint4korr(key);
        autoinc_type     = signed_type;
        break;
    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;
    case HA_KEYTYPE_INT24:
        signed_autoinc   = (longlong) sint3korr(key);
        autoinc_type     = signed_type;
        break;
    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) uint3korr(key);
        break;
    case HA_KEYTYPE_LONGLONG:
        signed_autoinc   = sint8korr(key);
        autoinc_type     = signed_type;
        break;
    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;
    /* The remaining two cases should not be used but are included for 
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(float_tmp, key);  /* Note: float4get is a macro */
        signed_autoinc   = (longlong) float_tmp;
        autoinc_type     = signed_type;
        break;
    case HA_KEYTYPE_DOUBLE:
        float8get(double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc   = (longlong) double_tmp;
        autoinc_type     = signed_type;
        break;
    default:
        DBUG_ASSERT(0);
        unsigned_autoinc = 0;
    }

    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?  
        unsigned_autoinc : (ulonglong) signed_autoinc;
}
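
//
// Illustrative example (hypothetical offsets, not from the original source):
// for an `a INT AUTO_INCREMENT` column holding 42 and stored at byte offset 1
// of the MySQL row (e.g. after one NULL byte),
//
//     ulonglong v = retrieve_auto_increment(HA_KEYTYPE_LONG_INT, 1, record);
//
// returns 42; a negative stored value would be clamped to 0 before the cast.
//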

inline bool
is_null_field( TABLE* table, Field* field, const uchar* record) {
    uint null_offset;
    bool ret_val;
    if (!field->null_ptr) {
        ret_val = false;
        goto exitpt;
    }
    null_offset = get_null_offset(table,field);
    ret_val = (record[null_offset] & field->null_bit) ? true: false;
exitpt:
    return ret_val;
}

inline ulong field_offset(Field* field, TABLE* table) {
    return((ulong) (field->ptr - table->record[0]));
}

inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
    if (tx_isolation == ISO_READ_UNCOMMITTED) {
        return hatoku_iso_read_uncommitted;
    }
    else if (tx_isolation == ISO_READ_COMMITTED) {
        return hatoku_iso_read_committed;
    }
    else {
        return hatoku_iso_serializable;
    }
}

inline u_int32_t toku_iso_to_txn_flag (HA_TOKU_ISO_LEVEL lvl) {
    if (lvl == hatoku_iso_read_uncommitted) {
        return DB_READ_UNCOMMITTED;
    }
    else if (lvl == hatoku_iso_read_committed) {
        return DB_READ_COMMITTED;
    }
    else {
        return 0;
    }
}

int filter_key_part_compare (const void* left, const void* right) {
    FILTER_KEY_PART_INFO* left_part= (FILTER_KEY_PART_INFO *)left;
    FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
    return left_part->offset - right_part->offset;
}

//
// Be very careful with parameters passed to this function. Who knows
// if key, table have proper info set. I had to verify by checking
// in the debugger.
//
void set_key_filter(MY_BITMAP* key_filter, KEY* key, TABLE* table, bool get_offset_from_keypart) {
    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    for (uint i = 0; i < key->key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset = get_offset_from_keypart ? key->key_part[i].offset : field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    qsort(
        parts, // start of array
        key->key_parts, //num elements
        sizeof(*parts), //size of each element
        filter_key_part_compare
        );

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to continue over the key if the key's length matches the field's length
                // otherwise, we may have a situation where the column is a varchar(10), the
                // key is only the first 3 characters, and we end up losing the last 7 bytes of the
                // column
                //
                TOKU_TYPE toku_type;
                toku_type = mysql_to_toku_type(field);
                switch(toku_type) {
                case(toku_type_blob):
                    break;
                case(toku_type_varbinary):
                case(toku_type_varstring):
                case(toku_type_fixbinary):
                case(toku_type_fixstring):
                    if (key->key_part[curr_key_index].length == field->field_length) {
                        bitmap_set_bit(key_filter,i);
                    }
                    break;
                default:
                    bitmap_set_bit(key_filter,i);
                    break;
                }
            }
        }
    }
}

inline uchar* pack_fixed_field(
    uchar* to_tokudb,
    const uchar* from_mysql,
    u_int32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_tokudb, from_mysql, 1);
        break;
    case (2):
        memcpy(to_tokudb, from_mysql, 2);
        break;
    case (3):
        memcpy(to_tokudb, from_mysql, 3);
        break;
    case (4):
        memcpy(to_tokudb, from_mysql, 4);
        break;
    case (8):
        memcpy(to_tokudb, from_mysql, 8);
        break;
    default:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    }
    return to_tokudb+num_bytes;
}

inline const uchar* unpack_fixed_field(
    uchar* to_mysql,
    const uchar* from_tokudb,
    u_int32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_mysql, from_tokudb, 1);
        break;
    case (2):
        memcpy(to_mysql, from_tokudb, 2);
        break;
    case (3):
        memcpy(to_mysql, from_tokudb, 3);
        break;
    case (4):
        memcpy(to_mysql, from_tokudb, 4);
        break;
    case (8):
        memcpy(to_mysql, from_tokudb, 8);
        break;
    default:
        memcpy(to_mysql, from_tokudb, num_bytes);
        break;
    }
    return from_tokudb+num_bytes;
}

inline uchar* pack_var_field(
    uchar* to_tokudb_offset_ptr,   //location where offset data is going to be written
    uchar* to_tokudb_data,         //location where variable data is going to be written
    uchar* to_tokudb_offset_start, //location where offset starts
    const uchar * from_mysql,      //location of the field in the MySQL row
    u_int32_t mysql_length_bytes,  //number of length bytes MySQL uses for this field
    u_int32_t offset_bytes         //number of bytes used to store the offset
    )
{
    uint data_length = 0;
    u_int32_t offset = 0;
    switch(mysql_length_bytes) {
    case(1):
        data_length = from_mysql[0];
        break;
    case(2):
        data_length = uint2korr(from_mysql);
        break;
    default:
        assert(false);
        break;
    }
    memcpy(to_tokudb_data, from_mysql + mysql_length_bytes, data_length);
    //
    // for offset, we pack the offset where the data ENDS!
    //
    offset = to_tokudb_data + data_length - to_tokudb_offset_start;
    switch(offset_bytes) {
    case (1):
        to_tokudb_offset_ptr[0] = (uchar)offset;
        break;
    case (2):
        int2store(to_tokudb_offset_ptr,offset);
        break;
    default:
        assert(false);
        break;
    }

    return to_tokudb_data + data_length;
}
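
//
// Worked example (illustrative, not from the original source): packing a
// varchar(10) holding "abc" with mysql_length_bytes = 1 and offset_bytes = 1.
// from_mysql is [0x03]['a']['b']['c'], so data_length = 3; the three data
// bytes are appended at to_tokudb_data, and the single offset byte stores
// where the data ENDS relative to to_tokudb_offset_start. If this is the
// first variable-length field, the stored offset is 3; a second varchar
// holding "de" would then store offset 5.
//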

inline void unpack_var_field(
    uchar* to_mysql,
    const uchar* from_tokudb_data,
    u_int32_t from_tokudb_data_len,
    u_int32_t mysql_length_bytes
    )
{
    //
    // store the length
    //
    switch (mysql_length_bytes) {
    case(1):
        to_mysql[0] = (uchar)from_tokudb_data_len;
        break;
    case(2):
        int2store(to_mysql, from_tokudb_data_len);
        break;
    default:
        assert(false);
        break;
    }
    //
    // store the data
    //
    memcpy(to_mysql+mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
}

uchar* pack_toku_field_blob(
    uchar* to_tokudb,
    const uchar* from_mysql,
    Field* field
    )
{
    u_int32_t len_bytes = field->row_pack_length();
    u_int32_t length = 0;
    uchar* data_ptr = NULL;
    memcpy(to_tokudb, from_mysql, len_bytes);

    switch (len_bytes) {
    case (1):
        length = (u_int32_t)(*from_mysql);
        break;
    case (2):
        length = uint2korr(from_mysql);
        break;
    case (3):
        length = uint3korr(from_mysql);
        break;
    case (4):
        length = uint4korr(from_mysql);
        break;
    default:
        assert(false);
    }

    if (length > 0) {
        memcpy_fixed((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
        memcpy(to_tokudb + len_bytes, data_ptr, length);
    }
    return (to_tokudb + len_bytes + length);
}
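
//
// Illustrative example (not from the original source; assumes, as the switch
// above implies, that row_pack_length() returns the number of blob length
// bytes): for a TINYBLOB holding 5 bytes, the MySQL row stores
// [0x05][pointer to data], and pack_toku_field_blob() writes
// [0x05][5 data bytes] to to_tokudb, i.e. the length bytes are copied as-is
// and the pointer is replaced by the data it points to.
//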

static int add_table_to_metadata(const char *name, TABLE* table, DB_TXN* txn) {
    int error = 0;
    DBT key;
    DBT val;
    uchar hidden_primary_key = (table->s->primary_key >= MAX_KEY);
    assert(txn);

    bzero((void *)&key, sizeof(key));
    bzero((void *)&val, sizeof(val));
    key.data = (void *)name;
    key.size = strlen(name) + 1;
    val.data = &hidden_primary_key;
    val.size = sizeof(hidden_primary_key);
    error = metadata_db->put(
        metadata_db,
        txn,
        &key,
        &val,
        DB_YESOVERWRITE
        );
    return error;
}

static int drop_table_from_metadata(const char *name, DB_TXN* txn) {
    int error = 0;
    DBT key;
    DBT data;
    assert(txn);
    bzero((void *)&key, sizeof(key));
    bzero((void *)&data, sizeof(data));
    key.data = (void *)name;
    key.size = strlen(name) + 1;
    error = metadata_db->del(
        metadata_db, 
        txn, 
        &key,
        DB_DELETE_ANY
        );
    return error;
}

static int rename_table_in_metadata(const char *from, const char *to, DB_TXN* txn) {
    int error = 0;
    DBT from_key;
    DBT to_key;
    DBT val;
    assert(txn);

    bzero((void *)&from_key, sizeof(from_key));
    bzero((void *)&to_key, sizeof(to_key));
    bzero((void *)&val, sizeof(val));
    from_key.data = (void *)from;
    from_key.size = strlen(from) + 1;
    to_key.data = (void *)to;
    to_key.size = strlen(to) + 1;

    error = metadata_db->getf_set(
        metadata_db, 
        txn, 
        0, 
        &from_key, 
        smart_dbt_metacallback, 
        &val
        );
    if (error) {
        goto cleanup;
    }

    error = metadata_db->put(
        metadata_db,
        txn,
        &to_key,
        &val,
        DB_YESOVERWRITE
        );
    if (error) {
        goto cleanup;
    }

    error = metadata_db->del(
        metadata_db, 
        txn, 
        &from_key, 
        DB_DELETE_ANY
        );
    if (error) {
        goto cleanup;
    }

    error = 0;
cleanup:
    my_free(val.data, MYF(MY_ALLOW_ZERO_PTR));
    return error;
}

static int check_table_in_metadata(const char *name, bool* table_found) {
    int error = 0;
    DBT key;
    DB_TXN* txn = NULL;
    pthread_mutex_lock(&tokudb_meta_mutex);
    error = db_env->txn_begin(db_env, 0, &txn, 0);
    if (error) {
        goto cleanup;
    }

    bzero((void *)&key, sizeof(key));
    key.data = (void *)name;
    key.size = strlen(name) + 1;

    error = metadata_db->getf_set(
        metadata_db, 
        txn, 
        0, 
        &key, 
        smart_dbt_do_nothing, 
        NULL
        );

    if (error == 0) {
        *table_found = true;
    }
    else if (error == DB_NOTFOUND){
        *table_found = false;
        error = 0;
    }

cleanup:
    if (txn) {
        commit_txn(txn, 0);
    }
    pthread_mutex_unlock(&tokudb_meta_mutex);
    return error;
}

int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
    int error;
    tokudb_trx_data* trx = NULL;
    trx = (tokudb_trx_data *) my_malloc(sizeof(*trx), MYF(MY_ZEROFILL));
    if (!trx) {
        error = ENOMEM;
        goto cleanup;
    }

    *out_trx = trx;
    error = 0;
cleanup:
    return error;
}

int generate_row_for_put(
    DB *dest_db, 
    DB *src_db,
    DBT *dest_key, 
    DBT *dest_val,
    const DBT *src_key, 
    const DBT *src_val,
    void *extra
    ) 
{
    int error;
    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    u_int32_t desc_size;
    uchar* buff = NULL;
    u_int32_t max_key_len = 0;

    row_desc = (uchar *)curr_db->descriptor->data;
    row_desc += (*(u_int32_t *)row_desc);
    desc_size = (*(u_int32_t *)row_desc) - 4;
    row_desc += 4;

    if (is_key_pk(row_desc, desc_size)) {
        assert(dest_key->flags != DB_DBT_USERMEM);
        assert(dest_val->flags != DB_DBT_USERMEM);
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
            free(dest_val->data);
        }
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        dest_val->data = src_val->data;
        dest_val->size = src_val->size;
        dest_val->flags = 0;
        error = 0;
        goto cleanup;
    }

    if (dest_key->flags == DB_DBT_USERMEM) {
        buff = (uchar *)dest_key->data;
    }
    else if (dest_key->flags == DB_DBT_REALLOC) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert(buff != NULL && max_key_len > 0);
    }
    else {
        assert(false);
    }

    dest_key->size = pack_key_from_desc(
        buff,
        row_desc,
        desc_size,
        src_key,
        src_val
        );
    assert(dest_key->ulen >= dest_key->size);
    if ((tokudb_debug & TOKUDB_DEBUG_CHECK_KEY) && !max_key_len) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert(max_key_len >= dest_key->size);
    }

    row_desc += desc_size;
    desc_size = (*(u_int32_t *)row_desc) - 4;
    row_desc += 4;
    if (!is_key_clustering(row_desc, desc_size)) {
        dest_val->size = 0;
    }
    else {
        uchar* buff = NULL;
        if (dest_val->flags == DB_DBT_USERMEM) {
            buff = (uchar *)dest_val->data;
        }
        else if (dest_val->flags == DB_DBT_REALLOC){
            if (dest_val->ulen < src_val->size) {
                void* old_ptr = dest_val->data;
                void* new_ptr = NULL;
                new_ptr = realloc(old_ptr, src_val->size);
                assert(new_ptr);
                dest_val->data = new_ptr;
                dest_val->ulen = src_val->size;
            }
            buff = (uchar *)dest_val->data;
            assert(buff != NULL);
        }
        else {
            assert(false);
        }
        dest_val->size = pack_clustering_val_from_desc(
            buff,
            row_desc,
            desc_size,
            src_val
            );
        assert(dest_val->ulen >= dest_val->size);
    }
    error = 0;
cleanup:
    return error;
}
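
//
// Note on the descriptor layout assumed above (inferred from this function,
// not stated elsewhere in this section): db->descriptor->data begins with a
// u_int32_t giving the size of (and thus the offset past) a first section,
// followed by two size-prefixed sections: the key description consumed by
// is_key_pk()/pack_key_from_desc(), then the clustering description consumed
// by is_key_clustering()/pack_clustering_val_from_desc(). Each size prefix
// counts its own four bytes, hence the repeated "- 4" adjustments.
//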

ha_tokudb::ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg):handler(hton, table_arg) 
    // flags defined in sql\handler.h
{
    int_table_flags = HA_REC_NOT_IN_SEQ | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS | HA_PRIMARY_KEY_IN_READ_INDEX | 
        HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX;
    alloc_ptr = NULL;
    rec_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    added_rows = 0;
    deleted_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = 0;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    read_lock_wait_time = 4000;
    bzero(mult_key_buff, sizeof(mult_key_buff));
    bzero(mult_rec_buff, sizeof(mult_rec_buff));
    bzero(mult_key_dbt, sizeof(mult_key_dbt));
    bzero(mult_rec_dbt, sizeof(mult_rec_dbt));
    loader = NULL;
    abort_loader = false;
    bzero(&lc, sizeof(lc));
}

//
// states if the table has an auto increment column; if so, sets *index to the
// position of that column in the table
// Parameters:
//      [out]   index - if an auto inc column exists, set to its position in the table; otherwise unchanged
// Returns:
//      true if an auto inc column exists, false otherwise
//
bool ha_tokudb::has_auto_increment_flag(uint* index) {
    //
    // check to see if we have auto increment field
    //
    bool ai_found = false;
    uint ai_index = 0;
    for (uint i = 0; i < table_share->fields; i++, ai_index++) {
        Field* field = table->field[i];
        if (field->flags & AUTO_INCREMENT_FLAG) {
            ai_found = true;
            *index = ai_index;
            break;
        }
    }
    return ai_found;
}

int ha_tokudb::open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
    int error;
    char* newname = NULL;
    uint open_mode = DB_THREAD;

    newname = (char *)my_malloc(
        get_max_dict_name_path_length(name), 
        MYF(MY_WME)
        );
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, name, "status");
    if (tokudb_debug & TOKUDB_DEBUG_OPEN) {
        TOKUDB_TRACE("open:%s\n", newname);
    }

    error = db_create(ptr, db_env, 0);
    if (error) { goto cleanup; }

    error = (*ptr)->open((*ptr), txn, newname, NULL, DB_BTREE, open_mode, 0);
    if (error) { 
        goto cleanup; 
    }
cleanup:
    if (error) {
        if (*ptr) {
            int r = (*ptr)->close(*ptr, 0);
            assert(r==0);
            *ptr = NULL;
        }
    }
    my_free(newname, MYF(MY_ALLOW_ZERO_PTR));
    return error;
}

int ha_tokudb::open_main_dictionary(const char* name, bool is_read_only, DB_TXN* txn) {
    int error;    
    char* newname = NULL;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    assert(share->file == NULL);
    assert(share->key_file[primary_key] == NULL);

    newname = (char *)my_malloc(
        get_max_dict_name_path_length(name),
        MYF(MY_WME|MY_ZEROFILL)
        );
    if (newname == NULL) { 
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    share->key_file[primary_key] = share->file;

    error = share->file->open(share->file, txn, newname, NULL, DB_BTREE, open_flags, 0);
    if (error) {
        goto exit;
    }

    if (tokudb_debug & TOKUDB_DEBUG_OPEN) {
        TOKUDB_TRACE("open:%s:file=%p\n", newname, share->file);
    }

    error = 0;
exit:
    if (error) {
        if (share->file) {
            int r = share->file->close(
                share->file,
                0
                );
            assert(r==0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    my_free(newname, MYF(MY_ALLOW_ZERO_PTR));
    return error;
}

//
// Open a secondary table, the key will be a secondary index, the data will be a primary key
//
int ha_tokudb::open_secondary_dictionary(DB** ptr, KEY* key_info, const char* name, bool is_read_only, DB_TXN* txn) {
    int error = ENOSYS;
    char dict_name[MAX_DICT_NAME_LEN];
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
    char* newname = NULL;
    uint newname_len = 0;

    sprintf(dict_name, "key-%s", key_info->name);

    newname_len = get_max_dict_name_path_length(name);
    newname = (char *)my_malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, name, dict_name);

    if ((error = db_create(ptr, db_env, 0))) {
        my_errno = error;
        goto cleanup;
    }

    if ((error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, 0))) {
        my_errno = error;
        goto cleanup;
    }
    if (tokudb_debug & TOKUDB_DEBUG_OPEN) {
        TOKUDB_TRACE("open:%s:file=%p\n", newname, *ptr);
    }
cleanup:
    if (error) {
        if (*ptr) {
            int r = (*ptr)->close(*ptr, 0);
            assert(r==0);
            *ptr = NULL;
        }
    }
    my_free(newname, MYF(MY_ALLOW_ZERO_PTR));
    return error;
}

int initialize_col_pack_info(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO *)my_malloc(
        table_share->fields*sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL)
        );
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }

    {
        u_int32_t curr_fixed_offset = 0;
        u_int32_t curr_var_index = 0;
        for (uint j = 0; j < table_share->fields; j++) {
            COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
            //
            // need to set the offsets / indexes
            // offsets are calculated AFTER the NULL bytes
            //
            if (!bitmap_is_set(&kc_info->key_filters[keynr],j)) {
                if (kc_info->field_lengths[j]) {
                    curr->col_pack_val = curr_fixed_offset;
                    curr_fixed_offset += kc_info->field_lengths[j];
                }
                else if (kc_info->length_bytes[j]) {
                    curr->col_pack_val = curr_var_index;
                    curr_var_index++;
                }
            }
        }

        //
        // set up the mcp_info
        //
        kc_info->mcp_info[keynr].var_len_offset = get_var_len_offset(
            kc_info,
            table_share,
            keynr
            );
        kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
            kc_info,
            table_share,
            keynr
            );

        error = 0;
    }
exit:
    return error;
}

int initialize_key_and_col_info(TABLE_SHARE* table_share, TABLE* table, KEY_AND_COL_INFO* kc_info, uint hidden_primary_key, uint primary_key) {
    int error = 0;
    u_int32_t curr_blob_field_index = 0;
    u_int32_t max_var_bytes = 0;
    //
    // fill in the field lengths. 0 means it is a variable sized field length
    // fill in length_bytes, 0 means it is fixed or blob
    //
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table_share->field[i];
        TOKU_TYPE toku_type = mysql_to_toku_type(field);
        uint32 pack_length = 0;
        switch (toku_type) {
        case toku_type_int:
        case toku_type_double:
        case toku_type_float:
        case toku_type_fixbinary:
        case toku_type_fixstring:
            pack_length = field->pack_length();
            assert(pack_length < 1<<16);
            kc_info->field_lengths[i] = (u_int16_t)pack_length;
            kc_info->length_bytes[i] = 0;
            break;
        case toku_type_blob:
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] = 0;
            kc_info->blob_fields[curr_blob_field_index] = i;
            curr_blob_field_index++;
            break;
        case toku_type_varstring:
        case toku_type_varbinary:
            //
            // meaning it is variable sized
            //
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] = (uchar)((Field_varstring *)field)->length_bytes;
            max_var_bytes += field->field_length;
            break;
        default:
            assert(false);
        }
    }
    kc_info->num_blobs = curr_blob_field_index;

    //
    // initialize share->num_offset_bytes
    // because MAX_REF_LENGTH is 65536, we
    // can safely set num_offset_bytes to 1 or 2
    //
    if (max_var_bytes < 256) {
        kc_info->num_offset_bytes = 1;
    }
    else {
        kc_info->num_offset_bytes = 2;
    }

    for (uint i = 0; i < table_share->keys + test(hidden_primary_key); i++) {
        //
        // do the cluster/primary key filtering calculations
        //
        if (! (i==primary_key && hidden_primary_key) ){
            if ( i == primary_key ) {
                set_key_filter(
                    &kc_info->key_filters[primary_key],
                    &table_share->key_info[primary_key],
                    table,
                    true
                    );
            }
            else {
                set_key_filter(
                    &kc_info->key_filters[i],
                    &table_share->key_info[i],
                    table,
                    true
                    );
                if (!hidden_primary_key) {
                    set_key_filter(
                        &kc_info->key_filters[i],
                        &table_share->key_info[primary_key],
                        table,
                        true
                        );
                }
            }
        }
        if (i == primary_key || table_share->key_info[i].flags & HA_CLUSTERING) {
            error = initialize_col_pack_info(kc_info,table_share,i);
            if (error) {
                goto exit;
            }
        }
    }
exit:
    return error;
}

bool ha_tokudb::can_replace_into_be_fast(TABLE_SHARE* table_share, KEY_AND_COL_INFO* kc_info, uint pk) {
    uint curr_num_DBs = table_share->keys + test(hidden_primary_key);
    bool ret_val;
    if (curr_num_DBs == 1) {
        ret_val = true;
        goto exit;
    }
    ret_val = true;
    for (uint curr_index = 0; curr_index < table_share->keys; curr_index++) {
        if (curr_index == pk) continue;
        KEY* curr_key_info = &table_share->key_info[curr_index];
        for (uint i = 0; i < curr_key_info->key_parts; i++) {
            uint16 curr_field_index = curr_key_info->key_part[i].field->field_index;
            if (!bitmap_is_set(&kc_info->key_filters[curr_index],curr_field_index)) {
                ret_val = false;
                goto exit;
            }
            if (bitmap_is_set(&kc_info->key_filters[curr_index], curr_field_index) &&
                !bitmap_is_set(&kc_info->key_filters[pk], curr_field_index)) {
                ret_val = false;
                goto exit;
            }
        }
    }
exit:
    return ret_val;
}

int ha_tokudb::initialize_share(
    const char* name,
    int mode
    )
{
    int error = 0;
    u_int64_t num_rows = 0;
    bool table_exists;

    DBUG_PRINT("info", ("share->use_count %u", share->use_count));

    table_exists = true;
    error = check_table_in_metadata(name, &table_exists);
    if (error) {
        goto exit;
    }
    if (!table_exists) {
        sql_print_error("table %s does not exist in metadata, was it moved from someplace else? Not opening table", name);
        error = HA_ADMIN_FAILED;
        goto exit;
    }

    error = initialize_key_and_col_info(
        table_share,
        table, 
        &share->kc_info,
        hidden_primary_key,
        primary_key
        );
    if (error) { goto exit; }

    error = open_main_dictionary(name, mode == O_RDONLY, NULL);
    if (error) { goto exit; }

    share->has_unique_keys = false;
    /* Open other keys;  These are part of the share structure */
    for (uint i = 0; i < table_share->keys + test(hidden_primary_key); i++) {
        if (table_share->key_info[i].flags & HA_NOSAME) {
            share->has_unique_keys = true;
        }
        if (i != primary_key) {
            error = open_secondary_dictionary(
                &share->key_file[i],
                &table_share->key_info[i],
                name,
                mode == O_RDONLY,
                NULL
                );
            if (error) {
                goto exit;
            }
        }
    }
    share->replace_into_fast = can_replace_into_be_fast(
        table_share, 
        &share->kc_info, 
        primary_key
        );
    if (!hidden_primary_key) {
        //
        // We need to set the ref_length to start at 5, to account for
        // the "infinity byte" in keys, and for placing the DBT size in the first four bytes
        //
        ref_length = sizeof(u_int32_t) + sizeof(uchar);
        KEY_PART_INFO *key_part = table->key_info[primary_key].key_part;
        KEY_PART_INFO *end = key_part + table->key_info[primary_key].key_parts;
        for (; key_part != end; key_part++) {
            ref_length += key_part->field->max_packed_col_length(key_part->length);
        }
        share->status |= STATUS_PRIMARY_KEY_INIT;
    }
    share->ref_length = ref_length;

    error = get_status();
    if (error) {
        goto exit;
    }
    if (share->version < HA_TOKU_VERSION) {
        error = ENOSYS;
        goto exit;
    }

    error = estimate_num_rows(share->file,&num_rows, NULL);
    //
    // estimate_num_rows should not fail under normal conditions
    //
    if (error == 0) {
        share->rows = num_rows;
    }
    else {
        goto exit;
    }
    //
    // initialize auto increment data
    //
    share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
    if (share->has_auto_inc) {
        init_auto_increment();
    }

    if (may_table_be_empty()) {
        share->try_table_lock = true;
    }
    else {
        share->try_table_lock = false;
    }

    error = 0;
exit:
    return error;
}

//
// Creates and opens a handle to a table which already exists in a tokudb
// database.
// Parameters:
//      [in]   name - table name
//             mode - seems to specify if table is read only
//             test_if_locked - unused
// Returns:
//      0 on success
//      1 on error
//
int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
    TOKUDB_DBUG_ENTER("ha_tokudb::open %p %s", this, name);
    int error = 0;
    int ret_val = 0;

    transaction = NULL;
    cursor = NULL;

    /* Open primary key */
    hidden_primary_key = 0;
    if ((primary_key = table_share->primary_key) >= MAX_KEY) {
        // No primary key
        primary_key = table_share->keys;
        key_used_on_scan = MAX_KEY;
        hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(u_int32_t);
    } 
    else {
        key_used_on_scan = primary_key;
    }

    /* Need some extra memory in case of packed keys */
    // the "+ 1" is for the first byte that states +/- infinity
    // multiply everything by 2 to account for clustered keys having a key and primary key together
    max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
    alloc_ptr = my_multi_malloc(MYF(MY_WME),
        &key_buff, max_key_length, 
        &key_buff2, max_key_length, 
        &key_buff3, max_key_length, 
        &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
        &fixed_cols_for_query, table_share->fields*sizeof(u_int32_t),
        &var_cols_for_query, table_share->fields*sizeof(u_int32_t),
        NullS
        );
    if (alloc_ptr == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_rec_buff_length = table_share->rec_buff_length + table_share->fields;
    rec_buff = (uchar *) my_malloc(alloced_rec_buff_length, MYF(MY_WME));
    if (rec_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    for (u_int32_t i = 0; i < (table_share->keys); i++) {
        if (i == primary_key) {
            continue;
        }
        mult_key_buff[i] = (uchar *)my_malloc(max_key_length, MYF(MY_WME));
        assert(mult_key_buff[i] != NULL);
        mult_key_dbt[i].ulen = max_key_length;
        mult_key_dbt[i].flags = DB_DBT_USERMEM;
        mult_key_dbt[i].data = mult_key_buff[i];
        if (table_share->key_info[i].flags & HA_CLUSTERING) {
            mult_rec_buff[i] = (uchar *) my_malloc(alloced_rec_buff_length, MYF(MY_WME));
            assert(mult_rec_buff[i]);
            mult_rec_dbt[i].ulen = alloced_rec_buff_length;
            mult_rec_dbt[i].flags = DB_DBT_USERMEM;
            mult_rec_dbt[i].data = mult_rec_buff[i];
        }
    }
    alloced_mult_rec_buff_length = alloced_rec_buff_length;

    /* Init shared structure */
    share = get_share(name, table_share);
    if (share == NULL) {
        ret_val = 1;
        goto exit;
    }

    thr_lock_data_init(&share->lock, &lock, NULL);

    /* Fill in shared structure, if needed */
    pthread_mutex_lock(&share->mutex);
    if (tokudb_debug & TOKUDB_DEBUG_OPEN) {
        TOKUDB_TRACE("tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d\n", 
                     this, share, share->file, table, table->s, share->use_count);
    }
    if (!share->use_count++) {
        ret_val = initialize_share(
            name,
            mode
            );
        if (ret_val) {
            free_share(share, 1);
            goto exit;
        }
    }
    ref_length = share->ref_length;     // If second open
    pthread_mutex_unlock(&share->mutex);

    key_read = false;
    stats.block_size = 1<<20;    // QQQ Tokudb DB block size

    init_hidden_prim_key_info();

    info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

exit:
    if (ret_val) {
        my_free(alloc_ptr, MYF(MY_ALLOW_ZERO_PTR));
        alloc_ptr = NULL;
        my_free(rec_buff, MYF(MY_ALLOW_ZERO_PTR));
        rec_buff = NULL;
        for (u_int32_t i = 0; i < (table_share->keys); i++) {
            my_free(mult_key_buff[i], MYF(MY_ALLOW_ZERO_PTR));
            my_free(mult_rec_buff[i], MYF(MY_ALLOW_ZERO_PTR));
        }
        if (error) {
            my_errno = error;
        }
    }
    TOKUDB_DBUG_RETURN(ret_val);
}

//
// estimate the number of rows in a DB
// Parameters:
//      [in]    db - DB whose number of rows will be estimated
//      [out]   num_rows - number of estimated rows in db
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::estimate_num_rows(DB* db, u_int64_t* num_rows, DB_TXN* txn) {
    DBT key;
    DBT data;
    int error = ENOSYS;
    DBC* crsr = NULL;
    u_int64_t less, equal, greater;
    int is_exact;
    bool do_commit = false;
    DB_TXN* txn_to_use = NULL;

    bzero((void *)&key, sizeof(key));
    bzero((void *)&data, sizeof(data));

    if (txn == NULL) {
        error = db_env->txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED);
        if (error) goto cleanup;
        do_commit = true;
    } 
    else {
        txn_to_use = txn;
    }

    error = db->cursor(db, txn_to_use, &crsr, 0);
    if (error) { goto cleanup; }

    //
    // get the first element, then estimate number of records
    // by calling key_range64 on the first element
    //
    error = crsr->c_get(crsr, &key, &data, DB_FIRST);
    if (error == DB_NOTFOUND) {
        *num_rows = 0;
        error = 0;
        goto cleanup;
    }
    else if (error) { goto cleanup; }

    error = db->key_range64(
        db, 
        txn_to_use, 
        &key, 
        &less,
        &equal,
        &greater,
        &is_exact
        );
    if (error) {
        goto cleanup;
    }

    *num_rows = equal + greater;
    error = 0;
cleanup:
    if (crsr != NULL) {
        int r = crsr->c_close(crsr);
        assert(r==0);
        crsr = NULL;
    }
    if (do_commit) {
        commit_txn(txn_to_use, 0);
        txn_to_use = NULL;
    }
    return error;
}
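
//
// Illustrative note (not from the original source): key_range64 estimates how
// many keys compare less than, equal to, and greater than the probe key. Since
// the probe here is the first key in the dictionary, "less" should be roughly
// 0 and "equal + greater" approximates the total row count; e.g. for a
// dictionary of 1000 rows a typical result is less=0, equal=1, greater=999.
//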

int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn ){
    return write_metadata(db, &curr_key_data, sizeof(curr_key_data), data, size, txn);
}

int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction){
    int error;
    DBT key;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    //
    // transaction to be used for removing metadata from status.tokudb
    //
    if (transaction == NULL) {
        error = db_env->txn_begin(db_env, 0, &txn, 0);
        if (error) { 
            goto cleanup;
        }
        do_commit = true;
    }
    else {
        txn = transaction;
    }

    bzero(&key, sizeof(key));
    key.data = key_data;
    key.size = key_size;
    error = db->del(db, txn, &key, DB_DELETE_ANY);
    if (error) { 
        goto cleanup; 
    }

    error = 0;
cleanup:
    if (do_commit && txn) {
        if (!error) {
            commit_txn(txn, DB_TXN_NOSYNC);
        }
        else {
            abort_txn(txn);
        }
    }
    return error;
}

//
// helper function to write a piece of metadata in to status.tokudb
//
int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction ){
    int error;
    DBT key;
    DBT value;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    //
    // transaction to be used for putting metadata into status.tokudb
    //
    if (transaction == NULL) {
        error = db_env->txn_begin(db_env, 0, &txn, 0);
        if (error) { 
            goto cleanup;
        }
        do_commit = true;
    }
    else {
        txn = transaction;
    }

    bzero(&key, sizeof(key));
    bzero(&value, sizeof(value));
    key.data = key_data;
    key.size = key_size;
    value.data = val_data;
    value.size = val_size;
    error = db->put(db, txn, &key, &value, 0);
    if (error) { 
        goto cleanup; 
    }

    error = 0;
cleanup:
    if (do_commit && txn) {
        if (!error) {
            commit_txn(txn, DB_TXN_NOSYNC);
        }
        else {
            abort_txn(txn);
        }
    }
    return error;
}

//
// Updates status.tokudb with a new max value used for the auto increment column
// Parameters:
//      [in]    db - this will always be status.tokudb
//              val - value to store
// Returns:
//      0 on success, error otherwise
//
int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val){
    return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
}

//
// Writes the initial auto increment value, as specified by create table
// so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
// then the value 100 will be stored here in val
// Parameters:
//      [in]    db - this will always be status.tokudb
//              val - value to store
// Returns:
//      0 on success, error otherwise
//
int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn){
    return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
}

//
// Closes a handle to a table. 
//
int ha_tokudb::close(void) {
    TOKUDB_DBUG_ENTER("ha_tokudb::close %p", this);
    TOKUDB_DBUG_RETURN(__close(0));
}

int ha_tokudb::__close(int mutex_is_locked) {
    TOKUDB_DBUG_ENTER("ha_tokudb::__close %p", this);
    if (tokudb_debug & TOKUDB_DEBUG_OPEN) 
        TOKUDB_TRACE("close:%p\n", this);
    my_free(rec_buff, MYF(MY_ALLOW_ZERO_PTR));
    my_free(blob_buff, MYF(MY_ALLOW_ZERO_PTR));
    my_free(alloc_ptr, MYF(MY_ALLOW_ZERO_PTR));
    for (u_int32_t i = 0; i < (table_share->keys); i++) {
        my_free(mult_key_buff[i], MYF(MY_ALLOW_ZERO_PTR));
        my_free(mult_rec_buff[i], MYF(MY_ALLOW_ZERO_PTR));
    }
    rec_buff = NULL;
    alloc_ptr = NULL;
    ha_tokudb::reset();
    TOKUDB_DBUG_RETURN(free_share(share, mutex_is_locked));
}
  1731. //
  1732. // Reallocate record buffer (rec_buff) if needed
  1733. // If not needed, does nothing
  1734. // Parameters:
  1735. // length - size of buffer required for rec_buff
  1736. //
  1737. bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
  1738. if (!rec_buff || (length > alloced_rec_buff_length)) {
  1739. uchar *newptr;
  1740. if (!(newptr = (uchar *) my_realloc((void *) rec_buff, length, MYF(MY_ALLOW_ZERO_PTR))))
  1741. return 1;
  1742. rec_buff = newptr;
  1743. alloced_rec_buff_length = length;
  1744. }
  1745. return 0;
  1746. }
  1747. void ha_tokudb::fix_mult_rec_buff() {
  1748. if (alloced_rec_buff_length > alloced_mult_rec_buff_length) {
  1749. for (uint i = 0; i < table_share->keys; i++) {
  1750. if (table_share->key_info[i].flags & HA_CLUSTERING) {
  1751. uchar *newptr;
  1752. if (!(newptr = (uchar *) my_realloc((void *) mult_rec_buff[i], alloced_rec_buff_length, MYF(MY_ALLOW_ZERO_PTR)))) {
  1753. assert(false);
  1754. }
  1755. mult_rec_buff[i] = newptr;
  1756. mult_rec_dbt[i].ulen = alloced_rec_buff_length;
  1757. mult_rec_dbt[i].flags = DB_DBT_USERMEM;
  1758. mult_rec_dbt[i].data = mult_rec_buff[i];
  1759. }
  1760. }
  1761. alloced_mult_rec_buff_length = alloced_rec_buff_length;
  1762. }
  1763. }
  1764. /* Calculate max length needed for row */
  1765. ulong ha_tokudb::max_row_length(const uchar * buf) {
  1766. ulong length = table_share->reclength + table_share->fields * 2;
  1767. uint *ptr, *end;
  1768. for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
  1769. Field_blob *blob = ((Field_blob *) table->field[*ptr]);
  1770. length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
  1771. }
  1772. return length;
  1773. }
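//
// Worked example of the bound computed above (values are hypothetical):
// a table with reclength = 20, 3 fields, and one blob currently holding
// 100 bytes yields 20 + 3*2 + (100 + 2) = 128 bytes. The per-field and
// per-blob "+2" slack covers length bytes in the packed format.
//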
//
// Pack a row for storage: takes the row in MySQL format (record) and
// produces the packed storage format in a DBT (row).
// If the row is of fixed length, just store the row 'as is'.
// If not, we will generate a packed row suitable for storage.
// This will only fail if we don't have enough memory to pack the row,
// which may only happen in rows with blobs, as the default row length is
// pre-allocated.
// Parameters:
// [out] row - DBT that will hold the packed row
// [in] record - row in MySQL format
// index - index into key_file identifying the dictionary the row is packed for
//
  1789. int ha_tokudb::pack_row(
  1790. DBT * row,
  1791. const uchar* record,
  1792. uint index
  1793. )
  1794. {
  1795. uchar* fixed_field_ptr = NULL;
  1796. uchar* var_field_offset_ptr = NULL;
  1797. uchar* start_field_data_ptr = NULL;
  1798. uchar* var_field_data_ptr = NULL;
  1799. int r = ENOSYS;
  1800. bzero((void *) row, sizeof(*row));
  1801. my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
  1802. if (table_share->blob_fields) {
  1803. if (fix_rec_buff_for_blob(max_row_length(record))) {
  1804. r = HA_ERR_OUT_OF_MEM;
  1805. goto cleanup;
  1806. }
  1807. }
  1808. /* Copy null bits */
  1809. memcpy(rec_buff, record, table_share->null_bytes);
  1810. fixed_field_ptr = rec_buff + table_share->null_bytes;
  1811. var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].var_len_offset;
  1812. start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
  1813. var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
//
// pack each field that is not filtered out by this key: fixed length
// fields go into the fixed area, variable length fields into the var
// area, and blob fields are appended after this loop
//
  1817. for (uint i = 0; i < table_share->fields; i++) {
  1818. Field* field = table->field[i];
  1819. uint curr_field_offset = field_offset(field, table);
  1820. if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
  1821. continue;
  1822. }
  1823. if (share->kc_info.field_lengths[i]) {
  1824. fixed_field_ptr = pack_fixed_field(
  1825. fixed_field_ptr,
  1826. record + curr_field_offset,
  1827. share->kc_info.field_lengths[i]
  1828. );
  1829. }
  1830. else if (share->kc_info.length_bytes[i]) {
  1831. var_field_data_ptr = pack_var_field(
  1832. var_field_offset_ptr,
  1833. var_field_data_ptr,
  1834. start_field_data_ptr,
  1835. record + curr_field_offset,
  1836. share->kc_info.length_bytes[i],
  1837. share->kc_info.num_offset_bytes
  1838. );
  1839. var_field_offset_ptr += share->kc_info.num_offset_bytes;
  1840. }
  1841. }
  1842. for (uint i = 0; i < share->kc_info.num_blobs; i++) {
  1843. Field* field = table->field[share->kc_info.blob_fields[i]];
  1844. var_field_data_ptr = pack_toku_field_blob(
  1845. var_field_data_ptr,
  1846. record + field_offset(field, table),
  1847. field
  1848. );
  1849. }
  1850. row->data = rec_buff;
  1851. row->size = (size_t) (var_field_data_ptr - rec_buff);
  1852. r = 0;
  1853. cleanup:
  1854. dbug_tmp_restore_column_map(table->write_set, old_map);
  1855. return r;
  1856. }
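//
// Resulting layout of a row packed by pack_row (fields covered by the key
// filter are omitted from the packed row):
//
//   [null bytes][fixed fields][var field end-offsets][var field data][blob data]
//
// Each var field end-offset is num_offset_bytes wide (1 or 2 bytes) and is
// relative to the start of the var field data area, which is how unpack_row
// below recomputes individual field lengths.
//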
  1857. int ha_tokudb::unpack_blobs(
  1858. uchar* record,
  1859. const uchar* from_tokudb_blob,
  1860. u_int32_t num_bytes,
  1861. bool check_bitmap
  1862. )
  1863. {
  1864. uint error = 0;
  1865. uchar* ptr = NULL;
  1866. const uchar* buff = NULL;
  1867. //
// assert that num_bytes > 0 only when share->kc_info.num_blobs > 0
  1869. //
  1870. assert( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
  1871. if (num_bytes > num_blob_bytes) {
  1872. ptr = (uchar *)my_realloc((void *)blob_buff, num_bytes, MYF(MY_ALLOW_ZERO_PTR));
  1873. if (ptr == NULL) {
  1874. error = ENOMEM;
  1875. goto exit;
  1876. }
  1877. blob_buff = ptr;
  1878. num_blob_bytes = num_bytes;
  1879. }
  1880. memcpy(blob_buff, from_tokudb_blob, num_bytes);
  1881. buff= blob_buff;
  1882. for (uint i = 0; i < share->kc_info.num_blobs; i++) {
  1883. u_int32_t curr_field_index = share->kc_info.blob_fields[i];
  1884. bool skip = check_bitmap ?
  1885. !(bitmap_is_set(table->read_set,curr_field_index) ||
  1886. bitmap_is_set(table->write_set,curr_field_index)) :
  1887. false;
  1888. Field* field = table->field[curr_field_index];
  1889. u_int32_t len_bytes = field->row_pack_length();
  1890. buff = unpack_toku_field_blob(
  1891. record + field_offset(field, table),
  1892. buff,
  1893. len_bytes,
  1894. skip
  1895. );
  1896. }
  1897. error = 0;
  1898. exit:
  1899. return error;
  1900. }
  1901. //
  1902. // take the row passed in as a DBT*, and convert it into a row in MySQL format in record
  1903. // Parameters:
  1904. // [out] record - row in MySQL format
  1905. // [in] row - row stored in DBT to be converted
  1906. //
  1907. int ha_tokudb::unpack_row(
  1908. uchar* record,
  1909. DBT const *row,
  1910. DBT const *key,
  1911. uint index
  1912. )
  1913. {
  1914. //
  1915. // two cases, fixed length row, and variable length row
  1916. // fixed length row is first below
  1917. //
  1918. /* Copy null bits */
  1919. int error = 0;
  1920. const uchar* fixed_field_ptr = (const uchar *) row->data;
  1921. const uchar* var_field_offset_ptr = NULL;
  1922. const uchar* var_field_data_ptr = NULL;
  1923. u_int32_t data_end_offset = 0;
  1924. memcpy(record, fixed_field_ptr, table_share->null_bytes);
  1925. fixed_field_ptr += table_share->null_bytes;
  1926. var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].var_len_offset;
  1927. var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
  1928. //
  1929. // unpack the key, if necessary
  1930. //
  1931. if (!(hidden_primary_key && index == primary_key)) {
  1932. unpack_key(record,key,index);
  1933. }
  1934. u_int32_t last_offset = 0;
  1935. //
  1936. // we have two methods of unpacking, one if we need to unpack the entire row
  1937. // the second if we unpack a subset of the entire row
  1938. // first method here is if we unpack the entire row
  1939. //
  1940. if (unpack_entire_row) {
  1941. //
  1942. // fill in parts of record that are not part of the key
  1943. //
  1944. for (uint i = 0; i < table_share->fields; i++) {
  1945. Field* field = table->field[i];
  1946. if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
  1947. continue;
  1948. }
  1949. if (share->kc_info.field_lengths[i]) {
  1950. fixed_field_ptr = unpack_fixed_field(
  1951. record + field_offset(field, table),
  1952. fixed_field_ptr,
  1953. share->kc_info.field_lengths[i]
  1954. );
  1955. }
  1956. //
  1957. // here, we DO modify var_field_data_ptr or var_field_offset_ptr
  1958. // as we unpack variable sized fields
  1959. //
  1960. else if (share->kc_info.length_bytes[i]) {
  1961. switch (share->kc_info.num_offset_bytes) {
  1962. case (1):
  1963. data_end_offset = var_field_offset_ptr[0];
  1964. break;
  1965. case (2):
  1966. data_end_offset = uint2korr(var_field_offset_ptr);
  1967. break;
  1968. default:
  1969. assert(false);
  1970. break;
  1971. }
  1972. unpack_var_field(
  1973. record + field_offset(field, table),
  1974. var_field_data_ptr,
  1975. data_end_offset - last_offset,
  1976. share->kc_info.length_bytes[i]
  1977. );
  1978. var_field_offset_ptr += share->kc_info.num_offset_bytes;
  1979. var_field_data_ptr += data_end_offset - last_offset;
  1980. last_offset = data_end_offset;
  1981. }
  1982. }
  1983. error = unpack_blobs(
  1984. record,
  1985. var_field_data_ptr,
  1986. row->size - (u_int32_t)(var_field_data_ptr - (const uchar *)row->data),
  1987. false
  1988. );
  1989. if (error) {
  1990. goto exit;
  1991. }
  1992. }
  1993. //
  1994. // in this case, we unpack only what is specified
  1995. // in fixed_cols_for_query and var_cols_for_query
  1996. //
  1997. else {
  1998. //
  1999. // first the fixed fields
  2000. //
  2001. for (u_int32_t i = 0; i < num_fixed_cols_for_query; i++) {
  2002. uint field_index = fixed_cols_for_query[i];
  2003. Field* field = table->field[field_index];
  2004. unpack_fixed_field(
  2005. record + field_offset(field, table),
  2006. fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
  2007. share->kc_info.field_lengths[field_index]
  2008. );
  2009. }
  2010. //
  2011. // now the var fields
  2012. // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
  2013. //
  2014. for (u_int32_t i = 0; i < num_var_cols_for_query; i++) {
  2015. uint field_index = var_cols_for_query[i];
  2016. Field* field = table->field[field_index];
  2017. u_int32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
  2018. u_int32_t data_start_offset;
  2019. u_int32_t field_len;
  2020. get_var_field_info(
  2021. &field_len,
  2022. &data_start_offset,
  2023. var_field_index,
  2024. var_field_offset_ptr,
  2025. share->kc_info.num_offset_bytes
  2026. );
  2027. unpack_var_field(
  2028. record + field_offset(field, table),
  2029. var_field_data_ptr + data_start_offset,
  2030. field_len,
  2031. share->kc_info.length_bytes[field_index]
  2032. );
  2033. }
  2034. if (read_blobs) {
  2035. //
  2036. // now the blobs
  2037. //
  2038. get_blob_field_info(
  2039. &data_end_offset,
  2040. share->kc_info.mcp_info[index].len_of_offsets,
  2041. var_field_data_ptr,
  2042. share->kc_info.num_offset_bytes
  2043. );
  2044. var_field_data_ptr += data_end_offset;
  2045. error = unpack_blobs(
  2046. record,
  2047. var_field_data_ptr,
  2048. row->size - (u_int32_t)(var_field_data_ptr - (const uchar *)row->data),
  2049. true
  2050. );
  2051. if (error) {
  2052. goto exit;
  2053. }
  2054. }
  2055. }
  2056. error = 0;
  2057. exit:
  2058. return error;
  2059. }
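//
// Example of the two paths above (query text is hypothetical): for
// "SELECT a FROM t WHERE b = 1" only the needed columns are placed in
// fixed_cols_for_query/var_cols_for_query and unpack_entire_row is false,
// so only those columns are unpacked; "SELECT * FROM t" unpacks everything.
//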
  2060. u_int32_t ha_tokudb::place_key_into_mysql_buff(
  2061. KEY* key_info,
  2062. uchar * record,
  2063. uchar* data
  2064. )
  2065. {
  2066. KEY_PART_INFO *key_part = key_info->key_part, *end = key_part + key_info->key_parts;
  2067. uchar *pos = data;
  2068. for (; key_part != end; key_part++) {
  2069. if (key_part->field->null_bit) {
  2070. uint null_offset = get_null_offset(table, key_part->field);
  2071. if (*pos++ == NULL_COL_VAL) { // Null value
  2072. //
  2073. // We don't need to reset the record data as we will not access it
  2074. // if the null data is set
  2075. //
  2076. record[null_offset] |= key_part->field->null_bit;
  2077. continue;
  2078. }
  2079. record[null_offset] &= ~key_part->field->null_bit;
  2080. }
  2081. //
  2082. // HOPEFULLY TEMPORARY
  2083. //
  2084. assert(table->s->db_low_byte_first);
  2085. pos = unpack_toku_key_field(
  2086. record + field_offset(key_part->field, table),
  2087. pos,
  2088. key_part->field,
  2089. key_part->length
  2090. );
  2091. }
  2092. return pos-data;
  2093. }
  2094. //
  2095. // Store the key and the primary key into the row
  2096. // Parameters:
  2097. // [out] record - key stored in MySQL format
  2098. // [in] key - key stored in DBT to be converted
  2099. // index -index into key_file that represents the DB
  2100. // unpacking a key of
  2101. //
  2102. void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
  2103. u_int32_t bytes_read;
  2104. uchar *pos = (uchar *) key->data + 1;
  2105. bytes_read = place_key_into_mysql_buff(
  2106. &table->key_info[index],
  2107. record,
  2108. pos
  2109. );
  2110. if( (index != primary_key) && !hidden_primary_key) {
  2111. //
  2112. // also unpack primary key
  2113. //
  2114. place_key_into_mysql_buff(
  2115. &table->key_info[primary_key],
  2116. record,
  2117. pos+bytes_read
  2118. );
  2119. }
  2120. }
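//
// A packed key in a secondary dictionary has the layout:
//
//   [infinity byte][packed secondary key columns][packed primary key columns]
//
// which is why unpack_key starts at key->data + 1 and, for secondary
// indexes without a hidden primary key, continues at pos + bytes_read to
// recover the primary key columns as well.
//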
  2121. u_int32_t ha_tokudb::place_key_into_dbt_buff(
  2122. KEY* key_info,
  2123. uchar * buff,
  2124. const uchar * record,
  2125. bool* has_null,
  2126. int key_length
  2127. )
  2128. {
  2129. KEY_PART_INFO *key_part = key_info->key_part;
  2130. KEY_PART_INFO *end = key_part + key_info->key_parts;
  2131. uchar* curr_buff = buff;
  2132. *has_null = false;
  2133. for (; key_part != end && key_length > 0; key_part++) {
  2134. //
// accessing key_part->field->null_bit instead of key_part->null_bit
  2136. // because key_part->null_bit is not set in add_index
  2137. // filed ticket 862 to look into this
  2138. //
  2139. if (key_part->field->null_bit) {
  2140. /* Store 0 if the key part is a NULL part */
  2141. uint null_offset = get_null_offset(table, key_part->field);
  2142. if (record[null_offset] & key_part->field->null_bit) {
  2143. *curr_buff++ = NULL_COL_VAL;
  2144. *has_null = true;
  2145. continue;
  2146. }
  2147. *curr_buff++ = NONNULL_COL_VAL; // Store NOT NULL marker
  2148. }
  2149. //
  2150. // HOPEFULLY TEMPORARY
  2151. //
  2152. assert(table->s->db_low_byte_first);
  2153. //
// accessing field_offset(key_part->field) instead of key_part->offset
  2155. // because key_part->offset is SET INCORRECTLY in add_index
  2156. // filed ticket 862 to look into this
  2157. //
  2158. curr_buff = pack_toku_key_field(
  2159. curr_buff,
  2160. (uchar *) (record + field_offset(key_part->field, table)),
  2161. key_part->field,
  2162. key_part->length
  2163. );
  2164. key_length -= key_part->length;
  2165. }
  2166. return curr_buff - buff;
  2167. }
  2168. //
  2169. // Create a packed key from a row. This key will be written as such
  2170. // to the index tree. This will never fail as the key buffer is pre-allocated.
  2171. // Parameters:
  2172. // [out] key - DBT that holds the key
  2173. // [in] key_info - holds data about the key, such as it's length and offset into record
  2174. // [out] buff - buffer that will hold the data for key (unless
  2175. // we have a hidden primary key)
  2176. // [in] record - row from which to create the key
// key_length - maximum number of key bytes to pack; callers currently pass MAX_KEY_LENGTH
  2178. // Returns:
  2179. // the parameter key
  2180. //
  2181. DBT* ha_tokudb::create_dbt_key_from_key(
  2182. DBT * key,
  2183. KEY* key_info,
  2184. uchar * buff,
  2185. const uchar * record,
  2186. bool* has_null,
  2187. bool dont_pack_pk,
  2188. int key_length
  2189. )
  2190. {
  2191. u_int32_t size = 0;
  2192. uchar* tmp_buff = buff;
  2193. my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
  2194. key->data = buff;
  2195. //
  2196. // first put the "infinity" byte at beginning. States if missing columns are implicitly
  2197. // positive infinity or negative infinity or zero. For this, because we are creating key
  2198. // from a row, there is no way that columns can be missing, so in practice,
  2199. // this will be meaningless. Might as well put in a value
  2200. //
  2201. *tmp_buff++ = COL_ZERO;
  2202. size++;
  2203. size += place_key_into_dbt_buff(
  2204. key_info,
  2205. tmp_buff,
  2206. record,
  2207. has_null,
  2208. key_length
  2209. );
  2210. if (!dont_pack_pk) {
  2211. tmp_buff = buff + size;
  2212. if (hidden_primary_key) {
  2213. memcpy_fixed(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
  2214. size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
  2215. }
  2216. else {
  2217. bool tmp_bool = false;
  2218. size += place_key_into_dbt_buff(
  2219. &table->key_info[primary_key],
  2220. tmp_buff,
  2221. record,
  2222. &tmp_bool,
  2223. MAX_KEY_LENGTH //this parameter does not matter
  2224. );
  2225. }
  2226. }
  2227. key->size = size;
  2228. DBUG_DUMP("key", (uchar *) key->data, key->size);
  2229. dbug_tmp_restore_column_map(table->write_set, old_map);
  2230. return key;
  2231. }
  2232. //
  2233. // Create a packed key from a row. This key will be written as such
  2234. // to the index tree. This will never fail as the key buffer is pre-allocated.
  2235. // Parameters:
  2236. // [out] key - DBT that holds the key
  2237. // keynr - index for which to create the key
  2238. // [out] buff - buffer that will hold the data for key (unless
  2239. // we have a hidden primary key)
  2240. // [in] record - row from which to create the key
  2241. // [out] has_null - says if the key has a NULL value for one of its columns
// key_length - maximum number of key bytes to pack; callers currently pass MAX_KEY_LENGTH
  2243. // Returns:
  2244. // the parameter key
  2245. //
  2246. DBT *ha_tokudb::create_dbt_key_from_table(
  2247. DBT * key,
  2248. uint keynr,
  2249. uchar * buff,
  2250. const uchar * record,
  2251. bool* has_null,
  2252. int key_length
  2253. )
  2254. {
  2255. TOKUDB_DBUG_ENTER("ha_tokudb::create_dbt_key_from_table");
  2256. bzero((void *) key, sizeof(*key));
  2257. if (hidden_primary_key && keynr == primary_key) {
  2258. key->data = buff;
  2259. memcpy(buff, &current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
  2260. key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
  2261. *has_null = false;
  2262. DBUG_RETURN(key);
  2263. }
  2264. DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length));
  2265. }
  2266. DBT* ha_tokudb::create_dbt_key_for_lookup(
  2267. DBT * key,
  2268. KEY* key_info,
  2269. uchar * buff,
  2270. const uchar * record,
  2271. bool* has_null,
  2272. int key_length
  2273. )
  2274. {
TOKUDB_DBUG_ENTER("ha_tokudb::create_dbt_key_for_lookup");
  2276. DBUG_RETURN(create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length));
  2277. }
  2278. //
// Create a packed key from a MySQL unpacked key (like the one that is
// sent from index_read()). This key is to be used to read a row
  2281. // Parameters:
  2282. // [out] key - DBT that holds the key
  2283. // keynr - index for which to pack the key
  2284. // [out] buff - buffer that will hold the data for key
  2285. // [in] key_ptr - MySQL unpacked key
  2286. // key_length - length of key_ptr
  2287. // Returns:
  2288. // the parameter key
  2289. //
  2290. DBT *ha_tokudb::pack_key(
  2291. DBT * key,
  2292. uint keynr,
  2293. uchar * buff,
  2294. const uchar * key_ptr,
  2295. uint key_length,
  2296. int8_t inf_byte
  2297. )
  2298. {
  2299. TOKUDB_DBUG_ENTER("ha_tokudb::pack_key");
  2300. KEY *key_info = &table->key_info[keynr];
  2301. KEY_PART_INFO *key_part = key_info->key_part;
  2302. KEY_PART_INFO *end = key_part + key_info->key_parts;
  2303. my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
  2304. bzero((void *) key, sizeof(*key));
  2305. key->data = buff;
  2306. //
  2307. // first put the "infinity" byte at beginning. States if missing columns are implicitly
  2308. // positive infinity or negative infinity
  2309. //
  2310. *buff++ = (uchar)inf_byte;
  2311. for (; key_part != end && (int) key_length > 0; key_part++) {
  2312. uint offset = 0;
  2313. if (key_part->null_bit) {
  2314. if (!(*key_ptr == 0)) {
  2315. *buff++ = NULL_COL_VAL;
  2316. key_length -= key_part->store_length;
  2317. key_ptr += key_part->store_length;
  2318. continue;
  2319. }
  2320. *buff++ = NONNULL_COL_VAL;
  2321. offset = 1; // Data is at key_ptr+1
  2322. }
  2323. assert(table->s->db_low_byte_first);
  2324. buff = pack_key_toku_key_field(
  2325. buff,
  2326. (uchar *) key_ptr + offset,
  2327. key_part->field,
  2328. key_part->length
  2329. );
  2330. key_ptr += key_part->store_length;
  2331. key_length -= key_part->store_length;
  2332. }
  2333. key->size = (buff - (uchar *) key->data);
  2334. DBUG_DUMP("key", (uchar *) key->data, key->size);
  2335. dbug_tmp_restore_column_map(table->write_set, old_map);
  2336. DBUG_RETURN(key);
  2337. }
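//
// Illustrative sketch (not compiled in): how a range probe might be built
// with pack_key. COL_NEG_INF is an assumed name for the negative-infinity
// byte; only COL_ZERO appears in this file, so treat the constant and the
// function name range_probe_sketch as hypothetical here.
//
#if 0
void range_probe_sketch(ha_tokudb* h, uint keynr, const uchar* key_ptr, uint key_len) {
    DBT probe;
    uchar probe_buff[MAX_KEY_LENGTH];
    // missing key parts compare as negative infinity, so a cursor set_range
    // on this key lands on the first row whose key is >= the supplied prefix
    h->pack_key(&probe, keynr, probe_buff, key_ptr, key_len, COL_NEG_INF);
}
#endif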
  2338. //
  2339. // Reads the last element of dictionary of index keynr, and places
  2340. // the data into table->record[1].
  2341. //
  2342. int ha_tokudb::read_last(uint keynr) {
  2343. TOKUDB_DBUG_ENTER("ha_tokudb::read_last");
  2344. int do_commit = 0;
  2345. if (transaction == NULL) {
  2346. int r = db_env->txn_begin(db_env, 0, &transaction, 0);
  2347. assert(r == 0);
  2348. do_commit = 1;
  2349. }
  2350. int error = index_init(keynr, 0);
  2351. if (error == 0)
  2352. error = index_last(table->record[1]);
  2353. index_end();
  2354. if (do_commit) {
  2355. commit_txn(transaction, 0);
  2356. transaction = NULL;
  2357. }
  2358. TOKUDB_DBUG_RETURN(error);
  2359. }
  2360. //
  2361. // get max used hidden primary key value
  2362. //
  2363. void ha_tokudb::init_hidden_prim_key_info() {
TOKUDB_DBUG_ENTER("ha_tokudb::init_hidden_prim_key_info");
  2365. pthread_mutex_lock(&share->mutex);
  2366. if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
  2367. (void) extra(HA_EXTRA_KEYREAD);
  2368. int error = read_last(primary_key);
  2369. (void) extra(HA_EXTRA_NO_KEYREAD);
  2370. if (error == 0) {
  2371. share->auto_ident = hpk_char_to_num(current_ident);
  2372. }
  2373. share->status |= STATUS_PRIMARY_KEY_INIT;
  2374. }
  2375. pthread_mutex_unlock(&share->mutex);
  2376. DBUG_VOID_RETURN;
  2377. }
  2378. /** @brief
  2379. Get metadata info stored in status.tokudb
  2380. */
  2381. int ha_tokudb::get_status() {
  2382. TOKUDB_DBUG_ENTER("ha_tokudb::get_status");
  2383. DB_TXN* txn = NULL;
  2384. DBT key, value;
  2385. HA_METADATA_KEY curr_key;
  2386. int error;
  2387. //
  2388. // open status.tokudb
  2389. //
  2390. if (!share->status_block) {
  2391. error = open_status_dictionary(
  2392. &share->status_block,
  2393. share->table_name,
  2394. NULL
  2395. );
  2396. if (error) {
  2397. goto cleanup;
  2398. }
  2399. }
  2400. //
// transaction to be used for reading metadata from status.tokudb
  2402. //
  2403. bzero(&key, sizeof(key));
  2404. bzero(&value, sizeof(value));
  2405. key.data = &curr_key;
  2406. key.size = sizeof(curr_key);
  2407. value.flags = DB_DBT_USERMEM;
  2408. error = db_env->txn_begin(db_env, 0, &txn, 0);
  2409. if (error) { goto cleanup; }
  2410. assert(share->status_block);
  2411. //
  2412. // get version
  2413. //
  2414. value.ulen = sizeof(share->version);
  2415. value.data = &share->version;
  2416. curr_key = hatoku_version;
  2417. error = share->status_block->get(
  2418. share->status_block,
  2419. txn,
  2420. &key,
  2421. &value,
  2422. 0
  2423. );
  2424. if (error == DB_NOTFOUND) {
  2425. share->version = 0;
  2426. }
  2427. else if (error || value.size != sizeof(share->version)) {
  2428. if (error == 0) {
  2429. error = HA_ERR_INTERNAL_ERROR;
  2430. }
  2431. goto cleanup;
  2432. }
  2433. //
  2434. // get capabilities
  2435. //
  2436. curr_key = hatoku_capabilities;
  2437. value.ulen = sizeof(share->capabilities);
  2438. value.data = &share->capabilities;
  2439. error = share->status_block->get(
  2440. share->status_block,
  2441. txn,
  2442. &key,
  2443. &value,
  2444. 0
  2445. );
  2446. if (error == DB_NOTFOUND) {
  2447. share->capabilities= 0;
  2448. }
else if (error || value.size != sizeof(share->capabilities)) {
  2450. if (error == 0) {
  2451. error = HA_ERR_INTERNAL_ERROR;
  2452. }
  2453. goto cleanup;
  2454. }
  2455. error = 0;
  2456. cleanup:
  2457. if (txn) {
  2458. commit_txn(txn,0);
  2459. }
  2460. TOKUDB_DBUG_RETURN(error);
  2461. }
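//
// Note on the DB_DBT_USERMEM pattern used above: value.data points at
// caller-owned storage, value.ulen caps how much the get may write, and on
// success value.size holds the number of bytes actually returned, which is
// why each read re-points data/ulen before calling get.
//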
  2462. /** @brief
Return an estimate of the number of rows in the table.
  2464. Used when sorting to allocate buffers and by the optimizer.
  2465. This is used in filesort.cc.
  2466. */
  2467. ha_rows ha_tokudb::estimate_rows_upper_bound() {
  2468. TOKUDB_DBUG_ENTER("ha_tokudb::estimate_rows_upper_bound");
  2469. DBUG_RETURN(share->rows + HA_TOKUDB_EXTRA_ROWS);
  2470. }
  2471. //
  2472. // Function that compares two primary keys that were saved as part of rnd_pos
  2473. // and ::position
  2474. //
  2475. int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
  2476. int ret_val = 0;
  2477. ret_val = tokudb_compare_two_keys(
  2478. ref1 + sizeof(u_int32_t),
  2479. *(u_int32_t *)ref1,
  2480. ref2 + sizeof(u_int32_t),
  2481. *(u_int32_t *)ref2,
  2482. (uchar *)share->file->descriptor->data + 4,
  2483. *(u_int32_t *)share->file->descriptor->data - 4,
  2484. false
  2485. );
  2486. return ret_val;
  2487. }
  2488. bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
  2489. //
// This is a horrendous hack for now, copied from InnoDB.
// It states that if the auto increment create field has changed,
// via an "alter table foo auto_increment=new_val", then this
// change is incompatible and the entire table must be rebuilt.
// This will need to be fixed.
  2495. //
  2496. if ((info->used_fields & HA_CREATE_USED_AUTO) &&
  2497. info->auto_increment_value != 0) {
  2498. return COMPATIBLE_DATA_NO;
  2499. }
  2500. if (table_changes != IS_EQUAL_YES)
  2501. return COMPATIBLE_DATA_NO;
  2502. return COMPATIBLE_DATA_YES;
  2503. }
//
// Documents start_bulk_insert (defined below, after the helper
// may_table_be_empty): a method that is called before the beginning of many
// calls to insert rows (ha_tokudb::write_row). There is no guarantee
// that start_bulk_insert is called, but there is a guarantee
// that if start_bulk_insert is called, then end_bulk_insert is
// called as well.
// Parameters:
// [in] rows - an estimate of the number of rows that will be inserted;
// if the number of rows is unknown (such as when doing
// "insert into foo select * from bar"), then rows
// will be 0
//
//
// may_table_be_empty returns true if the table MAY be empty.
// It is NOT meant to be a 100% check for emptiness.
// This is used for a bulk load optimization.
//
  2521. bool ha_tokudb::may_table_be_empty() {
  2522. int error;
  2523. bool ret_val = false;
  2524. DBC* tmp_cursor = NULL;
  2525. DB_TXN* txn = NULL;
  2526. error = db_env->txn_begin(db_env, 0, &txn, 0);
  2527. if (error) {
  2528. goto cleanup;
  2529. }
  2530. error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
  2531. if (error) {
  2532. goto cleanup;
  2533. }
  2534. error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
  2535. if (error == DB_NOTFOUND) {
  2536. ret_val = true;
  2537. }
  2538. else {
  2539. ret_val = false;
  2540. }
  2541. error = 0;
  2542. cleanup:
  2543. if (tmp_cursor) {
  2544. int r = tmp_cursor->c_close(tmp_cursor);
  2545. assert(r==0);
  2546. tmp_cursor = NULL;
  2547. }
  2548. if (txn) {
  2549. commit_txn(txn, 0);
  2550. txn = NULL;
  2551. }
  2552. return ret_val;
  2553. }
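//
// Usage note: may_table_be_empty only probes for a first row at this
// instant inside its own transaction; it is used purely to decide whether
// the table lock / bulk loader optimization in start_bulk_insert below is
// worth attempting, not as a correctness check.
//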
  2554. void ha_tokudb::start_bulk_insert(ha_rows rows) {
  2555. TOKUDB_DBUG_ENTER("ha_tokudb::start_bulk_insert");
  2556. THD* thd = ha_thd();
  2557. delay_updating_ai_metadata = true;
  2558. ai_metadata_update_required = false;
  2559. abort_loader = false;
  2560. if (share->try_table_lock) {
  2561. if (get_prelock_empty(thd) && may_table_be_empty()) {
  2562. if (using_ignore || get_load_save_space(thd)) {
  2563. acquire_table_lock(transaction, lock_write);
  2564. }
  2565. else {
  2566. u_int32_t mult_put_flags[MAX_KEY + 1] = {DB_YESOVERWRITE};
  2567. u_int32_t mult_dbt_flags[MAX_KEY + 1] = {DB_DBT_REALLOC};
  2568. uint curr_num_DBs = table->s->keys + test(hidden_primary_key);
  2569. mult_dbt_flags[primary_key] = 0;
  2570. if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
  2571. mult_put_flags[primary_key] = DB_NOOVERWRITE;
  2572. }
  2573. int error = db_env->create_loader(
  2574. db_env,
  2575. transaction,
  2576. &loader,
  2577. NULL, // no src_db needed
  2578. curr_num_DBs,
  2579. share->key_file,
  2580. mult_put_flags,
  2581. mult_dbt_flags,
  2582. 0
  2583. );
  2584. if (error) {
  2585. assert(loader == NULL);
  2586. goto exit_try_table_lock;
  2587. }
  2588. lc.thd = thd;
  2589. lc.ha = this;
  2590. error = loader->set_poll_function(loader, poll_fun, &lc);
  2591. assert(!error);
  2592. error = loader->set_error_callback(loader, loader_dup_fun, &lc);
  2593. assert(!error);
  2594. }
  2595. }
  2596. exit_try_table_lock:
  2597. pthread_mutex_lock(&share->mutex);
  2598. share->try_table_lock = false;
  2599. pthread_mutex_unlock(&share->mutex);
  2600. }
  2601. DBUG_VOID_RETURN;
  2602. }
  2603. //
  2604. // Method that is called at the end of many calls to insert rows
  2605. // (ha_tokudb::write_row). If start_bulk_insert is called, then
  2606. // this is guaranteed to be called.
  2607. //
  2608. int ha_tokudb::end_bulk_insert() {
  2609. TOKUDB_DBUG_ENTER("ha_tokudb::end_bulk_insert");
  2610. int error = 0;
  2611. if (ai_metadata_update_required) {
  2612. pthread_mutex_lock(&share->mutex);
  2613. error = update_max_auto_inc(share->status_block, share->last_auto_increment);
  2614. pthread_mutex_unlock(&share->mutex);
  2615. if (error) { goto cleanup; }
  2616. }
  2617. delay_updating_ai_metadata = false;
  2618. ai_metadata_update_required = false;
  2619. loader_error = 0;
  2620. if (loader) {
  2621. if (!abort_loader) {
  2622. error = loader->close(loader);
  2623. loader = NULL;
  2624. if (error) { goto cleanup; }
  2625. for (uint i = 0; i < table_share->keys; i++) {
  2626. if (table_share->key_info[i].flags & HA_NOSAME) {
  2627. bool is_unique;
  2628. if (i == primary_key) {
  2629. continue;
  2630. }
  2631. error = is_index_unique(
  2632. &is_unique,
  2633. transaction,
  2634. share->key_file[i],
  2635. &table->key_info[i]
  2636. );
  2637. if (error) goto cleanup;
  2638. if (!is_unique) {
  2639. error = HA_ERR_FOUND_DUPP_KEY;
  2640. last_dup_key = i;
  2641. goto cleanup;
  2642. }
  2643. }
  2644. }
  2645. }
  2646. else {
  2647. loader->abort(loader);
  2648. loader = NULL;
  2649. }
  2650. }
  2651. cleanup:
  2652. if (loader) {
  2653. loader->abort(loader);
  2654. loader = NULL;
  2655. }
  2656. abort_loader = false;
  2657. bzero(&lc,sizeof(lc));
  2658. if (error || loader_error) {
  2659. my_errno = error ? error : loader_error;
  2660. }
  2661. TOKUDB_DBUG_RETURN(error ? error : loader_error);
  2662. }
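//
// Loader lifecycle note: rows handed to loader->put in write_row are not
// visible until loader->close succeeds here; on any failure the loader is
// aborted instead, which discards everything it buffered, and unique
// secondary indexes are re-verified with is_index_unique after a
// successful close.
//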
  2663. int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info) {
  2664. int error;
  2665. DBC* tmp_cursor1 = NULL;
  2666. DBC* tmp_cursor2 = NULL;
  2667. DBT key1, key2, val, packed_key1, packed_key2;
  2668. bzero(&key1, sizeof(key1));
  2669. bzero(&key2, sizeof(key2));
  2670. bzero(&val, sizeof(val));
  2671. bzero(&packed_key1, sizeof(packed_key1));
  2672. bzero(&packed_key2, sizeof(packed_key2));
  2673. *is_unique = true;
  2674. error = db->cursor(
  2675. db,
  2676. txn,
  2677. &tmp_cursor1,
  2678. 0
  2679. );
  2680. if (error) { goto cleanup; }
  2681. error = db->cursor(
  2682. db,
  2683. txn,
  2684. &tmp_cursor2,
  2685. 0
  2686. );
  2687. if (error) { goto cleanup; }
  2688. error = tmp_cursor1->c_get(
  2689. tmp_cursor1,
  2690. &key1,
  2691. &val,
  2692. DB_NEXT
  2693. );
  2694. if (error == DB_NOTFOUND) {
  2695. *is_unique = true;
  2696. error = 0;
  2697. goto cleanup;
  2698. }
  2699. else if (error) { goto cleanup; }
  2700. error = tmp_cursor2->c_get(
  2701. tmp_cursor2,
  2702. &key2,
  2703. &val,
  2704. DB_NEXT
  2705. );
  2706. if (error) { goto cleanup; }
  2707. error = tmp_cursor2->c_get(
  2708. tmp_cursor2,
  2709. &key2,
  2710. &val,
  2711. DB_NEXT
  2712. );
  2713. if (error == DB_NOTFOUND) {
  2714. *is_unique = true;
  2715. error = 0;
  2716. goto cleanup;
  2717. }
  2718. else if (error) { goto cleanup; }
  2719. while (error != DB_NOTFOUND) {
  2720. bool has_null1;
  2721. bool has_null2;
  2722. int cmp;
  2723. place_key_into_mysql_buff(
  2724. key_info,
  2725. table->record[0],
  2726. (uchar *) key1.data + 1
  2727. );
  2728. place_key_into_mysql_buff(
  2729. key_info,
  2730. table->record[1],
  2731. (uchar *) key2.data + 1
  2732. );
  2733. create_dbt_key_for_lookup(
  2734. &packed_key1,
  2735. key_info,
  2736. key_buff,
  2737. table->record[0],
  2738. &has_null1
  2739. );
  2740. create_dbt_key_for_lookup(
  2741. &packed_key2,
  2742. key_info,
  2743. key_buff2,
  2744. table->record[1],
  2745. &has_null2
  2746. );
  2747. if (!has_null1 && !has_null2) {
  2748. cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
  2749. if (cmp == 0) {
  2750. memcpy(key_buff, key1.data, key1.size);
  2751. place_key_into_mysql_buff(
  2752. key_info,
  2753. table->record[0],
  2754. (uchar *) key_buff + 1
  2755. );
  2756. *is_unique = false;
  2757. break;
  2758. }
  2759. }
  2760. error = tmp_cursor1->c_get(
  2761. tmp_cursor1,
  2762. &key1,
  2763. &val,
  2764. DB_NEXT
  2765. );
  2766. if (error) { goto cleanup; }
  2767. error = tmp_cursor2->c_get(
  2768. tmp_cursor2,
  2769. &key2,
  2770. &val,
  2771. DB_NEXT
  2772. );
  2773. if (error && (error != DB_NOTFOUND)) { goto cleanup; }
  2774. }
  2775. error = 0;
  2776. cleanup:
  2777. if (tmp_cursor1) {
  2778. tmp_cursor1->c_close(tmp_cursor1);
  2779. tmp_cursor1 = NULL;
  2780. }
  2781. if (tmp_cursor2) {
  2782. tmp_cursor2->c_close(tmp_cursor2);
  2783. tmp_cursor2 = NULL;
  2784. }
  2785. return error;
  2786. }
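//
// Illustration of the scan above: tmp_cursor2 is advanced one extra time so
// that it stays one row ahead of tmp_cursor1, and each iteration compares an
// adjacent key pair:
//
//   tmp_cursor1: k1 k2 k3 ...
//   tmp_cursor2: k2 k3 k4 ...
//
// A prefix-equal adjacent pair with no NULL columns means the index is not
// unique; because the dictionary is sorted, comparing adjacent keys is
// sufficient.
//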
  2787. int ha_tokudb::is_val_unique(bool* is_unique, uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
  2788. DBT key;
  2789. int error = 0;
  2790. bool has_null;
  2791. DBC* tmp_cursor = NULL;
  2792. struct index_read_info ir_info;
  2793. struct smart_dbt_info info;
  2794. bzero((void *)&key, sizeof(key));
  2795. info.ha = this;
  2796. info.buf = NULL;
  2797. info.keynr = dict_index;
  2798. ir_info.smart_dbt_info = info;
  2799. create_dbt_key_for_lookup(
  2800. &key,
  2801. key_info,
  2802. key_buff3,
  2803. record,
  2804. &has_null
  2805. );
  2806. ir_info.orig_key = &key;
  2807. if (has_null) {
  2808. error = 0;
  2809. *is_unique = true;
  2810. goto cleanup;
  2811. }
  2812. error = share->key_file[dict_index]->cursor(
  2813. share->key_file[dict_index],
  2814. txn,
  2815. &tmp_cursor,
  2816. 0
  2817. );
  2818. if (error) { goto cleanup; }
  2819. error = tmp_cursor->c_getf_set_range(
  2820. tmp_cursor,
  2821. 0,
  2822. &key,
  2823. smart_dbt_callback_lookup,
  2824. &ir_info
  2825. );
  2826. if (error == DB_NOTFOUND) {
  2827. *is_unique = true;
  2828. error = 0;
  2829. goto cleanup;
  2830. }
  2831. else if (error) {
  2832. goto cleanup;
  2833. }
  2834. if (ir_info.cmp) {
  2835. *is_unique = true;
  2836. }
  2837. else {
  2838. *is_unique = false;
  2839. }
  2840. error = 0;
  2841. cleanup:
  2842. if (tmp_cursor) {
  2843. int r = tmp_cursor->c_close(tmp_cursor);
  2844. assert(r==0);
  2845. tmp_cursor = NULL;
  2846. }
  2847. return error;
  2848. }
  2849. int ha_tokudb::do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd) {
  2850. int error;
  2851. //
  2852. // first do uniqueness checks
  2853. //
  2854. if (share->has_unique_keys && !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS)) {
  2855. for (uint keynr = 0; keynr < table_share->keys; keynr++) {
  2856. bool is_unique_key = table->key_info[keynr].flags & HA_NOSAME;
  2857. bool is_unique = false;
  2858. //
  2859. // don't need to do check for primary key
  2860. //
  2861. if (keynr == primary_key) {
  2862. continue;
  2863. }
  2864. if (!is_unique_key) {
  2865. continue;
  2866. }
  2867. //
  2868. // if unique key, check uniqueness constraint
  2869. // but, we do not need to check it if the key has a null
  2870. // and we do not need to check it if unique_checks is off
  2871. //
  2872. error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
  2873. if (error) { goto cleanup; }
  2874. if (!is_unique) {
  2875. error = DB_KEYEXIST;
  2876. last_dup_key = keynr;
  2877. goto cleanup;
  2878. }
  2879. }
  2880. }
  2881. error = 0;
  2882. cleanup:
  2883. return error;
  2884. }
  2885. int ha_tokudb::test_row_packing(uchar* record, DBT* pk_key, DBT* pk_val) {
  2886. int error;
  2887. DBT row, key;
  2888. //
  2889. // variables for testing key packing, only used in some debug modes
  2890. //
  2891. uchar* tmp_pk_key_data = NULL;
  2892. uchar* tmp_pk_val_data = NULL;
  2893. DBT tmp_pk_key;
  2894. DBT tmp_pk_val;
  2895. bool has_null;
  2896. bzero(&tmp_pk_key, sizeof(DBT));
  2897. bzero(&tmp_pk_val, sizeof(DBT));
  2898. //
// used for testing the packing of keys
  2900. //
  2901. tmp_pk_key_data = (uchar *)my_malloc(pk_key->size, MYF(MY_WME));
  2902. assert(tmp_pk_key_data);
  2903. tmp_pk_val_data = (uchar *)my_malloc(pk_val->size, MYF(MY_WME));
  2904. assert(tmp_pk_val_data);
  2905. memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
  2906. memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
  2907. tmp_pk_key.data = tmp_pk_key_data;
  2908. tmp_pk_key.size = pk_key->size;
  2909. tmp_pk_val.data = tmp_pk_val_data;
  2910. tmp_pk_val.size = pk_val->size;
  2911. for (uint keynr = 0; keynr < table_share->keys; keynr++) {
  2912. u_int32_t tmp_num_bytes = 0;
  2913. int cmp;
  2914. uchar* row_desc = NULL;
  2915. u_int32_t desc_size = 0;
  2916. if (keynr == primary_key) {
  2917. continue;
  2918. }
  2919. create_dbt_key_from_table(&key, keynr, mult_key_buff[keynr], record, &has_null);
  2920. //
  2921. // TEST
  2922. //
  2923. row_desc = (uchar *)share->key_file[keynr]->descriptor->data;
  2924. row_desc += (*(u_int32_t *)row_desc);
  2925. desc_size = (*(u_int32_t *)row_desc) - 4;
  2926. row_desc += 4;
  2927. tmp_num_bytes = pack_key_from_desc(
  2928. key_buff3,
  2929. row_desc,
  2930. desc_size,
  2931. &tmp_pk_key,
  2932. &tmp_pk_val
  2933. );
  2934. assert(tmp_num_bytes == key.size);
  2935. cmp = memcmp(key_buff3,mult_key_buff[keynr],tmp_num_bytes);
  2936. assert(cmp == 0);
  2937. //
  2938. // test key packing of clustering keys
  2939. //
  2940. if (table->key_info[keynr].flags & HA_CLUSTERING) {
  2941. error = pack_row(&row, (const uchar *) record, keynr);
  2942. if (error) { goto cleanup; }
  2943. uchar* tmp_buff = NULL;
  2944. tmp_buff = (uchar *)my_malloc(alloced_rec_buff_length,MYF(MY_WME));
  2945. assert(tmp_buff);
  2946. row_desc = (uchar *)share->key_file[keynr]->descriptor->data;
  2947. row_desc += (*(u_int32_t *)row_desc);
  2948. row_desc += (*(u_int32_t *)row_desc);
  2949. desc_size = (*(u_int32_t *)row_desc) - 4;
  2950. row_desc += 4;
  2951. tmp_num_bytes = pack_clustering_val_from_desc(
  2952. tmp_buff,
  2953. row_desc,
  2954. desc_size,
  2955. &tmp_pk_val
  2956. );
  2957. assert(tmp_num_bytes == row.size);
  2958. cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
  2959. assert(cmp == 0);
  2960. my_free(tmp_buff,MYF(MY_ALLOW_ZERO_PTR));
  2961. }
  2962. }
  2963. error = 0;
  2964. cleanup:
  2965. my_free(tmp_pk_key_data,MYF(MY_ALLOW_ZERO_PTR));
  2966. my_free(tmp_pk_val_data,MYF(MY_ALLOW_ZERO_PTR));
  2967. return error;
  2968. }
  2969. //
  2970. // set the put flags for the main dictionary
  2971. //
  2972. void ha_tokudb::set_main_dict_put_flags(THD* thd, u_int32_t* put_flags) {
  2973. //
// optimization for the "REPLACE INTO..." (and "INSERT IGNORE") commands:
// if the command is "REPLACE INTO" and the only dictionary
// is the main one (or all indexes are a subset of the pk),
// then we can simply insert the element
// with DB_YESOVERWRITE. If the element does not exist,
// it will act as a normal insert, and if it does exist, it
// will act as a replace, which is exactly what REPLACE INTO is supposed
// to do. We cannot do this otherwise, because we would lose
// consistency between indexes
  2983. //
  2984. if (hidden_primary_key){
  2985. *put_flags = DB_YESOVERWRITE;
  2986. }
  2987. else if (thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) &&
  2988. !is_replace_into(thd) &&
  2989. !is_insert_ignore(thd)
  2990. )
  2991. {
  2992. *put_flags = DB_YESOVERWRITE;
  2993. }
  2994. else if (do_ignore_flag_optimization(thd,table,share->replace_into_fast) &&
  2995. is_replace_into(thd)
  2996. )
  2997. {
  2998. *put_flags = DB_YESOVERWRITE;
  2999. }
  3000. else if (do_ignore_flag_optimization(thd,table,share->replace_into_fast) &&
  3001. is_insert_ignore(thd)
  3002. )
  3003. {
  3004. *put_flags = DB_NOOVERWRITE_NO_ERROR;
  3005. }
  3006. else
  3007. {
  3008. *put_flags = DB_NOOVERWRITE;
  3009. }
  3010. }
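//
// Summary of the decision above, in evaluation order:
//
//   condition                                                put flag
//   -------------------------------------------------------  -----------------------
//   hidden primary key                                       DB_YESOVERWRITE
//   relaxed unique checks, not REPLACE INTO/INSERT IGNORE    DB_YESOVERWRITE
//   REPLACE INTO and ignore-flag optimization applies        DB_YESOVERWRITE
//   INSERT IGNORE and ignore-flag optimization applies       DB_NOOVERWRITE_NO_ERROR
//   otherwise                                                DB_NOOVERWRITE
//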
  3011. int ha_tokudb::insert_row_to_main_dictionary(uchar* record, DBT* pk_key, DBT* pk_val, DB_TXN* txn) {
  3012. int error = 0;
  3013. u_int32_t put_flags = 0;
  3014. THD *thd = ha_thd();
  3015. uint curr_num_DBs = table->s->keys + test(hidden_primary_key);
  3016. ulonglong wait_lock_time = get_write_lock_wait_time(thd);
  3017. assert(curr_num_DBs == 1);
  3018. set_main_dict_put_flags(thd,&put_flags);
  3019. lockretryN(wait_lock_time){
  3020. error = share->file->put(
  3021. share->file,
  3022. txn,
  3023. pk_key,
  3024. pk_val,
  3025. put_flags
  3026. );
  3027. lockretry_wait;
  3028. }
  3029. if (error) {
  3030. last_dup_key = primary_key;
  3031. goto cleanup;
  3032. }
  3033. cleanup:
  3034. return error;
  3035. }
  3036. int ha_tokudb::insert_rows_to_dictionaries_mult(DBT* pk_key, DBT* pk_val, DB_TXN* txn, THD* thd) {
  3037. int error = 0;
  3038. bool is_replace = is_replace_into(thd);
  3039. uint curr_num_DBs = table->s->keys + test(hidden_primary_key);
  3040. ulonglong wait_lock_time = get_write_lock_wait_time(thd);
  3041. u_int32_t mult_put_flags[MAX_KEY + 1] = {DB_YESOVERWRITE};
  3042. set_main_dict_put_flags(thd, &mult_put_flags[primary_key]);
  3043. if (mult_put_flags[primary_key] == DB_NOOVERWRITE_NO_ERROR) {
  3044. //
  3045. //hopefully temporary, right now, put_multiple does not
  3046. // support use of DB_NOOVERWRITE_NO_ERROR as put_flag
  3047. //
  3048. mult_put_flags[primary_key] = DB_NOOVERWRITE;
  3049. }
  3050. lockretryN(wait_lock_time){
  3051. error = db_env->put_multiple(
  3052. db_env,
  3053. NULL,
  3054. txn,
  3055. pk_key,
  3056. pk_val,
  3057. curr_num_DBs,
  3058. share->key_file,
  3059. mult_key_dbt,
  3060. mult_rec_dbt,
  3061. mult_put_flags,
  3062. NULL
  3063. );
  3064. lockretry_wait;
  3065. }
  3066. //
// If the multi-put failed, report any duplicate-key error against the
// primary key; MySQL may have told us to ignore duplicate key errors
  3069. //
  3070. if (error) {
  3071. last_dup_key = primary_key;
  3072. }
  3073. return error;
  3074. }
  3075. //
  3076. // Stores a row in the table, called when handling an INSERT query
  3077. // Parameters:
  3078. // [in] record - a row in MySQL format
  3079. // Returns:
  3080. // 0 on success
  3081. // error otherwise
  3082. //
  3083. int ha_tokudb::write_row(uchar * record) {
  3084. TOKUDB_DBUG_ENTER("ha_tokudb::write_row");
  3085. DBT row, prim_key;
  3086. int error;
  3087. THD *thd = ha_thd();
  3088. bool has_null;
  3089. DB_TXN* sub_trans = NULL;
  3090. DB_TXN* txn = NULL;
  3091. tokudb_trx_data *trx = NULL;
  3092. uint curr_num_DBs = table->s->keys + test(hidden_primary_key);
  3093. bool create_sub_trans = false;
  3094. //
  3095. // some crap that needs to be done because MySQL does not properly abstract
  3096. // this work away from us, namely filling in auto increment and setting auto timestamp
  3097. //
  3098. statistic_increment(table->in_use->status_var.ha_write_count, &LOCK_status);
  3099. if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) {
  3100. table->timestamp_field->set_time();
  3101. }
  3102. if (table->next_number_field && record == table->record[0]) {
  3103. update_auto_increment();
  3104. }
  3105. //
  3106. // check to see if some value for the auto increment column that is bigger
  3107. // than anything else til now is being used. If so, update the metadata to reflect it
  3108. // the goal here is we never want to have a dup key error due to a bad increment
  3109. // of the auto inc field.
  3110. //
  3111. if (share->has_auto_inc && record == table->record[0]) {
  3112. pthread_mutex_lock(&share->mutex);
  3113. ulonglong curr_auto_inc = retrieve_auto_increment(
  3114. table->field[share->ai_field_index]->key_type(),
  3115. field_offset(table->field[share->ai_field_index], table),
  3116. record
  3117. );
  3118. if (curr_auto_inc > share->last_auto_increment) {
  3119. share->last_auto_increment = curr_auto_inc;
  3120. if (delay_updating_ai_metadata) {
  3121. ai_metadata_update_required = true;
  3122. }
  3123. else {
  3124. update_max_auto_inc(share->status_block, share->last_auto_increment);
  3125. }
  3126. }
  3127. pthread_mutex_unlock(&share->mutex);
  3128. }
  3129. if (hidden_primary_key) {
  3130. get_auto_primary_key(current_ident);
  3131. }
  3132. if (table_share->blob_fields) {
  3133. if (fix_rec_buff_for_blob(max_row_length(record))) {
  3134. error = HA_ERR_OUT_OF_MEM;
  3135. goto cleanup;
  3136. }
  3137. }
  3138. create_dbt_key_from_table(&prim_key, primary_key, primary_key_buff, record, &has_null);
  3139. if ((error = pack_row(&row, (const uchar *) record, primary_key))){
  3140. goto cleanup;
  3141. }
  3142. create_sub_trans = (using_ignore && !(do_ignore_flag_optimization(thd,table,share->replace_into_fast)));
  3143. if (create_sub_trans) {
  3144. error = db_env->txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION);
  3145. if (error) {
  3146. goto cleanup;
  3147. }
  3148. }
  3149. txn = create_sub_trans ? sub_trans : transaction;
  3150. //
  3151. // make sure the buffers for the rows are big enough
  3152. //
  3153. fix_mult_rec_buff();
  3154. if (tokudb_debug & TOKUDB_DEBUG_CHECK_KEY) {
  3155. error = test_row_packing(record,&prim_key,&row);
  3156. if (error) { goto cleanup; }
  3157. }
  3158. if (loader) {
  3159. error = loader->put(loader, &prim_key, &row);
  3160. if (error) {
  3161. abort_loader = true;
  3162. goto cleanup;
  3163. }
  3164. }
  3165. else {
  3166. if (curr_num_DBs == 1) {
  3167. error = insert_row_to_main_dictionary(record,&prim_key, &row, txn);
  3168. if (error) { goto cleanup; }
  3169. }
  3170. else {
  3171. error = do_uniqueness_checks(record, txn, thd);
  3172. if (error) { goto cleanup; }
  3173. error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
  3174. if (error) { goto cleanup; }
  3175. }
  3176. }
  3177. trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  3178. if (!error) {
  3179. added_rows++;
  3180. trx->stmt_progress.inserted++;
  3181. track_progress(thd);
  3182. }
  3183. cleanup:
  3184. if (error == DB_KEYEXIST) {
  3185. error = HA_ERR_FOUND_DUPP_KEY;
  3186. }
  3187. if (sub_trans) {
  3188. // no point in recording error value of abort.
  3189. // nothing we can do about it anyway and it is not what
  3190. // we want to return.
  3191. if (error) {
  3192. abort_txn(sub_trans);
  3193. }
  3194. else {
  3195. commit_txn(sub_trans, DB_TXN_NOSYNC);
  3196. }
  3197. }
  3198. TOKUDB_DBUG_RETURN(error);
  3199. }
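//
// Flow summary of write_row above: update auto-increment bookkeeping, pack
// the primary key and row, then take exactly one of three paths: feed the
// bulk loader, do a single put into the main dictionary (pk-only tables),
// or run uniqueness checks followed by put_multiple across all dictionaries.
// Duplicate-key errors surface as HA_ERR_FOUND_DUPP_KEY with last_dup_key set.
//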
  3200. /* Compare if a key in a row has changed */
  3201. int ha_tokudb::key_cmp(uint keynr, const uchar * old_row, const uchar * new_row) {
  3202. KEY_PART_INFO *key_part = table->key_info[keynr].key_part;
  3203. KEY_PART_INFO *end = key_part + table->key_info[keynr].key_parts;
  3204. for (; key_part != end; key_part++) {
  3205. if (key_part->null_bit) {
  3206. if ((old_row[key_part->null_offset] & key_part->null_bit) != (new_row[key_part->null_offset] & key_part->null_bit))
  3207. return 1;
  3208. }
  3209. if (key_part->key_part_flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) {
  3210. if (key_part->field->cmp_binary((uchar *) (old_row + key_part->offset), (uchar *) (new_row + key_part->offset), (ulong) key_part->length))
  3211. return 1;
  3212. } else {
  3213. if (memcmp(old_row + key_part->offset, new_row + key_part->offset, key_part->length))
  3214. return 1;
  3215. }
  3216. }
  3217. return 0;
  3218. }
  3219. //
  3220. // Updates a row in the table, called when handling an UPDATE query
  3221. // Parameters:
  3222. // [in] old_row - row to be updated, in MySQL format
  3223. // [in] new_row - new row, in MySQL format
  3224. // Returns:
  3225. // 0 on success
  3226. // error otherwise
  3227. //
  3228. int ha_tokudb::update_row(const uchar * old_row, uchar * new_row) {
  3229. TOKUDB_DBUG_ENTER("update_row");
  3230. DBT prim_key, key, old_prim_key, row, prim_row;
  3231. int error;
  3232. bool primary_key_changed;
  3233. bool has_null;
  3234. THD* thd = ha_thd();
  3235. DB_TXN* sub_trans = NULL;
  3236. DB_TXN* txn = NULL;
  3237. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  3238. u_int32_t mult_put_flags[MAX_KEY + 1] = {DB_YESOVERWRITE};
  3239. DB* dbs[MAX_KEY + 1];
  3240. DBT key_dbts[MAX_KEY + 1];
  3241. DBT rec_dbts[MAX_KEY + 1];
  3242. u_int32_t curr_db_index;
  3243. ulonglong wait_lock_time = get_write_lock_wait_time(thd);
  3244. LINT_INIT(error);
  3245. bzero((void *) &row, sizeof(row));
  3246. bzero((void *) &prim_key, sizeof(prim_key));
  3247. bzero((void *) &old_prim_key, sizeof(old_prim_key));
  3248. bzero((void *) &prim_row, sizeof(prim_row));
  3249. bzero((void *) &key, sizeof(key));
bzero((void *) &key_dbts, sizeof(key_dbts));
bzero((void *) &rec_dbts, sizeof(rec_dbts));
  3252. statistic_increment(table->in_use->status_var.ha_update_count, &LOCK_status);
  3253. if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) {
  3254. table->timestamp_field->set_time();
  3255. }
  3256. //
  3257. // check to see if some value for the auto increment column that is bigger
  3258. // than anything else til now is being used. If so, update the metadata to reflect it
  3259. // the goal here is we never want to have a dup key error due to a bad increment
  3260. // of the auto inc field.
  3261. //
  3262. if (share->has_auto_inc && new_row == table->record[0]) {
  3263. pthread_mutex_lock(&share->mutex);
  3264. ulonglong curr_auto_inc = retrieve_auto_increment(
  3265. table->field[share->ai_field_index]->key_type(),
  3266. field_offset(table->field[share->ai_field_index], table),
  3267. new_row
  3268. );
  3269. if (curr_auto_inc > share->last_auto_increment) {
  3270. error = update_max_auto_inc(share->status_block, curr_auto_inc);
  3271. if (!error) {
  3272. share->last_auto_increment = curr_auto_inc;
  3273. }
  3274. }
  3275. pthread_mutex_unlock(&share->mutex);
  3276. }
  3277. if (using_ignore) {
  3278. error = db_env->txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION);
  3279. if (error) {
  3280. goto cleanup;
  3281. }
  3282. }
  3283. txn = using_ignore ? sub_trans : transaction;
  3284. if (hidden_primary_key) {
  3285. primary_key_changed = 0;
  3286. bzero((void *) &prim_key, sizeof(prim_key));
  3287. prim_key.data = (void *) current_ident;
  3288. prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
  3289. old_prim_key = prim_key;
  3290. }
  3291. else {
  3292. create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
  3293. if ((primary_key_changed = key_cmp(primary_key, old_row, new_row))) {
  3294. create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
  3295. }
  3296. else {
  3297. old_prim_key = prim_key;
  3298. }
  3299. }
  3300. if (primary_key_changed) {
// Primary key changed.
// Delete the old row here; the new one is added in the multi-put below
  3303. error = remove_key(txn, primary_key, old_row, &old_prim_key);
  3304. if (error) { goto cleanup; }
  3305. }
  3306. error = pack_row(&prim_row, new_row, primary_key);
  3307. if (error) { goto cleanup; }
  3308. dbs[0] = share->key_file[primary_key];
  3309. key_dbts[0] = prim_key;
  3310. rec_dbts[0] = prim_row;
  3311. mult_put_flags[0] = primary_key_changed ? DB_NOOVERWRITE : DB_YESOVERWRITE;
  3312. curr_db_index = 1;
  3313. // Update all other keys
  3314. for (uint keynr = 0; keynr < table_share->keys; keynr++) {
  3315. bool secondary_key_changed = key_cmp(keynr, old_row, new_row);
  3316. if (keynr == primary_key) {
  3317. continue;
  3318. }
  3319. if (table->key_info[keynr].flags & HA_CLUSTERING ||
  3320. secondary_key_changed ||
  3321. primary_key_changed
  3322. )
  3323. {
  3324. bool is_unique_key = table->key_info[keynr].flags & HA_NOSAME;
  3325. //
  3326. // only remove the old value if the key has changed
  3327. // if the key has not changed (in case of clustering keys,
  3328. // then we overwrite the old value)
  3329. //
  3330. if (secondary_key_changed || primary_key_changed) {
  3331. error = remove_key(txn, keynr, old_row, &old_prim_key);
  3332. if (error) {
  3333. goto cleanup;
  3334. }
  3335. }
  3336. //
  3337. // if unique key, check uniqueness constraint
  3338. // but, we do not need to check it if the key has a null
  3339. // and we do not need to check it if unique_checks is off
  3340. //
  3341. if (is_unique_key && !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS)) {
  3342. bool is_unique = false;
  3343. error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
  3344. if (error) { goto cleanup; }
  3345. if (!is_unique) {
  3346. error = DB_KEYEXIST;
  3347. last_dup_key = keynr;
  3348. goto cleanup;
  3349. }
  3350. }
  3351. dbs[curr_db_index] = share->key_file[keynr];
  3352. key_dbts[curr_db_index] = mult_key_dbt[keynr];
  3353. rec_dbts[curr_db_index] = mult_rec_dbt[keynr];
  3354. curr_db_index++;
  3355. }
  3356. }
  3357. lockretryN(wait_lock_time){
  3358. error = db_env->put_multiple(
  3359. db_env,
  3360. NULL,
  3361. txn,
  3362. &prim_key,
  3363. &prim_row,
  3364. curr_db_index,
  3365. dbs,
  3366. key_dbts,
  3367. rec_dbts,
  3368. mult_put_flags,
  3369. NULL
  3370. );
  3371. lockretry_wait;
  3372. }
  3373. if (error == DB_KEYEXIST) {
  3374. last_dup_key = primary_key;
  3375. }
  3376. else if (!error) {
  3377. trx->stmt_progress.updated++;
  3378. track_progress(thd);
  3379. }
  3380. cleanup:
  3381. if (error == DB_KEYEXIST) {
  3382. error = HA_ERR_FOUND_DUPP_KEY;
  3383. }
  3384. if (sub_trans) {
  3385. // no point in recording error value of abort.
  3386. // nothing we can do about it anyway and it is not what
  3387. // we want to return.
  3388. if (error) {
  3389. abort_txn(sub_trans);
  3390. }
  3391. else {
  3392. commit_txn(sub_trans, DB_TXN_NOSYNC);
  3393. }
  3394. }
  3395. TOKUDB_DBUG_RETURN(error);
  3396. }
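//
// Illustrative sketch (editorial note, not part of the original source):
// how the arrays handed to put_multiple above line up for a table with one
// secondary index (sec_keynr is a hypothetical index number). Slot 0 always
// carries the primary dictionary; later slots carry the secondaries that
// need updating:
//
//   dbs[0] = share->key_file[primary_key]; key_dbts[0] = prim_key; rec_dbts[0] = prim_row;
//   dbs[1] = share->key_file[sec_keynr];   key_dbts[1] = mult_key_dbt[sec_keynr];
//                                          rec_dbts[1] = mult_rec_dbt[sec_keynr];
//
// mult_put_flags[0] is DB_NOOVERWRITE only when the primary key changed, so a
// collision with an existing row surfaces as DB_KEYEXIST and becomes
// HA_ERR_FOUND_DUPP_KEY in the cleanup block.
//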
  3397. //
  3398. //
  3399. // Delete one key in key_file[keynr]
3400. // This uses key_buff2 when keynr != primary_key, so it is important that
3401. // the caller does not use this buffer for anything else.
  3402. // Parameters:
  3403. // [in] trans - transaction to be used for the delete
  3404. // keynr - index for which a key needs to be deleted
  3405. // [in] record - row in MySQL format. Must delete a key for this row
  3406. // [in] prim_key - key for record in primary table
  3407. // Returns:
  3408. // 0 on success
  3409. // error otherwise
  3410. //
  3411. int ha_tokudb::remove_key(DB_TXN * trans, uint keynr, const uchar * record, DBT * prim_key) {
  3412. TOKUDB_DBUG_ENTER("ha_tokudb::remove_key");
  3413. int error = 0;
  3414. DBT key;
  3415. bool has_null;
  3416. ulonglong wait_lock_time = get_write_lock_wait_time(ha_thd());
  3417. DBUG_PRINT("enter", ("index: %d", keynr));
  3418. DBUG_PRINT("primary", ("index: %d", primary_key));
  3419. DBUG_DUMP("prim_key", (uchar *) prim_key->data, prim_key->size);
  3420. if (keynr == primary_key) { // Unique key
  3421. DBUG_PRINT("Primary key", ("index: %d", keynr));
  3422. lockretryN(wait_lock_time){
  3423. error = share->key_file[keynr]->del(share->key_file[keynr], trans, prim_key , DB_DELETE_ANY);
  3424. lockretry_wait;
  3425. }
  3426. }
  3427. else {
  3428. DBUG_PRINT("Secondary key", ("index: %d", keynr));
  3429. create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);
  3430. lockretryN(wait_lock_time){
  3431. error = share->key_file[keynr]->del(share->key_file[keynr], trans, &key , DB_DELETE_ANY);
  3432. lockretry_wait;
  3433. }
  3434. }
  3435. TOKUDB_DBUG_RETURN(error);
  3436. }
  3437. //
3438. // Delete all keys for record
  3439. // Parameters:
  3440. // [in] trans - transaction to be used for the delete
  3441. // [in] record - row in MySQL format. Must delete all keys for this row
  3442. // [in] prim_key - key for record in primary table
  3445. // Returns:
  3446. // 0 on success
  3447. // error otherwise
  3448. //
  3449. int ha_tokudb::remove_keys(DB_TXN * trans, const uchar * record, DBT * prim_key) {
  3450. int result = 0;
  3451. for (uint keynr = 0; keynr < table_share->keys + test(hidden_primary_key); keynr++) {
  3452. int new_error = remove_key(trans, keynr, record, prim_key);
  3453. if (new_error) {
  3454. result = new_error; // Return last error
  3455. break; // Let rollback correct things
  3456. }
  3457. }
  3458. return result;
  3459. }
  3460. //
  3461. // Deletes a row in the table, called when handling a DELETE query
  3462. // Parameters:
  3463. // [in] record - row to be deleted, in MySQL format
  3464. // Returns:
  3465. // 0 on success
  3466. // error otherwise
  3467. //
  3468. int ha_tokudb::delete_row(const uchar * record) {
  3469. TOKUDB_DBUG_ENTER("ha_tokudb::delete_row");
  3470. int error = ENOSYS;
  3471. DBT prim_key;
  3473. bool has_null;
  3474. THD* thd = ha_thd();
3475. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  3476. statistic_increment(table->in_use->status_var.ha_delete_count, &LOCK_status);
  3477. create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
3481. error = remove_keys(transaction, record, &prim_key);
  3485. if (error) {
  3486. DBUG_PRINT("error", ("Got error %d", error));
  3487. }
  3488. else {
  3489. deleted_rows++;
  3490. trx->stmt_progress.deleted++;
  3491. track_progress(thd);
  3492. }
  3493. TOKUDB_DBUG_RETURN(error);
  3494. }
  3495. //
  3496. // takes as input table->read_set and table->write_set
  3497. // and puts list of field indexes that need to be read in
  3498. // unpack_row in the member variables fixed_cols_for_query
  3499. // and var_cols_for_query
  3500. //
  3501. void ha_tokudb::set_query_columns(uint keynr) {
  3502. u_int32_t curr_fixed_col_index = 0;
  3503. u_int32_t curr_var_col_index = 0;
  3504. read_key = false;
  3505. read_blobs = false;
  3506. //
3507. // pick the dictionary whose key filter applies: the primary dictionary, or, for a clustering key, that key's own dictionary
  3508. //
  3509. uint key_index = 0;
  3510. if (keynr == primary_key || keynr == MAX_KEY) {
  3511. key_index = primary_key;
  3512. }
  3513. else {
  3514. key_index = (table->key_info[keynr].flags & HA_CLUSTERING ? keynr : primary_key);
  3515. }
  3516. for (uint i = 0; i < table_share->fields; i++) {
  3517. if (bitmap_is_set(table->read_set,i) ||
  3518. bitmap_is_set(table->write_set,i)
  3519. )
  3520. {
  3521. if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
  3522. read_key = true;
  3523. }
  3524. else {
  3525. //
  3526. // if fixed field length
  3527. //
  3528. if (share->kc_info.field_lengths[i] != 0) {
  3529. //
  3530. // save the offset into the list
  3531. //
  3532. fixed_cols_for_query[curr_fixed_col_index] = i;
  3533. curr_fixed_col_index++;
  3534. }
  3535. //
  3536. // varchar or varbinary
  3537. //
  3538. else if (share->kc_info.length_bytes[i] != 0) {
  3539. var_cols_for_query[curr_var_col_index] = i;
  3540. curr_var_col_index++;
  3541. }
  3542. //
  3543. // it is a blob
  3544. //
  3545. else {
  3546. read_blobs = true;
  3547. }
  3548. }
  3549. }
  3550. }
  3551. num_fixed_cols_for_query = curr_fixed_col_index;
  3552. num_var_cols_for_query = curr_var_col_index;
  3553. }
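//
// Worked example (editorial sketch over a hypothetical table): for
// CREATE TABLE t (a INT, b VARCHAR(10), c TEXT, PRIMARY KEY(a)) and a query
// that reads columns a and c through the primary key, set_query_columns
// finds a covered by the key filter (read_key = true), records nothing for
// it in fixed_cols_for_query or var_cols_for_query, and sets
// read_blobs = true because of the TEXT column c. If b were also read, its
// field index would land in var_cols_for_query since it has length bytes.
//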
  3554. void ha_tokudb::column_bitmaps_signal() {
  3555. //
  3556. // if we have max number of indexes, then MAX_KEY == primary_key
  3557. //
  3558. if (active_index != MAX_KEY || active_index == primary_key) {
  3559. set_query_columns(active_index);
  3560. }
  3561. }
  3562. //
3563. // Notification that a scan of the entire secondary table is about
3564. // to take place. Will pre-acquire a table read lock
  3565. // Returns:
  3566. // 0 on success
  3567. // error otherwise
  3568. //
  3569. int ha_tokudb::prepare_index_scan() {
  3570. int error = 0;
  3571. DB* db = share->key_file[active_index];
  3572. lockretryN(read_lock_wait_time){
  3573. error = db->pre_acquire_read_lock(
  3574. db,
  3575. transaction,
  3576. db->dbt_neg_infty(), db->dbt_neg_infty(),
  3577. db->dbt_pos_infty(), db->dbt_pos_infty()
  3578. );
  3579. lockretry_wait;
  3580. }
  3581. if (error) { last_cursor_error = error; goto cleanup; }
  3582. range_lock_grabbed = true;
  3583. error = 0;
  3584. cleanup:
  3585. return error;
  3586. }
  3587. //
3588. // Notification that a range query retrieving all elements that equal a key
3589. // is about to take place. Will pre-acquire a read lock
  3590. // Returns:
  3591. // 0 on success
  3592. // error otherwise
  3593. //
  3594. int ha_tokudb::prepare_index_key_scan( const uchar * key, uint key_len ) {
  3595. int error = 0;
  3596. DBT start_key, end_key;
  3597. pack_key(&start_key, active_index, key_buff, key, key_len, COL_NEG_INF);
  3598. pack_key(&end_key, active_index, key_buff2, key, key_len, COL_POS_INF);
  3599. lockretryN(read_lock_wait_time){
  3600. error = share->key_file[active_index]->pre_acquire_read_lock(
  3601. share->key_file[active_index],
  3602. transaction,
  3603. &start_key,
  3604. share->key_file[active_index]->dbt_neg_infty(),
  3605. &end_key,
  3606. share->key_file[active_index]->dbt_pos_infty()
  3607. );
  3608. lockretry_wait;
  3609. }
  3610. if (error){
  3611. goto cleanup;
  3612. }
  3613. range_lock_grabbed = true;
  3614. error = 0;
  3615. cleanup:
  3616. if (error) {
  3617. last_cursor_error = error;
  3618. //
  3619. // cursor should be initialized here, but in case it is not, we still check
  3620. //
  3621. if (cursor) {
  3622. int r = cursor->c_close(cursor);
  3623. assert(r==0);
  3624. cursor = NULL;
  3625. }
  3626. }
  3627. return error;
  3628. }
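//
// Illustrative note (not in the original source): for an equality scan on a
// key value K, the code above brackets every duplicate of K by packing the
// same key bytes twice, once padded with COL_NEG_INF and once with
// COL_POS_INF, so the read lock covers the closed range
//
//   [ (K, -infinity suffix) , (K, +infinity suffix) ]
//
// which is exactly the set of rows whose key equals K.
//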
  3629. //
  3630. // Initializes local cursor on DB with index keynr
  3631. // Parameters:
  3632. // keynr - key (index) number
  3633. // sorted - 1 if result MUST be sorted according to index
  3634. // Returns:
  3635. // 0 on success
  3636. // error otherwise
  3637. //
  3638. int ha_tokudb::index_init(uint keynr, bool sorted) {
  3639. TOKUDB_DBUG_ENTER("ha_tokudb::index_init %p %d", this, keynr);
  3640. int error;
  3641. THD* thd = ha_thd();
  3642. DBUG_PRINT("enter", ("table: '%s' key: %d", table_share->table_name.str, keynr));
  3643. read_lock_wait_time = get_read_lock_wait_time(ha_thd());
  3644. /*
  3645. Under some very rare conditions (like full joins) we may already have
  3646. an active cursor at this point
  3647. */
  3648. if (cursor) {
  3649. DBUG_PRINT("note", ("Closing active cursor"));
  3650. int r = cursor->c_close(cursor);
  3651. assert(r==0);
  3652. }
  3653. active_index = keynr;
  3654. last_cursor_error = 0;
  3655. range_lock_grabbed = false;
  3656. DBUG_ASSERT(keynr <= table->s->keys);
  3657. DBUG_ASSERT(share->key_file[keynr]);
  3658. if ((error = share->key_file[keynr]->cursor(share->key_file[keynr], transaction, &cursor, 0))) {
  3659. last_cursor_error = error;
  3660. cursor = NULL; // Safety
  3661. goto exit;
  3662. }
  3663. bzero((void *) &last_key, sizeof(last_key));
  3664. if (thd_sql_command(thd) == SQLCOM_SELECT) {
  3665. set_query_columns(keynr);
  3666. unpack_entire_row = false;
  3667. }
  3668. else {
  3669. unpack_entire_row = true;
  3670. }
  3671. error = 0;
  3672. exit:
  3673. TOKUDB_DBUG_RETURN(error);
  3674. }
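//
// Typical call sequence from the handler layer (an editorial sketch with
// hypothetical caller-side names h, buf, key and key_len; not code from
// this file):
//
//   h->index_init(keynr, true);
//   h->index_read(buf, key, key_len, HA_READ_KEY_EXACT);
//   while (h->index_next_same(buf, key, key_len) == 0) { /* consume row */ }
//   h->index_end();
//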
  3675. //
  3676. // closes the local cursor
  3677. //
  3678. int ha_tokudb::index_end() {
  3679. TOKUDB_DBUG_ENTER("ha_tokudb::index_end %p", this);
  3680. int error = 0;
  3681. range_lock_grabbed = false;
  3682. if (cursor) {
  3683. DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
  3684. error = cursor->c_close(cursor);
  3685. assert(error==0);
  3686. cursor = NULL;
  3687. last_cursor_error = 0;
  3688. }
  3689. active_index = MAX_KEY;
  3690. //
  3691. // reset query variables
  3692. //
  3693. unpack_entire_row = true;
  3694. read_blobs = true;
  3695. read_key = true;
  3696. num_fixed_cols_for_query = 0;
  3697. num_var_cols_for_query = 0;
  3698. TOKUDB_DBUG_RETURN(error);
  3699. }
  3700. int ha_tokudb::handle_cursor_error(int error, int err_to_return, uint keynr) {
  3701. TOKUDB_DBUG_ENTER("ha_tokudb::handle_cursor_error");
  3702. if (error) {
  3703. last_cursor_error = error;
  3704. table->status = STATUS_NOT_FOUND;
  3705. int r = cursor->c_close(cursor);
  3706. assert(r==0);
  3707. cursor = NULL;
  3708. if (error == DB_NOTFOUND) {
  3709. error = err_to_return;
  3710. if ((share->key_file[keynr]->cursor(share->key_file[keynr], transaction, &cursor, 0))) {
  3711. cursor = NULL; // Safety
  3712. }
  3713. }
  3714. }
  3715. TOKUDB_DBUG_RETURN(error);
  3716. }
  3717. //
  3718. // Helper function for read_row and smart_dbt_callback_xxx functions
  3719. // When using a hidden primary key, upon reading a row,
  3720. // we set the current_ident field to whatever the primary key we retrieved
  3721. // was
  3722. //
  3723. void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
  3724. //
  3725. // extract hidden primary key to current_ident
  3726. //
  3727. if (hidden_primary_key) {
  3728. if (keynr == primary_key) {
  3729. memcpy_fixed(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
  3730. }
  3731. //
  3732. // if secondary key, hidden primary key is at end of found_key
  3733. //
  3734. else {
  3735. memcpy_fixed(
  3736. current_ident,
  3737. (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
  3738. TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
  3739. );
  3740. }
  3741. }
  3742. }
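//
// Key layout sketch (editorial, not from the original source): for a table
// with a hidden primary key, a secondary index entry looks like
//
//   [ packed secondary key bytes | hidden primary key bytes ]
//                                  ^ the last TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
//                                    bytes, copied into current_ident above
//
// while an entry in the main dictionary is just the hidden key itself.
//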
  3743. int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
  3744. assert(keynr == primary_key);
  3745. return unpack_row(buf, row,found_key, keynr);
  3746. }
  3747. //
3748. // Unpacks the contents of found_key, a DBT retrieved from the DB associated with keynr, into buf.
3749. // This function assumes that we are using a covering index, so no row is read; the one
3750. // exception is the main dictionary of a table with a hidden primary key, where nothing is unpacked.
3751. // Parameters:
3752. // [out] buf - buffer for the row, in MySQL format
3753. // keynr - index into key_file that represents DB we are currently operating on.
3754. // [in] found_key - key used to retrieve the row
  3756. //
  3757. void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
  3758. TOKUDB_DBUG_ENTER("ha_tokudb::read_key_only");
  3759. table->status = 0;
  3760. //
  3761. // only case when we do not unpack the key is if we are dealing with the main dictionary
  3762. // of a table with a hidden primary key
  3763. //
  3764. if (!(hidden_primary_key && keynr == primary_key)) {
  3765. unpack_key(buf, found_key, keynr);
  3766. }
  3767. DBUG_VOID_RETURN;
  3768. }
  3769. //
  3770. // Helper function used to try to retrieve the entire row
  3771. // If keynr is associated with the main table, reads contents of found_key and row into buf, otherwise,
  3772. // makes copy of primary key and saves it to last_key. This can later be used to retrieve the entire row
  3773. // Parameters:
  3774. // [out] buf - buffer for the row, in MySQL format
  3775. // keynr - index into key_file that represents DB we are currently operating on.
  3776. // [in] row - the row that has been read from the preceding DB call
  3777. // [in] found_key - key used to retrieve the row
  3778. //
  3779. int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
  3780. TOKUDB_DBUG_ENTER("ha_tokudb::read_primary_key");
  3781. int error = 0;
  3782. table->status = 0;
  3783. //
  3784. // case where we read from secondary table that is not clustered
  3785. //
  3786. if (keynr != primary_key && !(table->key_info[keynr].flags & HA_CLUSTERING)) {
  3787. bool has_null;
  3788. //
  3789. // create a DBT that has the same data as row, this is inefficient
  3790. // extract_hidden_primary_key MUST have been called before this
  3791. //
  3792. bzero((void *) &last_key, sizeof(last_key));
  3793. if (!hidden_primary_key) {
  3794. unpack_key(buf, found_key, keynr);
  3795. }
  3796. create_dbt_key_from_table(
  3797. &last_key,
  3798. primary_key,
  3799. key_buff,
  3800. buf,
  3801. &has_null
  3802. );
  3803. }
  3804. //
  3805. // else read from clustered/primary key
  3806. //
  3807. else {
  3808. error = unpack_row(buf, row, found_key, keynr);
  3809. if (error) { goto exit; }
  3810. }
  3811. if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
  3812. error = 0;
  3813. exit:
  3814. TOKUDB_DBUG_RETURN(error);
  3815. }
  3816. //
  3817. // This function reads an entire row into buf. This function also assumes that
  3818. // the key needed to retrieve the row is stored in the member variable last_key
  3819. // Parameters:
  3820. // [out] buf - buffer for the row, in MySQL format
  3821. // Returns:
  3822. // 0 on success, error otherwise
  3823. //
  3824. int ha_tokudb::read_full_row(uchar * buf) {
  3825. TOKUDB_DBUG_ENTER("ha_tokudb::read_full_row");
  3826. int error = 0;
  3827. struct smart_dbt_info info;
  3828. info.ha = this;
  3829. info.buf = buf;
  3830. info.keynr = primary_key;
  3831. //
  3832. // assumes key is stored in this->last_key
  3833. //
  3834. lockretryN(read_lock_wait_time){
  3835. error = share->file->getf_set(
  3836. share->file,
  3837. transaction,
  3838. 0,
  3839. &last_key,
  3840. smart_dbt_callback_rowread_ptquery,
  3841. &info
  3842. );
  3843. lockretry_wait;
  3844. }
  3845. if (error) {
  3846. table->status = STATUS_NOT_FOUND;
  3847. TOKUDB_DBUG_RETURN(error == DB_NOTFOUND ? HA_ERR_CRASHED : error);
  3848. }
  3849. TOKUDB_DBUG_RETURN(error);
  3850. }
  3851. //
  3852. // Reads the next row matching to the key, on success, advances cursor
  3853. // Parameters:
  3854. // [out] buf - buffer for the next row, in MySQL format
  3855. // [in] key - key value
  3856. // keylen - length of key
  3857. // Returns:
  3858. // 0 on success
  3859. // HA_ERR_END_OF_FILE if not found
  3860. // error otherwise
  3861. //
  3862. int ha_tokudb::index_next_same(uchar * buf, const uchar * key, uint keylen) {
  3863. TOKUDB_DBUG_ENTER("ha_tokudb::index_next_same %p", this);
  3864. int error = 0;
  3865. struct smart_dbt_info info;
  3866. DBT curr_key;
  3867. DBT found_key;
  3868. bool has_null;
  3869. int cmp;
  3870. u_int32_t flags;
  3871. THD* thd = ha_thd();
3872. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  3873. HANDLE_INVALID_CURSOR();
  3874. statistic_increment(table->in_use->status_var.ha_read_next_count, &LOCK_status);
  3875. info.ha = this;
  3876. info.buf = buf;
  3877. info.keynr = active_index;
  3878. pack_key(&curr_key, active_index, key_buff2, key, keylen, COL_ZERO);
  3879. flags = SET_READ_FLAG(0);
  3880. lockretryN(read_lock_wait_time){
  3881. error = cursor->c_getf_next(cursor, flags, SMART_DBT_CALLBACK, &info);
  3882. lockretry_wait;
  3883. }
  3884. error = handle_cursor_error(error, HA_ERR_END_OF_FILE,active_index);
  3885. if (error) {
  3886. goto cleanup;
  3887. }
  3888. if (!key_read && active_index != primary_key && !(table->key_info[active_index].flags & HA_CLUSTERING)) {
  3889. error = read_full_row(buf);
  3890. if (error) {
  3891. goto cleanup;
  3892. }
  3893. }
  3894. //
  3895. // now do the comparison
  3896. //
  3897. create_dbt_key_from_table(&found_key,active_index,key_buff3,buf,&has_null);
  3898. cmp = tokudb_prefix_cmp_dbt_key(share->key_file[active_index], &curr_key, &found_key);
  3899. if (cmp) {
  3900. error = HA_ERR_END_OF_FILE;
  3901. }
  3902. trx->stmt_progress.queried++;
  3903. track_progress(thd);
  3904. cleanup:
  3905. TOKUDB_DBUG_RETURN(error);
  3906. }
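//
// Illustrative example (not in the original source): with an index on
// (last_name) and a call index_next_same(buf, "Smith", ...), the cursor
// advances one entry and tokudb_prefix_cmp_dbt_key compares the packed
// "Smith" prefix against the key just read. Another "Smith" row compares
// equal and is returned; the first "Smyth" row compares unequal and the
// call returns HA_ERR_END_OF_FILE.
//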
  3907. //
  3908. // According to InnoDB handlerton: Positions an index cursor to the index
  3909. // specified in keynr. Fetches the row if any
  3910. // Parameters:
  3911. // [out] buf - buffer for the returned row
  3912. // [in] key - key value, according to InnoDB, if NULL,
  3913. // position cursor at start or end of index,
  3914. // not sure if this is done now
  3915. // key_len - length of key
  3916. // find_flag - according to InnoDB, search flags from my_base.h
  3917. // Returns:
  3918. // 0 on success
  3919. // HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
  3920. // we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
  3921. // TODO: investigate this for correctness
  3922. // error otherwise
  3923. //
  3924. int ha_tokudb::index_read(uchar * buf, const uchar * key, uint key_len, enum ha_rkey_function find_flag) {
  3925. TOKUDB_DBUG_ENTER("ha_tokudb::index_read %p find %d", this, find_flag);
  3926. // TOKUDB_DBUG_DUMP("key=", key, key_len);
  3927. DBT row;
  3928. DBT lookup_key;
  3929. int error = 0;
  3930. u_int32_t flags = 0;
  3931. THD* thd = ha_thd();
3932. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  3933. struct smart_dbt_info info;
  3934. struct index_read_info ir_info;
  3935. HANDLE_INVALID_CURSOR();
  3936. table->in_use->status_var.ha_read_key_count++;
  3937. bzero((void *) &row, sizeof(row));
  3938. info.ha = this;
  3939. info.buf = buf;
  3940. info.keynr = active_index;
  3941. ir_info.smart_dbt_info = info;
  3942. ir_info.cmp = 0;
  3943. flags = SET_READ_FLAG(0);
  3944. switch (find_flag) {
  3945. case HA_READ_KEY_EXACT: /* Find first record else error */
  3946. pack_key(&lookup_key, active_index, key_buff3, key, key_len, COL_NEG_INF);
  3947. ir_info.orig_key = &lookup_key;
  3948. lockretryN(read_lock_wait_time){
  3949. error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK, &ir_info);
  3950. lockretry_wait;
  3951. }
  3952. if (ir_info.cmp) {
  3953. error = DB_NOTFOUND;
  3954. }
  3955. break;
  3956. case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
  3957. pack_key(&lookup_key, active_index, key_buff3, key, key_len, COL_POS_INF);
  3958. lockretryN(read_lock_wait_time){
  3959. error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK, &info);
  3960. lockretry_wait;
  3961. }
  3962. break;
  3963. case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
  3964. pack_key(&lookup_key, active_index, key_buff3, key, key_len, COL_NEG_INF);
  3965. lockretryN(read_lock_wait_time){
  3966. error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK, &info);
  3967. lockretry_wait;
  3968. }
  3969. break;
  3970. case HA_READ_KEY_OR_NEXT: /* Record or next record */
  3971. pack_key(&lookup_key, active_index, key_buff3, key, key_len, COL_NEG_INF);
  3972. lockretryN(read_lock_wait_time){
  3973. error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK, &info);
  3974. lockretry_wait;
  3975. }
  3976. break;
  3977. //
  3978. // This case does not seem to ever be used, it is ok for it to be slow
  3979. //
  3980. case HA_READ_KEY_OR_PREV: /* Record or previous */
  3981. pack_key(&lookup_key, active_index, key_buff3, key, key_len, COL_NEG_INF);
  3982. ir_info.orig_key = &lookup_key;
  3983. lockretryN(read_lock_wait_time){
  3984. error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK, &ir_info);
  3985. lockretry_wait;
  3986. }
  3987. if (error == DB_NOTFOUND) {
  3988. error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK, &info);
  3989. }
  3990. else if (ir_info.cmp) {
  3991. error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK, &info);
  3992. }
  3993. break;
  3994. case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
  3995. pack_key(&lookup_key, active_index, key_buff3, key, key_len, COL_POS_INF);
  3996. lockretryN(read_lock_wait_time){
  3997. error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK, &info);
  3998. lockretry_wait;
  3999. }
  4000. break;
  4001. case HA_READ_PREFIX_LAST:
  4002. pack_key(&lookup_key, active_index, key_buff3, key, key_len, COL_POS_INF);
  4003. ir_info.orig_key = &lookup_key;
  4004. lockretryN(read_lock_wait_time){
  4005. error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK, &ir_info);
  4006. lockretry_wait;
  4007. }
  4008. if (ir_info.cmp) {
  4009. error = DB_NOTFOUND;
  4010. }
  4011. break;
  4012. default:
  4013. TOKUDB_TRACE("unsupported:%d\n", find_flag);
  4014. error = HA_ERR_UNSUPPORTED;
  4015. break;
  4016. }
  4017. error = handle_cursor_error(error,HA_ERR_KEY_NOT_FOUND,active_index);
  4018. if (!error && !key_read && active_index != primary_key && !(table->key_info[active_index].flags & HA_CLUSTERING)) {
  4019. error = read_full_row(buf);
  4020. }
  4021. if (error && (tokudb_debug & TOKUDB_DEBUG_ERROR)) {
  4022. TOKUDB_TRACE("error:%d:%d\n", error, find_flag);
  4023. }
  4024. trx->stmt_progress.queried++;
  4025. track_progress(thd);
  4026. cleanup:
  4027. TOKUDB_DBUG_RETURN(error);
  4028. }
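//
// Summary of the dispatch above (editorial comment added for clarity):
//
//   find_flag                     pad       cursor operation
//   HA_READ_KEY_EXACT             -inf      c_getf_set_range + prefix check
//   HA_READ_AFTER_KEY             +inf      c_getf_set_range
//   HA_READ_BEFORE_KEY            -inf      c_getf_set_range_reverse
//   HA_READ_KEY_OR_NEXT           -inf      c_getf_set_range
//   HA_READ_KEY_OR_PREV           -inf      c_getf_set_range, then prev/last
//   HA_READ_PREFIX_LAST_OR_PREV   +inf      c_getf_set_range_reverse
//   HA_READ_PREFIX_LAST           +inf      c_getf_set_range_reverse + prefix check
//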
  4029. //
  4030. // Reads the next row from the active index (cursor) into buf, and advances cursor
  4031. // Parameters:
  4032. // [out] buf - buffer for the next row, in MySQL format
  4033. // Returns:
  4034. // 0 on success
  4035. // HA_ERR_END_OF_FILE if not found
  4036. // error otherwise
  4037. //
  4038. int ha_tokudb::index_next(uchar * buf) {
  4039. TOKUDB_DBUG_ENTER("ha_tokudb::index_next");
  4040. int error = 0;
  4041. struct smart_dbt_info info;
  4042. u_int32_t flags = SET_READ_FLAG(0);
  4043. THD* thd = ha_thd();
4044. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4045. HANDLE_INVALID_CURSOR();
  4046. statistic_increment(table->in_use->status_var.ha_read_next_count, &LOCK_status);
  4047. info.ha = this;
  4048. info.buf = buf;
  4049. info.keynr = active_index;
  4050. lockretryN(read_lock_wait_time){
  4051. error = cursor->c_getf_next(cursor, flags, SMART_DBT_CALLBACK, &info);
  4052. lockretry_wait;
  4053. }
  4054. error = handle_cursor_error(error, HA_ERR_END_OF_FILE,active_index);
  4055. //
  4056. // still need to get entire contents of the row if operation done on
  4057. // secondary DB and it was NOT a covering index
  4058. //
  4059. if (!error && !key_read && (active_index != primary_key) && !(table->key_info[active_index].flags & HA_CLUSTERING) ) {
  4060. error = read_full_row(buf);
  4061. }
  4062. trx->stmt_progress.queried++;
  4063. track_progress(thd);
  4064. cleanup:
  4065. TOKUDB_DBUG_RETURN(error);
  4066. }
  4067. int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
  4068. return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
  4069. }
  4070. //
  4071. // Reads the previous row from the active index (cursor) into buf, and advances cursor
  4072. // Parameters:
  4073. // [out] buf - buffer for the next row, in MySQL format
  4074. // Returns:
  4075. // 0 on success
  4076. // HA_ERR_END_OF_FILE if not found
  4077. // error otherwise
  4078. //
  4079. int ha_tokudb::index_prev(uchar * buf) {
4080. TOKUDB_DBUG_ENTER("ha_tokudb::index_prev");
  4081. int error = 0;
  4082. struct smart_dbt_info info;
  4083. u_int32_t flags = SET_READ_FLAG(0);
  4084. THD* thd = ha_thd();
4085. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4086. HANDLE_INVALID_CURSOR();
  4087. statistic_increment(table->in_use->status_var.ha_read_next_count, &LOCK_status);
  4088. info.ha = this;
  4089. info.buf = buf;
  4090. info.keynr = active_index;
  4091. lockretryN(read_lock_wait_time){
  4092. error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK, &info);
  4093. lockretry_wait;
  4094. }
  4095. error = handle_cursor_error(error,HA_ERR_END_OF_FILE,active_index);
  4096. //
  4097. // still need to get entire contents of the row if operation done on
  4098. // secondary DB and it was NOT a covering index
  4099. //
  4100. if (!error && !key_read && (active_index != primary_key) && !(table->key_info[active_index].flags & HA_CLUSTERING) ) {
  4101. error = read_full_row(buf);
  4102. }
  4103. trx->stmt_progress.queried++;
  4104. track_progress(thd);
  4105. cleanup:
  4106. TOKUDB_DBUG_RETURN(error);
  4107. }
  4108. //
  4109. // Reads the first row from the active index (cursor) into buf, and advances cursor
  4110. // Parameters:
  4111. // [out] buf - buffer for the next row, in MySQL format
  4112. // Returns:
  4113. // 0 on success
  4114. // HA_ERR_END_OF_FILE if not found
  4115. // error otherwise
  4116. //
  4117. int ha_tokudb::index_first(uchar * buf) {
  4118. TOKUDB_DBUG_ENTER("ha_tokudb::index_first");
  4119. int error = 0;
  4120. struct smart_dbt_info info;
  4121. u_int32_t flags = SET_READ_FLAG(0);
  4122. THD* thd = ha_thd();
4123. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4124. HANDLE_INVALID_CURSOR();
  4125. statistic_increment(table->in_use->status_var.ha_read_first_count, &LOCK_status);
  4126. info.ha = this;
  4127. info.buf = buf;
  4128. info.keynr = active_index;
  4129. lockretryN(read_lock_wait_time){
  4130. error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK, &info);
  4131. lockretry_wait;
  4132. }
  4133. error = handle_cursor_error(error,HA_ERR_END_OF_FILE,active_index);
  4134. //
  4135. // still need to get entire contents of the row if operation done on
  4136. // secondary DB and it was NOT a covering index
  4137. //
  4138. if (!error && !key_read && (active_index != primary_key) && !(table->key_info[active_index].flags & HA_CLUSTERING) ) {
  4139. error = read_full_row(buf);
  4140. }
  4141. trx->stmt_progress.queried++;
  4142. track_progress(thd);
  4143. cleanup:
  4144. TOKUDB_DBUG_RETURN(error);
  4145. }
  4146. //
  4147. // Reads the last row from the active index (cursor) into buf, and advances cursor
  4148. // Parameters:
  4149. // [out] buf - buffer for the next row, in MySQL format
  4150. // Returns:
  4151. // 0 on success
  4152. // HA_ERR_END_OF_FILE if not found
  4153. // error otherwise
  4154. //
  4155. int ha_tokudb::index_last(uchar * buf) {
  4156. TOKUDB_DBUG_ENTER("ha_tokudb::index_last");
  4157. int error = 0;
  4158. struct smart_dbt_info info;
  4159. u_int32_t flags = SET_READ_FLAG(0);
  4160. THD* thd = ha_thd();
4161. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4162. HANDLE_INVALID_CURSOR();
  4163. statistic_increment(table->in_use->status_var.ha_read_last_count, &LOCK_status);
  4164. info.ha = this;
  4165. info.buf = buf;
  4166. info.keynr = active_index;
  4167. lockretryN(read_lock_wait_time){
  4168. error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK, &info);
  4169. lockretry_wait;
  4170. }
  4171. error = handle_cursor_error(error,HA_ERR_END_OF_FILE,active_index);
  4172. //
  4173. // still need to get entire contents of the row if operation done on
  4174. // secondary DB and it was NOT a covering index
  4175. //
  4176. if (!error && !key_read && (active_index != primary_key) && !(table->key_info[active_index].flags & HA_CLUSTERING) ) {
  4177. error = read_full_row(buf);
  4178. }
  4179. if (trx) {
  4180. trx->stmt_progress.queried++;
  4181. }
  4182. track_progress(thd);
  4183. cleanup:
  4184. TOKUDB_DBUG_RETURN(error);
  4185. }
  4186. //
  4187. // Initialize a scan of the table (which is why index_init is called on primary_key)
  4188. // Parameters:
4189. // scan - true for a full table scan; a table-wide read lock is then pre-acquired
  4190. // Returns:
  4191. // 0 on success
  4192. // error otherwise
  4193. //
  4194. int ha_tokudb::rnd_init(bool scan) {
  4195. TOKUDB_DBUG_ENTER("ha_tokudb::rnd_init");
  4196. int error = 0;
  4197. read_lock_wait_time = get_read_lock_wait_time(ha_thd());
  4198. range_lock_grabbed = false;
  4199. if (scan) {
  4200. DB* db = share->key_file[primary_key];
  4201. lockretryN(read_lock_wait_time){
  4202. error = db->pre_acquire_read_lock(db, transaction, db->dbt_neg_infty(), NULL, db->dbt_pos_infty(), NULL);
  4203. lockretry_wait;
  4204. }
  4205. if (error) { last_cursor_error = error; goto cleanup; }
  4206. }
  4207. error = index_init(primary_key, 0);
  4208. if (error) { goto cleanup;}
  4209. //
  4210. // only want to set range_lock_grabbed to true after index_init
  4211. // successfully executed for two reasons:
  4212. // 1) index_init will reset it to false anyway
  4213. // 2) if it fails, we don't want prelocking on,
  4214. //
  4215. if (scan) { range_lock_grabbed = true; }
  4216. error = 0;
  4217. cleanup:
  4218. TOKUDB_DBUG_RETURN(error);
  4219. }
  4220. //
  4221. // End a scan of the table
  4222. //
  4223. int ha_tokudb::rnd_end() {
  4224. TOKUDB_DBUG_ENTER("ha_tokudb::rnd_end");
  4225. range_lock_grabbed = false;
  4226. TOKUDB_DBUG_RETURN(index_end());
  4227. }
  4228. //
  4229. // Read the next row in a table scan
  4230. // Parameters:
  4231. // [out] buf - buffer for the next row, in MySQL format
  4232. // Returns:
  4233. // 0 on success
  4234. // HA_ERR_END_OF_FILE if not found
  4235. // error otherwise
  4236. //
  4237. int ha_tokudb::rnd_next(uchar * buf) {
4238. TOKUDB_DBUG_ENTER("ha_tokudb::rnd_next");
  4239. int error = 0;
  4240. u_int32_t flags = SET_READ_FLAG(0);
  4241. THD* thd = ha_thd();
4242. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4243. struct smart_dbt_info info;
  4244. HANDLE_INVALID_CURSOR();
  4245. //
  4246. // The reason we do not just call index_next is that index_next
  4247. // increments a different variable than we do here
  4248. //
  4249. statistic_increment(table->in_use->status_var.ha_read_rnd_next_count, &LOCK_status);
  4250. info.ha = this;
  4251. info.buf = buf;
  4252. info.keynr = primary_key;
  4253. lockretryN(read_lock_wait_time){
  4254. error = cursor->c_getf_next(cursor, flags, SMART_DBT_CALLBACK, &info);
  4255. lockretry_wait;
  4256. }
  4257. error = handle_cursor_error(error, HA_ERR_END_OF_FILE,primary_key);
  4258. trx->stmt_progress.queried++;
  4259. track_progress(thd);
  4260. cleanup:
  4261. TOKUDB_DBUG_RETURN(error);
  4262. }
  4263. void ha_tokudb::track_progress(THD* thd) {
  4264. tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4265. if (trx) {
  4266. ulonglong num_written = trx->stmt_progress.inserted + trx->stmt_progress.updated + trx->stmt_progress.deleted;
  4267. bool update_status =
  4268. (trx->stmt_progress.queried && tokudb_read_status_frequency && (trx->stmt_progress.queried % tokudb_read_status_frequency) == 0) ||
  4269. (num_written && tokudb_write_status_frequency && (num_written % tokudb_write_status_frequency) == 0);
  4270. if (update_status) {
  4271. char *next_status = write_status_msg;
  4272. bool first = true;
  4273. int r;
  4274. if (trx->stmt_progress.queried) {
  4275. r = sprintf(next_status, "Queried about %llu row%s", trx->stmt_progress.queried, trx->stmt_progress.queried == 1 ? "" : "s");
  4276. assert(r >= 0);
  4277. next_status += r;
  4278. first = false;
  4279. }
  4280. if (trx->stmt_progress.inserted) {
  4281. r = sprintf(next_status, "%sInserted about %llu row%s", first ? "" : ", ", trx->stmt_progress.inserted, trx->stmt_progress.inserted == 1 ? "" : "s");
  4282. assert(r >= 0);
  4283. next_status += r;
  4284. first = false;
  4285. }
  4286. if (trx->stmt_progress.updated) {
  4287. r = sprintf(next_status, "%sUpdated about %llu row%s", first ? "" : ", ", trx->stmt_progress.updated, trx->stmt_progress.updated == 1 ? "" : "s");
  4288. assert(r >= 0);
  4289. next_status += r;
  4290. first = false;
  4291. }
  4292. if (trx->stmt_progress.deleted) {
  4293. r = sprintf(next_status, "%sDeleted about %llu row%s", first ? "" : ", ", trx->stmt_progress.deleted, trx->stmt_progress.deleted == 1 ? "" : "s");
  4294. assert(r >= 0);
  4295. next_status += r;
  4296. first = false;
  4297. }
  4298. if (!first)
  4299. thd_proc_info(thd, write_status_msg);
  4300. }
  4301. }
  4302. }
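//
// Example of the resulting proc info line (illustrative): a statement that
// has read 2000 rows and updated 500 of them would show something like
//
//   Queried about 2000 rows, Updated about 500 rows
//
// in SHOW PROCESSLIST, refreshed every tokudb_read_status_frequency reads
// and every tokudb_write_status_frequency writes.
//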
  4303. DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
  4304. TOKUDB_DBUG_ENTER("ha_tokudb::get_pos");
  4305. /* We don't need to set app_data here */
  4306. bzero((void *) to, sizeof(*to));
  4307. to->data = pos + sizeof(u_int32_t);
  4308. to->size = *(u_int32_t *)pos;
  4309. DBUG_DUMP("key", (const uchar *) to->data, to->size);
  4310. DBUG_RETURN(to);
  4311. }
  4312. //
4313. // Retrieves a row based on the primary key saved in pos
  4314. // Returns:
  4315. // 0 on success
  4316. // HA_ERR_KEY_NOT_FOUND if not found
  4317. // error otherwise
  4318. //
  4319. int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
  4320. TOKUDB_DBUG_ENTER("ha_tokudb::rnd_pos");
  4321. DBT db_pos;
  4322. int error = 0;
  4323. struct smart_dbt_info info;
  4324. bool old_unpack_entire_row = unpack_entire_row;
  4325. DBT* key = get_pos(&db_pos, pos);
  4326. read_lock_wait_time = get_read_lock_wait_time(ha_thd());
  4327. unpack_entire_row = true;
  4328. statistic_increment(table->in_use->status_var.ha_read_rnd_count, &LOCK_status);
  4329. active_index = MAX_KEY;
  4330. info.ha = this;
  4331. info.buf = buf;
  4332. info.keynr = primary_key;
  4333. lockretryN(read_lock_wait_time) {
  4334. error = share->file->getf_set(share->file, transaction, 0, key, smart_dbt_callback_rowread_ptquery, &info);
  4335. lockretry_wait;
  4336. }
  4337. if (error == DB_NOTFOUND) {
  4338. error = HA_ERR_KEY_NOT_FOUND;
  4339. goto cleanup;
  4340. }
  4341. cleanup:
  4342. unpack_entire_row = old_unpack_entire_row;
  4343. TOKUDB_DBUG_RETURN(error);
  4344. }
  4345. int ha_tokudb::prelock_range( const key_range *start_key, const key_range *end_key) {
4346. TOKUDB_DBUG_ENTER("ha_tokudb::prelock_range");
  4347. int error = 0;
  4348. DBT start_dbt_key;
  4349. const DBT* start_dbt_data = NULL;
  4350. DBT end_dbt_key;
  4351. const DBT* end_dbt_data = NULL;
  4352. uchar* start_key_buff = key_buff2;
  4353. uchar* end_key_buff = key_buff3;
  4354. bzero((void *) &start_dbt_key, sizeof(start_dbt_key));
  4355. bzero((void *) &end_dbt_key, sizeof(end_dbt_key));
  4356. if (start_key) {
  4357. switch (start_key->flag) {
  4358. case HA_READ_AFTER_KEY:
  4359. pack_key(&start_dbt_key, active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
  4360. start_dbt_data = share->key_file[active_index]->dbt_pos_infty();
  4361. break;
  4362. default:
  4363. pack_key(&start_dbt_key, active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
  4364. start_dbt_data = share->key_file[active_index]->dbt_neg_infty();
  4365. break;
  4366. }
  4367. }
  4368. else {
  4369. start_dbt_data = share->key_file[active_index]->dbt_neg_infty();
  4370. }
  4371. if (end_key) {
  4372. switch (end_key->flag) {
  4373. case HA_READ_BEFORE_KEY:
  4374. pack_key(&end_dbt_key, active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
  4375. end_dbt_data = share->key_file[active_index]->dbt_neg_infty();
  4376. break;
  4377. default:
  4378. pack_key(&end_dbt_key, active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
  4379. end_dbt_data = share->key_file[active_index]->dbt_pos_infty();
  4380. break;
  4381. }
  4382. }
  4383. else {
  4384. end_dbt_data = share->key_file[active_index]->dbt_pos_infty();
  4385. }
  4386. lockretryN(read_lock_wait_time){
  4387. error = share->key_file[active_index]->pre_acquire_read_lock(
  4388. share->key_file[active_index],
  4389. transaction,
  4390. start_key ? &start_dbt_key : share->key_file[active_index]->dbt_neg_infty(),
  4391. start_dbt_data,
  4392. end_key ? &end_dbt_key : share->key_file[active_index]->dbt_pos_infty(),
  4393. end_dbt_data
  4394. );
  4395. lockretry_wait;
  4396. }
  4397. if (error){
  4398. last_cursor_error = error;
  4399. //
  4400. // cursor should be initialized here, but in case it is not, we still check
  4401. //
  4402. if (cursor) {
  4403. int r = cursor->c_close(cursor);
  4404. assert(r==0);
  4405. cursor = NULL;
  4406. }
  4407. goto cleanup;
  4408. }
  4409. cleanup:
  4410. TOKUDB_DBUG_RETURN(error);
  4411. }
  4412. //
  4413. // Prelock range if possible, start_key is leftmost, end_key is rightmost
  4414. // whether scanning forward or backward. This function is called by MySQL
  4415. // for backward range queries (in QUICK_SELECT_DESC::get_next).
  4416. // Forward scans use read_range_first()/read_range_next().
  4417. //
  4418. int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
  4419. int error = prelock_range(start_key, end_key);
  4420. if (!error) {
  4421. range_lock_grabbed = true;
  4422. }
  4423. return error;
  4424. }
  4425. int ha_tokudb::read_range_first(
  4426. const key_range *start_key,
  4427. const key_range *end_key,
  4428. bool eq_range,
  4429. bool sorted)
  4430. {
  4431. int error;
  4432. error = prelock_range(start_key, end_key);
  4433. if (error) { goto cleanup; }
  4434. range_lock_grabbed = true;
  4435. error = handler::read_range_first(start_key, end_key, eq_range, sorted);
  4436. cleanup:
  4437. return error;
  4438. }
  4439. int ha_tokudb::read_range_next()
  4440. {
  4441. TOKUDB_DBUG_ENTER("ha_tokudb::read_range_next");
  4442. int error;
  4443. error = handler::read_range_next();
  4444. if (error) {
  4445. range_lock_grabbed = false;
  4446. }
  4447. TOKUDB_DBUG_RETURN(error);
  4448. }
  4449. /*
  4450. Set a reference to the current record in (ref,ref_length).
  4451. SYNOPSIS
  4452. ha_tokudb::position()
  4453. record The current record buffer
  4454. DESCRIPTION
4455. The TokuDB handler stores the primary key in (ref,ref_length).
  4456. There is either an explicit primary key, or an implicit (hidden)
  4457. primary key.
  4458. During open(), 'ref_length' is calculated as the maximum primary
  4459. key length. When an actual key is shorter than that, the rest of
  4460. the buffer must be cleared out. The row cannot be identified, if
  4461. garbage follows behind the end of the key. There is no length
  4462. field for the current key, so that the whole ref_length is used
  4463. for comparison.
  4464. RETURN
  4465. nothing
  4466. */
  4467. void ha_tokudb::position(const uchar * record) {
  4468. TOKUDB_DBUG_ENTER("ha_tokudb::position");
  4469. DBT key;
  4470. if (hidden_primary_key) {
  4471. DBUG_ASSERT(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(u_int32_t)));
  4472. memcpy_fixed(ref + sizeof(u_int32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
  4473. *(u_int32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
  4474. }
  4475. else {
  4476. bool has_null;
  4477. //
  4478. // save the data
  4479. //
  4480. create_dbt_key_from_table(&key, primary_key, ref + sizeof(u_int32_t), record, &has_null);
  4481. //
  4482. // save the size of data in the first four bytes of ref
  4483. //
  4484. memcpy(ref, &key.size, sizeof(u_int32_t));
  4485. }
  4486. DBUG_VOID_RETURN;
  4487. }
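//
// Layout of ref after position() (editorial sketch):
//
//   +----------------------+---------------------------+
//   | key size (u_int32_t) | packed primary key bytes  |
//   +----------------------+---------------------------+
//
// per the description above, any unused tail of ref must stay cleared.
// get_pos() earlier in this file reverses the encoding: it points a DBT at
// ref + sizeof(u_int32_t) and takes the size from the first four bytes.
//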
  4488. //
  4489. // Per InnoDB: Returns statistics information of the table to the MySQL interpreter,
  4490. // in various fields of the handle object.
  4491. // Return:
  4492. // 0, always success
  4493. //
  4494. int ha_tokudb::info(uint flag) {
  4495. TOKUDB_DBUG_ENTER("ha_tokudb::info %p %d %lld", this, flag, (long long) share->rows);
  4496. int error;
  4497. DB_TXN* txn = NULL;
  4498. uint curr_num_DBs = table->s->keys + test(hidden_primary_key);
  4499. DB_BTREE_STAT64 dict_stats;
  4500. if (flag & HA_STATUS_VARIABLE) {
  4501. // Just to get optimizations right
  4502. stats.records = share->rows + share->rows_from_locked_table;
  4503. stats.deleted = 0;
  4504. if (!(flag & HA_STATUS_NO_LOCK)) {
  4505. u_int64_t num_rows = 0;
  4506. TOKU_DB_FRAGMENTATION_S frag_info = {0};
  4507. error = db_env->txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED);
  4508. if (error) { goto cleanup; }
  4509. error = estimate_num_rows(share->file,&num_rows, txn);
  4510. if (error == 0) {
  4511. share->rows = num_rows;
  4512. stats.records = num_rows;
  4513. }
  4514. else {
  4515. goto cleanup;
  4516. }
  4517. error = share->file->get_fragmentation(
  4518. share->file,
  4519. &frag_info
  4520. );
  4521. if (error) { goto cleanup; }
  4522. stats.delete_length = frag_info.unused_bytes;
  4523. error = share->file->stat64(
  4524. share->file,
  4525. txn,
  4526. &dict_stats
  4527. );
  4528. if (error) { goto cleanup; }
  4529. stats.data_file_length = dict_stats.bt_dsize;
  4530. if (hidden_primary_key) {
  4531. //
  4532. // in this case, we have a hidden primary key, do not
  4533. // want to report space taken up by the hidden primary key to the user
  4534. //
  4535. u_int64_t hpk_space = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH*dict_stats.bt_ndata;
  4536. stats.data_file_length = (hpk_space > stats.data_file_length) ? 0 : stats.data_file_length - hpk_space;
  4537. }
  4538. else {
  4539. //
  4540. // one infinity byte per key needs to be subtracted
  4541. //
  4542. u_int64_t inf_byte_space = dict_stats.bt_ndata;
  4543. stats.data_file_length = (inf_byte_space > stats.data_file_length) ? 0 : stats.data_file_length - inf_byte_space;
  4544. }
  4545. stats.mean_rec_length = stats.records ? (ulong)(stats.data_file_length/stats.records) : 0;
  4546. stats.index_file_length = 0;
  4547. for (uint i = 0; i < curr_num_DBs; i++) {
  4548. if (i == primary_key) {
  4549. continue;
  4550. }
  4551. error = share->key_file[i]->stat64(
  4552. share->key_file[i],
  4553. txn,
  4554. &dict_stats
  4555. );
  4556. if (error) { goto cleanup; }
  4557. stats.index_file_length += dict_stats.bt_dsize;
4558. error = share->key_file[i]->get_fragmentation(
4559. share->key_file[i],
4560. &frag_info
4561. );
  4562. if (error) { goto cleanup; }
  4563. stats.delete_length += frag_info.unused_bytes;
  4564. }
  4565. }
  4566. }
  4567. if ((flag & HA_STATUS_CONST)) {
  4568. stats.max_data_file_length= 9223372036854775807ULL;
  4569. for (uint i = 0; i < table_share->keys; i++) {
  4570. table->key_info[i].rec_per_key[table->key_info[i].key_parts - 1] = 0;
  4571. }
  4572. }
  4573. /* Don't return key if we got an error for the internal primary key */
  4574. if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
  4575. errkey = last_dup_key;
  4576. }
  4577. if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
  4578. THD *thd= table->in_use;
  4579. struct system_variables *variables= &thd->variables;
  4580. stats.auto_increment_value = share->last_auto_increment + variables->auto_increment_increment;
  4581. }
  4582. error = 0;
  4583. cleanup:
  4584. if (txn != NULL) {
  4585. commit_txn(txn, DB_TXN_NOSYNC);
  4586. txn = NULL;
  4587. }
  4588. TOKUDB_DBUG_RETURN(error);
  4589. }
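//
// Worked example (editorial, illustrative numbers): with bt_dsize =
// 1,000,000 bytes and bt_ndata = 10,000 rows, a hidden-pk table reports
// data_file_length = 1,000,000 - 10,000 * TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
// while a table with an explicit key reports 1,000,000 - 10,000 (one
// infinity byte per key). mean_rec_length is then data_file_length / records.
//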
  4590. //
  4591. // Per InnoDB: Tells something additional to the handler about how to do things.
  4592. //
  4593. int ha_tokudb::extra(enum ha_extra_function operation) {
  4594. TOKUDB_DBUG_ENTER("extra %p %d", this, operation);
  4595. switch (operation) {
  4596. case HA_EXTRA_RESET_STATE:
  4597. reset();
  4598. break;
  4599. case HA_EXTRA_KEYREAD:
  4600. key_read = 1; // Query satisfied with key
  4601. break;
  4602. case HA_EXTRA_NO_KEYREAD:
  4603. key_read = 0;
  4604. break;
  4605. case HA_EXTRA_IGNORE_DUP_KEY:
  4606. using_ignore = 1;
  4607. break;
  4608. case HA_EXTRA_NO_IGNORE_DUP_KEY:
  4609. using_ignore = 0;
  4610. break;
  4611. default:
  4612. break;
  4613. }
  4614. TOKUDB_DBUG_RETURN(0);
  4615. }
  4616. int ha_tokudb::reset(void) {
  4617. TOKUDB_DBUG_ENTER("ha_tokudb::reset");
  4618. key_read = 0;
  4619. using_ignore = 0;
  4620. TOKUDB_DBUG_RETURN(0);
  4621. }
  4622. //
  4623. // helper function that iterates through all DB's
  4624. // and grabs a lock (either read or write, but not both)
  4625. // Parameters:
  4626. // [in] trans - transaction to be used to pre acquire the lock
  4627. // lt - type of lock to get, either lock_read or lock_write
  4628. // Returns:
  4629. // 0 on success
  4630. // error otherwise
  4631. //
  4632. int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
  4633. int error = ENOSYS;
  4634. uint curr_num_DBs = table->s->keys + test(hidden_primary_key);
  4635. if (lt == lock_read) {
  4636. for (uint i = 0; i < curr_num_DBs; i++) {
  4637. DB* db = share->key_file[i];
  4638. error = db->pre_acquire_read_lock(
  4639. db,
  4640. trans,
  4641. db->dbt_neg_infty(), db->dbt_neg_infty(),
  4642. db->dbt_pos_infty(), db->dbt_pos_infty()
  4643. );
  4644. if (error) break;
  4645. }
  4646. if (error) goto cleanup;
  4647. }
  4648. else if (lt == lock_write) {
  4649. if (tokudb_debug & TOKUDB_DEBUG_LOCK)
  4650. TOKUDB_TRACE("%s\n", __FUNCTION__);
  4651. for (uint i = 0; i < curr_num_DBs; i++) {
  4652. DB* db = share->key_file[i];
  4653. error = db->pre_acquire_table_lock(db, trans);
  4654. if (error == EINVAL)
  4655. TOKUDB_TRACE("%s %d db=%p trans=%p\n", __FUNCTION__, i, db, trans);
  4656. if (error) break;
  4657. }
  4658. if (tokudb_debug & TOKUDB_DEBUG_LOCK)
  4659. TOKUDB_TRACE("%s error=%d\n", __FUNCTION__, error);
  4660. if (error) goto cleanup;
  4661. }
  4662. else {
  4663. error = ENOSYS;
  4664. goto cleanup;
  4665. }
  4666. error = 0;
  4667. cleanup:
  4668. return error;
  4669. }
  4670. int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
  4671. int error;
  4672. ulong tx_isolation = thd_tx_isolation(thd);
  4673. HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
  4674. /* First table lock, start transaction */
  4675. if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
  4676. !trx->all &&
  4677. (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
  4678. (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
  4679. (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
  4680. /* QQQ We have to start a master transaction */
  4681. // DBUG_PRINT("trans", ("starting transaction all "));
  4682. if ((error = db_env->txn_begin(db_env, NULL, &trx->all, toku_iso_to_txn_flag(toku_iso_level)))) {
  4683. trx->tokudb_lock_count--; // We didn't get the lock
  4684. goto cleanup;
  4685. }
  4686. if (tokudb_debug & TOKUDB_DEBUG_TXN) {
  4687. TOKUDB_TRACE("master:%p\n", trx->all);
  4688. }
  4689. trx->sp_level = trx->all;
  4690. trans_register_ha(thd, TRUE, tokudb_hton);
  4691. }
  4692. DBUG_PRINT("trans", ("starting transaction stmt"));
  4693. if (trx->stmt) {
  4694. if (tokudb_debug & TOKUDB_DEBUG_TXN) {
  4695. TOKUDB_TRACE("warning:stmt=%p\n", trx->stmt);
  4696. }
  4697. }
  4698. u_int32_t txn_begin_flags;
  4699. if (trx->all == NULL) {
  4700. txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
  4701. }
  4702. else {
  4703. txn_begin_flags = DB_INHERIT_ISOLATION;
  4704. }
  4705. if ((error = db_env->txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags))) {
  4706. /* We leave the possible master transaction open */
  4707. trx->tokudb_lock_count--; // We didn't get the lock
  4708. goto cleanup;
  4709. }
  4710. if (tokudb_debug & TOKUDB_DEBUG_TXN) {
  4711. TOKUDB_TRACE("stmt:%p:%p\n", trx->sp_level, trx->stmt);
  4712. }
  4713. reset_stmt_progress(&trx->stmt_progress);
  4714. trans_register_ha(thd, FALSE, tokudb_hton);
  4715. cleanup:
  4716. return error;
  4717. }
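//
// Transaction hierarchy sketch (editorial): with autocommit off, create_txn
// maintains
//
//   trx->all        master transaction for the multi-statement unit
//     trx->sp_level   current savepoint level (initially == trx->all)
//       trx->stmt       per-statement transaction, begun with DB_INHERIT_ISOLATION
//
// With autocommit on, trx->all stays NULL and trx->stmt is begun directly
// with the isolation flags derived from the session's tx_isolation.
//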
  4718. /*
  4719. As MySQL will execute an external lock for every new table it uses
  4720. we can use this to start the transactions.
  4721. If we are in auto_commit mode we just need to start a transaction
  4722. for the statement to be able to rollback the statement.
  4723. If not, we have to start a master transaction if there doesn't exist
  4724. one from before.
  4725. */
  4726. //
  4727. // Parameters:
  4728. // [in] thd - handle to the user thread
  4729. // lock_type - the type of lock
  4730. // Returns:
  4731. // 0 on success
  4732. // error otherwise
  4733. //
  4734. int ha_tokudb::external_lock(THD * thd, int lock_type) {
  4735. TOKUDB_DBUG_ENTER("ha_tokudb::external_lock cmd=%d %d", thd_sql_command(thd), lock_type);
  4736. if (tokudb_debug & TOKUDB_DEBUG_LOCK)
  4737. TOKUDB_TRACE("%s cmd=%d %d\n", __FUNCTION__, thd_sql_command(thd), lock_type);
  4738. // QQQ this is here to allow experiments without transactions
  4739. int error = 0;
  4740. tokudb_trx_data *trx = NULL;
  4741. trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4742. if (!trx) {
  4743. error = create_tokudb_trx_data_instance(&trx);
  4744. if (error) { goto cleanup; }
  4745. thd_data_set(thd, tokudb_hton->slot, trx);
  4746. }
  4747. if (trx->all == NULL) {
  4748. trx->sp_level = NULL;
  4749. }
  4750. if (lock_type != F_UNLCK) {
  4751. if (!trx->tokudb_lock_count++) {
  4752. DBUG_ASSERT(trx->stmt == 0);
  4753. transaction = NULL; // Safety
  4754. error = create_txn(thd, trx);
  4755. if (error) {
  4756. goto cleanup;
  4757. }
  4758. }
  4759. transaction = trx->stmt;
  4760. }
  4761. else {
  4762. lock.type = TL_UNLOCK; // Unlocked
  4763. pthread_mutex_lock(&share->mutex);
4764. // avoid signed vs. unsigned comparison problems by checking the order explicitly
  4765. if (deleted_rows > added_rows && share->rows < (deleted_rows - added_rows)) {
  4766. share->rows = 0;
  4767. }
  4768. else {
  4769. share->rows += (added_rows - deleted_rows);
  4770. }
  4771. pthread_mutex_unlock(&share->mutex);
  4772. added_rows = 0;
  4773. deleted_rows = 0;
  4774. share->rows_from_locked_table = 0;
  4775. if (!--trx->tokudb_lock_count) {
  4776. if (trx->stmt) {
  4777. /*
  4778. F_UNLCK is done without a transaction commit / rollback.
4779. This happens if the thread didn't update any rows.
4780. We must in this case commit the work so the row locks are released.
  4781. */
  4782. DBUG_PRINT("trans", ("commiting non-updating transaction"));
  4783. commit_txn(trx->stmt, 0);
  4784. reset_stmt_progress(&trx->stmt_progress);
  4785. if (tokudb_debug & TOKUDB_DEBUG_TXN)
  4786. TOKUDB_TRACE("commit:%p:%d\n", trx->stmt, error);
  4787. trx->stmt = NULL;
  4788. }
  4789. }
  4790. transaction = NULL;
  4791. }
  4792. cleanup:
  4793. if (tokudb_debug & TOKUDB_DEBUG_LOCK)
  4794. TOKUDB_TRACE("%s error=%d\n", __FUNCTION__, error);
  4795. TOKUDB_DBUG_RETURN(error);
  4796. }
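//
// Lifecycle sketch (editorial): the first external_lock of a statement bumps
// tokudb_lock_count from 0 to 1 and begins trx->stmt via create_txn; the
// matching F_UNLCK drops the count back to 0 and, if the statement made no
// updates (so the handlerton commit never ran), commits trx->stmt here.
//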
  4797. /*
  4798. When using LOCK TABLE's external_lock is only called when the actual
  4799. TABLE LOCK is done.
4800. Under LOCK TABLES, each used table forces a call to start_stmt.
  4801. */
  4802. int ha_tokudb::start_stmt(THD * thd, thr_lock_type lock_type) {
  4803. TOKUDB_DBUG_ENTER("ha_tokudb::start_stmt");
  4804. int error = 0;
  4805. tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  4806. DBUG_ASSERT(trx);
  4807. /*
  4808. note that trx->stmt may have been already initialized as start_stmt()
  4809. is called for *each table* not for each storage engine,
  4810. and there could be many bdb tables referenced in the query
  4811. */
  4812. if (!trx->stmt) {
  4813. DBUG_PRINT("trans", ("starting transaction stmt"));
  4814. error = create_txn(thd, trx);
  4815. if (error) {
  4816. goto cleanup;
  4817. }
  4818. }
  4819. //
4820. // we know we are in LOCK TABLES, so attempt to grab a table lock.
4821. // if this fails, continue and do not return an error:
4822. // a failure simply means another active transaction holds some locks,
4823. // and that transaction may modify this table until it is unlocked.
4824. // it is therefore acceptable for acquire_table_lock to grab some
4825. // of the locks but not all of them.
  4828. //
  4829. if (lock.type <= TL_READ_NO_INSERT) {
  4830. acquire_table_lock(trx->stmt,lock_read);
  4831. }
  4832. else {
  4833. acquire_table_lock(trx->stmt,lock_write);
  4834. }
  4835. if (added_rows > deleted_rows) {
  4836. share->rows_from_locked_table = added_rows - deleted_rows;
  4837. }
  4838. transaction = trx->stmt;
  4839. cleanup:
  4840. TOKUDB_DBUG_RETURN(error);
  4841. }
  4842. /*
  4843. The idea with handler::store_lock() is the following:
  4844. The statement decided which locks we should need for the table
  4845. for updates/deletes/inserts we get WRITE locks, for SELECT... we get
  4846. read locks.
  4847. Before adding the lock into the table lock handler (see thr_lock.c)
  4848. mysqld calls store lock with the requested locks. Store lock can now
  4849. modify a write lock to a read lock (or some other lock), ignore the
  4850. lock (if we don't want to use MySQL table locks at all) or add locks
  4851. for many tables (like we do when we are using a MERGE handler).
4852. TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which
4853. signals that we are doing WRITES, but we are still allowing other
4854. readers and writers).
4855. When releasing locks, store_lock() is also called. In this case one
4856. usually doesn't have to do anything.
  4857. In some exceptional cases MySQL may send a request for a TL_IGNORE;
  4858. This means that we are requesting the same lock as last time and this
  4859. should also be ignored. (This may happen when someone does a flush
  4860. table when we have opened a part of the tables, in which case mysqld
4861. closes and reopens the tables and tries to get the same locks as last
  4862. time). In the future we will probably try to remove this.
  4863. */
  4864. THR_LOCK_DATA **ha_tokudb::store_lock(THD * thd, THR_LOCK_DATA ** to, enum thr_lock_type lock_type) {
  4865. TOKUDB_DBUG_ENTER("ha_tokudb::store_lock, lock_type=%d cmd=%d", lock_type, thd_sql_command(thd));
  4866. if (tokudb_debug & TOKUDB_DEBUG_LOCK)
  4867. TOKUDB_TRACE("%s lock_type=%d cmd=%d\n", __FUNCTION__, lock_type, thd_sql_command(thd));
  4868. if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
  4869. /* If we are not doing a LOCK TABLE, then allow multiple writers */
  4870. if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
  4871. !thd->in_lock_tables && thd_sql_command(thd) != SQLCOM_TRUNCATE && !thd_tablespace_op(thd)) {
  4872. lock_type = TL_WRITE_ALLOW_WRITE;
  4873. }
  4874. lock.type = lock_type;
  4875. }
  4876. *to++ = &lock;
  4877. if (tokudb_debug & TOKUDB_DEBUG_LOCK)
  4878. TOKUDB_TRACE("%s lock_type=%d\n", __FUNCTION__, lock_type);
  4879. DBUG_RETURN(to);
  4880. }
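//
// Example of the conversion above (illustrative): a plain UPDATE arrives
// with lock_type == TL_WRITE; since we are not inside LOCK TABLES or
// TRUNCATE, store_lock downgrades it to TL_WRITE_ALLOW_WRITE, so concurrent
// readers and writers of the same table are not serialized by the MySQL
// table-lock layer.
//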
  4881. int toku_dbt_up(DB*,
  4882. u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val,
  4883. u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val) {
  4884. assert(false);
  4885. return 0;
  4886. }
  4887. static int create_sub_table(const char *table_name, DBT* row_descriptor, DB_TXN* txn) {
  4888. TOKUDB_DBUG_ENTER("create_sub_table");
  4889. int error;
  4890. DB *file = NULL;
  4891. error = db_create(&file, db_env, 0);
  4892. if (error) {
  4893. DBUG_PRINT("error", ("Got error: %d when creating table", error));
  4894. my_errno = error;
  4895. goto exit;
  4896. }
  4897. error = file->set_descriptor(file, 1, row_descriptor, toku_dbt_up);
  4898. if (error) {
  4899. DBUG_PRINT("error", ("Got error: %d when setting row descriptor for table '%s'", error, table_name));
  4900. goto exit;
  4901. }
  4902. error = file->open(file, txn, table_name, NULL, DB_BTREE, DB_THREAD | DB_CREATE | DB_EXCL, my_umask);
  4903. if (error) {
  4904. DBUG_PRINT("error", ("Got error: %d when opening table '%s'", error, table_name));
  4905. goto exit;
  4906. }
  4907. error = 0;
  4908. exit:
  4909. if (file) {
  4910. int r = file->close(file, 0);
  4911. assert(r==0);
  4912. }
  4913. TOKUDB_DBUG_RETURN(error);
  4914. }
  4915. void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
  4916. if (share->has_auto_inc) {
  4917. info(HA_STATUS_AUTO);
  4918. create_info->auto_increment_value = stats.auto_increment_value;
  4919. }
  4920. }
  4921. //
// removes a key name from status.tokudb.
// needed when we are dropping indexes, so that
// during a later drop table we do not attempt to remove already-dropped
// indexes; this keeps status.tokudb in sync with the list of indexes.
  4926. //
  4927. int ha_tokudb::remove_key_name_from_status(DB* status_block, char* key_name, DB_TXN* txn) {
  4928. int error;
  4929. uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
  4930. HA_METADATA_KEY md_key = hatoku_key_name;
  4931. memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
  4932. //
// build the status key: metadata tag followed by the index name
  4934. //
  4935. memcpy(
  4936. status_key_info + sizeof(HA_METADATA_KEY),
  4937. key_name,
  4938. strlen(key_name) + 1
  4939. );
  4940. error = remove_metadata(
  4941. status_block,
  4942. status_key_info,
  4943. sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
  4944. txn
  4945. );
  4946. return error;
  4947. }
  4948. //
  4949. // writes the key name in status.tokudb, so that we may later delete or rename
  4950. // the dictionary associated with key_name
  4951. //
  4952. int ha_tokudb::write_key_name_to_status(DB* status_block, char* key_name, DB_TXN* txn) {
  4953. int error;
  4954. uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
  4955. HA_METADATA_KEY md_key = hatoku_key_name;
  4956. memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
  4957. //
  4958. // put index name in status.tokudb
  4959. //
  4960. memcpy(
  4961. status_key_info + sizeof(HA_METADATA_KEY),
  4962. key_name,
  4963. strlen(key_name) + 1
  4964. );
  4965. error = write_metadata(
  4966. status_block,
  4967. status_key_info,
  4968. sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
  4969. NULL,
  4970. 0,
  4971. txn
  4972. );
  4973. return error;
  4974. }
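//
// Sketch of the status.tokudb record built by the two functions above
// (implied by the memcpy calls; the value part is empty):
//
//   key = [ HA_METADATA_KEY == hatoku_key_name ][ key_name bytes ][ '\0' ]
//
// remove_key_name_from_status() constructs the identical key, which is why
// the two functions must stay in sync.
//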
  4975. //
  4976. // some tracing moved out of ha_tokudb::create, because ::create was getting cluttered
  4977. //
  4978. void ha_tokudb::trace_create_table_info(const char *name, TABLE * form) {
  4979. uint i;
  4980. //
  4981. // tracing information about what type of table we are creating
  4982. //
  4983. if (tokudb_debug & TOKUDB_DEBUG_OPEN) {
  4984. for (i = 0; i < form->s->fields; i++) {
  4985. Field *field = form->s->field[i];
  4986. TOKUDB_TRACE("field:%d:%s:type=%d:flags=%x\n", i, field->field_name, field->type(), field->flags);
  4987. }
  4988. for (i = 0; i < form->s->keys; i++) {
  4989. KEY *key = &form->s->key_info[i];
  4990. TOKUDB_TRACE("key:%d:%s:%d\n", i, key->name, key->key_parts);
  4991. uint p;
  4992. for (p = 0; p < key->key_parts; p++) {
  4993. KEY_PART_INFO *key_part = &key->key_part[p];
  4994. Field *field = key_part->field;
  4995. TOKUDB_TRACE("key:%d:%d:length=%d:%s:type=%d:flags=%x\n",
  4996. i, p, key_part->length, field->field_name, field->type(), field->flags);
  4997. }
  4998. }
  4999. }
  5000. }
  5001. //
  5002. // creates dictionary for secondary index, with key description key_info, all using txn
  5003. //
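// The row descriptor assembled below is laid out as follows (a sketch of
// the layout implied by the three create_toku_* calls; sizes vary):
//
//   [ key comparison descriptor ]
//   [ secondary key pack descriptor ]
//   [ clustering val pack descriptor ]
//
// max_row_desc_buff_size is computed as the sum of upper bounds for these
// three parts.
//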
  5004. int ha_tokudb::create_secondary_dictionary(const char* name, TABLE* form, KEY* key_info, DB_TXN* txn, KEY_AND_COL_INFO* kc_info, u_int32_t keynr) {
  5005. int error;
  5006. DBT row_descriptor;
  5007. uchar* row_desc_buff = NULL;
  5008. uchar* ptr = NULL;
  5009. char* newname = NULL;
  5010. KEY* prim_key = NULL;
  5011. char dict_name[MAX_DICT_NAME_LEN];
  5012. u_int32_t max_row_desc_buff_size;
  5013. uint hpk= (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
  5014. bzero(&row_descriptor, sizeof(row_descriptor));
  5015. max_row_desc_buff_size = 2*(form->s->fields * 6)+10; // upper bound of key comparison descriptor
  5016. max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info); // upper bound for sec. key part
  5017. max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s); // upper bound for clustering val part
  5018. row_desc_buff = (uchar *)my_malloc(max_row_desc_buff_size, MYF(MY_WME));
  5019. if (row_desc_buff == NULL){ error = ENOMEM; goto cleanup;}
  5020. ptr = row_desc_buff;
  5021. newname = (char *)my_malloc(get_max_dict_name_path_length(name),MYF(MY_WME));
  5022. if (newname == NULL){ error = ENOMEM; goto cleanup;}
  5023. sprintf(dict_name, "key-%s", key_info->name);
  5024. make_name(newname, name, dict_name);
  5025. prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
  5026. //
  5027. // setup the row descriptor
  5028. //
  5029. row_descriptor.data = row_desc_buff;
  5030. //
  5031. // save data necessary for key comparisons
  5032. //
  5033. ptr += create_toku_key_descriptor(
  5034. row_desc_buff,
  5035. false,
  5036. key_info,
  5037. hpk,
  5038. prim_key
  5039. );
  5040. ptr += create_toku_secondary_key_pack_descriptor(
  5041. ptr,
  5042. hpk,
  5043. primary_key,
  5044. form->s,
  5045. form,
  5046. kc_info,
  5047. key_info,
  5048. prim_key
  5049. );
  5050. ptr += create_toku_clustering_val_pack_descriptor(
  5051. ptr,
  5052. primary_key,
  5053. form->s,
  5054. kc_info,
  5055. keynr,
  5056. key_info->flags & HA_CLUSTERING
  5057. );
  5058. row_descriptor.size = ptr - row_desc_buff;
  5059. assert(row_descriptor.size <= max_row_desc_buff_size);
  5060. error = create_sub_table(newname, &row_descriptor, txn);
  5061. cleanup:
  5062. my_free(newname, MYF(MY_ALLOW_ZERO_PTR));
  5063. my_free(row_desc_buff, MYF(MY_ALLOW_ZERO_PTR));
  5064. return error;
  5065. }
  5066. //
// creates and closes the main dictionary named "name" using table form, all within
// transaction txn.
  5069. //
  5070. int ha_tokudb::create_main_dictionary(const char* name, TABLE* form, DB_TXN* txn, KEY_AND_COL_INFO* kc_info) {
  5071. int error;
  5072. DBT row_descriptor;
  5073. uchar* row_desc_buff = NULL;
  5074. uchar* ptr = NULL;
  5075. char* newname = NULL;
  5076. KEY* prim_key = NULL;
  5077. u_int32_t max_row_desc_buff_size;
  5078. uint hpk= (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
  5079. bzero(&row_descriptor, sizeof(row_descriptor));
  5080. max_row_desc_buff_size = 2*(form->s->fields * 6)+10; // upper bound of key comparison descriptor
  5081. max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info); // upper bound for sec. key part
  5082. max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s); // upper bound for clustering val part
  5083. row_desc_buff = (uchar *)my_malloc(max_row_desc_buff_size, MYF(MY_WME));
  5084. if (row_desc_buff == NULL){ error = ENOMEM; goto cleanup;}
  5085. ptr = row_desc_buff;
  5086. newname = (char *)my_malloc(get_max_dict_name_path_length(name),MYF(MY_WME));
  5087. if (newname == NULL){ error = ENOMEM; goto cleanup;}
  5088. make_name(newname, name, "main");
  5089. prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
  5090. //
  5091. // setup the row descriptor
  5092. //
  5093. row_descriptor.data = row_desc_buff;
  5094. //
  5095. // save data necessary for key comparisons
  5096. //
  5097. ptr += create_toku_key_descriptor(
  5098. row_desc_buff,
  5099. hpk,
  5100. prim_key,
  5101. false,
  5102. NULL
  5103. );
  5104. ptr += create_toku_main_key_pack_descriptor(
  5105. ptr
  5106. );
  5107. ptr += create_toku_clustering_val_pack_descriptor(
  5108. ptr,
  5109. primary_key,
  5110. form->s,
  5111. kc_info,
  5112. primary_key,
  5113. false
  5114. );
  5115. row_descriptor.size = ptr - row_desc_buff;
  5116. assert(row_descriptor.size <= max_row_desc_buff_size);
  5117. /* Create the main table that will hold the real rows */
  5118. error = create_sub_table(newname, &row_descriptor, txn);
  5119. cleanup:
  5120. my_free(newname, MYF(MY_ALLOW_ZERO_PTR));
  5121. my_free(row_desc_buff, MYF(MY_ALLOW_ZERO_PTR));
  5122. return error;
  5123. }
  5124. //
  5125. // Creates a new table
  5126. // Parameters:
  5127. // [in] name - table name
  5128. // [in] form - info on table, columns and indexes
// [in] create_info - more info on table; used here for the initial auto-increment value
  5130. // Returns:
  5131. // 0 on success
  5132. // error otherwise
  5133. //
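// For example (hypothetical table; on-disk names come from make_name):
//
//   CREATE TABLE t (a INT PRIMARY KEY, b INT, KEY bk (b));
//
// creates three dictionaries: "status" (metadata), "main" (the rows), and
// "key-bk" (the secondary index), with "bk" also recorded in status.tokudb
// via write_key_name_to_status().
//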
  5134. int ha_tokudb::create(const char *name, TABLE * form, HA_CREATE_INFO * create_info) {
  5135. TOKUDB_DBUG_ENTER("ha_tokudb::create");
  5136. int error;
  5137. DB *status_block = NULL;
  5138. uint version;
  5139. uint capabilities;
  5140. DB_TXN* txn = NULL;
  5141. char* newname = NULL;
  5142. KEY_AND_COL_INFO kc_info;
  5143. bzero(&kc_info, sizeof(kc_info));
  5144. pthread_mutex_lock(&tokudb_meta_mutex);
  5145. newname = (char *)my_malloc(get_max_dict_name_path_length(name),MYF(MY_WME));
  5146. if (newname == NULL){ error = ENOMEM; goto cleanup;}
  5147. error = db_env->txn_begin(db_env, 0, &txn, 0);
  5148. if (error) { goto cleanup; }
  5149. primary_key = form->s->primary_key;
  5150. hidden_primary_key = (primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
  5151. if (hidden_primary_key) {
  5152. primary_key = form->s->keys;
  5153. }
  5154. /* do some tracing */
  5155. trace_create_table_info(name,form);
  5156. /* Create status.tokudb and save relevant metadata */
  5157. make_name(newname, name, "status");
  5158. error = db_create(&status_block, db_env, 0);
  5159. if (error) { goto cleanup; }
  5160. error = status_block->open(status_block, txn, newname, NULL, DB_BTREE, DB_CREATE | DB_EXCL, 0);
  5161. if (error) { goto cleanup; }
  5162. version = HA_TOKU_VERSION;
  5163. capabilities = HA_TOKU_CAP;
  5164. error = write_to_status(status_block, hatoku_version,&version,sizeof(version), txn);
  5165. if (error) { goto cleanup; }
  5166. error = write_to_status(status_block, hatoku_capabilities,&capabilities,sizeof(capabilities), txn);
  5167. if (error) { goto cleanup; }
  5168. error = write_auto_inc_create(status_block, create_info->auto_increment_value, txn);
  5169. if (error) { goto cleanup; }
  5170. error = allocate_key_and_col_info(form->s, &kc_info);
  5171. if (error) { goto cleanup; }
  5172. error = initialize_key_and_col_info(
  5173. form->s,
  5174. form,
  5175. &kc_info,
  5176. hidden_primary_key,
  5177. primary_key
  5178. );
  5179. if (error) { goto cleanup; }
  5180. error = create_main_dictionary(name, form, txn, &kc_info);
  5181. if (error) {
  5182. goto cleanup;
  5183. }
  5184. for (uint i = 0; i < form->s->keys; i++) {
  5185. if (i != primary_key) {
  5186. error = create_secondary_dictionary(name, form, &form->key_info[i], txn, &kc_info, i);
  5187. if (error) {
  5188. goto cleanup;
  5189. }
  5190. error = write_key_name_to_status(status_block, form->s->key_info[i].name, txn);
  5191. if (error) { goto cleanup; }
  5192. }
  5193. }
  5194. error = add_table_to_metadata(name, form, txn);
  5195. if (error) { goto cleanup; }
  5196. error = 0;
  5197. cleanup:
  5198. if (status_block != NULL) {
  5199. int r = status_block->close(status_block, 0);
  5200. assert(r==0);
  5201. }
  5202. free_key_and_col_info(&kc_info);
  5203. if (txn) {
  5204. if (error) {
  5205. abort_txn(txn);
  5206. }
  5207. else {
  5208. commit_txn(txn,0);
  5209. }
  5210. }
  5211. my_free(newname, MYF(MY_ALLOW_ZERO_PTR));
  5212. pthread_mutex_unlock(&tokudb_meta_mutex);
  5213. TOKUDB_DBUG_RETURN(error);
  5214. }
  5215. int ha_tokudb::discard_or_import_tablespace(my_bool discard) {
  5216. /*
  5217. if (discard) {
  5218. my_errno=HA_ERR_WRONG_COMMAND;
  5219. return my_errno;
  5220. }
  5221. return add_table_to_metadata(share->table_name);
  5222. */
  5223. my_errno=HA_ERR_WRONG_COMMAND;
  5224. return my_errno;
  5225. }
  5226. //
  5227. // deletes from_name or renames from_name to to_name, all using transaction txn.
// is_delete specifies which operation we are doing.
// is_key specifies whether it is a secondary index (and hence a "key-" prefix
// needs to be prepended) or not.
  5231. //
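// For example (hypothetical names): with secondary_name "bk" and is_key
// true, the dictionary operated on is the one make_name() produces from
// from_name and "key-bk"; with is_key false and secondary_name "main" or
// "status", it is the corresponding non-index dictionary of from_name.
//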
  5232. int ha_tokudb::delete_or_rename_dictionary( const char* from_name, const char* to_name, const char* secondary_name, bool is_key, DB_TXN* txn, bool is_delete) {
  5233. int error;
  5234. char dict_name[MAX_DICT_NAME_LEN];
  5235. char* new_from_name = NULL;
  5236. char* new_to_name = NULL;
  5237. assert(txn);
  5238. new_from_name = (char *)my_malloc(
  5239. get_max_dict_name_path_length(from_name),
  5240. MYF(MY_WME)
  5241. );
  5242. if (new_from_name == NULL) {
  5243. error = ENOMEM;
  5244. goto cleanup;
  5245. }
  5246. if (!is_delete) {
  5247. assert(to_name);
  5248. new_to_name = (char *)my_malloc(
  5249. get_max_dict_name_path_length(to_name),
  5250. MYF(MY_WME)
  5251. );
  5252. if (new_to_name == NULL) {
  5253. error = ENOMEM;
  5254. goto cleanup;
  5255. }
  5256. }
  5257. if (is_key) {
  5258. sprintf(dict_name, "key-%s", secondary_name);
  5259. make_name(new_from_name, from_name, dict_name);
  5260. }
  5261. else {
  5262. make_name(new_from_name, from_name, secondary_name);
  5263. }
  5264. if (!is_delete) {
  5265. if (is_key) {
  5266. sprintf(dict_name, "key-%s", secondary_name);
  5267. make_name(new_to_name, to_name, dict_name);
  5268. }
  5269. else {
  5270. make_name(new_to_name, to_name, secondary_name);
  5271. }
  5272. }
  5273. if (is_delete) {
  5274. error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
  5275. }
  5276. else {
  5277. error = db_env->dbrename(db_env, txn, new_from_name, NULL, new_to_name, 0);
  5278. }
  5279. if (error) { goto cleanup; }
  5280. cleanup:
  5281. my_free(new_from_name, MYF(MY_ALLOW_ZERO_PTR));
  5282. my_free(new_to_name, MYF(MY_ALLOW_ZERO_PTR));
  5283. return error;
  5284. }
  5285. //
// deletes or renames a table. If is_delete is true, then we delete, and to_name can be NULL.
// If is_delete is false, then to_name must be non-NULL, as we are renaming the table.
  5288. //
  5289. int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
  5290. int error;
  5291. DB* status_db = NULL;
  5292. DBC* status_cursor = NULL;
  5293. DB_TXN* txn = NULL;
  5294. DBT curr_key;
  5295. DBT curr_val;
  5296. bzero(&curr_key, sizeof(curr_key));
  5297. bzero(&curr_val, sizeof(curr_val));
  5298. pthread_mutex_lock(&tokudb_meta_mutex);
  5299. error = db_env->txn_begin(db_env, 0, &txn, 0);
  5300. if (error) { goto cleanup; }
  5301. //
  5302. // modify metadata db
  5303. //
  5304. if (is_delete) {
  5305. error = drop_table_from_metadata(from_name, txn);
  5306. }
  5307. else {
  5308. error = rename_table_in_metadata(from_name, to_name, txn);
  5309. }
  5310. if (error) { goto cleanup; }
  5311. //
  5312. // open status db,
  5313. // create cursor,
  5314. // for each name read out of there, create a db and delete or rename it
  5315. //
  5316. error = open_status_dictionary(&status_db, from_name, txn);
  5317. if (error) { goto cleanup; }
  5318. error = status_db->cursor(status_db, txn, &status_cursor, 0);
  5319. if (error) { goto cleanup; }
  5320. while (error != DB_NOTFOUND) {
  5321. error = status_cursor->c_get(
  5322. status_cursor,
  5323. &curr_key,
  5324. &curr_val,
  5325. DB_NEXT
  5326. );
  5327. if (error && error != DB_NOTFOUND) { goto cleanup; }
  5328. if (error == DB_NOTFOUND) { break; }
  5329. HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
  5330. if (mk != hatoku_key_name) {
  5331. continue;
  5332. }
  5333. error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
  5334. if (error) { goto cleanup; }
  5335. }
  5336. //
  5337. // delete or rename main.tokudb
  5338. //
  5339. error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
  5340. if (error) { goto cleanup; }
  5341. error = status_cursor->c_close(status_cursor);
  5342. assert(error==0);
  5343. status_cursor = NULL;
  5344. if (error) { goto cleanup; }
  5345. error = status_db->close(status_db, 0);
  5346. assert(error == 0);
  5347. status_db = NULL;
  5348. //
  5349. // delete or rename status.tokudb
  5350. //
  5351. error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
  5352. if (error) { goto cleanup; }
  5353. my_errno = error;
  5354. cleanup:
  5355. if (status_cursor) {
  5356. int r = status_cursor->c_close(status_cursor);
  5357. assert(r==0);
  5358. }
  5359. if (status_db) {
  5360. int r = status_db->close(status_db, 0);
  5361. assert(r==0);
  5362. }
  5363. if (txn) {
  5364. if (error) {
  5365. abort_txn(txn);
  5366. }
  5367. else {
  5368. commit_txn(txn, 0);
  5369. }
  5370. }
  5371. pthread_mutex_unlock(&tokudb_meta_mutex);
  5372. return error;
  5373. }
  5374. //
  5375. // Drops table
  5376. // Parameters:
  5377. // [in] name - name of table to be deleted
  5378. // Returns:
  5379. // 0 on success
  5380. // error otherwise
  5381. //
  5382. int ha_tokudb::delete_table(const char *name) {
  5383. TOKUDB_DBUG_ENTER("ha_tokudb::delete_table");
  5384. int error;
  5385. error = delete_or_rename_table(name, NULL, true);
  5386. if (error == DB_LOCK_NOTGRANTED) {
  5387. sql_print_error("Could not delete table %s because \
  5388. another transaction has accessed the table. \
  5389. To drop the table, make sure no transactions touch the table.", name);
  5390. }
  5391. TOKUDB_DBUG_RETURN(error);
  5392. }
  5393. //
  5394. // renames table from "from" to "to"
  5395. // Parameters:
  5396. // [in] name - old name of table
  5397. // [in] to - new name of table
  5398. // Returns:
  5399. // 0 on success
  5400. // error otherwise
  5401. //
  5402. int ha_tokudb::rename_table(const char *from, const char *to) {
  5403. TOKUDB_DBUG_ENTER("%s %s %s", __FUNCTION__, from, to);
  5404. int error;
  5405. error = delete_or_rename_table(from, to, false);
  5406. if (error == DB_LOCK_NOTGRANTED) {
  5407. sql_print_error("Could not rename table from %s to %s because \
  5408. another transaction has accessed the table. \
  5409. To rename the table, make sure no transactions touch the table.", from, to);
  5410. }
  5411. TOKUDB_DBUG_RETURN(error);
  5412. }
  5413. /*
  5414. Returns estimate on number of seeks it will take to read through the table
  5415. This is to be comparable to the number returned by records_in_range so
  5416. that we can decide if we should scan the table or use keys.
  5417. */
  5418. /// QQQ why divide by 3
  5419. double ha_tokudb::scan_time() {
  5420. TOKUDB_DBUG_ENTER("ha_tokudb::scan_time");
  5421. double ret_val = (double)stats.records / 3;
  5422. DBUG_RETURN(ret_val);
  5423. }
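//
// Worked example of the estimate above (hypothetical count): with
// stats.records == 3,000,000, scan_time() returns 1,000,000 "seeks", and
// the optimizer weighs records_in_range() results against that number when
// choosing between a table scan and an index.
//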
  5424. //
  5425. // Calculate the time it takes to read a set of ranges through an index
  5426. // This enables us to optimize reads for clustered indexes.
  5427. // Implementation pulled from InnoDB
  5428. // Parameters:
  5429. // index - index to use
  5430. // ranges - number of ranges
  5431. // rows - estimated number of rows in the range
  5432. // Returns:
  5433. // estimated time measured in disk seeks
  5434. //
  5435. double ha_tokudb::read_time(
  5436. uint index,
  5437. uint ranges,
  5438. ha_rows rows
  5439. )
  5440. {
  5441. double total_scan;
  5442. double ret_val;
  5443. //
  5444. // in case for hidden primary key, this is called
  5445. //
  5446. if (index >= table_share->keys) {
  5447. ret_val = handler::read_time(index, ranges, rows);
  5448. goto cleanup;
  5449. }
  5450. //
  5451. // if it is not the primary key, and it is not a clustering key, then return handler::read_time
  5452. //
  5453. if (index != primary_key && !(table->key_info[index].flags & HA_CLUSTERING)) {
  5454. ret_val = handler::read_time(index, ranges, rows);
  5455. goto cleanup;
  5456. }
  5457. //
  5458. // for primary key and for clustered keys, return a fraction of scan_time()
  5459. //
  5460. total_scan = scan_time();
  5461. if (stats.records < rows) {
  5462. ret_val = total_scan;
  5463. goto cleanup;
  5464. }
  5465. //
  5466. // one disk seek per range plus the proportional scan time of the rows
  5467. //
  5468. ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
  5469. cleanup:
  5470. return ret_val;
  5471. }
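//
// Worked example of the formula above (hypothetical numbers): for a
// clustering index with ranges == 1, rows == 1,000, and
// stats.records == 100,000, total_scan == 100,000 / 3, so the result is
// 1 + (1,000 / 100,000) * total_scan, roughly 334 "seeks".
//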
  5472. //
  5473. // Estimates the number of index records in a range. In case of errors, return
// HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR. This was the behavior
// when we got the handlerton from MySQL.
  5476. // Parameters:
  5477. // keynr -index to use
  5478. // [in] start_key - low end of the range
  5479. // [in] end_key - high end of the range
  5480. // Returns:
  5481. // 0 - There are no matching keys in the given range
  5482. // number > 0 - There are approximately number matching rows in the range
  5483. // HA_POS_ERROR - Something is wrong with the index tree
  5484. //
  5485. ha_rows ha_tokudb::records_in_range(uint keynr, key_range* start_key, key_range* end_key) {
  5486. TOKUDB_DBUG_ENTER("ha_tokudb::records_in_range");
  5487. DBT key;
  5488. ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
  5489. DB *kfile = share->key_file[keynr];
  5490. u_int64_t less, equal, greater;
  5491. u_int64_t start_rows, end_rows, rows;
  5492. int is_exact;
  5493. int error;
  5494. uchar inf_byte;
  5495. //
// get start_rows and end_rows values so that we can estimate the range.
// When calling key_range64, the only value we can trust is "less".
// The reason is that the key being passed in may be a prefix of keys in the DB;
// as a result, "equal" may be 0 while those rows are actually counted in "greater".
// So we call key_range64 on each endpoint (packed with an infinity byte)
// and take the difference of the "less" counts.
  5501. //
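// For example (hypothetical counts): if key_range64 reports less == 1200
// at the start key and less == 1250 at the end key, the code below
// estimates 1250 - 1200 == 50 rows in the range.
//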
  5502. if (start_key) {
  5503. inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ?
  5504. COL_NEG_INF : COL_POS_INF;
  5505. pack_key(
  5506. &key,
  5507. keynr,
  5508. key_buff,
  5509. start_key->key,
  5510. start_key->length,
  5511. inf_byte
  5512. );
  5513. error = kfile->key_range64(
  5514. kfile,
  5515. transaction,
  5516. &key,
  5517. &less,
  5518. &equal,
  5519. &greater,
  5520. &is_exact
  5521. );
  5522. if (error) {
  5523. ret_val = HA_TOKUDB_RANGE_COUNT;
  5524. goto cleanup;
  5525. }
  5526. start_rows= less;
  5527. }
  5528. else {
  5529. start_rows= 0;
  5530. }
  5531. if (end_key) {
  5532. inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ?
  5533. COL_NEG_INF : COL_POS_INF;
  5534. pack_key(
  5535. &key,
  5536. keynr,
  5537. key_buff,
  5538. end_key->key,
  5539. end_key->length,
  5540. inf_byte
  5541. );
  5542. error = kfile->key_range64(
  5543. kfile,
  5544. transaction,
  5545. &key,
  5546. &less,
  5547. &equal,
  5548. &greater,
  5549. &is_exact
  5550. );
  5551. if (error) {
  5552. ret_val = HA_TOKUDB_RANGE_COUNT;
  5553. goto cleanup;
  5554. }
  5555. end_rows= less;
  5556. }
  5557. else {
  5558. end_rows = stats.records;
  5559. }
  5560. rows = (end_rows > start_rows) ? end_rows - start_rows : 1;
  5561. //
  5562. // MySQL thinks a return value of 0 means there are exactly 0 rows
  5563. // Therefore, always return non-zero so this assumption is not made
  5564. //
  5565. ret_val = (ha_rows) (rows <= 1 ? 1 : rows);
  5566. cleanup:
  5567. DBUG_RETURN(ret_val);
  5568. }
  5569. //
// Initializes the auto-increment data in the local "share" object from
// the status dictionary: the largest auto-increment value used so far
// (hatoku_max_ai) and the initial value specified at table-create time
// (hatoku_ai_create_value); get_auto_increment() later takes the greater
// of the two.
  5573. //
  5574. void ha_tokudb::init_auto_increment() {
  5575. DBT key;
  5576. DBT value;
  5577. int error;
  5578. HA_METADATA_KEY key_val = hatoku_max_ai;
  5579. bzero(&key, sizeof(key));
  5580. bzero(&value, sizeof(value));
  5581. key.data = &key_val;
  5582. key.size = sizeof(key_val);
  5583. value.flags = DB_DBT_USERMEM;
  5584. DB_TXN* txn = NULL;
  5585. error = db_env->txn_begin(db_env, 0, &txn, 0);
  5586. if (error) {
  5587. share->last_auto_increment = 0;
  5588. }
  5589. else {
  5590. //
  5591. // First retrieve hatoku_max_ai, which is max value used by auto increment
  5592. // column so far, the max value could have been auto generated (e.g. insert (NULL))
  5593. // or it could have been manually inserted by user (e.g. insert (345))
  5594. //
  5595. value.ulen = sizeof(share->last_auto_increment);
  5596. value.data = &share->last_auto_increment;
  5597. error = share->status_block->get(
  5598. share->status_block,
  5599. txn,
  5600. &key,
  5601. &value,
  5602. 0
  5603. );
  5604. if (error || value.size != sizeof(share->last_auto_increment)) {
  5605. share->last_auto_increment = 0;
  5606. }
  5607. //
  5608. // Now retrieve the initial auto increment value, as specified by create table
  5609. // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
  5610. // then the value 100 should be stored here
  5611. //
  5612. key_val = hatoku_ai_create_value;
  5613. value.ulen = sizeof(share->auto_inc_create_value);
  5614. value.data = &share->auto_inc_create_value;
  5615. error = share->status_block->get(
  5616. share->status_block,
  5617. txn,
  5618. &key,
  5619. &value,
  5620. 0
  5621. );
  5622. if (error || value.size != sizeof(share->auto_inc_create_value)) {
  5623. share->auto_inc_create_value = 0;
  5624. }
  5625. commit_txn(txn, 0);
  5626. }
  5627. if (tokudb_debug & TOKUDB_DEBUG_AUTO_INCREMENT) {
  5628. TOKUDB_TRACE("init auto increment:%lld\n", share->last_auto_increment);
  5629. }
  5630. }
  5631. void ha_tokudb::get_auto_increment(ulonglong offset, ulonglong increment, ulonglong nb_desired_values, ulonglong * first_value, ulonglong * nb_reserved_values) {
  5632. TOKUDB_DBUG_ENTER("ha_tokudb::get_auto_increment");
  5633. ulonglong nr;
  5634. pthread_mutex_lock(&share->mutex);
  5635. if (share->auto_inc_create_value > share->last_auto_increment) {
  5636. nr = share->auto_inc_create_value;
  5637. share->last_auto_increment = share->auto_inc_create_value;
  5638. }
  5639. else {
  5640. nr = share->last_auto_increment + increment;
  5641. }
  5642. share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
  5643. if (delay_updating_ai_metadata) {
  5644. ai_metadata_update_required = true;
  5645. }
  5646. else {
  5647. update_max_auto_inc(share->status_block, share->last_auto_increment);
  5648. }
  5649. if (tokudb_debug & TOKUDB_DEBUG_AUTO_INCREMENT) {
  5650. TOKUDB_TRACE("get_auto_increment(%lld,%lld,%lld):got:%lld:%lld\n",
  5651. offset, increment, nb_desired_values, nr, nb_desired_values);
  5652. }
  5653. *first_value = nr;
  5654. *nb_reserved_values = nb_desired_values;
  5655. pthread_mutex_unlock(&share->mutex);
  5656. DBUG_VOID_RETURN;
  5657. }
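//
// Worked example of the reservation above (hypothetical values): with
// last_auto_increment == 10, auto_inc_create_value == 0, increment == 2,
// and nb_desired_values == 3, nr becomes 12, last_auto_increment becomes
// 12 + (3 - 1) * 2 == 16, and the caller receives first_value == 12 with
// 3 reserved values: 12, 14, and 16.
//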
  5658. bool ha_tokudb::is_auto_inc_singleton(){
  5659. return false;
  5660. }
  5661. //
  5662. // Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
  5663. // DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
  5664. // that the table will be modified and that these added keys will be appended to the end
  5665. // of the array table->key_info
  5666. // Parameters:
  5667. // [in] table_arg - table that is being modified, seems to be identical to this->table
  5668. // [in] key_info - array of KEY's to be added
  5669. // num_of_keys - number of keys to be added, number of elements in key_info
  5670. // Returns:
  5671. // 0 on success, error otherwise
  5672. //
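// In outline, the build below proceeds as follows (a summary of the code,
// not additional behavior): create and open a dictionary per new key,
// pre-acquire a read lock on the entire main DB, create a bulk loader over
// the new dictionaries, scan the primary dictionary feeding each row to
// loader->put(), close the loader, verify any HA_NOSAME keys with
// is_index_unique(), and finally record the new key names in status.tokudb.
//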
  5673. int ha_tokudb::add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys) {
  5674. TOKUDB_DBUG_ENTER("ha_tokudb::add_index");
  5675. int error;
  5676. uint curr_index = 0;
  5677. DBC* tmp_cursor = NULL;
  5678. int cursor_ret_val = 0;
  5679. DBT curr_pk_key, curr_pk_val;
  5680. DB_TXN* txn = NULL;
  5681. THD* thd = ha_thd();
  5682. DB_LOADER* loader = NULL;
  5683. u_int32_t loader_flags = (get_load_save_space(thd)) ? LOADER_USE_PUTS : 0;
  5684. u_int32_t mult_put_flags[MAX_KEY + 1] = {DB_YESOVERWRITE};
  5685. u_int32_t mult_dbt_flags[MAX_KEY + 1] = {DB_DBT_REALLOC};
  5686. struct loader_context lc = {0};
  5687. lc.thd = thd;
  5688. lc.ha = this;
  5689. loader_error = 0;
  5690. //
  5691. // number of DB files we have open currently, before add_index is executed
  5692. //
  5693. uint curr_num_DBs = table_arg->s->keys + test(hidden_primary_key);
  5694. //
  5695. // status message to be shown in "show process list"
  5696. //
  5697. char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
  5698. ulonglong num_processed = 0; //variable that stores number of elements inserted thus far
  5699. read_lock_wait_time = get_read_lock_wait_time(ha_thd());
  5700. thd_proc_info(thd, "Adding indexes");
  5701. error = db_env->txn_begin(db_env, 0, &txn, 0);
  5702. if (error) { goto cleanup; }
  5703. //
  5704. // in unpack_row, MySQL passes a buffer that is this long,
  5705. // so this length should be good enough for us as well
  5706. //
  5707. bzero((void *) &curr_pk_key, sizeof(curr_pk_key));
  5708. bzero((void *) &curr_pk_val, sizeof(curr_pk_val));
  5709. //
  5710. // The files for secondary tables are derived from the name of keys
  5711. // If we try to add a key with the same name as an already existing key,
  5712. // We can crash. So here we check if any of the keys added has the same
  5713. // name of an existing key, and if so, we fail gracefully
  5714. //
  5715. for (uint i = 0; i < num_of_keys; i++) {
  5716. for (uint j = 0; j < table_arg->s->keys; j++) {
  5717. if (strcmp(key_info[i].name, table_arg->s->key_info[j].name) == 0) {
  5718. error = HA_ERR_WRONG_COMMAND;
  5719. goto cleanup;
  5720. }
  5721. }
  5722. }
  5723. //
  5724. // open all the DB files and set the appropriate variables in share
  5725. // they go to the end of share->key_file
  5726. //
  5727. curr_index = curr_num_DBs;
  5728. for (uint i = 0; i < num_of_keys; i++, curr_index++) {
  5729. if (key_info[i].flags & HA_CLUSTERING) {
  5730. set_key_filter(
  5731. &share->kc_info.key_filters[curr_index],
  5732. &key_info[i],
  5733. table_arg,
  5734. false
  5735. );
  5736. if (!hidden_primary_key) {
  5737. set_key_filter(
  5738. &share->kc_info.key_filters[curr_index],
  5739. &table_arg->key_info[primary_key],
  5740. table_arg,
  5741. false
  5742. );
  5743. }
  5744. error = initialize_col_pack_info(&share->kc_info,table_arg->s,curr_index);
  5745. if (error) {
  5746. goto cleanup;
  5747. }
  5748. }
  5749. error = create_secondary_dictionary(share->table_name, table_arg, &key_info[i], txn, &share->kc_info, curr_index);
  5750. if (error) { goto cleanup; }
  5751. error = open_secondary_dictionary(
  5752. &share->key_file[curr_index],
  5753. &key_info[i],
  5754. share->table_name,
  5755. false,
  5756. txn
  5757. );
  5758. if (error) { goto cleanup; }
  5759. }
  5760. //
  5761. // grab some locks to make this go faster
  5762. // first a global read lock on the main DB, because
  5763. // we intend to scan the entire thing
  5764. //
  5765. lockretryN(read_lock_wait_time){
  5766. error = share->file->pre_acquire_read_lock(
  5767. share->file,
  5768. txn,
  5769. share->file->dbt_neg_infty(),
  5770. NULL,
  5771. share->file->dbt_pos_infty(),
  5772. NULL
  5773. );
  5774. lockretry_wait;
  5775. }
  5776. if (error) { goto cleanup; }
  5777. error = db_env->create_loader(
  5778. db_env,
  5779. txn,
  5780. &loader,
  5781. NULL, // no src_db needed
  5782. num_of_keys,
  5783. &share->key_file[curr_num_DBs],
  5784. mult_put_flags,
  5785. mult_dbt_flags,
  5786. loader_flags
  5787. );
  5788. if (error) { goto cleanup; }
  5789. error = loader->set_poll_function(loader, poll_fun, &lc);
  5790. if (error) { goto cleanup; }
  5791. error = loader->set_error_callback(loader, loader_ai_err_fun, &lc);
  5792. if (error) { goto cleanup; }
  5793. //
  5794. // scan primary table, create each secondary key, add to each DB
  5795. //
  5796. if ((error = share->file->cursor(share->file, txn, &tmp_cursor, 0))) {
  5797. tmp_cursor = NULL; // Safety
  5798. goto cleanup;
  5799. }
  5800. cursor_ret_val = tmp_cursor->c_get(tmp_cursor, &curr_pk_key, &curr_pk_val, DB_NEXT | DB_PRELOCKED);
  5801. while (cursor_ret_val != DB_NOTFOUND) {
  5802. if (cursor_ret_val) {
  5803. error = cursor_ret_val;
  5804. goto cleanup;
  5805. }
  5806. error = loader->put(loader, &curr_pk_key, &curr_pk_val);
  5807. if (error) { goto cleanup; }
  5808. num_processed++;
  5809. if ((num_processed % 1000) == 0) {
  5810. sprintf(status_msg, "Adding indexes: Processed %llu of about %llu rows.", num_processed, (long long unsigned) share->rows);
  5811. thd_proc_info(thd, status_msg);
  5812. if (thd->killed) {
  5813. error = ER_ABORTING_CONNECTION;
  5814. goto cleanup;
  5815. }
  5816. }
  5817. cursor_ret_val = tmp_cursor->c_get(tmp_cursor, &curr_pk_key, &curr_pk_val, DB_NEXT | DB_PRELOCKED);
  5818. }
  5819. error = tmp_cursor->c_close(tmp_cursor);
  5820. assert(error==0);
  5821. tmp_cursor = NULL;
  5822. error = loader->close(loader);
  5823. loader = NULL;
  5824. if (error) goto cleanup;
  5825. curr_index = curr_num_DBs;
  5826. for (uint i = 0; i < num_of_keys; i++, curr_index++) {
  5827. if (key_info[i].flags & HA_NOSAME) {
  5828. bool is_unique;
  5829. error = is_index_unique(
  5830. &is_unique,
  5831. txn,
  5832. share->key_file[curr_index],
  5833. &key_info[i]
  5834. );
  5835. if (error) goto cleanup;
  5836. if (!is_unique) {
  5837. error = HA_ERR_FOUND_DUPP_KEY;
  5838. last_dup_key = i;
  5839. goto cleanup;
  5840. }
  5841. }
  5842. }
  5843. //
  5844. // We have an accurate row count, might as well update share->rows
  5845. //
  5846. pthread_mutex_lock(&share->mutex);
  5847. share->rows = num_processed;
  5848. pthread_mutex_unlock(&share->mutex);
  5849. //
  5850. // now write stuff to status.tokudb
  5851. //
  5852. pthread_mutex_lock(&share->mutex);
  5853. for (uint i = 0; i < num_of_keys; i++) {
  5854. write_key_name_to_status(share->status_block, key_info[i].name, txn);
  5855. }
  5856. pthread_mutex_unlock(&share->mutex);
  5857. error = 0;
  5858. cleanup:
  5859. if (tmp_cursor) {
  5860. int r = tmp_cursor->c_close(tmp_cursor);
  5861. assert(r==0);
  5862. tmp_cursor = NULL;
  5863. }
  5864. if (loader != NULL) {
  5865. loader->abort(loader);
  5866. }
  5867. if (txn) {
  5868. if (error) {
  5869. curr_index = curr_num_DBs;
  5870. for (uint i = 0; i < num_of_keys; i++, curr_index++) {
  5871. if (share->key_file[curr_index]) {
  5872. int r = share->key_file[curr_index]->close(
  5873. share->key_file[curr_index],
  5874. 0
  5875. );
  5876. assert(r==0);
  5877. share->key_file[curr_index] = NULL;
  5878. }
  5879. }
  5880. abort_txn(txn);
  5881. }
  5882. else {
  5883. commit_txn(txn,0);
  5884. }
  5885. }
  5886. if (error == DB_LOCK_NOTGRANTED) {
  5887. sql_print_error("Could not add indexes to table %s because \
  5888. another transaction has accessed the table. \
  5889. To add indexes, make sure no transactions touch the table.", share->table_name);
  5890. }
  5891. TOKUDB_DBUG_RETURN(error ? error : loader_error);
  5892. }
  5893. //
// Prepares to drop indexes from the table. For each value, i, in the array key_num,
// table->key_info[i] is a key that is to be dropped.
// ***********NOTE*******************
// Although prepare_drop_index is supposed to just get the DB's ready for removal,
// and not actually do the removal, we are doing it here and not in final_drop_index.
// For the flags we expose in alter_table_flags, namely xxx_NO_WRITES, this is allowed.
// Changes for "future-proofing" this so that it works when we have the equivalent flags
// that are not NO_WRITES are not worth it at the moment.
  5902. // Parameters:
  5903. // [in] table_arg - table that is being modified, seems to be identical to this->table
  5904. // [in] key_num - array of indexes that specify which keys of the array table->key_info
  5905. // are to be dropped
  5906. // num_of_keys - size of array, key_num
  5907. // Returns:
  5908. // 0 on success, error otherwise
  5909. //
  5910. int ha_tokudb::prepare_drop_index(TABLE *table_arg, uint *key_num, uint num_of_keys) {
  5911. TOKUDB_DBUG_ENTER("ha_tokudb::prepare_drop_index");
  5912. int error;
  5913. DB_TXN* txn = NULL;
  5914. error = db_env->txn_begin(db_env, 0, &txn, 0);
  5915. if (error) { goto cleanup; }
  5916. for (uint i = 0; i < num_of_keys; i++) {
  5917. uint curr_index = key_num[i];
  5918. int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
  5919. assert(r==0);
  5920. share->key_file[curr_index] = NULL;
  5921. error = remove_key_name_from_status(share->status_block, table_arg->key_info[curr_index].name, txn);
  5922. if (error) { goto cleanup; }
  5923. error = delete_or_rename_dictionary(share->table_name, NULL, table_arg->key_info[curr_index].name, true, txn, true);
  5924. if (error) { goto cleanup; }
  5925. }
  5926. cleanup:
  5927. if (txn) {
  5928. if (error) {
  5929. abort_txn(txn);
  5930. }
  5931. else {
  5932. commit_txn(txn,0);
  5933. }
  5934. }
  5935. if (error == DB_LOCK_NOTGRANTED) {
  5936. sql_print_error("Could not drop indexes from table %s because \
  5937. another transaction has accessed the table. \
  5938. To drop indexes, make sure no transactions touch the table.", share->table_name);
  5939. }
  5940. TOKUDB_DBUG_RETURN(error);
  5941. }
// ***********NOTE*******************
// Although prepare_drop_index is supposed to just get the DB's ready for removal,
// and not actually do the removal, we are doing it there and not in final_drop_index.
// For the flags we expose in alter_table_flags, namely xxx_NO_WRITES, this is allowed.
// Changes for "future-proofing" this so that it works when we have the equivalent flags
// that are not NO_WRITES are not worth it at the moment; therefore, we can make
// this function just return.
  5949. int ha_tokudb::final_drop_index(TABLE *table_arg) {
  5950. TOKUDB_DBUG_ENTER("ha_tokudb::final_drop_index");
  5951. TOKUDB_DBUG_RETURN(0);
  5952. }
  5953. void ha_tokudb::print_error(int error, myf errflag) {
  5954. if (error == DB_LOCK_DEADLOCK)
  5955. error = HA_ERR_LOCK_DEADLOCK;
  5956. if (error == DB_LOCK_NOTGRANTED)
  5957. error = HA_ERR_LOCK_WAIT_TIMEOUT;
  5958. if (error == ENOSPC) {
  5959. error = HA_ERR_DISK_FULL;
  5960. }
  5961. if (error == DB_KEYEXIST) {
  5962. error = HA_ERR_FOUND_DUPP_KEY;
  5963. }
  5964. handler::print_error(error, errflag);
  5965. }
  5966. #if 0 // QQQ use default
  5967. //
  5968. // This function will probably need to be redone from scratch
  5969. // if we ever choose to implement it
  5970. //
  5971. int ha_tokudb::analyze(THD * thd, HA_CHECK_OPT * check_opt) {
  5972. uint i;
  5973. DB_BTREE_STAT *stat = 0;
  5974. DB_TXN_STAT *txn_stat_ptr = 0;
  5975. tokudb_trx_data *trx = (tokudb_trx_data *) thd->ha_data[tokudb_hton->slot];
  5976. DBUG_ASSERT(trx);
  5977. for (i = 0; i < table_share->keys; i++) {
  5978. if (stat) {
  5979. free(stat);
  5980. stat = 0;
  5981. }
  5982. if ((key_file[i]->stat) (key_file[i], trx->all, (void *) &stat, 0))
  5983. goto err;
  5984. share->rec_per_key[i] = (stat->bt_ndata / (stat->bt_nkeys ? stat->bt_nkeys : 1));
  5985. }
  5986. /* A hidden primary key is not in key_file[] */
  5987. if (hidden_primary_key) {
  5988. if (stat) {
  5989. free(stat);
  5990. stat = 0;
  5991. }
  5992. if ((file->stat) (file, trx->all, (void *) &stat, 0))
  5993. goto err;
  5994. }
  5995. pthread_mutex_lock(&share->mutex);
  5996. share->status |= STATUS_TOKUDB_ANALYZE; // Save status on close
  5997. share->version++; // Update stat in table
  5998. pthread_mutex_unlock(&share->mutex);
  5999. update_status(share, table); // Write status to file
  6000. if (stat)
  6001. free(stat);
  6002. return ((share->status & STATUS_TOKUDB_ANALYZE) ? HA_ADMIN_FAILED : HA_ADMIN_OK);
  6003. err:
  6004. if (stat)
  6005. free(stat);
  6006. return HA_ADMIN_FAILED;
  6007. }
  6008. #endif
  6009. //
// flattens all DBs in this table; to do so, just do a full scan on every DB
  6011. //
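// (Reached via OPTIMIZE TABLE. The scan itself is the work: reading every
// row with DB_PRELOCKED touches every leaf of each dictionary, which is
// what "flattening" refers to here; this is an interpretation of the
// comment above rather than documented behavior.)
//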
  6012. int ha_tokudb::optimize(THD * thd, HA_CHECK_OPT * check_opt) {
  6013. TOKUDB_DBUG_ENTER("ha_tokudb::optimize");
  6014. int error;
  6015. DBC* tmp_cursor = NULL;
  6016. tokudb_trx_data *trx = NULL;
  6017. DB_TXN* txn = NULL;
  6018. bool do_commit = false;
  6019. uint curr_num_DBs = table->s->keys + test(hidden_primary_key);
  6020. trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);
  6021. if (trx == NULL) {
  6022. error = HA_ERR_UNSUPPORTED;
  6023. goto cleanup;
  6024. }
  6025. //
  6026. // optimize may be called without a valid transaction, so we have to do this
  6027. // in order to get a valid transaction
  6028. // this is a bit hacky, but it is the best we have right now
  6029. //
  6030. txn = trx->stmt ? trx->stmt : trx->sp_level;
  6031. if (txn == NULL) {
  6032. error = db_env->txn_begin(db_env, NULL, &txn, 0);
  6033. if (error) {
  6034. goto cleanup;
  6035. }
  6036. do_commit = true;
  6037. }
  6038. //
  6039. // prelock so each scan goes faster
  6040. //
  6041. error = acquire_table_lock(txn,lock_read);
  6042. if (error) {
  6043. goto cleanup;
  6044. }
  6045. //
  6046. // for each DB, scan through entire table and do nothing
  6047. //
  6048. for (uint i = 0; i < curr_num_DBs; i++) {
  6049. error = share->key_file[i]->cursor(share->key_file[i], txn, &tmp_cursor, 0);
  6050. if (error) {
  6051. tmp_cursor = NULL;
  6052. goto cleanup;
  6053. }
  6054. while (error != DB_NOTFOUND) {
  6055. error = tmp_cursor->c_getf_next(tmp_cursor, DB_PRELOCKED, smart_dbt_do_nothing, NULL);
  6056. if (error && error != DB_NOTFOUND) {
  6057. goto cleanup;
  6058. }
  6059. }
  6060. error = tmp_cursor->c_close(tmp_cursor);
  6061. assert(error==0);
  6062. tmp_cursor = NULL;
  6063. }
  6064. error = 0;
  6065. cleanup:
  6066. if (tmp_cursor) {
  6067. int r = tmp_cursor->c_close(tmp_cursor);
  6068. assert(r==0);
  6069. tmp_cursor = NULL;
  6070. }
  6071. if (do_commit) {
  6072. commit_txn(txn, 0);
  6073. }
  6074. TOKUDB_DBUG_RETURN(error);
  6075. }
  6076. //
// truncates the dictionary associated with the keynr index using transaction txn;
// does so by deleting and then recreating the dictionary in the context
// of that transaction
  6080. //
  6081. int ha_tokudb::truncate_dictionary( uint keynr, DB_TXN* txn ) {
  6082. int error;
  6083. bool is_pk = (keynr == primary_key);
  6084. error = share->key_file[keynr]->close(share->key_file[keynr], 0);
  6085. assert(error == 0);
  6086. share->key_file[keynr] = NULL;
  6087. if (is_pk) { share->file = NULL; }
  6088. if (is_pk) {
  6089. error = delete_or_rename_dictionary(
  6090. share->table_name,
  6091. NULL,
  6092. "main",
  6093. false, //is_key
  6094. txn,
  6095. true // is a delete
  6096. );
  6097. if (error) { goto cleanup; }
  6098. }
  6099. else {
  6100. error = delete_or_rename_dictionary(
  6101. share->table_name,
  6102. NULL,
  6103. table_share->key_info[keynr].name,
  6104. true, //is_key
  6105. txn,
  6106. true // is a delete
  6107. );
  6108. if (error) { goto cleanup; }
  6109. }
  6110. if (is_pk) {
  6111. error = create_main_dictionary(share->table_name, table, txn, &share->kc_info);
  6112. }
  6113. else {
  6114. error = create_secondary_dictionary(
  6115. share->table_name,
  6116. table,
  6117. &table_share->key_info[keynr],
  6118. txn,
  6119. &share->kc_info,
  6120. keynr
  6121. );
  6122. }
  6123. if (error) { goto cleanup; }
  6124. cleanup:
  6125. return error;
  6126. }
  6127. // delete all rows from a table
  6128. //
  6129. // effects: delete all of the rows in the main dictionary and all of the
// indices. this must be atomic, so we use a single transaction
// for all of the truncate operations.
  6132. // locks: if we have an exclusive table write lock, all of the concurrency
  6133. // issues go away.
  6134. // returns: 0 if success
  6135. int ha_tokudb::delete_all_rows() {
  6136. TOKUDB_DBUG_ENTER("delete_all_rows");
  6137. int error = 0;
  6138. uint curr_num_DBs = 0;
  6139. DB_TXN* txn = NULL;
  6140. error = db_env->txn_begin(db_env, 0, &txn, 0);
  6141. if (error) { goto cleanup; }
  6142. if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
  6143. share->try_table_lock = true;
  6144. error = HA_ERR_WRONG_COMMAND;
  6145. goto cleanup;
  6146. }
  6147. curr_num_DBs = table->s->keys + test(hidden_primary_key);
  6148. for (uint i = 0; i < curr_num_DBs; i++) {
  6149. error = truncate_dictionary(i, txn);
  6150. if (error) { goto cleanup; }
  6151. }
  6152. // zap the row count
  6153. if (error == 0) {
  6154. share->rows = 0;
  6155. }
  6156. share->try_table_lock = true;
  6157. cleanup:
  6158. if (txn) {
  6159. if (error) {
  6160. abort_txn(txn);
  6161. }
  6162. else {
  6163. commit_txn(txn,0);
  6164. }
  6165. }
  6166. if (error == DB_LOCK_NOTGRANTED) {
  6167. sql_print_error("Could not truncate table %s because \
  6168. another transaction has accessed the table. \
  6169. To truncate the table, make sure no transactions touch the table.", share->table_name);
  6170. }
  6171. //
  6172. // regardless of errors, need to reopen the DB's
  6173. //
  6174. for (uint i = 0; i < curr_num_DBs; i++) {
  6175. int r = 0;
  6176. if (share->key_file[i] == NULL) {
  6177. if (i != primary_key) {
  6178. r = open_secondary_dictionary(
  6179. &share->key_file[i],
  6180. &table_share->key_info[i],
  6181. share->table_name,
  6182. false, //
  6183. NULL
  6184. );
  6185. assert(!r);
  6186. }
  6187. else {
  6188. r = open_main_dictionary(
  6189. share->table_name,
  6190. false,
  6191. NULL
  6192. );
  6193. assert(!r);
  6194. }
  6195. }
  6196. }
  6197. TOKUDB_DBUG_RETURN(error);
  6198. }
  6199. void ha_tokudb::set_loader_error(int err) {
  6200. loader_error = err;
  6201. }
  6202. void ha_tokudb::set_dup_value_for_pk(DBT* key) {
  6203. assert(!hidden_primary_key);
  6204. unpack_key(table->record[0],key,primary_key);
  6205. last_dup_key = primary_key;
  6206. }