/*****************************************************************************

Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/
/**************************************************//**
@file btr/btr0cur.cc
The index tree cursor

All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve a number of pages equal to
2 x the height of the index tree in the tablespace before we start
the operation, because if leaf splitting has been started, it is
difficult to undo, except by crashing the database and doing a
roll-forward.

Created 10/16/1994 Heikki Tuuri
*******************************************************/
#include "btr0cur.h"

#ifdef UNIV_NONINL
#include "btr0cur.ic"
#endif

#include "row0upd.h"
#ifndef UNIV_HOTBACKUP
#include "mtr0log.h"
#include "page0page.h"
#include "page0zip.h"
#include "rem0rec.h"
#include "rem0cmp.h"
#include "buf0lru.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "row0log.h"
#include "row0purge.h"
#include "row0upd.h"
#include "trx0rec.h"
#include "trx0roll.h" /* trx_is_recv() */
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "zlib.h"
/** Buffered B-tree operation types, introduced as part of delete buffering. */
enum btr_op_t {
	BTR_NO_OP = 0,			/*!< Not buffered */
	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
	BTR_DELMARK_OP			/*!< Mark a record for deletion */
};
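
/* Illustrative note (not in the original source): the btr_op_t value is
derived from the latch_mode flags passed to btr_cur_search_to_nth_level().
For example, a latch mode of (BTR_MODIFY_LEAF | BTR_INSERT
| BTR_IGNORE_SEC_UNIQUE) maps to BTR_INSERT_IGNORE_UNIQUE_OP; see the
switch on (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK) in that function. */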
#ifdef UNIV_DEBUG
/** If the following is set to TRUE, this module prints a lot of
trace information about individual record operations */
UNIV_INTERN ibool	btr_cur_print_record_ops = FALSE;
#endif /* UNIV_DEBUG */

/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
UNIV_INTERN ulint	btr_cur_n_non_sea	= 0;
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
UNIV_INTERN ulint	btr_cur_n_sea		= 0;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
UNIV_INTERN ulint	btr_cur_n_non_sea_old	= 0;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
UNIV_INTERN ulint	btr_cur_n_sea_old	= 0;

#ifdef UNIV_DEBUG
/* Flag to limit optimistic insert records */
UNIV_INTERN uint	btr_cur_limit_optimistic_insert_debug = 0;
#endif /* UNIV_DEBUG */

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by a page reorganize, then the page is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
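
/* Worked example (illustrative, not in the original source): with the
default UNIV_PAGE_SIZE of 16384 bytes, BTR_CUR_PAGE_REORGANIZE_LIMIT is
16384 / 32 = 512 bytes. */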
/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */
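
/* Hedged sketch (assumed usage, not part of this file): both header
fields are 4-byte big-endian integers, so a reader positioned at the
start of a BLOB part header would fetch them with something like

	part_len  = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_PART_LEN);
	next_page = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_NEXT_PAGE_NO);

where next_page == FIL_NULL marks the last part of the chain. */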
/** Estimated table-level stats from sampled value.
@param value		sampled stats
@param index		index being sampled
@param sample		number of sampled rows
@param ext_size		external stored data size
@param not_empty	table not empty
@return estimated table-wide stats from sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\
	(((value) * (ib_int64_t) index->stat_n_leaf_pages \
	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))

/* @} */
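
/* Worked example for BTR_TABLE_STATS_FROM_SAMPLE (illustrative numbers,
not from the original source): if index->stat_n_leaf_pages == 100,
sample == 10, ext_size == 0 and not_empty == 1, then a sampled value of 4
scales to (4 * 100 + 10 - 1 + 0 + 1) / (10 + 0) = 41 table-wide. The
"(sample) - 1" term makes the division round up, and "not_empty" ensures
a non-zero estimate for a non-empty table. */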
#endif /* !UNIV_HOTBACKUP */

/** A BLOB field reference full of zero bytes, for use in assertions and
tests. Initially, BLOB field references are set to zero, in
dtuple_convert_big_rec(). */
const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE] = {
	0, 0, 0, 0, 0,
	0, 0, 0, 0, 0,
	0, 0, 0, 0, 0,
	0, 0, 0, 0, 0,
};
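
/* Hedged usage sketch (assumed, not part of this file): a zero BLOB
reference is typically detected by a byte-wise comparison, e.g.

	ut_ad(memcmp(field_ref, field_ref_zero,
		     BTR_EXTERN_FIELD_REF_SIZE));

to assert that a reference has already been filled in. */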
#ifndef UNIV_HOTBACKUP
/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,	/*!< in: cursor positioned on a page */
	ulint		height,	/*!< in: height of the page in tree;
				0 means leaf node */
	ulint		root_height);/*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
/***********************************************************//**
Gets the externally stored size of a record, in units of a database page.
@return externally stored part, in units of a database page */
static
ulint
btr_rec_get_externally_stored_len(
/*==============================*/
	const rec_t*	rec,	/*!< in: record */
	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
#endif /* !UNIV_HOTBACKUP */
/******************************************************//**
The following function is used to set the deleted bit of a record. */
UNIV_INLINE
void
btr_rec_set_deleted_flag(
/*=====================*/
	rec_t*		rec,	/*!< in/out: physical record */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page (or NULL) */
	ulint		flag)	/*!< in: nonzero if delete marked */
{
	if (page_rec_is_comp(rec)) {
		rec_set_deleted_flag_new(rec, page_zip, flag);
	} else {
		ut_ad(!page_zip);
		rec_set_deleted_flag_old(rec, flag);
	}
}
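
/* Usage sketch (illustrative, follows directly from the signature above):
delete-marking a record on an uncompressed page passes NULL for page_zip,

	btr_rec_set_deleted_flag(rec, NULL, TRUE);

while for a record on a compressed page the page_zip descriptor of the
block must be passed so that the compressed copy is updated as well. */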
#ifndef UNIV_HOTBACKUP
/*==================== B-TREE SEARCH =========================*/

/********************************************************************//**
Latches the leaf page or pages requested. */
static
void
btr_cur_latch_leaves(
/*=================*/
	page_t*		page,	/*!< in: leaf page where the search
				converged */
	ulint		space,	/*!< in: space id */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint		page_no,/*!< in: page number of the leaf */
	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,	/*!< in: cursor */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint		mode;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	get_block;

	ut_ad(page && mtr);

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
		get_block = btr_block_get(
			space, zip_size, page_no, mode, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;
		return;
	case BTR_MODIFY_TREE:
		/* x-latch also brothers from left to right */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {
			get_block = btr_block_get(
				space, zip_size, left_page_no,
				RW_X_LATCH, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		get_block = btr_block_get(
			space, zip_size, page_no,
			RW_X_LATCH, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;

		right_page_no = btr_page_get_next(page, mtr);

		if (right_page_no != FIL_NULL) {
			get_block = btr_block_get(
				space, zip_size, right_page_no,
				RW_X_LATCH, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_prev(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		return;
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also left brother */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {
			get_block = btr_block_get(
				space, zip_size,
				left_page_no, mode, cursor->index, mtr);
			cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		get_block = btr_block_get(
			space, zip_size, page_no, mode, cursor->index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;
		return;
	}

	ut_error;
}
/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, cursor is left at the place where an insert of the
search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
UNIV_INTERN
void
btr_cur_search_to_nth_level(
/*========================*/
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the tree level of search */
	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
				tuple must be set so that it cannot get
				compared to the node ptr page number field! */
	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
				Inserts should always be made using
				PAGE_CUR_LE to search the position! */
	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ..., ORed with
				at most one of BTR_INSERT, BTR_DELETE_MARK,
				BTR_DELETE, or BTR_ESTIMATE;
				cursor->left_block is used to store a pointer
				to the left neighbor page, in the cases
				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch is != 0, we
				may not have a latch set on the cursor page;
				we assume the caller uses the search latch
				to protect the record! */
	btr_cur_t*	cursor,	/*!< in/out: tree cursor; the cursor page is
				s- or x-latched, but see also above! */
	ulint		has_search_latch,/*!< in: info on the latch mode the
				caller currently has on btr_search_latch:
				RW_S_LATCH, or 0 */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page;
	buf_block_t*	block;
	ulint		space;
	buf_block_t*	guess;
	ulint		height;
	ulint		page_no;
	ulint		up_match;
	ulint		up_bytes;
	ulint		low_match;
	ulint		low_bytes;
	ulint		savepoint;
	ulint		rw_latch;
	ulint		page_mode;
	ulint		buf_mode;
	ulint		estimate;
	ulint		zip_size;
	page_cur_t*	page_cursor;
	btr_op_t	btr_op;
	ulint		root_height = 0; /* remove warning */

#ifdef BTR_CUR_ADAPT
	btr_search_t*	info;
#endif
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending at upper levels */

	ut_ad(level == 0 || mode == PAGE_CUR_LE);
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
	ut_ad(dtuple_check_typed(tuple));
	ut_ad(!(index->type & DICT_FTS));
	ut_ad(index->page != FIL_NULL);

	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
	UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
	UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
#ifdef UNIV_DEBUG
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif

	ibool	s_latch_by_caller;

	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;

	ut_ad(!s_latch_by_caller
	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
				   MTR_MEMO_S_LOCK));

	/* These flags are mutually exclusive, they are lumped together
	with the latch mode for historical reasons. It's possible for
	none of the flags to be set. */
	switch (UNIV_EXPECT(latch_mode
			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
			    0)) {
	case 0:
		btr_op = BTR_NO_OP;
		break;
	case BTR_INSERT:
		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
			? BTR_INSERT_IGNORE_UNIQUE_OP
			: BTR_INSERT_OP;
		break;
	case BTR_DELETE:
		btr_op = BTR_DELETE_OP;
		ut_a(cursor->purge_node);
		break;
	case BTR_DELETE_MARK:
		btr_op = BTR_DELMARK_OP;
		break;
	default:
		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
		should be specified at a time */
		ut_error;
	}

	/* Operations on the insert buffer tree cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
	/* Operations on the clustered index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));

	estimate = latch_mode & BTR_ESTIMATE;

	/* Turn the flags unrelated to the latch mode off. */
	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);

	ut_ad(!s_latch_by_caller
	      || latch_mode == BTR_SEARCH_LEAF
	      || latch_mode == BTR_MODIFY_LEAF);

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

#ifndef BTR_CUR_ADAPT
	guess = NULL;
#else
	info = btr_search_get_info(index);

	guess = info->root_guess;

#ifdef BTR_CUR_HASH_ADAPT

# ifdef UNIV_SEARCH_PERF_STAT
	info->n_searches++;
# endif
	if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
	    && latch_mode <= BTR_MODIFY_LEAF
	    && info->last_hash_succ
	    && !estimate
# ifdef PAGE_CUR_LE_OR_EXTENDS
	    && mode != PAGE_CUR_LE_OR_EXTENDS
# endif /* PAGE_CUR_LE_OR_EXTENDS */
	    /* If !has_search_latch, we do a dirty read of
	    btr_search_enabled below, and btr_search_guess_on_hash()
	    will have to check it again. */
	    && UNIV_LIKELY(btr_search_enabled)
	    && btr_search_guess_on_hash(index, info, tuple, mode,
					latch_mode, cursor,
					has_search_latch, mtr)) {

		/* Search using the hash index succeeded */

		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		btr_cur_n_sea++;

		return;
	}
# endif /* BTR_CUR_HASH_ADAPT */
#endif /* BTR_CUR_ADAPT */
	btr_cur_n_non_sea++;

	/* If the hash search did not succeed, do binary search down the
	tree */

	if (has_search_latch) {
		/* Release possible search latch to obey latching order */
		rw_lock_s_unlock(&btr_search_latch);
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched leaf node(s) */

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		mtr_x_lock(dict_index_get_lock(index), mtr);
		break;
	case BTR_CONT_MODIFY_TREE:
		/* Do nothing */
		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK));
		break;
	default:
		if (!s_latch_by_caller) {
			mtr_s_lock(dict_index_get_lock(index), mtr);
		}
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	space = dict_index_get_space(index);
	page_no = dict_index_get_page(index);

	up_match = 0;
	up_bytes = 0;
	low_match = 0;
	low_bytes = 0;

	height = ULINT_UNDEFINED;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
#endif /* PAGE_CUR_LE_OR_EXTENDS */
		page_mode = mode;
		break;
	}
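
	/* Illustrative example (not in the original source): searching
	for key 50 with mode PAGE_CUR_GE, an internal page may hold node
	pointers for keys 10, 40 and 60. PAGE_CUR_L positions the cursor
	on the pointer for 40, the subtree covering [40, 60), which is
	where the first record >= 50 resides; records equal to 50 belong
	to the 40-subtree, not the 60-subtree. */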
	/* Loop and search until we arrive at the desired level */

search_loop:
	buf_mode = BUF_GET;
	rw_latch = RW_NO_LATCH;

	if (height != 0) {
		/* We are about to fetch the root or a non-leaf page. */
	} else if (latch_mode <= BTR_MODIFY_LEAF) {
		rw_latch = latch_mode;

		if (btr_op != BTR_NO_OP
		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
			/* Try to buffer the operation if the leaf
			page is not in the buffer pool. */

			buf_mode = btr_op == BTR_DELETE_OP
				? BUF_GET_IF_IN_POOL_OR_WATCH
				: BUF_GET_IF_IN_POOL;
		}
	}

	zip_size = dict_table_zip_size(index->table);

retry_page_get:
	block = buf_page_get_gen(
		space, zip_size, page_no, rw_latch, guess, buf_mode,
		file, line, mtr);

	if (block == NULL) {
		/* This must be a search to perform an insert, delete-mark,
		or delete; try using the insert/delete buffer */

		ut_ad(height == 0);
		ut_ad(cursor->thr);

		switch (btr_op) {
		case BTR_INSERT_OP:
		case BTR_INSERT_IGNORE_UNIQUE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);

			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
					space, zip_size, page_no,
					cursor->thr)) {

				cursor->flag = BTR_CUR_INSERT_TO_IBUF;

				goto func_exit;
			}
			break;

		case BTR_DELMARK_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);

			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
					index, space, zip_size,
					page_no, cursor->thr)) {

				cursor->flag = BTR_CUR_DEL_MARK_IBUF;

				goto func_exit;
			}

			break;

		case BTR_DELETE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);

			if (!row_purge_poss_sec(cursor->purge_node,
						index, tuple)) {

				/* The record cannot be purged yet. */
				cursor->flag = BTR_CUR_DELETE_REF;
			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
					       index, space, zip_size,
					       page_no,
					       cursor->thr)) {

				/* The purge was buffered. */
				cursor->flag = BTR_CUR_DELETE_IBUF;
			} else {
				/* The purge could not be buffered. */
				buf_pool_watch_unset(space, page_no);
				break;
			}

			buf_pool_watch_unset(space, page_no);
			goto func_exit;

		default:
			ut_error;
		}

		/* Insert to the insert/delete buffer did not succeed, we
		must read the page from disk. */

		buf_mode = BUF_GET;

		goto retry_page_get;
	}
	block->check_index_page_at_flush = TRUE;
	page = buf_block_get_frame(block);

	if (rw_latch != RW_NO_LATCH) {
#ifdef UNIV_ZIP_DEBUG
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

		buf_block_dbg_add_level(
			block, dict_index_is_ibuf(index)
			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
	}

	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
	ut_ad(index->id == btr_page_get_index_id(page));

	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
		/* We are in the root node */

		height = btr_page_get_level(page, mtr);
		root_height = height;
		cursor->tree_height = root_height + 1;

#ifdef BTR_CUR_ADAPT
		if (block != guess) {
			info->root_guess = block;
		}
#endif
	}

	if (height == 0) {
		if (rw_latch == RW_NO_LATCH) {

			btr_cur_latch_leaves(
				page, space, zip_size, page_no, latch_mode,
				cursor, mtr);
		}

		switch (latch_mode) {
		case BTR_MODIFY_TREE:
		case BTR_CONT_MODIFY_TREE:
			break;
		default:
			if (!s_latch_by_caller) {
				/* Release the tree s-latch */
				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}
		}

		page_mode = mode;
	}

	page_cur_search_with_match(
		block, index, tuple, page_mode, &up_match, &up_bytes,
		&low_match, &low_bytes, page_cursor);

	if (estimate) {
		btr_cur_add_path_info(cursor, height, root_height);
	}

	/* If this is the desired level, leave the loop */

	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
					   mtr));

	if (level != height) {

		const rec_t*	node_ptr;
		ut_ad(height > 0);

		height--;
		guess = NULL;

		node_ptr = page_cur_get_rec(page_cursor);

		offsets = rec_get_offsets(
			node_ptr, index, offsets, ULINT_UNDEFINED, &heap);

		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

		if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
			/* We're doing a search on an ibuf tree and we're one
			level above the leaf page. */

			ut_ad(level == 0);

			buf_mode = BUF_GET;
			rw_latch = RW_NO_LATCH;
			goto retry_page_get;
		}

		goto search_loop;
	}

	if (level != 0) {
		/* x-latch the page */
		buf_block_t*	child_block = btr_block_get(
			space, zip_size, page_no, RW_X_LATCH, index, mtr);

		page = buf_block_get_frame(child_block);
		btr_assert_not_corrupted(child_block, index);
	} else {
		cursor->low_match = low_match;
		cursor->low_bytes = low_bytes;
		cursor->up_match = up_match;
		cursor->up_bytes = up_bytes;

#ifdef BTR_CUR_ADAPT
		/* We do a dirty read of btr_search_enabled here. We
		will properly check btr_search_enabled again in
		btr_search_build_page_hash_index() before building a
		page hash index, while holding btr_search_latch. */
		if (btr_search_enabled) {
			btr_search_info_update(index, cursor);
		}
#endif
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
	}

func_exit:

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	if (has_search_latch) {

		rw_lock_s_lock(&btr_search_latch);
	}
}
/*****************************************************************//**
Opens a cursor at either end of an index. */
UNIV_INTERN
void
btr_cur_open_at_index_side_func(
/*============================*/
	bool		from_left,	/*!< in: true if open to the low end,
					false if to the high end */
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: latch mode */
	btr_cur_t*	cursor,		/*!< in/out: cursor */
	ulint		level,		/*!< in: level to search for
					(0=leaf). */
	const char*	file,		/*!< in: file name */
	ulint		line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	page_cur_t*	page_cursor;
	ulint		page_no;
	ulint		space;
	ulint		zip_size;
	ulint		height;
	ulint		root_height = 0; /* remove warning */
	rec_t*		node_ptr;
	ulint		estimate;
	ulint		savepoint;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	estimate = latch_mode & BTR_ESTIMATE;
	latch_mode &= ~BTR_ESTIMATE;

	ut_ad(level != ULINT_UNDEFINED);

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched the leaf node */

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_CONT_MODIFY_TREE:
		break;
	case BTR_MODIFY_TREE:
		mtr_x_lock(dict_index_get_lock(index), mtr);
		break;
	case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
	case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
					MTR_MEMO_S_LOCK));
		break;
	default:
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	page_no = dict_index_get_page(index);

	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		page_t*		page;
		block = buf_page_get_gen(space, zip_size, page_no,
					 RW_NO_LATCH, NULL, BUF_GET,
					 file, line, mtr);
		page = buf_block_get_frame(block);
		ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
		ut_ad(index->id == btr_page_get_index_id(page));

		block->check_index_page_at_flush = TRUE;

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
			ut_a(height >= level);
		} else {
			/* TODO: flag the index corrupted if this fails */
			ut_ad(height == btr_page_get_level(page, mtr));
		}

		if (height == level) {
			btr_cur_latch_leaves(
				page, space, zip_size, page_no,
				latch_mode & ~BTR_ALREADY_S_LATCHED,
				cursor, mtr);

			if (height == 0) {
				/* In versions <= 3.23.52 we had
				forgotten to release the tree latch
				here. If in an index scan we had to
				scan far to find a record visible to
				the current transaction, that could
				starve others waiting for the tree
				latch. */

				switch (latch_mode) {
				case BTR_MODIFY_TREE:
				case BTR_CONT_MODIFY_TREE:
				case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
				case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
					break;
				default:
					/* Release the tree s-latch */

					mtr_release_s_latch_at_savepoint(
						mtr, savepoint,
						dict_index_get_lock(index));
				}
			}
		}

		if (from_left) {
			page_cur_set_before_first(block, page_cursor);
		} else {
			page_cur_set_after_last(block, page_cursor);
		}

		if (height == level) {
			if (estimate) {
				btr_cur_add_path_info(cursor, height,
						      root_height);
			}

			break;
		}

		ut_ad(height > 0);

		if (from_left) {
			page_cur_move_to_next(page_cursor);
		} else {
			page_cur_move_to_prev(page_cursor);
		}

		if (estimate) {
			btr_cur_add_path_info(cursor, height, root_height);
		}

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);

		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	if (heap) {
		mem_heap_free(heap);
	}
}
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree. */
UNIV_INTERN
void
btr_cur_open_at_rnd_pos_func(
/*=========================*/
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
	const char*	file,		/*!< in: file name */
	ulint		line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	ulint		page_no;
	ulint		space;
	ulint		zip_size;
	ulint		height;
	rec_t*		node_ptr;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		mtr_x_lock(dict_index_get_lock(index), mtr);
		break;
	default:
		ut_ad(latch_mode != BTR_CONT_MODIFY_TREE);
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	page_no = dict_index_get_page(index);

	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		page_t*		page;

		block = buf_page_get_gen(space, zip_size, page_no,
					 RW_NO_LATCH, NULL, BUF_GET,
					 file, line, mtr);
		page = buf_block_get_frame(block);
		ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
		ut_ad(index->id == btr_page_get_index_id(page));

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
		}

		if (height == 0) {
			btr_cur_latch_leaves(page, space, zip_size, page_no,
					     latch_mode, cursor, mtr);
		}

		page_cur_open_on_rnd_user_rec(block, page_cursor);

		if (height == 0) {
			break;
		}

		ut_ad(height > 0);

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);

		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
/*==================== B-TREE INSERT =========================*/

/*************************************************************//**
Inserts a record if there is enough space, or if enough space can
be freed by reorganizing. Differs from btr_cur_optimistic_insert because
no heuristic is applied as to whether it pays to use CPU time to
reorganize the page or not.

IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().

@return pointer to inserted record if the insert succeeds, else NULL */
static __attribute__((nonnull, warn_unused_result))
rec_t*
btr_cur_insert_if_possible(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
				have been stored to tuple */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	page_cur_t*	page_cursor;
	rec_t*		rec;

	ut_ad(dtuple_check_typed(tuple));

	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));
	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */
	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
				    offsets, heap, n_ext, mtr);

	/* If the record did not fit, reorganize.
	For compressed pages, page_cur_tuple_insert()
	attempted this already. */
	if (!rec && !page_cur_get_page_zip(page_cursor)
	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {

		rec = page_cur_tuple_insert(
			page_cursor, tuple, cursor->index,
			offsets, heap, n_ext, mtr);
	}

	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
	return(rec);
}
/*************************************************************//**
For an insert, checks the locks and does the undo logging if desired.
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,5,6)))
dberr_t
btr_cur_ins_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if
				not zero, the parameters index and thr
				should be specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	ibool*		inherit)/*!< out: TRUE if the newly inserted record
				should possibly inherit LOCK_GAP type locks
				from the successor record */
{
	dict_index_t*	index;
	dberr_t		err;
	rec_t*		rec;
	roll_ptr_t	roll_ptr;

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));

	err = lock_rec_insert_check_and_lock(flags, rec,
					     btr_cur_get_block(cursor),
					     index, thr, mtr, inherit);

	if (err != DB_SUCCESS
	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {

		return(err);
	}

	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
					    thr, index, entry,
					    NULL, 0, NULL, NULL,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	/* Now we can fill in the roll ptr field in entry */

	if (!(flags & BTR_KEEP_SYS_FLAG)) {

		row_upd_index_entry_sys_field(entry, index,
					      DATA_ROLL_PTR, roll_ptr);
	}

	return(DB_SUCCESS);
}
#ifdef UNIV_DEBUG
/*************************************************************//**
Report information about a transaction. */
static
void
btr_cur_trx_report(
/*===============*/
	trx_id_t		trx_id,	/*!< in: transaction id */
	const dict_index_t*	index,	/*!< in: index */
	const char*		op)	/*!< in: operation */
{
	fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx_id);
	fputs(op, stderr);
	dict_index_name_print(stderr, NULL, index);
	putc('\n', stderr);
}
#endif /* UNIV_DEBUG */
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
UNIV_INTERN
dberr_t
btr_cur_optimistic_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameters index and thr should be
				specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				the insert succeeds */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr)	/*!< in/out: mini-transaction;
				if this function returns DB_SUCCESS on
				a leaf page of a secondary index in a
				compressed tablespace, the caller must
				mtr_commit(mtr) before latching
				any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	page_t*		page;
	rec_t*		dummy;
	ibool		leaf;
	ibool		reorg;
	ibool		inherit;
	ulint		zip_size;
	ulint		rec_size;
	dberr_t		err;

	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = cursor->index;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(dtuple_check_typed(entry));

	zip_size = buf_block_get_zip_size(block);
#ifdef UNIV_DEBUG_VALGRIND
	if (zip_size) {
		UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
		UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
	}
#endif /* UNIV_DEBUG_VALGRIND */

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr)->id, index, "insert ");
		dtuple_print(stderr, entry);
	}
#endif /* UNIV_DEBUG */

	leaf = page_is_leaf(page);

	/* Calculate the record size when entry is converted to a record */
	rec_size = rec_get_converted_size(index, entry, n_ext);

	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
				   dtuple_get_n_fields(entry), zip_size)) {

		/* The record is so big that we have to store some fields
		externally on separate database pages */
		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);

		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			return(DB_TOO_BIG_RECORD);
		}

		rec_size = rec_get_converted_size(index, entry, n_ext);
	}

	if (zip_size) {
		/* Estimate the free space of an empty compressed page.
		Subtract one byte for the encoded heap_no in the
		modification log. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields, zip_size);
		ulint	n_uniq = dict_index_get_n_unique_in_tree(index);

		ut_ad(dict_table_is_comp(index->table));

		if (free_space_zip == 0) {
too_big:
			if (big_rec_vec) {
				dtuple_convert_back_big_rec(
					index, entry, big_rec_vec);
			}

			return(DB_TOO_BIG_RECORD);
		}

		/* Subtract one byte for the encoded heap_no in the
		modification log. */
		free_space_zip--;

		/* There should be enough room for two node pointer
		records on an empty non-leaf page. This prevents
		infinite page splits. */

		if (entry->n_fields >= n_uniq
		    && (REC_NODE_PTR_SIZE
			+ rec_get_converted_size_comp_prefix(
				index, entry->fields, n_uniq, NULL)
			/* On a compressed page, there is
			a two-byte entry in the dense
			page directory for every record.
			But there is no record header. */
			- (REC_N_NEW_EXTRA_BYTES - 2)
			> free_space_zip / 2)) {

			goto too_big;
		}
	}
	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
				      goto fail);

	if (leaf && zip_size
	    && (page_get_data_size(page) + rec_size
		>= dict_index_zip_pad_optimal_page_size(index))) {
		/* If the compression padding heuristic indicates that the
		insertion would pack the page so densely that compression
		is likely to fail, do not attempt an optimistic
		insertion. */
fail:
		err = DB_FAIL;
fail_err:

		if (big_rec_vec) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(err);
	}

	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);

	if (page_has_garbage(page)) {
		if ((max_size < rec_size
		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
		    && page_get_n_recs(page) > 1
		    && page_get_max_insert_size(page, 1) < rec_size) {

			goto fail;
		}
	} else if (max_size < rec_size) {
		goto fail;
	}

	/* If there have been many consecutive inserts to the
	clustered index leaf page of an uncompressed table, check if
	we have to split the page to reserve enough free space for
	future updates of records. */

	if (leaf && !zip_size && dict_index_is_clust(index)
	    && page_get_n_recs(page) >= 2
	    && dict_index_get_space_reserve() + rec_size > max_size
	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
		|| btr_page_get_split_rec_to_left(cursor, &dummy))) {

		goto fail;
	}

	/* Check locks and write to the undo log, if specified */
	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
					thr, mtr, &inherit);

	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {

		goto fail_err;
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */
	{
		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);
		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     offsets, heap, n_ext, mtr);
		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
	}

	if (*rec) {
	} else if (zip_size) {
		/* Reset the IBUF_BITMAP_FREE bits, because
		page_cur_tuple_insert() will have attempted page
		reorganize before failing. */
		if (leaf && !dict_index_is_clust(index)) {
			ibuf_reset_free_bits(block);
		}

		goto fail;
	} else {
		ut_ad(!reorg);

		/* If the record did not fit, reorganize */
		if (!btr_page_reorganize(page_cursor, index, mtr)) {
			ut_ad(0);
			goto fail;
		}

		ut_ad(page_get_max_insert_size(page, 1) == max_size);

		reorg = TRUE;

		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     offsets, heap, n_ext, mtr);

		if (UNIV_UNLIKELY(!*rec)) {
			fputs("InnoDB: Error: cannot insert tuple ", stderr);
			dtuple_print(stderr, entry);
			fputs(" into ", stderr);
			dict_index_name_print(stderr, thr_get_trx(thr), index);
			fprintf(stderr, "\nInnoDB: max insert size %lu\n",
				(ulong) max_size);
			ut_error;
		}
	}

#ifdef BTR_CUR_HASH_ADAPT
	if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
		btr_search_update_hash_node_on_insert(cursor);
	} else {
		btr_search_update_hash_on_insert(cursor);
	}
#endif

	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

		lock_update_insert(block, *rec);
	}

	if (leaf && !dict_index_is_clust(index)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page. It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (zip_size) {
			/* Update the bits in the same mini-transaction. */
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			/* Decrement the bits in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(
				block, max_size,
				rec_size + PAGE_DIR_SLOT_SIZE);
		}
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
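
/* Hedged caller-side sketch (assumed usage, not part of this file): when
btr_cur_optimistic_insert() returns DB_SUCCESS with a non-NULL *big_rec,
the caller is responsible for storing the long fields externally, e.g.
with btr_store_big_rec_extern_fields(), and for freeing the vector
afterwards with dtuple_big_rec_free(). A DB_FAIL return means the caller
should retry with btr_cur_pessimistic_insert(). */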
/*************************************************************//**
Performs an insert on a page of an index tree. It is assumed that mtr
holds an x-latch on the tree and on the cursor page. If the insert is
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
to brothers of page, if those brothers exist.
@return DB_SUCCESS or error number */
UNIV_INTERN
dberr_t
btr_cur_pessimistic_insert(
/*=======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameter thr should be
				specified; if no undo logging is specified,
				then the caller must have reserved enough
				free extents in the file space so that the
				insertion will certainly succeed */
	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
				cursor stays valid */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
				that can be emptied, or NULL */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				the insert succeeds */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	dict_index_t*	index		= cursor->index;
	ulint		zip_size	= dict_table_zip_size(index->table);
	big_rec_t*	big_rec_vec	= NULL;
	dberr_t		err;
	ibool		dummy_inh;
	ibool		success;
	ulint		n_reserved	= 0;

	ut_ad(dtuple_check_typed(entry));

	*big_rec = NULL;

	ut_ad(mtr_memo_contains(mtr,
				dict_index_get_lock(btr_cur_get_index(cursor)),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));

	cursor->flag = BTR_CUR_BINARY;

	/* Check locks and write to undo log, if specified */

	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
					thr, mtr, &dummy_inh);

	if (err != DB_SUCCESS) {

		return(err);
	}

	if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the insert will not fail because
		of lack of space */

		ulint	n_extents = cursor->tree_height / 16 + 3;
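
		/* Worked example (illustrative, not in the original
		source): for a tree of height 3, this reserves
		3 / 16 + 3 = 3 extents; with the default 16 KiB pages an
		extent is 64 pages (1 MiB), comfortably covering the
		"2 x tree height" pages that the file header comment
		says a pessimistic operation must have available. */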
		success = fsp_reserve_free_extents(&n_reserved, index->space,
						   n_extents, FSP_NORMAL, mtr);
		if (!success) {
			return(DB_OUT_OF_FILE_SPACE);
		}
	}

	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
				   dict_table_is_comp(index->table),
				   dtuple_get_n_fields(entry),
				   zip_size)) {
		/* The record is so big that we have to store some fields
		externally on separate database pages */

		if (UNIV_LIKELY_NULL(big_rec_vec)) {
			/* This should never happen, but we handle
			the situation in a robust manner. */
			ut_ad(0);
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);

		if (big_rec_vec == NULL) {

			if (n_reserved > 0) {
				fil_space_release_free_extents(index->space,
							       n_reserved);
			}
			return(DB_TOO_BIG_RECORD);
		}
	}

	if (dict_index_get_page(index)
	    == buf_block_get_page_no(btr_cur_get_block(cursor))) {

		/* The page is the root page */
		*rec = btr_root_raise_and_insert(
			flags, cursor, offsets, heap, entry, n_ext, mtr);
	} else {
		*rec = btr_page_split_and_insert(
			flags, cursor, offsets, heap, entry, n_ext, mtr);
	}

	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);

#ifdef BTR_CUR_ADAPT
	btr_search_update_hash_on_insert(cursor);
#endif
	if (!(flags & BTR_NO_LOCKING_FLAG)) {

		lock_update_insert(btr_cur_get_block(cursor), *rec);
	}

	if (n_reserved > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
/*==================== B-TREE UPDATE =========================*/

/*************************************************************//**
For an update, checks the locks and does the undo logging.
@return DB_SUCCESS, DB_WAIT_LOCK, or error number */
UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,6,7)))
dberr_t
btr_cur_upd_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
	const ulint*	offsets,/*!< in: rec_get_offsets() on cursor */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread
				(can be NULL if BTR_NO_LOCKING_FLAG) */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
{
	dict_index_t*	index;
	const rec_t*	rec;
	dberr_t		err;

	ut_ad(thr || (flags & BTR_NO_LOCKING_FLAG));

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(rec_offs_validate(rec, index, offsets));

	if (!dict_index_is_clust(index)) {
		ut_ad(dict_index_is_online_ddl(index)
		      == !!(flags & BTR_CREATE_FLAG));

		/* We do undo logging only when we update a clustered index
		record */
		return(lock_sec_rec_modify_check_and_lock(
			       flags, btr_cur_get_block(cursor), rec,
			       index, thr, mtr));
	}

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		err = lock_clust_rec_modify_check_and_lock(
			flags, btr_cur_get_block(cursor), rec, index,
			offsets, thr);
		if (err != DB_SUCCESS) {
			return(err);
		}
	}

	/* Append the info about the update in the undo log */

	return(trx_undo_report_row_operation(
		       flags, TRX_UNDO_MODIFY_OP, thr,
		       index, NULL, update,
		       cmpl_info, rec, offsets, roll_ptr));
}
  1414. /***********************************************************//**
  1415. Writes a redo log record of updating a record in-place. */
  1416. UNIV_INTERN
  1417. void
  1418. btr_cur_update_in_place_log(
  1419. /*========================*/
  1420. ulint flags, /*!< in: flags */
  1421. const rec_t* rec, /*!< in: record */
  1422. dict_index_t* index, /*!< in: index of the record */
  1423. const upd_t* update, /*!< in: update vector */
  1424. trx_id_t trx_id, /*!< in: transaction id */
  1425. roll_ptr_t roll_ptr, /*!< in: roll ptr */
  1426. mtr_t* mtr) /*!< in: mtr */
  1427. {
  1428. byte* log_ptr;
  1429. const page_t* page = page_align(rec);
  1430. ut_ad(flags < 256);
  1431. ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
  1432. log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
  1433. ? MLOG_COMP_REC_UPDATE_IN_PLACE
  1434. : MLOG_REC_UPDATE_IN_PLACE,
  1435. 1 + DATA_ROLL_PTR_LEN + 14 + 2
  1436. + MLOG_BUF_MARGIN);
  1437. if (!log_ptr) {
  1438. /* Logging in mtr is switched off during crash recovery */
  1439. return;
  1440. }
1441. /* For secondary indexes, we could skip writing the dummy system
1442. fields to the redo log, but then we would have to change the parsing of
1443. MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or add a
1444. new redo log record type. For now, just write the dummy sys fields
1445. to the redo log if we are updating a secondary index record.
1446. */
  1447. mach_write_to_1(log_ptr, flags);
  1448. log_ptr++;
  1449. if (dict_index_is_clust(index)) {
  1450. log_ptr = row_upd_write_sys_vals_to_log(
  1451. index, trx_id, roll_ptr, log_ptr, mtr);
  1452. } else {
  1453. /* Dummy system fields for a secondary index */
  1454. /* TRX_ID Position */
  1455. log_ptr += mach_write_compressed(log_ptr, 0);
  1456. /* ROLL_PTR */
  1457. trx_write_roll_ptr(log_ptr, 0);
  1458. log_ptr += DATA_ROLL_PTR_LEN;
  1459. /* TRX_ID */
  1460. log_ptr += mach_ull_write_compressed(log_ptr, 0);
  1461. }
  1462. mach_write_to_2(log_ptr, page_offset(rec));
  1463. log_ptr += 2;
  1464. row_upd_index_write_log(update, log_ptr, mtr);
  1465. }
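/* For reference, the redo record body written above is, in order:
1 byte for the flags; the system columns (the position of DATA_TRX_ID
in compressed form, the roll ptr in DATA_ROLL_PTR_LEN bytes, and the
trx id as a compressed 64-bit value, all-zero dummies for a secondary
index); 2 bytes for the page offset of rec; and finally the update
vector written by row_upd_index_write_log(). The parser below,
btr_cur_parse_update_in_place(), consumes the fields in exactly this
order. */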
1466. #endif /* !UNIV_HOTBACKUP */
  1467. /***********************************************************//**
  1468. Parses a redo log record of updating a record in-place.
  1469. @return end of log record or NULL */
  1470. UNIV_INTERN
  1471. byte*
  1472. btr_cur_parse_update_in_place(
  1473. /*==========================*/
  1474. byte* ptr, /*!< in: buffer */
  1475. byte* end_ptr,/*!< in: buffer end */
  1476. page_t* page, /*!< in/out: page or NULL */
  1477. page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
  1478. dict_index_t* index) /*!< in: index corresponding to page */
  1479. {
  1480. ulint flags;
  1481. rec_t* rec;
  1482. upd_t* update;
  1483. ulint pos;
  1484. trx_id_t trx_id;
  1485. roll_ptr_t roll_ptr;
  1486. ulint rec_offset;
  1487. mem_heap_t* heap;
  1488. ulint* offsets;
  1489. if (end_ptr < ptr + 1) {
  1490. return(NULL);
  1491. }
  1492. flags = mach_read_from_1(ptr);
  1493. ptr++;
  1494. ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
  1495. if (ptr == NULL) {
  1496. return(NULL);
  1497. }
  1498. if (end_ptr < ptr + 2) {
  1499. return(NULL);
  1500. }
  1501. rec_offset = mach_read_from_2(ptr);
  1502. ptr += 2;
  1503. ut_a(rec_offset <= UNIV_PAGE_SIZE);
  1504. heap = mem_heap_create(256);
  1505. ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
  1506. if (!ptr || !page) {
  1507. goto func_exit;
  1508. }
  1509. ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
  1510. rec = page + rec_offset;
  1511. /* We do not need to reserve btr_search_latch, as the page is only
  1512. being recovered, and there cannot be a hash index to it. */
  1513. offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
  1514. if (!(flags & BTR_KEEP_SYS_FLAG)) {
  1515. row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
  1516. pos, trx_id, roll_ptr);
  1517. }
  1518. row_upd_rec_in_place(rec, index, offsets, update, page_zip);
  1519. func_exit:
  1520. mem_heap_free(heap);
  1521. return(ptr);
  1522. }
  1523. #ifndef UNIV_HOTBACKUP
  1524. /*************************************************************//**
1525. See if there is enough space in the page modification log to log
1526. an update-in-place.
  1527. @retval false if out of space; IBUF_BITMAP_FREE will be reset
  1528. outside mtr if the page was recompressed
1529. @retval true if enough space;
  1530. IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
  1531. a secondary index leaf page. This has to be done either within the
  1532. same mini-transaction, or by invoking ibuf_reset_free_bits() before
  1533. mtr_commit(mtr). */
  1534. UNIV_INTERN
  1535. bool
  1536. btr_cur_update_alloc_zip_func(
  1537. /*==========================*/
  1538. page_zip_des_t* page_zip,/*!< in/out: compressed page */
  1539. page_cur_t* cursor, /*!< in/out: B-tree page cursor */
  1540. dict_index_t* index, /*!< in: the index corresponding to cursor */
  1541. #ifdef UNIV_DEBUG
  1542. ulint* offsets,/*!< in/out: offsets of the cursor record */
  1543. #endif /* UNIV_DEBUG */
  1544. ulint length, /*!< in: size needed */
  1545. bool create, /*!< in: true=delete-and-insert,
  1546. false=update-in-place */
  1547. mtr_t* mtr) /*!< in/out: mini-transaction */
  1548. {
  1549. const page_t* page = page_cur_get_page(cursor);
  1550. ut_ad(page_zip == page_cur_get_page_zip(cursor));
  1551. ut_ad(page_zip);
  1552. ut_ad(!dict_index_is_ibuf(index));
  1553. ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
  1554. if (page_zip_available(page_zip, dict_index_is_clust(index),
  1555. length, create)) {
  1556. return(true);
  1557. }
  1558. if (!page_zip->m_nonempty && !page_has_garbage(page)) {
  1559. /* The page has been freshly compressed, so
  1560. reorganizing it will not help. */
  1561. return(false);
  1562. }
  1563. if (create && page_is_leaf(page)
  1564. && (length + page_get_data_size(page)
  1565. >= dict_index_zip_pad_optimal_page_size(index))) {
  1566. return(false);
  1567. }
  1568. if (!btr_page_reorganize(cursor, index, mtr)) {
  1569. goto out_of_space;
  1570. }
  1571. rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
  1572. /* After recompressing a page, we must make sure that the free
  1573. bits in the insert buffer bitmap will not exceed the free
  1574. space on the page. Because this function will not attempt
  1575. recompression unless page_zip_available() fails above, it is
  1576. safe to reset the free bits if page_zip_available() fails
  1577. again, below. The free bits can safely be reset in a separate
  1578. mini-transaction. If page_zip_available() succeeds below, we
  1579. can be sure that the btr_page_reorganize() above did not reduce
  1580. the free space available on the page. */
  1581. if (page_zip_available(page_zip, dict_index_is_clust(index),
  1582. length, create)) {
  1583. return(true);
  1584. }
  1585. out_of_space:
  1586. ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
  1587. /* Out of space: reset the free bits. */
  1588. if (!dict_index_is_clust(index) && page_is_leaf(page)) {
  1589. ibuf_reset_free_bits(page_cur_get_block(cursor));
  1590. }
  1591. return(false);
  1592. }
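/* The retry logic above, summarized as a sketch:

	if (page_zip_available(...))		-> true (fast path)
	if (freshly compressed page)		-> false (reorganizing cannot help)
	if (create would exceed the zip pad)	-> false
	if (!btr_page_reorganize(...))		-> false (reset free bits)
	return page_zip_available(...);		-> retry once after recompression

Note that reorganizing a compressed page recompresses it, so the second
page_zip_available() check is meaningful even though the logical page
content is unchanged. */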
  1593. /*************************************************************//**
  1594. Updates a record when the update causes no size changes in its fields.
  1595. We assume here that the ordering fields of the record do not change.
  1596. @return locking or undo log related error code, or
  1597. @retval DB_SUCCESS on success
  1598. @retval DB_ZIP_OVERFLOW if there is not enough space left
  1599. on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
  1600. UNIV_INTERN
  1601. dberr_t
  1602. btr_cur_update_in_place(
  1603. /*====================*/
  1604. ulint flags, /*!< in: undo logging and locking flags */
  1605. btr_cur_t* cursor, /*!< in: cursor on the record to update;
  1606. cursor stays valid and positioned on the
  1607. same record */
  1608. ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
  1609. const upd_t* update, /*!< in: update vector */
  1610. ulint cmpl_info,/*!< in: compiler info on secondary index
  1611. updates */
  1612. que_thr_t* thr, /*!< in: query thread */
  1613. trx_id_t trx_id, /*!< in: transaction id */
  1614. mtr_t* mtr) /*!< in/out: mini-transaction; if this
  1615. is a secondary index, the caller must
  1616. mtr_commit(mtr) before latching any
  1617. further pages */
  1618. {
  1619. dict_index_t* index;
  1620. buf_block_t* block;
  1621. page_zip_des_t* page_zip;
  1622. dberr_t err;
  1623. rec_t* rec;
  1624. roll_ptr_t roll_ptr = 0;
  1625. ulint was_delete_marked;
  1626. ibool is_hashed;
  1627. rec = btr_cur_get_rec(cursor);
  1628. index = cursor->index;
  1629. ut_ad(rec_offs_validate(rec, index, offsets));
  1630. ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
  1631. /* The insert buffer tree should never be updated in place. */
  1632. ut_ad(!dict_index_is_ibuf(index));
  1633. ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
  1634. || dict_index_is_clust(index));
  1635. ut_ad(thr_get_trx(thr)->id == trx_id
  1636. || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
  1637. == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
  1638. | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
  1639. ut_ad(fil_page_get_type(btr_cur_get_page(cursor)) == FIL_PAGE_INDEX);
  1640. ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
  1641. #ifdef UNIV_DEBUG
  1642. if (btr_cur_print_record_ops) {
  1643. btr_cur_trx_report(trx_id, index, "update ");
  1644. rec_print_new(stderr, rec, offsets);
  1645. }
  1646. #endif /* UNIV_DEBUG */
  1647. block = btr_cur_get_block(cursor);
  1648. page_zip = buf_block_get_page_zip(block);
  1649. /* Check that enough space is available on the compressed page. */
  1650. if (page_zip) {
  1651. if (!btr_cur_update_alloc_zip(
  1652. page_zip, btr_cur_get_page_cur(cursor),
  1653. index, offsets, rec_offs_size(offsets),
  1654. false, mtr)) {
  1655. return(DB_ZIP_OVERFLOW);
  1656. }
  1657. rec = btr_cur_get_rec(cursor);
  1658. }
  1659. /* Do lock checking and undo logging */
  1660. err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
  1661. update, cmpl_info,
  1662. thr, mtr, &roll_ptr);
  1663. if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
  1664. /* We may need to update the IBUF_BITMAP_FREE
  1665. bits after a reorganize that was done in
  1666. btr_cur_update_alloc_zip(). */
  1667. goto func_exit;
  1668. }
  1669. if (!(flags & BTR_KEEP_SYS_FLAG)) {
  1670. row_upd_rec_sys_fields(rec, NULL, index, offsets,
  1671. thr_get_trx(thr), roll_ptr);
  1672. }
  1673. was_delete_marked = rec_get_deleted_flag(
  1674. rec, page_is_comp(buf_block_get_frame(block)));
  1675. is_hashed = (block->index != NULL);
  1676. if (is_hashed) {
1677. /* TO DO: Can we skip this if none of the first
1678. index->search_info->curr_n_fields fields
1679. are being updated? */
1680. /* The function row_upd_changes_ord_field_binary works only
1681. if the update vector was built for a clustered index; we must
1682. NOT call it if the index is secondary */
  1683. if (!dict_index_is_clust(index)
  1684. || row_upd_changes_ord_field_binary(index, update, thr,
  1685. NULL, NULL)) {
  1686. /* Remove possible hash index pointer to this record */
  1687. btr_search_update_hash_on_delete(cursor);
  1688. }
  1689. rw_lock_x_lock(&btr_search_latch);
  1690. }
  1691. row_upd_rec_in_place(rec, index, offsets, update, page_zip);
  1692. if (is_hashed) {
  1693. rw_lock_x_unlock(&btr_search_latch);
  1694. }
  1695. btr_cur_update_in_place_log(flags, rec, index, update,
  1696. trx_id, roll_ptr, mtr);
  1697. if (was_delete_marked
  1698. && !rec_get_deleted_flag(
  1699. rec, page_is_comp(buf_block_get_frame(block)))) {
  1700. /* The new updated record owns its possible externally
  1701. stored fields */
  1702. btr_cur_unmark_extern_fields(page_zip,
  1703. rec, index, offsets, mtr);
  1704. }
  1705. ut_ad(err == DB_SUCCESS);
  1706. func_exit:
  1707. if (page_zip
  1708. && !(flags & BTR_KEEP_IBUF_BITMAP)
  1709. && !dict_index_is_clust(index)
  1710. && page_is_leaf(buf_block_get_frame(block))) {
  1711. /* Update the free bits in the insert buffer. */
  1712. ibuf_update_free_bits_zip(block, mtr);
  1713. }
  1714. return(err);
  1715. }
  1716. /*************************************************************//**
  1717. Tries to update a record on a page in an index tree. It is assumed that mtr
  1718. holds an x-latch on the page. The operation does not succeed if there is too
  1719. little space on the page or if the update would result in too empty a page,
  1720. so that tree compression is recommended. We assume here that the ordering
  1721. fields of the record do not change.
  1722. @return error code, including
  1723. @retval DB_SUCCESS on success
  1724. @retval DB_OVERFLOW if the updated record does not fit
  1725. @retval DB_UNDERFLOW if the page would become too empty
  1726. @retval DB_ZIP_OVERFLOW if there is not enough space left
  1727. on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
  1728. UNIV_INTERN
  1729. dberr_t
  1730. btr_cur_optimistic_update(
  1731. /*======================*/
  1732. ulint flags, /*!< in: undo logging and locking flags */
  1733. btr_cur_t* cursor, /*!< in: cursor on the record to update;
  1734. cursor stays valid and positioned on the
  1735. same record */
  1736. ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
  1737. mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
  1738. const upd_t* update, /*!< in: update vector; this must also
  1739. contain trx id and roll ptr fields */
  1740. ulint cmpl_info,/*!< in: compiler info on secondary index
  1741. updates */
  1742. que_thr_t* thr, /*!< in: query thread, or NULL if
  1743. appropriate flags are set */
  1744. trx_id_t trx_id, /*!< in: transaction id */
  1745. mtr_t* mtr) /*!< in/out: mini-transaction; if this
  1746. is a secondary index, the caller must
  1747. mtr_commit(mtr) before latching any
  1748. further pages */
  1749. {
  1750. dict_index_t* index;
  1751. page_cur_t* page_cursor;
  1752. dberr_t err;
  1753. buf_block_t* block;
  1754. page_t* page;
  1755. page_zip_des_t* page_zip;
  1756. rec_t* rec;
  1757. ulint max_size;
  1758. ulint new_rec_size;
  1759. ulint old_rec_size;
  1760. dtuple_t* new_entry;
  1761. roll_ptr_t roll_ptr;
  1762. ulint i;
  1763. ulint n_ext;
  1764. block = btr_cur_get_block(cursor);
  1765. page = buf_block_get_frame(block);
  1766. rec = btr_cur_get_rec(cursor);
  1767. index = cursor->index;
  1768. ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
  1769. ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
  1770. /* The insert buffer tree should never be updated in place. */
  1771. ut_ad(!dict_index_is_ibuf(index));
  1772. ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
  1773. || dict_index_is_clust(index));
  1774. ut_ad(thr_get_trx(thr)->id == trx_id
  1775. || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
  1776. == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
  1777. | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
  1778. ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
  1779. ut_ad(btr_page_get_index_id(page) == index->id);
  1780. *offsets = rec_get_offsets(rec, index, *offsets,
  1781. ULINT_UNDEFINED, heap);
  1782. #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
  1783. ut_a(!rec_offs_any_null_extern(rec, *offsets)
  1784. || trx_is_recv(thr_get_trx(thr)));
  1785. #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
  1786. #ifdef UNIV_DEBUG
  1787. if (btr_cur_print_record_ops) {
  1788. btr_cur_trx_report(trx_id, index, "update ");
  1789. rec_print_new(stderr, rec, *offsets);
  1790. }
  1791. #endif /* UNIV_DEBUG */
  1792. if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
  1793. /* The simplest and the most common case: the update does not
  1794. change the size of any field and none of the updated fields is
  1795. externally stored in rec or update, and there is enough space
  1796. on the compressed page to log the update. */
  1797. return(btr_cur_update_in_place(
  1798. flags, cursor, *offsets, update,
  1799. cmpl_info, thr, trx_id, mtr));
  1800. }
  1801. if (rec_offs_any_extern(*offsets)) {
  1802. any_extern:
  1803. /* Externally stored fields are treated in pessimistic
  1804. update */
  1805. return(DB_OVERFLOW);
  1806. }
  1807. for (i = 0; i < upd_get_n_fields(update); i++) {
  1808. if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
  1809. goto any_extern;
  1810. }
  1811. }
  1812. page_cursor = btr_cur_get_page_cur(cursor);
  1813. if (!*heap) {
  1814. *heap = mem_heap_create(
  1815. rec_offs_size(*offsets)
  1816. + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
  1817. }
  1818. new_entry = row_rec_to_index_entry(rec, index, *offsets,
  1819. &n_ext, *heap);
  1820. /* We checked above that there are no externally stored fields. */
  1821. ut_a(!n_ext);
  1822. /* The page containing the clustered index record
  1823. corresponding to new_entry is latched in mtr.
  1824. Thus the following call is safe. */
  1825. row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
  1826. FALSE, *heap);
  1827. old_rec_size = rec_offs_size(*offsets);
  1828. new_rec_size = rec_get_converted_size(index, new_entry, 0);
  1829. page_zip = buf_block_get_page_zip(block);
  1830. #ifdef UNIV_ZIP_DEBUG
  1831. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  1832. #endif /* UNIV_ZIP_DEBUG */
  1833. if (page_zip) {
  1834. if (!btr_cur_update_alloc_zip(
  1835. page_zip, page_cursor, index, *offsets,
  1836. new_rec_size, true, mtr)) {
  1837. return(DB_ZIP_OVERFLOW);
  1838. }
  1839. rec = page_cur_get_rec(page_cursor);
  1840. }
  1841. if (UNIV_UNLIKELY(new_rec_size
  1842. >= (page_get_free_space_of_empty(page_is_comp(page))
  1843. / 2))) {
  1844. /* We may need to update the IBUF_BITMAP_FREE
  1845. bits after a reorganize that was done in
  1846. btr_cur_update_alloc_zip(). */
  1847. err = DB_OVERFLOW;
  1848. goto func_exit;
  1849. }
  1850. if (UNIV_UNLIKELY(page_get_data_size(page)
  1851. - old_rec_size + new_rec_size
  1852. < BTR_CUR_PAGE_COMPRESS_LIMIT)) {
  1853. /* We may need to update the IBUF_BITMAP_FREE
  1854. bits after a reorganize that was done in
  1855. btr_cur_update_alloc_zip(). */
  1856. /* The page would become too empty */
  1857. err = DB_UNDERFLOW;
  1858. goto func_exit;
  1859. }
  1860. /* We do not attempt to reorganize if the page is compressed.
  1861. This is because the page may fail to compress after reorganization. */
  1862. max_size = page_zip
  1863. ? page_get_max_insert_size(page, 1)
  1864. : (old_rec_size
  1865. + page_get_max_insert_size_after_reorganize(page, 1));
  1866. if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
  1867. && (max_size >= new_rec_size))
  1868. || (page_get_n_recs(page) <= 1))) {
  1869. /* We may need to update the IBUF_BITMAP_FREE
  1870. bits after a reorganize that was done in
  1871. btr_cur_update_alloc_zip(). */
  1872. /* There was not enough space, or it did not pay to
  1873. reorganize: for simplicity, we decide what to do assuming a
  1874. reorganization is needed, though it might not be necessary */
  1875. err = DB_OVERFLOW;
  1876. goto func_exit;
  1877. }
  1878. /* Do lock checking and undo logging */
  1879. err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
  1880. update, cmpl_info,
  1881. thr, mtr, &roll_ptr);
  1882. if (err != DB_SUCCESS) {
  1883. /* We may need to update the IBUF_BITMAP_FREE
  1884. bits after a reorganize that was done in
  1885. btr_cur_update_alloc_zip(). */
  1886. goto func_exit;
  1887. }
  1888. /* Ok, we may do the replacement. Store on the page infimum the
  1889. explicit locks on rec, before deleting rec (see the comment in
  1890. btr_cur_pessimistic_update). */
  1891. lock_rec_store_on_page_infimum(block, rec);
  1892. btr_search_update_hash_on_delete(cursor);
  1893. page_cur_delete_rec(page_cursor, index, *offsets, mtr);
  1894. page_cur_move_to_prev(page_cursor);
  1895. if (!(flags & BTR_KEEP_SYS_FLAG)) {
  1896. row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
  1897. roll_ptr);
  1898. row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
  1899. trx_id);
  1900. }
  1901. /* There are no externally stored columns in new_entry */
  1902. rec = btr_cur_insert_if_possible(
  1903. cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
  1904. ut_a(rec); /* <- We calculated above the insert would fit */
  1905. /* Restore the old explicit lock state on the record */
  1906. lock_rec_restore_from_page_infimum(block, rec, block);
  1907. page_cur_move_to_next(page_cursor);
  1908. ut_ad(err == DB_SUCCESS);
  1909. func_exit:
  1910. if (page_zip
  1911. && !(flags & BTR_KEEP_IBUF_BITMAP)
  1912. && !dict_index_is_clust(index)
  1913. && page_is_leaf(page)) {
  1914. /* Update the free bits in the insert buffer. */
  1915. ibuf_update_free_bits_zip(block, mtr);
  1916. }
  1917. return(err);
  1918. }
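/* The size thresholds above, with illustrative numbers (assuming a
16 KiB uncompressed page): page_get_free_space_of_empty() is a bit
under 16 KiB, so a new record of roughly 8 KiB or more yields
DB_OVERFLOW regardless of the current free space; and if deleting the
old record and inserting the new one would leave less than
BTR_CUR_PAGE_COMPRESS_LIMIT bytes of data on the page, DB_UNDERFLOW is
returned so that the caller can merge pages instead. */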
  1919. /*************************************************************//**
  1920. If, in a split, a new supremum record was created as the predecessor of the
  1921. updated record, the supremum record must inherit exactly the locks on the
  1922. updated record. In the split it may have inherited locks from the successor
  1923. of the updated record, which is not correct. This function restores the
  1924. right locks for the new supremum. */
  1925. static
  1926. void
  1927. btr_cur_pess_upd_restore_supremum(
  1928. /*==============================*/
  1929. buf_block_t* block, /*!< in: buffer block of rec */
  1930. const rec_t* rec, /*!< in: updated record */
  1931. mtr_t* mtr) /*!< in: mtr */
  1932. {
  1933. page_t* page;
  1934. buf_block_t* prev_block;
  1935. ulint space;
  1936. ulint zip_size;
  1937. ulint prev_page_no;
  1938. page = buf_block_get_frame(block);
  1939. if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
  1940. /* Updated record is not the first user record on its page */
  1941. return;
  1942. }
  1943. space = buf_block_get_space(block);
  1944. zip_size = buf_block_get_zip_size(block);
  1945. prev_page_no = btr_page_get_prev(page, mtr);
  1946. ut_ad(prev_page_no != FIL_NULL);
  1947. prev_block = buf_page_get_with_no_latch(space, zip_size,
  1948. prev_page_no, mtr);
  1949. #ifdef UNIV_BTR_DEBUG
  1950. ut_a(btr_page_get_next(prev_block->frame, mtr)
  1951. == page_get_page_no(page));
  1952. #endif /* UNIV_BTR_DEBUG */
  1953. /* We must already have an x-latch on prev_block! */
  1954. ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
  1955. lock_rec_reset_and_inherit_gap_locks(prev_block, block,
  1956. PAGE_HEAP_NO_SUPREMUM,
  1957. page_rec_get_heap_no(rec));
  1958. }
  1959. /*************************************************************//**
  1960. Performs an update of a record on a page of a tree. It is assumed
  1961. that mtr holds an x-latch on the tree and on the cursor page. If the
  1962. update is made on the leaf level, to avoid deadlocks, mtr must also
  1963. own x-latches to brothers of page, if those brothers exist. We assume
  1964. here that the ordering fields of the record do not change.
  1965. @return DB_SUCCESS or error code */
  1966. UNIV_INTERN
  1967. dberr_t
  1968. btr_cur_pessimistic_update(
  1969. /*=======================*/
  1970. ulint flags, /*!< in: undo logging, locking, and rollback
  1971. flags */
  1972. btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
  1973. cursor may become invalid if *big_rec == NULL
  1974. || !(flags & BTR_KEEP_POS_FLAG) */
  1975. ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
  1976. mem_heap_t** offsets_heap,
  1977. /*!< in/out: pointer to memory heap
  1978. that can be emptied, or NULL */
  1979. mem_heap_t* entry_heap,
  1980. /*!< in/out: memory heap for allocating
  1981. big_rec and the index tuple */
  1982. big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
  1983. be stored externally by the caller, or NULL */
1984. const upd_t* update, /*!< in: update vector; this is allowed to also
1985. contain trx id and roll ptr fields, but
1986. the values in the update vector have no effect */
  1987. ulint cmpl_info,/*!< in: compiler info on secondary index
  1988. updates */
  1989. que_thr_t* thr, /*!< in: query thread, or NULL if
  1990. appropriate flags are set */
  1991. trx_id_t trx_id, /*!< in: transaction id */
  1992. mtr_t* mtr) /*!< in/out: mini-transaction; must be
  1993. committed before latching any further pages */
  1994. {
  1995. big_rec_t* big_rec_vec = NULL;
  1996. big_rec_t* dummy_big_rec;
  1997. dict_index_t* index;
  1998. buf_block_t* block;
  1999. page_t* page;
  2000. page_zip_des_t* page_zip;
  2001. rec_t* rec;
  2002. page_cur_t* page_cursor;
  2003. dberr_t err;
  2004. dberr_t optim_err;
  2005. roll_ptr_t roll_ptr;
  2006. ibool was_first;
  2007. ulint n_reserved = 0;
  2008. ulint n_ext;
  2009. *offsets = NULL;
  2010. *big_rec = NULL;
  2011. block = btr_cur_get_block(cursor);
  2012. page = buf_block_get_frame(block);
  2013. page_zip = buf_block_get_page_zip(block);
  2014. index = cursor->index;
  2015. ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
  2016. MTR_MEMO_X_LOCK));
  2017. ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
  2018. #ifdef UNIV_ZIP_DEBUG
  2019. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  2020. #endif /* UNIV_ZIP_DEBUG */
  2021. /* The insert buffer tree should never be updated in place. */
  2022. ut_ad(!dict_index_is_ibuf(index));
  2023. ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
  2024. || dict_index_is_clust(index));
  2025. ut_ad(thr_get_trx(thr)->id == trx_id
  2026. || (flags & ~BTR_KEEP_POS_FLAG)
  2027. == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
  2028. | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
  2029. err = optim_err = btr_cur_optimistic_update(
  2030. flags | BTR_KEEP_IBUF_BITMAP,
  2031. cursor, offsets, offsets_heap, update,
  2032. cmpl_info, thr, trx_id, mtr);
  2033. switch (err) {
  2034. case DB_ZIP_OVERFLOW:
  2035. case DB_UNDERFLOW:
  2036. case DB_OVERFLOW:
  2037. break;
  2038. default:
  2039. err_exit:
  2040. /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
  2041. For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
  2042. already reset by btr_cur_update_alloc_zip() if the
  2043. page was recompressed. */
  2044. if (page_zip
  2045. && optim_err != DB_ZIP_OVERFLOW
  2046. && !dict_index_is_clust(index)
  2047. && page_is_leaf(page)) {
  2048. ibuf_update_free_bits_zip(block, mtr);
  2049. }
  2050. return(err);
  2051. }
  2052. /* Do lock checking and undo logging */
  2053. err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
  2054. update, cmpl_info,
  2055. thr, mtr, &roll_ptr);
  2056. if (err != DB_SUCCESS) {
  2057. goto err_exit;
  2058. }
  2059. if (optim_err == DB_OVERFLOW) {
  2060. ulint reserve_flag;
  2061. /* First reserve enough free space for the file segments
  2062. of the index tree, so that the update will not fail because
  2063. of lack of space */
  2064. ulint n_extents = cursor->tree_height / 16 + 3;
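/* Example: with tree_height == 3 this reserves 3/16 + 3 = 3 extents;
a tree of height 32 would reserve 32/16 + 3 = 5. */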
  2065. if (flags & BTR_NO_UNDO_LOG_FLAG) {
  2066. reserve_flag = FSP_CLEANING;
  2067. } else {
  2068. reserve_flag = FSP_NORMAL;
  2069. }
  2070. if (!fsp_reserve_free_extents(&n_reserved, index->space,
  2071. n_extents, reserve_flag, mtr)) {
  2072. err = DB_OUT_OF_FILE_SPACE;
  2073. goto err_exit;
  2074. }
  2075. }
  2076. rec = btr_cur_get_rec(cursor);
  2077. *offsets = rec_get_offsets(
  2078. rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
  2079. dtuple_t* new_entry = row_rec_to_index_entry(
  2080. rec, index, *offsets, &n_ext, entry_heap);
  2081. /* The page containing the clustered index record
  2082. corresponding to new_entry is latched in mtr. If the
  2083. clustered index record is delete-marked, then its externally
  2084. stored fields cannot have been purged yet, because then the
  2085. purge would also have removed the clustered index record
  2086. itself. Thus the following call is safe. */
  2087. row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
  2088. FALSE, entry_heap);
  2089. if (!(flags & BTR_KEEP_SYS_FLAG)) {
  2090. row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
  2091. roll_ptr);
  2092. row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
  2093. trx_id);
  2094. }
  2095. if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(*offsets)) {
  2096. /* We are in a transaction rollback undoing a row
  2097. update: we must free possible externally stored fields
  2098. which got new values in the update, if they are not
  2099. inherited values. They can be inherited if we have
  2100. updated the primary key to another value, and then
2101. updated it back again. */
  2102. ut_ad(big_rec_vec == NULL);
  2103. btr_rec_free_updated_extern_fields(
  2104. index, rec, page_zip, *offsets, update,
  2105. trx_is_recv(thr_get_trx(thr))
  2106. ? RB_RECOVERY : RB_NORMAL, mtr);
  2107. }
  2108. /* We have to set appropriate extern storage bits in the new
  2109. record to be inserted: we have to remember which fields were such */
  2110. ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
  2111. ut_ad(rec_offs_validate(rec, index, *offsets));
  2112. n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap);
  2113. if (page_zip) {
  2114. ut_ad(page_is_comp(page));
  2115. if (page_zip_rec_needs_ext(
  2116. rec_get_converted_size(index, new_entry, n_ext),
  2117. TRUE,
  2118. dict_index_get_n_fields(index),
  2119. page_zip_get_size(page_zip))) {
  2120. goto make_external;
  2121. }
  2122. } else if (page_zip_rec_needs_ext(
  2123. rec_get_converted_size(index, new_entry, n_ext),
  2124. page_is_comp(page), 0, 0)) {
  2125. make_external:
  2126. big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
  2127. if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
  2128. /* We cannot goto return_after_reservations,
  2129. because we may need to update the
  2130. IBUF_BITMAP_FREE bits, which was suppressed by
  2131. BTR_KEEP_IBUF_BITMAP. */
  2132. #ifdef UNIV_ZIP_DEBUG
  2133. ut_a(!page_zip
  2134. || page_zip_validate(page_zip, page, index));
  2135. #endif /* UNIV_ZIP_DEBUG */
  2136. if (n_reserved > 0) {
  2137. fil_space_release_free_extents(
  2138. index->space, n_reserved);
  2139. }
  2140. err = DB_TOO_BIG_RECORD;
  2141. goto err_exit;
  2142. }
  2143. ut_ad(page_is_leaf(page));
  2144. ut_ad(dict_index_is_clust(index));
  2145. ut_ad(flags & BTR_KEEP_POS_FLAG);
  2146. }
  2147. /* Store state of explicit locks on rec on the page infimum record,
  2148. before deleting rec. The page infimum acts as a dummy carrier of the
  2149. locks, taking care also of lock releases, before we can move the locks
2150. back on the actual record. There is a special case: the insert may
2151. happen on the root page and cause a call of
2152. btr_root_raise_and_insert. Therefore we cannot, in the lock system,
2153. delete the lock structs set on the root page even if the root
2154. page carries just node pointers.
  2155. lock_rec_store_on_page_infimum(block, rec);
  2156. btr_search_update_hash_on_delete(cursor);
  2157. #ifdef UNIV_ZIP_DEBUG
  2158. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  2159. #endif /* UNIV_ZIP_DEBUG */
  2160. page_cursor = btr_cur_get_page_cur(cursor);
  2161. page_cur_delete_rec(page_cursor, index, *offsets, mtr);
  2162. page_cur_move_to_prev(page_cursor);
  2163. rec = btr_cur_insert_if_possible(cursor, new_entry,
  2164. offsets, offsets_heap, n_ext, mtr);
  2165. if (rec) {
  2166. page_cursor->rec = rec;
  2167. lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
  2168. rec, block);
  2169. if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
  2170. /* The new inserted record owns its possible externally
  2171. stored fields */
  2172. btr_cur_unmark_extern_fields(
  2173. page_zip, rec, index, *offsets, mtr);
  2174. }
  2175. bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
  2176. if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
  2177. if (adjust) {
  2178. rec_offs_make_valid(
  2179. page_cursor->rec, index, *offsets);
  2180. }
2181. } else if (page_zip
2182. && !dict_index_is_clust(index)
2183. && page_is_leaf(page)) {
  2184. /* Update the free bits in the insert buffer.
  2185. This is the same block which was skipped by
  2186. BTR_KEEP_IBUF_BITMAP. */
  2187. ibuf_update_free_bits_zip(block, mtr);
  2188. }
  2189. err = DB_SUCCESS;
  2190. goto return_after_reservations;
  2191. } else {
  2192. /* If the page is compressed and it initially
  2193. compresses very well, and there is a subsequent insert
  2194. of a badly-compressing record, it is possible for
  2195. btr_cur_optimistic_update() to return DB_UNDERFLOW and
2196. btr_cur_insert_if_possible() to fail (return NULL). */
  2197. ut_a(page_zip || optim_err != DB_UNDERFLOW);
  2198. /* Out of space: reset the free bits.
  2199. This is the same block which was skipped by
  2200. BTR_KEEP_IBUF_BITMAP. */
  2201. if (!dict_index_is_clust(index) && page_is_leaf(page)) {
  2202. ibuf_reset_free_bits(block);
  2203. }
  2204. }
  2205. if (big_rec_vec) {
  2206. ut_ad(page_is_leaf(page));
  2207. ut_ad(dict_index_is_clust(index));
  2208. ut_ad(flags & BTR_KEEP_POS_FLAG);
  2209. /* btr_page_split_and_insert() in
  2210. btr_cur_pessimistic_insert() invokes
  2211. mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK).
  2212. We must keep the index->lock when we created a
  2213. big_rec, so that row_upd_clust_rec() can store the
  2214. big_rec in the same mini-transaction. */
  2215. mtr_x_lock(dict_index_get_lock(index), mtr);
  2216. }
  2217. /* Was the record to be updated positioned as the first user
  2218. record on its page? */
  2219. was_first = page_cur_is_before_first(page_cursor);
  2220. /* Lock checks and undo logging were already performed by
  2221. btr_cur_upd_lock_and_undo(). We do not try
  2222. btr_cur_optimistic_insert() because
  2223. btr_cur_insert_if_possible() already failed above. */
  2224. err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
  2225. | BTR_NO_LOCKING_FLAG
  2226. | BTR_KEEP_SYS_FLAG,
  2227. cursor, offsets, offsets_heap,
  2228. new_entry, &rec,
  2229. &dummy_big_rec, n_ext, NULL, mtr);
  2230. ut_a(rec);
  2231. ut_a(err == DB_SUCCESS);
  2232. ut_a(dummy_big_rec == NULL);
  2233. ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
  2234. page_cursor->rec = rec;
  2235. if (dict_index_is_sec_or_ibuf(index)) {
  2236. /* Update PAGE_MAX_TRX_ID in the index page header.
  2237. It was not updated by btr_cur_pessimistic_insert()
  2238. because of BTR_NO_LOCKING_FLAG. */
  2239. buf_block_t* rec_block;
  2240. rec_block = btr_cur_get_block(cursor);
  2241. page_update_max_trx_id(rec_block,
  2242. buf_block_get_page_zip(rec_block),
  2243. trx_id, mtr);
  2244. }
  2245. if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
  2246. /* The new inserted record owns its possible externally
  2247. stored fields */
  2248. buf_block_t* rec_block = btr_cur_get_block(cursor);
  2249. #ifdef UNIV_ZIP_DEBUG
  2250. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  2251. page = buf_block_get_frame(rec_block);
  2252. #endif /* UNIV_ZIP_DEBUG */
  2253. page_zip = buf_block_get_page_zip(rec_block);
  2254. btr_cur_unmark_extern_fields(page_zip,
  2255. rec, index, *offsets, mtr);
  2256. }
  2257. lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
  2258. rec, block);
  2259. /* If necessary, restore also the correct lock state for a new,
  2260. preceding supremum record created in a page split. While the old
  2261. record was nonexistent, the supremum might have inherited its locks
  2262. from a wrong record. */
  2263. if (!was_first) {
  2264. btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
  2265. rec, mtr);
  2266. }
  2267. return_after_reservations:
  2268. #ifdef UNIV_ZIP_DEBUG
  2269. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  2270. #endif /* UNIV_ZIP_DEBUG */
  2271. if (n_reserved > 0) {
  2272. fil_space_release_free_extents(index->space, n_reserved);
  2273. }
  2274. *big_rec = big_rec_vec;
  2275. return(err);
  2276. }
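/* Caller contract, for reference: when *big_rec is returned non-NULL,
the caller (e.g. row_upd_clust_rec()) must write the externally stored
fields, typically via btr_store_big_rec_extern_fields(), while still
holding index->lock and the leaf page latch in the same
mini-transaction; see the comment above where the index lock is
re-acquired for the big_rec case. */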
  2277. /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
  2278. /****************************************************************//**
  2279. Writes the redo log record for delete marking or unmarking of an index
  2280. record. */
  2281. UNIV_INLINE
  2282. void
  2283. btr_cur_del_mark_set_clust_rec_log(
  2284. /*===============================*/
  2285. rec_t* rec, /*!< in: record */
  2286. dict_index_t* index, /*!< in: index of the record */
  2287. trx_id_t trx_id, /*!< in: transaction id */
  2288. roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
  2289. mtr_t* mtr) /*!< in: mtr */
  2290. {
  2291. byte* log_ptr;
  2292. ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
  2293. log_ptr = mlog_open_and_write_index(mtr, rec, index,
  2294. page_rec_is_comp(rec)
  2295. ? MLOG_COMP_REC_CLUST_DELETE_MARK
  2296. : MLOG_REC_CLUST_DELETE_MARK,
  2297. 1 + 1 + DATA_ROLL_PTR_LEN
  2298. + 14 + 2);
  2299. if (!log_ptr) {
  2300. /* Logging in mtr is switched off during crash recovery */
  2301. return;
  2302. }
  2303. *log_ptr++ = 0;
  2304. *log_ptr++ = 1;
  2305. log_ptr = row_upd_write_sys_vals_to_log(
  2306. index, trx_id, roll_ptr, log_ptr, mtr);
  2307. mach_write_to_2(log_ptr, page_offset(rec));
  2308. log_ptr += 2;
  2309. mlog_close(mtr, log_ptr);
  2310. }
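/* For reference, the redo record body written above is: 1 flags byte
(always 0 here), 1 byte for the delete-mark value (always 1 here), the
system columns written by row_upd_write_sys_vals_to_log(), and 2 bytes
for the page offset of rec; btr_cur_parse_del_mark_set_clust_rec()
below reads the fields back in the same order. */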
  2311. #endif /* !UNIV_HOTBACKUP */
  2312. /****************************************************************//**
  2313. Parses the redo log record for delete marking or unmarking of a clustered
  2314. index record.
  2315. @return end of log record or NULL */
  2316. UNIV_INTERN
  2317. byte*
  2318. btr_cur_parse_del_mark_set_clust_rec(
  2319. /*=================================*/
  2320. byte* ptr, /*!< in: buffer */
  2321. byte* end_ptr,/*!< in: buffer end */
  2322. page_t* page, /*!< in/out: page or NULL */
  2323. page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
  2324. dict_index_t* index) /*!< in: index corresponding to page */
  2325. {
  2326. ulint flags;
  2327. ulint val;
  2328. ulint pos;
  2329. trx_id_t trx_id;
  2330. roll_ptr_t roll_ptr;
  2331. ulint offset;
  2332. rec_t* rec;
  2333. ut_ad(!page
  2334. || !!page_is_comp(page) == dict_table_is_comp(index->table));
  2335. if (end_ptr < ptr + 2) {
  2336. return(NULL);
  2337. }
  2338. flags = mach_read_from_1(ptr);
  2339. ptr++;
  2340. val = mach_read_from_1(ptr);
  2341. ptr++;
  2342. ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
  2343. if (ptr == NULL) {
  2344. return(NULL);
  2345. }
  2346. if (end_ptr < ptr + 2) {
  2347. return(NULL);
  2348. }
  2349. offset = mach_read_from_2(ptr);
  2350. ptr += 2;
  2351. ut_a(offset <= UNIV_PAGE_SIZE);
  2352. if (page) {
  2353. rec = page + offset;
  2354. /* We do not need to reserve btr_search_latch, as the page
  2355. is only being recovered, and there cannot be a hash index to
  2356. it. Besides, these fields are being updated in place
  2357. and the adaptive hash index does not depend on them. */
  2358. btr_rec_set_deleted_flag(rec, page_zip, val);
  2359. if (!(flags & BTR_KEEP_SYS_FLAG)) {
  2360. mem_heap_t* heap = NULL;
  2361. ulint offsets_[REC_OFFS_NORMAL_SIZE];
  2362. rec_offs_init(offsets_);
  2363. row_upd_rec_sys_fields_in_recovery(
  2364. rec, page_zip,
  2365. rec_get_offsets(rec, index, offsets_,
  2366. ULINT_UNDEFINED, &heap),
  2367. pos, trx_id, roll_ptr);
  2368. if (UNIV_LIKELY_NULL(heap)) {
  2369. mem_heap_free(heap);
  2370. }
  2371. }
  2372. }
  2373. return(ptr);
  2374. }
  2375. #ifndef UNIV_HOTBACKUP
  2376. /***********************************************************//**
2377. Marks a clustered index record deleted. Writes an undo log record for
2378. this delete marking. Writes in the trx id field the id
2379. of the deleting transaction, and in the roll ptr field a pointer to the
2380. undo log record created.
  2381. @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
  2382. UNIV_INTERN
  2383. dberr_t
  2384. btr_cur_del_mark_set_clust_rec(
  2385. /*===========================*/
  2386. buf_block_t* block, /*!< in/out: buffer block of the record */
  2387. rec_t* rec, /*!< in/out: record */
  2388. dict_index_t* index, /*!< in: clustered index of the record */
  2389. const ulint* offsets,/*!< in: rec_get_offsets(rec) */
  2390. que_thr_t* thr, /*!< in: query thread */
  2391. mtr_t* mtr) /*!< in/out: mini-transaction */
  2392. {
  2393. roll_ptr_t roll_ptr;
  2394. dberr_t err;
  2395. page_zip_des_t* page_zip;
  2396. trx_t* trx;
  2397. ut_ad(dict_index_is_clust(index));
  2398. ut_ad(rec_offs_validate(rec, index, offsets));
  2399. ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
  2400. ut_ad(buf_block_get_frame(block) == page_align(rec));
  2401. ut_ad(page_is_leaf(page_align(rec)));
  2402. #ifdef UNIV_DEBUG
  2403. if (btr_cur_print_record_ops && thr) {
  2404. btr_cur_trx_report(thr_get_trx(thr)->id, index, "del mark ");
  2405. rec_print_new(stderr, rec, offsets);
  2406. }
  2407. #endif /* UNIV_DEBUG */
  2408. ut_ad(dict_index_is_clust(index));
  2409. ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
  2410. err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
  2411. rec, index, offsets, thr);
  2412. if (err != DB_SUCCESS) {
  2413. return(err);
  2414. }
  2415. err = trx_undo_report_row_operation(0, TRX_UNDO_MODIFY_OP, thr,
  2416. index, NULL, NULL, 0, rec, offsets,
  2417. &roll_ptr);
  2418. if (err != DB_SUCCESS) {
  2419. return(err);
  2420. }
  2421. /* The btr_search_latch is not needed here, because
  2422. the adaptive hash index does not depend on the delete-mark
  2423. and the delete-mark is being updated in place. */
  2424. page_zip = buf_block_get_page_zip(block);
  2425. btr_blob_dbg_set_deleted_flag(rec, index, offsets, TRUE);
  2426. btr_rec_set_deleted_flag(rec, page_zip, TRUE);
  2427. trx = thr_get_trx(thr);
  2428. if (dict_index_is_online_ddl(index)) {
  2429. row_log_table_delete(
  2430. rec, index, offsets, false,
  2431. trx_read_trx_id(row_get_trx_id_offset(index, offsets)
  2432. + rec));
  2433. }
  2434. row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
  2435. btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
  2436. roll_ptr, mtr);
  2437. return(err);
  2438. }
  2439. /****************************************************************//**
  2440. Writes the redo log record for a delete mark setting of a secondary
  2441. index record. */
  2442. UNIV_INLINE
  2443. void
  2444. btr_cur_del_mark_set_sec_rec_log(
  2445. /*=============================*/
  2446. rec_t* rec, /*!< in: record */
  2447. ibool val, /*!< in: value to set */
  2448. mtr_t* mtr) /*!< in: mtr */
  2449. {
  2450. byte* log_ptr;
  2451. ut_ad(val <= 1);
  2452. log_ptr = mlog_open(mtr, 11 + 1 + 2);
  2453. if (!log_ptr) {
  2454. /* Logging in mtr is switched off during crash recovery:
  2455. in that case mlog_open returns NULL */
  2456. return;
  2457. }
  2458. log_ptr = mlog_write_initial_log_record_fast(
  2459. rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
  2460. mach_write_to_1(log_ptr, val);
  2461. log_ptr++;
  2462. mach_write_to_2(log_ptr, page_offset(rec));
  2463. log_ptr += 2;
  2464. mlog_close(mtr, log_ptr);
  2465. }
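/* The MLOG_REC_SEC_DELETE_MARK body is minimal: 1 byte for the value
and 2 bytes for the page offset, which is why the parser below only
requires 3 bytes beyond the initial log record header. */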
  2466. #endif /* !UNIV_HOTBACKUP */
  2467. /****************************************************************//**
  2468. Parses the redo log record for delete marking or unmarking of a secondary
  2469. index record.
  2470. @return end of log record or NULL */
  2471. UNIV_INTERN
  2472. byte*
  2473. btr_cur_parse_del_mark_set_sec_rec(
  2474. /*===============================*/
  2475. byte* ptr, /*!< in: buffer */
  2476. byte* end_ptr,/*!< in: buffer end */
  2477. page_t* page, /*!< in/out: page or NULL */
  2478. page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
  2479. {
  2480. ulint val;
  2481. ulint offset;
  2482. rec_t* rec;
  2483. if (end_ptr < ptr + 3) {
  2484. return(NULL);
  2485. }
  2486. val = mach_read_from_1(ptr);
  2487. ptr++;
  2488. offset = mach_read_from_2(ptr);
  2489. ptr += 2;
  2490. ut_a(offset <= UNIV_PAGE_SIZE);
  2491. if (page) {
  2492. rec = page + offset;
  2493. /* We do not need to reserve btr_search_latch, as the page
  2494. is only being recovered, and there cannot be a hash index to
  2495. it. Besides, the delete-mark flag is being updated in place
  2496. and the adaptive hash index does not depend on it. */
  2497. btr_rec_set_deleted_flag(rec, page_zip, val);
  2498. }
  2499. return(ptr);
  2500. }
  2501. #ifndef UNIV_HOTBACKUP
  2502. /***********************************************************//**
  2503. Sets a secondary index record delete mark to TRUE or FALSE.
  2504. @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
  2505. UNIV_INTERN
  2506. dberr_t
  2507. btr_cur_del_mark_set_sec_rec(
  2508. /*=========================*/
  2509. ulint flags, /*!< in: locking flag */
  2510. btr_cur_t* cursor, /*!< in: cursor */
  2511. ibool val, /*!< in: value to set */
  2512. que_thr_t* thr, /*!< in: query thread */
  2513. mtr_t* mtr) /*!< in/out: mini-transaction */
  2514. {
  2515. buf_block_t* block;
  2516. rec_t* rec;
  2517. dberr_t err;
  2518. block = btr_cur_get_block(cursor);
  2519. rec = btr_cur_get_rec(cursor);
  2520. #ifdef UNIV_DEBUG
  2521. if (btr_cur_print_record_ops && thr) {
  2522. btr_cur_trx_report(thr_get_trx(thr)->id, cursor->index,
  2523. "del mark ");
  2524. rec_print(stderr, rec, cursor->index);
  2525. }
  2526. #endif /* UNIV_DEBUG */
  2527. err = lock_sec_rec_modify_check_and_lock(flags,
  2528. btr_cur_get_block(cursor),
  2529. rec, cursor->index, thr, mtr);
  2530. if (err != DB_SUCCESS) {
  2531. return(err);
  2532. }
  2533. ut_ad(!!page_rec_is_comp(rec)
  2534. == dict_table_is_comp(cursor->index->table));
  2535. /* We do not need to reserve btr_search_latch, as the
  2536. delete-mark flag is being updated in place and the adaptive
  2537. hash index does not depend on it. */
  2538. btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
  2539. btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
  2540. return(DB_SUCCESS);
  2541. }
  2542. /***********************************************************//**
  2543. Sets a secondary index record's delete mark to the given value. This
  2544. function is only used by the insert buffer merge mechanism. */
  2545. UNIV_INTERN
  2546. void
  2547. btr_cur_set_deleted_flag_for_ibuf(
  2548. /*==============================*/
  2549. rec_t* rec, /*!< in/out: record */
  2550. page_zip_des_t* page_zip, /*!< in/out: compressed page
  2551. corresponding to rec, or NULL
  2552. when the tablespace is
  2553. uncompressed */
  2554. ibool val, /*!< in: value to set */
  2555. mtr_t* mtr) /*!< in/out: mini-transaction */
  2556. {
  2557. /* We do not need to reserve btr_search_latch, as the page
  2558. has just been read to the buffer pool and there cannot be
  2559. a hash index to it. Besides, the delete-mark flag is being
  2560. updated in place and the adaptive hash index does not depend
  2561. on it. */
  2562. btr_rec_set_deleted_flag(rec, page_zip, val);
  2563. btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
  2564. }
  2565. /*==================== B-TREE RECORD REMOVE =========================*/
  2566. /*************************************************************//**
  2567. Tries to compress a page of the tree if it seems useful. It is assumed
  2568. that mtr holds an x-latch on the tree and on the cursor page. To avoid
  2569. deadlocks, mtr must also own x-latches to brothers of page, if those
  2570. brothers exist. NOTE: it is assumed that the caller has reserved enough
  2571. free extents so that the compression will always succeed if done!
  2572. @return TRUE if compression occurred */
  2573. UNIV_INTERN
  2574. ibool
  2575. btr_cur_compress_if_useful(
  2576. /*=======================*/
  2577. btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
  2578. cursor does not stay valid if !adjust and
  2579. compression occurs */
  2580. ibool adjust, /*!< in: TRUE if should adjust the
  2581. cursor position even if compression occurs */
  2582. mtr_t* mtr) /*!< in/out: mini-transaction */
  2583. {
  2584. ut_ad(mtr_memo_contains(mtr,
  2585. dict_index_get_lock(btr_cur_get_index(cursor)),
  2586. MTR_MEMO_X_LOCK));
  2587. ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
  2588. MTR_MEMO_PAGE_X_FIX));
  2589. return(btr_cur_compress_recommendation(cursor, mtr)
  2590. && btr_compress(cursor, adjust, mtr));
  2591. }
  2592. /*******************************************************//**
  2593. Removes the record on which the tree cursor is positioned on a leaf page.
  2594. It is assumed that the mtr has an x-latch on the page where the cursor is
  2595. positioned, but no latch on the whole tree.
  2596. @return TRUE if success, i.e., the page did not become too empty */
  2597. UNIV_INTERN
  2598. ibool
  2599. btr_cur_optimistic_delete_func(
  2600. /*===========================*/
  2601. btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
  2602. delete; cursor stays valid: if deletion
  2603. succeeds, on function exit it points to the
  2604. successor of the deleted record */
  2605. #ifdef UNIV_DEBUG
  2606. ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
  2607. #endif /* UNIV_DEBUG */
  2608. mtr_t* mtr) /*!< in: mtr; if this function returns
  2609. TRUE on a leaf page of a secondary
  2610. index, the mtr must be committed
  2611. before latching any further pages */
  2612. {
  2613. buf_block_t* block;
  2614. rec_t* rec;
  2615. mem_heap_t* heap = NULL;
  2616. ulint offsets_[REC_OFFS_NORMAL_SIZE];
  2617. ulint* offsets = offsets_;
  2618. ibool no_compress_needed;
  2619. rec_offs_init(offsets_);
  2620. ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
  2621. ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
  2622. MTR_MEMO_PAGE_X_FIX));
  2623. /* This is intended only for leaf page deletions */
  2624. block = btr_cur_get_block(cursor);
  2625. ut_ad(page_is_leaf(buf_block_get_frame(block)));
  2626. ut_ad(!dict_index_is_online_ddl(cursor->index)
  2627. || dict_index_is_clust(cursor->index)
  2628. || (flags & BTR_CREATE_FLAG));
  2629. rec = btr_cur_get_rec(cursor);
  2630. offsets = rec_get_offsets(rec, cursor->index, offsets,
  2631. ULINT_UNDEFINED, &heap);
  2632. no_compress_needed = !rec_offs_any_extern(offsets)
  2633. && btr_cur_can_delete_without_compress(
  2634. cursor, rec_offs_size(offsets), mtr);
  2635. if (no_compress_needed) {
  2636. page_t* page = buf_block_get_frame(block);
2637. page_zip_des_t* page_zip = buf_block_get_page_zip(block);
  2638. lock_update_delete(block, rec);
  2639. btr_search_update_hash_on_delete(cursor);
  2640. if (page_zip) {
  2641. #ifdef UNIV_ZIP_DEBUG
  2642. ut_a(page_zip_validate(page_zip, page, cursor->index));
  2643. #endif /* UNIV_ZIP_DEBUG */
  2644. page_cur_delete_rec(btr_cur_get_page_cur(cursor),
  2645. cursor->index, offsets, mtr);
  2646. #ifdef UNIV_ZIP_DEBUG
  2647. ut_a(page_zip_validate(page_zip, page, cursor->index));
  2648. #endif /* UNIV_ZIP_DEBUG */
  2649. /* On compressed pages, the IBUF_BITMAP_FREE
  2650. space is not affected by deleting (purging)
  2651. records, because it is defined as the minimum
  2652. of space available *without* reorganize, and
  2653. space available in the modification log. */
  2654. } else {
  2655. const ulint max_ins
  2656. = page_get_max_insert_size_after_reorganize(
  2657. page, 1);
  2658. page_cur_delete_rec(btr_cur_get_page_cur(cursor),
  2659. cursor->index, offsets, mtr);
  2660. /* The change buffer does not handle inserts
  2661. into non-leaf pages, into clustered indexes,
  2662. or into the change buffer. */
  2663. if (page_is_leaf(page)
  2664. && !dict_index_is_clust(cursor->index)
  2665. && !dict_index_is_ibuf(cursor->index)) {
  2666. ibuf_update_free_bits_low(block, max_ins, mtr);
  2667. }
  2668. }
  2669. }
  2670. if (UNIV_LIKELY_NULL(heap)) {
  2671. mem_heap_free(heap);
  2672. }
  2673. return(no_compress_needed);
  2674. }
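/* Illustrative usage (a sketch; callers normally go through the
btr_cur_optimistic_delete() wrapper in btr0cur.h, which hides the
UNIV_DEBUG-only flags parameter): if this function returns FALSE, the
record was NOT removed, and the caller must retry with
btr_cur_pessimistic_delete() while holding an x-latch on the tree. */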
  2675. /*************************************************************//**
  2676. Removes the record on which the tree cursor is positioned. Tries
  2677. to compress the page if its fillfactor drops below a threshold
  2678. or if it is the only page on the level. It is assumed that mtr holds
  2679. an x-latch on the tree and on the cursor page. To avoid deadlocks,
  2680. mtr must also own x-latches to brothers of page, if those brothers
  2681. exist.
  2682. @return TRUE if compression occurred */
  2683. UNIV_INTERN
  2684. ibool
  2685. btr_cur_pessimistic_delete(
  2686. /*=======================*/
  2687. dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
  2688. the latter may occur because we may have
  2689. to update node pointers on upper levels,
  2690. and in the case of variable length keys
  2691. these may actually grow in size */
  2692. ibool has_reserved_extents, /*!< in: TRUE if the
  2693. caller has already reserved enough free
  2694. extents so that he knows that the operation
  2695. will succeed */
  2696. btr_cur_t* cursor, /*!< in: cursor on the record to delete;
  2697. if compression does not occur, the cursor
  2698. stays valid: it points to successor of
  2699. deleted record on function exit */
  2700. ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
  2701. enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
  2702. mtr_t* mtr) /*!< in: mtr */
  2703. {
  2704. buf_block_t* block;
  2705. page_t* page;
  2706. page_zip_des_t* page_zip;
  2707. dict_index_t* index;
  2708. rec_t* rec;
  2709. ulint n_reserved = 0;
  2710. ibool success;
  2711. ibool ret = FALSE;
  2712. ulint level;
  2713. mem_heap_t* heap;
  2714. ulint* offsets;
  2715. block = btr_cur_get_block(cursor);
  2716. page = buf_block_get_frame(block);
  2717. index = btr_cur_get_index(cursor);
  2718. ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
  2719. ut_ad(!dict_index_is_online_ddl(index)
  2720. || dict_index_is_clust(index)
  2721. || (flags & BTR_CREATE_FLAG));
  2722. ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
  2723. MTR_MEMO_X_LOCK));
  2724. ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
  2725. if (!has_reserved_extents) {
  2726. /* First reserve enough free space for the file segments
  2727. of the index tree, so that the node pointer updates will
  2728. not fail because of lack of space */
  2729. ulint n_extents = cursor->tree_height / 32 + 1;
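/* Example: tree_height == 3 reserves 3/32 + 1 = 1 extent. */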
  2730. success = fsp_reserve_free_extents(&n_reserved,
  2731. index->space,
  2732. n_extents,
  2733. FSP_CLEANING, mtr);
  2734. if (!success) {
  2735. *err = DB_OUT_OF_FILE_SPACE;
  2736. return(FALSE);
  2737. }
  2738. }
  2739. heap = mem_heap_create(1024);
  2740. rec = btr_cur_get_rec(cursor);
  2741. page_zip = buf_block_get_page_zip(block);
  2742. #ifdef UNIV_ZIP_DEBUG
  2743. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  2744. #endif /* UNIV_ZIP_DEBUG */
  2745. offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
  2746. if (rec_offs_any_extern(offsets)) {
  2747. btr_rec_free_externally_stored_fields(index,
  2748. rec, offsets, page_zip,
  2749. rb_ctx, mtr);
  2750. #ifdef UNIV_ZIP_DEBUG
  2751. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  2752. #endif /* UNIV_ZIP_DEBUG */
  2753. }
  2754. if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
  2755. && UNIV_UNLIKELY(dict_index_get_page(index)
  2756. != buf_block_get_page_no(block))) {
  2757. /* If there is only one record, drop the whole page in
  2758. btr_discard_page, if this is not the root page */
  2759. btr_discard_page(cursor, mtr);
  2760. ret = TRUE;
  2761. goto return_after_reservations;
  2762. }
  2763. if (flags == 0) {
  2764. lock_update_delete(block, rec);
  2765. }
  2766. level = btr_page_get_level(page, mtr);
  2767. if (level > 0
  2768. && UNIV_UNLIKELY(rec == page_rec_get_next(
  2769. page_get_infimum_rec(page)))) {
  2770. rec_t* next_rec = page_rec_get_next(rec);
  2771. if (btr_page_get_prev(page, mtr) == FIL_NULL) {
  2772. /* If we delete the leftmost node pointer on a
  2773. non-leaf level, we must mark the new leftmost node
  2774. pointer as the predefined minimum record */
  2775. /* This will make page_zip_validate() fail until
  2776. page_cur_delete_rec() completes. This is harmless,
  2777. because everything will take place within a single
  2778. mini-transaction and because writing to the redo log
  2779. is an atomic operation (performed by mtr_commit()). */
  2780. btr_set_min_rec_mark(next_rec, mtr);
  2781. } else {
  2782. /* Otherwise, if we delete the leftmost node pointer
  2783. on a page, we have to change the father node pointer
  2784. so that it is equal to the new leftmost node pointer
  2785. on the page */
  2786. btr_node_ptr_delete(index, block, mtr);
  2787. dtuple_t* node_ptr = dict_index_build_node_ptr(
  2788. index, next_rec, buf_block_get_page_no(block),
  2789. heap, level);
  2790. btr_insert_on_non_leaf_level(
  2791. flags, index, level + 1, node_ptr, mtr);
  2792. }
  2793. }
  2794. btr_search_update_hash_on_delete(cursor);
  2795. page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
  2796. #ifdef UNIV_ZIP_DEBUG
  2797. ut_a(!page_zip || page_zip_validate(page_zip, page, index));
  2798. #endif /* UNIV_ZIP_DEBUG */
  2799. ut_ad(btr_check_node_ptr(index, block, mtr));
  2800. return_after_reservations:
  2801. *err = DB_SUCCESS;
  2802. mem_heap_free(heap);
  2803. if (ret == FALSE) {
  2804. ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
  2805. }
  2806. if (n_reserved > 0) {
  2807. fil_space_release_free_extents(index->space, n_reserved);
  2808. }
  2809. return(ret);
  2810. }
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height)	/*!< in: root node height in tree */
{
	btr_path_t*	slot;
	const rec_t*	rec;
	const page_t*	page;

	ut_a(cursor->path_arr);

	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
		/* Do nothing; return empty path */

		slot = cursor->path_arr;
		slot->nth_rec = ULINT_UNDEFINED;

		return;
	}

	if (height == 0) {
		/* Mark end of slots for path */
		slot = cursor->path_arr + root_height + 1;
		slot->nth_rec = ULINT_UNDEFINED;
	}

	rec = btr_cur_get_rec(cursor);

	slot = cursor->path_arr + (root_height - height);

	page = page_align(rec);

	slot->nth_rec = page_rec_get_n_recs_before(rec);
	slot->n_recs = page_get_n_recs(page);
	slot->page_no = page_get_page_no(page);
	slot->page_level = btr_page_get_level_low(page);
}
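
/* Illustration of the slot indexing above (a hedged sketch with
made-up numbers, derived only from the code in this file): assume
root_height == 2, i.e. a three-level tree.  During the descent the
slots are filled as

	page visited	height	slot index (root_height - height)
	root		2	0
	intermediate	1	1
	leaf		0	2

and when the leaf is reached (height == 0), slot 3 (root_height + 1)
is additionally marked with nth_rec == ULINT_UNDEFINED; that is the
end-of-path marker which btr_estimate_n_rows_in_range() below relies
on. */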
/*******************************************************************//**
Estimate the number of rows between slot1 and slot2 for any level on a
B-tree. This function starts from slot1->page and reads a few pages to
the right, counting their records. If we reach slot2->page quickly then
we know exactly how many records there are between slot1 and slot2 and
we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
then we calculate the average number of records in the pages scanned
so far and assume that all pages that we did not scan up to slot2->page
contain the same number of records, then we multiply that average by
the number of pages between slot1->page and slot2->page (which is
n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
@return number of rows (exact or estimated) */
static
ib_int64_t
btr_estimate_n_rows_in_range_on_level(
/*==================================*/
	dict_index_t*	index,			/*!< in: index */
	btr_path_t*	slot1,			/*!< in: left border */
	btr_path_t*	slot2,			/*!< in: right border */
	ib_int64_t	n_rows_on_prev_level,	/*!< in: number of rows
						on the previous level for the
						same descend paths; used to
						determine the number of pages
						on this level */
	ibool*		is_n_rows_exact)	/*!< out: TRUE if the returned
						value is exact i.e. not an
						estimation */
{
	ulint		space;
	ib_int64_t	n_rows;
	ulint		n_pages_read;
	ulint		page_no;
	ulint		zip_size;
	ulint		level;

	space = dict_index_get_space(index);

	n_rows = 0;
	n_pages_read = 0;

	/* Assume by default that we will scan all pages between
	slot1->page_no and slot2->page_no */
	*is_n_rows_exact = TRUE;

	/* add records from slot1->page_no which are to the right of
	the record which serves as a left border of the range, if any */
	if (slot1->nth_rec < slot1->n_recs) {
		n_rows += slot1->n_recs - slot1->nth_rec;
	}

	/* add records from slot2->page_no which are to the left of
	the record which serves as a right border of the range, if any */
	if (slot2->nth_rec > 1) {
		n_rows += slot2->nth_rec - 1;
	}

	/* count the records in the pages between slot1->page_no and
	slot2->page_no (non inclusive), if any */

	zip_size = fil_space_get_zip_size(space);

	/* Do not read more than this number of pages in order not to hurt
	performance with this code which is just an estimation. If we read
	this many pages before reaching slot2->page_no then we estimate the
	average from the pages scanned so far */
# define N_PAGES_READ_LIMIT	10

	page_no = slot1->page_no;
	level = slot1->page_level;

	do {
		mtr_t		mtr;
		page_t*		page;
		buf_block_t*	block;

		mtr_start(&mtr);

		/* Fetch the page. Because we are not holding the
		index->lock, the tree may have changed and we may be
		attempting to read a page that is no longer part of
		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
		silence a debug assertion about this. */
		block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
					 NULL, BUF_GET_POSSIBLY_FREED,
					 __FILE__, __LINE__, &mtr);

		page = buf_block_get_frame(block);

		/* It is possible that the tree has been reorganized in the
		meantime and this is a different page. If this happens the
		calculated estimate will be bogus, which is not fatal as
		this is only an estimate. We are sure that a page with
		page_no exists because InnoDB never frees pages, only
		reuses them. */
		if (fil_page_get_type(page) != FIL_PAGE_INDEX
		    || btr_page_get_index_id(page) != index->id
		    || btr_page_get_level_low(page) != level) {

			/* The page got reused for something else */
			mtr_commit(&mtr);

			goto inexact;
		}

		/* It is possible but highly unlikely that the page was
		originally written by an old version of InnoDB that did
		not initialize FIL_PAGE_TYPE on other than B-tree pages.
		For example, this could be an almost-empty BLOB page
		that happens to contain the magic values in the fields
		that we checked above. */

		n_pages_read++;

		if (page_no != slot1->page_no) {
			/* Do not count the records on slot1->page_no,
			we already counted them before this loop. */
			n_rows += page_get_n_recs(page);
		}

		page_no = btr_page_get_next(page, &mtr);

		mtr_commit(&mtr);

		if (n_pages_read == N_PAGES_READ_LIMIT
		    || page_no == FIL_NULL) {
			/* Either we read too many pages or
			we reached the end of the level without passing
			through slot2->page_no, the tree must have changed
			in the meantime */
			goto inexact;
		}

	} while (page_no != slot2->page_no);

	return(n_rows);

inexact:

	*is_n_rows_exact = FALSE;

	/* We stopped before reaching slot2->page_no */
	if (n_pages_read > 0) {
		/* The number of pages on this level is
		n_rows_on_prev_level, multiply it by the
		average number of recs per page so far */
		n_rows = n_rows_on_prev_level
			* n_rows / n_pages_read;
	} else {
		/* The tree changed before we could even
		start with slot1->page_no */
		n_rows = 10;
	}

	return(n_rows);
}
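
/* Worked example of the "inexact" branch above, with illustrative
numbers only: suppose n_rows_on_prev_level == 100 (so about 100 pages
are expected on this level), and we read n_pages_read == 10 pages
containing n_rows == 550 records before giving up.  The average is 55
records per page, and the function returns

	n_rows = 100 * 550 / 10 = 5500

If not even the first page could be read (n_pages_read == 0), it
falls back to the arbitrary constant 10. */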
/*******************************************************************//**
Estimates the number of rows in a given index range.
@return estimated number of rows */
UNIV_INTERN
ib_int64_t
btr_estimate_n_rows_in_range(
/*=========================*/
	dict_index_t*	index,	/*!< in: index */
	const dtuple_t*	tuple1,	/*!< in: range start, may also be empty tuple */
	ulint		mode1,	/*!< in: search mode for range start */
	const dtuple_t*	tuple2,	/*!< in: range end, may also be empty tuple */
	ulint		mode2)	/*!< in: search mode for range end */
{
	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
	btr_cur_t	cursor;
	btr_path_t*	slot1;
	btr_path_t*	slot2;
	ibool		diverged;
	ibool		diverged_lot;
	ulint		divergence_level;
	ib_int64_t	n_rows;
	ibool		is_n_rows_exact;
	ulint		i;
	mtr_t		mtr;
	ib_int64_t	table_n_rows;

	table_n_rows = dict_table_get_n_rows(index->table);

	mtr_start(&mtr);

	cursor.path_arr = path1;

	if (dtuple_get_n_fields(tuple1) > 0) {
		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);
	} else {
		btr_cur_open_at_index_side(true, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, 0, &mtr);
	}

	mtr_commit(&mtr);

	mtr_start(&mtr);

	cursor.path_arr = path2;

	if (dtuple_get_n_fields(tuple2) > 0) {
		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);
	} else {
		btr_cur_open_at_index_side(false, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, 0, &mtr);
	}

	mtr_commit(&mtr);

	/* We have the path information for the range in path1 and path2 */

	n_rows = 1;
	is_n_rows_exact = TRUE;
	diverged = FALSE;	    /* This becomes true when the path is not
				    the same any more */
	diverged_lot = FALSE;	    /* This becomes true when the paths are
				    not the same or adjacent any more */
	divergence_level = 1000000; /* This is the level where paths diverged
				    a lot */
	for (i = 0; ; i++) {
		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);

		slot1 = path1 + i;
		slot2 = path2 + i;

		if (slot1->nth_rec == ULINT_UNDEFINED
		    || slot2->nth_rec == ULINT_UNDEFINED) {

			if (i > divergence_level + 1 && !is_n_rows_exact) {
				/* In trees whose height is > 1 our algorithm
				tends to underestimate: multiply the estimate
				by 2: */

				n_rows = n_rows * 2;
			}

			DBUG_EXECUTE_IF("bug14007649", return(n_rows););

			/* Do not estimate the number of rows in the range
			to over 1 / 2 of the estimated rows in the whole
			table */

			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {

				n_rows = table_n_rows / 2;

				/* If there are just 0 or 1 rows in the table,
				then we estimate all rows are in the range */

				if (n_rows == 0) {
					n_rows = table_n_rows;
				}
			}

			return(n_rows);
		}

		if (!diverged && slot1->nth_rec != slot2->nth_rec) {

			diverged = TRUE;

			if (slot1->nth_rec < slot2->nth_rec) {
				n_rows = slot2->nth_rec - slot1->nth_rec;

				if (n_rows > 1) {
					diverged_lot = TRUE;
					divergence_level = i;
				}
			} else {
				/* It is possible that
				slot1->nth_rec >= slot2->nth_rec
				if, for example, we have a single page
				tree which contains (inf, 5, 6, supr)
				and we select where x > 20 and x < 30;
				in this case slot1->nth_rec will point
				to the supr record and slot2->nth_rec
				will point to 6 */
				n_rows = 0;
			}

		} else if (diverged && !diverged_lot) {

			if (slot1->nth_rec < slot1->n_recs
			    || slot2->nth_rec > 1) {

				diverged_lot = TRUE;
				divergence_level = i;

				n_rows = 0;

				if (slot1->nth_rec < slot1->n_recs) {
					n_rows += slot1->n_recs
						- slot1->nth_rec;
				}

				if (slot2->nth_rec > 1) {
					n_rows += slot2->nth_rec - 1;
				}
			}
		} else if (diverged_lot) {

			n_rows = btr_estimate_n_rows_in_range_on_level(
				index, slot1, slot2, n_rows,
				&is_n_rows_exact);
		}
	}
}
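
/* Worked example of the divergence bookkeeping above (hedged; the
numbers are made up).  In a two-level tree (root plus leaves), let
both searches pass through the same root slot (slot1->nth_rec ==
slot2->nth_rec at i == 0) and land on records 3 and 7 of the same
leaf page at i == 1.  Then at i == 1 the paths diverge with
n_rows = 7 - 3 = 4 > 1, so diverged_lot becomes TRUE with
divergence_level == 1.  At i == 2 the ULINT_UNDEFINED end markers are
hit and 4 is returned (is_n_rows_exact is still TRUE, so neither the
doubling nor the table_n_rows / 2 cap applies).  Only when the two
borders fall on different pages of some level does
btr_estimate_n_rows_in_range_on_level() come into play. */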
/*******************************************************************//**
Record the number of non_null key values in a given index for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are eventually stored in the array:
index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
static
void
btr_record_not_null_field_in_rec(
/*=============================*/
	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
					number of columns that uniquely
					determine an index entry */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index);
					its size may cover all fields or
					only the first "n_unique" fields */
	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
					not null rows for n-column prefix */
{
	ulint	i;

	ut_ad(rec_offs_n_fields(offsets) >= n_unique);

	if (n_not_null == NULL) {
		return;
	}

	for (i = 0; i < n_unique; i++) {
		if (rec_offs_nth_sql_null(offsets, i)) {
			break;
		}

		n_not_null[i]++;
	}
}
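
/* Worked example: with n_unique == 3, a record (a=1, b=NULL, c=5)
increments n_not_null[0] only, because the loop stops at the first
SQL NULL; a record (a=1, b=2, c=NULL) increments n_not_null[0] and
n_not_null[1].  Hence n_not_null[i] counts the sampled rows whose
first i + 1 key columns are all non-NULL. */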
/*******************************************************************//**
Estimates the number of different key values in a given index, for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
0..n_uniq-1) and the number of pages that were sampled is saved in
index->stat_n_sample_sizes[].
If innodb_stats_method is nulls_ignored, we also record the number of
non-null values for each prefix and store the estimates in
array index->stat_n_non_null_key_vals. */
UNIV_INTERN
void
btr_estimate_number_of_different_key_vals(
/*======================================*/
	dict_index_t*	index)	/*!< in: index */
{
	btr_cur_t	cursor;
	page_t*		page;
	rec_t*		rec;
	ulint		n_cols;
	ulint		matched_fields;
	ulint		matched_bytes;
	ib_uint64_t*	n_diff;
	ib_uint64_t*	n_not_null;
	ibool		stats_null_not_equal;
	ullint		n_sample_pages; /* number of pages to sample */
	ulint		not_empty_flag	= 0;
	ulint		total_external_size = 0;
	ulint		i;
	ulint		j;
	ullint		add_on;
	mtr_t		mtr;
	mem_heap_t*	heap		= NULL;
	ulint*		offsets_rec	= NULL;
	ulint*		offsets_next_rec = NULL;

	n_cols = dict_index_get_n_unique(index);

	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
			       * n_cols
			       + dict_index_get_n_fields(index)
			       * (sizeof *offsets_rec
				  + sizeof *offsets_next_rec));

	n_diff = (ib_uint64_t*) mem_heap_zalloc(
		heap, n_cols * sizeof *n_diff);

	n_not_null = NULL;

	/* Check srv_innodb_stats_method setting, and decide whether we
	need to record non-null value and also decide if NULL is
	considered equal (by setting stats_null_not_equal value) */
	switch (srv_innodb_stats_method) {
	case SRV_STATS_NULLS_IGNORED:
		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
			heap, n_cols * sizeof *n_not_null);
		/* fall through */

	case SRV_STATS_NULLS_UNEQUAL:
		/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
		case, we will treat NULLs as unequal value */
		stats_null_not_equal = TRUE;
		break;

	case SRV_STATS_NULLS_EQUAL:
		stats_null_not_equal = FALSE;
		break;

	default:
		ut_error;
	}

	/* It makes no sense to test more pages than are contained
	in the index, thus we lower the number if it is too high */
	if (srv_stats_transient_sample_pages > index->stat_index_size) {
		if (index->stat_index_size > 0) {
			n_sample_pages = index->stat_index_size;
		} else {
			n_sample_pages = 1;
		}
	} else {
		n_sample_pages = srv_stats_transient_sample_pages;
	}

	/* We sample some pages in the index to get an estimate */

	for (i = 0; i < n_sample_pages; i++) {
		mtr_start(&mtr);

		btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);

		/* Count the number of different key values for each prefix of
		the key on this index page. If the prefix does not determine
		the index record uniquely in the B-tree, then we subtract one
		because otherwise our algorithm would give a wrong estimate
		for an index where there is just one key value. */

		page = btr_cur_get_page(&cursor);

		rec = page_rec_get_next(page_get_infimum_rec(page));

		if (!page_rec_is_supremum(rec)) {
			not_empty_flag = 1;
			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
						      ULINT_UNDEFINED, &heap);

			if (n_not_null != NULL) {
				btr_record_not_null_field_in_rec(
					n_cols, offsets_rec, n_not_null);
			}
		}

		while (!page_rec_is_supremum(rec)) {
			rec_t*	next_rec = page_rec_get_next(rec);
			if (page_rec_is_supremum(next_rec)) {
				total_external_size +=
					btr_rec_get_externally_stored_len(
						rec, offsets_rec);
				break;
			}

			matched_fields = 0;
			matched_bytes = 0;
			offsets_next_rec = rec_get_offsets(next_rec, index,
							   offsets_next_rec,
							   ULINT_UNDEFINED,
							   &heap);

			cmp_rec_rec_with_match(rec, next_rec,
					       offsets_rec, offsets_next_rec,
					       index, stats_null_not_equal,
					       &matched_fields,
					       &matched_bytes);

			for (j = matched_fields; j < n_cols; j++) {
				/* We add one if this index record has
				a different prefix from the previous */

				n_diff[j]++;
			}

			if (n_not_null != NULL) {
				btr_record_not_null_field_in_rec(
					n_cols, offsets_next_rec, n_not_null);
			}

			total_external_size
				+= btr_rec_get_externally_stored_len(
					rec, offsets_rec);

			rec = next_rec;
			/* Initialize offsets_rec for the next round
			and assign the old offsets_rec buffer to
			offsets_next_rec. */
			{
				ulint*	offsets_tmp = offsets_rec;
				offsets_rec = offsets_next_rec;
				offsets_next_rec = offsets_tmp;
			}
		}

		if (n_cols == dict_index_get_n_unique_in_tree(index)) {

			/* If there is more than one leaf page in the tree,
			we add one because we know that the first record
			on the page certainly had a different prefix than the
			last record on the previous index page in the
			alphabetical order. Before this fix, if there was
			just one big record on each clustered index page, the
			algorithm grossly underestimated the number of rows
			in the table. */

			if (btr_page_get_prev(page, &mtr) != FIL_NULL
			    || btr_page_get_next(page, &mtr) != FIL_NULL) {

				n_diff[n_cols - 1]++;
			}
		}

		mtr_commit(&mtr);
	}

	/* If we saw k borders between different key values on
	n_sample_pages leaf pages, we can estimate how many
	there will be in index->stat_n_leaf_pages */

	/* We must take into account that our sample actually represents
	also the pages used for external storage of fields (those pages are
	included in index->stat_n_leaf_pages) */

	for (j = 0; j < n_cols; j++) {
		index->stat_n_diff_key_vals[j]
			= BTR_TABLE_STATS_FROM_SAMPLE(
				n_diff[j], index, n_sample_pages,
				total_external_size, not_empty_flag);

		/* If the tree is small, smaller than
		10 * n_sample_pages + total_external_size, then
		the above estimate is ok. For bigger trees it is common that we
		do not see any borders between key values in the few pages
		we pick. But still there may be n_sample_pages
		different key values, or even more. Let us try to approximate
		that: */

		add_on = index->stat_n_leaf_pages
			/ (10 * (n_sample_pages
				 + total_external_size));

		if (add_on > n_sample_pages) {
			add_on = n_sample_pages;
		}

		index->stat_n_diff_key_vals[j] += add_on;

		index->stat_n_sample_sizes[j] = n_sample_pages;

		/* Update the stat_n_non_null_key_vals[] with our
		sampled result. stat_n_non_null_key_vals[] is created
		and initialized to zero in dict_index_add_to_cache(),
		along with stat_n_diff_key_vals[] array */
		if (n_not_null != NULL) {
			index->stat_n_non_null_key_vals[j] =
				BTR_TABLE_STATS_FROM_SAMPLE(
					n_not_null[j], index, n_sample_pages,
					total_external_size, not_empty_flag);
		}
	}

	mem_heap_free(heap);
}
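
/* Worked example of the add_on heuristic above (illustrative numbers
only): with index->stat_n_leaf_pages == 10000, n_sample_pages == 20
and total_external_size == 0,

	add_on = 10000 / (10 * (20 + 0)) = 50

which exceeds n_sample_pages and is therefore clamped to 20.  Thus at
most n_sample_pages is ever added on top of the scaled-up
stat_n_diff_key_vals[] estimate. */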
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/

/***********************************************************//**
Gets the offset of the pointer to the externally stored part of a field.
@return offset of the pointer to the externally stored part */
static
ulint
btr_rec_get_field_ref_offs(
/*=======================*/
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		n)	/*!< in: index of the external field */
{
	ulint	field_ref_offs;
	ulint	local_len;

	ut_a(rec_offs_nth_extern(offsets, n));
	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
	ut_a(local_len != UNIV_SQL_NULL);
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
}

/** Gets a pointer to the externally stored part of a field.
@param rec	record
@param offsets	rec_get_offsets(rec)
@param n	index of the externally stored field
@return pointer to the externally stored part */
#define btr_rec_get_field_ref(rec, offsets, n)			\
	((rec) + btr_rec_get_field_ref_offs(offsets, n))
/***********************************************************//**
Gets the externally stored size of a record, in units of a database page.
@return externally stored part, in units of a database page */
static
ulint
btr_rec_get_externally_stored_len(
/*==============================*/
	const rec_t*	rec,	/*!< in: record */
	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
{
	ulint	n_fields;
	ulint	total_extern_len = 0;
	ulint	i;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));

	if (!rec_offs_any_extern(offsets)) {
		return(0);
	}

	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			ulint	extern_len = mach_read_from_4(
				btr_rec_get_field_ref(rec, offsets, i)
				+ BTR_EXTERN_LEN + 4);

			total_extern_len += ut_calc_align(extern_len,
							  UNIV_PAGE_SIZE);
		}
	}

	return(total_extern_len / UNIV_PAGE_SIZE);
}
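
/* Worked example: with the default UNIV_PAGE_SIZE of 16384 bytes, an
externally stored column whose stored length (read from the low four
bytes of the BTR_EXTERN_LEN field) is 100000 bytes contributes

	ut_calc_align(100000, 16384) / 16384 = 114688 / 16384 = 7

pages to the value returned here, i.e. lengths are rounded up to
whole pages. */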
/*******************************************************************//**
Sets the ownership bit of an externally stored field in a record. */
static
void
btr_cur_set_ownership_of_extern_field(
/*==================================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: clustered index record */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		i,	/*!< in: field number */
	ibool		val,	/*!< in: value to set */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	byte*	data;
	ulint	local_len;
	ulint	byte_val;

	data = rec_get_nth_field(rec, offsets, i, &local_len);
	ut_ad(rec_offs_nth_extern(offsets, i));
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);

	if (val) {
		byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
	} else {
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
	}

	if (page_zip) {
		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
	} else if (mtr != NULL) {

		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
				 MLOG_1BYTE, mtr);
	} else {
		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
	}

	btr_blob_dbg_owner(rec, index, offsets, i, val);
}
/*******************************************************************//**
Marks non-updated off-page fields as disowned by this record. The ownership
must be transferred to the updated record which is inserted elsewhere in the
index tree. In purge only the owner of an externally stored field is allowed
to free the field. */
UNIV_INTERN
void
btr_cur_disown_inherited_fields(
/*============================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	const upd_t*	update,	/*!< in: update vector */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	ut_ad(rec_offs_any_extern(offsets));
	ut_ad(mtr);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)
		    && !upd_get_field_by_field_no(update, i)) {
			btr_cur_set_ownership_of_extern_field(
				page_zip, rec, index, offsets, i, FALSE, mtr);
		}
	}
}
/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
{
	ulint	n;
	ulint	i;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	n = rec_offs_n_fields(offsets);

	if (!rec_offs_any_extern(offsets)) {

		return;
	}

	for (i = 0; i < n; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			btr_cur_set_ownership_of_extern_field(
				page_zip, rec, index, offsets, i, TRUE, mtr);
		}
	}
}
/*******************************************************************//**
Flags the data tuple fields that are marked as extern storage in the
update vector. We use this function to remember which fields we must
mark as extern storage in a record inserted for an update.
@return number of flagged external columns */
UNIV_INTERN
ulint
btr_push_update_extern_fields(
/*==========================*/
	dtuple_t*	tuple,	/*!< in/out: data tuple */
	const upd_t*	update,	/*!< in: update vector */
	mem_heap_t*	heap)	/*!< in: memory heap */
{
	ulint			n_pushed	= 0;
	ulint			n;
	const upd_field_t*	uf;

	ut_ad(tuple);
	ut_ad(update);

	uf = update->fields;
	n = upd_get_n_fields(update);

	for (; n--; uf++) {
		if (dfield_is_ext(&uf->new_val)) {
			dfield_t*	field
				= dtuple_get_nth_field(tuple, uf->field_no);

			if (!dfield_is_ext(field)) {
				dfield_set_ext(field);
				n_pushed++;
			}

			switch (uf->orig_len) {
				byte*	data;
				ulint	len;
				byte*	buf;
			case 0:
				break;
			case BTR_EXTERN_FIELD_REF_SIZE:
				/* Restore the original locally stored
				part of the column. In the undo log,
				InnoDB writes a longer prefix of externally
				stored columns, so that column prefixes
				in secondary indexes can be reconstructed. */
				dfield_set_data(field,
						(byte*) dfield_get_data(field)
						+ dfield_get_len(field)
						- BTR_EXTERN_FIELD_REF_SIZE,
						BTR_EXTERN_FIELD_REF_SIZE);
				dfield_set_ext(field);
				break;
			default:
				/* Reconstruct the original locally
				stored part of the column. The data
				will have to be copied. */
				ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);

				data = (byte*) dfield_get_data(field);
				len = dfield_get_len(field);

				buf = (byte*) mem_heap_alloc(heap,
							     uf->orig_len);
				/* Copy the locally stored prefix. */
				memcpy(buf, data,
				       uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE);
				/* Copy the BLOB pointer. */
				memcpy(buf + uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE,
				       data + len - BTR_EXTERN_FIELD_REF_SIZE,
				       BTR_EXTERN_FIELD_REF_SIZE);

				dfield_set_data(field, buf, uf->orig_len);
				dfield_set_ext(field);
			}
		}
	}

	return(n_pushed);
}
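
/* Summary of the three uf->orig_len cases above (a hedged sketch;
BTR_EXTERN_FIELD_REF_SIZE is the 20-byte BLOB pointer):

	orig_len == 0: nothing of the column was stored locally in
	the original record; the tuple field is used as it is.

	orig_len == BTR_EXTERN_FIELD_REF_SIZE: only the BLOB pointer
	was stored locally; the field is shrunk in place to its last
	20 bytes.

	orig_len > BTR_EXTERN_FIELD_REF_SIZE: a column prefix plus
	the pointer were stored locally; since the prefix and the
	pointer are not contiguous in the undo log value, both pieces
	are copied into a fresh orig_len-byte buffer from the heap. */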
/*******************************************************************//**
Returns the length of a BLOB part stored on the header page.
@return part length */
static
ulint
btr_blob_get_part_len(
/*==================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
}

/*******************************************************************//**
Returns the page number where the next BLOB part is stored.
@return page number or FIL_NULL if no more pages */
static
ulint
btr_blob_get_next_page_no(
/*======================*/
	const byte*	blob_header)	/*!< in: blob header */
{
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
}
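
/* Sketch of the uncompressed BLOB page layout that these two
accessors assume (the BTR_BLOB_HDR_* constants are defined in
btr0cur.h; this is only an orientation aid, not a definition):

	FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN:	   length of the BLOB
						   part stored on this
						   page (4 bytes)
	FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO: next BLOB page, or
						   FIL_NULL (4 bytes)
	FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE:	   start of the BLOB
						   data itself

btr_store_big_rec_extern_fields() below fills exactly these fields
with mlog_write_ulint() and mlog_write_string(). */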
/*******************************************************************//**
Deallocate a buffer block that was reserved for a BLOB part. */
static
void
btr_blob_free(
/*==========*/
	buf_block_t*	block,	/*!< in: buffer block */
	ibool		all,	/*!< in: TRUE=remove also the compressed page
				if there is one */
	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
{
	buf_pool_t*	buf_pool = buf_pool_from_block(block);
	ulint		space	= buf_block_get_space(block);
	ulint		page_no	= buf_block_get_page_no(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	mtr_commit(mtr);

	buf_pool_mutex_enter(buf_pool);

	/* Only free the block if it is still allocated to
	the same file page. */

	if (buf_block_get_state(block)
	    == BUF_BLOCK_FILE_PAGE
	    && buf_block_get_space(block) == space
	    && buf_block_get_page_no(block) == page_no) {

		if (!buf_LRU_free_page(&block->page, all)
		    && all && block->page.zip.data) {
			/* Attempt to deallocate the uncompressed page
			if the whole block cannot be deallocated. */

			buf_LRU_free_page(&block->page, false);
		}
	}

	buf_pool_mutex_exit(buf_pool);
}
/*******************************************************************//**
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec. The extern flags in rec will have to be set beforehand.
The fields are stored on pages allocated from leaf node
file segment of the index tree.
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
UNIV_INTERN
dberr_t
btr_store_big_rec_extern_fields(
/*============================*/
	dict_index_t*	index,		/*!< in: index of rec; the index tree
					MUST be X-latched */
	buf_block_t*	rec_block,	/*!< in/out: block containing rec */
	rec_t*		rec,		/*!< in/out: record */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index);
					the "external storage" flags in offsets
					will not correspond to rec when
					this function returns */
	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
					to be stored externally */
	mtr_t*		btr_mtr,	/*!< in: mtr containing the
					latches to the clustered index */
	enum blob_op	op)		/*!< in: operation code */
{
	ulint		rec_page_no;
	byte*		field_ref;
	ulint		extern_len;
	ulint		store_len;
	ulint		page_no;
	ulint		space_id;
	ulint		zip_size;
	ulint		prev_page_no;
	ulint		hint_page_no;
	ulint		i;
	mtr_t		mtr;
	mtr_t*		alloc_mtr;
	mem_heap_t*	heap = NULL;
	page_zip_des_t*	page_zip;
	z_stream	c_stream;
	buf_block_t**	freed_pages	= NULL;
	ulint		n_freed_pages	= 0;
	dberr_t		error		= DB_SUCCESS;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(rec_offs_any_extern(offsets));
	ut_ad(btr_mtr);
	ut_ad(mtr_memo_contains(btr_mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
	ut_a(dict_index_is_clust(index));

	page_zip = buf_block_get_page_zip(rec_block);
	ut_a(dict_table_zip_size(index->table)
	     == buf_block_get_zip_size(rec_block));

	space_id = buf_block_get_space(rec_block);
	zip_size = buf_block_get_zip_size(rec_block);
	rec_page_no = buf_block_get_page_no(rec_block);
	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);

	if (page_zip) {
		int	err;

		/* Zlib deflate needs 128 kilobytes for the default
		window size, plus 512 << memLevel, plus a few
		kilobytes for small objects. We use reduced memLevel
		to limit the memory consumption, and preallocate the
		heap, hoping to avoid memory fragmentation. */
		heap = mem_heap_create(250000);
		page_zip_set_alloc(&c_stream, heap);

		err = deflateInit2(&c_stream, page_zip_level,
				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
		ut_a(err == Z_OK);
	}

	if (btr_blob_op_is_update(op)) {
		/* Avoid reusing pages that have been previously freed
		in btr_mtr. */
		if (btr_mtr->n_freed_pages) {
			if (heap == NULL) {
				heap = mem_heap_create(
					btr_mtr->n_freed_pages
					* sizeof *freed_pages);
			}

			freed_pages = static_cast<buf_block_t**>(
				mem_heap_alloc(
					heap,
					btr_mtr->n_freed_pages
					* sizeof *freed_pages));
			n_freed_pages = 0;
		}

		/* Because btr_mtr will be committed after mtr, it is
		possible that the tablespace has been extended when
		the B-tree record was updated or inserted, or it will
		be extended while allocating pages for big_rec.

		TODO: In mtr (not btr_mtr), write a redo log record
		about extending the tablespace to its current size,
		and remember the current size. Whenever the tablespace
		grows as pages are allocated, write further redo log
		records to mtr. (Currently tablespace extension is not
		covered by the redo log. If it were, the record would
		only be written to btr_mtr, which is committed after
		mtr.) */
		alloc_mtr = btr_mtr;
	} else {
		/* Use the local mtr for allocations. */
		alloc_mtr = &mtr;
	}
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must either be zero or they must be pointers to inherited
	columns, owned by this record or an earlier record version. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
		}
		field_ref = btr_rec_get_field_ref(rec, offsets, i);

		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
		/* Either this must be an update in place,
		or the BLOB must be inherited, or the BLOB pointer
		must be zero (will be written in this function). */
		ut_a(op == BTR_STORE_UPDATE
		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
		     || !memcmp(field_ref, field_ref_zero,
				BTR_EXTERN_FIELD_REF_SIZE));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */

	for (i = 0; i < big_rec_vec->n_fields; i++) {
		field_ref = btr_rec_get_field_ref(
			rec, offsets, big_rec_vec->fields[i].field_no);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		/* A zero BLOB pointer should have been initially inserted. */
		ut_a(!memcmp(field_ref, field_ref_zero,
			     BTR_EXTERN_FIELD_REF_SIZE));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		extern_len = big_rec_vec->fields[i].len;
		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
				   extern_len);

		ut_a(extern_len > 0);

		prev_page_no = FIL_NULL;

		if (page_zip) {
			int	err = deflateReset(&c_stream);
			ut_a(err == Z_OK);

			c_stream.next_in = (Bytef*)
				big_rec_vec->fields[i].data;
			c_stream.avail_in = extern_len;
		}

		for (;;) {
			buf_block_t*	block;
			page_t*		page;

			mtr_start(&mtr);

			if (prev_page_no == FIL_NULL) {
				hint_page_no = 1 + rec_page_no;
			} else {
				hint_page_no = prev_page_no + 1;
			}

alloc_another:
			block = btr_page_alloc(index, hint_page_no,
					       FSP_NO_DIR, 0, alloc_mtr, &mtr);
			if (UNIV_UNLIKELY(block == NULL)) {
				mtr_commit(&mtr);
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;
			}

			if (rw_lock_get_x_lock_count(&block->lock) > 1) {
				/* This page must have been freed in
				btr_mtr previously. Put it aside, and
				allocate another page for the BLOB data. */
				ut_ad(alloc_mtr == btr_mtr);
				ut_ad(btr_blob_op_is_update(op));
				ut_ad(n_freed_pages < btr_mtr->n_freed_pages);
				freed_pages[n_freed_pages++] = block;
				goto alloc_another;
			}

			page_no = buf_block_get_page_no(block);
			page = buf_block_get_frame(block);

			if (prev_page_no != FIL_NULL) {
				buf_block_t*	prev_block;
				page_t*		prev_page;

				prev_block = buf_page_get(space_id, zip_size,
							  prev_page_no,
							  RW_X_LATCH, &mtr);
				buf_block_dbg_add_level(prev_block,
							SYNC_EXTERN_STORAGE);
				prev_page = buf_block_get_frame(prev_block);

				if (page_zip) {
					mlog_write_ulint(
						prev_page + FIL_PAGE_NEXT,
						page_no, MLOG_4BYTES, &mtr);
					memcpy(buf_block_get_page_zip(
						       prev_block)
					       ->data + FIL_PAGE_NEXT,
					       prev_page + FIL_PAGE_NEXT, 4);
				} else {
					mlog_write_ulint(
						prev_page + FIL_PAGE_DATA
						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
						page_no, MLOG_4BYTES, &mtr);
				}
			} else if (dict_index_is_online_ddl(index)) {
				row_log_table_blob_alloc(index, page_no);
			}

			if (page_zip) {
				int		err;
				page_zip_des_t*	blob_page_zip;

				/* Write FIL_PAGE_TYPE to the redo log
				separately, before logging any other
				changes to the page, so that the debug
				assertions in
				recv_parse_or_apply_log_rec_body() can
				be made simpler. Before InnoDB Plugin
				1.0.4, the initialization of
				FIL_PAGE_TYPE was logged as part of
				the mlog_log_string() below. */
				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 prev_page_no == FIL_NULL
						 ? FIL_PAGE_TYPE_ZBLOB
						 : FIL_PAGE_TYPE_ZBLOB2,
						 MLOG_2BYTES, &mtr);

				c_stream.next_out = page
					+ FIL_PAGE_DATA;
				c_stream.avail_out
					= page_zip_get_size(page_zip)
					- FIL_PAGE_DATA;

				err = deflate(&c_stream, Z_FINISH);
				ut_a(err == Z_OK || err == Z_STREAM_END);
				ut_a(err == Z_STREAM_END
				     || c_stream.avail_out == 0);

				/* Write the "next BLOB page" pointer */
				mlog_write_ulint(page + FIL_PAGE_NEXT,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Initialize the unused "prev page" pointer */
				mlog_write_ulint(page + FIL_PAGE_PREV,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Write a back pointer to the record
				into the otherwise unused area. This
				information could be useful in
				debugging. Later, we might want to
				implement the possibility to relocate
				BLOB pages. Then, we would need to be
				able to adjust the BLOB pointer in the
				record. We do not store the heap
				number of the record, because it can
				change in page_zip_reorganize() or
				btr_page_reorganize(). However, also
				the page number of the record may
				change when B-tree nodes are split or
				merged. */
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN,
						 space_id,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
						 rec_page_no,
						 MLOG_4BYTES, &mtr);

				/* Zero out the unused part of the page. */
				memset(page + page_zip_get_size(page_zip)
				       - c_stream.avail_out,
				       0, c_stream.avail_out);
				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
						page_zip_get_size(page_zip)
						- FIL_PAGE_FILE_FLUSH_LSN,
						&mtr);
				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
				blob_page_zip = buf_block_get_page_zip(block);
				ut_ad(blob_page_zip);
				ut_ad(page_zip_get_size(blob_page_zip)
				      == page_zip_get_size(page_zip));
				memcpy(blob_page_zip->data, page,
				       page_zip_get_size(page_zip));

				if (err == Z_OK && prev_page_no != FIL_NULL) {

					goto next_zip_page;
				}

				if (alloc_mtr == &mtr) {
					rec_block = buf_page_get(
						space_id, zip_size,
						rec_page_no,
						RW_X_LATCH, &mtr);
					buf_block_dbg_add_level(
						rec_block,
						SYNC_NO_ORDER_CHECK);
				}

				if (err == Z_STREAM_END) {
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN, 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN + 4,
							c_stream.total_in);
				} else {
					memset(field_ref + BTR_EXTERN_LEN,
					       0, 8);
				}

				if (prev_page_no == FIL_NULL) {
					btr_blob_dbg_add_blob(
						rec, big_rec_vec->fields[i]
						.field_no, page_no, index,
						"store");

					mach_write_to_4(field_ref
							+ BTR_EXTERN_SPACE_ID,
							space_id);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_PAGE_NO,
							page_no);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_OFFSET,
							FIL_PAGE_NEXT);
				}

				page_zip_write_blob_ptr(
					page_zip, rec, index, offsets,
					big_rec_vec->fields[i].field_no,
					alloc_mtr);

next_zip_page:
				prev_page_no = page_no;

				/* Commit mtr and release the
				uncompressed page frame to save memory. */
				btr_blob_free(block, FALSE, &mtr);

				if (err == Z_STREAM_END) {
					break;
				}
			} else {
				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 FIL_PAGE_TYPE_BLOB,
						 MLOG_2BYTES, &mtr);

				if (extern_len > (UNIV_PAGE_SIZE
						  - FIL_PAGE_DATA
						  - BTR_BLOB_HDR_SIZE
						  - FIL_PAGE_DATA_END)) {
					store_len = UNIV_PAGE_SIZE
						- FIL_PAGE_DATA
						- BTR_BLOB_HDR_SIZE
						- FIL_PAGE_DATA_END;
				} else {
					store_len = extern_len;
				}

				mlog_write_string(page + FIL_PAGE_DATA
						  + BTR_BLOB_HDR_SIZE,
						  (const byte*)
						  big_rec_vec->fields[i].data
						  + big_rec_vec->fields[i].len
						  - extern_len,
						  store_len, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_PART_LEN,
						 store_len, MLOG_4BYTES, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
						 FIL_NULL, MLOG_4BYTES, &mtr);

				extern_len -= store_len;

				if (alloc_mtr == &mtr) {
					rec_block = buf_page_get(
						space_id, zip_size,
						rec_page_no,
						RW_X_LATCH, &mtr);
					buf_block_dbg_add_level(
						rec_block,
						SYNC_NO_ORDER_CHECK);
				}

				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
						 MLOG_4BYTES, alloc_mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4,
						 big_rec_vec->fields[i].len
						 - extern_len,
						 MLOG_4BYTES, alloc_mtr);

				if (prev_page_no == FIL_NULL) {
					btr_blob_dbg_add_blob(
						rec, big_rec_vec->fields[i]
						.field_no, page_no, index,
						"store");

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_SPACE_ID,
							 space_id, MLOG_4BYTES,
							 alloc_mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_PAGE_NO,
							 page_no, MLOG_4BYTES,
							 alloc_mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_OFFSET,
							 FIL_PAGE_DATA,
							 MLOG_4BYTES,
							 alloc_mtr);
				}

				prev_page_no = page_no;

				mtr_commit(&mtr);

				if (extern_len == 0) {
					break;
				}
			}
		}

		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;);
	}

func_exit:
	if (page_zip) {
		deflateEnd(&c_stream);
	}

	if (n_freed_pages) {
		ulint	i;

		ut_ad(alloc_mtr == btr_mtr);
		ut_ad(btr_blob_op_is_update(op));

		for (i = 0; i < n_freed_pages; i++) {
			btr_page_free_low(index, freed_pages[i], 0, alloc_mtr);
		}
	}

	if (heap != NULL) {
		mem_heap_free(heap);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must be valid. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
		}

		field_ref = btr_rec_get_field_ref(rec, offsets, i);

		/* The pointer must not be zero if the operation
		succeeded. */
		ut_a(0 != memcmp(field_ref, field_ref_zero,
				 BTR_EXTERN_FIELD_REF_SIZE)
		     || error != DB_SUCCESS);
		/* The column must not be disowned by this record. */
		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	return(error);
}
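
/* Sketch of the 20-byte field reference (field_ref) that the
function above fills in; the BTR_EXTERN_* offsets are defined in
btr0cur.h, so treat this as orientation, not as a definition:

	BTR_EXTERN_SPACE_ID	space id of the first BLOB page
				(4 bytes)
	BTR_EXTERN_PAGE_NO	page number of the first BLOB page
				(4 bytes)
	BTR_EXTERN_OFFSET	offset of the BLOB header on that page
				(4 bytes): FIL_PAGE_NEXT on compressed
				BLOB pages, FIL_PAGE_DATA on
				uncompressed ones
	BTR_EXTERN_LEN		8 bytes: the first byte carries the
				OWNER/INHERITED flag bits, the low
				4 bytes hold the stored BLOB length

The UNIV_BLOB_LIGHT_DEBUG assertions above and the freeing code below
read back exactly these fields. */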
/*******************************************************************//**
Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
static
void
btr_check_blob_fil_page_type(
/*=========================*/
	ulint		space_id,	/*!< in: space id */
	ulint		page_no,	/*!< in: page number */
	const page_t*	page,		/*!< in: page */
	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
{
	ulint	type = fil_page_get_type(page);

	ut_a(space_id == page_get_space_id(page));
	ut_a(page_no == page_get_page_no(page));

	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
		ulint	flags = fil_space_get_flags(space_id);

#ifndef UNIV_DEBUG /* Improve debug test coverage */
		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
			/* Old versions of InnoDB did not initialize
			FIL_PAGE_TYPE on BLOB pages. Do not print
			anything about the type mismatch when reading
			a BLOB page that is in Antelope format.*/
			return;
		}
#endif /* !UNIV_DEBUG */

		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: FIL_PAGE_TYPE=%lu"
			" on BLOB %s space %lu page %lu flags %lx\n",
			(ulong) type, read ? "read" : "purge",
			(ulong) space_id, (ulong) page_no, (ulong) flags);
		ut_error;
	}
}
  4063. /*******************************************************************//**
  4064. Frees the space in an externally stored field to the file space
  4065. management if the field in data is owned by the externally stored field,
  4066. in a rollback we may have the additional condition that the field must
  4067. not be inherited. */
  4068. UNIV_INTERN
  4069. void
  4070. btr_free_externally_stored_field(
  4071. /*=============================*/
  4072. dict_index_t* index, /*!< in: index of the data, the index
  4073. tree MUST be X-latched; if the tree
  4074. height is 1, then also the root page
  4075. must be X-latched! (this is relevant
  4076. in the case this function is called
  4077. from purge where 'data' is located on
  4078. an undo log page, not an index
  4079. page) */
  4080. byte* field_ref, /*!< in/out: field reference */
  4081. const rec_t* rec, /*!< in: record containing field_ref, for
  4082. page_zip_write_blob_ptr(), or NULL */
  4083. const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
  4084. or NULL */
  4085. page_zip_des_t* page_zip, /*!< in: compressed page corresponding
  4086. to rec, or NULL if rec == NULL */
  4087. ulint i, /*!< in: field number of field_ref;
  4088. ignored if rec == NULL */
  4089. enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
  4090. mtr_t* local_mtr __attribute__((unused))) /*!< in: mtr
  4091. containing the latch to data an an
  4092. X-latch to the index tree */
  4093. {
  4094. page_t* page;
  4095. const ulint space_id = mach_read_from_4(
  4096. field_ref + BTR_EXTERN_SPACE_ID);
  4097. const ulint start_page = mach_read_from_4(
  4098. field_ref + BTR_EXTERN_PAGE_NO);
  4099. ulint rec_zip_size = dict_table_zip_size(index->table);
  4100. ulint ext_zip_size;
  4101. ulint page_no;
  4102. ulint next_page_no;
  4103. mtr_t mtr;
  4104. ut_ad(dict_index_is_clust(index));
  4105. ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
  4106. MTR_MEMO_X_LOCK));
  4107. ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
  4108. MTR_MEMO_PAGE_X_FIX));
  4109. ut_ad(!rec || rec_offs_validate(rec, index, offsets));
  4110. ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
  4111. if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
  4112. BTR_EXTERN_FIELD_REF_SIZE))) {
  4113. /* In the rollback, we may encounter a clustered index
  4114. record with some unwritten off-page columns. There is
  4115. nothing to free then. */
  4116. ut_a(rb_ctx != RB_NONE);
  4117. return;
  4118. }
  4119. ut_ad(space_id == index->space);
  4120. if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
  4121. ext_zip_size = fil_space_get_zip_size(space_id);
  4122. /* This must be an undo log record in the system tablespace,
  4123. that is, in row_purge_upd_exist_or_extern().
  4124. Currently, externally stored records are stored in the
  4125. same tablespace as the referring records. */
  4126. ut_ad(!page_get_space_id(page_align(field_ref)));
  4127. ut_ad(!rec);
  4128. ut_ad(!page_zip);
  4129. } else {
  4130. ext_zip_size = rec_zip_size;
  4131. }
  4132. if (!rec) {
  4133. /* This is a call from row_purge_upd_exist_or_extern(). */
  4134. ut_ad(!page_zip);
  4135. rec_zip_size = 0;
  4136. }
  4137. #ifdef UNIV_BLOB_DEBUG
  4138. if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
  4139. && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
  4140. && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
  4141. /* This off-page column will be freed.
  4142. Check that no references remain. */
  4143. btr_blob_dbg_t b;
  4144. b.blob_page_no = start_page;
  4145. if (rec) {
  4146. /* Remove the reference from the record to the
  4147. BLOB. If the BLOB were not freed, the
  4148. reference would be removed when the record is
  4149. removed. Freeing the BLOB will overwrite the
  4150. BTR_EXTERN_PAGE_NO in the field_ref of the
  4151. record with FIL_NULL, which would make the
  4152. btr_blob_dbg information inconsistent with the
  4153. record. */
  4154. b.ref_page_no = page_get_page_no(page_align(rec));
  4155. b.ref_heap_no = page_rec_get_heap_no(rec);
  4156. b.ref_field_no = i;
  4157. btr_blob_dbg_rbt_delete(index, &b, "free");
  4158. }
  4159. btr_blob_dbg_assert_empty(index, b.blob_page_no);
  4160. }
  4161. #endif /* UNIV_BLOB_DEBUG */
	for (;;) {
#ifdef UNIV_SYNC_DEBUG
		buf_block_t*	rec_block;
#endif /* UNIV_SYNC_DEBUG */
		buf_block_t*	ext_block;

		mtr_start(&mtr);

#ifdef UNIV_SYNC_DEBUG
		rec_block =
#endif /* UNIV_SYNC_DEBUG */
		buf_page_get(page_get_space_id(page_align(field_ref)),
			     rec_zip_size,
			     page_get_page_no(page_align(field_ref)),
			     RW_X_LATCH, &mtr);
		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

		if (/* There is no external storage data */
		    page_no == FIL_NULL
		    /* This field does not own the externally stored field */
		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			& BTR_EXTERN_OWNER_FLAG)
		    /* Rollback and inherited field */
		    || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			    & BTR_EXTERN_INHERITED_FLAG))) {

			/* Do not free */
			mtr_commit(&mtr);

			return;
		}

		if (page_no == start_page && dict_index_is_online_ddl(index)) {
			row_log_table_blob_free(index, start_page);
		}

		ext_block = buf_page_get(space_id, ext_zip_size, page_no,
					 RW_X_LATCH, &mtr);
		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(ext_block);

		if (ext_zip_size) {
			/* Note that page_zip will be NULL
			in row_purge_upd_exist_or_extern(). */
			switch (fil_page_get_type(page)) {
			case FIL_PAGE_TYPE_ZBLOB:
			case FIL_PAGE_TYPE_ZBLOB2:
				break;
			default:
				ut_error;
			}
			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);

			btr_page_free_low(index, ext_block, 0, &mtr);

			if (page_zip != NULL) {
				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
						next_page_no);
				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
						0);
				page_zip_write_blob_ptr(page_zip, rec, index,
							offsets, i, &mtr);
			} else {
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_PAGE_NO,
						 next_page_no,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4, 0,
						 MLOG_4BYTES, &mtr);
			}
		} else {
			ut_a(!page_zip);
			btr_check_blob_fil_page_type(space_id, page_no, page,
						     FALSE);

			next_page_no = mach_read_from_4(
				page + FIL_PAGE_DATA
				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
			/* We must supply the page level (= 0) as an argument
			because we did not store it on the page (we save the
			space overhead from an index page header). */

			btr_page_free_low(index, ext_block, 0, &mtr);
			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
					 next_page_no,
					 MLOG_4BYTES, &mtr);
			/* Zero out the BLOB length. If the server
			crashes during the execution of this function,
			trx_rollback_or_clean_all_recovered() could
			dereference the half-deleted BLOB, fetching a
			wrong prefix for the BLOB. */
			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
					 0,
					 MLOG_4BYTES, &mtr);
		}

		/* Commit mtr and release the BLOB block to save memory. */
		btr_blob_free(ext_block, TRUE, &mtr);
	}
}
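
/* A note on the 20-byte (BTR_EXTERN_FIELD_REF_SIZE) field reference
manipulated above and below: it stores the space id, the page number
of the first BLOB page, the byte offset of the BLOB data on that
page, and an 8-byte length of the externally stored part.  Because a
BLOB currently cannot exceed 4 GB, only the low 4 bytes of the length
are used (hence the reads and writes at BTR_EXTERN_LEN + 4), and the
most significant byte doubles as a flags byte holding
BTR_EXTERN_OWNER_FLAG and BTR_EXTERN_INHERITED_FLAG, which is why
ownership is tested with mach_read_from_1() at BTR_EXTERN_LEN. */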
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in/out: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
	/* Free possible externally stored fields in the record */

	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			btr_free_externally_stored_field(
				index, btr_rec_get_field_ref(rec, offsets, i),
				rec, offsets, page_zip, i, rb_ctx, mtr);
		}
	}
}
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in/out: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));

	/* Free possible externally stored fields in the record */

	n_fields = upd_get_n_fields(update);

	for (i = 0; i < n_fields; i++) {
		const upd_field_t* ufield = upd_get_nth_field(update, i);

		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
			ulint	len;
			byte*	data = rec_get_nth_field(
				rec, offsets, ufield->field_no, &len);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);

			btr_free_externally_stored_field(
				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
				rec, offsets, page_zip,
				ufield->field_no, rb_ctx, mtr);
		}
	}
}
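
/* Uncompressed BLOB pages form a singly linked chain.  Each page in
the chain carries a small BLOB header (BTR_BLOB_HDR_SIZE bytes)
holding the length of the data stored on that page
(BTR_BLOB_HDR_PART_LEN) and the number of the next page in the chain
(BTR_BLOB_HDR_NEXT_PAGE_NO; FIL_NULL on the last page), followed by
the payload itself.  On the first page the header is at the offset
stored in the field reference; on all later pages it is at
FIL_PAGE_DATA. */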
/*******************************************************************//**
Copies the prefix of an uncompressed BLOB. The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_blob_prefix(
/*=================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	ulint	copied_len = 0;

	for (;;) {
		mtr_t		mtr;
		buf_block_t*	block;
		const page_t*	page;
		const byte*	blob_header;
		ulint		part_len;
		ulint		copy_len;

		mtr_start(&mtr);

		block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(block);

		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);

		blob_header = page + offset;
		part_len = btr_blob_get_part_len(blob_header);
		copy_len = ut_min(part_len, len - copied_len);

		memcpy(buf + copied_len,
		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
		copied_len += copy_len;

		page_no = btr_blob_get_next_page_no(blob_header);

		mtr_commit(&mtr);

		if (page_no == FIL_NULL || copy_len != part_len) {
			UNIV_MEM_ASSERT_RW(buf, copied_len);
			return(copied_len);
		}

		/* On all BLOB pages after the first, the BLOB header
		is at the start of the page data: */

		offset = FIL_PAGE_DATA;

		ut_ad(copied_len <= len);
	}
}
/*******************************************************************//**
Copies the prefix of a compressed BLOB. The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_zblob_prefix(
/*==================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		zip_size,/*!< in: compressed BLOB page size */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
	mem_heap_t*	heap;
	int		err;
	z_stream	d_stream;

	d_stream.next_out = buf;
	d_stream.avail_out = len;
	d_stream.next_in = Z_NULL;
	d_stream.avail_in = 0;

	/* Zlib inflate needs 32 kilobytes for the default
	window size, plus a few kilobytes for small objects. */
	heap = mem_heap_create(40000);
	page_zip_set_alloc(&d_stream, heap);

	ut_ad(ut_is_2pow(zip_size));
	ut_ad(zip_size >= UNIV_ZIP_SIZE_MIN);
	ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
	ut_ad(space_id);

	err = inflateInit(&d_stream);
	ut_a(err == Z_OK);
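
	/* A compressed BLOB is a single zlib stream split across a chain
	of pages, so one z_stream is kept alive across the whole loop
	below: each iteration feeds the payload of one page to inflate()
	and then follows the 4-byte next-page pointer.  On the first page
	that pointer is at the offset taken from the field reference and
	the payload follows it immediately; on later pages the pointer is
	the FIL_PAGE_NEXT field itself and the payload begins at
	FIL_PAGE_DATA. */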
	for (;;) {
		buf_page_t*	bpage;
		ulint		next_page_no;

		/* There is no latch on bpage directly. Instead,
		bpage is protected by the B-tree page latch that
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
		bpage = buf_page_get_zip(space_id, zip_size, page_no);

		if (UNIV_UNLIKELY(!bpage)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Cannot load"
				" compressed BLOB"
				" page %lu space %lu\n",
				(ulong) page_no, (ulong) space_id);
			goto func_exit;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Unexpected type %lu of"
				" compressed BLOB"
				" page %lu space %lu\n",
				(ulong) fil_page_get_type(bpage->zip.data),
				(ulong) page_no, (ulong) space_id);
			ut_ad(0);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the BLOB begins at page header,
			the compressed data payload does not
			immediately follow the next page pointer. */
			offset = FIL_PAGE_DATA;
		} else {
			offset += 4;
		}

		d_stream.next_in = bpage->zip.data + offset;
		d_stream.avail_in = zip_size - offset;

		err = inflate(&d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream.avail_out) {
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* fall through */
		default:
inflate_error:
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: inflate() of"
				" compressed BLOB"
				" page %lu space %lu returned %d (%s)\n",
				(ulong) page_no, (ulong) space_id,
				err, d_stream.msg);
			/* fall through: give up on this BLOB */
		case Z_BUF_ERROR:
			goto end_of_blob;
		}
		if (next_page_no == FIL_NULL) {
			if (!d_stream.avail_in) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: unexpected end of"
					" compressed BLOB"
					" page %lu space %lu\n",
					(ulong) page_no,
					(ulong) space_id);
			} else {
				err = inflate(&d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			buf_page_release_zip(bpage);
			goto func_exit;
		}

		buf_page_release_zip(bpage);

		/* On all BLOB pages after the first, the next-page
		pointer is the FIL_PAGE_NEXT field in the page header: */

		page_no = next_page_no;
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}

func_exit:
	inflateEnd(&d_stream);
	mem_heap_free(heap);
	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
	return(d_stream.total_out);
}
/*******************************************************************//**
Copies the prefix of an externally stored field of a record. The
clustered index record that points to this BLOB must be protected by a
lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_externally_stored_field_prefix_low(
/*========================================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		space_id,/*!< in: space id of the first BLOB page */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	if (UNIV_UNLIKELY(len == 0)) {
		return(0);
	}

	if (zip_size) {
		return(btr_copy_zblob_prefix(buf, len, zip_size,
					     space_id, page_no, offset));
	} else {
		return(btr_copy_blob_prefix(buf, len, space_id,
					    page_no, offset));
	}
}
/*******************************************************************//**
Copies the prefix of an externally stored field of a record. The
clustered index record must be protected by a lock or a page latch.
@return the length of the copied field, or 0 if the column was being
or has been deleted */
UNIV_INTERN
ulint
btr_copy_externally_stored_field_prefix(
/*====================================*/
	byte*		buf,	/*!< out: the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	const byte*	data,	/*!< in: 'internally' stored part of the
				field containing also the reference to
				the external part; must be protected by
				a lock or a page latch */
	ulint		local_len)/*!< in: length of data, in bytes */
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	if (UNIV_UNLIKELY(local_len >= len)) {
		memcpy(buf, data, len);
		return(len);
	}

	memcpy(buf, data, local_len);
	data += local_len;

	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));

	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
		/* The externally stored part of the column has been
		(partially) deleted. Signal the half-deleted BLOB
		to the caller. */

		return(0);
	}

	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);

	return(local_len
	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
							     len - local_len,
							     zip_size,
							     space_id, page_no,
							     offset));
}
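
/* Note that the 0 return value of the function above is a sentinel
rather than a real length: it tells the caller that the externally
stored part of the column was being, or already has been, deleted.
The 4-byte length at BTR_EXTERN_LEN + 4 is zeroed by
btr_free_externally_stored_field() as soon as it starts freeing the
page chain, which makes a half-deleted BLOB detectable instead of
yielding a wrong prefix. */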
/*******************************************************************//**
Copies an externally stored field of a record to mem heap. The
clustered index record must be protected by a lock or a page latch.
@return the whole field copied to heap */
UNIV_INTERN
byte*
btr_copy_externally_stored_field(
/*=============================*/
	ulint*		len,	/*!< out: length of the whole field */
	const byte*	data,	/*!< in: 'internally' stored part of the
				field containing also the reference to
				the external part; must be protected by
				a lock or a page latch */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		local_len,/*!< in: length of data */
	mem_heap_t*	heap)	/*!< in: mem heap */
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;
	ulint	extern_len;
	byte*	buf;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);

	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);

	memcpy(buf, data, local_len);
	*len = local_len
		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
							      extern_len,
							      zip_size,
							      space_id,
							      page_no, offset);

	return(buf);
}
/*******************************************************************//**
Copies an externally stored field of a record to mem heap.
@return the field copied to heap, or NULL if the field is incomplete */
UNIV_INTERN
byte*
btr_rec_copy_externally_stored_field(
/*=================================*/
	const rec_t*	rec,	/*!< in: record in a clustered index;
				must be protected by a lock or a page latch */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		no,	/*!< in: field number */
	ulint*		len,	/*!< out: length of the field */
	mem_heap_t*	heap)	/*!< in: mem heap */
{
	ulint		local_len;
	const byte*	data;

	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field can contain some initial
	data from the field, and in the last 20 bytes it has the
	space id, page number, and offset where the rest of the
	field data is stored, and the data length in addition to
	the data stored locally. We may need to store some data
	locally to get the local record length above the 128 byte
	limit so that field offsets are stored in two bytes, and
	the extern bit is available in those two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	if (UNIV_UNLIKELY
	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
		/* The externally stored field was not written yet.
		This record should only be seen by
		recv_recovery_rollback_active() or any
		TRX_ISO_READ_UNCOMMITTED transactions. */
		return(NULL);
	}

	return(btr_copy_externally_stored_field(len, data,
						zip_size, local_len, heap));
}
#endif /* !UNIV_HOTBACKUP */