diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index e386c162ed9..93ad4b671c8 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -1262,7 +1262,7 @@ void btr_drop_temporary_table(const dict_table_t &table) { if (buf_block_t *block= buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, - nullptr, false, nullptr)) + nullptr, false)) { btr_free_but_not_root(block, MTR_LOG_NO_REDO); mtr.set_log_mode(MTR_LOG_NO_REDO); diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index cfbc6532c41..b6d50f19e31 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -938,24 +938,21 @@ static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode) MY_ATTRIBUTE((nonnull,warn_unused_result)) /** Acquire a latch on the previous page without violating the latching order. -@param block index page -@param page_id page identifier with valid space identifier -@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param rw_latch the latch on block (RW_S_LATCH or RW_X_LATCH) -@param mtr mini-transaction +@param page_id page identifier with valid space identifier @param err error code +@param mtr mini-transaction @retval 0 if an error occurred @retval 1 if the page could be latched in the wrong order @retval -1 if the latch on block was temporarily released */ -static int btr_latch_prev(buf_block_t *block, page_id_t page_id, - ulint zip_size, - rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err) +static int btr_latch_prev(rw_lock_type_t rw_latch, + page_id_t page_id, dberr_t *err, mtr_t *mtr) { ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); - ut_ad(page_id.space() == block->page.id().space()); - const auto prev_savepoint= mtr->get_savepoint(); - ut_ad(block == mtr->at_savepoint(prev_savepoint - 1)); + buf_block_t *block= mtr->at_savepoint(mtr->get_savepoint() - 1); + + ut_ad(page_id.space() == block->page.id().space()); 
const page_t *const page= block->page.frame; page_id.set_page_no(btr_page_get_prev(page)); @@ -971,68 +968,78 @@ static int btr_latch_prev(buf_block_t *block, page_id_t page_id, buffer-fixes on both blocks will prevent eviction. */ retry: - /* Pass no_wait pointer to ensure that we don't wait on the current page - latch while holding the next page latch to avoid latch ordering violation. */ - bool no_wait= false; int ret= 1; - - buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr, - BUF_GET, mtr, err, false, &no_wait); + buf_block_t *prev= buf_pool.page_fix(page_id, err, buf_pool_t::FIX_NOWAIT); if (UNIV_UNLIKELY(!prev)) - { - /* Check if we had to return because we couldn't wait on latch. */ - if (no_wait) - goto ordered_latch; return 0; - } + if (prev == reinterpret_cast(-1)) + { + /* The block existed in buf_pool.page_hash, but not in a state that is + safe to access without waiting for some pending operation, such as + buf_page_t::read_complete() or buf_pool_t::unzip(). - static_assert(MTR_MEMO_PAGE_S_FIX == mtr_memo_type_t(BTR_SEARCH_LEAF), ""); - static_assert(MTR_MEMO_PAGE_X_FIX == mtr_memo_type_t(BTR_MODIFY_LEAF), ""); + Retry while temporarily releasing the successor block->page.lock + (but retaining a buffer-fix so that the block cannot be evicted). */ - if (rw_latch == RW_S_LATCH - ? 
prev->page.lock.s_lock_try() : prev->page.lock.x_lock_try()) - { - mtr->lock_register(prev_savepoint, mtr_memo_type_t(rw_latch)); - if (UNIV_UNLIKELY(prev->page.id() != page_id)) + if (rw_latch == RW_S_LATCH) + block->page.lock.s_unlock(); + else + block->page.lock.x_unlock(); + + prev= buf_pool.page_fix(page_id, err, buf_pool_t::FIX_WAIT_READ); + + if (!prev) { - fail: - /* the page was just read and found to be corrupted */ - mtr->rollback_to_savepoint(prev_savepoint); + ut_ad(*err != DB_SUCCESS); + if (rw_latch == RW_S_LATCH) + block->page.lock.s_lock(); + else + block->page.lock.x_lock(); return 0; } + else if (rw_latch == RW_S_LATCH) + goto wait_for_s; + else + goto wait_for_x; } + + static_assert(MTR_MEMO_PAGE_S_FIX == mtr_memo_type_t(BTR_SEARCH_LEAF), ""); + static_assert(MTR_MEMO_PAGE_X_FIX == mtr_memo_type_t(BTR_MODIFY_LEAF), ""); + + if (rw_latch == RW_S_LATCH + ? prev->page.lock.s_lock_try() + : prev->page.lock.x_lock_try()) + mtr->memo_push(prev, mtr_memo_type_t(rw_latch)); else { - ut_ad(mtr->at_savepoint(mtr->get_savepoint() - 1)->page.id() == page_id); - mtr->release_last_page(); -ordered_latch: if (rw_latch == RW_S_LATCH) + { block->page.lock.s_unlock(); - else - block->page.lock.x_unlock(); - - prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev, - BUF_GET, mtr, err); - if (rw_latch == RW_S_LATCH) + wait_for_s: + prev->page.lock.s_lock(); block->page.lock.s_lock(); + } else + { + block->page.lock.x_unlock(); + wait_for_x: + prev->page.lock.x_lock(); block->page.lock.x_lock(); + } + ut_ad(block == mtr->at_savepoint(mtr->get_savepoint() - 1)); + mtr->memo_push(prev, mtr_memo_type_t(rw_latch)); const page_id_t prev_page_id= page_id; page_id.set_page_no(btr_page_get_prev(page)); + ret= -1; if (UNIV_UNLIKELY(page_id != prev_page_id)) { mtr->release_last_page(); if (page_id.page_no() == FIL_NULL) - return -1; + return ret; goto retry; } - - if (UNIV_UNLIKELY(!prev)) - goto fail; - - ret= -1; } const page_t *const p= prev->page.frame; @@ -1061,11 
+1068,11 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, btr_intention_t lock_intention; bool detected_same_key_root= false; - mem_heap_t* heap = NULL; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets2 = offsets2_; + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets2= offsets2_; rec_offs_init(offsets_); rec_offs_init(offsets2_); @@ -1314,7 +1321,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, ut_a(page_zip_validate(page_zip, block->page.frame, index())); #endif /* UNIV_ZIP_DEBUG */ - const uint32_t page_level= btr_page_get_level(block->page.frame); + uint32_t page_level= btr_page_get_level(block->page.frame); if (height == ULINT_UNDEFINED) { @@ -1322,6 +1329,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, #ifdef BTR_CUR_ADAPT info->root_guess= block; #endif + reached_root: height= page_level; tree_height= height + 1; @@ -1331,35 +1339,55 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, We may have to reacquire the page latch in a different mode. */ switch (rw_latch) { case RW_S_LATCH: - if ((latch_mode & ~12) != RW_S_LATCH) + if (!(latch_mode & BTR_SEARCH_LEAF)) { + rw_latch= RW_X_LATCH; ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH); - goto relatch_x; - } - if (latch_mode != BTR_MODIFY_PREV) - { - if (!latch_by_caller) - /* Release the tree s-latch */ - mtr->rollback_to_savepoint(savepoint, savepoint + 1); - goto reached_latched_leaf; + mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_X_FIX); + if (!block->page.lock.s_x_upgrade_try()) + { + block->page.lock.s_unlock(); + block->page.lock.x_lock(); + /* Dropping the index tree (and freeing the root page) + should be impossible while we hold index()->lock. 
*/ + ut_ad(!block->page.is_freed()); + page_level= btr_page_get_level(block->page.frame); + if (UNIV_UNLIKELY(page_level != 0)) + { + /* btr_root_raise_and_insert() was executed meanwhile */ + ut_ad(mtr->memo_contains_flagged(&index()->lock, + MTR_MEMO_S_LOCK)); + block->page.lock.x_u_downgrade(); + block->page.lock.u_s_downgrade(); + rw_latch= RW_S_LATCH; + mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_S_FIX); + goto reached_root; + } + } } - /* fall through */ + if (latch_mode == BTR_MODIFY_PREV) + goto reached_leaf; + if (rw_latch != RW_S_LATCH) + break; + if (!latch_by_caller) + /* Release the tree s-latch */ + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + goto reached_latched_leaf; case RW_SX_LATCH: - ut_ad(rw_latch == RW_S_LATCH || - latch_mode == BTR_MODIFY_ROOT_AND_LEAF); - relatch_x: - mtr->rollback_to_savepoint(block_savepoint); - height= ULINT_UNDEFINED; + ut_ad(latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + static_assert(int{BTR_MODIFY_ROOT_AND_LEAF} == int{RW_SX_LATCH}, ""); rw_latch= RW_X_LATCH; - goto search_loop; + mtr->lock_register(block_savepoint, MTR_MEMO_PAGE_X_FIX); + block->page.lock.u_x_upgrade(); + break; case RW_X_LATCH: if (latch_mode == BTR_MODIFY_TREE) goto reached_index_root_and_leaf; - goto reached_root_and_leaf; + break; case RW_NO_LATCH: ut_ad(0); } - goto reached_leaf; + goto reached_root_and_leaf; } } else if (UNIV_UNLIKELY(height != page_level)) @@ -1417,7 +1445,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, /* latch also siblings from left to right */ if (page_has_prev(block->page.frame) && - !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + !btr_latch_prev(rw_latch, page_id, &err, mtr)) goto func_exit; if (page_has_next(block->page.frame) && !btr_block_get(*index(), btr_page_get_next(block->page.frame), @@ -1442,7 +1470,7 @@ release_tree: ut_ad(rw_latch == RW_X_LATCH); /* x-latch also siblings from left to right */ if (page_has_prev(block->page.frame) && - 
!btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + !btr_latch_prev(rw_latch, page_id, &err, mtr)) goto func_exit; if (page_has_next(block->page.frame) && !btr_block_get(*index(), btr_page_get_next(block->page.frame), @@ -1590,7 +1618,7 @@ release_tree: ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); if (!not_first_access) - buf_read_ahead_linear(page_id, zip_size, false); + buf_read_ahead_linear(page_id, false); if (page_has_prev(block->page.frame) && page_rec_is_first(page_cur.rec, block->page.frame)) @@ -1599,7 +1627,7 @@ release_tree: /* Latch the previous page if the node pointer is the leftmost of the current page. */ - int ret= btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err); + int ret= btr_latch_prev(rw_latch, page_id, &err, mtr); if (!ret) goto func_exit; ut_ad(block_savepoint + 2 == mtr->get_savepoint()); @@ -1632,7 +1660,7 @@ release_tree: ? BUF_GET_IF_IN_POOL_OR_WATCH : BUF_GET_IF_IN_POOL; else if (!not_first_access) - buf_read_ahead_linear(page_id, zip_size, false); + buf_read_ahead_linear(page_id, false); break; case BTR_MODIFY_TREE: ut_ad(rw_latch == RW_X_LATCH); @@ -1784,8 +1812,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, #endif /* UNIV_ZIP_DEBUG */ if (page_has_prev(block->page.frame) && - !btr_latch_prev(block, page_id, block->zip_size(), - RW_X_LATCH, mtr, &err)) + !btr_latch_prev(RW_X_LATCH, page_id, &err, mtr)) goto func_exit; if (page_has_next(block->page.frame) && !btr_block_get(*index(), btr_page_get_next(block->page.frame), @@ -1994,7 +2021,6 @@ index_locked: page_cur.index = index; uint32_t page= index->page; - const auto zip_size= index->table->space->zip_size(); for (ulint height= ULINT_UNDEFINED;;) { @@ -2045,8 +2071,7 @@ index_locked: { /* x-latch also siblings from left to right */ if (page_has_prev(block->page.frame) && - !btr_latch_prev(block, block->page.id(), zip_size, RW_X_LATCH, - mtr, &err)) + !btr_latch_prev(RW_X_LATCH, block->page.id(), &err, mtr)) break; if 
(page_has_next(block->page.frame) && !btr_block_get(*index, btr_page_get_next(block->page.frame), @@ -2100,8 +2125,7 @@ index_locked: if (latch_mode != BTR_MODIFY_TREE) { if (!height && first && first_access) - buf_read_ahead_linear(page_id_t(block->page.id().space(), page), - block->page.zip_size(), false); + buf_read_ahead_linear({block->page.id().space(), page}, false); } else if (btr_cur_need_opposite_intention(block->page, index->is_clust(), lock_intention, @@ -2126,7 +2150,8 @@ index_locked: { if (!btr_cur_will_modify_tree(index, block->page.frame, lock_intention, page_cur.rec, - node_ptr_max_size, zip_size, mtr)) + node_ptr_max_size, + index->table->space->zip_size(), mtr)) { ut_ad(n_blocks); /* release buffer-fixes on pages that will not be modified @@ -6716,7 +6741,7 @@ btr_copy_blob_prefix( return copied_len; } if (!buf_page_make_young_if_needed(&block->page)) { - buf_read_ahead_linear(id, 0, false); + buf_read_ahead_linear(id, false); } page = buf_block_get_frame(block); @@ -6795,7 +6820,7 @@ btr_copy_zblob_prefix( bpage is protected by the B-tree page latch that is being held on the clustered index record, or, in row_merge_copy_blobs(), by an exclusive table lock. 
*/ - bpage = buf_page_get_zip(id, zip_size); + bpage = buf_page_get_zip(id); if (UNIV_UNLIKELY(!bpage)) { ib::error() << "Cannot load compressed BLOB " << id; diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc index d30f037ab99..c7fcfd205fb 100644 --- a/storage/innobase/btr/btr0pcur.cc +++ b/storage/innobase/btr/btr0pcur.cc @@ -548,9 +548,7 @@ btr_pcur_move_to_next_page( const auto s = mtr->get_savepoint(); mtr->rollback_to_savepoint(s - 2, s - 1); if (first_access) { - buf_read_ahead_linear(next_block->page.id(), - next_block->zip_size(), - ibuf_inside(mtr)); + buf_read_ahead_linear(next_block->page.id(), ibuf_inside(mtr)); } return DB_SUCCESS; } diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 7a466939eae..580e004ab3b 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2180,13 +2180,10 @@ be implemented at a higher level. In other words, all possible accesses to a given page through this function must be protected by the same set of mutexes or latches. @param page_id page identifier -@param zip_size ROW_FORMAT=COMPRESSED page size in bytes @return pointer to the block, s-latched */ TRANSACTIONAL_TARGET -buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size) +buf_page_t* buf_page_get_zip(const page_id_t page_id) { - ut_ad(zip_size); - ut_ad(ut_is_2pow(zip_size)); ha_handler_stats *const stats= mariadb_stats; buf_inc_get(stats); @@ -2287,7 +2284,7 @@ lookup: return bpage; must_read_page: - switch (dberr_t err= buf_read_page(page_id, zip_size)) { + switch (dberr_t err= buf_read_page(page_id)) { case DB_SUCCESS: case DB_SUCCESS_LOCKED_REC: mariadb_increment_pages_read(stats); @@ -2322,8 +2319,8 @@ buf_block_init_low( /********************************************************************//** Decompress a block. 
-@return TRUE if successful */ -ibool +@return true if successful */ +bool buf_zip_decompress( /*===============*/ buf_block_t* block, /*!< in/out: block */ @@ -2367,7 +2364,7 @@ func_exit: if (space) { space->release(); } - return(TRUE); + return true; } ib::error() << "Unable to decompress " @@ -2401,7 +2398,7 @@ err_exit: space->release(); } - return(FALSE); + return false; } ATTRIBUTE_COLD @@ -2476,7 +2473,99 @@ static bool buf_page_ibuf_merge_try(buf_block_t *block, ulint rw_latch, return false; } -buf_block_t* buf_pool_t::page_fix(const page_id_t id) +ATTRIBUTE_COLD +buf_block_t *buf_pool_t::unzip(buf_page_t *b, buf_pool_t::hash_chain &chain) +{ + buf_block_t *block= buf_LRU_get_free_block(false); + buf_block_init_low(block); + page_hash_latch &hash_lock= page_hash.lock_get(chain); + wait_for_unfix: + mysql_mutex_lock(&mutex); + hash_lock.lock(); + + /* b->lock implies !b->can_relocate() */ + ut_ad(b->lock.have_x()); + ut_ad(b == page_hash.get(b->id(), chain)); + + /* Wait for b->unfix() in any other threads. */ + uint32_t state= b->state(); + ut_ad(buf_page_t::buf_fix_count(state)); + ut_ad(!buf_page_t::is_freed(state)); + + switch (state) { + case buf_page_t::UNFIXED + 1: + case buf_page_t::IBUF_EXIST + 1: + case buf_page_t::REINIT + 1: + break; + default: + ut_ad(state < buf_page_t::READ_FIX); + + if (state < buf_page_t::UNFIXED + 1) + { + ut_ad(state > buf_page_t::FREED); + b->lock.x_unlock(); + hash_lock.unlock(); + buf_LRU_block_free_non_file_page(block); + mysql_mutex_unlock(&mutex); + b->unfix(); + return nullptr; + } + + mysql_mutex_unlock(&mutex); + hash_lock.unlock(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + goto wait_for_unfix; + } + + /* Ensure that another buf_page_get_low() or buf_page_t::page_fix() + will wait for block->page.lock.x_unlock(). buf_relocate() will + copy the state from b to block and replace b with block in page_hash. 
*/ + b->set_state(buf_page_t::READ_FIX); + + mysql_mutex_lock(&flush_list_mutex); + buf_relocate(b, &block->page); + + /* X-latch the block for the duration of the decompression. */ + block->page.lock.x_lock(); + + buf_flush_relocate_on_flush_list(b, &block->page); + mysql_mutex_unlock(&flush_list_mutex); + + /* Insert at the front of unzip_LRU list */ + buf_unzip_LRU_add_block(block, false); + + mysql_mutex_unlock(&mutex); + hash_lock.unlock(); + +#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG + b->lock.x_unlock(); + b->lock.free(); +#endif + ut_free(b); + + n_pend_unzip++; + const bool ok{buf_zip_decompress(block, false)}; + n_pend_unzip--; + + if (UNIV_UNLIKELY(!ok)) + { + mysql_mutex_lock(&mutex); + block->page.read_unfix(state); + block->page.lock.x_unlock(); + if (!buf_LRU_free_page(&block->page, true)) + ut_ad(0); + mysql_mutex_unlock(&mutex); + return nullptr; + } + else + block->page.read_unfix(state); + + return block; +} + +buf_block_t *buf_pool_t::page_fix(const page_id_t id, + dberr_t *err, + buf_pool_t::page_fix_conflicts c) { ha_handler_stats *const stats= mariadb_stats; buf_inc_get(stats); @@ -2486,37 +2575,97 @@ buf_block_t* buf_pool_t::page_fix(const page_id_t id) { hash_lock.lock_shared(); buf_page_t *b= page_hash.get(id, chain); - if (b) + if (b && !watch_is_sentinel(*b)) { - uint32_t state= b->fix(); - hash_lock.unlock_shared(); + uint32_t state= b->fix() + 1; ut_ad(!b->in_zip_hash); - ut_ad(b->frame); - ut_ad(state >= buf_page_t::FREED); - if (state >= buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) + hash_lock.unlock_shared(); + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) + { + ut_ad(state > buf_page_t::FREED); + if (c == FIX_ALSO_FREED && b->id() == id) + { + ut_ad(state == buf_page_t::FREED + 1); + return reinterpret_cast(b); + } + /* The page was marked as freed or corrupted. 
*/ + unfix_corrupted: + b->unfix(); + corrupted: + if (err) + *err= DB_CORRUPTION; + return nullptr; + } + + if ((state >= buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) || + (state >= buf_page_t::IBUF_EXIST && state < buf_page_t::REINIT)) { + if (c == FIX_NOWAIT) + { + would_block: + b->unfix(); + return reinterpret_cast(-1); + } + + if (UNIV_UNLIKELY(!b->frame)) + { + wait_for_unzip: + b->unfix(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + continue; + } b->lock.s_lock(); state= b->state(); ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX); + + if (state >= buf_page_t::IBUF_EXIST && state < buf_page_t::REINIT && + buf_page_ibuf_merge_try(reinterpret_cast(b), + RW_S_LATCH, err)) + goto unfix_corrupted; + b->lock.s_unlock(); } - if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) + + if (UNIV_UNLIKELY(!b->frame)) { - /* The page was marked as freed or corrupted. */ - b->unfix(); - b= nullptr; + if (b->lock.x_lock_try()); + else if (c == FIX_NOWAIT) + goto would_block; + else + goto wait_for_unzip; + + buf_block_t *block= unzip(b, chain); + if (!block) + goto corrupted; + + b= &block->page; + state= b->state(); + + if (state >= buf_page_t::IBUF_EXIST && state < buf_page_t::REINIT && + buf_page_ibuf_merge_try(block, RW_X_LATCH, err)) + goto unfix_corrupted; + + b->lock.x_unlock(); } + return reinterpret_cast(b); } hash_lock.unlock_shared(); - switch (buf_read_page(id, 0)) { + + if (c == FIX_NOWAIT) + return reinterpret_cast(-1); + + switch (dberr_t local_err= buf_read_page(id)) { default: + if (err) + *err= local_err; return nullptr; case DB_SUCCESS: case DB_SUCCESS_LOCKED_REC: mariadb_increment_pages_read(stats); - buf_read_ahead_random(id, 0, false); + buf_read_ahead_random(id, false); } } } @@ -2524,42 +2673,30 @@ buf_block_t* buf_pool_t::page_fix(const page_id_t id) /** Low level function used to get access to a database page. 
@param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] rw_latch latch mode @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH @param[in] mtr mini-transaction @param[out] err DB_SUCCESS or error code @param[in] allow_ibuf_merge Allow change buffer merge to happen -while reading the page from file -then it makes sure that it does merging of change buffer changes while -reading the page from file. -@param[in,out] no_wait If not NULL on input, then we must not -wait for current page latch. On output, the value is set to true if we had to -return because we could not wait on page latch. -@return pointer to the block or NULL */ +@return pointer to the block +@retval nullptr if the block is corrupted or unavailable */ TRANSACTIONAL_TARGET buf_block_t* buf_page_get_low( const page_id_t page_id, ulint zip_size, - ulint rw_latch, + rw_lock_type_t rw_latch, buf_block_t* guess, ulint mode, mtr_t* mtr, dberr_t* err, - bool allow_ibuf_merge, - bool* no_wait) + bool allow_ibuf_merge) { - unsigned access_time; ulint retries = 0; - ut_ad(!mtr || mtr->is_active()); - ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL); - ut_ad((rw_latch == RW_S_LATCH) - || (rw_latch == RW_X_LATCH) - || (rw_latch == RW_SX_LATCH) - || (rw_latch == RW_NO_LATCH)); + ut_ad(mtr->is_active()); ut_ad(rw_latch != RW_NO_LATCH || !allow_ibuf_merge); if (err) { @@ -2586,7 +2723,7 @@ buf_page_get_low( } #endif /* UNIV_DEBUG */ - ut_ad(!mtr || !ibuf_inside(mtr) + ut_ad(!ibuf_inside(mtr) || ibuf_page_low(page_id, zip_size, FALSE, NULL)); ha_handler_stats* const stats = mariadb_stats; @@ -2658,11 +2795,11 @@ loop: corrupted, or if an encrypted page with a valid checksum cannot be decypted. 
*/ - switch (dberr_t local_err = buf_read_page(page_id, zip_size)) { + switch (dberr_t local_err = buf_read_page(page_id)) { case DB_SUCCESS: case DB_SUCCESS_LOCKED_REC: mariadb_increment_pages_read(stats); - buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); + buf_read_ahead_random(page_id, ibuf_inside(mtr)); break; default: if (mode != BUF_GET_POSSIBLY_FREED @@ -2707,18 +2844,7 @@ ignore_unfixed: in buf_page_t::read_complete() or buf_pool_t::corrupted_evict(), or after buf_zip_decompress() in this function. */ - if (!no_wait) { - block->page.lock.s_lock(); - } else if (!block->page.lock.s_lock_try()) { - ut_ad(rw_latch == RW_NO_LATCH); - /* We should not wait trying to acquire S latch for - current page while holding latch for the next page. - It would violate the latching order resulting in - possible deadlock. Caller must handle the failure. */ - block->page.unfix(); - *no_wait= true; - return nullptr; - } + block->page.lock.s_lock(); state = block->page.state(); ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX); @@ -2748,18 +2874,6 @@ ignore_unfixed: } ut_ad(id == page_id); } else if (mode != BUF_PEEK_IF_IN_POOL) { - } else if (!mtr) { - ut_ad(!block->page.oldest_modification()); - mysql_mutex_lock(&buf_pool.mutex); - block->unfix(); - -free_unfixed_block: - if (!buf_LRU_free_page(&block->page, true)) { - ut_ad(0); - } - - mysql_mutex_unlock(&buf_pool.mutex); - return nullptr; } else if (UNIV_UNLIKELY(!block->page.frame)) { /* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an adaptive hash index. There cannot be an @@ -2770,121 +2884,6 @@ free_unfixed_block: ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL || block->zip_size() == zip_size); - if (UNIV_UNLIKELY(!block->page.frame)) { - if (!block->page.lock.x_lock_try()) { -wait_for_unzip: - /* The page is being read or written, or - another thread is executing buf_zip_decompress() - in buf_page_get_low() on it. 
*/ - block->page.unfix(); - std::this_thread::sleep_for( - std::chrono::microseconds(100)); - goto loop; - } - - buf_block_t *new_block = buf_LRU_get_free_block(false); - buf_block_init_low(new_block); - -wait_for_unfix: - mysql_mutex_lock(&buf_pool.mutex); - page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain); - - /* It does not make sense to use - transactional_lock_guard here, because buf_relocate() - would likely make a memory transaction too large. */ - hash_lock.lock(); - - /* block->page.lock implies !block->page.can_relocate() */ - ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain)); - - /* Wait for any other threads to release their buffer-fix - on the compressed-only block descriptor. - FIXME: Never fix() before acquiring the lock. - Only in buf_page_get_gen(), buf_page_get_low(), buf_page_free() - we are violating that principle. */ - state = block->page.state(); - - switch (state) { - case buf_page_t::UNFIXED + 1: - case buf_page_t::IBUF_EXIST + 1: - case buf_page_t::REINIT + 1: - break; - default: - ut_ad(state < buf_page_t::READ_FIX); - - if (state < buf_page_t::UNFIXED + 1) { - ut_ad(state > buf_page_t::FREED); - block->page.lock.x_unlock(); - hash_lock.unlock(); - buf_LRU_block_free_non_file_page(new_block); - mysql_mutex_unlock(&buf_pool.mutex); - goto ignore_block; - } - - mysql_mutex_unlock(&buf_pool.mutex); - hash_lock.unlock(); - std::this_thread::sleep_for( - std::chrono::microseconds(100)); - goto wait_for_unfix; - } - - /* Ensure that another buf_page_get_low() will wait for - new_block->page.lock.x_unlock(). */ - block->page.set_state(buf_page_t::READ_FIX); - - /* Move the compressed page from block->page to new_block, - and uncompress it. */ - - mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_relocate(&block->page, &new_block->page); - - /* X-latch the block for the duration of the decompression. 
*/ - new_block->page.lock.x_lock(); - ut_d(block->page.lock.x_unlock()); - - buf_flush_relocate_on_flush_list(&block->page, - &new_block->page); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - - /* Insert at the front of unzip_LRU list */ - buf_unzip_LRU_add_block(new_block, FALSE); - - mysql_mutex_unlock(&buf_pool.mutex); - hash_lock.unlock(); - -#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG - block->page.lock.free(); -#endif - ut_free(reinterpret_cast(block)); - block = new_block; - - buf_pool.n_pend_unzip++; - - access_time = block->page.is_accessed(); - - if (!access_time && !recv_no_ibuf_operations - && ibuf_page_exists(block->page.id(), block->zip_size())) { - state = buf_page_t::IBUF_EXIST + 1; - } - - /* Decompress the page while not holding - buf_pool.mutex. */ - const auto ok = buf_zip_decompress(block, false); - --buf_pool.n_pend_unzip; - if (!ok) { - if (err) { - *err = DB_PAGE_CORRUPTED; - } - mysql_mutex_lock(&buf_pool.mutex); - } - state = block->page.read_unfix(state); - block->page.lock.x_unlock(); - - if (!ok) { - goto free_unfixed_block; - } - } - #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG re_evict: if (mode != BUF_GET_IF_IN_POOL @@ -2948,10 +2947,29 @@ re_evict_fail: ut_ad((~buf_page_t::LRU_MASK) & state); ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX); + if (UNIV_UNLIKELY(!block->page.frame)) { + if (!block->page.lock.x_lock_try()) { +wait_for_unzip: + /* The page is being read or written, or + another thread is executing buf_pool.unzip() on it. 
*/ + block->page.unfix(); + std::this_thread::sleep_for( + std::chrono::microseconds(100)); + goto loop; + } + + block = buf_pool.unzip(&block->page, chain); + + if (!block) { + goto ignore_unfixed; + } + + block->page.lock.x_unlock(); + } + #ifdef UNIV_DEBUG if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); #endif /* UNIV_DEBUG */ - ut_ad(block->page.frame); /* The state = block->page.state() may be stale at this point, and in fact, at any point of time if we consider its @@ -3014,35 +3032,30 @@ re_evict_fail: /** Get access to a database page. Buffered redo log may be applied. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] rw_latch latch mode @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH @param[in,out] mtr mini-transaction, or NULL @param[out] err DB_SUCCESS or error code -@param[in] allow_ibuf_merge Allow change buffer merge while -reading the pages from file. -@param[in,out] no_wait If not NULL on input, then we must not -wait for current page latch. On output, the value is set to true if we had to -return because we could not wait on page latch. 
-@return pointer to the block or NULL */ +@param[in] allow_ibuf_merge Allow change buffer merge to happen +@return pointer to the block +@retval nullptr if the block is corrupted or unavailable */ buf_block_t* buf_page_get_gen( const page_id_t page_id, ulint zip_size, - ulint rw_latch, + rw_lock_type_t rw_latch, buf_block_t* guess, ulint mode, mtr_t* mtr, dberr_t* err, - bool allow_ibuf_merge, - bool* no_wait) + bool allow_ibuf_merge) { buf_block_t *block= recv_sys.recover(page_id); if (UNIV_LIKELY(!block)) return buf_page_get_low(page_id, zip_size, rw_latch, - guess, mode, mtr, err, allow_ibuf_merge, - no_wait); + guess, mode, mtr, err, allow_ibuf_merge); else if (UNIV_UNLIKELY(block == reinterpret_cast(-1))) { corrupted: @@ -3050,7 +3063,6 @@ buf_page_get_gen( *err= DB_CORRUPTION; return nullptr; } - /* Recovery is a special case; we fix() before acquiring lock. */ auto s= block->page.fix(); ut_ad(s >= buf_page_t::FREED); /* The block may be write-fixed at this point because we are not @@ -3097,12 +3109,21 @@ buf_page_get_gen( } } - if (rw_latch == RW_X_LATCH) - { - mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); - return block; + switch (rw_latch) { + case RW_NO_LATCH: + block->page.lock.x_unlock(); + case RW_X_LATCH: + break; + case RW_SX_LATCH: + block->page.lock.x_u_downgrade(); + break; + case RW_S_LATCH: + block->page.lock.x_u_downgrade(); + block->page.lock.u_s_downgrade(); } - block->page.lock.x_unlock(); + + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); + return block; } mtr->page_lock(block, rw_latch); return block; diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index ee6bffd4031..f05bb96aaa0 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -354,14 +354,12 @@ performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous i/o. 
@param[in] page_id page id of a page which the current thread wants to access -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] ibuf whether we are inside ibuf routine @return number of page read requests issued; NOTE that if we read ibuf pages, it may happen that the page at the given page number does not get read even if we return a positive value! */ TRANSACTIONAL_TARGET -ulint -buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) +ulint buf_read_ahead_random(const page_id_t page_id, bool ibuf) { if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID) /* Disable the read-ahead for temporary tablespace */ @@ -371,9 +369,7 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) /* No read-ahead to avoid thread deadlocks */ return 0; - if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) - /* If it is an ibuf bitmap page or trx sys hdr, we do no - read-ahead, as that could break the ibuf page access order */ + if (trx_sys_hdr_page(page_id)) return 0; if (os_aio_pending_reads_approx() > @@ -384,6 +380,17 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) if (!space) return 0; + const unsigned zip_size{space->zip_size()}; + + if (ibuf_bitmap_page(page_id, zip_size)) + { + /* If it is a change buffer bitmap page, we do no + read-ahead, as that could break the ibuf page access order */ + no_read_ahead: + space->release(); + return 0; + } + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; ulint count= 5 + buf_read_ahead_area / 8; const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); @@ -403,9 +410,7 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) goto read_ahead; } -no_read_ahead: - space->release(); - return 0; + goto no_read_ahead; read_ahead: if (space->is_stopping()) @@ -449,14 +454,13 @@ if it is not already there. Sets the io_fix and an exclusive lock on the buffer frame. 
The flag is cleared and the x-lock released by the i/o-handler thread. @param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @retval DB_SUCCESS if the page was read and is not corrupted @retval DB_SUCCESS_LOCKED_REC if the page was not read @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ -dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) +dberr_t buf_read_page(const page_id_t page_id) { fil_space_t *space= fil_space_t::get(page_id.space()); if (!space) @@ -468,7 +472,7 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */ return buf_read_page_low(space, true, BUF_READ_ANY_PAGE, - page_id, zip_size, false); + page_id, space->zip_size(), false); } /** High-level function which reads a page asynchronously from a file to the @@ -515,12 +519,10 @@ NOTE 3: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous io. @param[in] page_id page id; see NOTE 3 above -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] ibuf whether if we are inside ibuf routine @return number of page read requests issued */ TRANSACTIONAL_TARGET -ulint -buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) +ulint buf_read_ahead_linear(const page_id_t page_id, bool ibuf) { /* check if readahead is disabled. 
Disable the read ahead logic for temporary tablespace */ @@ -547,15 +549,12 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) /* This is not a border page of the area */ return 0; - if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) - /* If it is an ibuf bitmap page or trx sys hdr, we do no - read-ahead, as that could break the ibuf page access order */ - return 0; - fil_space_t *space= fil_space_t::get(page_id.space()); if (!space) return 0; + const unsigned zip_size= space->zip_size(); + if (high_1.page_no() > space->last_page_number()) { /* The area is not whole. */ @@ -564,6 +563,11 @@ fail: return 0; } + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + goto fail; + /* How many out of order accessed pages can we ignore when working out the access pattern for linear readahead */ ulint count= std::min(buf_pool_t::READ_AHEAD_PAGES - diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc index 4aab68e9ca2..fb34080966b 100644 --- a/storage/innobase/gis/gis0sea.cc +++ b/storage/innobase/gis/gis0sea.cc @@ -649,7 +649,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, search_loop: auto buf_mode= BUF_GET; - ulint rw_latch= RW_NO_LATCH; + rw_lock_type_t rw_latch= RW_NO_LATCH; if (height) { @@ -660,7 +660,7 @@ dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, rw_latch= upper_rw_latch; } else if (latch_mode <= BTR_MODIFY_LEAF) - rw_latch= latch_mode; + rw_latch= rw_lock_type_t(latch_mode); dberr_t err; auto block_savepoint= mtr->get_savepoint(); diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index a7a32e67203..033857857e0 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -929,10 +929,12 @@ ibuf_page_low( ut_ad(fil_system.sys_space->purpose == 
FIL_TYPE_TABLESPACE); #ifdef UNIV_DEBUG - if (!x_latch) { - mtr_start(&local_mtr); - - /* Get the bitmap page without a page latch, so that + if (x_latch) { + } else if (buf_block_t* block = buf_pool.page_fix( + ibuf_bitmap_page_no_calc(page_id, zip_size))) { + local_mtr.start(); + local_mtr.memo_push(block, MTR_MEMO_BUF_FIX); + /* We got the bitmap page without a page latch, so that we will not be violating the latching order when another bitmap page has already been latched by this thread. The page will be buffer-fixed, and thus it @@ -942,16 +944,10 @@ ibuf_page_low( not be modified by any other thread. Nobody should be calling ibuf_add_free_page() or ibuf_remove_free_page() while the page is linked to the insert buffer b-tree. */ - buf_block_t* block = buf_page_get_gen( - ibuf_bitmap_page_no_calc(page_id, zip_size), - zip_size, RW_NO_LATCH, nullptr, BUF_GET, &local_mtr); - - ret = block - && ibuf_bitmap_page_get_bits_low( + ret = ibuf_bitmap_page_get_bits_low( block->page.frame, page_id, zip_size, MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF); - - mtr_commit(&local_mtr); + local_mtr.commit(); return(ret); } #endif /* UNIV_DEBUG */ diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index a2c55f3edf7..82179e6d646 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -194,42 +194,37 @@ be implemented at a higher level. In other words, all possible accesses to a given page through this function must be protected by the same set of mutexes or latches. @param page_id page identifier -@param zip_size ROW_FORMAT=COMPRESSED page size in bytes @return pointer to the block, s-latched */ -buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size); +buf_page_t *buf_page_get_zip(const page_id_t page_id); /** Get access to a database page. Buffered redo log may be applied. 
@param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] rw_latch latch mode @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH @param[in,out] mtr mini-transaction @param[out] err DB_SUCCESS or error code -@param[in] allow_ibuf_merge Allow change buffer merge while -reading the pages from file. -@param[in,out] no_wait If not NULL on input, then we must not -wait for current page latch. On output, the value is set to true if we had to -return because we could not wait on page latch. -@return pointer to the block or NULL */ +@param[in] allow_ibuf_merge Allow change buffer merge to happen +@return pointer to the block +@retval nullptr if the block is corrupted or unavailable */ buf_block_t* buf_page_get_gen( const page_id_t page_id, ulint zip_size, - ulint rw_latch, + rw_lock_type_t rw_latch, buf_block_t* guess, ulint mode, mtr_t* mtr, - dberr_t* err = NULL, - bool allow_ibuf_merge = false, - bool* no_wait = nullptr) + dberr_t* err = nullptr, + bool allow_ibuf_merge = false) MY_ATTRIBUTE((nonnull(6), warn_unused_result)); /** This is the low level function used to get access to a database page. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] rw_latch latch mode @param[in] guess guessed block or NULL @param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH @@ -237,26 +232,19 @@ BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH block with page_id is to be evicted @param[out] err DB_SUCCESS or error code @param[in] allow_ibuf_merge Allow change buffer merge to happen -while reading the page from file -then it makes sure that it does merging of change buffer changes while -reading the page from file. 
-@param[in] holds_next_page_latch True if caller holds next page latch. -We must not wait for current page latch. -@param[in,out] no_wait If not NULL on input, then we must not -wait for current page latch. On output, the value is set to true if we had to -return because we could not wait on page latch. -@return pointer to the block or NULL */ +@return pointer to the block +@retval nullptr if the block is corrupted or unavailable */ buf_block_t* buf_page_get_low( const page_id_t page_id, ulint zip_size, - ulint rw_latch, + rw_lock_type_t rw_latch, buf_block_t* guess, ulint mode, mtr_t* mtr, dberr_t* err, - bool allow_ibuf_merge, - bool* no_wait); + bool allow_ibuf_merge) + MY_ATTRIBUTE((nonnull(6), warn_unused_result)); /** Initialize a page in the buffer pool. The page is usually not read from a file even if it cannot be found in the buffer buf_pool. This is one @@ -398,8 +386,8 @@ void buf_page_print(const byte* read_buf, ulint zip_size = 0) ATTRIBUTE_COLD __attribute__((nonnull)); /********************************************************************//** Decompress a block. -@return TRUE if successful */ -ibool +@return true if successful */ +bool buf_zip_decompress( /*===============*/ buf_block_t* block, /*!< in/out: block */ @@ -664,37 +652,49 @@ public: public: const page_id_t &id() const { return id_; } uint32_t state() const { return zip.fix; } - uint32_t buf_fix_count() const - { - uint32_t f= state(); - ut_ad(f >= FREED); - return f < UNFIXED ? (f - FREED) : (~LRU_MASK & f); - } + static uint32_t buf_fix_count(uint32_t s) + { ut_ad(s >= FREED); return s < UNFIXED ? (s - FREED) : (~LRU_MASK & s); } + + uint32_t buf_fix_count() const { return buf_fix_count(state()); } + /** Check if a file block is io-fixed. + @param s state() + @return whether s corresponds to an io-fixed block */ + static bool is_io_fixed(uint32_t s) + { ut_ad(s >= FREED); return s >= READ_FIX; } + /** Check if a file block is read-fixed. 
+ @param s state() + @return whether s corresponds to a read-fixed block */ + static bool is_read_fixed(uint32_t s) + { return is_io_fixed(s) && s < WRITE_FIX; } + /** Check if a file block is write-fixed. + @param s state() + @return whether s corresponds to a write-fixed block */ + static bool is_write_fixed(uint32_t s) + { ut_ad(s >= FREED); return s >= WRITE_FIX; } + /** @return whether this block is read or write fixed; read_complete() or write_complete() will always release the io-fix before releasing U-lock or X-lock */ - bool is_io_fixed() const - { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; } + bool is_io_fixed() const { return is_io_fixed(state()); } /** @return whether this block is write fixed; write_complete() will always release the write-fix before releasing U-lock */ - bool is_write_fixed() const { return state() >= WRITE_FIX; } - /** @return whether this block is read fixed; this should never hold - when a thread is holding the block lock in any mode */ - bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); } + bool is_write_fixed() const { return is_write_fixed(state()); } + /** @return whether this block is read fixed */ + bool is_read_fixed() const { return is_read_fixed(state()); } /** @return if this belongs to buf_pool.unzip_LRU */ bool belongs_to_unzip_LRU() const { return UNIV_LIKELY_NULL(zip.data) && frame; } - bool is_freed() const - { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; } - bool is_ibuf_exist() const + static bool is_freed(uint32_t s) { ut_ad(s >= FREED); return s < UNFIXED; } + bool is_freed() const { return is_freed(state()); } + static bool is_ibuf_exist(uint32_t s) { - const auto s= state(); ut_ad(s >= UNFIXED); ut_ad(s < READ_FIX); return (s & LRU_MASK) == IBUF_EXIST; } + bool is_ibuf_exist() const { return is_ibuf_exist(state()); } bool is_reinit() const { return !(~state() & REINIT); } void set_reinit(uint32_t prev_state) @@ -1416,11 +1416,43 @@ public: } public: + 
/** page_fix() mode of operation */ + enum page_fix_conflicts{ + /** Fetch if in the buffer pool, also blocks marked as free */ + FIX_ALSO_FREED= -1, + /** Fetch, waiting for page read completion */ + FIX_WAIT_READ, + /** Fetch, but avoid any waits */ + FIX_NOWAIT + }; + /** Look up and buffer-fix a page. + Note: If the page is read-fixed (being read into the buffer pool), + we would have to wait for the page latch before determining if the page + is accessible (it could be corrupted and have been evicted again). + If the caller is holding other page latches so that waiting for this + page latch could lead to lock order inversion (latching order violation), + the mode c=FIX_WAIT_READ must not be used. @param id page identifier + @param err error code (will only be assigned when returning nullptr) + @param c how to handle conflicts @return undo log page, buffer-fixed + @retval -1 if c=FIX_NOWAIT and buffer-fixing would require waiting @retval nullptr if the undo page was corrupted or freed */ - buf_block_t *page_fix(const page_id_t id); + buf_block_t *page_fix(const page_id_t id, dberr_t *err, + page_fix_conflicts c); + + buf_block_t *page_fix(const page_id_t id) + { return page_fix(id, nullptr, FIX_WAIT_READ); } + + + /** Decompress a page and relocate the block descriptor + @param b buffer-fixed compressed-only ROW_FORMAT=COMPRESSED page + @param chain hash table chain for b->id().fold() + @return the decompressed block, x-latched and read-fixed + @retval nullptr if the decompression failed (b->unfix() will be invoked) */ + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)) + buf_block_t *unzip(buf_page_t *b, hash_chain &chain); /** @return whether the buffer pool contains a page @tparam allow_watch whether to allow watch_is_sentinel() @@ -1698,8 +1730,8 @@ public: /** map of block->frame to buf_block_t blocks that belong to buf_buddy_alloc(); protected by buf_pool.mutex */ hash_table_t zip_hash; - Atomic_counter - n_pend_unzip; /*!< number of pending 
decompressions */ + /** number of pending unzip() */ + Atomic_counter n_pend_unzip; time_t last_printout_time; /*!< when buf_print_io was last time diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index 3dd085dda5c..32296720c79 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -34,14 +34,13 @@ buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. @param page_id page id -@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 @retval DB_SUCCESS if the page was read and is not corrupted @retval DB_SUCCESS_LOCKED_REC if the page was not read @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ -dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); +dberr_t buf_read_page(const page_id_t page_id); /** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets @@ -65,13 +64,11 @@ performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous i/o. @param[in] page_id page id of a page which the current thread wants to access -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] ibuf whether we are inside ibuf routine @return number of page read requests issued; NOTE that if we read ibuf pages, it may happen that the page at the given page number does not get read even if we return a positive value! 
*/ -ulint -buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf); +ulint buf_read_ahead_random(const page_id_t page_id, bool ibuf); /** Applies linear read-ahead if in the buf_pool the page is a border page of a linear read-ahead area and all the pages in the area have been accessed. @@ -96,11 +93,10 @@ NOTE 3: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous io. @param[in] page_id page id; see NOTE 3 above -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] ibuf whether if we are inside ibuf routine @return number of page read requests issued */ ulint -buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf); +buf_read_ahead_linear(const page_id_t page_id, bool ibuf); /** Schedule a page for recovery. @param space tablespace diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index b130675da15..4d5f3592b5a 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -2160,38 +2160,43 @@ updated then its state must be set to BUF_PAGE_NOT_USED. @retval DB_SUCCESS or error code. */ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW { - /* If we already had an old page with matching number - in the buffer pool, evict it now, because - we no longer evict the pages on DISCARD TABLESPACE. 
*/ - buf_page_get_low(block->page.id(), get_zip_size(), RW_NO_LATCH, - nullptr, BUF_PEEK_IF_IN_POOL, - nullptr, nullptr, false, nullptr); - - uint16_t page_type; - - if (dberr_t err = update_page(block, page_type)) { - return err; - } - - const bool full_crc32 = fil_space_t::full_crc32(get_space_flags()); - byte* frame = get_frame(block); - memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8); - - if (!block->page.zip.data) { - buf_flush_init_for_writing( - NULL, block->page.frame, NULL, full_crc32); - } else if (fil_page_type_is_index(page_type)) { - buf_flush_init_for_writing( - NULL, block->page.zip.data, &block->page.zip, - full_crc32); - } else { - /* Calculate and update the checksum of non-index - pages for ROW_FORMAT=COMPRESSED tables. */ - buf_flush_update_zip_checksum( - block->page.zip.data, block->zip_size()); - } + /* If we already had an old page with matching number in the buffer + pool, evict it now, because we no longer evict the pages on + DISCARD TABLESPACE. */ + if (buf_block_t *b= buf_pool.page_fix(block->page.id(), nullptr, + buf_pool_t::FIX_ALSO_FREED)) + { + ut_ad(!b->page.oldest_modification()); + mysql_mutex_lock(&buf_pool.mutex); + b->unfix(); - return DB_SUCCESS; + if (!buf_LRU_free_page(&b->page, true)) + ut_ad(0); + + mysql_mutex_unlock(&buf_pool.mutex); + } + + uint16_t page_type; + + if (dberr_t err= update_page(block, page_type)) + return err; + + const bool full_crc32= fil_space_t::full_crc32(get_space_flags()); + byte *frame= get_frame(block); + memset_aligned<8>(frame + FIL_PAGE_LSN, 0, 8); + + if (!block->page.zip.data) + buf_flush_init_for_writing(nullptr, block->page.frame, nullptr, + full_crc32); + else if (fil_page_type_is_index(page_type)) + buf_flush_init_for_writing(nullptr, block->page.zip.data, &block->page.zip, + full_crc32); + else + /* Calculate and update the checksum of non-index + pages for ROW_FORMAT=COMPRESSED tables. 
*/ + buf_flush_update_zip_checksum(block->page.zip.data, block->zip_size()); + + return DB_SUCCESS; } static void reload_fts_table(row_prebuilt_t *prebuilt, diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 276bcb6166d..c623265192a 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -1971,38 +1971,6 @@ corrupted_rec: mem_heap_empty(row_heap); if (!mtr_started) { - goto scan_next; - } - - if (clust_index->lock.is_waiting()) { - /* There are waiters on the clustered - index tree lock, likely the purge - thread. Store and restore the cursor - position, and yield so that scanning a - large table will not starve other - threads. */ - - /* Store the cursor position on the last user - record on the page. */ - if (!btr_pcur_move_to_prev_on_page(&pcur)) { - goto corrupted_index; - } - /* Leaf pages must never be empty, unless - this is the only page in the index tree. */ - if (!btr_pcur_is_on_user_rec(&pcur) - && btr_pcur_get_block(&pcur)->page.id() - .page_no() != clust_index->page) { - goto corrupted_index; - } - - btr_pcur_store_position(&pcur, &mtr); - mtr.commit(); - mtr_started = false; - - /* Give the waiters a chance to proceed. */ - std::this_thread::yield(); -scan_next: - ut_ad(!mtr_started); ut_ad(!mtr.is_active()); mtr.start(); mtr_started = true; @@ -2015,7 +1983,7 @@ scan_next: corrupted_index: err = DB_CORRUPTION; goto func_exit; - } + } /* Move to the successor of the original record. 
*/ if (!btr_pcur_move_to_next_user_rec( @@ -2050,14 +2018,14 @@ end_of_index: buf_page_make_young_if_needed(&block->page); + const auto s = mtr.get_savepoint(); + mtr.rollback_to_savepoint(s - 2, s - 1); + page_cur_set_before_first(block, cur); if (!page_cur_move_to_next(cur) || page_cur_is_after_last(cur)) { goto corrupted_rec; } - - const auto s = mtr.get_savepoint(); - mtr.rollback_to_savepoint(s - 2, s - 1); } } else { mem_heap_empty(row_heap); diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index c0f5b1fb22c..5626b88dcf6 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -185,7 +185,7 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec, return nullptr; if (!buf_page_make_young_if_needed(&block->page)) - buf_read_ahead_linear(block->page.id(), 0, false); + buf_read_ahead_linear(block->page.id(), false); return trx_undo_page_get_last_rec(block, page_no, offset); } @@ -242,7 +242,7 @@ trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no, static trx_undo_rec_t* trx_undo_get_next_rec_from_next_page(const buf_block_t *&block, uint32_t page_no, uint16_t offset, - ulint mode, mtr_t *mtr) + rw_lock_type_t mode, mtr_t *mtr) { if (page_no == block->page.id().page_no() && mach_read_from_2(block->page.frame + offset + TRX_UNDO_NEXT_LOG)) @@ -272,7 +272,8 @@ trx_undo_get_next_rec_from_next_page(const buf_block_t *&block, @retval nullptr if none */ static trx_undo_rec_t* trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no, - uint16_t offset, ulint mode, const buf_block_t*& block, + uint16_t offset, rw_lock_type_t mode, + const buf_block_t *&block, mtr_t *mtr, dberr_t *err) { buf_block_t *b= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode, @@ -282,7 +283,7 @@ trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no, return nullptr; if (!buf_page_make_young_if_needed(&b->page)) - buf_read_ahead_linear(b->page.id(), 0, false); + 
buf_read_ahead_linear(b->page.id(), false); if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(b, page_no, offset)) return rec;