From 716e07693447df0a7f080cdbf51a6de8550ae4c3 Mon Sep 17 00:00:00 2001 From: marko Date: Thu, 27 Aug 2009 06:25:00 +0000 Subject: [PATCH] branches/zip: Replace the constant 3/8 ratio that controls the LRU_old size with the settable global variable innodb_old_blocks_pct. The minimum and maximum values are 5 and 95 per cent, respectively. The default is 100*3/8, in line with the old behavior. ut_time_ms(): New utility function, to return the current time in milliseconds. TODO: Is there a more efficient timestamp function, such as rdtsc divided by a power of two? buf_LRU_old_threshold_ms: New variable, corresponding to innodb_old_blocks_time. The value 0 is the default behaviour: no timeout before making blocks 'new'. bpage->accessed, bpage->LRU_position, buf_pool->ulint_clock: Remove. bpage->access_time: New field, replacing bpage->accessed. Protected by buf_pool_mutex instead of bpage->mutex. Updated when a page is created or accessed the first time in the buffer pool. buf_LRU_old_ratio, innobase_old_blocks_pct: New variables, corresponding to innodb_old_blocks_pct buf_LRU_old_ratio_update(), innobase_old_blocks_pct_update(): Update functions for buf_LRU_old_ratio, innobase_old_blocks_pct. buf_page_peek_if_too_old(): Compare ut_time_ms() to bpage->access_time if buf_LRU_old_threshold_ms && bpage->old. Else observe buf_LRU_old_ratio and bpage->freed_page_clock. buf_pool_t: Add n_pages_made_young, n_pages_not_made_young, n_pages_made_young_old, n_pages_not_made_young, for statistics. buf_print(): Display buf_pool->n_pages_made_young, buf_pool->n_pages_not_made_young. This function is only for crash diagnostics. buf_print_io(): Display buf_pool->LRU_old_len and quantities derived from buf_pool->n_pages_made_young, buf_pool->n_pages_not_made_young. This function is invoked by SHOW ENGINE INNODB STATUS. rb://129 approved by Heikki Tuuri. This addresses Bug #45015. --- ChangeLog | 21 +++++ buf/buf0buf.c | 129 ++++++++++++++----------- buf/buf0lru.c | 219 ++++++++++++++++++++++--------------------- buf/buf0rea.c | 186 ++++-------------------------------- handler/ha_innodb.cc | 41 ++++++++ include/buf0buf.h | 99 +++++++++---------- include/buf0buf.ic | 70 ++++++-------- include/buf0lru.h | 52 ++++++++-- include/ut0ut.h | 9 ++ ut/ut0ut.c | 17 ++++ 10 files changed, 404 insertions(+), 439 deletions(-) diff --git a/ChangeLog b/ChangeLog index e9b1b95dc2d..40cad959f40 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +2009-08-27 The InnoDB Team + + * buf/buf0buf.c, buf/buf0lru.c, buf/buf0rea.c, + handler/ha_innodb.cc, include/buf0buf.h, include/buf0buf.ic, + include/buf0lru.h, include/ut0ut.h, ut/ut0ut.c: + Make it possible to tune the buffer pool LRU eviction policy to be + more resistant against index scans. Introduce the settable global + variables innodb_old_blocks_pct and innodb_old_blocks_time for + controlling the buffer pool eviction policy. The parameter + innodb_old_blocks_pct (5..95) controls the desired amount of "old" + blocks in the LRU list. The default is 37, corresponding to the + old fixed ratio of 3/8. Each time a block is accessed, it will be + moved to the "new" blocks if its first access was at least + innodb_old_blocks_time milliseconds ago (default 0, meaning every + block). The idea is that in index scans, blocks will be accessed + a few times within innodb_old_blocks_time, and they will remain in + the "old" section of the LRU list. Thus, when + innodb_old_blocks_time is nonzero, blocks retrieved for one-time + index scans will be more likely candidates for eviction than + blocks that are accessed in random patterns. + 2009-08-26 The InnoDB Team * handler/ha_innodb.cc, os/os0file.c: diff --git a/buf/buf0buf.c b/buf/buf0buf.c index 0008fcb1271..b7e7ec0dbb6 100644 --- a/buf/buf0buf.c +++ b/buf/buf0buf.c @@ -966,8 +966,6 @@ buf_pool_init(void) buf_pool->no_flush[i] = os_event_create(NULL); } - buf_pool->ulint_clock = 1; - /* 3. Initialize LRU fields --------------------------- */ /* All fields are initialized by mem_zalloc(). */ @@ -1470,31 +1468,6 @@ buf_pool_resize(void) buf_pool_page_hash_rebuild(); } -/********************************************************************//** -Moves the block to the start of the LRU list if there is a danger -that the block would drift out of the buffer pool. */ -UNIV_INLINE -void -buf_block_make_young( -/*=================*/ - buf_page_t* bpage) /*!< in: block to make younger */ -{ - ut_ad(!buf_pool_mutex_own()); - - /* Note that we read freed_page_clock's without holding any mutex: - this is allowed since the result is used only in heuristics */ - - if (buf_page_peek_if_too_old(bpage)) { - - buf_pool_mutex_enter(); - /* There has been freeing activity in the LRU list: - best to move to the head of the LRU list */ - - buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(); - } -} - /********************************************************************//** Moves a page to the start of the buffer pool LRU list. This high-level function can be used to prevent an important page from from slipping out of @@ -1713,13 +1686,15 @@ err_exit: got_block: must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; - buf_pool_mutex_exit(); + if (buf_page_peek_if_too_old(bpage)) { + buf_LRU_make_block_young(bpage); + } - buf_page_set_accessed(bpage, TRUE); + buf_page_set_accessed(bpage); - mutex_exit(block_mutex); + buf_pool_mutex_exit(); - buf_block_make_young(bpage); + mutex_exit(block_mutex); #ifdef UNIV_DEBUG_FILE_ACCESSES ut_a(!bpage->file_page_was_freed); @@ -2000,7 +1975,7 @@ buf_page_get_gen( mtr_t* mtr) /*!< in: mini-transaction */ { buf_block_t* block; - ibool accessed; + unsigned access_time; ulint fix_type; ibool must_read; @@ -2243,17 +2218,20 @@ wait_until_unfixed: UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page); buf_block_buf_fix_inc(block, file, line); - buf_pool_mutex_exit(); + + mutex_exit(&block->mutex); /* Check if this is the first access to the page */ - accessed = buf_page_is_accessed(&block->page); + access_time = buf_page_is_accessed(&block->page); - buf_page_set_accessed(&block->page, TRUE); + if (buf_page_peek_if_too_old(&block->page)) { + buf_LRU_make_block_young(&block->page); + } - mutex_exit(&block->mutex); + buf_page_set_accessed(&block->page); - buf_block_make_young(&block->page); + buf_pool_mutex_exit(); #ifdef UNIV_DEBUG_FILE_ACCESSES ut_a(!block->page.file_page_was_freed); @@ -2306,7 +2284,7 @@ wait_until_unfixed: mtr_memo_push(mtr, block, fix_type); - if (!accessed) { + if (!access_time) { /* In the case of a first access, try to apply linear read-ahead */ @@ -2336,7 +2314,7 @@ buf_page_optimistic_get_func( ulint line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mini-transaction */ { - ibool accessed; + unsigned access_time; ibool success; ulint fix_type; @@ -2353,14 +2331,21 @@ buf_page_optimistic_get_func( } buf_block_buf_fix_inc(block, file, line); - accessed = buf_page_is_accessed(&block->page); - buf_page_set_accessed(&block->page, TRUE); mutex_exit(&block->mutex); - buf_block_make_young(&block->page); + buf_pool_mutex_enter(); /* Check if this is the first access to the page */ + access_time = buf_page_is_accessed(&block->page); + + if (buf_page_peek_if_too_old(&block->page)) { + buf_LRU_make_block_young(&block->page); + } + + buf_page_set_accessed(&block->page); + + buf_pool_mutex_exit(); ut_ad(!ibuf_inside() || ibuf_page(buf_block_get_space(block), @@ -2412,7 +2397,7 @@ buf_page_optimistic_get_func( #ifdef UNIV_DEBUG_FILE_ACCESSES ut_a(block->page.file_page_was_freed == FALSE); #endif - if (UNIV_UNLIKELY(!accessed)) { + if (UNIV_UNLIKELY(!access_time)) { /* In the case of a first access, try to apply linear read-ahead */ @@ -2473,10 +2458,16 @@ buf_page_get_known_nowait( mutex_exit(&block->mutex); - if (mode == BUF_MAKE_YOUNG) { - buf_block_make_young(&block->page); + buf_pool_mutex_enter(); + + if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) { + buf_LRU_make_block_young(&block->page); } + buf_page_set_accessed(&block->page); + + buf_pool_mutex_exit(); + ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); if (rw_latch == RW_S_LATCH) { @@ -2608,10 +2599,10 @@ buf_page_init_low( buf_page_t* bpage) /*!< in: block to init */ { bpage->flush_type = BUF_FLUSH_LRU; - bpage->accessed = FALSE; bpage->io_fix = BUF_IO_NONE; bpage->buf_fix_count = 0; bpage->freed_page_clock = 0; + bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; HASH_INVALIDATE(bpage, hash); @@ -2990,12 +2981,12 @@ buf_page_create( rw_lock_x_unlock(&block->lock); } + buf_page_set_accessed(&block->page); + buf_pool_mutex_exit(); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); - buf_page_set_accessed(&block->page, TRUE); - mutex_exit(&block->mutex); /* Delete possible entries for the page from the insert buffer: @@ -3528,6 +3519,7 @@ buf_print(void) "n pending decompressions %lu\n" "n pending reads %lu\n" "n pending flush LRU %lu list %lu single page %lu\n" + "pages made young %lu, not young %lu\n" "pages read %lu, created %lu, written %lu\n", (ulong) size, (ulong) UT_LIST_GET_LEN(buf_pool->LRU), @@ -3538,6 +3530,8 @@ buf_print(void) (ulong) buf_pool->n_flush[BUF_FLUSH_LRU], (ulong) buf_pool->n_flush[BUF_FLUSH_LIST], (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE], + (ulong) buf_pool->n_pages_made_young, + (ulong) buf_pool->n_pages_not_made_young, (ulong) buf_pool->n_pages_read, buf_pool->n_pages_created, (ulong) buf_pool->n_pages_written); @@ -3744,10 +3738,9 @@ buf_print_io( { time_t current_time; double time_elapsed; - ulint size; + ulint n_gets_diff; ut_ad(buf_pool); - size = buf_pool->curr_size; buf_pool_mutex_enter(); @@ -3755,12 +3748,14 @@ buf_print_io( "Buffer pool size %lu\n" "Free buffers %lu\n" "Database pages %lu\n" + "Old database pages %lu\n" "Modified db pages %lu\n" "Pending reads %lu\n" "Pending writes: LRU %lu, flush list %lu, single page %lu\n", - (ulong) size, + (ulong) buf_pool->curr_size, (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) buf_pool->LRU_old_len, (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), (ulong) buf_pool->n_pend_reads, (ulong) buf_pool->n_flush[BUF_FLUSH_LRU] @@ -3775,8 +3770,18 @@ buf_print_io( buf_pool->last_printout_time = current_time; fprintf(file, + "Pages made young %lu, not young %lu\n" + "%.2f youngs/s, %.2f non-youngs/s\n" "Pages read %lu, created %lu, written %lu\n" "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", + (ulong) buf_pool->n_pages_made_young, + (ulong) buf_pool->n_pages_not_made_young, + (buf_pool->n_pages_made_young + - buf_pool->n_pages_made_young_old) + / time_elapsed, + (buf_pool->n_pages_not_made_young + - buf_pool->n_pages_not_made_young_old) + / time_elapsed, (ulong) buf_pool->n_pages_read, (ulong) buf_pool->n_pages_created, (ulong) buf_pool->n_pages_written, @@ -3787,19 +3792,33 @@ buf_print_io( (buf_pool->n_pages_written - buf_pool->n_pages_written_old) / time_elapsed); - if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) { - fprintf(file, "Buffer pool hit rate %lu / 1000\n", + n_gets_diff = buf_pool->n_page_gets - buf_pool->n_page_gets_old; + + if (n_gets_diff) { + fprintf(file, + "Buffer pool hit rate %lu / 1000," + " young-making rate %lu / 1000 not %lu / 1000\n", (ulong) (1000 - ((1000 * (buf_pool->n_pages_read - buf_pool->n_pages_read_old)) - / (buf_pool->n_page_gets - - buf_pool->n_page_gets_old)))); + / n_gets_diff)), + (ulong) + (1000 * (buf_pool->n_pages_made_young + - buf_pool->n_pages_made_young_old) + / n_gets_diff), + (ulong) + (1000 * (buf_pool->n_pages_not_made_young + - buf_pool->n_pages_not_made_young_old) + / n_gets_diff)); } else { fputs("No buffer pool page gets since the last printout\n", file); } buf_pool->n_page_gets_old = buf_pool->n_page_gets; + buf_pool->n_pages_made_young_old = buf_pool->n_pages_made_young; + buf_pool->n_pages_not_made_young_old + = buf_pool->n_pages_not_made_young; buf_pool->n_pages_read_old = buf_pool->n_pages_read; buf_pool->n_pages_created_old = buf_pool->n_pages_created; buf_pool->n_pages_written_old = buf_pool->n_pages_written; diff --git a/buf/buf0lru.c b/buf/buf0lru.c index be53a5f5d9d..7d09786a7c8 100644 --- a/buf/buf0lru.c +++ b/buf/buf0lru.c @@ -49,14 +49,23 @@ Created 11/5/1995 Heikki Tuuri #include "log0recv.h" #include "srv0srv.h" -/** The number of blocks from the LRU_old pointer onward, including the block -pointed to, must be 3/8 of the whole LRU list length, except that the -tolerance defined below is allowed. Note that the tolerance must be small -enough such that for even the BUF_LRU_OLD_MIN_LEN long LRU list, the -LRU_old pointer is not allowed to point to either end of the LRU list. */ +/** The number of blocks from the LRU_old pointer onward, including +the block pointed to, must be buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV +of the whole LRU list length, except that the tolerance defined below +is allowed. Note that the tolerance must be small enough such that for +even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not +allowed to point to either end of the LRU list. */ #define BUF_LRU_OLD_TOLERANCE 20 +/** The minimum amount of non-old blocks when the LRU_old list exists +(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks). +@see buf_LRU_old_adjust_len */ +#define BUF_LRU_NON_OLD_MIN_LEN 5 +#if BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN +# error "BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN" +#endif + /** The whole LRU list length is divided by this number to determine an initial segment in buf_LRU_get_recent_limit */ @@ -107,6 +116,15 @@ UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_sum; /* @} */ +/** @name Heuristics for detecting index scan @{ */ +/** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for +"old" blocks. Protected by buf_pool_mutex. */ +UNIV_INTERN uint buf_LRU_old_ratio; +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +UNIV_INTERN uint buf_LRU_old_threshold_ms; +/* @} */ + /******************************************************************//** Takes a block out of the LRU list and page hash table. If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), @@ -428,42 +446,6 @@ next_page: } } -/******************************************************************//** -Gets the minimum LRU_position field for the blocks in an initial segment -(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not -guaranteed to be precise, because the ulint_clock may wrap around. -@return the limit; zero if could not determine it */ -UNIV_INTERN -ulint -buf_LRU_get_recent_limit(void) -/*==========================*/ -{ - const buf_page_t* bpage; - ulint len; - ulint limit; - - buf_pool_mutex_enter(); - - len = UT_LIST_GET_LEN(buf_pool->LRU); - - if (len < BUF_LRU_OLD_MIN_LEN) { - /* The LRU list is too short to do read-ahead */ - - buf_pool_mutex_exit(); - - return(0); - } - - bpage = UT_LIST_GET_FIRST(buf_pool->LRU); - - limit = buf_page_get_LRU_position(bpage); - len /= BUF_LRU_INITIAL_RATIO; - - buf_pool_mutex_exit(); - - return(limit > len ? (limit - len) : 0); -} - /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. */ UNIV_INTERN @@ -953,8 +935,10 @@ buf_LRU_old_adjust_len(void) ut_a(buf_pool->LRU_old); ut_ad(buf_pool_mutex_own()); -#if 3 * (BUF_LRU_OLD_MIN_LEN / 8) <= BUF_LRU_OLD_TOLERANCE + 5 -# error "3 * (BUF_LRU_OLD_MIN_LEN / 8) <= BUF_LRU_OLD_TOLERANCE + 5" + ut_ad(buf_LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); + ut_ad(buf_LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); +#if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5) +# error "BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)" #endif #ifdef UNIV_LRU_DEBUG /* buf_pool->LRU_old must be the first item in the LRU list @@ -966,34 +950,39 @@ buf_LRU_old_adjust_len(void) || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); #endif /* UNIV_LRU_DEBUG */ + old_len = buf_pool->LRU_old_len; + new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) + * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool->LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + for (;;) { - old_len = buf_pool->LRU_old_len; - new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); + buf_page_t* LRU_old = buf_pool->LRU_old; - ut_ad(buf_pool->LRU_old->in_LRU_list); - ut_a(buf_pool->LRU_old); + ut_a(LRU_old); + ut_ad(LRU_old->in_LRU_list); #ifdef UNIV_LRU_DEBUG - ut_a(buf_pool->LRU_old->old); + ut_a(LRU_old->old); #endif /* UNIV_LRU_DEBUG */ /* Update the LRU_old pointer if necessary */ - if (old_len < new_len - BUF_LRU_OLD_TOLERANCE) { + if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) { - buf_pool->LRU_old = UT_LIST_GET_PREV( - LRU, buf_pool->LRU_old); + buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV( + LRU, LRU_old); #ifdef UNIV_LRU_DEBUG - ut_a(!buf_pool->LRU_old->old); + ut_a(!LRU_old->old); #endif /* UNIV_LRU_DEBUG */ - buf_page_set_old(buf_pool->LRU_old, TRUE); - buf_pool->LRU_old_len++; + buf_page_set_old(LRU_old, TRUE); + old_len = ++buf_pool->LRU_old_len; } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { - buf_page_set_old(buf_pool->LRU_old, FALSE); - buf_pool->LRU_old = UT_LIST_GET_NEXT( - LRU, buf_pool->LRU_old); - buf_pool->LRU_old_len--; + buf_page_set_old(LRU_old, FALSE); + buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old); + old_len = --buf_pool->LRU_old_len; } else { return; } @@ -1021,6 +1010,7 @@ buf_LRU_old_init(void) while (bpage != NULL) { ut_ad(bpage->in_LRU_list); + ut_ad(buf_page_in_file(bpage)); buf_page_set_old(bpage, TRUE); bpage = UT_LIST_GET_NEXT(LRU, bpage); } @@ -1075,16 +1065,19 @@ buf_LRU_remove_block( if (UNIV_UNLIKELY(bpage == buf_pool->LRU_old)) { - /* Below: the previous block is guaranteed to exist, because - the LRU_old pointer is only allowed to differ by the - tolerance value from strict 3/8 of the LRU list length. */ + /* Below: the previous block is guaranteed to exist, + because the LRU_old pointer is only allowed to differ + by BUF_LRU_OLD_TOLERANCE from strict + buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU + list length. */ + buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage); - buf_pool->LRU_old = UT_LIST_GET_PREV(LRU, bpage); - ut_a(buf_pool->LRU_old); + ut_a(prev_bpage); #ifdef UNIV_LRU_DEBUG - ut_a(!buf_pool->LRU_old->old); + ut_a(!prev_bpage->old); #endif /* UNIV_LRU_DEBUG */ - buf_page_set_old(buf_pool->LRU_old, TRUE); + buf_pool->LRU_old = prev_bpage; + buf_page_set_old(prev_bpage, TRUE); buf_pool->LRU_old_len++; } @@ -1149,39 +1142,25 @@ buf_LRU_add_block_to_end_low( /*=========================*/ buf_page_t* bpage) /*!< in: control block */ { - buf_page_t* last_bpage; - ut_ad(buf_pool); ut_ad(bpage); ut_ad(buf_pool_mutex_own()); ut_a(buf_page_in_file(bpage)); - last_bpage = UT_LIST_GET_LAST(buf_pool->LRU); - - if (last_bpage) { - bpage->LRU_position = last_bpage->LRU_position; - } else { - bpage->LRU_position = buf_pool_clock_tic(); - } - ut_ad(!bpage->in_LRU_list); UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage); ut_d(bpage->in_LRU_list = TRUE); buf_page_set_old(bpage, TRUE); - if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { - - buf_pool->LRU_old_len++; - } - if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { ut_ad(buf_pool->LRU_old); /* Adjust the length of the old block list if necessary */ + buf_pool->LRU_old_len++; buf_LRU_old_adjust_len(); } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { @@ -1189,6 +1168,7 @@ buf_LRU_add_block_to_end_low( /* The LRU list is now long enough for LRU_old to become defined: init it */ + buf_pool->LRU_old_len++; buf_LRU_old_init(); } @@ -1222,7 +1202,6 @@ buf_LRU_add_block_low( UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage); - bpage->LRU_position = buf_pool_clock_tic(); bpage->freed_page_clock = buf_pool->freed_page_clock; } else { #ifdef UNIV_LRU_DEBUG @@ -1237,11 +1216,6 @@ buf_LRU_add_block_low( UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old, bpage); buf_pool->LRU_old_len++; - - /* We copy the LRU position field of the previous block - to the new block */ - - bpage->LRU_position = (buf_pool->LRU_old)->LRU_position; } ut_d(bpage->in_LRU_list = TRUE); @@ -1295,6 +1269,9 @@ buf_LRU_make_block_young( /*=====================*/ buf_page_t* bpage) /*!< in: control block */ { + ut_ad(buf_pool_mutex_own()); + buf_pool->n_pages_made_young++; + buf_LRU_remove_block(bpage); buf_LRU_add_block_low(bpage, FALSE); } @@ -1847,6 +1824,48 @@ buf_LRU_block_free_hashed_page( buf_LRU_block_free_non_file_page(block); } +/**********************************************************************//** +Updates buf_LRU_old_ratio. +@return updated old_pct */ +UNIV_INTERN +uint +buf_LRU_old_ratio_update( +/*=====================*/ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. */ + ibool adjust) /*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_LRU_old_ratio + during the initialization of InnoDB */ +{ + uint ratio; + + ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100; + if (ratio < BUF_LRU_OLD_RATIO_MIN) { + ratio = BUF_LRU_OLD_RATIO_MIN; + } else if (ratio > BUF_LRU_OLD_RATIO_MAX) { + ratio = BUF_LRU_OLD_RATIO_MAX; + } + + if (adjust) { + buf_pool_mutex_enter(); + + if (ratio != buf_LRU_old_ratio) { + buf_LRU_old_ratio = ratio; + + if (UT_LIST_GET_LEN(buf_pool->LRU) + >= BUF_LRU_OLD_MIN_LEN) { + buf_LRU_old_adjust_len(); + } + } + + buf_pool_mutex_exit(); + } else { + buf_LRU_old_ratio = ratio; + } + + return(ratio * 100 / BUF_LRU_OLD_RATIO_DIV); +} + /********************************************************************//** Update the historical stats that we are collecting for LRU eviction policy at the end of each interval. */ @@ -1896,7 +1915,6 @@ buf_LRU_validate(void) buf_block_t* block; ulint old_len; ulint new_len; - ulint LRU_pos; ut_ad(buf_pool); buf_pool_mutex_enter(); @@ -1905,7 +1923,11 @@ buf_LRU_validate(void) ut_a(buf_pool->LRU_old); old_len = buf_pool->LRU_old_len; - new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); + new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) + * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool->LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); } @@ -1943,16 +1965,7 @@ buf_LRU_validate(void) ut_a(buf_pool->LRU_old == bpage); } - LRU_pos = buf_page_get_LRU_position(bpage); - bpage = UT_LIST_GET_NEXT(LRU, bpage); - - if (bpage) { - /* If the following assert fails, it may - not be an error: just the buf_pool clock - has wrapped around */ - ut_a(LRU_pos >= buf_page_get_LRU_position(bpage)); - } } if (buf_pool->LRU_old) { @@ -2000,9 +2013,6 @@ buf_LRU_print(void) ut_ad(buf_pool); buf_pool_mutex_enter(); - fprintf(stderr, "Pool ulint clock %lu\n", - (ulong) buf_pool->ulint_clock); - bpage = UT_LIST_GET_FIRST(buf_pool->LRU); while (bpage != NULL) { @@ -2033,18 +2043,16 @@ buf_LRU_print(void) const byte* frame; case BUF_BLOCK_FILE_PAGE: frame = buf_block_get_frame((buf_block_t*) bpage); - fprintf(stderr, "\nLRU pos %lu type %lu" + fprintf(stderr, "\ntype %lu" " index id %lu\n", - (ulong) buf_page_get_LRU_position(bpage), (ulong) fil_page_get_type(frame), (ulong) ut_dulint_get_low( btr_page_get_index_id(frame))); break; case BUF_BLOCK_ZIP_PAGE: frame = bpage->zip.data; - fprintf(stderr, "\nLRU pos %lu type %lu size %lu" + fprintf(stderr, "\ntype %lu size %lu" " index id %lu\n", - (ulong) buf_page_get_LRU_position(bpage), (ulong) fil_page_get_type(frame), (ulong) buf_page_get_zip_size(bpage), (ulong) ut_dulint_get_low( @@ -2052,8 +2060,7 @@ buf_LRU_print(void) break; default: - fprintf(stderr, "\nLRU pos %lu !state %lu!\n", - (ulong) buf_page_get_LRU_position(bpage), + fprintf(stderr, "\n!state %lu!\n", (ulong) buf_page_get_state(bpage)); break; } diff --git a/buf/buf0rea.c b/buf/buf0rea.c index 4910f82ccde..eb09b469748 100644 --- a/buf/buf0rea.c +++ b/buf/buf0rea.c @@ -164,165 +164,6 @@ buf_read_page_low( return(1); } -/********************************************************************//** -Applies a random read-ahead in buf_pool if there are at least a threshold -value of accessed pages from the random read-ahead area. Does not read any -page, not even the one at the position (space, offset), if the read-ahead -mechanism is not activated. NOTE 1: the calling thread may own latches on -pages: to avoid deadlocks this function must be written such that it cannot -end up waiting for these latches! NOTE 2: the calling thread must want -access to the page given: this rule is set to prevent unintended read-aheads -performed by ibuf routines, a situation which could result in a deadlock if -the OS does not support asynchronous i/o. -@return number of page read requests issued; NOTE that if we read ibuf -pages, it may happen that the page at the given page number does not -get read even if we return a positive value! */ -static -ulint -buf_read_ahead_random( -/*==================*/ - ulint space, /*!< in: space id */ - ulint zip_size,/*!< in: compressed page size in bytes, or 0 */ - ulint offset) /*!< in: page number of a page which the current thread - wants to access */ -{ - ib_int64_t tablespace_version; - ulint recent_blocks = 0; - ulint count; - ulint LRU_recent_limit; - ulint ibuf_mode; - ulint low, high; - ulint err; - ulint i; - ulint buf_read_ahead_random_area; - - /* We have currently disabled random readahead */ - return(0); - - if (srv_startup_is_before_trx_rollback_phase) { - /* No read-ahead to avoid thread deadlocks */ - return(0); - } - - if (ibuf_bitmap_page(zip_size, offset) - || trx_sys_hdr_page(space, offset)) { - - /* If it is an ibuf bitmap page or trx sys hdr, we do - no read-ahead, as that could break the ibuf page access - order */ - - return(0); - } - - /* Remember the tablespace version before we ask te tablespace size - below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we - do not try to read outside the bounds of the tablespace! */ - - tablespace_version = fil_space_get_version(space); - - buf_read_ahead_random_area = BUF_READ_AHEAD_RANDOM_AREA; - - low = (offset / buf_read_ahead_random_area) - * buf_read_ahead_random_area; - high = (offset / buf_read_ahead_random_area + 1) - * buf_read_ahead_random_area; - if (high > fil_space_get_size(space)) { - - high = fil_space_get_size(space); - } - - /* Get the minimum LRU_position field value for an initial segment - of the LRU list, to determine which blocks have recently been added - to the start of the list. */ - - LRU_recent_limit = buf_LRU_get_recent_limit(); - - buf_pool_mutex_enter(); - - if (buf_pool->n_pend_reads - > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - buf_pool_mutex_exit(); - - return(0); - } - - /* Count how many blocks in the area have been recently accessed, - that is, reside near the start of the LRU list. */ - - for (i = low; i < high; i++) { - const buf_page_t* bpage = buf_page_hash_get(space, i); - - if (bpage - && buf_page_is_accessed(bpage) - && (buf_page_get_LRU_position(bpage) > LRU_recent_limit)) { - - recent_blocks++; - - if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) { - - buf_pool_mutex_exit(); - goto read_ahead; - } - } - } - - buf_pool_mutex_exit(); - /* Do nothing */ - return(0); - -read_ahead: - /* Read all the suitable blocks within the area */ - - if (ibuf_inside()) { - ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; - } else { - ibuf_mode = BUF_READ_ANY_PAGE; - } - - count = 0; - - for (i = low; i < high; i++) { - /* It is only sensible to do read-ahead in the non-sync aio - mode: hence FALSE as the first parameter */ - - if (!ibuf_bitmap_page(zip_size, i)) { - count += buf_read_page_low( - &err, FALSE, - ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, - space, zip_size, FALSE, - tablespace_version, i); - if (err == DB_TABLESPACE_DELETED) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: Warning: in random" - " readahead trying to access\n" - "InnoDB: tablespace %lu page %lu,\n" - "InnoDB: but the tablespace does not" - " exist or is just being dropped.\n", - (ulong) space, (ulong) i); - } - } - } - - /* In simulated aio we wake the aio handler threads only after - queuing all aio requests, in native aio the following call does - nothing: */ - - os_aio_simulated_wake_handler_threads(); - -#ifdef UNIV_DEBUG - if (buf_debug_prints && (count > 0)) { - fprintf(stderr, - "Random read-ahead space %lu offset %lu pages %lu\n", - (ulong) space, (ulong) offset, - (ulong) count); - } -#endif /* UNIV_DEBUG */ - - ++srv_read_ahead_rnd; - return(count); -} - /********************************************************************//** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets @@ -341,20 +182,17 @@ buf_read_page( { ib_int64_t tablespace_version; ulint count; - ulint count2; ulint err; tablespace_version = fil_space_get_version(space); - count = buf_read_ahead_random(space, zip_size, offset); - /* We do the i/o in the synchronous aio mode to save thread switches: hence TRUE */ - count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, - zip_size, FALSE, - tablespace_version, offset); - srv_buf_pool_reads+= count2; + count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, + zip_size, FALSE, + tablespace_version, offset); + srv_buf_pool_reads += count; if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, @@ -371,7 +209,7 @@ buf_read_page( /* Increment number of I/O operations used for LRU policy. */ buf_LRU_stat_inc_io(); - return(count + count2); + return(count); } /********************************************************************//** @@ -498,9 +336,17 @@ buf_read_ahead_linear( fail_count++; } else if (pred_bpage) { - int res = (ut_ulint_cmp( - buf_page_get_LRU_position(bpage), - buf_page_get_LRU_position(pred_bpage))); + /* Note that buf_page_is_accessed() returns + the time of the first access. If some blocks + of the extent existed in the buffer pool at + the time of a linear access pattern, the first + access times may be nonmonotonic, even though + the latest access times were linear. The + threshold (srv_read_ahead_factor) should help + a little against this. */ + int res = ut_ulint_cmp( + buf_page_is_accessed(bpage), + buf_page_is_accessed(pred_bpage)); /* Accesses not in the right order */ if (res != 0 && res != asc_or_desc) { fail_count++; diff --git a/handler/ha_innodb.cc b/handler/ha_innodb.cc index b7b26b5914a..55cecb18688 100644 --- a/handler/ha_innodb.cc +++ b/handler/ha_innodb.cc @@ -72,6 +72,7 @@ with this program; if not, write to the Free Software Foundation, Inc., /* Include necessary InnoDB headers */ extern "C" { #include "univ.i" +#include "buf0lru.h" #include "btr0sea.h" #include "os0file.h" #include "os0thread.h" @@ -157,6 +158,10 @@ static ulong innobase_write_io_threads; static long long innobase_buffer_pool_size, innobase_log_file_size; +/** Percentage of the buffer pool to reserve for 'old' blocks. +Connected to buf_LRU_old_ratio. */ +static uint innobase_old_blocks_pct; + /* The default values for the following char* start-up parameters are determined in innobase_init below: */ @@ -2208,6 +2213,9 @@ innobase_change_buffering_inited_ok: ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci")); srv_latin1_ordering = my_charset_latin1.sort_order; + innobase_old_blocks_pct = buf_LRU_old_ratio_update( + innobase_old_blocks_pct, FALSE); + innobase_commit_concurrency_init_default(); /* Since we in this module access directly the fields of a trx @@ -9614,6 +9622,25 @@ innodb_adaptive_hash_index_update( } } +/****************************************************************//** +Update the system variable innodb_old_blocks_pct using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_old_blocks_pct_update( +/*=========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innobase_old_blocks_pct = buf_LRU_old_ratio_update( + *static_cast(save), TRUE); +} + /*************************************************************//** Check if it is a valid value of innodb_change_buffering. This function is registered as a callback with MySQL. @@ -9890,6 +9917,18 @@ static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups, "Number of identical copies of log groups we keep for the database. Currently this should be set to 1.", NULL, NULL, 1, 1, 10, 0); +static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of the buffer pool to reserve for 'old' blocks.", + NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0); + +static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms, + PLUGIN_VAR_RQCMDARG, + "Move blocks to the 'new' end of the buffer pool if the first access" + " was at least this many milliseconds ago." + " The timeout is disabled if 0 (the default).", + NULL, NULL, 0, 0, UINT_MAX32, 0); + static MYSQL_SYSVAR_LONG(open_files, innobase_open_files, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "How many files at the maximum InnoDB keeps open at the same time.", @@ -9988,6 +10027,8 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(adaptive_flushing), MYSQL_SYSVAR(max_purge_lag), MYSQL_SYSVAR(mirrored_log_groups), + MYSQL_SYSVAR(old_blocks_pct), + MYSQL_SYSVAR(old_blocks_time), MYSQL_SYSVAR(open_files), MYSQL_SYSVAR(rollback_on_timeout), MYSQL_SYSVAR(stats_on_metadata), diff --git a/include/buf0buf.h b/include/buf0buf.h index a24fc4cdbbb..a1ab262b5d6 100644 --- a/include/buf0buf.h +++ b/include/buf0buf.h @@ -707,15 +707,6 @@ buf_page_belongs_to_unzip_LRU( /*==========================*/ const buf_page_t* bpage) /*!< in: pointer to control block */ __attribute__((pure)); -/*********************************************************************//** -Determine the approximate LRU list position of a block. -@return LRU list position */ -UNIV_INLINE -ulint -buf_page_get_LRU_position( -/*======================*/ - const buf_page_t* bpage) /*!< in: control block */ - __attribute__((pure)); /*********************************************************************//** Gets the mutex of a block. @@ -816,22 +807,22 @@ buf_page_set_old( buf_page_t* bpage, /*!< in/out: control block */ ibool old); /*!< in: old */ /*********************************************************************//** -Determine if a block has been accessed in the buffer pool. -@return TRUE if accessed */ +Determine the time of last access a block in the buffer pool. +@return ut_time_ms() at the time of last access, 0 if not accessed */ UNIV_INLINE -ibool +unsigned buf_page_is_accessed( /*=================*/ const buf_page_t* bpage) /*!< in: control block */ - __attribute__((pure)); + __attribute__((nonnull, pure)); /*********************************************************************//** Flag a block accessed. */ UNIV_INLINE void buf_page_set_accessed( /*==================*/ - buf_page_t* bpage, /*!< in/out: control block */ - ibool accessed); /*!< in: accessed */ + buf_page_t* bpage) /*!< in/out: control block */ + __attribute__((nonnull)); /*********************************************************************//** Gets the buf_block_t handle of a buffered file block if an uncompressed page frame exists, or NULL. @@ -1017,14 +1008,6 @@ buf_block_hash_get( /*===============*/ ulint space, /*!< in: space id */ ulint offset);/*!< in: offset of the page within space */ -/*******************************************************************//** -Increments the pool clock by one and returns its new value. Remember that -in the 32 bit version the clock wraps around at 4 billion! -@return new clock value */ -UNIV_INLINE -ulint -buf_pool_clock_tic(void); -/*====================*/ /*********************************************************************//** Gets the current length of the free list of buffer blocks. @return length of the free list */ @@ -1064,16 +1047,10 @@ struct buf_page_struct{ flushed to disk, this tells the flush_type. @see enum buf_flush */ - unsigned accessed:1; /*!< TRUE if the page has been accessed - while in the buffer pool: read-ahead - may read in pages which have not been - accessed yet; a thread is allowed to - read this for heuristic purposes - without holding any mutex or latch */ unsigned io_fix:2; /*!< type of pending I/O operation; also protected by buf_pool_mutex @see enum buf_io_fix */ - unsigned buf_fix_count:24;/*!< count of how manyfold this block + unsigned buf_fix_count:25;/*!< count of how manyfold this block is currently bufferfixed */ /* @} */ #endif /* !UNIV_HOTBACKUP */ @@ -1152,17 +1129,7 @@ struct buf_page_struct{ #endif /* UNIV_DEBUG */ unsigned old:1; /*!< TRUE if the block is in the old blocks in the LRU list */ - unsigned LRU_position:31;/*!< value which monotonically - decreases (or may stay - constant if old==TRUE) toward - the end of the LRU list, if - buf_pool->ulint_clock has not - wrapped around: NOTE that this - value can only be used in - heuristic algorithms, because - of the possibility of a - wrap-around! */ - unsigned freed_page_clock:32;/*!< the value of + unsigned freed_page_clock:31;/*!< the value of buf_pool->freed_page_clock when this block was the last time put to the head of the @@ -1170,6 +1137,9 @@ struct buf_page_struct{ to read this for heuristic purposes without holding any mutex or latch */ + unsigned access_time:32; /*!< time of first access, or + 0 if the block was never accessed + in the buffer pool */ /* @} */ # ifdef UNIV_DEBUG_FILE_ACCESSES ibool file_page_was_freed; @@ -1338,8 +1308,17 @@ struct buf_pool_struct{ ulint n_pend_reads; /*!< number of pending read operations */ ulint n_pend_unzip; /*!< number of pending decompressions */ - time_t last_printout_time; /*!< when buf_print was last time + time_t last_printout_time; + /*!< when buf_print_io was last time called */ + ulint n_pages_made_young; + /*!< number of pages made young, in + calls to buf_LRU_make_block_young() */ + ulint n_pages_not_made_young; + /*!< number of pages not made + young because the first access + was not long enough ago, in + buf_page_peek_if_too_old() */ ulint n_pages_read; /*!< number read operations */ ulint n_pages_written;/*!< number write operations */ ulint n_pages_created;/*!< number of pages created @@ -1350,14 +1329,24 @@ struct buf_pool_struct{ counted as page gets; this field is NOT protected by the buffer pool mutex */ - ulint n_page_gets_old;/*!< n_page_gets when buf_print was - last time called: used to calculate + ulint n_page_gets_old;/*!< n_page_gets when buf_print_io was + called last time: used to calculate hit rate */ - ulint n_pages_read_old;/*!< n_pages_read when buf_print was - last time called */ - ulint n_pages_written_old;/*!< number write operations */ - ulint n_pages_created_old;/*!< number of pages created in - the pool with no read */ + ulint n_pages_made_young_old; + /*!< n_pages_made_young when + buf_print_io was called last time */ + ulint n_pages_not_made_young_old; + /*!< n_pages_not_made_young when + buf_print_io was called last time */ + ulint n_pages_read_old; + /*!< n_pages_read when buf_print_io + was called last time */ + ulint n_pages_written_old; + /*!< n_pages_written when buf_print_io + was called last time */ + ulint n_pages_created_old; + /*!< n_pages_created when buf_print_io + was called last time */ /* @} */ /** @name Page flushing algorithm fields */ /* @{ */ @@ -1375,10 +1364,6 @@ struct buf_pool_struct{ /*!< this is in the set state when there is no flush batch of the given type running */ - ulint ulint_clock; /*!< a sequence number used to count - time. NOTE! This counter wraps - around at 4 billion (if ulint == - 32 bits)! */ ulint freed_page_clock;/*!< a sequence number used to count the number of buffer blocks removed from the end of @@ -1402,9 +1387,11 @@ struct buf_pool_struct{ block list */ UT_LIST_BASE_NODE_T(buf_page_t) LRU; /*!< base node of the LRU list */ - buf_page_t* LRU_old; /*!< pointer to the about 3/8 oldest - blocks in the LRU list; NULL if LRU - length less than BUF_LRU_OLD_MIN_LEN; + buf_page_t* LRU_old; /*!< pointer to the about + buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV + oldest blocks in the LRU list; + NULL if LRU length less than + BUF_LRU_OLD_MIN_LEN; NOTE: when LRU_old != NULL, its length should always equal LRU_old_len */ ulint LRU_old_len; /*!< length of the LRU list from diff --git a/include/buf0buf.ic b/include/buf0buf.ic index 17064342116..af31006f422 100644 --- a/include/buf0buf.ic +++ b/include/buf0buf.ic @@ -72,9 +72,24 @@ buf_page_peek_if_too_old( /*=====================*/ const buf_page_t* bpage) /*!< in: block to make younger */ { - return(buf_pool->freed_page_clock - >= buf_page_get_freed_page_clock(bpage) - + 1 + (buf_pool->curr_size / 4)); + if (buf_LRU_old_threshold_ms && bpage->old) { + unsigned access_time = buf_page_is_accessed(bpage); + + if (access_time && ut_time_ms() - access_time + >= buf_LRU_old_threshold_ms) { + return(TRUE); + } + + buf_pool->n_pages_not_made_young++; + return(FALSE); + } else { + /* FIXME: bpage->freed_page_clock is 31 bits */ + return((buf_pool->freed_page_clock & ~(1 << 31)) + > bpage->freed_page_clock + + (buf_pool->curr_size + * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio) + / (BUF_LRU_OLD_RATIO_DIV * 4))); + } } /*********************************************************************//** @@ -118,22 +133,6 @@ buf_pool_get_oldest_modification(void) return(lsn); } - -/*******************************************************************//** -Increments the buf_pool clock by one and returns its new value. Remember -that in the 32 bit version the clock wraps around at 4 billion! -@return new clock value */ -UNIV_INLINE -ulint -buf_pool_clock_tic(void) -/*====================*/ -{ - ut_ad(buf_pool_mutex_own()); - - buf_pool->ulint_clock++; - - return(buf_pool->ulint_clock); -} #endif /* !UNIV_HOTBACKUP */ /*********************************************************************//** @@ -279,21 +278,6 @@ buf_page_belongs_to_unzip_LRU( && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); } -/*********************************************************************//** -Determine the approximate LRU list position of a block. -@return LRU list position */ -UNIV_INLINE -ulint -buf_page_get_LRU_position( -/*======================*/ - const buf_page_t* bpage) /*!< in: control block */ -{ - ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); - - return(bpage->LRU_position); -} - /*********************************************************************//** Gets the mutex of a block. @return pointer to mutex protecting bpage */ @@ -487,17 +471,17 @@ buf_page_set_old( } /*********************************************************************//** -Determine if a block has been accessed in the buffer pool. -@return TRUE if accessed */ +Determine the time of last access a block in the buffer pool. +@return ut_time_ms() at the time of last access, 0 if not accessed */ UNIV_INLINE -ibool +unsigned buf_page_is_accessed( /*=================*/ const buf_page_t* bpage) /*!< in: control block */ { ut_ad(buf_page_in_file(bpage)); - return(bpage->accessed); + return(bpage->access_time); } /*********************************************************************//** @@ -506,13 +490,15 @@ UNIV_INLINE void buf_page_set_accessed( /*==================*/ - buf_page_t* bpage, /*!< in/out: control block */ - ibool accessed) /*!< in: accessed */ + buf_page_t* bpage) /*!< in/out: control block */ { ut_a(buf_page_in_file(bpage)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_pool_mutex_own()); - bpage->accessed = accessed; + if (!bpage->access_time) { + /* Make this the time of the first access. */ + bpage->access_time = ut_time_ms(); + } } /*********************************************************************//** diff --git a/include/buf0lru.h b/include/buf0lru.h index 463aca0982c..009430af35b 100644 --- a/include/buf0lru.h +++ b/include/buf0lru.h @@ -69,7 +69,7 @@ These are low-level functions #########################################################################*/ /** Minimum LRU list length for which the LRU_old pointer is defined */ -#define BUF_LRU_OLD_MIN_LEN 80 +#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */ /** Maximum LRU list search length in buf_flush_LRU_recommendation() */ #define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA) @@ -84,15 +84,6 @@ void buf_LRU_invalidate_tablespace( /*==========================*/ ulint id); /*!< in: space id */ -/******************************************************************//** -Gets the minimum LRU_position field for the blocks in an initial segment -(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not -guaranteed to be precise, because the ulint_clock may wrap around. -@return the limit; zero if could not determine it */ -UNIV_INTERN -ulint -buf_LRU_get_recent_limit(void); -/*==========================*/ /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. */ UNIV_INTERN @@ -201,6 +192,18 @@ void buf_LRU_make_block_old( /*===================*/ buf_page_t* bpage); /*!< in: control block */ +/**********************************************************************//** +Updates buf_LRU_old_ratio. +@return updated old_pct */ +UNIV_INTERN +uint +buf_LRU_old_ratio_update( +/*=====================*/ + uint old_pct,/*!< in: Reserve this percentage of + the buffer pool for "old" blocks. */ + ibool adjust);/*!< in: TRUE=adjust the LRU list; + FALSE=just assign buf_LRU_old_ratio + during the initialization of InnoDB */ /********************************************************************//** Update the historical stats that we are collecting for LRU eviction policy at the end of each interval. */ @@ -227,6 +230,35 @@ buf_LRU_print(void); /*===============*/ #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +/** @name Heuristics for detecting index scan @{ */ +/** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for +"old" blocks. Protected by buf_pool_mutex. */ +extern uint buf_LRU_old_ratio; +/** The denominator of buf_LRU_old_ratio. */ +#define BUF_LRU_OLD_RATIO_DIV 1024 +/** Maximum value of buf_LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_LRU_old_ratio_update */ +#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV +/** Minimum value of buf_LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_LRU_old_ratio_update +The minimum must exceed +(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */ +#define BUF_LRU_OLD_RATIO_MIN 51 + +#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX +# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX" +#endif +#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV +# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV" +#endif + +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +extern uint buf_LRU_old_threshold_ms; +/* @} */ + /** @brief Statistics for selecting the LRU list for eviction. These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O diff --git a/include/ut0ut.h b/include/ut0ut.h index 80094321041..12c1cd59166 100644 --- a/include/ut0ut.h +++ b/include/ut0ut.h @@ -239,6 +239,15 @@ ullint ut_time_us( /*=======*/ ullint* tloc); /*!< out: us since epoch, if non-NULL */ +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +UNIV_INTERN +uint +ut_time_ms(void); +/*============*/ /**********************************************************//** Returns the difference of two times in seconds. diff --git a/ut/ut0ut.c b/ut/ut0ut.c index e4cc226fbad..a01a0470938 100644 --- a/ut/ut0ut.c +++ b/ut/ut0ut.c @@ -199,6 +199,23 @@ ut_time_us( return(us); } +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +UNIV_INTERN +uint +ut_time_ms(void) +/*============*/ +{ + struct timeval tv; + + ut_gettimeofday(&tv, NULL); + + return((uint) tv.tv_sec * 1000 + tv.tv_usec / 1000); +} + /**********************************************************//** Returns the difference of two times in seconds. @return time2 - time1 expressed in seconds */