From 6100f59ffaed0a4d4c224aa771999983f8acd496 Mon Sep 17 00:00:00 2001 From: sensssz Date: Tue, 11 Oct 2016 20:52:35 -0400 Subject: [PATCH 1/7] Implement VATS both in InnoDB and XtraDB. Add configuration options for it in both of them. --- storage/innobase/handler/ha_innodb.cc | 29 +++++ storage/innobase/include/lock0lock.h | 9 ++ storage/innobase/lock/lock0lock.cc | 158 +++++++++++++++++++++++--- storage/xtradb/handler/ha_innodb.cc | 30 +++++ storage/xtradb/include/lock0lock.h | 9 ++ storage/xtradb/lock/lock0lock.cc | 155 ++++++++++++++++++++++--- 6 files changed, 362 insertions(+), 28 deletions(-) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 7ba54a1c360..a436d079d03 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -300,6 +300,22 @@ static TYPELIB innodb_checksum_algorithm_typelib = { NULL }; +/** Possible values of the parameter innodb_lock_schedule_algorithm */ +static const char* innodb_lock_schedule_algorithm_names[] = { + "fcfs", + "vats", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_lock_schedule_algorithm. */ +static TYPELIB innodb_lock_schedule_algorithm_typelib = { + array_elements(innodb_lock_schedule_algorithm_names) - 1, + "innodb_lock_schedule_algorithm_typelib", + innodb_lock_schedule_algorithm_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in case of normal DML ops it is not sensible to call srv_active_wake_master_thread after each @@ -19013,6 +19029,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size, NULL, NULL, 120, 1, 127, 0); #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ +static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm, + PLUGIN_VAR_RQCMDARG, + "The algorithm Innodb uses for deciding which locks to grant next when" + " a lock is released. Possible values are" + " FCFS" + " grant the locks in First-Come-First-Served order;" + " VATS" + " use the Variance-Aware-Transaction-Scheduling algorithm, which" + " uses an Eldest-Transaction-First heuristic.", + NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + &innodb_lock_schedule_algorithm_typelib); + static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", @@ -19828,6 +19856,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(ft_sort_pll_degree), MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(force_load_corrupted), + MYSQL_SYSVAR(lock_schedule_algorithm), MYSQL_SYSVAR(locks_unsafe_for_binlog), MYSQL_SYSVAR(lock_wait_timeout), #ifdef UNIV_LOG_ARCHIVE diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index a6fafd95754..2b1158eddf8 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -43,6 +43,15 @@ Created 5/7/1996 Heikki Tuuri extern ibool lock_print_waits; #endif /* UNIV_DEBUG */ +/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by + setting innodb_lock_schedule_algorithm. 
*/ +enum innodb_lock_schedule_algorithm_t { + INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, /*!< First Come First Served */ + INNODB_LOCK_SCHEDULE_ALGORITHM_VATS /*!< Variance-Aware-Transaction-Scheduling */ +}; + +extern ulong innodb_lock_schedule_algorithm; + /*********************************************************************//** Gets the size of a lock struct. @return size in bytes */ diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index b4834c05b81..37f15c0d3e8 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -76,6 +76,9 @@ bitmap */ #define LOCK_PAGE_BITMAP_MARGIN 64 +/** Lock scheduling algorithm */ +ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS; + /* An explicit record lock affects both the record and the gap before it. An implicit x-lock does not affect the gap, it only locks the index record from read or update. @@ -1982,6 +1985,72 @@ wsrep_print_wait_locks( } #endif /* WITH_WSREP */ +/*********************************************************************//** +Check if lock1 has higher priority than lock2. +NULL has lowest priority. +If neither of them is wait lock, the first one has higher priority. +If only one of them is a wait lock, it has lower priority. +Otherwise, the one with an older transaction has higher priority. +@returns true if lock1 has higher priority, false otherwise. */ +bool +has_higher_priority( + lock_t *lock1, + lock_t *lock2) +{ + if (lock1 == NULL) { + return false; + } else if (lock2 == NULL) { + return true; + } + if (!lock_get_wait(lock1)) { + return true; + } else if (!lock_get_wait(lock2)) { + return false; + } + return lock1->trx->start_time < lock2->trx->start_time; +} + +/*********************************************************************//** +Insert a lock to the hash list according to the mode (whether it is a wait lock) +and the age of the transaction the it is associated with. +If the lock is not a wait lock, insert it to the head of the hash list. +Otherwise, insert it to the middle of the wait locks according to the age of the +transaciton. +*/ +static +void +lock_rec_insert_by_trx_age( + lock_t *in_lock, /*!< in: lock to be insert */ + bool wait) /*!< in: whether it's a wait lock */ +{ + ulint space; + ulint page_no; + ulint rec_fold; + hash_cell_t cell; + lock_t* node; + lock_t* next; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); + cell = hash_get_nth_cell(lock_sys->rec_hash, + hash_calc_hash(rec_fold, lock_sys->rec_hash)); + + node = (lock_t *) cell->node; + // If in_lock is not a wait lock, we insert it to the head of the list. + if (node == NULL || !wait || has_higher_priority(in_lock, node)) { + cell->node = in_lock; + in_lock->hash = node; + return; + } + while (node != NULL && has_higher_priority((lock_t *) node->hash, in_lock)) { + node = (lock_t *) node->hash; + } + next = (lock_t *) node->hash; + node->hash = in_lock; + in_lock->hash = next; +} + /*********************************************************************//** Creates a new record lock and inserts it to the lock queue. Does NOT check for deadlocks or lock compatibility! 
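
The ordering policy added in the hunk above is easier to see outside the hash-chain plumbing. The sketch below models the same rules on a plain singly linked list: granted (non-waiting) locks stay at the head, and waiting locks are kept sorted by transaction start time, oldest first. Lock, higher_priority and insert_by_trx_age are illustrative stand-ins, not the actual InnoDB types or functions.

#include <cstddef>
#include <cstdint>

/* Simplified stand-in for lock_t; illustrative only. */
struct Lock {
	bool		waiting;	/* plays the role of lock_get_wait(lock) */
	uint64_t	trx_start;	/* plays the role of lock->trx->start_time */
	Lock*		next;		/* plays the role of lock->hash */
};

/* Same rules as has_higher_priority(): a granted lock outranks a waiting
one, and among waiting locks the older transaction (smaller start time)
wins. */
static bool higher_priority(const Lock* a, const Lock* b)
{
	if (a == NULL) return false;
	if (b == NULL) return true;
	if (!a->waiting) return true;
	if (!b->waiting) return false;
	return a->trx_start < b->trx_start;
}

/* Same walk as lock_rec_insert_by_trx_age(): a granted lock goes to the
head of the list, a waiting lock is inserted behind every lock that still
outranks it. */
static void insert_by_trx_age(Lock** head, Lock* in_lock)
{
	if (*head == NULL || !in_lock->waiting
	    || higher_priority(in_lock, *head)) {
		in_lock->next = *head;
		*head = in_lock;
		return;
	}
	Lock* node = *head;
	while (node->next != NULL && higher_priority(node->next, in_lock)) {
		node = node->next;
	}
	in_lock->next = node->next;
	node->next = in_lock;
}

Under this ordering a released lock is always offered to the oldest eligible waiter first, which is the Eldest-Transaction-First heuristic described in the sysvar help text.
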
@@ -2144,13 +2213,19 @@ lock_rec_create( return(lock); } trx_mutex_exit(c_lock->trx); + } else if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { + lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); } else { HASH_INSERT(lock_t, hash, lock_sys->rec_hash, - lock_rec_fold(space, page_no), lock); + lock_rec_fold(space, page_no), lock); } #else - HASH_INSERT(lock_t, hash, lock_sys->rec_hash, - lock_rec_fold(space, page_no), lock); + if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { + lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); + } else { + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); + } #endif /* WITH_WSREP */ if (!caller_owns_trx_mutex) { @@ -2822,6 +2897,27 @@ lock_rec_cancel( trx_mutex_exit(lock->trx); } +/*************************************************************//** +Move the lock to the head of the hash list. */ +static +void +lock_rec_move_to_front( + lock_t *lock_to_move, /*!< in: lock to be moved */ + ulint rec_fold) /*!< in: rec fold of the lock */ +{ + if (lock_to_move != NULL) + { + // Move the target lock to the head of the list + hash_cell_t* cell = hash_get_nth_cell(lock_sys->rec_hash, + hash_calc_hash(rec_fold, lock_sys->rec_hash)); + if (lock_to_move != cell->node) { + lock_t *next = (lock_t *) cell->node; + cell->node = lock_to_move; + lock_to_move->hash = next; + } + } +} + /*************************************************************//** Removes a record lock request, waiting or granted, from the queue and grants locks to other transactions in the queue if they now are entitled @@ -2839,7 +2935,9 @@ lock_rec_dequeue_from_page( { ulint space; ulint page_no; + ulint rec_fold lock_t* lock; + lock_t* previous = NULL; trx_lock_t* trx_lock; ut_ad(lock_mutex_own()); @@ -2850,6 +2948,7 @@ lock_rec_dequeue_from_page( space = in_lock->un_member.rec_lock.space; page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); in_lock->index->table->n_rec_locks--; @@ -2861,20 +2960,51 @@ lock_rec_dequeue_from_page( MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_DEC(MONITOR_NUM_RECLOCK); - /* Check if waiting locks in the queue can now be granted: grant - locks if there are no conflicting locks ahead. Stop at the first - X lock that is waiting or has been granted. */ + if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) { + /* Check if waiting locks in the queue can now be granted: grant + locks if there are no conflicting locks ahead. Stop at the first + X lock that is waiting or has been granted. */ - for (lock = lock_rec_get_first_on_page_addr(space, page_no); - lock != NULL; - lock = lock_rec_get_next_on_page(lock)) { + for (lock = lock_rec_get_first_on_page_addr(space, page_no); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { - if (lock_get_wait(lock) - && !lock_rec_has_to_wait_in_queue(lock)) { + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { - /* Grant the lock */ - ut_ad(lock->trx != in_lock->trx); - lock_grant(lock); + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock); + } + } + } else { + /* Grant locks if there are no conflicting locks ahead. + Move granted locks to the head of the list. */ + for (lock = lock_rec_get_first_on_page_addr(space, page_no); + lock != NULL;) { + + /* If the lock is a wait lock on this page, and it does not need to wait. 
*/ + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no) + && lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + lock_grant(lock); + + if (previous != NULL) { + /* Move the lock to the head of the list. */ + HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock); + lock_rec_move_to_front(lock, rec_fold); + } else { + /* Already at the head of the list. */ + previous = lock; + } + /* Move on to the next lock. */ + lock = static_cast(HASH_GET_NEXT(hash, previous)); + } else { + previous = lock; + lock = static_cast(HASH_GET_NEXT(hash, lock)); + } } } } diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 320b900d019..7ce65e5bc6f 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -348,6 +348,23 @@ static TYPELIB innodb_empty_free_list_algorithm_typelib = { NULL }; +/** Possible values of the parameter innodb_lock_schedule_algorithm */ +static const char* innodb_lock_schedule_algorithm_names[] = { + "fcfs", + "vats", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_lock_schedule_algorithm. */ +static TYPELIB innodb_lock_schedule_algorithm_typelib = { + array_elements(innodb_lock_schedule_algorithm_names) - 1, + "innodb_lock_schedule_algorithm_typelib", + innodb_lock_schedule_algorithm_names, + NULL +}; + + /* The following counter is used to convey information to InnoDB about server activity: in case of normal DML ops it is not sensible to call srv_active_wake_master_thread after each @@ -20473,6 +20490,18 @@ static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm, innodb_srv_empty_free_list_algorithm_validate, NULL, SRV_EMPTY_FREE_LIST_BACKOFF, &innodb_empty_free_list_algorithm_typelib); +static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm, + PLUGIN_VAR_RQCMDARG, + "The algorithm Innodb uses for deciding which locks to grant next when" + " a lock is released. Possible values are" + " FCFS" + " grant the locks in First-Come-First-Served order;" + " VATS" + " use the Variance-Aware-Transaction-Scheduling algorithm, which" + " uses an Eldest-Transaction-First heuristic.", + NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + &innodb_lock_schedule_algorithm_typelib); + static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", @@ -21366,6 +21395,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(ft_sort_pll_degree), MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(force_load_corrupted), + MYSQL_SYSVAR(lock_schedule_algorithm), MYSQL_SYSVAR(locks_unsafe_for_binlog), MYSQL_SYSVAR(lock_wait_timeout), #ifdef UNIV_LOG_ARCHIVE diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h index b6100d470cc..742da74f1b6 100644 --- a/storage/xtradb/include/lock0lock.h +++ b/storage/xtradb/include/lock0lock.h @@ -45,6 +45,15 @@ Created 5/7/1996 Heikki Tuuri extern ibool lock_print_waits; #endif /* UNIV_DEBUG */ +/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by + setting innodb_lock_schedule_algorithm. 
*/ +enum innodb_lock_schedule_algorithm_t { + INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, /*!< First Come First Served */ + INNODB_LOCK_SCHEDULE_ALGORITHM_VATS /*!< Variance-Aware-Transaction-Scheduling */ +}; + +extern ulong innodb_lock_schedule_algorithm; + extern ulint srv_n_lock_deadlock_count; /*********************************************************************//** diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index 8650cdd106a..eb47ef6e685 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -76,6 +76,9 @@ bitmap */ #define LOCK_PAGE_BITMAP_MARGIN 64 +/** Lock scheduling algorithm */ +ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS; + /* An explicit record lock affects both the record and the gap before it. An implicit x-lock does not affect the gap, it only locks the index record from read or update. @@ -2005,6 +2008,72 @@ wsrep_print_wait_locks( } #endif /* WITH_WSREP */ +/*********************************************************************//** +Check if lock1 has higher priority than lock2. +NULL has lowest priority. +If neither of them is wait lock, the first one has higher priority. +If only one of them is a wait lock, it has lower priority. +Otherwise, the one with an older transaction has higher priority. +@returns true if lock1 has higher priority, false otherwise. */ +bool +has_higher_priority( + lock_t *lock1, + lock_t *lock2) +{ + if (lock1 == NULL) { + return false; + } else if (lock2 == NULL) { + return true; + } + if (!lock_get_wait(lock1)) { + return true; + } else if (!lock_get_wait(lock2)) { + return false; + } + return lock1->trx->start_time < lock2->trx->start_time; +} + +/*********************************************************************//** +Insert a lock to the hash list according to the mode (whether it is a wait lock) +and the age of the transaction the it is associated with. +If the lock is not a wait lock, insert it to the head of the hash list. +Otherwise, insert it to the middle of the wait locks according to the age of the +transaciton. +*/ +static +void +lock_rec_insert_by_trx_age( + lock_t *in_lock, /*!< in: lock to be insert */ + bool wait) /*!< in: whether it's a wait lock */ +{ + ulint space; + ulint page_no; + ulint rec_fold; + hash_cell_t cell; + lock_t* node; + lock_t* next; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); + cell = hash_get_nth_cell(lock_sys->rec_hash, + hash_calc_hash(rec_fold, lock_sys->rec_hash)); + + node = (lock_t *) cell->node; + // If in_lock is not a wait lock, we insert it to the head of the list. + if (node == NULL || !wait || has_higher_priority(in_lock, node)) { + cell->node = in_lock; + in_lock->hash = node; + return; + } + while (node != NULL && has_higher_priority((lock_t *) node->hash, in_lock)) { + node = (lock_t *) node->hash; + } + next = (lock_t *) node->hash; + node->hash = in_lock; + in_lock->hash = next; +} + /*********************************************************************//** Creates a new record lock and inserts it to the lock queue. Does NOT check for deadlocks or lock compatibility! 
@@ -2167,13 +2236,19 @@ lock_rec_create( return(lock); } trx_mutex_exit(c_lock->trx); + } else if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { + lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); } else { HASH_INSERT(lock_t, hash, lock_sys->rec_hash, - lock_rec_fold(space, page_no), lock); + lock_rec_fold(space, page_no), lock); } #else - HASH_INSERT(lock_t, hash, lock_sys->rec_hash, - lock_rec_fold(space, page_no), lock); + if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { + lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); + } else { + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); + } #endif /* WITH_WSREP */ lock_sys->rec_num++; @@ -2858,6 +2933,27 @@ lock_rec_cancel( trx_mutex_exit(lock->trx); } +/*************************************************************//** +Move the lock to the head of the hash list. */ +static +void +lock_rec_move_to_front( + lock_t *lock_to_move, /*!< in: lock to be moved */ + ulint rec_fold) /*!< in: rec fold of the lock */ +{ + if (lock_to_move != NULL) + { + // Move the target lock to the head of the list + hash_cell_t* cell = hash_get_nth_cell(lock_sys->rec_hash, + hash_calc_hash(rec_fold, lock_sys->rec_hash)); + if (lock_to_move != cell->node) { + lock_t *next = (lock_t *) cell->node; + cell->node = lock_to_move; + lock_to_move->hash = next; + } + } +} + /*************************************************************//** Removes a record lock request, waiting or granted, from the queue and grants locks to other transactions in the queue if they now are entitled @@ -2898,20 +2994,51 @@ lock_rec_dequeue_from_page( MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_DEC(MONITOR_NUM_RECLOCK); - /* Check if waiting locks in the queue can now be granted: grant - locks if there are no conflicting locks ahead. Stop at the first - X lock that is waiting or has been granted. */ + if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) { + /* Check if waiting locks in the queue can now be granted: grant + locks if there are no conflicting locks ahead. Stop at the first + X lock that is waiting or has been granted. */ - for (lock = lock_rec_get_first_on_page_addr(space, page_no); - lock != NULL; - lock = lock_rec_get_next_on_page(lock)) { + for (lock = lock_rec_get_first_on_page_addr(space, page_no); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { - if (lock_get_wait(lock) - && !lock_rec_has_to_wait_in_queue(lock)) { + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { - /* Grant the lock */ - ut_ad(lock->trx != in_lock->trx); - lock_grant(lock); + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock); + } + } + } else { + /* Grant locks if there are no conflicting locks ahead. + Move granted locks to the head of the list. */ + for (lock = lock_rec_get_first_on_page_addr(space, page_no); + lock != NULL;) { + + /* If the lock is a wait lock on this page, and it does not need to wait. */ + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no) + && lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + lock_grant(lock); + + if (previous != NULL) { + /* Move the lock to the head of the list. */ + HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock); + lock_rec_move_to_front(lock, rec_fold); + } else { + /* Already at the head of the list. */ + previous = lock; + } + /* Move on to the next lock. 
*/ + lock = static_cast(HASH_GET_NEXT(hash, previous)); + } else { + previous = lock; + lock = static_cast(HASH_GET_NEXT(hash, lock)); + } } } } From e93d44f2d75f425b0a8bfa2fe4309b93d51e1b33 Mon Sep 17 00:00:00 2001 From: sensssz Date: Tue, 11 Oct 2016 23:02:26 -0400 Subject: [PATCH 2/7] Bug fix: add undeclared variables. --- storage/xtradb/lock/lock0lock.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index eb47ef6e685..fbaa09edf04 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -2971,7 +2971,9 @@ lock_rec_dequeue_from_page( { ulint space; ulint page_no; + ulint rec_fold lock_t* lock; + lock_t* previous = NULL; trx_lock_t* trx_lock; ut_ad(lock_mutex_own()); @@ -2982,6 +2984,7 @@ lock_rec_dequeue_from_page( space = in_lock->un_member.rec_lock.space; page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); in_lock->index->table->n_rec_locks--; From 288796f9272e5b714b16c9a0c3df88829c33ea71 Mon Sep 17 00:00:00 2001 From: sensssz Date: Tue, 11 Oct 2016 23:05:02 -0400 Subject: [PATCH 3/7] Bug fix: missing * and ; --- storage/xtradb/lock/lock0lock.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index fbaa09edf04..c02daabe03a 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -2049,7 +2049,7 @@ lock_rec_insert_by_trx_age( ulint space; ulint page_no; ulint rec_fold; - hash_cell_t cell; + hash_cell_t* cell; lock_t* node; lock_t* next; @@ -2971,7 +2971,7 @@ lock_rec_dequeue_from_page( { ulint space; ulint page_no; - ulint rec_fold + ulint rec_fold; lock_t* lock; lock_t* previous = NULL; trx_lock_t* trx_lock; From 55d2bff882a60211a05bc368102e6d58835c2e67 Mon Sep 17 00:00:00 2001 From: sensssz Date: Tue, 11 Oct 2016 23:27:03 -0400 Subject: [PATCH 4/7] Bug fix: add * and ; for innodb --- storage/innobase/lock/lock0lock.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 37f15c0d3e8..e100ea40a66 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -2026,7 +2026,7 @@ lock_rec_insert_by_trx_age( ulint space; ulint page_no; ulint rec_fold; - hash_cell_t cell; + hash_cell_t* cell; lock_t* node; lock_t* next; @@ -2935,7 +2935,7 @@ lock_rec_dequeue_from_page( { ulint space; ulint page_no; - ulint rec_fold + ulint rec_fold; lock_t* lock; lock_t* previous = NULL; trx_lock_t* trx_lock; From 5dc7ad87b8302afbe91824121e7eca6b43ae9256 Mon Sep 17 00:00:00 2001 From: sensssz Date: Wed, 12 Oct 2016 21:52:14 -0400 Subject: [PATCH 5/7] Reduce conflict during in-order replication. --- storage/innobase/lock/lock0lock.cc | 15 +++++++++++++++ storage/xtradb/lock/lock0lock.cc | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index e100ea40a66..0ba217511f3 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -388,6 +388,9 @@ extern "C" int thd_need_wait_for(const MYSQL_THD thd); extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); +extern "C" +int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); + /** Stack to use during DFS search. Currently only a single stack is required because there is no parallel deadlock check. 
This stack is protected by the lock_sys_t::mutex. */ @@ -1988,6 +1991,8 @@ wsrep_print_wait_locks( /*********************************************************************//** Check if lock1 has higher priority than lock2. NULL has lowest priority. +Respect the preference of the upper server layer to reduce conflict +during in-order parallel replication. If neither of them is wait lock, the first one has higher priority. If only one of them is a wait lock, it has lower priority. Otherwise, the one with an older transaction has higher priority. @@ -2002,6 +2007,16 @@ has_higher_priority( } else if (lock2 == NULL) { return true; } + // Ask the upper server layer if any of the two trx should be prefered. + int preference = thd_deadlock_victim_preference(lock1->thd, lock2->thd); + if (preference == -1) { + // lock1 is preferred as a victim, so lock2 has higher priority + return false; + } else if (preference == 1) { + // lock2 is preferred as a victim, so lock1 has higher priority + return true; + } + // No preference. Compre them by wait mode and trx age. if (!lock_get_wait(lock1)) { return true; } else if (!lock_get_wait(lock2)) { diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index c02daabe03a..d98d09d34e4 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -388,6 +388,9 @@ extern "C" int thd_need_wait_for(const MYSQL_THD thd); extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); +extern "C" +int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); + /** Stack to use during DFS search. Currently only a single stack is required because there is no parallel deadlock check. This stack is protected by the lock_sys_t::mutex. */ @@ -2011,6 +2014,8 @@ wsrep_print_wait_locks( /*********************************************************************//** Check if lock1 has higher priority than lock2. NULL has lowest priority. +Respect the preference of the upper server layer to reduce conflict +during in-order parallel replication. If neither of them is wait lock, the first one has higher priority. If only one of them is a wait lock, it has lower priority. Otherwise, the one with an older transaction has higher priority. @@ -2025,6 +2030,16 @@ has_higher_priority( } else if (lock2 == NULL) { return true; } + // Ask the upper server layer if any of the two trx should be prefered. + int preference = thd_deadlock_victim_preference(lock1->thd, lock2->thd); + if (preference == -1) { + // lock1 is preferred as a victim, so lock2 has higher priority + return false; + } else if (preference == 1) { + // lock2 is preferred as a victim, so lock1 has higher priority + return true; + } + // No preference. Compre them by wait mode and trx age. if (!lock_get_wait(lock1)) { return true; } else if (!lock_get_wait(lock2)) { From 0a769b00b5bfa5384a69f1f0d526086d3943ba03 Mon Sep 17 00:00:00 2001 From: sensssz Date: Wed, 12 Oct 2016 21:54:31 -0400 Subject: [PATCH 6/7] Get thd by lock->trx->mysql_thd. --- storage/innobase/lock/lock0lock.cc | 2 +- storage/xtradb/lock/lock0lock.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 0ba217511f3..bdd85fb0e35 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -2008,7 +2008,7 @@ has_higher_priority( return true; } // Ask the upper server layer if any of the two trx should be prefered. 
- int preference = thd_deadlock_victim_preference(lock1->thd, lock2->thd); + int preference = thd_deadlock_victim_preference(lock1->trx->mysql_thd, lock2->trx->mysql_thd); if (preference == -1) { // lock1 is preferred as a victim, so lock2 has higher priority return false; diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index d98d09d34e4..abc0cd32d58 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -2031,7 +2031,7 @@ has_higher_priority( return true; } // Ask the upper server layer if any of the two trx should be prefered. - int preference = thd_deadlock_victim_preference(lock1->thd, lock2->thd); + int preference = thd_deadlock_victim_preference(lock1->trx->mysql_thd, lock2->trx->mysql_thd); if (preference == -1) { // lock1 is preferred as a victim, so lock2 has higher priority return false; From 183c02839f032e1d9057fd4e278806c26b016826 Mon Sep 17 00:00:00 2001 From: sensssz Date: Thu, 13 Oct 2016 01:23:21 -0400 Subject: [PATCH 7/7] Move the lock after deadlock is resolved. --- storage/innobase/lock/lock0lock.cc | 26 +++++++++++++++----------- storage/xtradb/lock/lock0lock.cc | 26 +++++++++++++++----------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index bdd85fb0e35..572dc0e1dc4 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -2228,19 +2228,13 @@ lock_rec_create( return(lock); } trx_mutex_exit(c_lock->trx); - } else if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { - lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); } else { HASH_INSERT(lock_t, hash, lock_sys->rec_hash, lock_rec_fold(space, page_no), lock); } #else - if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { - lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); - } else { - HASH_INSERT(lock_t, hash, lock_sys->rec_hash, - lock_rec_fold(space, page_no), lock); - } + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); #endif /* WITH_WSREP */ if (!caller_owns_trx_mutex) { @@ -2371,6 +2365,13 @@ lock_rec_enqueue_waiting( return(DB_SUCCESS_LOCKED_REC); } + // Move it only when it does not cause a deadlock. + if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { + HASH_DELETE(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(buf_block_get_space(block), buf_block_get_page_no(block)), lock); + lock_rec_insert_by_trx_age(lock, true); + } + trx->lock.que_state = TRX_QUE_LOCK_WAIT; trx->lock.was_chosen_as_deadlock_victim = FALSE; @@ -4225,7 +4226,8 @@ lock_get_first_lock( } ut_a(lock != NULL); - ut_a(lock != ctx->wait_lock); + ut_a(lock != ctx->wait_lock || + innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS); ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock)); return(lock); @@ -6432,8 +6434,10 @@ lock_rec_queue_validate( mode, 0, 0, block, heap_no, lock->trx)); #endif /* WITH_WSREP */ - } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { - + } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock) + && innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) { + // If using VATS, it's possible that a wait lock is inserted to a place in the list + // such that it does not need to wait. 
ut_a(lock_rec_has_to_wait_in_queue(lock)); } } diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index abc0cd32d58..2091f926153 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -2251,19 +2251,13 @@ lock_rec_create( return(lock); } trx_mutex_exit(c_lock->trx); - } else if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { - lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); } else { HASH_INSERT(lock_t, hash, lock_sys->rec_hash, lock_rec_fold(space, page_no), lock); } #else - if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { - lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT); - } else { - HASH_INSERT(lock_t, hash, lock_sys->rec_hash, - lock_rec_fold(space, page_no), lock); - } + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); #endif /* WITH_WSREP */ lock_sys->rec_num++; @@ -2399,6 +2393,13 @@ lock_rec_enqueue_waiting( return(DB_SUCCESS_LOCKED_REC); } + // Move it only when it does not cause a deadlock. + if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) { + HASH_DELETE(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(buf_block_get_space(block), buf_block_get_page_no(block)), lock); + lock_rec_insert_by_trx_age(lock, true); + } + trx->lock.que_state = TRX_QUE_LOCK_WAIT; trx->lock.was_chosen_as_deadlock_victim = FALSE; @@ -4263,7 +4264,8 @@ lock_get_first_lock( } ut_a(lock != NULL); - ut_a(lock != ctx->wait_lock); + ut_a(lock != ctx->wait_lock || + innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS); ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock)); return(lock); @@ -6495,8 +6497,10 @@ lock_rec_queue_validate( mode, 0, 0, block, heap_no, lock->trx->id)); #endif /* WITH_WSREP */ - } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { - + } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock) + && innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) { + // If using VATS, it's possible that a wait lock is inserted to a place in the list + // such that it does not need to wait. ut_a(lock_rec_has_to_wait_in_queue(lock)); } }
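
The other half of the change is the grant pass that both copies of lock_rec_dequeue_from_page() run when VATS is selected: after a lock is released, walk the queue, grant every waiting lock that no longer has to wait, and pull each newly granted lock to the front so granted locks stay ahead of waiting ones. The sketch below models that loop on the same kind of simplified list as before; it omits the space/page_no filtering and rec_fold hashing of the real hash chain, and has_to_wait and grant are stand-ins for lock_rec_has_to_wait_in_queue() and lock_grant().

#include <cstddef>

/* Simplified stand-in for lock_t, as in the earlier sketch; illustrative only. */
struct Lock {
	bool	waiting;	/* plays the role of lock_get_wait(lock) */
	Lock*	next;		/* plays the role of lock->hash */
};

/* Grant pass after a lock is released, VATS flavour: grant every waiting
lock that no longer has to wait and move it to the head of the list. */
static void grant_after_release(Lock** head,
				bool (*has_to_wait)(const Lock*),
				void (*grant)(Lock*))
{
	Lock*	prev = NULL;
	Lock*	lock = *head;

	while (lock != NULL) {
		Lock*	next = lock->next;

		if (lock->waiting && !has_to_wait(lock)) {
			grant(lock);
			lock->waiting = false;	/* lock_grant() clears the wait flag */

			if (prev != NULL) {
				/* Unlink the lock and move it to the head,
				as lock_rec_move_to_front() does. */
				prev->next = next;
				lock->next = *head;
				*head = lock;
			} else {
				/* Already at the effective head; keep it. */
				prev = lock;
			}
		} else {
			prev = lock;
		}
		lock = next;
	}
}

With the series applied, the scheduler is chosen through the new innodb_lock_schedule_algorithm enum option (fcfs or vats, default fcfs); since it is declared with PLUGIN_VAR_RQCMDARG and without PLUGIN_VAR_READONLY, it should be settable from the command line or my.cnf and, presumably, changeable at runtime as a global variable.
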