Browse Source
fix bogus stalls in the lock tree for low concurrency applications
fix bogus stalls in the lock tree for low concurrency applications
Merge into the MariaDB tree the pull request from Rich Prohaska for
PerconaFT. These changes are needed to get parallel replication to
work with TokuDB. Once the pull request is accepted by Percona and the new upstream version enters MariaDB, this commit can be superseded.
Original commit message from Rich Prohaska:
1. Fix the release before wait race
The release before wait race occurs when a lock is released by transaction A after transaction B tried to acquire it but before transaction B has a chance to register its pending lock request. There are several ways to fix this problem, but we want to optimize for the common situation of minimal lock conflicts, which is what the lock acquisition algorithm currently does. Our solution to the release before wait race is for transaction B to retry its lock request after its lock request has been added to the pending lock set.
2. Fix the retry race
The retry race occurs in the current lock retry algorithm which assumes that if some transaction is running lock retry, then my transaction does not also need to run it. There is a chance that some pending lock requests will be skipped, but these lock requests will eventually time out. For applications with small numbers of concurrent transactions, timeouts will frequently occur, and the application throughput will be very small.
The solution to the retry race is to use a group retry algorithm. All threads run through the retry logic. Sequence numbers are used to group retries into batches such that one transaction can run the retry logic on behalf of several transactions. This amortizes the retry cost. The sequence numbers also ensure that when a transaction releases its locks, all of the pending lock requests that it is blocking are retried.
3. Implement a mechanism to find and kill a pending lock request
Tags lock requests with a client id, use the client id as a key into the pending lock requests sets to find a lock request, complete the lock request with a lock timeout error.
Copyright (c) 2016, Rich Prohaska
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pull/260/head
21 changed files with 606 additions and 107 deletions
-
5storage/tokudb/PerconaFT/buildheader/make_tdb.cc
-
9storage/tokudb/PerconaFT/ft/txn/txn.cc
-
5storage/tokudb/PerconaFT/ft/txn/txn.h
-
130storage/tokudb/PerconaFT/locktree/lock_request.cc
-
15storage/tokudb/PerconaFT/locktree/lock_request.h
-
14storage/tokudb/PerconaFT/locktree/locktree.cc
-
8storage/tokudb/PerconaFT/locktree/locktree.h
-
13storage/tokudb/PerconaFT/locktree/manager.cc
-
100storage/tokudb/PerconaFT/locktree/tests/kill_waiter.cc
-
3storage/tokudb/PerconaFT/locktree/tests/lock_request_killed.cc
-
1storage/tokudb/PerconaFT/locktree/tests/lock_request_not_killed.cc
-
89storage/tokudb/PerconaFT/locktree/tests/lock_request_start_release_wait.cc
-
31storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race.cc
-
127storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race_3.cc
-
128storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_wait_race_2.cc
-
9storage/tokudb/PerconaFT/src/tests/test_iterate_live_transactions.cc
-
3storage/tokudb/PerconaFT/src/tests/test_stress0.cc
-
5storage/tokudb/PerconaFT/src/ydb.cc
-
8storage/tokudb/PerconaFT/src/ydb_txn.cc
-
8storage/tokudb/tokudb_information_schema.cc
-
2storage/tokudb/tokudb_txn.h
@ -0,0 +1,100 @@ |
|||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
|||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
|||
|
|||
// test the lock manager kill waiter function
|
|||
|
|||
#include "locktree.h"
|
|||
#include "lock_request.h"
|
|||
#include "test.h"
|
|||
#include "locktree_unit_test.h"
|
|||
#include <thread>
|
|||
#include <atomic>
|
|||
|
|||
namespace toku { |
|||
|
|||
// Total wait budget handed to lock_request::wait for each waiter.
const uint64_t my_lock_wait_time = 1000 * 1000;
// Interval at which the killed callback is polled during a wait.
const uint64_t my_killed_time = 500 * 1000;
// Number of lock requests: one holder plus n_locks-1 blocked waiters.
const int n_locks = 4;
|||
|
|||
// Periodic callback invoked while a lock_request::wait is blocked.
// Traces each invocation to stderr and returns 0, which tells the
// waiter to keep waiting (a non-zero return would abort the wait).
static int my_killed_callback(void) {
    fprintf(stderr, "%s:%u %s\n", __FILE__, __LINE__, __FUNCTION__);
    return 0;
}
|||
|
|||
// Release the lock on the range [left, right] held by txn_id.
// The single range is packaged into a range_buffer because that is
// the interface locktree::release_locks expects.
static void locktree_release_lock(locktree *lt, TXNID txn_id, const DBT *left, const DBT *right) {
    range_buffer released;
    released.create();
    released.append(left, right);
    lt->release_locks(txn_id, &released);
    released.destroy();
}
|||
|
|||
// Waiter-thread body: block on the given lock request until the main
// thread kills it via kill_waiter.  The wait is therefore expected to
// fail with DB_LOCK_NOTGRANTED, after which 'done' is raised so the
// main thread can observe that this waiter finished.
static void wait_lock(lock_request *lr, std::atomic_int *done) {
    const int rc = lr->wait(my_lock_wait_time, my_killed_time, my_killed_callback);
    assert(rc == DB_LOCK_NOTGRANTED);
    done->store(1);
}
|||
|
|||
static void test_kill_waiter(void) { |
|||
int r; |
|||
|
|||
locktree_manager mgr; |
|||
mgr.create(nullptr, nullptr, nullptr, nullptr); |
|||
|
|||
DICTIONARY_ID dict_id = { 1 }; |
|||
locktree *lt = mgr.get_lt(dict_id, dbt_comparator, nullptr); |
|||
|
|||
const DBT *one = get_dbt(1); |
|||
|
|||
lock_request locks[n_locks]; |
|||
std::thread waiters[n_locks-1]; |
|||
for (int i = 0; i < n_locks; i++) { |
|||
locks[i].create(); |
|||
locks[i].set(lt, i+1, one, one, lock_request::type::WRITE, false, &waiters[i]); |
|||
} |
|||
|
|||
// txn 'n_locks' grabs the lock
|
|||
r = locks[n_locks-1].start(); |
|||
assert_zero(r); |
|||
|
|||
for (int i = 0; i < n_locks-1; i++) { |
|||
r = locks[i].start(); |
|||
assert(r == DB_LOCK_NOTGRANTED); |
|||
} |
|||
|
|||
std::atomic_int done[n_locks-1]; |
|||
for (int i = 0; i < n_locks-1; i++) { |
|||
done[i] = 0; |
|||
waiters[i] = std::thread(wait_lock, &locks[i], &done[i]); |
|||
} |
|||
|
|||
for (int i = 0; i < n_locks-1; i++) { |
|||
assert(!done[i]); |
|||
} |
|||
|
|||
sleep(1); |
|||
for (int i = 0; i < n_locks-1; i++) { |
|||
mgr.kill_waiter(&waiters[i]); |
|||
while (!done[i]) sleep(1); |
|||
waiters[i].join(); |
|||
for (int j = i+1; j < n_locks-1; j++) |
|||
assert(!done[j]); |
|||
} |
|||
|
|||
locktree_release_lock(lt, n_locks, one, one); |
|||
|
|||
for (int i = 0; i < n_locks; i++) { |
|||
locks[i].destroy(); |
|||
} |
|||
|
|||
mgr.release_lt(lt); |
|||
mgr.destroy(); |
|||
} |
|||
|
|||
} /* namespace toku */ |
|||
|
|||
// Test driver: run the kill-waiter test once and report success.
int main(void) {
    toku::test_kill_waiter();
    return 0;
}
|||
|
|||
@ -0,0 +1,89 @@ |
|||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
|||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
|||
|
|||
// test the race between start, release, and wait. since start does not put its
|
|||
// lock request into the pending set, the blocking txn could release its lock before
|
|||
// the first txn waits. this will block the first txn because its lock request is
|
|||
// not known when the lock is released. the bug fix is to try again when lock retries
|
|||
// are locked out.
|
|||
|
|||
#include "locktree.h"
|
|||
#include "lock_request.h"
|
|||
#include "test.h"
|
|||
#include "locktree_unit_test.h"
|
|||
#include <thread>
|
|||
#include <atomic>
|
|||
|
|||
namespace toku { |
|||
|
|||
// Total wait budget for lock_request::wait.
const uint64_t my_lock_wait_time = 1000 * 1000; // ms
// Killed-callback poll interval; also the test's stall-detection bound.
const uint64_t my_killed_time = 1 * 1000; // ms

// Time (microseconds) at which the wait began; read by my_killed_callback
// to detect a bogus stall.
static uint64_t t_wait;
|||
|
|||
static int my_killed_callback(void) { |
|||
uint64_t t_now = toku_current_time_microsec(); |
|||
assert(t_now >= t_wait); |
|||
if (t_now - t_wait >= my_killed_time*1000) |
|||
abort(); |
|||
return 0; |
|||
} |
|||
|
|||
// Helper: release txn_id's lock on [left, right] via a one-entry
// range_buffer, the form locktree::release_locks requires.
static void locktree_release_lock(locktree *lt, TXNID txn_id, const DBT *left, const DBT *right) {
    range_buffer to_release;
    to_release.create();
    to_release.append(left, right);
    lt->release_locks(txn_id, &to_release);
    to_release.destroy();
}
|||
|
|||
static void test_start_release_wait(void) { |
|||
int r; |
|||
|
|||
locktree_manager mgr; |
|||
mgr.create(nullptr, nullptr, nullptr, nullptr); |
|||
|
|||
DICTIONARY_ID dict_id = { 1 }; |
|||
locktree *lt = mgr.get_lt(dict_id, dbt_comparator, nullptr); |
|||
|
|||
const DBT *one = get_dbt(1); |
|||
|
|||
// a locks one
|
|||
lock_request a; |
|||
a.create(); |
|||
a.set(lt, 1, one, one, lock_request::type::WRITE, false); |
|||
r = a.start(); |
|||
assert(r == 0); |
|||
|
|||
// b tries to lock one, fails
|
|||
lock_request b; |
|||
b.create(); |
|||
b.set(lt, 2, one, one, lock_request::type::WRITE, false); |
|||
r = b.start(); |
|||
assert(r == DB_LOCK_NOTGRANTED); |
|||
|
|||
// a releases its lock
|
|||
locktree_release_lock(lt, 1, one, one); |
|||
|
|||
// b waits for one, gets locks immediately
|
|||
t_wait = toku_current_time_microsec(); |
|||
r = b.wait(my_lock_wait_time, my_killed_time, my_killed_callback); |
|||
assert(r == 0); |
|||
|
|||
// b releases its lock so we can exit cleanly
|
|||
locktree_release_lock(lt, 2, one, one); |
|||
|
|||
a.destroy(); |
|||
b.destroy(); |
|||
|
|||
mgr.release_lt(lt); |
|||
mgr.destroy(); |
|||
} |
|||
|
|||
} /* namespace toku */ |
|||
|
|||
// Test driver: run the start/release/wait race test once.
int main(void) {
    toku::test_start_release_wait();
    return 0;
}
|||
|
|||
@ -0,0 +1,127 @@ |
|||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
|||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
|||
#ident "$Id$"
|
|||
/*======
|
|||
This file is part of PerconaFT. |
|||
|
|||
|
|||
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
|||
|
|||
PerconaFT is free software: you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License, version 2, |
|||
as published by the Free Software Foundation. |
|||
|
|||
PerconaFT is distributed in the hope that it will be useful, |
|||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
GNU General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
|
|||
|
|||
---------------------------------------- |
|||
|
|||
PerconaFT is free software: you can redistribute it and/or modify |
|||
it under the terms of the GNU Affero General Public License, version 3, |
|||
as published by the Free Software Foundation. |
|||
|
|||
PerconaFT is distributed in the hope that it will be useful, |
|||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
GNU Affero General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU Affero General Public License |
|||
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
|
|||
======= */ |
|||
|
|||
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
|
|||
|
|||
#include <iostream>
|
|||
#include <thread>
|
|||
#include <pthread.h>
|
|||
#include "test.h"
|
|||
#include "locktree.h"
|
|||
#include "lock_request.h"
|
|||
|
|||
// Suppose that 3 threads are running a lock acquire, release, retry sequence. There is
|
|||
// a race in the retry algorithm with 2 threads running lock retry simultaneously. The
|
|||
// first thread to run retry sets a flag that will cause the second thread to skip the
|
|||
// lock retries. If the first thread progressed past the contended lock, then the second
|
|||
// thread will HANG until its lock timer pops, even when the contended lock is no longer held.
|
|||
|
|||
// This test exposes this problem as a test hang. The group retry algorithm fixes the race
|
|||
// in the lock request retry algorithm and this test should no longer hang.
|
|||
|
|||
namespace toku { |
|||
|
|||
// Iterations per worker thread.
// use 1000 when after_retry_all is implemented, otherwise use 100000
static const int n_tests = 1000; // 100000;
|
|||
|
|||
// Test hook run after retry_all_lock_requests completes a retry pass.
// Sleeping here widens the window for the retry race this test targets.
static void after_retry_all(void) {
    usleep(10 * 1000);
}
|||
|
|||
// Worker thread: repeatedly acquire, release, and retry a write lock on
// 'key' as txn 'txnid'.  All workers rendezvous on the barrier at the top
// of every iteration so their lock requests collide as often as possible.
static void run_locker(locktree *lt, TXNID txnid, const DBT *key, pthread_barrier_t *b) {
    for (int iter = 0; iter < n_tests; iter++) {
        int rc = pthread_barrier_wait(b);
        assert(rc == 0 || rc == PTHREAD_BARRIER_SERIAL_THREAD);

        lock_request req;
        req.create();
        req.set(lt, txnid, key, key, lock_request::type::WRITE, false);

        // try to acquire the lock; if it is held, wait for it
        rc = req.start();
        if (rc == DB_LOCK_NOTGRANTED)
            rc = req.wait(1000 * 1000);

        if (rc == 0) {
            // got the lock -- release it ...
            range_buffer released;
            released.create();
            released.append(key, key);
            lt->release_locks(txnid, &released);
            released.destroy();

            // ... and retry the pending lock requests we were blocking
            lock_request::retry_all_lock_requests(lt, after_retry_all);
        }

        req.destroy();
        // poison the destroyed request to catch any use-after-destroy
        memset(&req, 0xab, sizeof req);

        toku_pthread_yield();
        if ((iter % 10) == 0)
            std::cout << std::this_thread::get_id() << " " << iter << std::endl;
    }
}
|||
|
|||
} /* namespace toku */ |
|||
|
|||
int main(void) { |
|||
|
|||
toku::locktree lt; |
|||
DICTIONARY_ID dict_id = { 1 }; |
|||
lt.create(nullptr, dict_id, toku::dbt_comparator); |
|||
|
|||
const DBT *one = toku::get_dbt(1); |
|||
|
|||
const int n_workers = 3; |
|||
std::thread worker[n_workers]; |
|||
pthread_barrier_t b; |
|||
int r = pthread_barrier_init(&b, nullptr, n_workers); assert(r == 0); |
|||
for (int i = 0; i < n_workers; i++) { |
|||
worker[i] = std::thread(toku::run_locker, <, i, one, &b); |
|||
} |
|||
for (int i = 0; i < n_workers; i++) { |
|||
worker[i].join(); |
|||
} |
|||
r = pthread_barrier_destroy(&b); assert(r == 0); |
|||
lt.release_reference(); |
|||
lt.destroy(); |
|||
return 0; |
|||
} |
|||
|
|||
@ -0,0 +1,128 @@ |
|||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
|||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
|||
#ident "$Id$"
|
|||
/*======
|
|||
This file is part of PerconaFT. |
|||
|
|||
|
|||
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
|||
|
|||
PerconaFT is free software: you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License, version 2, |
|||
as published by the Free Software Foundation. |
|||
|
|||
PerconaFT is distributed in the hope that it will be useful, |
|||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
GNU General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
|
|||
|
|||
---------------------------------------- |
|||
|
|||
PerconaFT is free software: you can redistribute it and/or modify |
|||
it under the terms of the GNU Affero General Public License, version 3, |
|||
as published by the Free Software Foundation. |
|||
|
|||
PerconaFT is distributed in the hope that it will be useful, |
|||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
GNU Affero General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU Affero General Public License |
|||
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
|
|||
======= */ |
|||
|
|||
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
|
|||
|
|||
#include <iostream>
|
|||
#include <thread>
|
|||
#include <pthread.h>
|
|||
#include "test.h"
|
|||
#include "locktree.h"
|
|||
#include "lock_request.h"
|
|||
|
|||
// Suppose that 2 threads are running a lock acquire, release, retry sequence. There is a
|
|||
// race between the acquire and the release with 2 threads. If thread 1 acquires a lock,
|
|||
// and thread 2 tries to acquire the same lock and fails, thread 1 may release its lock and retry
|
|||
// pending lock requests BEFORE thread 2 adds itself to the pending lock requests. If this
|
|||
// happens, then thread 2 will HANG until its lock timer expires even when the lock it is
|
|||
// waiting for is FREE.
|
|||
|
|||
// This test exposes this problem as a test hang. If the race is fixed, then the test runs to
|
|||
// completion.
|
|||
|
|||
namespace toku { |
|||
|
|||
// Test hook invoked by lock_request::start after a failed acquisition but
// before the request joins the pending set; sleeping here makes the
// start-vs-release race easy to reproduce.
static void start_before_pending(void) {
    usleep(10 * 1000);
}
|||
|
|||
// Worker thread: repeatedly acquire, release, and retry a write lock on
// 'key' as txn 'txnid', rendezvousing with the other worker on the
// barrier each iteration so the acquire/release race is hit often.
static void run_locker(locktree *lt, TXNID txnid, const DBT *key, pthread_barrier_t *b) {
    const int n_iterations = 100000;
    for (int iter = 0; iter < n_iterations; iter++) {
        int rc = pthread_barrier_wait(b);
        assert(rc == 0 || rc == PTHREAD_BARRIER_SERIAL_THREAD);

        lock_request req;
        req.create();
        req.set(lt, txnid, key, key, lock_request::type::WRITE, false);

        // with this callback installed the race is easy to reproduce;
        // without it, several test runs may be needed before it happens
        req.set_start_before_pending_test_callback(start_before_pending);

        // try to acquire the lock; if it is held, wait for it
        rc = req.start();
        if (rc == DB_LOCK_NOTGRANTED)
            rc = req.wait(1000 * 1000);

        if (rc == 0) {
            // got the lock -- release it ...
            range_buffer released;
            released.create();
            released.append(key, key);
            lt->release_locks(txnid, &released);
            released.destroy();

            // ... and retry the pending lock requests we were blocking
            lock_request::retry_all_lock_requests(lt);
        }

        req.destroy();
        // poison the destroyed request to catch any use-after-destroy
        memset(&req, 0xab, sizeof req);

        toku_pthread_yield();
        if ((iter % 10) == 0)
            std::cout << std::this_thread::get_id() << " " << iter << std::endl;
    }
}
|||
|
|||
} /* namespace toku */ |
|||
|
|||
int main(void) { |
|||
|
|||
toku::locktree lt; |
|||
DICTIONARY_ID dict_id = { 1 }; |
|||
lt.create(nullptr, dict_id, toku::dbt_comparator); |
|||
|
|||
const DBT *one = toku::get_dbt(1); |
|||
|
|||
const int n_workers = 2; |
|||
std::thread worker[n_workers]; |
|||
pthread_barrier_t b; |
|||
int r = pthread_barrier_init(&b, nullptr, n_workers); assert(r == 0); |
|||
for (int i = 0; i < n_workers; i++) { |
|||
worker[i] = std::thread(toku::run_locker, <, i, one, &b); |
|||
} |
|||
for (int i = 0; i < n_workers; i++) { |
|||
worker[i].join(); |
|||
} |
|||
r = pthread_barrier_destroy(&b); assert(r == 0); |
|||
lt.release_reference(); |
|||
lt.destroy(); |
|||
return 0; |
|||
} |
|||
|
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue