Browse Source

MDEV-27774 Reduce scalability bottlenecks in mtr_t::commit()

A prominent bottleneck in mtr_t::commit() is log_sys.mutex between
log_sys.append_prepare() and log_close().

User-visible change: The minimum innodb_log_file_size will be
increased from 1MiB to 4MiB so that some conditions can be
trivially satisfied.

log_sys.latch (log_latch): Replaces log_sys.mutex and
log_sys.flush_order_mutex. Copying mtr_t::m_log to
log_sys.buf is protected by a shared log_sys.latch.
Writes from log_sys.buf to the file system will be protected
by an exclusive log_sys.latch.

log_sys.lsn_lock: Protects the allocation of log buffer
in log_sys.append_prepare().

sspin_lock: A simple spin lock, for log_sys.lsn_lock.

Thanks to Vladislav Vaintroub for suggesting this idea, and for
reviewing these changes.

mariadb-backup: Replace some uses of log_sys.mutex with recv_sys.mutex.

buf_pool_t::insert_into_flush_list(): Implement sorting of flush_list
because ordering is otherwise no longer guaranteed. Ordering by LSN
is needed for the proper operation of redo log checkpoints.

log_sys.append_prepare(): Advance log_sys.lsn and log_sys.buf_free by
the length, and return the old values. Also increment write_to_buf,
which was previously done in log_close().

mtr_t::finish_write(): Obtain the buffer pointer from
log_sys.append_prepare().

log_sys.buf_free: Make the field Atomic_relaxed,
to simplify log_flush_margin(). Use only loads and stores
to avoid costly read-modify-write atomic operations.

buf_pool.flush_list_requests: Replaces
export_vars.innodb_buffer_pool_write_requests
and srv_stats.buf_pool_write_requests.
Protected by buf_pool.flush_list_mutex.

buf_pool_t::insert_into_flush_list(): Do not invoke page_cleaner_wakeup().
Let the caller do that after a batch of calls.

recv_recover_page(): Invoke a minimal part of
buf_pool.insert_into_flush_list().

ReleaseBlocks::modified: The number of pages added to buf_pool.flush_list.

ReleaseBlocks::operator(): Merge buf_flush_note_modification() here.

log_t::set_capacity(): Renamed from log_set_capacity().
pull/2215/head
Marko Mäkelä 4 years ago
parent
commit
a635c40648
  1. 61
      extra/mariabackup/xtrabackup.cc
  2. 20
      mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result
  3. 20
      mysql-test/suite/innodb/r/log_corruption.result
  4. 2
      mysql-test/suite/innodb/r/log_file_size.result
  5. 22
      mysql-test/suite/innodb/t/log_corruption.test
  6. 2
      mysql-test/suite/innodb/t/log_file_size.test
  7. 2
      mysql-test/suite/mariabackup/innodb_redo_log_overwrite.opt
  8. 2
      mysql-test/suite/mariabackup/innodb_redo_overwrite.opt
  9. 3
      mysql-test/suite/mariabackup/innodb_redo_overwrite.result
  10. 3
      mysql-test/suite/mariabackup/innodb_redo_overwrite.test
  11. 2
      mysql-test/suite/perfschema/r/sxlock_func.result
  12. 2
      mysql-test/suite/sys_vars/r/sysvars_innodb.result
  13. 68
      storage/innobase/buf/buf0flu.cc
  14. 44
      storage/innobase/fil/fil0fil.cc
  15. 15
      storage/innobase/handler/ha_innodb.cc
  16. 17
      storage/innobase/include/buf0buf.h
  17. 29
      storage/innobase/include/buf0flu.h
  18. 23
      storage/innobase/include/fil0fil.h
  19. 91
      storage/innobase/include/log0log.h
  20. 6
      storage/innobase/include/log0recv.h
  21. 8
      storage/innobase/include/mtr0mtr.h
  22. 5
      storage/innobase/include/mtr0mtr.inl
  23. 4
      storage/innobase/include/srv0srv.h
  24. 11
      storage/innobase/include/srw_lock.h
  25. 3
      storage/innobase/include/univ.i
  26. 126
      storage/innobase/log/log0log.cc
  27. 66
      storage/innobase/log/log0recv.cc
  28. 291
      storage/innobase/mtr/mtr0mtr.cc
  29. 7
      storage/innobase/srv/srv0mon.cc
  30. 11
      storage/innobase/srv/srv0srv.cc
  31. 18
      storage/innobase/srv/srv0start.cc
  32. 9
      storage/innobase/sync/srw_lock.cc

61
extra/mariabackup/xtrabackup.cc

@ -1582,9 +1582,8 @@ struct my_option xb_server_options[] =
{"innodb_log_file_size", OPT_INNODB_LOG_FILE_SIZE,
"Ignored for mysqld option compatibility",
(G_PTR*) &srv_log_file_size, (G_PTR*) &srv_log_file_size, 0,
GET_ULL, REQUIRED_ARG, 48 << 20, 1 << 20,
std::numeric_limits<ulonglong>::max(), 0,
UNIV_PAGE_SIZE_MAX, 0},
GET_ULL, REQUIRED_ARG, 96 << 20, 4 << 20,
std::numeric_limits<ulonglong>::max(), 0, 4096, 0},
{"innodb_log_group_home_dir", OPT_INNODB_LOG_GROUP_HOME_DIR,
"Path to InnoDB log files.", &srv_log_group_home_dir,
&srv_log_group_home_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
@ -2540,10 +2539,10 @@ void xtrabackup_io_throttling()
if (!xtrabackup_backup)
return;
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
if (xtrabackup_throttle && (io_ticket--) < 0)
mysql_cond_wait(&wait_throttle, &log_sys.mutex);
mysql_mutex_unlock(&log_sys.mutex);
mysql_cond_wait(&wait_throttle, &recv_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
}
static
@ -2972,7 +2971,7 @@ skip:
@return whether the operation failed */
static bool xtrabackup_copy_logfile()
{
mysql_mutex_assert_owner(&log_sys.mutex);
mysql_mutex_assert_owner(&recv_sys.mutex);
DBUG_EXECUTE_IF("log_checksum_mismatch", return false;);
ut_a(dst_log_file);
@ -2980,7 +2979,6 @@ static bool xtrabackup_copy_logfile()
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
const size_t block_size_1{log_sys.get_block_size() - 1};
mysql_mutex_lock(&recv_sys.mutex);
#ifdef HAVE_PMEM
if (log_sys.is_pmem())
{
@ -3127,7 +3125,6 @@ static bool xtrabackup_copy_logfile()
#ifdef HAVE_PMEM
write_error:
#endif
mysql_mutex_unlock(&recv_sys.mutex);
msg("Error: write to ib_logfile0 failed");
return true;
}
@ -3148,12 +3145,11 @@ static bool xtrabackup_copy_logfile()
if (recv_sys.offset < log_sys.get_block_size())
break;
mysql_mutex_unlock(&recv_sys.mutex);
if (xtrabackup_throttle && io_ticket-- < 0)
mysql_cond_wait(&wait_throttle, &log_sys.mutex);
mysql_cond_wait(&wait_throttle, &recv_sys.mutex);
retry_count= 0;
continue;
}
else
{
@ -3173,7 +3169,6 @@ static bool xtrabackup_copy_logfile()
mysql_mutex_lock(&recv_sys.mutex);
}
mysql_mutex_unlock(&recv_sys.mutex);
msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn);
return false;
}
@ -3204,16 +3199,16 @@ extern lsn_t server_lsn_after_lock;
static void log_copying_thread()
{
my_thread_init();
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
while (!xtrabackup_copy_logfile() &&
(!metadata_to_lsn || metadata_to_lsn > recv_sys.lsn))
{
timespec abstime;
set_timespec_nsec(abstime, 1000ULL * xtrabackup_log_copy_interval);
mysql_cond_timedwait(&log_copying_stop, &log_sys.mutex, &abstime);
mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex, &abstime);
}
log_copying_running= false;
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
my_thread_end();
}
@ -3226,13 +3221,13 @@ static void *io_watching_thread(void*)
/* currently, for --backup only */
ut_a(xtrabackup_backup);
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
while (log_copying_running && !metadata_to_lsn)
{
timespec abstime;
set_timespec(abstime, 1);
mysql_cond_timedwait(&log_copying_stop, &log_sys.mutex, &abstime);
mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex, &abstime);
io_ticket= xtrabackup_throttle;
mysql_cond_broadcast(&wait_throttle);
}
@ -3240,7 +3235,7 @@ static void *io_watching_thread(void*)
/* stop io throttle */
xtrabackup_throttle= 0;
mysql_cond_broadcast(&wait_throttle);
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
return nullptr;
}
@ -4512,7 +4507,7 @@ static void stop_backup_threads(bool running)
@return whether the operation succeeded */
static bool xtrabackup_backup_low()
{
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
ut_ad(!metadata_to_lsn);
/* read the latest checkpoint lsn */
@ -4531,19 +4526,19 @@ static bool xtrabackup_backup_low()
recv_sys.lsn = lsn;
mysql_cond_broadcast(&log_copying_stop);
const bool running= log_copying_running;
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
stop_backup_threads(running);
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
}
if (metadata_to_lsn && xtrabackup_copy_logfile()) {
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
ds_close(dst_log_file);
dst_log_file = NULL;
return false;
}
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
if (ds_close(dst_log_file) || !metadata_to_lsn) {
dst_log_file = NULL;
@ -4632,10 +4627,10 @@ static bool xtrabackup_backup_func()
if(innodb_init_param()) {
fail:
if (log_copying_running) {
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
metadata_to_lsn = 1;
mysql_cond_broadcast(&log_copying_stop);
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
stop_backup_threads(true);
}
@ -4692,12 +4687,12 @@ fail:
log_sys.create();
/* get current checkpoint_lsn */
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
if (recv_sys.find_checkpoint() != DB_SUCCESS) {
msg("Error: cannot read redo log header");
unlock_and_fail:
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
free_and_fail:
aligned_free(const_cast<byte*>(field_ref_zero));
field_ref_zero = nullptr;
@ -4710,7 +4705,7 @@ free_and_fail:
}
recv_needed_recovery = true;
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
/* create extra LSN dir if it does not exist. */
if (xtrabackup_extra_lsndir
@ -4772,12 +4767,12 @@ free_and_fail:
/* copy log file by current position */
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
recv_sys.lsn = log_sys.next_checkpoint_lsn;
const bool log_copy_failed = xtrabackup_copy_logfile();
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
if (log_copy_failed) {
log_copying_running = false;
@ -5212,7 +5207,7 @@ xb_delta_open_matching_space(
return OS_FILE_CLOSED;
}
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&recv_sys.mutex);
if (!fil_is_user_tablespace_id(info.space_id)) {
found:
/* open the file and return its handle */
@ -5225,7 +5220,7 @@ found:
msg("mariabackup: Cannot open file %s\n", real_name);
}
exit:
mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_unlock(&recv_sys.mutex);
return file;
}

20
mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result

@ -20,7 +20,7 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err
# empty redo log from before MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -28,7 +28,7 @@ COUNT(*)
1
FOUND 1 /InnoDB: Upgrading redo log:/ in mysqld.1.err
# Corrupted multi-file redo log from before MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -36,7 +36,7 @@ COUNT(*)
0
FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err
# Empty multi-file redo log (wrong offset) from before MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -45,7 +45,7 @@ COUNT(*)
FOUND 3 /Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err
# Multi-file redo log with size mismatch from after MariaDB 10.2.2
# Corrupted multi-file redo log from after MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -53,15 +53,16 @@ COUNT(*)
0
FOUND 3 /Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err
FOUND 1 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
COUNT(*)
0
FOUND 1 /InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 1048576 bytes!/ in mysqld.1.err
FOUND 2 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err
# Empty multi-file redo log from after MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -74,6 +75,7 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
FOUND 1 /InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 4194304 bytes!/ in mysqld.1.err
FOUND 1 /InnoDB: Invalid log header checksum/ in mysqld.1.err
# distant future redo log format, with valid header checksum
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption
@ -165,7 +167,7 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
FOUND 3 /\[ERROR\] InnoDB: Upgrade after a crash is not supported\. The redo log was created with MariaDB 10\.3\.1\./ in mysqld.1.err
# Empty 10.3 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -173,7 +175,7 @@ COUNT(*)
1
FOUND 1 /InnoDB: log sequence number 1213964\b.*; transaction id 0/ in mysqld.1.err
# Empty 10.2 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -181,7 +183,7 @@ COUNT(*)
1
FOUND 3 /InnoDB: Upgrading redo log:/ in mysqld.1.err
# Empty 10.5 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');

20
mysql-test/suite/innodb/r/log_corruption.result

@ -20,7 +20,7 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err
# empty redo log from before MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -28,7 +28,7 @@ COUNT(*)
1
FOUND 1 /InnoDB: Upgrading redo log:/ in mysqld.1.err
# Corrupted multi-file redo log from before MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -36,7 +36,7 @@ COUNT(*)
0
FOUND 1 /InnoDB: Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and it appears corrupted/ in mysqld.1.err
# Empty multi-file redo log (wrong offset) from before MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -45,7 +45,7 @@ COUNT(*)
FOUND 3 /Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err
# Multi-file redo log with size mismatch from after MariaDB 10.2.2
# Corrupted multi-file redo log from after MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -53,15 +53,16 @@ COUNT(*)
0
FOUND 3 /Upgrade after a crash is not supported. This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint\./ in mysqld.1.err
FOUND 1 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
COUNT(*)
0
FOUND 1 /InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 1048576 bytes!/ in mysqld.1.err
FOUND 2 /InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\.2\.3\.4\./ in mysqld.1.err
# Empty multi-file redo log from after MariaDB 10.2.2
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -74,6 +75,7 @@ SELECT * FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
FOUND 1 /InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 4194304 bytes!/ in mysqld.1.err
FOUND 1 /InnoDB: Invalid log header checksum/ in mysqld.1.err
# distant future redo log format, with valid header checksum
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption
@ -165,7 +167,7 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
FOUND 3 /\[ERROR\] InnoDB: Upgrade after a crash is not supported\. The redo log was created with MariaDB 10\.3\.1\./ in mysqld.1.err
# Empty 10.3 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -173,7 +175,7 @@ COUNT(*)
1
FOUND 1 /InnoDB: log sequence number 1213964\b.*; transaction id 0/ in mysqld.1.err
# Empty 10.2 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
@ -181,7 +183,7 @@ COUNT(*)
1
FOUND 3 /InnoDB: Upgrading redo log:/ in mysqld.1.err
# Empty 10.5 redo log
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=2m
# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');

2
mysql-test/suite/innodb/r/log_file_size.result

@ -1,5 +1,5 @@
CREATE TABLE t1(a INT PRIMARY KEY) ENGINE=InnoDB;
# restart: --innodb-log-file-size=2m
# restart: --innodb-log-file-size=4m
CHECK TABLE t1;
Table Op Msg_type Msg_text
test.t1 check status OK

22
mysql-test/suite/innodb/t/log_corruption.test

@ -16,7 +16,7 @@ call mtr.add_suppression("InnoDB: Log scan aborted at LSN");
call mtr.add_suppression("InnoDB: Missing MLOG_FILE_NAME or MLOG_FILE_DELETE before MLOG_CHECKPOINT for tablespace 42\\r?$");
call mtr.add_suppression("InnoDB: Obtaining redo log encryption key version 1 failed");
call mtr.add_suppression("InnoDB: Decrypting checkpoint failed");
call mtr.add_suppression("InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 1048576 bytes!");
call mtr.add_suppression("InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files (1048576|4194304) bytes!");
--enable_query_log
let bugdir= $MYSQLTEST_VARDIR/tmp/log_corruption;
@ -161,7 +161,7 @@ die unless seek(OUT, 0x800, 0);
print OUT pack("NnnNx[496]N", 0x80000944, 12, 12, 0, 0xb2a);
close OUT or die;
EOF
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=2m
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m
--source include/start_mysqld.inc
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
@ -191,7 +191,7 @@ print OUT chr(0) x 2048;
close OUT or die;
EOF
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=2m
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m
--source include/start_mysqld.inc
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
@ -210,7 +210,7 @@ print OUT chr(0) x 1536;
close OUT or die;
EOF
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=2m
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m
--source include/start_mysqld.inc
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
@ -241,7 +241,7 @@ close OUT or die;
EOF
--echo # Corrupted multi-file redo log from after MariaDB 10.2.2
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=2m
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m
--source include/start_mysqld.inc
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
@ -259,13 +259,14 @@ print OUT chr(0);
close OUT or die;
EOF
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=2m
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m
--source include/start_mysqld.inc
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
--source include/shutdown_mysqld.inc
--let SEARCH_PATTERN=InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 1048576 bytes!
--source include/search_pattern_in_file.inc
--let SEARCH_PATTERN=InnoDB: No valid checkpoint was found; the log was created with BogoDB 1\\.2\\.3\\.4\\.
--source include/search_pattern_in_file.inc
@ -282,7 +283,7 @@ print OUT $_, pack("N", mycrc32($_, 0, $polynomial));
close OUT or die;
EOF
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=2m
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m
--source include/start_mysqld.inc
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
@ -304,6 +305,8 @@ EOF
--source include/start_mysqld.inc
eval $check_no_innodb;
--source include/shutdown_mysqld.inc
--let SEARCH_PATTERN=InnoDB: Log file .*ib_logfile1 is of different size 2097152 bytes than other log files 4194304 bytes!
--source include/search_pattern_in_file.inc
let SEARCH_PATTERN=InnoDB: Invalid log header checksum;
--source include/search_pattern_in_file.inc
@ -351,6 +354,9 @@ die unless seek(OUT, 0x210, 0);
print OUT pack("NNx[264]", 0, 0x80c);
print OUT pack("NNx[212]N", 0x590dbaac, 0xfe922582, 0xc72d49c4);
close OUT or die;
die unless open OUT, ">", "$ENV{bugdir}/ib_logfile1";
print OUT pack("x[4194304]");
close OUT or die;
EOF
# Anything below innodb_force_recovery=6 must find a valid redo log.
# Missing tablespace files are tolerated already with innodb_force_recovery=1.
@ -533,7 +539,7 @@ print OUT pack("NnnNx[496]N", 0x80000944, 12, 12, 1, 0x46c8a2a2);
close OUT or die;
EOF
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=2m
--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m
--source include/start_mysqld.inc
SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'

2
mysql-test/suite/innodb/t/log_file_size.test

@ -55,7 +55,7 @@ let $check_no_innodb=SELECT * FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
--let $restart_parameters= --innodb-log-file-size=2m
--let $restart_parameters= --innodb-log-file-size=4m
--source include/start_mysqld.inc
let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err;

2
mysql-test/suite/mariabackup/innodb_redo_log_overwrite.opt

@ -1 +1 @@
--loose-innodb-log-file-size=2097152
--loose-innodb-log-file-size=4194304

2
mysql-test/suite/mariabackup/innodb_redo_overwrite.opt

@ -1 +1 @@
--loose-innodb-log-file-size=2m
--loose-innodb-log-file-size=4194304

3
mysql-test/suite/mariabackup/innodb_redo_overwrite.result

@ -1,5 +1,4 @@
CREATE TABLE t(i INT) ENGINE=INNODB;
INSERT INTO t SELECT seq%10 FROM seq_0_to_51199;
CREATE TABLE t ENGINE=INNODB SELECT seq%10 i FROM seq_0_to_204796;
# xtrabackup backup
FOUND 1 /Was only able to copy log from \d+ to \d+, not \d+; try increasing innodb_log_file_size\b/ in backup.log
NOT FOUND /failed: redo log block checksum does not match/ in backup.log

3
mysql-test/suite/mariabackup/innodb_redo_overwrite.test

@ -3,8 +3,7 @@
--source include/have_debug.inc
--source include/have_sequence.inc
CREATE TABLE t(i INT) ENGINE=INNODB;
INSERT INTO t SELECT seq%10 FROM seq_0_to_51199;
CREATE TABLE t ENGINE=INNODB SELECT seq%10 i FROM seq_0_to_204796;
--echo # xtrabackup backup
--let $targetdir=$MYSQLTEST_VARDIR/tmp/backup

2
mysql-test/suite/perfschema/r/sxlock_func.result

@ -10,6 +10,7 @@ name
wait/synch/rwlock/innodb/dict_operation_lock
wait/synch/rwlock/innodb/fil_space_latch
wait/synch/rwlock/innodb/lock_latch
wait/synch/rwlock/innodb/log_latch
wait/synch/rwlock/innodb/trx_i_s_cache_lock
wait/synch/rwlock/innodb/trx_purge_latch
TRUNCATE TABLE performance_schema.events_waits_history_long;
@ -41,6 +42,7 @@ ORDER BY event_name;
event_name
wait/synch/rwlock/innodb/fil_space_latch
wait/synch/rwlock/innodb/lock_latch
wait/synch/rwlock/innodb/log_latch
SELECT event_name FROM performance_schema.events_waits_history_long
WHERE event_name = 'wait/synch/sxlock/innodb/index_tree_rw_lock'
AND operation IN ('try_shared_lock','shared_lock') LIMIT 1;

2
mysql-test/suite/sys_vars/r/sysvars_innodb.result

@ -1011,7 +1011,7 @@ DEFAULT_VALUE 100663296
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BIGINT UNSIGNED
VARIABLE_COMMENT Redo log size in bytes.
NUMERIC_MIN_VALUE 1048576
NUMERIC_MIN_VALUE 4194304
NUMERIC_MAX_VALUE 18446744073709551615
NUMERIC_BLOCK_SIZE 4096
ENUM_VALUE_LIST NULL

68
storage/innobase/buf/buf0flu.cc

@ -115,6 +115,7 @@ static void buf_flush_validate_skip()
/** Wake up the page cleaner if needed */
void buf_pool_t::page_cleaner_wakeup()
{
ut_d(buf_flush_validate_skip());
if (!page_cleaner_idle())
return;
double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
@ -155,7 +156,7 @@ void buf_pool_t::page_cleaner_wakeup()
}
}
inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage)
inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage) noexcept
{
ut_ad(!fsp_is_system_temporary(bpage->id().space()));
mysql_mutex_assert_owner(&flush_list_mutex);
@ -166,13 +167,9 @@ inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage)
/** Insert a modified block into the flush list.
@param block modified block
@param lsn start LSN of the mini-transaction that modified the block */
void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn) noexcept
{
mysql_mutex_assert_not_owner(&mutex);
#ifdef SAFE_MUTEX
if (!recv_recovery_is_on())
mysql_mutex_assert_owner(&log_sys.flush_order_mutex);
#endif /* SAFE_MUTEX */
ut_ad(recv_recovery_is_on() || log_sys.latch.is_locked());
ut_ad(lsn > 2);
static_assert(log_t::FIRST_LSN >= 2, "compatibility");
ut_ad(!fsp_is_system_temporary(block->page.id().space()));
@ -191,16 +188,27 @@ void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
MEM_CHECK_DEFINED(block->page.zip.data
? block->page.zip.data : block->page.frame,
block->physical_size());
UT_LIST_ADD_FIRST(flush_list, &block->page);
ut_d(buf_flush_validate_skip());
page_cleaner_wakeup();
if (buf_page_t *prev= UT_LIST_GET_FIRST(flush_list))
{
if (prev->oldest_modification() <= lsn)
goto insert_first;
while (buf_page_t *next= UT_LIST_GET_NEXT(list, prev))
if (next->oldest_modification() <= lsn)
break;
else
prev= next;
UT_LIST_INSERT_AFTER(flush_list, prev, &block->page);
}
else
insert_first:
UT_LIST_ADD_FIRST(flush_list, &block->page);
mysql_mutex_unlock(&flush_list_mutex);
}
/** Remove a block from flush_list.
@param bpage buffer pool page
@param clear whether to invoke buf_page_t::clear_oldest_modification() */
void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear)
void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear) noexcept
{
delete_from_flush_list_low(bpage);
stat.flush_list_bytes-= bpage->physical_size();
@ -743,7 +751,7 @@ not_compressed:
}
/** Free a page whose underlying file page has been freed. */
inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
inline void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept
{
mysql_mutex_assert_owner(&mutex);
mysql_mutex_lock(&flush_list_mutex);
@ -1696,12 +1704,12 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
#endif
{
n_pending_checkpoint_writes++;
mysql_mutex_unlock(&mutex);
latch.wr_unlock();
/* FIXME: issue an asynchronous write */
log.write(offset, {c, get_block_size()});
if (srv_file_flush_method != SRV_O_DSYNC)
ut_a(log.flush());
mysql_mutex_lock(&mutex);
latch.wr_lock(SRW_LOCK_CALL);
n_pending_checkpoint_writes--;
}
@ -1712,7 +1720,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF,
next_checkpoint_lsn, get_flushed_lsn()));
mysql_mutex_unlock(&mutex);
latch.wr_unlock();
}
/** Initiate a log checkpoint, discarding the start of the log.
@ -1722,7 +1730,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
{
ut_ad(!srv_read_only_mode);
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
ut_ad(oldest_lsn <= end_lsn);
ut_ad(end_lsn == log_sys.get_lsn());
ut_ad(!recv_no_log_write);
@ -1735,7 +1743,7 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
/* Do nothing, because nothing was logged (other than a
FILE_CHECKPOINT record) since the previous checkpoint. */
do_nothing:
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
return true;
}
@ -1748,13 +1756,14 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
It is important that we write out the redo log before any further
dirty pages are flushed to the tablespace files. At this point,
because we hold log_sys.mutex, mtr_t::commit() in other threads will
be blocked, and no pages can be added to the flush lists. */
because we hold exclusive log_sys.latch,
mtr_t::commit() in other threads will be blocked,
and no pages can be added to buf_pool.flush_list. */
const lsn_t flush_lsn{fil_names_clear(oldest_lsn)};
ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
log_write_up_to(flush_lsn, true);
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
if (log_sys.last_checkpoint_lsn >= oldest_lsn)
goto do_nothing;
@ -1763,13 +1772,12 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
if (log_sys.n_pending_checkpoint_writes)
{
/* A checkpoint write is running */
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
return false;
}
log_sys.next_checkpoint_lsn= oldest_lsn;
log_sys.write_checkpoint(end_lsn);
mysql_mutex_assert_not_owner(&log_sys.mutex);
return true;
}
@ -1793,12 +1801,10 @@ static bool log_checkpoint()
fil_flush_file_spaces();
}
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t end_lsn= log_sys.get_lsn();
mysql_mutex_lock(&log_sys.flush_order_mutex);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn);
mysql_mutex_unlock(&log_sys.flush_order_mutex);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
return log_checkpoint_low(oldest_lsn, end_lsn);
}
@ -1835,7 +1841,6 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
{
ut_ad(sync_lsn);
ut_ad(sync_lsn < LSN_MAX);
mysql_mutex_assert_not_owner(&log_sys.mutex);
ut_ad(!srv_read_only_mode);
if (recv_recovery_is_on())
@ -1893,7 +1898,6 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
@param furious true=furious flushing, false=limit to innodb_io_capacity */
ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
{
mysql_mutex_assert_not_owner(&log_sys.mutex);
ut_ad(!srv_read_only_mode);
if (recv_recovery_is_on())
@ -1952,11 +1956,9 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
fil_flush_file_spaces();
}
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t newest_lsn= log_sys.get_lsn();
mysql_mutex_lock(&log_sys.flush_order_mutex);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
mysql_mutex_unlock(&log_sys.flush_order_mutex);
lsn_t measure= buf_pool.get_oldest_modification(0);
const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
@ -1970,13 +1972,11 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
}
else
{
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
if (!measure)
measure= LSN_MAX;
}
mysql_mutex_assert_not_owner(&log_sys.mutex);
/* After attempting log checkpoint, check if we have reached our target. */
const lsn_t target= buf_flush_sync_lsn;

44
storage/innobase/fil/fil0fil.cc

@ -881,18 +881,21 @@ bool fil_space_free(uint32_t id, bool x_latched)
}
if (!recv_recovery_is_on()) {
mysql_mutex_lock(&log_sys.mutex);
}
log_sys.latch.wr_lock(SRW_LOCK_CALL);
mysql_mutex_assert_owner(&log_sys.mutex);
if (space->max_lsn) {
ut_d(space->max_lsn = 0);
fil_system.named_spaces.remove(*space);
}
if (space->max_lsn != 0) {
ut_d(space->max_lsn = 0);
fil_system.named_spaces.remove(*space);
}
log_sys.latch.wr_unlock();
} else {
ut_ad(log_sys.latch.is_write_locked());
if (!recv_recovery_is_on()) {
mysql_mutex_unlock(&log_sys.mutex);
if (space->max_lsn) {
ut_d(space->max_lsn = 0);
fil_system.named_spaces.remove(*space);
}
}
fil_space_free_low(space);
@ -1474,9 +1477,9 @@ static void fil_name_write_rename_low(uint32_t space_id, const char *old_name,
/** Commit a mini-transaction with mtr_t::commit_files() while holding the
redo log lock, then call log_write_up_to() for the LSN that
commit_files() returned.
NOTE(review): this view is a rendered patch; the log_sys.mutex lines are the
removed variant and the log_sys.latch lines are the added variant.
@param mtr mini-transaction to commit */
static void fil_name_commit_durable(mtr_t *mtr)
{
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
auto lsn= mtr->commit_files();
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
/* the lock is released before this call; the second argument is the
durable flag of log_write_up_to() */
log_write_up_to(lsn, true);
}
@ -1647,13 +1650,13 @@ pfs_os_file_t fil_delete_tablespace(uint32_t id)
handle= fil_system.detach(space, true);
mysql_mutex_unlock(&fil_system.mutex);
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
if (space->max_lsn)
{
ut_d(space->max_lsn = 0);
fil_system.named_spaces.remove(*space);
}
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
fil_space_free_low(space);
}
@ -1853,11 +1856,12 @@ static bool fil_rename_tablespace(uint32_t id, const char *old_path,
ut_ad(strchr(new_file_name, '/'));
if (!recv_recovery_is_on()) {
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
}
/* log_sys.mutex is above fil_system.mutex in the latching order */
mysql_mutex_assert_owner(&log_sys.mutex);
/* log_sys.latch is above fil_system.mutex in the latching order */
ut_ad(log_sys.latch.is_write_locked() ||
srv_operation == SRV_OPERATION_RESTORE_DELTA);
mysql_mutex_lock(&fil_system.mutex);
space->release();
ut_ad(node->name == old_file_name);
@ -1880,7 +1884,7 @@ skip_second_rename:
}
if (!recv_recovery_is_on()) {
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
}
mysql_mutex_unlock(&fil_system.mutex);
@ -3031,7 +3035,7 @@ void
fil_names_dirty(
fil_space_t* space)
{
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
ut_ad(recv_recovery_is_on());
ut_ad(log_sys.get_lsn() != 0);
ut_ad(space->max_lsn == 0);
@ -3045,7 +3049,7 @@ fil_names_dirty(
tablespace was modified for the first time since fil_names_clear(). */
ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write()
{
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
ut_d(fil_space_validate_for_mtr_commit(m_user_space));
ut_ad(!m_user_space->max_lsn);
m_user_space->max_lsn= log_sys.get_lsn();
@ -3073,7 +3077,7 @@ lsn_t fil_names_clear(lsn_t lsn)
{
mtr_t mtr;
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
ut_ad(lsn);
ut_ad(log_sys.is_latest());

15
storage/innobase/handler/ha_innodb.cc

@ -242,10 +242,10 @@ static void innodb_max_purge_lag_wait_update(THD *thd, st_mysql_sys_var *,
if (thd_kill_level(thd))
break;
/* Adjust for purge_coordinator_state::refresh() */
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
const lsn_t last= log_sys.last_checkpoint_lsn,
max_age= log_sys.max_checkpoint_age;
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
const lsn_t lsn= log_sys.get_lsn();
if ((lsn - last) / 4 >= max_age / 5)
buf_flush_ahead(last + max_age / 5, false);
@ -534,8 +534,6 @@ mysql_pfs_key_t fts_pll_tokenize_mutex_key;
mysql_pfs_key_t ibuf_bitmap_mutex_key;
mysql_pfs_key_t ibuf_mutex_key;
mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
mysql_pfs_key_t log_sys_mutex_key;
mysql_pfs_key_t log_flush_order_mutex_key;
mysql_pfs_key_t recalc_pool_mutex_key;
mysql_pfs_key_t purge_sys_pq_mutex_key;
mysql_pfs_key_t recv_sys_mutex_key;
@ -571,12 +569,10 @@ static PSI_mutex_info all_innodb_mutexes[] = {
PSI_KEY(fts_cache_init_mutex),
PSI_KEY(fts_delete_mutex),
PSI_KEY(fts_doc_id_mutex),
PSI_KEY(log_flush_order_mutex),
PSI_KEY(ibuf_bitmap_mutex),
PSI_KEY(ibuf_mutex),
PSI_KEY(ibuf_pessimistic_insert_mutex),
PSI_KEY(index_online_log),
PSI_KEY(log_sys_mutex),
PSI_KEY(page_zip_stat_per_index_mutex),
PSI_KEY(purge_sys_pq_mutex),
PSI_KEY(recv_sys_mutex),
@ -603,6 +599,7 @@ mysql_pfs_key_t fil_space_latch_key;
mysql_pfs_key_t trx_i_s_cache_lock_key;
mysql_pfs_key_t trx_purge_latch_key;
mysql_pfs_key_t lock_latch_key;
mysql_pfs_key_t log_latch_key;
/* all_innodb_rwlocks array contains rwlocks that are
performance schema instrumented if "UNIV_PFS_RWLOCK"
@ -617,6 +614,7 @@ static PSI_rwlock_info all_innodb_rwlocks[] =
{ &trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0 },
{ &trx_purge_latch_key, "trx_purge_latch", 0 },
{ &lock_latch_key, "lock_latch", 0 },
{ &log_latch_key, "log_latch", 0 },
{ &index_tree_rw_lock_key, "index_tree_rw_lock", PSI_RWLOCK_FLAG_SX }
};
# endif /* UNIV_PFS_RWLOCK */
@ -949,8 +947,7 @@ static SHOW_VAR innodb_status_variables[]= {
{"buffer_pool_reads",
&export_vars.innodb_buffer_pool_reads, SHOW_SIZE_T},
{"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T},
{"buffer_pool_write_requests",
&export_vars.innodb_buffer_pool_write_requests, SHOW_SIZE_T},
{"buffer_pool_write_requests", &buf_pool.flush_list_requests, SHOW_SIZE_T},
{"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T},
{"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T},
{"data_fsyncs", (size_t*) &os_n_fsyncs, SHOW_SIZE_T},
@ -19235,7 +19232,7 @@ static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size,
static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Redo log size in bytes.",
NULL, NULL, 96 << 20, 1U << 20, std::numeric_limits<ulonglong>::max(), 4096);
NULL, NULL, 96 << 20, 4 << 20, std::numeric_limits<ulonglong>::max(), 4096);
static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct,
PLUGIN_VAR_RQCMDARG,

17
storage/innobase/include/buf0buf.h

@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, 2021, MariaDB Corporation.
Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -1755,6 +1755,9 @@ public:
FlushHp flush_hp;
/** modified blocks (a subset of LRU) */
UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
/** number of blocks ever added to flush_list;
protected by flush_list_mutex */
size_t flush_list_requests;
private:
/** whether the page cleaner needs wakeup from indefinite sleep */
bool page_cleaner_is_idle;
@ -1765,7 +1768,7 @@ public:
pthread_cond_t do_flush_list;
/** @return whether the page cleaner must sleep due to being idle */
bool page_cleaner_idle() const
bool page_cleaner_idle() const noexcept
{
mysql_mutex_assert_owner(&flush_list_mutex);
return page_cleaner_is_idle;
@ -1885,24 +1888,24 @@ public:
private:
/** Remove a block from the flush list. */
inline void delete_from_flush_list_low(buf_page_t *bpage);
inline void delete_from_flush_list_low(buf_page_t *bpage) noexcept;
/** Remove a block from flush_list.
@param bpage buffer pool page
@param clear whether to invoke buf_page_t::clear_oldest_modification() */
void delete_from_flush_list(buf_page_t *bpage, bool clear);
void delete_from_flush_list(buf_page_t *bpage, bool clear) noexcept;
public:
/** Remove a block from flush_list.
@param bpage buffer pool page */
void delete_from_flush_list(buf_page_t *bpage)
void delete_from_flush_list(buf_page_t *bpage) noexcept
{ delete_from_flush_list(bpage, true); }
/** Insert a modified block into the flush list.
@param block modified block
@param lsn start LSN of the mini-transaction that modified the block */
void insert_into_flush_list(buf_block_t *block, lsn_t lsn);
void insert_into_flush_list(buf_block_t *block, lsn_t lsn) noexcept;
/** Free a page whose underlying file page has been freed. */
inline void release_freed_page(buf_page_t *bpage);
inline void release_freed_page(buf_page_t *bpage) noexcept;
private:
/** Temporary memory for page_compressed and encrypted I/O */

29
storage/innobase/include/buf0flu.h

@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2014, 2021, MariaDB Corporation.
Copyright (c) 2014, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -103,33 +103,6 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
@param furious true=furious flushing, false=limit to innodb_io_capacity */
ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
/** Note at mini-transaction commit that a page was modified.
Writes end to FIL_PAGE_LSN in the page frame (and copies those 8 bytes
to the compressed page, if there is one). If the block does not yet have
an oldest_modification() greater than 1, it is either marked as a modified
temporary-tablespace page or inserted into buf_pool.flush_list.
The write request counter is incremented in every case.
@param b      modified block
@param start  LSN passed to buf_pool.insert_into_flush_list()
@param end    LSN to store in FIL_PAGE_LSN
              (presumably the end LSN of the mini-transaction; confirm) */
inline void buf_flush_note_modification(buf_block_t *b, lsn_t start, lsn_t end)
{
  ut_ad(!srv_read_only_mode);
  ut_d(const auto s= b->page.state());
  ut_ad(s > buf_page_t::FREED);
  ut_ad(s < buf_page_t::READ_FIX);
  /* the page LSN must never move backwards */
  ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= end);

  auto *const frame_lsn= b->page.frame + FIL_PAGE_LSN;
  mach_write_to_8(frame_lsn, end);

  if (UNIV_LIKELY_NULL(b->page.zip.data))
  {
    /* keep the compressed copy's FIL_PAGE_LSN in sync with the frame */
    memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data, frame_lsn, 8);
  }

  const lsn_t oldest_modification= b->page.oldest_modification();

  if (oldest_modification <= 1)
  {
    if (fsp_is_system_temporary(b->page.id().space()))
      b->page.set_temp_modified();
    else
      buf_pool.insert_into_flush_list(b, start);
  }
  else
    ut_ad(oldest_modification <= start);

  srv_stats.buf_pool_write_requests.inc();
}
/** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init();

23
storage/innobase/include/fil0fil.h

@ -354,13 +354,10 @@ struct fil_space_t final
/** fil_system.spaces chain node */
fil_space_t *hash;
lsn_t max_lsn;
/*!< LSN of the most recent
fil_names_write_if_was_clean().
Reset to 0 by fil_names_clear().
Protected by log_sys.mutex.
If and only if this is nonzero, the
tablespace will be in named_spaces. */
/** LSN of the most recent fil_names_write_if_was_clean().
Reset to 0 by fil_names_clear(). Protected by exclusive log_sys.latch.
If and only if max_lsn is nonzero, this is in fil_system.named_spaces. */
lsn_t max_lsn;
/** tablespace identifier */
uint32_t id;
/** whether undo tablespace truncation is in progress */
@ -1043,7 +1040,7 @@ struct fil_node_t final
{
/** tablespace containing this file */
fil_space_t *space;
/** file name; protected by fil_system.mutex and log_sys.mutex */
/** file name; protected by fil_system.mutex and exclusive log_sys.latch */
char *name;
/** file handle */
pfs_os_file_t handle;
@ -1434,14 +1431,12 @@ public:
/** nonzero if fil_node_open_file_low() should avoid moving the tablespace
to the end of space_list, for FIFO policy of try_to_close() */
ulint freeze_space_list;
/** list of all tablespaces */
ilist<fil_space_t, space_list_tag_t> space_list;
/*!< list of all file spaces */
/** list of all tablespaces for which a FILE_MODIFY record has been written
since the latest redo log checkpoint.
Protected only by exclusive log_sys.latch. */
ilist<fil_space_t, named_spaces_tag_t> named_spaces;
/*!< list of all file spaces
for which a FILE_MODIFY
record has been written since
the latest redo log checkpoint.
Protected only by log_sys.mutex. */
/** list of all ENCRYPTED=DEFAULT tablespaces that need
to be converted to the current value of innodb_encrypt_tables */

91
storage/innobase/include/log0log.h

@ -37,6 +37,7 @@ Created 12/9/1995 Heikki Tuuri
#include "os0file.h"
#include "span.h"
#include "my_atomic_wrapper.h"
#include "srw_lock.h"
#include <string>
using st_::span;
@ -57,16 +58,6 @@ static inline void delete_log_file(const char* suffix)
os_file_delete_if_exists(innodb_log_file_key, path.c_str(), nullptr);
}
/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_pool.get_oldest_modification().
@param[in] file_size requested innodb_log_file_size
@retval true on success
@retval false if the smallest log is too small to
accommodate the number of OS threads in the database server */
bool
log_set_capacity(ulonglong file_size)
MY_ATTRIBUTE((warn_unused_result));
struct completion_callback;
/** Ensure that the log has been written to the log file up to a given
@ -83,10 +74,10 @@ void log_write_up_to(lsn_t lsn, bool durable,
void log_buffer_flush_to_disk(bool durable= true);
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
ATTRIBUTE_COLD void log_write_and_flush_prepare();
/** Durably write the log up to log_sys.lsn() and release log_sys.mutex. */
/** Durably write the log up to log_sys.get_lsn(). */
ATTRIBUTE_COLD void log_write_and_flush();
/** Make a checkpoint */
@ -202,35 +193,38 @@ private:
preflush buffer pool pages, or initiate a log checkpoint.
This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
std::atomic<bool> check_flush_or_checkpoint_;
public:
/** mutex protecting the log */
MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
/** rw-lock protecting buf */
MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
private:
/** Last written LSN */
lsn_t write_lsn;
public:
/** first free offset within the log buffer in use */
size_t buf_free;
/** recommended maximum size of buf, after which the buffer is flushed */
size_t max_buf_free;
/** mutex that ensures that inserts into buf_pool.flush_list are in
LSN order; allows mtr_t::commit() to release log_sys.mutex earlier */
MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex;
/** log record buffer, written to by mtr_t::commit() */
byte *buf;
/** buffer for writing data to ib_logfile0, or nullptr if is_pmem()
In write_buf(), buf and flush_buf are swapped */
byte *flush_buf;
/** number of write requests (to buf); protected by mutex */
ulint write_to_buf;
/** number of std::swap(buf, flush_buf) and writes from buf to log;
protected by mutex */
protected by latch.wr_lock() */
ulint write_to_log;
/** number of waits in append_prepare() */
ulint waits;
/** innodb_log_buffer_size (size of buf and flush_buf, in bytes) */
size_t buf_size;
private:
/** spin lock protecting lsn, buf_free in append_prepare() */
MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) sspin_lock lsn_lock;
public:
/** first free offset within buf in use; protected by lsn_lock */
Atomic_relaxed<size_t> buf_free;
/** number of write requests (to buf); protected by exclusive lsn_lock */
ulint write_to_buf;
/** number of waits in append_prepare(); protected by lsn_lock */
ulint waits;
/** recommended maximum size of buf, after which the buffer is flushed */
size_t max_buf_free;
/** log file size in bytes, including the header */
lsn_t file_size;
private:
@ -272,11 +266,11 @@ public:
/*!< this is the maximum allowed value
for lsn - last_checkpoint_lsn when a
new query step is started */
/** latest completed checkpoint (protected by log_sys.mutex) */
/** latest completed checkpoint (protected by latch.wr_lock()) */
Atomic_relaxed<lsn_t> last_checkpoint_lsn;
lsn_t next_checkpoint_lsn;
/*!< next checkpoint lsn */
/** next checkpoint number (protected by mutex) */
/** next checkpoint number (protected by latch.wr_lock()) */
ulint next_checkpoint_no;
/** number of pending checkpoint writes */
ulint n_pending_checkpoint_writes;
@ -299,6 +293,9 @@ public:
void close_file();
/** Calculate the checkpoint safety margins. */
static void set_capacity();
/** Read the log sequence number.
@param order memory ordering of the atomic load (relaxed unless specified)
@return the value loaded from lsn */
lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const
{
  return lsn.load(order);
}
/** Assign the log sequence number, using a release store.
@param lsn value to store in this->lsn */
void set_lsn(lsn_t lsn)
{
  this->lsn.store(lsn, std::memory_order_release);
}
@ -310,17 +307,17 @@ public:
/** Initialize the LSN on initial log file creation.
Copies the current LSN to flushed_to_disk_lsn and write_lsn while the
log lock is held.
NOTE(review): this view is a rendered patch; the mysql_mutex_lock/unlock
lines are the removed variant and the latch.wr_lock/wr_unlock lines are
the added variant.
@return the current LSN, as read by get_lsn() */
lsn_t init_lsn() noexcept
{
mysql_mutex_lock(&mutex);
latch.wr_lock(SRW_LOCK_CALL);
const lsn_t lsn{get_lsn()};
flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
write_lsn= lsn;
mysql_mutex_unlock(&mutex);
latch.wr_unlock();
return lsn;
}
void set_recovered_lsn(lsn_t lsn) noexcept
{
mysql_mutex_assert_owner(&mutex);
ut_ad(latch.is_write_locked());
write_lsn= lsn;
this->lsn.store(lsn, std::memory_order_relaxed);
flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
@ -360,20 +357,29 @@ public:
/** @return the log block size in bytes; this definition always reports 512.
NOTE(review): the #endif that follows suggests that another, conditionally
compiled definition exists outside this view. */
static size_t get_block_size()
{
  return 512;
}
#endif
private:
/** Wait in append_prepare() for buffer to become available
@param ex whether log_sys.latch is exclusively locked */
ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept;
public:
/** Reserve space in the log buffer for appending data.
@param size upper limit of the length of the data to append(), in bytes
@return the current LSN */
inline lsn_t append_prepare(size_t size) noexcept;
@tparam pmem log_sys.is_pmem()
@param size total length of the data to append(), in bytes
@param ex whether log_sys.latch is exclusively locked
@return the start LSN and the buffer position for append() */
template<bool pmem>
inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
/** Append a string of bytes to the redo log.
NOTE(review): this view is a rendered patch; the two-parameter signature and
the buf_free-based statements are the removed variant, the three-parameter
signature and the d-based statements are the added variant.
@param d destination; advanced past the copied bytes on return
@param s string of bytes
@param size length of s, in bytes */
void append(const void *s, size_t size) noexcept
void append(byte *&d, const void *s, size_t size) noexcept
{
mysql_mutex_assert_owner(&mutex);
ut_ad(buf_free + size <= (is_pmem() ? file_size : buf_size));
memcpy(buf + buf_free, s, size);
buf_free+= size;
ut_ad(latch.is_locked());
/* the copy must end within buf; the limit is file_size if is_pmem(),
else buf_size */
ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size));
memcpy(d, s, size);
d+= size;
}
/** Set the log file format. */
@ -409,14 +415,15 @@ public:
return START_OFFSET + (lsn - first_lsn) % capacity();
}
/** Write checkpoint information to the log header and release mutex.
/** Write checkpoint information and invoke latch.wr_unlock().
@param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */
inline void write_checkpoint(lsn_t end_lsn) noexcept;
/** Write buf to ib_logfile0 and release mutex.
/** Write buf to ib_logfile0.
@tparam release_latch whether to invoke latch.wr_unlock()
@return new write target
@retval 0 if everything was written */
inline lsn_t write_buf() noexcept;
template<bool release_latch> inline lsn_t write_buf() noexcept;
/** Create the log. */
void create(lsn_t lsn) noexcept;

6
storage/innobase/include/log0recv.h

@ -420,9 +420,9 @@ extern bool recv_no_ibuf_operations;
/** TRUE when recv_init_crash_recovery() has been called. */
extern bool recv_needed_recovery;
#ifdef UNIV_DEBUG
/** TRUE if writing to the redo log (mtr_commit) is forbidden.
Protected by log_sys.mutex. */
extern bool recv_no_log_write;
/** whether writing to the redo log is forbidden;
protected by exclusive log_sys.latch. */
extern bool recv_no_log_write;
#endif /* UNIV_DEBUG */
/** TRUE if buf_page_is_corrupted() should check if the log sequence

8
storage/innobase/include/mtr0mtr.h

@ -107,7 +107,7 @@ struct mtr_t {
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
The caller must hold log_sys.mutex.
The caller must hold exclusive log_sys.latch.
This is to be used at log_checkpoint().
@param checkpoint_lsn the log sequence number of a checkpoint, or 0
@return current LSN */
@ -632,13 +632,15 @@ private:
ATTRIBUTE_NOINLINE void encrypt();
/** Append the redo log records to the redo log buffer.
@param ex whether log_sys.latch is already exclusively locked
@return {start_lsn,flush_ahead} */
std::pair<lsn_t,page_flush_ahead> do_write();
std::pair<lsn_t,page_flush_ahead> do_write(bool ex);
/** Append the redo log records to the redo log buffer.
@param len number of bytes to write
@param ex whether log_sys.latch is exclusively locked
@return {start_lsn,flush_ahead} */
std::pair<lsn_t,page_flush_ahead> finish_write(size_t len);
std::pair<lsn_t,page_flush_ahead> finish_write(size_t len, bool ex);
/** Release the resources */
inline void release_resources();

5
storage/innobase/include/mtr0mtr.inl

@ -49,9 +49,8 @@ mtr_t::memo_push(void* object, mtr_memo_type_t type)
ut_ad(ut_is_2pow(type));
/* If this mtr has x-fixed a clean page then we set
the made_dirty flag. This tells us if we need to
grab log_sys.flush_order_mutex at mtr_t::commit() so that we
can insert the dirtied page into the flush list. */
the made_dirty flag. This tells mtr_t::commit()
to hold log_sys.latch longer. */
if (!m_made_dirty
&& (type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX)) {

4
storage/innobase/include/srv0srv.h

@ -86,9 +86,6 @@ struct srv_stats_t
/** Count the amount of data written in total (in bytes) */
ulint_ctr_1_t data_written;
/** Store the number of write requests issued */
ulint_ctr_1_t buf_pool_write_requests;
/** Number of buffer pool reads that led to the reading of
a disk page */
ulint_ctr_1_t buf_pool_reads;
@ -684,7 +681,6 @@ struct export_var_t{
ulint innodb_buffer_pool_pages_old;
ulint innodb_buffer_pool_read_requests; /*!< buf_pool.stat.n_page_gets */
ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */
ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */
ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/

11
storage/innobase/include/srw_lock.h

@ -1,6 +1,6 @@
/*****************************************************************************
Copyright (c) 2020, 2021, MariaDB Corporation.
Copyright (c) 2020, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -520,3 +520,12 @@ typedef srw_lock_impl<false> srw_lock;
typedef srw_lock_impl<true> srw_spin_lock;
#endif
/** Simple spin lock built on a single atomic word.
The word starts as 0, unlock() resets it to 0, and unlock() asserts
(via ut_ad) that it is nonzero beforehand. */
struct sspin_lock
{
  /** the lock word; 0 after construction and after unlock() */
  std::atomic<uint32_t> word{0};

  /** Acquire the lock. Only declared here; the definition is elsewhere. */
  void lock() noexcept;

  /** Release the lock by storing 0 with release ordering. */
  void unlock() noexcept
  {
    ut_ad(word);
    word.store(0, std::memory_order_release);
  }
};

3
storage/innobase/include/univ.i

@ -517,8 +517,6 @@ extern mysql_pfs_key_t fts_pll_tokenize_mutex_key;
extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
extern mysql_pfs_key_t ibuf_mutex_key;
extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
extern mysql_pfs_key_t log_sys_mutex_key;
extern mysql_pfs_key_t log_flush_order_mutex_key;
extern mysql_pfs_key_t recalc_pool_mutex_key;
extern mysql_pfs_key_t purge_sys_pq_mutex_key;
extern mysql_pfs_key_t recv_sys_mutex_key;
@ -547,5 +545,6 @@ extern mysql_pfs_key_t index_tree_rw_lock_key;
extern mysql_pfs_key_t index_online_log_key;
extern mysql_pfs_key_t trx_sys_rw_lock_key;
extern mysql_pfs_key_t lock_latch_key;
extern mysql_pfs_key_t log_latch_key;
# endif /* UNIV_PFS_RWLOCK */
#endif /* HAVE_PSI_INTERFACE */

126
storage/innobase/log/log0log.cc

@ -74,52 +74,24 @@ log_t log_sys;
#define LOG_BUF_FLUSH_MARGIN ((4 * 4096) /* cf. log_t::append_prepare() */ \
+ (4U << srv_page_size_shift))
/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_pool.get_oldest_modification().
NOTE(review): this view is a rendered patch. The bool log_set_capacity()
signature, its @param/@retval text, the free computation and the error
return are the removed variant; void log_t::set_capacity() and the
statements based on srv_log_file_size are the added variant.
@param[in] file_size requested innodb_log_file_size
@retval true on success
@retval false if the smallest log group is too small to
accommodate the number of OS threads in the database server */
bool
log_set_capacity(ulonglong file_size)
void log_t::set_capacity()
{
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
/* Margin for the free space in the smallest log, before a new query
step which modifies the database, is started */
const size_t LOG_CHECKPOINT_FREE_PER_THREAD = 4U
<< srv_page_size_shift;
const size_t LOG_CHECKPOINT_EXTRA_FREE = 8U << srv_page_size_shift;
lsn_t margin;
ulint free;
lsn_t smallest_capacity = file_size - log_t::START_OFFSET;
lsn_t smallest_capacity = srv_log_file_size - log_t::START_OFFSET;
/* Add extra safety */
smallest_capacity -= smallest_capacity / 10;
/* For each OS thread we must reserve so much free space in the
smallest log group that it can accommodate the log entries produced
by single query steps: running out of free log space is a serious
system error which requires rebooting the database. */
free = LOG_CHECKPOINT_FREE_PER_THREAD * 10
+ LOG_CHECKPOINT_EXTRA_FREE;
if (free >= smallest_capacity / 2) {
sql_print_error("InnoDB: innodb_log_file_size is too small."
" %s", INNODB_PARAMETERS_MSG);
return false;
}
margin = smallest_capacity - free;
margin = margin - margin / 10; /* Add still some extra safety */
/* 48 pages equals the value that the removed lines above computed as free:
LOG_CHECKPOINT_FREE_PER_THREAD * 10 (40 pages)
+ LOG_CHECKPOINT_EXTRA_FREE (8 pages) */
lsn_t margin = smallest_capacity - (48 << srv_page_size_shift);
margin -= margin / 10; /* Add still some extra safety */
/* publish the limits: log_capacity is the reduced capacity,
max_checkpoint_age is the margin, and max_modified_age_async is
7/8 of the margin */
log_sys.log_capacity = smallest_capacity;
log_sys.max_modified_age_async = margin - margin / 8;
log_sys.max_checkpoint_age = margin;
return(true);
}
/** Initialize the redo log subsystem. */
@ -128,14 +100,7 @@ void log_t::create()
ut_ad(this == &log_sys);
ut_ad(!is_initialised());
#if defined(__aarch64__)
mysql_mutex_init(log_sys_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
mysql_mutex_init(
log_flush_order_mutex_key, &flush_order_mutex, MY_MUTEX_INIT_FAST);
#else
mysql_mutex_init(log_sys_mutex_key, &mutex, nullptr);
mysql_mutex_init(log_flush_order_mutex_key, &flush_order_mutex, nullptr);
#endif
latch.SRW_LOCK_INIT(log_latch_key);
/* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */
lsn.store(FIRST_LSN, std::memory_order_relaxed);
@ -272,7 +237,7 @@ void log_t::attach(log_file_t file, os_offset_t size)
void log_t::create(lsn_t lsn) noexcept
{
mysql_mutex_assert_owner(&mutex);
ut_ad(latch.is_write_locked());
ut_ad(!recv_no_log_write);
ut_ad(is_latest());
ut_ad(this == &log_sys);
@ -516,7 +481,6 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra)
inline void log_t::persist(lsn_t lsn) noexcept
{
ut_ad(is_pmem());
mysql_mutex_assert_not_owner(&mutex);
ut_ad(!write_lock.is_owner());
ut_ad(!flush_lock.is_owner());
@ -551,13 +515,13 @@ inline void log_t::persist(lsn_t lsn) noexcept
}
#endif
/** Write buf to ib_logfile0 and release mutex.
/** Write buf to ib_logfile0.
@tparam release_latch whether to invoke latch.wr_unlock()
@return new write target
@retval 0 if everything was written */
inline lsn_t log_t::write_buf() noexcept
template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
{
mysql_mutex_assert_owner(&mutex);
ut_ad(latch.is_write_locked());
ut_ad(!srv_read_only_mode);
ut_ad(!is_pmem());
@ -565,7 +529,8 @@ inline lsn_t log_t::write_buf() noexcept
if (write_lsn >= lsn)
{
mysql_mutex_unlock(&mutex);
if (release_latch)
latch.wr_unlock();
ut_ad(write_lsn == lsn);
}
else
@ -581,31 +546,33 @@ inline lsn_t log_t::write_buf() noexcept
const byte *write_buf{buf};
size_t length{buf_free};
ut_ad(length >= (calc_lsn_offset(write_lsn) & block_size_1));
buf_free&= block_size_1;
ut_ad(buf_free == ((lsn - first_lsn) & block_size_1));
const size_t new_buf_free{length & block_size_1};
buf_free= new_buf_free;
ut_ad(new_buf_free == ((lsn - first_lsn) & block_size_1));
if (buf_free)
if (new_buf_free)
{
#if 0 /* TODO: Pad the last log block with dummy records. */
buf_free= log_pad(lsn, get_block_size() - buf_free,
buf + buf_free, flush_buf);
buf_free= log_pad(lsn, get_block_size() - new_buf_free,
buf + new_buf_free, flush_buf);
... /* TODO: Update the LSN and adjust other code. */
#else
/* The rest of the block will be written as garbage.
(We want to avoid memset() while holding the exclusive latch.)
This block will be overwritten later, once records beyond
the current LSN are generated. */
MEM_MAKE_DEFINED(buf + length, get_block_size() - buf_free);
MEM_MAKE_DEFINED(buf + length, get_block_size() - new_buf_free);
buf[length]= 0; /* allow recovery to catch EOF faster */
length&= ~block_size_1;
memcpy_aligned<16>(flush_buf, buf + length, (buf_free + 15) & ~15);
memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15);
length+= get_block_size();
#endif
}
std::swap(buf, flush_buf);
write_to_log++;
mysql_mutex_unlock(&mutex);
if (release_latch)
latch.wr_unlock();
if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
{
@ -690,8 +657,8 @@ repeat:
if (write_lock.acquire(lsn, durable ? nullptr : callback) ==
group_commit_lock::ACQUIRED)
{
mysql_mutex_lock(&log_sys.mutex);
write_lsn= log_sys.write_buf();
log_sys.latch.wr_lock(SRW_LOCK_CALL);
write_lsn= log_sys.write_buf<true>();
}
else
write_lsn= 0;
@ -718,11 +685,9 @@ void log_buffer_flush_to_disk(bool durable)
log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable);
}
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
ATTRIBUTE_COLD void log_write_and_flush_prepare()
{
mysql_mutex_assert_not_owner(&log_sys.mutex);
if (log_sys.is_pmem())
return;
@ -732,23 +697,18 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare()
group_commit_lock::ACQUIRED);
}
/** Durably write the log and release log_sys.mutex */
/** Durably write the log up to log_sys.get_lsn(). */
ATTRIBUTE_COLD void log_write_and_flush()
{
ut_ad(!srv_read_only_mode);
if (!log_sys.is_pmem())
{
const lsn_t write_lsn{log_sys.write_buf()};
const lsn_t flush_lsn{log_flush(write_lock.value())};
if (write_lsn || flush_lsn)
log_write_up_to(std::max(write_lsn, flush_lsn), true, &dummy_callback);
log_sys.write_buf<false>();
log_flush(write_lock.value());
}
#ifdef HAVE_PMEM
else
{
mysql_mutex_unlock(&log_sys.mutex);
log_sys.persist(log_sys.get_lsn());
}
#endif
}
@ -758,11 +718,7 @@ Tries to establish a big enough margin of free space in the log buffer, such
that a new log entry can be catenated without an immediate need for a flush. */
ATTRIBUTE_COLD static void log_flush_margin()
{
mysql_mutex_lock(&log_sys.mutex);
const bool flush{log_sys.buf_free > log_sys.max_buf_free};
mysql_mutex_unlock(&log_sys.mutex);
if (flush)
if (log_sys.buf_free > log_sys.max_buf_free)
log_buffer_flush_to_disk(false);
}
@ -775,26 +731,27 @@ ATTRIBUTE_COLD static void log_checkpoint_margin()
{
while (log_sys.check_flush_or_checkpoint())
{
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
ut_ad(!recv_no_log_write);
if (!log_sys.check_flush_or_checkpoint())
{
func_exit:
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
return;
}
const lsn_t lsn= log_sys.get_lsn();
const lsn_t checkpoint= log_sys.last_checkpoint_lsn;
const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age;
if (lsn <= sync_lsn)
{
log_sys.set_check_flush_or_checkpoint(false);
goto func_exit;
}
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
/* We must wait to prevent the tail of the log overwriting the head. */
buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
@ -944,9 +901,9 @@ wait_suspend_loop:
}
if (log_sys.is_initialised()) {
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
const ulint n_write = log_sys.n_pending_checkpoint_writes;
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
if (n_write) {
if (srv_print_verbose_log && count > 600) {
@ -987,7 +944,7 @@ wait_suspend_loop:
? SIZE_OF_FILE_CHECKPOINT + 8
: SIZE_OF_FILE_CHECKPOINT;
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
lsn = log_sys.get_lsn();
@ -995,7 +952,7 @@ wait_suspend_loop:
&& lsn != log_sys.last_checkpoint_lsn + sizeof_cp;
ut_ad(lsn >= log_sys.last_checkpoint_lsn);
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
if (lsn_changed) {
goto loop;
@ -1041,7 +998,7 @@ log_print(
double time_elapsed;
time_t current_time;
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
const lsn_t lsn= log_sys.get_lsn();
mysql_mutex_lock(&buf_pool.flush_list_mutex);
@ -1079,7 +1036,7 @@ log_print(
log_sys.n_log_ios_old = log_sys.n_log_ios;
log_sys.last_printout_time = current_time;
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
}
/**********************************************************************//**
@ -1112,8 +1069,7 @@ void log_t::close()
ut_ad(!flush_buf);
#endif
mysql_mutex_destroy(&mutex);
mysql_mutex_destroy(&flush_order_mutex);
latch.destroy();
recv_sys.close();

66
storage/innobase/log/log0recv.cc

@ -62,7 +62,7 @@ recv_sys_t recv_sys;
bool recv_needed_recovery;
#ifdef UNIV_DEBUG
/** TRUE if writing to the redo log (mtr_commit) is forbidden.
Protected by log_sys.mutex. */
Protected by log_sys.latch. */
bool recv_no_log_write = false;
#endif /* UNIV_DEBUG */
@ -2235,7 +2235,9 @@ template<typename source>
inline recv_sys_t::parse_mtr_result recv_sys_t::parse(store_t store, source &l)
noexcept
{
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked() ||
srv_operation == SRV_OPERATION_BACKUP ||
srv_operation == SRV_OPERATION_BACKUP_NO_DEFER);
mysql_mutex_assert_owner(&mutex);
ut_ad(log_sys.next_checkpoint_lsn);
ut_ad(log_sys.is_latest());
@ -2970,17 +2972,23 @@ set_start_lsn:
if (start_lsn) {
ut_ad(end_lsn >= start_lsn);
ut_ad(!block->page.oldest_modification());
mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn);
if (UNIV_LIKELY(frame == block->page.frame)) {
if (UNIV_LIKELY(!block->page.zip.data)) {
mach_write_to_8(srv_page_size
- FIL_PAGE_END_LSN_OLD_CHKSUM
+ frame, end_lsn);
} else {
buf_zip_decompress(block, false);
}
buf_block_modify_clock_inc(block);
buf_flush_note_modification(block, start_lsn, end_lsn);
/* The following is adapted from
buf_pool_t::insert_into_flush_list() */
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_pool.stat.flush_list_bytes+= block->physical_size();
block->page.set_oldest_modification(start_lsn);
UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page);
buf_pool.page_cleaner_wakeup();
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
} else if (free_page && init) {
/* There have been no operations that modify the page.
Any buffered changes must not be merged. A subsequent
@ -3271,9 +3279,6 @@ void recv_sys_t::apply(bool last_batch)
srv_operation == SRV_OPERATION_RESTORE ||
srv_operation == SRV_OPERATION_RESTORE_EXPORT);
#ifdef SAFE_MUTEX
DBUG_ASSERT(!last_batch == mysql_mutex_is_owner(&log_sys.mutex));
#endif /* SAFE_MUTEX */
mysql_mutex_assert_owner(&mutex);
timespec abstime;
@ -3283,15 +3288,15 @@ void recv_sys_t::apply(bool last_batch)
if (is_corrupt_log())
return;
if (last_batch)
{
mysql_mutex_assert_not_owner(&log_sys.mutex);
my_cond_wait(&cond, &mutex.m_mutex);
}
else
{
mysql_mutex_unlock(&mutex);
ut_ad(log_sys.latch.is_write_locked());
log_sys.latch.wr_unlock();
set_timespec_nsec(abstime, 500000000ULL); /* 0.5s */
my_cond_timedwait(&cond, &log_sys.mutex.m_mutex, &abstime);
my_cond_timedwait(&cond, &mutex.m_mutex, &abstime);
mysql_mutex_unlock(&mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
mysql_mutex_lock(&mutex);
}
}
@ -3398,7 +3403,6 @@ next_free_block:
{
if (last_batch)
{
mysql_mutex_assert_not_owner(&log_sys.mutex);
if (!empty)
my_cond_wait(&cond, &mutex.m_mutex);
else
@ -3412,9 +3416,12 @@ next_free_block:
}
else
{
mysql_mutex_unlock(&mutex);
ut_ad(log_sys.latch.is_write_locked());
log_sys.latch.wr_unlock();
set_timespec_nsec(abstime, 500000000ULL); /* 0.5s */
my_cond_timedwait(&cond, &log_sys.mutex.m_mutex, &abstime);
my_cond_timedwait(&cond, &mutex.m_mutex, &abstime);
mysql_mutex_unlock(&mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
mysql_mutex_lock(&mutex);
}
continue;
@ -3432,10 +3439,9 @@ next_free_block:
else
{
mlog_init.reset();
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
}
mysql_mutex_assert_not_owner(&log_sys.mutex);
mysql_mutex_unlock(&mutex);
if (last_batch && srv_operation != SRV_OPERATION_RESTORE &&
@ -3451,7 +3457,7 @@ next_free_block:
if (!last_batch)
{
buf_pool_invalidate();
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
}
#ifdef HAVE_PMEM
else if (log_sys.is_pmem())
@ -3511,7 +3517,7 @@ static bool recv_scan_log(bool last_phase)
for (ut_d(lsn_t source_offset= 0);;)
{
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
#ifdef UNIV_DEBUG
const bool wrap{source_offset + recv_sys.len == log_sys.file_size};
#endif
@ -3868,7 +3874,7 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
static dberr_t recv_rename_files()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
dberr_t err= DB_SUCCESS;
@ -3963,20 +3969,16 @@ dberr_t recv_recovery_from_checkpoint_start()
recv_sys.recovery_on = true;
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
dberr_t err = recv_sys.find_checkpoint();
if (err != DB_SUCCESS) {
early_exit:
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
return err;
}
if (!log_set_capacity(srv_log_file_size)) {
err_exit:
err = DB_ERROR;
goto early_exit;
}
log_sys.set_capacity();
/* Start reading the log from the checkpoint lsn. The variable
contiguous_lsn contains an lsn up to which the log is known to
@ -4103,7 +4105,9 @@ read_only_recovery:
}
if (recv_sys.lsn < log_sys.next_checkpoint_lsn) {
goto err_exit;
err_exit:
err = DB_ERROR;
goto early_exit;
}
if (!srv_read_only_mode && log_sys.is_latest()) {
@ -4142,7 +4146,7 @@ read_only_recovery:
err = recv_rename_files();
}
mysql_mutex_unlock(&recv_sys.mutex);
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
recv_lsn_checks_on = true;

291
storage/innobase/mtr/mtr0mtr.cc

@ -312,16 +312,10 @@ struct DebugCheck {
struct ReleaseBlocks
{
const lsn_t start, end;
#ifdef UNIV_DEBUG
const mtr_buf_t &memo;
ReleaseBlocks(lsn_t start, lsn_t end, const mtr_buf_t &memo) :
start(start), end(end), memo(memo)
#else /* UNIV_DEBUG */
ReleaseBlocks(lsn_t start, lsn_t end, const mtr_buf_t&) :
start(start), end(end)
#endif /* UNIV_DEBUG */
mutable size_t modified;
ReleaseBlocks(lsn_t start, lsn_t end) : start(start), end(end), modified(0)
{
ut_ad(!srv_read_only_mode);
ut_ad(start);
ut_ad(end);
}
@ -340,8 +334,25 @@ struct ReleaseBlocks
return true;
}
buf_flush_note_modification(static_cast<buf_block_t*>(slot->object),
start, end);
modified++;
buf_block_t *b= static_cast<buf_block_t*>(slot->object);
ut_d(const auto s= b->page.state());
ut_ad(s > buf_page_t::FREED);
ut_ad(s < buf_page_t::READ_FIX);
ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= end);
mach_write_to_8(b->page.frame + FIL_PAGE_LSN, end);
if (UNIV_LIKELY_NULL(b->page.zip.data))
memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data,
FIL_PAGE_LSN + b->page.frame, 8);
const lsn_t oldest_modification= b->page.oldest_modification();
if (oldest_modification > 1)
ut_ad(oldest_modification <= start);
else if (fsp_is_system_temporary(b->page.id().space()))
b->page.set_temp_modified();
else
buf_pool.insert_into_flush_list(b, start);
return true;
}
};
@ -403,15 +414,9 @@ void mtr_t::commit()
if (UNIV_LIKELY(m_log_mode == MTR_LOG_ALL))
{
lsns= do_write();
if (m_made_dirty)
mysql_mutex_lock(&log_sys.flush_order_mutex);
/* It is now safe to release log_sys.mutex because the
buf_pool.flush_order_mutex will ensure that we are the first one
to insert into buf_pool.flush_list. */
mysql_mutex_unlock(&log_sys.mutex);
lsns= do_write(false);
if (!m_made_dirty)
log_sys.latch.rd_unlock();
}
else
{
@ -420,7 +425,7 @@ void mtr_t::commit()
m_commit_lsn= log_sys.get_lsn();
lsns= { m_commit_lsn, PAGE_FLUSH_NO };
if (UNIV_UNLIKELY(m_made_dirty)) /* This should be IMPORT TABLESPACE */
mysql_mutex_lock(&log_sys.flush_order_mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
}
if (m_freed_pages)
@ -445,16 +450,23 @@ void mtr_t::commit()
else
ut_ad(!m_freed_space);
m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
(ReleaseBlocks(lsns.first, m_commit_lsn,
m_memo)));
ReleaseBlocks rb{lsns.first, m_commit_lsn};
m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>(rb));
if (m_made_dirty)
mysql_mutex_unlock(&log_sys.flush_order_mutex);
log_sys.latch.rd_unlock();
m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>());
if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
if (rb.modified)
{
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_pool.flush_list_requests+= rb.modified;
buf_pool.page_cleaner_wakeup();
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
}
else
m_memo.for_each_block_in_reverse(CIterate<ReleaseAll>());
@ -523,12 +535,13 @@ void mtr_t::commit_shrink(fil_space_t &space)
ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
log_write_and_flush_prepare();
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const lsn_t start_lsn= do_write().first;
const lsn_t start_lsn= do_write(true).first;
mysql_mutex_lock(&log_sys.flush_order_mutex);
/* Durably write the reduced FSP_SIZE before truncating the data file. */
log_write_and_flush();
ut_ad(log_sys.latch.is_write_locked());
os_file_truncate(space.chain.start->name, space.chain.start->handle,
os_offset_t{space.size} << srv_page_size_shift, true);
@ -557,9 +570,8 @@ void mtr_t::commit_shrink(fil_space_t &space)
m_memo.for_each_block_in_reverse(CIterate<Shrink>{space});
m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
(ReleaseBlocks(start_lsn, m_commit_lsn,
m_memo)));
mysql_mutex_unlock(&log_sys.flush_order_mutex);
(ReleaseBlocks{start_lsn, m_commit_lsn}));
log_sys.latch.wr_unlock();
mysql_mutex_lock(&fil_system.mutex);
ut_ad(space.is_being_truncated);
@ -582,7 +594,7 @@ This is to be used at log_checkpoint().
@return current LSN */
lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
{
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(log_sys.latch.is_write_locked());
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
ut_ad(m_log_mode == MTR_LOG_ALL);
@ -616,7 +628,7 @@ lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
m_crc= 0;
m_log.for_each_block([this](const mtr_buf_t::block_t *b)
{ m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; });
finish_write(size);
finish_write(size, true);
release_resources();
if (checkpoint_lsn)
@ -753,8 +765,6 @@ mtr_t::release_page(const void* ptr, mtr_memo_type_t type)
ut_ad(0);
}
static bool log_margin_warned;
static time_t log_margin_warn_time;
static bool log_close_warned;
static time_t log_close_warn_time;
@ -774,65 +784,73 @@ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t age, lsn_t capacity)
}
}
/** Reserve space in the log buffer for appending data.
@param size upper limit of the length of the data to append(), in bytes
@return the current LSN */
inline lsn_t log_t::append_prepare(size_t size) noexcept
/** Wait in append_prepare() for buffer to become available
@param ex whether log_sys.latch is exclusively locked */
ATTRIBUTE_COLD void log_t::append_prepare_wait(bool ex) noexcept
{
mysql_mutex_assert_owner(&mutex);
lsn_t lsn= get_lsn();
log_sys.waits++;
log_sys.lsn_lock.unlock();
if (UNIV_UNLIKELY(size > log_capacity))
{
time_t t= time(nullptr);
/* return with warning output to avoid deadlock */
if (!log_margin_warned || difftime(t, log_margin_warn_time) > 15)
{
log_margin_warned= true;
log_margin_warn_time= t;
if (ex)
log_sys.latch.wr_unlock();
else
log_sys.latch.rd_unlock();
sql_print_error("InnoDB: innodb_log_file_size is too small "
"for mini-transaction size %zu", size);
}
goto throttle;
}
else if (UNIV_UNLIKELY(lsn + size > last_checkpoint_lsn + log_capacity))
throttle:
set_check_flush_or_checkpoint();
DEBUG_SYNC_C("log_buf_size_exceeded");
log_buffer_flush_to_disk(log_sys.is_pmem());
if (is_pmem())
{
for (ut_d(int count= 50); capacity() - size <
size_t(lsn - flushed_to_disk_lsn.load(std::memory_order_relaxed)); )
{
waits++;
mysql_mutex_unlock(&mutex);
DEBUG_SYNC_C("log_buf_size_exceeded");
log_write_up_to(lsn, true);
ut_ad(count--);
mysql_mutex_lock(&mutex);
lsn= get_lsn();
}
return lsn;
}
if (ex)
log_sys.latch.wr_lock(SRW_LOCK_CALL);
else
log_sys.latch.rd_lock(SRW_LOCK_CALL);
/* Calculate the amount of free space needed. */
size= (4 * 4096) - size + log_sys.buf_size;
log_sys.lsn_lock.lock();
}
for (ut_d(int count= 50); UNIV_UNLIKELY(buf_free > size); )
/** Reserve space in the log buffer for appending data.
@tparam pmem log_sys.is_pmem()
@param size total length of the data to append(), in bytes
@param ex whether log_sys.latch is exclusively locked
@return the start LSN and the buffer position for append() */
template<bool pmem>
inline
std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
{
ut_ad(latch.is_locked());
ut_ad(pmem == is_pmem());
#ifndef _WIN32 // there is no accurate is_write_locked() on SRWLOCK
ut_ad(ex == latch.is_write_locked());
#endif
const lsn_t checkpoint_margin{last_checkpoint_lsn + log_capacity - size};
const size_t avail{(pmem ? size_t(capacity()) : buf_size) - size};
lsn_lock.lock();
write_to_buf++;
for (ut_d(int count= 50);
UNIV_UNLIKELY((pmem
? size_t(get_lsn() -
get_flushed_lsn(std::memory_order_relaxed))
: size_t{buf_free}) > avail); )
{
waits++;
mysql_mutex_unlock(&mutex);
DEBUG_SYNC_C("log_buf_size_exceeded");
log_write_up_to(lsn, false);
append_prepare_wait(ex);
ut_ad(count--);
mysql_mutex_lock(&mutex);
lsn= get_lsn();
}
return lsn;
const lsn_t l{lsn.load(std::memory_order_relaxed)};
lsn.store(l + size, std::memory_order_relaxed);
const size_t b{buf_free};
size_t new_buf_free{b};
new_buf_free+= size;
if (pmem && new_buf_free >= file_size)
new_buf_free-= size_t(capacity());
buf_free= new_buf_free;
lsn_lock.unlock();
if (UNIV_UNLIKELY(l > checkpoint_margin) ||
(!pmem && b >= max_buf_free))
set_check_flush_or_checkpoint();
return {l, &buf[b]};
}
/** Finish appending data to the log.
@ -840,9 +858,7 @@ inline lsn_t log_t::append_prepare(size_t size) noexcept
@return whether buf_flush_ahead() will have to be invoked */
static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept
{
mysql_mutex_assert_owner(&log_sys.mutex);
log_sys.write_to_buf++;
log_sys.set_lsn(lsn);
ut_ad(log_sys.latch.is_locked());
const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn;
@ -859,10 +875,11 @@ static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept
return mtr_t::PAGE_FLUSH_SYNC;
}
std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write(bool ex)
{
ut_ad(!recv_no_log_write);
ut_ad(m_log_mode == MTR_LOG_ALL);
ut_ad(!ex || log_sys.latch.is_write_locked());
size_t len= m_log.size() + 5;
ut_ad(len > 5);
@ -879,86 +896,93 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
{ m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; });
}
mysql_mutex_lock(&log_sys.mutex);
if (!ex)
log_sys.latch.rd_lock(SRW_LOCK_CALL);
if (m_user_space && !is_predefined_tablespace(m_user_space->id) &&
!m_user_space->max_lsn)
name_write();
if (UNIV_UNLIKELY(m_user_space && !m_user_space->max_lsn &&
!is_predefined_tablespace(m_user_space->id)))
{
if (!ex)
{
log_sys.latch.rd_unlock();
log_sys.latch.wr_lock(SRW_LOCK_CALL);
if (UNIV_LIKELY(!m_user_space->max_lsn))
name_write();
std::pair<lsn_t,mtr_t::page_flush_ahead> p{finish_write(len, true)};
log_sys.latch.wr_unlock();
log_sys.latch.rd_lock(SRW_LOCK_CALL);
return p;
}
else
name_write();
}
return finish_write(len);
return finish_write(len, ex);
}
/** Write the mini-transaction log to the redo log buffer.
@param len number of bytes to write
@param ex whether log_sys.latch is exclusively locked
@return {start_lsn,flush_ahead} */
std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::finish_write(size_t len)
std::pair<lsn_t,mtr_t::page_flush_ahead>
mtr_t::finish_write(size_t len, bool ex)
{
ut_ad(!recv_no_log_write);
ut_ad(m_log_mode == MTR_LOG_ALL);
const lsn_t start_lsn= log_sys.append_prepare(len);
const size_t size{m_commit_lsn ? 5U + 8U : 5U};
std::pair<lsn_t, byte*> start;
if (!log_sys.is_pmem())
{
m_log.for_each_block([](const mtr_buf_t::block_t *b)
{ log_sys.append(b->begin(), b->used()); return true; });
if (log_sys.buf_free >= log_sys.max_buf_free)
log_sys.set_check_flush_or_checkpoint();
start= log_sys.append_prepare<false>(len, ex);
m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
{ log_sys.append(start.second, b->begin(), b->used()); return true; });
#ifdef HAVE_PMEM
write_trailer:
#endif
log_sys.buf[log_sys.buf_free]=
log_sys.get_sequence_bit(start_lsn + len - size);
*start.second++= log_sys.get_sequence_bit(start.first + len - size);
if (m_commit_lsn)
{
byte *nonce= log_sys.buf + log_sys.buf_free + 1;
mach_write_to_8(nonce, m_commit_lsn);
m_crc= my_crc32c(m_crc, nonce, 8);
mach_write_to_4(&log_sys.buf[log_sys.buf_free + 9], m_crc);
log_sys.buf_free+= 8 + 5;
}
else
{
mach_write_to_4(&log_sys.buf[log_sys.buf_free + 1], m_crc);
log_sys.buf_free+= 5;
mach_write_to_8(start.second, m_commit_lsn);
m_crc= my_crc32c(m_crc, start.second, 8);
start.second+= 8;
}
mach_write_to_4(start.second, m_crc);
}
#ifdef HAVE_PMEM
else if (UNIV_LIKELY(log_sys.buf_free + len < log_sys.file_size))
{
m_log.for_each_block([](const mtr_buf_t::block_t *b)
{ log_sys.append(b->begin(), b->used()); return true; });
goto write_trailer;
}
else
{
m_log.for_each_block([](const mtr_buf_t::block_t *b)
start= log_sys.append_prepare<true>(len, ex);
if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size]))
{
m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
{ log_sys.append(start.second, b->begin(), b->used()); return true; });
goto write_trailer;
}
m_log.for_each_block([&start](const mtr_buf_t::block_t *b)
{
size_t size{b->used()};
const size_t size_left{log_sys.file_size - log_sys.buf_free};
const size_t size_left(&log_sys.buf[log_sys.file_size] - start.second);
const byte *src= b->begin();
if (size <= size_left)
{
::memcpy(log_sys.buf + log_sys.buf_free, src, size);
log_sys.buf_free+= size;
}
else
if (size > size_left)
{
::memcpy(start.second, src, size_left);
start.second= &log_sys.buf[log_sys.START_OFFSET];
src+= size_left;
size-= size_left;
::memcpy(log_sys.buf + log_sys.buf_free, src, size_left);
::memcpy(log_sys.buf + log_sys.START_OFFSET, src + size_left, size);
log_sys.buf_free= log_sys.START_OFFSET + size;
}
::memcpy(start.second, src, size);
start.second+= size;
return true;
});
const size_t size_left{log_sys.file_size - log_sys.buf_free};
const size_t size_left(&log_sys.buf[log_sys.file_size] - start.second);
if (size_left > size)
goto write_trailer;
byte tail[5 + 8];
tail[0]= log_sys.get_sequence_bit(start_lsn + len - size);
tail[0]= log_sys.get_sequence_bit(start.first + len - size);
if (m_commit_lsn)
{
@ -969,15 +993,14 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::finish_write(size_t len)
else
mach_write_to_4(tail + 1, m_crc);
::memcpy(log_sys.buf + log_sys.buf_free, tail, size_left);
::memcpy(start.second, tail, size_left);
::memcpy(log_sys.buf + log_sys.START_OFFSET, tail + size_left,
size - size_left);
log_sys.buf_free= log_sys.START_OFFSET + (size - size_left);
}
#endif
m_commit_lsn= start_lsn + len;
return {start_lsn, log_close(m_commit_lsn)};
m_commit_lsn= start.first + len;
return {start.first, log_close(m_commit_lsn)};
}
/** Find out whether a block was not X-latched by the mini-transaction */

7
storage/innobase/srv/srv0mon.cc

@ -1371,6 +1371,7 @@ corresponding monitors are turned on/off/reset, and do appropriate
mathematics to deduct the actual value. Please also refer to
srv_export_innodb_status() for related global counters used by
the existing status variables.*/
TPOOL_SUPPRESS_TSAN
void
srv_mon_process_existing_counter(
/*=============================*/
@ -1405,7 +1406,7 @@ srv_mon_process_existing_counter(
/* innodb_buffer_pool_write_requests, the number of
write requests */
case MONITOR_OVLD_BUF_POOL_WRITE_REQUEST:
value = srv_stats.buf_pool_write_requests;
value = buf_pool.flush_list_requests;
break;
/* innodb_buffer_pool_wait_free */
@ -1714,10 +1715,10 @@ srv_mon_process_existing_counter(
break;
case MONITOR_LSN_CHECKPOINT_AGE:
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
value = static_cast<mon_type_t>(log_sys.get_lsn()
- log_sys.last_checkpoint_lsn);
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
break;
case MONITOR_OVLD_BUF_OLDEST_LSN:

11
storage/innobase/srv/srv0srv.cc

@ -1024,9 +1024,6 @@ srv_export_innodb_status(void)
export_vars.innodb_buffer_pool_read_requests
= buf_pool.stat.n_page_gets;
export_vars.innodb_buffer_pool_write_requests =
srv_stats.buf_pool_write_requests;
export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads;
export_vars.innodb_buffer_pool_read_ahead_rnd =
@ -1167,13 +1164,13 @@ srv_export_innodb_status(void)
mysql_mutex_unlock(&srv_innodb_monitor_mutex);
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
export_vars.innodb_lsn_current = log_sys.get_lsn();
export_vars.innodb_lsn_flushed = log_sys.get_flushed_lsn();
export_vars.innodb_lsn_last_checkpoint = log_sys.last_checkpoint_lsn;
export_vars.innodb_checkpoint_max_age = static_cast<ulint>(
log_sys.max_checkpoint_age);
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
export_vars.innodb_os_log_written = export_vars.innodb_lsn_current
- recv_sys.lsn;
@ -1818,10 +1815,10 @@ void purge_coordinator_state::refresh(bool full)
lsn_hwm= adaptive_purge_threshold + series[n_threads];
}
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.rd_lock(SRW_LOCK_CALL);
const lsn_t last= log_sys.last_checkpoint_lsn,
max_age= log_sys.max_checkpoint_age;
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.rd_unlock();
lsn_age_factor= ulint(((log_sys.get_lsn() - last) * 100) / max_age);
}

18
storage/innobase/srv/srv0start.cc

@ -195,12 +195,8 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
DBUG_ASSERT(!buf_pool.any_io_pending());
mysql_mutex_lock(&log_sys.mutex);
if (!log_set_capacity(srv_log_file_size)) {
err_exit:
mysql_mutex_unlock(&log_sys.mutex);
return DB_ERROR;
}
log_sys.latch.wr_lock(SRW_LOCK_CALL);
log_sys.set_capacity();
logfile0 = get_log_file_path(LOG_FILE_NAME_PREFIX)
.append(INIT_LOG_FILE0);
@ -213,7 +209,9 @@ err_exit:
if (!ret) {
sql_print_error("InnoDB: Cannot create %s", logfile0.c_str());
goto err_exit;
err_exit:
log_sys.latch.wr_unlock();
return DB_ERROR;
}
ret = os_file_set_size(logfile0.c_str(), file, srv_log_file_size);
@ -244,7 +242,7 @@ err_exit:
/* Enable checkpoints in buf_flush_page_cleaner(). */
recv_sys.recovery_on = false;
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
log_make_checkpoint();
log_buffer_flush_to_disk();
@ -801,7 +799,7 @@ static lsn_t srv_prepare_to_delete_redo_log_file()
DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
mysql_mutex_lock(&log_sys.mutex);
log_sys.latch.wr_lock(SRW_LOCK_CALL);
const bool latest_format{log_sys.is_latest()};
lsn_t flushed_lsn{log_sys.get_lsn()};
@ -846,7 +844,7 @@ same_size:
}
}
mysql_mutex_unlock(&log_sys.mutex);
log_sys.latch.wr_unlock();
log_write_up_to(flushed_lsn, false);

9
storage/innobase/sync/srw_lock.cc

@ -1,6 +1,6 @@
/*****************************************************************************
Copyright (c) 2020, 2021, MariaDB Corporation.
Copyright (c) 2020, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -102,6 +102,13 @@ static inline void srw_pause(unsigned delay)
HMT_medium();
}
/** Acquire the spin lock, busy-waiting until it becomes available. */
void sspin_lock::lock() noexcept
{
  for (;;)
  {
    /* Try to take the lock: exchange() returns the previous value,
    so false means that this call is the one that set the word. */
    if (!word.exchange(true, std::memory_order_acquire))
      return;
    /* The word was already set. Poll it with relaxed loads, pausing
    between polls, and retry the exchange only after the word has
    been observed clear. */
    while (word.load(std::memory_order_relaxed))
      srw_pause(1);
  }
}
#ifdef SUX_LOCK_GENERIC
template<> void srw_mutex_impl<true>::wr_wait()
{

Loading…
Cancel
Save