From 7c767a30a708e58b537568412da581400dddd1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 27 Mar 2017 18:58:16 +0300 Subject: [PATCH] MDEV-10139 Support for InnoDB SEQUENCE objects We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works for tables that have a single index. Apart from disabling undo logging, this flag will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID, and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer is guaranteeing mutual exclusion. After the initial insert of the single record during CREATE SEQUENCE, InnoDB will be updating the single record in-place. This is crash-safe thanks to the redo log. (That is, after a crash after CREATE SEQUENCE was committed, the effect of sequence operations will be observable fully or not at all.) When it comes to the durability of the updates of SEQUENCE in InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT. The updates would be made persistent by the InnoDB redo log flush at transaction commit or rollback (or XA PREPARE), provided that innodb_flush_log_at_trx_commit=1. Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE in the middle of a transaction becomes durable before the COMMIT/ROLLBACK of the transaction, in case the InnoDB redo log is being flushed as a result of a commit or rollback of some other transaction, or as a result of a redo log checkpoint that can be initiated at any time by operations that are writing redo log. dict_table_t::no_rollback(): Check if the table does not support rollback. BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables. DICT_TF_BITS: Add the NO_ROLLBACK flag. row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip any locking for no-rollback tables. There will be only a single row in no-rollback tables (or there must be a proper PRIMARY KEY). row_search_mvcc(): Execute the READ UNCOMMITTED code path for no-rollback tables. 
ha_innobase::external_lock(), ha_innobase::store_lock(): Block CREATE/DROP SEQUENCE in innodb_read_only mode. This probably has no effect for CREATE SEQUENCE, because ha_innobase::create() should already have been called (and refused) before external_lock() or store_lock() is called. ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any InnoDB locks, even though TL_WRITE is being requested. (This is just a performance optimization.) innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql(): Disable persistent statistics for no_rollback tables. --- storage/innobase/dict/dict0dict.cc | 1 + storage/innobase/handler/ha_innodb.cc | 49 +++++++++++-------- storage/innobase/include/btr0cur.h | 5 ++ storage/innobase/include/dict0mem.h | 28 ++++++++--- storage/innobase/row/row0ins.cc | 26 ++++++++-- storage/innobase/row/row0mysql.cc | 2 +- storage/innobase/row/row0sel.cc | 10 ++-- storage/innobase/row/row0upd.cc | 6 +-- .../rocksdb/r/tbl_opt_data_index_dir.result | 4 +- 9 files changed, 90 insertions(+), 41 deletions(-) diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index d7fcbdf3906..0db194b7e52 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2502,6 +2502,7 @@ dict_index_add_to_cache_w_vcol( ut_d(mem_heap_validate(index->heap)); ut_a(!dict_index_is_clust(index) || UT_LIST_GET_LEN(table->indexes) == 0); + ut_ad(dict_index_is_clust(index) || !table->no_rollback()); if (!dict_index_find_cols(table, index, add_v)) { diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index bb8f4c6ff41..92792326398 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -3114,7 +3114,8 @@ innobase_copy_frm_flags_from_create_info( ibool ps_on; ibool ps_off; - if (dict_table_is_temporary(innodb_table)) { + if (dict_table_is_temporary(innodb_table) + || innodb_table->no_rollback()) { /* Temp tables do not use 
persistent stats. */ ps_on = FALSE; ps_off = TRUE; @@ -12909,6 +12910,10 @@ index_bad: default_compression_level : static_cast(options->page_compression_level), 0); + if (m_form->s->table_type == TABLE_TYPE_SEQUENCE) { + m_flags |= 1U << DICT_TF_POS_NO_ROLLBACK; + } + /* Set the flags2 when create table or alter tables */ m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", @@ -13539,6 +13544,10 @@ ha_innobase::create( trx_t* trx; DBUG_ENTER("ha_innobase::create"); + DBUG_ASSERT(form->s == table_share); + DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE + || table_share->table_type == TABLE_TYPE_NORMAL); + create_table_info_t info(ha_thd(), form, create_info, @@ -16489,24 +16498,23 @@ ha_innobase::external_lock( } /* Check for UPDATEs in read-only mode. */ - if (srv_read_only_mode - && (thd_sql_command(thd) == SQLCOM_UPDATE - || thd_sql_command(thd) == SQLCOM_INSERT - || thd_sql_command(thd) == SQLCOM_REPLACE - || thd_sql_command(thd) == SQLCOM_DROP_TABLE - || thd_sql_command(thd) == SQLCOM_ALTER_TABLE - || thd_sql_command(thd) == SQLCOM_OPTIMIZE - || (thd_sql_command(thd) == SQLCOM_CREATE_TABLE - && lock_type == F_WRLCK) - || thd_sql_command(thd) == SQLCOM_CREATE_INDEX - || thd_sql_command(thd) == SQLCOM_DROP_INDEX - || thd_sql_command(thd) == SQLCOM_DELETE)) { - - if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE) { - ib_senderrf(thd, IB_LOG_LEVEL_WARN, - ER_READ_ONLY_MODE); - DBUG_RETURN(HA_ERR_TABLE_READONLY); - } else { + if (srv_read_only_mode) { + switch (thd_sql_command(thd)) { + case SQLCOM_CREATE_TABLE: + if (lock_type != F_WRLCK) { + break; + } + case SQLCOM_UPDATE: + case SQLCOM_INSERT: + case SQLCOM_REPLACE: + case SQLCOM_DROP_TABLE: + case SQLCOM_ALTER_TABLE: + case SQLCOM_OPTIMIZE: + case SQLCOM_CREATE_INDEX: + case SQLCOM_DROP_INDEX: + case SQLCOM_CREATE_SEQUENCE: + case SQLCOM_DROP_SEQUENCE: + case SQLCOM_DELETE: ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); 
DBUG_RETURN(HA_ERR_TABLE_READONLY); @@ -17433,7 +17441,8 @@ ha_innobase::store_lock( /* Use consistent read for checksum table */ if (sql_command == SQLCOM_CHECKSUM - || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ) + || sql_command == SQLCOM_CREATE_SEQUENCE + || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ) || ((srv_locks_unsafe_for_binlog || trx->isolation_level <= TRX_ISO_READ_COMMITTED) && trx->isolation_level != TRX_ISO_SERIALIZABLE diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index fee7f375cb4..e1f5286e122 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -42,6 +42,11 @@ enum { /** sys fields will be found in the update vector or inserted entry */ BTR_KEEP_SYS_FLAG = 4, + + /** no rollback */ + BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG, + /** btr_cur_pessimistic_update() must keep cursor position when moving columns to big_rec */ BTR_KEEP_POS_FLAG = 8, diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 0630137bb4f..9b87e654b21 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -147,17 +147,20 @@ Width of the page compression flag #define DICT_TF_WIDTH_PAGE_COMPRESSION 1 #define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 +/** +Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 + /** Width of the page encryption flag */ #define DICT_TF_WIDTH_PAGE_ENCRYPTION 1 #define DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY 8 -/** -Width of atomic writes flag -DEFAULT=0, ON = 1, OFF = 2 -*/ -#define DICT_TF_WIDTH_ATOMIC_WRITES 2 +/** Width of the NO_ROLLBACK flag */ +#define DICT_TF_WIDTH_NO_ROLLBACK 1 /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ @@ -169,7 +172,8 @@ DEFAULT=0, ON = 1, OFF = 2 + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + DICT_TF_WIDTH_ATOMIC_WRITES \ + 
DICT_TF_WIDTH_PAGE_ENCRYPTION \ - + DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY) + + DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY \ + + DICT_TF_WIDTH_NO_ROLLBACK) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0U << DICT_TF_BITS)) @@ -203,9 +207,11 @@ DEFAULT=0, ON = 1, OFF = 2 /** Zero relative shift position of the PAGE_ENCRYPTION_KEY field */ #define DICT_TF_POS_PAGE_ENCRYPTION_KEY (DICT_TF_POS_PAGE_ENCRYPTION \ + DICT_TF_WIDTH_PAGE_ENCRYPTION) -#define DICT_TF_POS_UNUSED (DICT_TF_POS_PAGE_ENCRYPTION_KEY \ +/** Zero relative shift position of the NO_ROLLBACK field */ +#define DICT_TF_POS_NO_ROLLBACK (DICT_TF_POS_PAGE_ENCRYPTION_KEY \ + DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY) - +#define DICT_TF_POS_UNUSED (DICT_TF_POS_NO_ROLLBACK \ + + DICT_TF_WIDTH_NO_ROLLBACK) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ ((~(~0U << DICT_TF_WIDTH_COMPACT)) \ @@ -1357,6 +1363,12 @@ struct dict_table_t { /** Release the table handle. */ inline void release(); + /** @return whether the table does not support rollback */ + bool no_rollback() const + { + return flags & (1U << DICT_TF_POS_NO_ROLLBACK); + } + /** Id of the table. */ table_id_t id; diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index 9626645ebf2..f596f3d8f27 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -3169,7 +3169,7 @@ row_ins_clust_index_entry( log_free_check(); const ulint flags = dict_table_is_temporary(index->table) ? BTR_NO_LOCKING_FLAG - : 0; + : index->table->no_rollback() ? BTR_NO_ROLLBACK : 0; err = row_ins_clust_index_entry_low( flags, BTR_MODIFY_LEAF, index, n_uniq, entry, @@ -3703,7 +3703,27 @@ row_ins_step( table during the search operation, and there is no need to set it again here. But we must write trx->id to node->trx_id_buf. */ - memset(node->trx_id_buf, 0, DATA_TRX_ID_LEN); + if (node->table->no_rollback()) { + /* No-rollback tables should only be accessed by a + single thread at a time. 
Concurrency control (mutual + exclusion) must be guaranteed by the SQL layer. */ + DBUG_ASSERT(node->table->n_ref_count == 1); + DBUG_ASSERT(node->ins_type == INS_DIRECT); + /* No-rollback tables can consist only of a single index. */ + DBUG_ASSERT(UT_LIST_GET_LEN(node->entry_list) == 1); + DBUG_ASSERT(UT_LIST_GET_LEN(node->table->indexes) == 1); + /* There should be no possibility for interruption and + restarting here. In theory, we could allow resumption + from the INS_NODE_INSERT_ENTRIES state here. */ + DBUG_ASSERT(node->state == INS_NODE_SET_IX_LOCK); + memset(node->trx_id_buf, 0, DATA_TRX_ID_LEN); + memset(node->row_id_buf, 0, DATA_ROW_ID_LEN); + node->index = dict_table_get_first_index(node->table); + node->entry = UT_LIST_GET_FIRST(node->entry_list); + node->state = INS_NODE_INSERT_ENTRIES; + goto do_insert; + } + trx_write_trx_id(node->trx_id_buf, trx->id); if (node->state == INS_NODE_SET_IX_LOCK) { @@ -3753,7 +3773,7 @@ same_trx: return(thr); } - +do_insert: /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ err = row_ins(node, thr); diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 8b7c64868b8..03d7ac628a7 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -3696,7 +3696,7 @@ row_drop_table_for_mysql( RemoteDatafile::delete_link_file(name); } - if (!dict_table_is_temporary(table)) { + if (!dict_table_is_temporary(table) && !table->no_rollback()) { dict_stats_recalc_pool_del(table); dict_stats_defrag_pool_del(table, NULL); diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 229bd567c48..106845f73fa 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -4130,9 +4130,10 @@ row_search_mvcc( ulint direction) { DBUG_ENTER("row_search_mvcc"); + DBUG_ASSERT(prebuilt->index->table == prebuilt->table); dict_index_t* index = prebuilt->index; - ibool comp = dict_table_is_comp(index->table); + ibool comp = 
dict_table_is_comp(prebuilt->table); const dtuple_t* search_tuple = prebuilt->search_tuple; btr_pcur_t* pcur = prebuilt->pcur; trx_t* trx = prebuilt->trx; @@ -4514,7 +4515,7 @@ row_search_mvcc( que_thr_move_to_run_state_for_mysql(thr, trx); - clust_index = dict_table_get_first_index(index->table); + clust_index = dict_table_get_first_index(prebuilt->table); /* Do some start-of-statement preparations */ @@ -4543,7 +4544,7 @@ row_search_mvcc( prebuilt->sql_stat_start = FALSE; } else { wait_table_again: - err = lock_table(0, index->table, + err = lock_table(0, prebuilt->table, prebuilt->select_lock_type == LOCK_S ? LOCK_IS : LOCK_IX, thr); @@ -5072,7 +5073,8 @@ no_gap_lock: /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ - if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED + || prebuilt->table->no_rollback()) { /* Do nothing: we let a non-locking SELECT read the latest version of the record */ diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 18ea3cf3cf8..92af465aa49 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -2260,7 +2260,7 @@ row_upd_sec_index_entry( flags = BTR_NO_LOCKING_FLAG; mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - flags = 0; + flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0; } if (!index->is_committed()) { @@ -3046,11 +3046,11 @@ row_upd_clust_step( server or connection lifetime and so REDO information is not needed on restart for recovery. Disable locking as temp-tables are not shared across connection. */ - if (dict_table_is_temporary(index->table)) { + if (dict_table_is_temporary(node->table)) { flags = BTR_NO_LOCKING_FLAG; mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - flags = 0; + flags = node->table->no_rollback() ? 
BTR_NO_ROLLBACK : 0; } /* If the restoration does not succeed, then the same diff --git a/storage/rocksdb/mysql-test/rocksdb/r/tbl_opt_data_index_dir.result b/storage/rocksdb/mysql-test/rocksdb/r/tbl_opt_data_index_dir.result index d1e445f734c..37797c7a8da 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/tbl_opt_data_index_dir.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/tbl_opt_data_index_dir.result @@ -3,14 +3,14 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b CHAR(8)) ENGINE=rocksdb DATA DIRECTORY = ' ERROR HY000: Can't create table `test`.`t1` (errno: 140 "Wrong create options") show warnings; Level Code Message -Warning 1296 Got error 198 'Specifying DATA DIRECTORY for an individual table is not supported.' from ROCKSDB +Warning 1296 Got error 200 'Specifying DATA DIRECTORY for an individual table is not supported.' from ROCKSDB Error 1005 Can't create table `test`.`t1` (errno: 140 "Wrong create options") Warning 1030 Got error 140 "Wrong create options" from storage engine ROCKSDB CREATE TABLE t1 (a INT PRIMARY KEY, b CHAR(8)) ENGINE=rocksdb INDEX DIRECTORY = '/foo/bar/index'; ERROR HY000: Can't create table `test`.`t1` (errno: 140 "Wrong create options") show warnings; Level Code Message -Warning 1296 Got error 199 'Specifying INDEX DIRECTORY for an individual table is not supported.' from ROCKSDB +Warning 1296 Got error 201 'Specifying INDEX DIRECTORY for an individual table is not supported.' from ROCKSDB Error 1005 Can't create table `test`.`t1` (errno: 140 "Wrong create options") Warning 1030 Got error 140 "Wrong create options" from storage engine ROCKSDB CREATE TABLE t1 (id INT NOT NULL PRIMARY KEY) ENGINE=rocksdb PARTITION BY RANGE (id)