You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

4082 lines
100 KiB

branches/innodb+: Merge revisions 5144:5524 from branches/zip ------------------------------------------------------------------------ r5147 | marko | 2009-05-27 06:55:14 -0400 (Wed, 27 May 2009) | 1 line branches/zip: ibuf0ibuf.c: Improve a comment. ------------------------------------------------------------------------ r5149 | marko | 2009-05-27 07:46:42 -0400 (Wed, 27 May 2009) | 34 lines branches/zip: Merge revisions 4994:5148 from branches/5.1: ------------------------------------------------------------------------ r5126 | vasil | 2009-05-26 16:57:12 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Non-functional change: move FSP_* macros from fsp0fsp.h to a new file fsp0types.h. This is needed in order to be able to use FSP_EXTENT_SIZE in mtr0log.ic. ------------------------------------------------------------------------ r5127 | vasil | 2009-05-26 17:05:43 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not include unnecessary headers mtr0log.h and fut0lst.h in trx0sys.h and include fsp0fsp.h just before it is needed. This is needed in order to be able to use TRX_SYS_SPACE in mtr0log.ic. ------------------------------------------------------------------------ r5128 | vasil | 2009-05-26 17:26:37 +0300 (Tue, 26 May 2009) | 7 lines branches/5.1: Fix Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not write redo log for the pages in the doublewrite buffer. Also, do not make a dummy change to the page because this is not needed. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5169 | marko | 2009-05-28 03:21:55 -0400 (Thu, 28 May 2009) | 1 line branches/zip: mtr0mtr.h: Add Doxygen comments for the redo log entry types. ------------------------------------------------------------------------ r5176 | marko | 2009-05-28 07:14:02 -0400 (Thu, 28 May 2009) | 1 line branches/zip: Correct a debug assertion that was added in r5125. ------------------------------------------------------------------------ r5201 | marko | 2009-06-01 06:35:25 -0400 (Mon, 01 Jun 2009) | 2 lines branches/zip: Clean up some comments. Make the rec parameter of mlog_open_and_write_index() const. ------------------------------------------------------------------------ r5234 | marko | 2009-06-03 08:26:41 -0400 (Wed, 03 Jun 2009) | 44 lines branches/zip: Merge revisions 5148:5233 from branches/5.1: ------------------------------------------------------------------------ r5150 | vasil | 2009-05-27 18:56:03 +0300 (Wed, 27 May 2009) | 4 lines branches/5.1: Whitespace fixup. ------------------------------------------------------------------------ r5191 | vasil | 2009-05-30 17:46:05 +0300 (Sat, 30 May 2009) | 19 lines branches/5.1: Merge a change from MySQL (this fixes the failing innodb_mysql test): ------------------------------------------------------------ revno: 1810.3894.10 committer: Sergey Glukhov <Sergey.Glukhov@sun.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-05-19 11:32:21 +0500 message: Bug#39793 Foreign keys not constructed when column has a '#' in a comment or default value Internal InnoDN FK parser does not recognize '\'' as quotation symbol. Suggested fix is to add '\'' symbol check for quotation condition (dict_strip_comments() function). modified: innobase/dict/dict0dict.c mysql-test/r/innodb_mysql.result mysql-test/t/innodb_mysql.test ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5250 | marko | 2009-06-04 02:58:23 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add Doxygen comments to the rest of buf0*. ------------------------------------------------------------------------ r5251 | marko | 2009-06-04 02:59:51 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Replace <= in a function comment. ------------------------------------------------------------------------ r5253 | marko | 2009-06-04 06:37:35 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add missing Doxygen comments for page0zip. ------------------------------------------------------------------------ r5261 | vasil | 2009-06-05 11:13:31 -0400 (Fri, 05 Jun 2009) | 15 lines branches/zip: Fix Mantis Issue#244 fix bug in linear read ahead (no check on access pattern) The changes are: 1) Take into account access pattern when deciding whether or not to do linear read ahead. 2) Expose a knob innodb_read_ahead_factor = [0-64] default (8), dynamic, global to control linear read ahead behvior 3) Disable random read ahead. Keep the code for now. Submitted by: Inaam (rb://122) Approved by: Heikki (rb://122) ------------------------------------------------------------------------ r5262 | vasil | 2009-06-05 12:04:25 -0400 (Fri, 05 Jun 2009) | 22 lines branches/zip: Enable functionality to have multiple background io helper threads. This patch is based on percona contributions. More details about this patch will be written at: https://svn.innodb.com/innobase/MultipleBackgroundThreads The patch essentially does the following: expose following knobs: innodb_read_io_threads = [1 - 64] default 1 innodb_write_io_threads = [1 - 64] default 1 deprecate innodb_file_io_threads (this parameter was relevant only on windows) Internally it allows multiple segments for read and write IO request arrays where one thread works on one segement. Submitted by: Inaam (rb://124) Approved by: Heikki (rb://124) ------------------------------------------------------------------------ r5263 | vasil | 2009-06-05 12:19:37 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Whitespace cleanup. ------------------------------------------------------------------------ r5264 | vasil | 2009-06-05 12:26:58 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5261. ------------------------------------------------------------------------ r5265 | vasil | 2009-06-05 12:34:11 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5262. ------------------------------------------------------------------------ r5268 | inaam | 2009-06-08 12:18:21 -0400 (Mon, 08 Jun 2009) | 7 lines branches/zip Non functional change: Added legal notices acknowledging percona contribution to the multiple IO helper threads patch i.e.: r5262 ------------------------------------------------------------------------ r5283 | inaam | 2009-06-09 13:46:29 -0400 (Tue, 09 Jun 2009) | 9 lines branches/zip rb://130 Enable Group Commit functionality that was broken in 5.0 when distributed transactions were introduced. Reviewed by: Heikki ------------------------------------------------------------------------ r5319 | marko | 2009-06-11 04:40:33 -0400 (Thu, 11 Jun 2009) | 3 lines branches/zip: Declare os_thread_id_t as unsigned long, because ulint is wrong on Win64. Pointed out by Vladislav Vaintroub <wlad@sun.com>. ------------------------------------------------------------------------ r5320 | inaam | 2009-06-11 09:15:41 -0400 (Thu, 11 Jun 2009) | 14 lines branches/zip rb://131 This patch changes the following defaults: max_dirty_pages_pct: default from 90 to 75. max allowed from 100 to 99 additional_mem_pool_size: default from 1 to 8 MB buffer_pool_size: default from 8 to 128 MB log_buffer_size: default from 1 to 8 MB read_io_threads/write_io_threads: default from 1 to 4 The log file sizes are untouched because of upgrade issues Reviewed by: Heikki ------------------------------------------------------------------------ r5330 | marko | 2009-06-16 04:08:59 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_gen(): Reduce mutex holding time by adjusting buf_pool->n_pend_unzip while only holding buf_pool_mutex. ------------------------------------------------------------------------ r5331 | marko | 2009-06-16 05:00:48 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Eliminate a buf_page_get_mutex() call. The function must switch on the block state anyway. ------------------------------------------------------------------------ r5332 | vasil | 2009-06-16 05:03:27 -0400 (Tue, 16 Jun 2009) | 4 lines branches/zip: Add ChangeLog entries for r5283 and r5320. ------------------------------------------------------------------------ r5333 | marko | 2009-06-16 05:27:46 -0400 (Tue, 16 Jun 2009) | 1 line branches/zip: buf_page_io_query(): Remove unused function. ------------------------------------------------------------------------ r5335 | marko | 2009-06-16 09:23:10 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: innodb.test: Adjust the tolerance of innodb_buffer_pool_pages_total for r5320. ------------------------------------------------------------------------ r5342 | marko | 2009-06-17 06:15:32 -0400 (Wed, 17 Jun 2009) | 60 lines branches/zip: Merge revisions 5233:5341 from branches/5.1: ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5243 | sunny | 2009-06-04 03:17:14 +0300 (Thu, 04 Jun 2009) | 14 lines branches/5.1: When the InnoDB and MySQL data dictionaries go out of sync, before the bug fix we would assert on missing autoinc columns. With this fix we allow MySQL to open the table but set the next autoinc value for the column to the MAX value. This effectively disables the next value generation. INSERTs will fail with a generic AUTOINC failure. However, the user should be able to read/dump the table, set the column values explicitly, use ALTER TABLE to set the next autoinc value and/or sync the two data dictionaries to resume normal operations. Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value from the index (PRIMARY) rb://118 ------------------------------------------------------------------------ r5252 | sunny | 2009-06-04 10:16:24 +0300 (Thu, 04 Jun 2009) | 2 lines branches/5.1: The version of the result file checked in was broken in r5243. ------------------------------------------------------------------------ r5259 | vasil | 2009-06-05 10:29:16 +0300 (Fri, 05 Jun 2009) | 7 lines branches/5.1: Remove the word "Error" from the printout because the mysqltest suite interprets it as an error and thus the innodb-autoinc test fails. Approved by: Sunny (via IM) ------------------------------------------------------------------------ r5339 | marko | 2009-06-17 11:01:37 +0300 (Wed, 17 Jun 2009) | 2 lines branches/5.1: Add missing #include "mtr0log.h" so that the code compiles with -DUNIV_MUST_NOT_INLINE. (null merge; this had already been committed in branches/zip) ------------------------------------------------------------------------ r5340 | marko | 2009-06-17 12:11:49 +0300 (Wed, 17 Jun 2009) | 4 lines branches/5.1: row_unlock_for_mysql(): When the clustered index is unknown, refuse to unlock the record. (Bug #45357, caused by the fix of Bug #39320). rb://132 approved by Sunny Bains. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5343 | vasil | 2009-06-17 08:56:12 -0400 (Wed, 17 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5342. ------------------------------------------------------------------------ r5344 | marko | 2009-06-17 09:03:45 -0400 (Wed, 17 Jun 2009) | 1 line branches/zip: row_merge_read_rec(): Fix a UNIV_DEBUG bug (Bug #45426) ------------------------------------------------------------------------ r5391 | marko | 2009-06-22 05:31:35 -0400 (Mon, 22 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Fix a bogus warning about block_mutex being possibly uninitialized. ------------------------------------------------------------------------ r5392 | marko | 2009-06-22 07:58:20 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: ha_innobase::check_if_incompatible_data(): When ROW_FORMAT=DEFAULT, do not compare to get_row_type(). Without this change, fast index creation will be disabled in recent versions of MySQL 5.1. ------------------------------------------------------------------------ r5393 | pekka | 2009-06-22 09:27:55 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Minor changes for Hot Backup to build correctly. (The code bracketed between #ifdef UNIV_HOTBACKUP and #endif /* UNIV_HOTBACKUP */). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5394 | pekka | 2009-06-22 09:46:34 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Add functions for checking the format of tablespaces for Hot Backup build (UNIV_HOTBACKUP defined). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5397 | calvin | 2009-06-23 16:59:42 -0400 (Tue, 23 Jun 2009) | 7 lines branches/zip: change the header file path. Change the header file path from ../storage/innobase/include/ to ../include/. In the planned 5.1 + plugin release, the source directory of the plugin will not be in storage/innobase. Approved by: Heikki (IM) ------------------------------------------------------------------------ r5407 | calvin | 2009-06-24 09:51:08 -0400 (Wed, 24 Jun 2009) | 4 lines branches/zip: remove relative path of header files. Suggested by Marko. ------------------------------------------------------------------------ r5412 | marko | 2009-06-25 06:27:08 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: Replace a DBUG_ASSERT with ut_a to track down Issue #290. ------------------------------------------------------------------------ r5415 | marko | 2009-06-25 06:45:57 -0400 (Thu, 25 Jun 2009) | 3 lines branches/zip: dict_index_find_cols(): Print diagnostic on name mismatch. This addresses Bug #44571 but does not fix it. rb://135 approved by Sunny Bains. ------------------------------------------------------------------------ r5417 | marko | 2009-06-25 08:20:56 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: ha_innodb.cc: Move the misplaced Doxygen @file comment. ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 08:55:52 -0400 (Thu, 25 Jun 2009) | 5 lines branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ r5423 | calvin | 2009-06-26 16:52:52 -0400 (Fri, 26 Jun 2009) | 2 lines branches/zip: Fix typos. ------------------------------------------------------------------------ r5425 | marko | 2009-06-29 04:52:30 -0400 (Mon, 29 Jun 2009) | 4 lines branches/zip: ha_innobase::add_index(), ha_innobase::final_drop_index(): Start prebuilt->trx before locking the table. This should fix Issue #293 and could fix Issue #229. Approved by Sunny (over IM). ------------------------------------------------------------------------ r5426 | marko | 2009-06-29 05:24:27 -0400 (Mon, 29 Jun 2009) | 3 lines branches/zip: buf_page_get_gen(): Fix a race condition when reading buf_fix_count. This could explain Issue #156. Tested by Michael. ------------------------------------------------------------------------ r5427 | marko | 2009-06-29 05:54:53 -0400 (Mon, 29 Jun 2009) | 5 lines branches/zip: lock_print_info_all_transactions(), buf_read_recv_pages(): Tolerate missing tablespaces (zip_size==ULINT_UNDEFINED). buf_page_get_gen(): Add ut_ad(ut_is_2pow(zip_size)). Issue #289, rb://136 approved by Sunny Bains ------------------------------------------------------------------------ r5428 | marko | 2009-06-29 07:06:29 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: row_sel_store_mysql_rec(): Add missing pointer cast. Do not do arithmetics on void pointers. ------------------------------------------------------------------------ r5429 | marko | 2009-06-29 09:49:54 -0400 (Mon, 29 Jun 2009) | 13 lines branches/zip: Do not crash on SET GLOBAL innodb_file_format=DEFAULT or SET GLOBAL innodb_file_format_check=DEFAULT. innodb_file_format.test: New test for innodb_file_format and innodb_file_format_check. innodb_file_format_name_validate(): Store the string in *save. innodb_file_format_name_update(): Check the string again. innodb_file_format_check_validate(): Store the string in *save. innodb_file_format_check_update(): Check the string again. Issue #282, rb://140 approved by Heikki Tuuri ------------------------------------------------------------------------ r5430 | marko | 2009-06-29 09:58:07 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: lock_rec_validate_page(): Add another assertion to track down Issue #289. ------------------------------------------------------------------------ r5431 | marko | 2009-06-29 09:58:40 -0400 (Mon, 29 Jun 2009) | 1 line branches/zip: Revert an accidentally made change in r5430 to univ.i. ------------------------------------------------------------------------ r5437 | marko | 2009-06-30 05:10:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: ibuf_dummy_index_free(): Beautify the comment. ------------------------------------------------------------------------ r5438 | marko | 2009-06-30 05:10:32 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: fseg_free(): Remove this unused function. ------------------------------------------------------------------------ r5439 | marko | 2009-06-30 05:15:22 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: fseg_validate(): Enclose in #ifdef UNIV_DEBUG. This function is unused, but it could turn out to be a useful debugging aid. ------------------------------------------------------------------------ r5441 | marko | 2009-06-30 06:30:14 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: ha_delete(): Remove this unused function that was very similar to ha_search_and_delete_if_found(). ------------------------------------------------------------------------ r5442 | marko | 2009-06-30 06:45:41 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: lock_is_on_table(), lock_table_unlock(): Unused, remove. ------------------------------------------------------------------------ r5443 | marko | 2009-06-30 07:03:00 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_event_create_auto(): Unused, remove. ------------------------------------------------------------------------ r5444 | marko | 2009-06-30 07:19:49 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: que_graph_try_free(): Unused, remove. ------------------------------------------------------------------------ r5445 | marko | 2009-06-30 07:28:11 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: row_build_row_ref_from_row(): Unused, remove. ------------------------------------------------------------------------ r5446 | marko | 2009-06-30 07:35:45 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_round_robin(), srv_que_task_enqueue(): Unused, remove. ------------------------------------------------------------------------ r5447 | marko | 2009-06-30 07:37:58 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_task_queue_check(): Unused, remove. ------------------------------------------------------------------------ r5448 | marko | 2009-06-30 07:56:36 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: mem_heap_cat(): Unused, remove. ------------------------------------------------------------------------ r5449 | marko | 2009-06-30 08:00:50 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: innobase_start_or_create_for_mysql(): Invoke os_get_os_version() at most once. ------------------------------------------------------------------------ r5450 | marko | 2009-06-30 08:02:20 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_file_close_no_error_handling(): Unused, remove. ------------------------------------------------------------------------ r5451 | marko | 2009-06-30 08:09:49 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: page_set_max_trx_id(): Make the code compile with UNIV_HOTBACKUP. ------------------------------------------------------------------------ r5452 | marko | 2009-06-30 08:10:26 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: os_file_close_no_error_handling(): Restore, as this function is used within InnoDB Hot Backup. ------------------------------------------------------------------------ r5453 | marko | 2009-06-30 08:14:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_process_set_priority_boost(): Unused, remove. ------------------------------------------------------------------------ r5454 | marko | 2009-06-30 08:42:52 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: Replace a non-ASCII character (ISO 8859-1 encoded U+00AD SOFT HYPHEN) with a cheap ASCII substitute. ------------------------------------------------------------------------ r5456 | inaam | 2009-06-30 14:21:09 -0400 (Tue, 30 Jun 2009) | 4 lines branches/zip Non functional change. s/Percona/Percona Inc./ ------------------------------------------------------------------------ r5470 | vasil | 2009-07-02 09:12:36 -0400 (Thu, 02 Jul 2009) | 16 lines branches/zip: Use PAUSE instruction inside spinloop if it is available. The patch was originally developed by Mikael Ronstrom <mikael@mysql.com> and can be found here: http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2768 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2771 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2772 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2774 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2777 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2799 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2800 Approved by: Heikki (rb://137) ------------------------------------------------------------------------ r5481 | vasil | 2009-07-06 13:16:32 -0400 (Mon, 06 Jul 2009) | 4 lines branches/zip: Remove unnecessary quotes and simplify plug.in. ------------------------------------------------------------------------ r5482 | calvin | 2009-07-06 18:36:35 -0400 (Mon, 06 Jul 2009) | 5 lines branches/zip: add COPYING files for Percona and Sun Micro. 1.0.4 contains patches based on contributions from Percona and Sun Microsystems. ------------------------------------------------------------------------ r5483 | calvin | 2009-07-07 05:36:43 -0400 (Tue, 07 Jul 2009) | 3 lines branches/zip: add IB_HAVE_PAUSE_INSTRUCTION to CMake. Windows will support PAUSE instruction by default. ------------------------------------------------------------------------ r5484 | inaam | 2009-07-07 18:57:14 -0400 (Tue, 07 Jul 2009) | 13 lines branches/zip rb://126 Based on contribution from Google Inc. This patch introduces a new parameter innodb_io_capacity to control the rate at which master threads performs various tasks. The default value is 200 and higher values imply more aggressive flushing and ibuf merges from within the master thread. This patch also changes the ibuf merge from synchronous to asynchronous. Another minor change is not to force the master thread to wait for a log flush to complete every second. Approved by: Heikki ------------------------------------------------------------------------ r5485 | inaam | 2009-07-07 19:00:49 -0400 (Tue, 07 Jul 2009) | 18 lines branches/zip rb://138 The current implementation is to try to flush the neighbors of every page that we flush. This patch makes the following distinction: 1) If the flush is from flush_list AND 2) If the flush is intended to move the oldest_modification LSN ahead (this happens when a user thread sees little space in the log file and attempts to flush pages from the buffer pool so that a checkpoint can be made) THEN Do not try to flush the neighbors. Just focus on flushing dirty pages at the end of flush_list Approved by: Heikki ------------------------------------------------------------------------ r5486 | inaam | 2009-07-08 12:11:40 -0400 (Wed, 08 Jul 2009) | 29 lines branches/zip rb://133 This patch introduces heuristics based flushing rate of dirty pages to avoid IO bursts at checkpoint. 1) log_capacity / log_generated per second gives us number of seconds in which ALL dirty pages need to be flushed. Based on this rough assumption we can say that n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate 2) We use weighted averages (hard coded to 20 seconds) of log_generation_rate to avoid resonance. 3) From the desired_flush_rate we subtract the number of pages that have been flushed due to LRU flushing. That gives us pages that we should flush as part of flush_list cleanup. And that is the number (capped by maximum io_capacity) that we try to flush from the master thread. Knobs: ====== innodb_adaptive_flushing: boolean, global, dynamic, default TRUE. Since this heuristic is very experimental and has the potential to dramatically change the IO pattern I think it is a good idea to leave a knob to turn it off. Approved by: Heikki ------------------------------------------------------------------------ r5487 | calvin | 2009-07-08 12:42:28 -0400 (Wed, 08 Jul 2009) | 7 lines branches/zip: fix PAUSE instruction patch on Windows The original PAUSE instruction patch (r5470) does not compile on Windows. Also, there is an elegant way of doing it on Windows - YieldProcessor(). Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5489 | vasil | 2009-07-10 05:02:22 -0400 (Fri, 10 Jul 2009) | 9 lines branches/zip: Change the defaults for innodb_sync_spin_loops: 20 -> 30 innodb_spin_wait_delay: 5 -> 6 This change was proposed by Sun/MySQL based on their performance testing, see https://svn.innodb.com/innobase/Release_tasks_for_InnoDB_Plugin_V1.0.4 ------------------------------------------------------------------------ r5490 | vasil | 2009-07-10 05:04:20 -0400 (Fri, 10 Jul 2009) | 4 lines branches/zip: Add ChangeLog entry for 5489. ------------------------------------------------------------------------ r5491 | calvin | 2009-07-10 12:19:17 -0400 (Fri, 10 Jul 2009) | 6 lines branches/zip: add copyright info to files related to PAUSE instruction patch, contributed by Sun Microsystems. ------------------------------------------------------------------------ r5492 | calvin | 2009-07-10 17:47:34 -0400 (Fri, 10 Jul 2009) | 5 lines branches/zip: add ChangeLog entries for r5484-r5486. ------------------------------------------------------------------------ r5494 | vasil | 2009-07-13 03:37:35 -0400 (Mon, 13 Jul 2009) | 6 lines branches/zip: Restore the original value of innodb_sync_spin_loops at the end, previously the test assumed that setting it to 20 will do this, but now the default is 30 and MTR's internal check failed. ------------------------------------------------------------------------ r5495 | inaam | 2009-07-13 11:48:45 -0400 (Mon, 13 Jul 2009) | 5 lines branches/zip rb://138 (REVERT) Revert the flush neighbors patch as it shows regression in the benchmarks run by Michael. ------------------------------------------------------------------------ r5496 | inaam | 2009-07-13 14:04:57 -0400 (Mon, 13 Jul 2009) | 4 lines branches/zip Fixed warnings on windows where ulint != ib_uint64_t ------------------------------------------------------------------------ r5497 | calvin | 2009-07-13 15:01:00 -0400 (Mon, 13 Jul 2009) | 9 lines branches/zip: fix run-time symbols clash on Solaris. This patch is from Sergey Vojtovich of Sun Microsystems, to fix run-time symbols clash on Solaris with older C++ compiler: - when finding out a way to hide symbols, make decision basing on compiler, not operating system. - Sun Studio supports __hidden declaration specifier for this purpose. ------------------------------------------------------------------------ r5498 | vasil | 2009-07-14 03:16:18 -0400 (Tue, 14 Jul 2009) | 92 lines branches/zip: Merge r5341:5497 from branches/5.1, skipping: c5419 because it is merge from branches/zip into branches/5.1 c5466 because the source code has been adjusted to match the MySQL behavior and the innodb-autoinc test does not fail in branches/zip, if c5466 is merged, then innodb-autoinc starts failing, Sunny suggested not to merge c5466. and resolving conflicts in c5410, c5440, c5488: ------------------------------------------------------------------------ r5410 | marko | 2009-06-24 22:26:34 +0300 (Wed, 24 Jun 2009) | 2 lines Changed paths: M /branches/5.1/include/trx0sys.ic M /branches/5.1/trx/trx0purge.c M /branches/5.1/trx/trx0sys.c M /branches/5.1/trx/trx0undo.c branches/5.1: Add missing #include "mtr0log.h" to avoid warnings when compiling with -DUNIV_MUST_NOT_INLINE. ------------------------------------------------------------------------ r5419 | marko | 2009-06-25 16:11:57 +0300 (Thu, 25 Jun 2009) | 18 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug42101-nonzero.result M /branches/5.1/mysql-test/innodb_bug42101-nonzero.test M /branches/5.1/mysql-test/innodb_bug42101.result M /branches/5.1/mysql-test/innodb_bug42101.test branches/5.1: Merge r5418 from branches/zip: ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 15:55:52 +0300 (Thu, 25 Jun 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5440 | vasil | 2009-06-30 13:04:29 +0300 (Tue, 30 Jun 2009) | 8 lines Changed paths: M /branches/5.1/fil/fil0fil.c branches/5.1: Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to match documentation by changing the URL from http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html to http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting-datadict.html ------------------------------------------------------------------------ r5466 | vasil | 2009-07-02 10:46:45 +0300 (Thu, 02 Jul 2009) | 6 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Adjust the failing innodb-autoinc test to conform to the latest behavior of the MySQL code. The idea and the comment in innodb-autoinc.test come from Sunny. ------------------------------------------------------------------------ r5488 | vasil | 2009-07-09 19:16:44 +0300 (Thu, 09 Jul 2009) | 13 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug21704.result A /branches/5.1/mysql-test/innodb_bug21704.test branches/5.1: Fix Bug#21704 Renaming column does not update FK definition by checking whether a column that participates in a FK definition is being renamed and denying the ALTER in this case. The patch was originally developed by Davi Arnaut <Davi.Arnaut@Sun.COM>: http://lists.mysql.com/commits/77714 and was later adjusted to conform to InnoDB coding style by me (Vasil), I also added some more comments and moved the bug specific mysql-test to a separate file to make it more manageable and flexible. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5499 | calvin | 2009-07-14 12:55:10 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: add a missing file in Makefile.am This change was suggested by MySQL. ------------------------------------------------------------------------ r5500 | calvin | 2009-07-14 13:03:26 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: minor change Remove an extra "with". ------------------------------------------------------------------------ r5501 | vasil | 2009-07-14 13:58:15 -0400 (Tue, 14 Jul 2009) | 5 lines branches/zip: Add @ZLIB_INCLUDES@ so that the InnoDB Plugin picks up the same zlib.h header file that is eventually used by mysqld. ------------------------------------------------------------------------ r5502 | vasil | 2009-07-14 13:59:59 -0400 (Tue, 14 Jul 2009) | 4 lines branches/zip: Add include/ut0auxconf.h to noinst_HEADERS ------------------------------------------------------------------------ r5503 | vasil | 2009-07-14 14:16:11 -0400 (Tue, 14 Jul 2009) | 8 lines branches/zip: Non-functional change: put files in noinst_HEADERS and libinnobase_a_SOURCES one per line and sort alphabetically, so it is easier to find if a file is there or not and also diffs show exactly the added or removed file instead of surrounding lines too. ------------------------------------------------------------------------ r5504 | calvin | 2009-07-15 04:58:44 -0400 (Wed, 15 Jul 2009) | 6 lines branches/zip: fix compile errors on Win64 Both srv_read_ahead_factor and srv_io_capacity should be defined as ulong. Approved by: Sunny ------------------------------------------------------------------------ r5508 | calvin | 2009-07-16 09:40:47 -0400 (Thu, 16 Jul 2009) | 16 lines branches/zip: Support inlining of functions and prefetch with Sun Studio Those changes are contributed by Sun/MySQL. Two sets of changes in this patch when Sun Studio is used: - Explicit inlining of functions - Prefetch Support This patch has been tested by Sunny with the plugin statically built in. Since we've never built the plugin as a dynamically loaded module on Solaris, it is a separate task to change plug.in. rb://142 Approved by: Heikki ------------------------------------------------------------------------ r5509 | calvin | 2009-07-16 09:45:28 -0400 (Thu, 16 Jul 2009) | 2 lines branches/zip: add ChangeLog entry for r5508. ------------------------------------------------------------------------ r5512 | sunny | 2009-07-19 19:52:48 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Remove unused extern ref to timed_mutexes. ------------------------------------------------------------------------ r5513 | sunny | 2009-07-19 19:58:43 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Undo r5512 ------------------------------------------------------------------------ r5514 | sunny | 2009-07-19 20:08:49 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Only use my_bool when UNIV_HOTBACKUP is not defined. ------------------------------------------------------------------------ r5515 | sunny | 2009-07-20 03:29:14 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: The dict_table_t::autoinc_mutex field is not used in HotBackup. ------------------------------------------------------------------------ r5516 | sunny | 2009-07-20 03:46:05 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Make this file usable from within HotBackup. A new file has been introduced called hb_univ.i. This file should have all the HotBackup specific configuration. ------------------------------------------------------------------------ r5517 | sunny | 2009-07-20 03:55:11 -0400 (Mon, 20 Jul 2009) | 2 lines Add /* UNIV_HOTBACK */ ------------------------------------------------------------------------ r5519 | vasil | 2009-07-20 04:45:18 -0400 (Mon, 20 Jul 2009) | 31 lines branches/zip: Merge r5497:5518 from branches/5.1: ------------------------------------------------------------------------ r5518 | vasil | 2009-07-20 11:29:47 +0300 (Mon, 20 Jul 2009) | 22 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2874.2.1 committer: Anurag Shekhar <anurag.shekhar@sun.com> branch nick: mysql-5.1-bugteam-windows-warning timestamp: Wed 2009-05-13 15:41:24 +0530 message: Bug #39802 On Windows, 32-bit time_t should be enforced This patch fixes compilation warning, "conversion from 'time_t' to 'ulong', possible loss of data". The fix is to typecast time_t to ulong before assigning it to ulong. Backported this from 6.0-bugteam tree. modified: storage/archive/ha_archive.cc storage/federated/ha_federated.cc storage/innobase/handler/ha_innodb.cc storage/myisam/ha_myisam.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5520 | vasil | 2009-07-20 04:51:47 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Add ChangeLog entries for r5498 and r5519. ------------------------------------------------------------------------ r5524 | inaam | 2009-07-20 12:23:15 -0400 (Mon, 20 Jul 2009) | 9 lines branches/zip Change the read ahead parameter name to innodb_read_ahead_threshold. Change the meaning of this parameter to signify the number of pages that must be sequentially accessed for InnoDB to trigger a readahead request. Suggested by: Ken ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 5144:5524 from branches/zip ------------------------------------------------------------------------ r5147 | marko | 2009-05-27 06:55:14 -0400 (Wed, 27 May 2009) | 1 line branches/zip: ibuf0ibuf.c: Improve a comment. ------------------------------------------------------------------------ r5149 | marko | 2009-05-27 07:46:42 -0400 (Wed, 27 May 2009) | 34 lines branches/zip: Merge revisions 4994:5148 from branches/5.1: ------------------------------------------------------------------------ r5126 | vasil | 2009-05-26 16:57:12 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Non-functional change: move FSP_* macros from fsp0fsp.h to a new file fsp0types.h. This is needed in order to be able to use FSP_EXTENT_SIZE in mtr0log.ic. ------------------------------------------------------------------------ r5127 | vasil | 2009-05-26 17:05:43 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not include unnecessary headers mtr0log.h and fut0lst.h in trx0sys.h and include fsp0fsp.h just before it is needed. This is needed in order to be able to use TRX_SYS_SPACE in mtr0log.ic. ------------------------------------------------------------------------ r5128 | vasil | 2009-05-26 17:26:37 +0300 (Tue, 26 May 2009) | 7 lines branches/5.1: Fix Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not write redo log for the pages in the doublewrite buffer. Also, do not make a dummy change to the page because this is not needed. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5169 | marko | 2009-05-28 03:21:55 -0400 (Thu, 28 May 2009) | 1 line branches/zip: mtr0mtr.h: Add Doxygen comments for the redo log entry types. ------------------------------------------------------------------------ r5176 | marko | 2009-05-28 07:14:02 -0400 (Thu, 28 May 2009) | 1 line branches/zip: Correct a debug assertion that was added in r5125. ------------------------------------------------------------------------ r5201 | marko | 2009-06-01 06:35:25 -0400 (Mon, 01 Jun 2009) | 2 lines branches/zip: Clean up some comments. Make the rec parameter of mlog_open_and_write_index() const. ------------------------------------------------------------------------ r5234 | marko | 2009-06-03 08:26:41 -0400 (Wed, 03 Jun 2009) | 44 lines branches/zip: Merge revisions 5148:5233 from branches/5.1: ------------------------------------------------------------------------ r5150 | vasil | 2009-05-27 18:56:03 +0300 (Wed, 27 May 2009) | 4 lines branches/5.1: Whitespace fixup. ------------------------------------------------------------------------ r5191 | vasil | 2009-05-30 17:46:05 +0300 (Sat, 30 May 2009) | 19 lines branches/5.1: Merge a change from MySQL (this fixes the failing innodb_mysql test): ------------------------------------------------------------ revno: 1810.3894.10 committer: Sergey Glukhov <Sergey.Glukhov@sun.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-05-19 11:32:21 +0500 message: Bug#39793 Foreign keys not constructed when column has a '#' in a comment or default value Internal InnoDN FK parser does not recognize '\'' as quotation symbol. Suggested fix is to add '\'' symbol check for quotation condition (dict_strip_comments() function). modified: innobase/dict/dict0dict.c mysql-test/r/innodb_mysql.result mysql-test/t/innodb_mysql.test ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5250 | marko | 2009-06-04 02:58:23 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add Doxygen comments to the rest of buf0*. ------------------------------------------------------------------------ r5251 | marko | 2009-06-04 02:59:51 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Replace <= in a function comment. ------------------------------------------------------------------------ r5253 | marko | 2009-06-04 06:37:35 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add missing Doxygen comments for page0zip. ------------------------------------------------------------------------ r5261 | vasil | 2009-06-05 11:13:31 -0400 (Fri, 05 Jun 2009) | 15 lines branches/zip: Fix Mantis Issue#244 fix bug in linear read ahead (no check on access pattern) The changes are: 1) Take into account access pattern when deciding whether or not to do linear read ahead. 2) Expose a knob innodb_read_ahead_factor = [0-64] default (8), dynamic, global to control linear read ahead behvior 3) Disable random read ahead. Keep the code for now. Submitted by: Inaam (rb://122) Approved by: Heikki (rb://122) ------------------------------------------------------------------------ r5262 | vasil | 2009-06-05 12:04:25 -0400 (Fri, 05 Jun 2009) | 22 lines branches/zip: Enable functionality to have multiple background io helper threads. This patch is based on percona contributions. More details about this patch will be written at: https://svn.innodb.com/innobase/MultipleBackgroundThreads The patch essentially does the following: expose following knobs: innodb_read_io_threads = [1 - 64] default 1 innodb_write_io_threads = [1 - 64] default 1 deprecate innodb_file_io_threads (this parameter was relevant only on windows) Internally it allows multiple segments for read and write IO request arrays where one thread works on one segement. Submitted by: Inaam (rb://124) Approved by: Heikki (rb://124) ------------------------------------------------------------------------ r5263 | vasil | 2009-06-05 12:19:37 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Whitespace cleanup. ------------------------------------------------------------------------ r5264 | vasil | 2009-06-05 12:26:58 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5261. ------------------------------------------------------------------------ r5265 | vasil | 2009-06-05 12:34:11 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5262. ------------------------------------------------------------------------ r5268 | inaam | 2009-06-08 12:18:21 -0400 (Mon, 08 Jun 2009) | 7 lines branches/zip Non functional change: Added legal notices acknowledging percona contribution to the multiple IO helper threads patch i.e.: r5262 ------------------------------------------------------------------------ r5283 | inaam | 2009-06-09 13:46:29 -0400 (Tue, 09 Jun 2009) | 9 lines branches/zip rb://130 Enable Group Commit functionality that was broken in 5.0 when distributed transactions were introduced. Reviewed by: Heikki ------------------------------------------------------------------------ r5319 | marko | 2009-06-11 04:40:33 -0400 (Thu, 11 Jun 2009) | 3 lines branches/zip: Declare os_thread_id_t as unsigned long, because ulint is wrong on Win64. Pointed out by Vladislav Vaintroub <wlad@sun.com>. ------------------------------------------------------------------------ r5320 | inaam | 2009-06-11 09:15:41 -0400 (Thu, 11 Jun 2009) | 14 lines branches/zip rb://131 This patch changes the following defaults: max_dirty_pages_pct: default from 90 to 75. max allowed from 100 to 99 additional_mem_pool_size: default from 1 to 8 MB buffer_pool_size: default from 8 to 128 MB log_buffer_size: default from 1 to 8 MB read_io_threads/write_io_threads: default from 1 to 4 The log file sizes are untouched because of upgrade issues Reviewed by: Heikki ------------------------------------------------------------------------ r5330 | marko | 2009-06-16 04:08:59 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_gen(): Reduce mutex holding time by adjusting buf_pool->n_pend_unzip while only holding buf_pool_mutex. ------------------------------------------------------------------------ r5331 | marko | 2009-06-16 05:00:48 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Eliminate a buf_page_get_mutex() call. The function must switch on the block state anyway. ------------------------------------------------------------------------ r5332 | vasil | 2009-06-16 05:03:27 -0400 (Tue, 16 Jun 2009) | 4 lines branches/zip: Add ChangeLog entries for r5283 and r5320. ------------------------------------------------------------------------ r5333 | marko | 2009-06-16 05:27:46 -0400 (Tue, 16 Jun 2009) | 1 line branches/zip: buf_page_io_query(): Remove unused function. ------------------------------------------------------------------------ r5335 | marko | 2009-06-16 09:23:10 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: innodb.test: Adjust the tolerance of innodb_buffer_pool_pages_total for r5320. ------------------------------------------------------------------------ r5342 | marko | 2009-06-17 06:15:32 -0400 (Wed, 17 Jun 2009) | 60 lines branches/zip: Merge revisions 5233:5341 from branches/5.1: ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5243 | sunny | 2009-06-04 03:17:14 +0300 (Thu, 04 Jun 2009) | 14 lines branches/5.1: When the InnoDB and MySQL data dictionaries go out of sync, before the bug fix we would assert on missing autoinc columns. With this fix we allow MySQL to open the table but set the next autoinc value for the column to the MAX value. This effectively disables the next value generation. INSERTs will fail with a generic AUTOINC failure. However, the user should be able to read/dump the table, set the column values explicitly, use ALTER TABLE to set the next autoinc value and/or sync the two data dictionaries to resume normal operations. Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value from the index (PRIMARY) rb://118 ------------------------------------------------------------------------ r5252 | sunny | 2009-06-04 10:16:24 +0300 (Thu, 04 Jun 2009) | 2 lines branches/5.1: The version of the result file checked in was broken in r5243. ------------------------------------------------------------------------ r5259 | vasil | 2009-06-05 10:29:16 +0300 (Fri, 05 Jun 2009) | 7 lines branches/5.1: Remove the word "Error" from the printout because the mysqltest suite interprets it as an error and thus the innodb-autoinc test fails. Approved by: Sunny (via IM) ------------------------------------------------------------------------ r5339 | marko | 2009-06-17 11:01:37 +0300 (Wed, 17 Jun 2009) | 2 lines branches/5.1: Add missing #include "mtr0log.h" so that the code compiles with -DUNIV_MUST_NOT_INLINE. (null merge; this had already been committed in branches/zip) ------------------------------------------------------------------------ r5340 | marko | 2009-06-17 12:11:49 +0300 (Wed, 17 Jun 2009) | 4 lines branches/5.1: row_unlock_for_mysql(): When the clustered index is unknown, refuse to unlock the record. (Bug #45357, caused by the fix of Bug #39320). rb://132 approved by Sunny Bains. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5343 | vasil | 2009-06-17 08:56:12 -0400 (Wed, 17 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5342. ------------------------------------------------------------------------ r5344 | marko | 2009-06-17 09:03:45 -0400 (Wed, 17 Jun 2009) | 1 line branches/zip: row_merge_read_rec(): Fix a UNIV_DEBUG bug (Bug #45426) ------------------------------------------------------------------------ r5391 | marko | 2009-06-22 05:31:35 -0400 (Mon, 22 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Fix a bogus warning about block_mutex being possibly uninitialized. ------------------------------------------------------------------------ r5392 | marko | 2009-06-22 07:58:20 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: ha_innobase::check_if_incompatible_data(): When ROW_FORMAT=DEFAULT, do not compare to get_row_type(). Without this change, fast index creation will be disabled in recent versions of MySQL 5.1. ------------------------------------------------------------------------ r5393 | pekka | 2009-06-22 09:27:55 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Minor changes for Hot Backup to build correctly. (The code bracketed between #ifdef UNIV_HOTBACKUP and #endif /* UNIV_HOTBACKUP */). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5394 | pekka | 2009-06-22 09:46:34 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Add functions for checking the format of tablespaces for Hot Backup build (UNIV_HOTBACKUP defined). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5397 | calvin | 2009-06-23 16:59:42 -0400 (Tue, 23 Jun 2009) | 7 lines branches/zip: change the header file path. Change the header file path from ../storage/innobase/include/ to ../include/. In the planned 5.1 + plugin release, the source directory of the plugin will not be in storage/innobase. Approved by: Heikki (IM) ------------------------------------------------------------------------ r5407 | calvin | 2009-06-24 09:51:08 -0400 (Wed, 24 Jun 2009) | 4 lines branches/zip: remove relative path of header files. Suggested by Marko. ------------------------------------------------------------------------ r5412 | marko | 2009-06-25 06:27:08 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: Replace a DBUG_ASSERT with ut_a to track down Issue #290. ------------------------------------------------------------------------ r5415 | marko | 2009-06-25 06:45:57 -0400 (Thu, 25 Jun 2009) | 3 lines branches/zip: dict_index_find_cols(): Print diagnostic on name mismatch. This addresses Bug #44571 but does not fix it. rb://135 approved by Sunny Bains. ------------------------------------------------------------------------ r5417 | marko | 2009-06-25 08:20:56 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: ha_innodb.cc: Move the misplaced Doxygen @file comment. ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 08:55:52 -0400 (Thu, 25 Jun 2009) | 5 lines branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ r5423 | calvin | 2009-06-26 16:52:52 -0400 (Fri, 26 Jun 2009) | 2 lines branches/zip: Fix typos. ------------------------------------------------------------------------ r5425 | marko | 2009-06-29 04:52:30 -0400 (Mon, 29 Jun 2009) | 4 lines branches/zip: ha_innobase::add_index(), ha_innobase::final_drop_index(): Start prebuilt->trx before locking the table. This should fix Issue #293 and could fix Issue #229. Approved by Sunny (over IM). ------------------------------------------------------------------------ r5426 | marko | 2009-06-29 05:24:27 -0400 (Mon, 29 Jun 2009) | 3 lines branches/zip: buf_page_get_gen(): Fix a race condition when reading buf_fix_count. This could explain Issue #156. Tested by Michael. ------------------------------------------------------------------------ r5427 | marko | 2009-06-29 05:54:53 -0400 (Mon, 29 Jun 2009) | 5 lines branches/zip: lock_print_info_all_transactions(), buf_read_recv_pages(): Tolerate missing tablespaces (zip_size==ULINT_UNDEFINED). buf_page_get_gen(): Add ut_ad(ut_is_2pow(zip_size)). Issue #289, rb://136 approved by Sunny Bains ------------------------------------------------------------------------ r5428 | marko | 2009-06-29 07:06:29 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: row_sel_store_mysql_rec(): Add missing pointer cast. Do not do arithmetics on void pointers. ------------------------------------------------------------------------ r5429 | marko | 2009-06-29 09:49:54 -0400 (Mon, 29 Jun 2009) | 13 lines branches/zip: Do not crash on SET GLOBAL innodb_file_format=DEFAULT or SET GLOBAL innodb_file_format_check=DEFAULT. innodb_file_format.test: New test for innodb_file_format and innodb_file_format_check. innodb_file_format_name_validate(): Store the string in *save. innodb_file_format_name_update(): Check the string again. innodb_file_format_check_validate(): Store the string in *save. innodb_file_format_check_update(): Check the string again. Issue #282, rb://140 approved by Heikki Tuuri ------------------------------------------------------------------------ r5430 | marko | 2009-06-29 09:58:07 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: lock_rec_validate_page(): Add another assertion to track down Issue #289. ------------------------------------------------------------------------ r5431 | marko | 2009-06-29 09:58:40 -0400 (Mon, 29 Jun 2009) | 1 line branches/zip: Revert an accidentally made change in r5430 to univ.i. ------------------------------------------------------------------------ r5437 | marko | 2009-06-30 05:10:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: ibuf_dummy_index_free(): Beautify the comment. ------------------------------------------------------------------------ r5438 | marko | 2009-06-30 05:10:32 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: fseg_free(): Remove this unused function. ------------------------------------------------------------------------ r5439 | marko | 2009-06-30 05:15:22 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: fseg_validate(): Enclose in #ifdef UNIV_DEBUG. This function is unused, but it could turn out to be a useful debugging aid. ------------------------------------------------------------------------ r5441 | marko | 2009-06-30 06:30:14 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: ha_delete(): Remove this unused function that was very similar to ha_search_and_delete_if_found(). ------------------------------------------------------------------------ r5442 | marko | 2009-06-30 06:45:41 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: lock_is_on_table(), lock_table_unlock(): Unused, remove. ------------------------------------------------------------------------ r5443 | marko | 2009-06-30 07:03:00 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_event_create_auto(): Unused, remove. ------------------------------------------------------------------------ r5444 | marko | 2009-06-30 07:19:49 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: que_graph_try_free(): Unused, remove. ------------------------------------------------------------------------ r5445 | marko | 2009-06-30 07:28:11 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: row_build_row_ref_from_row(): Unused, remove. ------------------------------------------------------------------------ r5446 | marko | 2009-06-30 07:35:45 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_round_robin(), srv_que_task_enqueue(): Unused, remove. ------------------------------------------------------------------------ r5447 | marko | 2009-06-30 07:37:58 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_task_queue_check(): Unused, remove. ------------------------------------------------------------------------ r5448 | marko | 2009-06-30 07:56:36 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: mem_heap_cat(): Unused, remove. ------------------------------------------------------------------------ r5449 | marko | 2009-06-30 08:00:50 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: innobase_start_or_create_for_mysql(): Invoke os_get_os_version() at most once. ------------------------------------------------------------------------ r5450 | marko | 2009-06-30 08:02:20 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_file_close_no_error_handling(): Unused, remove. ------------------------------------------------------------------------ r5451 | marko | 2009-06-30 08:09:49 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: page_set_max_trx_id(): Make the code compile with UNIV_HOTBACKUP. ------------------------------------------------------------------------ r5452 | marko | 2009-06-30 08:10:26 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: os_file_close_no_error_handling(): Restore, as this function is used within InnoDB Hot Backup. ------------------------------------------------------------------------ r5453 | marko | 2009-06-30 08:14:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_process_set_priority_boost(): Unused, remove. ------------------------------------------------------------------------ r5454 | marko | 2009-06-30 08:42:52 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: Replace a non-ASCII character (ISO 8859-1 encoded U+00AD SOFT HYPHEN) with a cheap ASCII substitute. ------------------------------------------------------------------------ r5456 | inaam | 2009-06-30 14:21:09 -0400 (Tue, 30 Jun 2009) | 4 lines branches/zip Non functional change. s/Percona/Percona Inc./ ------------------------------------------------------------------------ r5470 | vasil | 2009-07-02 09:12:36 -0400 (Thu, 02 Jul 2009) | 16 lines branches/zip: Use PAUSE instruction inside spinloop if it is available. The patch was originally developed by Mikael Ronstrom <mikael@mysql.com> and can be found here: http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2768 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2771 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2772 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2774 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2777 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2799 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2800 Approved by: Heikki (rb://137) ------------------------------------------------------------------------ r5481 | vasil | 2009-07-06 13:16:32 -0400 (Mon, 06 Jul 2009) | 4 lines branches/zip: Remove unnecessary quotes and simplify plug.in. ------------------------------------------------------------------------ r5482 | calvin | 2009-07-06 18:36:35 -0400 (Mon, 06 Jul 2009) | 5 lines branches/zip: add COPYING files for Percona and Sun Micro. 1.0.4 contains patches based on contributions from Percona and Sun Microsystems. ------------------------------------------------------------------------ r5483 | calvin | 2009-07-07 05:36:43 -0400 (Tue, 07 Jul 2009) | 3 lines branches/zip: add IB_HAVE_PAUSE_INSTRUCTION to CMake. Windows will support PAUSE instruction by default. ------------------------------------------------------------------------ r5484 | inaam | 2009-07-07 18:57:14 -0400 (Tue, 07 Jul 2009) | 13 lines branches/zip rb://126 Based on contribution from Google Inc. This patch introduces a new parameter innodb_io_capacity to control the rate at which master threads performs various tasks. The default value is 200 and higher values imply more aggressive flushing and ibuf merges from within the master thread. This patch also changes the ibuf merge from synchronous to asynchronous. Another minor change is not to force the master thread to wait for a log flush to complete every second. Approved by: Heikki ------------------------------------------------------------------------ r5485 | inaam | 2009-07-07 19:00:49 -0400 (Tue, 07 Jul 2009) | 18 lines branches/zip rb://138 The current implementation is to try to flush the neighbors of every page that we flush. This patch makes the following distinction: 1) If the flush is from flush_list AND 2) If the flush is intended to move the oldest_modification LSN ahead (this happens when a user thread sees little space in the log file and attempts to flush pages from the buffer pool so that a checkpoint can be made) THEN Do not try to flush the neighbors. Just focus on flushing dirty pages at the end of flush_list Approved by: Heikki ------------------------------------------------------------------------ r5486 | inaam | 2009-07-08 12:11:40 -0400 (Wed, 08 Jul 2009) | 29 lines branches/zip rb://133 This patch introduces heuristics based flushing rate of dirty pages to avoid IO bursts at checkpoint. 1) log_capacity / log_generated per second gives us number of seconds in which ALL dirty pages need to be flushed. Based on this rough assumption we can say that n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate 2) We use weighted averages (hard coded to 20 seconds) of log_generation_rate to avoid resonance. 3) From the desired_flush_rate we subtract the number of pages that have been flushed due to LRU flushing. That gives us pages that we should flush as part of flush_list cleanup. And that is the number (capped by maximum io_capacity) that we try to flush from the master thread. Knobs: ====== innodb_adaptive_flushing: boolean, global, dynamic, default TRUE. Since this heuristic is very experimental and has the potential to dramatically change the IO pattern I think it is a good idea to leave a knob to turn it off. Approved by: Heikki ------------------------------------------------------------------------ r5487 | calvin | 2009-07-08 12:42:28 -0400 (Wed, 08 Jul 2009) | 7 lines branches/zip: fix PAUSE instruction patch on Windows The original PAUSE instruction patch (r5470) does not compile on Windows. Also, there is an elegant way of doing it on Windows - YieldProcessor(). Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5489 | vasil | 2009-07-10 05:02:22 -0400 (Fri, 10 Jul 2009) | 9 lines branches/zip: Change the defaults for innodb_sync_spin_loops: 20 -> 30 innodb_spin_wait_delay: 5 -> 6 This change was proposed by Sun/MySQL based on their performance testing, see https://svn.innodb.com/innobase/Release_tasks_for_InnoDB_Plugin_V1.0.4 ------------------------------------------------------------------------ r5490 | vasil | 2009-07-10 05:04:20 -0400 (Fri, 10 Jul 2009) | 4 lines branches/zip: Add ChangeLog entry for 5489. ------------------------------------------------------------------------ r5491 | calvin | 2009-07-10 12:19:17 -0400 (Fri, 10 Jul 2009) | 6 lines branches/zip: add copyright info to files related to PAUSE instruction patch, contributed by Sun Microsystems. ------------------------------------------------------------------------ r5492 | calvin | 2009-07-10 17:47:34 -0400 (Fri, 10 Jul 2009) | 5 lines branches/zip: add ChangeLog entries for r5484-r5486. ------------------------------------------------------------------------ r5494 | vasil | 2009-07-13 03:37:35 -0400 (Mon, 13 Jul 2009) | 6 lines branches/zip: Restore the original value of innodb_sync_spin_loops at the end, previously the test assumed that setting it to 20 will do this, but now the default is 30 and MTR's internal check failed. ------------------------------------------------------------------------ r5495 | inaam | 2009-07-13 11:48:45 -0400 (Mon, 13 Jul 2009) | 5 lines branches/zip rb://138 (REVERT) Revert the flush neighbors patch as it shows regression in the benchmarks run by Michael. ------------------------------------------------------------------------ r5496 | inaam | 2009-07-13 14:04:57 -0400 (Mon, 13 Jul 2009) | 4 lines branches/zip Fixed warnings on windows where ulint != ib_uint64_t ------------------------------------------------------------------------ r5497 | calvin | 2009-07-13 15:01:00 -0400 (Mon, 13 Jul 2009) | 9 lines branches/zip: fix run-time symbols clash on Solaris. This patch is from Sergey Vojtovich of Sun Microsystems, to fix run-time symbols clash on Solaris with older C++ compiler: - when finding out a way to hide symbols, make decision basing on compiler, not operating system. - Sun Studio supports __hidden declaration specifier for this purpose. ------------------------------------------------------------------------ r5498 | vasil | 2009-07-14 03:16:18 -0400 (Tue, 14 Jul 2009) | 92 lines branches/zip: Merge r5341:5497 from branches/5.1, skipping: c5419 because it is merge from branches/zip into branches/5.1 c5466 because the source code has been adjusted to match the MySQL behavior and the innodb-autoinc test does not fail in branches/zip, if c5466 is merged, then innodb-autoinc starts failing, Sunny suggested not to merge c5466. and resolving conflicts in c5410, c5440, c5488: ------------------------------------------------------------------------ r5410 | marko | 2009-06-24 22:26:34 +0300 (Wed, 24 Jun 2009) | 2 lines Changed paths: M /branches/5.1/include/trx0sys.ic M /branches/5.1/trx/trx0purge.c M /branches/5.1/trx/trx0sys.c M /branches/5.1/trx/trx0undo.c branches/5.1: Add missing #include "mtr0log.h" to avoid warnings when compiling with -DUNIV_MUST_NOT_INLINE. ------------------------------------------------------------------------ r5419 | marko | 2009-06-25 16:11:57 +0300 (Thu, 25 Jun 2009) | 18 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug42101-nonzero.result M /branches/5.1/mysql-test/innodb_bug42101-nonzero.test M /branches/5.1/mysql-test/innodb_bug42101.result M /branches/5.1/mysql-test/innodb_bug42101.test branches/5.1: Merge r5418 from branches/zip: ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 15:55:52 +0300 (Thu, 25 Jun 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5440 | vasil | 2009-06-30 13:04:29 +0300 (Tue, 30 Jun 2009) | 8 lines Changed paths: M /branches/5.1/fil/fil0fil.c branches/5.1: Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to match documentation by changing the URL from http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html to http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting-datadict.html ------------------------------------------------------------------------ r5466 | vasil | 2009-07-02 10:46:45 +0300 (Thu, 02 Jul 2009) | 6 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Adjust the failing innodb-autoinc test to conform to the latest behavior of the MySQL code. The idea and the comment in innodb-autoinc.test come from Sunny. ------------------------------------------------------------------------ r5488 | vasil | 2009-07-09 19:16:44 +0300 (Thu, 09 Jul 2009) | 13 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug21704.result A /branches/5.1/mysql-test/innodb_bug21704.test branches/5.1: Fix Bug#21704 Renaming column does not update FK definition by checking whether a column that participates in a FK definition is being renamed and denying the ALTER in this case. The patch was originally developed by Davi Arnaut <Davi.Arnaut@Sun.COM>: http://lists.mysql.com/commits/77714 and was later adjusted to conform to InnoDB coding style by me (Vasil), I also added some more comments and moved the bug specific mysql-test to a separate file to make it more manageable and flexible. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5499 | calvin | 2009-07-14 12:55:10 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: add a missing file in Makefile.am This change was suggested by MySQL. ------------------------------------------------------------------------ r5500 | calvin | 2009-07-14 13:03:26 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: minor change Remove an extra "with". ------------------------------------------------------------------------ r5501 | vasil | 2009-07-14 13:58:15 -0400 (Tue, 14 Jul 2009) | 5 lines branches/zip: Add @ZLIB_INCLUDES@ so that the InnoDB Plugin picks up the same zlib.h header file that is eventually used by mysqld. ------------------------------------------------------------------------ r5502 | vasil | 2009-07-14 13:59:59 -0400 (Tue, 14 Jul 2009) | 4 lines branches/zip: Add include/ut0auxconf.h to noinst_HEADERS ------------------------------------------------------------------------ r5503 | vasil | 2009-07-14 14:16:11 -0400 (Tue, 14 Jul 2009) | 8 lines branches/zip: Non-functional change: put files in noinst_HEADERS and libinnobase_a_SOURCES one per line and sort alphabetically, so it is easier to find if a file is there or not and also diffs show exactly the added or removed file instead of surrounding lines too. ------------------------------------------------------------------------ r5504 | calvin | 2009-07-15 04:58:44 -0400 (Wed, 15 Jul 2009) | 6 lines branches/zip: fix compile errors on Win64 Both srv_read_ahead_factor and srv_io_capacity should be defined as ulong. Approved by: Sunny ------------------------------------------------------------------------ r5508 | calvin | 2009-07-16 09:40:47 -0400 (Thu, 16 Jul 2009) | 16 lines branches/zip: Support inlining of functions and prefetch with Sun Studio Those changes are contributed by Sun/MySQL. Two sets of changes in this patch when Sun Studio is used: - Explicit inlining of functions - Prefetch Support This patch has been tested by Sunny with the plugin statically built in. Since we've never built the plugin as a dynamically loaded module on Solaris, it is a separate task to change plug.in. rb://142 Approved by: Heikki ------------------------------------------------------------------------ r5509 | calvin | 2009-07-16 09:45:28 -0400 (Thu, 16 Jul 2009) | 2 lines branches/zip: add ChangeLog entry for r5508. ------------------------------------------------------------------------ r5512 | sunny | 2009-07-19 19:52:48 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Remove unused extern ref to timed_mutexes. ------------------------------------------------------------------------ r5513 | sunny | 2009-07-19 19:58:43 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Undo r5512 ------------------------------------------------------------------------ r5514 | sunny | 2009-07-19 20:08:49 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Only use my_bool when UNIV_HOTBACKUP is not defined. ------------------------------------------------------------------------ r5515 | sunny | 2009-07-20 03:29:14 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: The dict_table_t::autoinc_mutex field is not used in HotBackup. ------------------------------------------------------------------------ r5516 | sunny | 2009-07-20 03:46:05 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Make this file usable from within HotBackup. A new file has been introduced called hb_univ.i. This file should have all the HotBackup specific configuration. ------------------------------------------------------------------------ r5517 | sunny | 2009-07-20 03:55:11 -0400 (Mon, 20 Jul 2009) | 2 lines Add /* UNIV_HOTBACK */ ------------------------------------------------------------------------ r5519 | vasil | 2009-07-20 04:45:18 -0400 (Mon, 20 Jul 2009) | 31 lines branches/zip: Merge r5497:5518 from branches/5.1: ------------------------------------------------------------------------ r5518 | vasil | 2009-07-20 11:29:47 +0300 (Mon, 20 Jul 2009) | 22 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2874.2.1 committer: Anurag Shekhar <anurag.shekhar@sun.com> branch nick: mysql-5.1-bugteam-windows-warning timestamp: Wed 2009-05-13 15:41:24 +0530 message: Bug #39802 On Windows, 32-bit time_t should be enforced This patch fixes compilation warning, "conversion from 'time_t' to 'ulong', possible loss of data". The fix is to typecast time_t to ulong before assigning it to ulong. Backported this from 6.0-bugteam tree. modified: storage/archive/ha_archive.cc storage/federated/ha_federated.cc storage/innobase/handler/ha_innodb.cc storage/myisam/ha_myisam.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5520 | vasil | 2009-07-20 04:51:47 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Add ChangeLog entries for r5498 and r5519. ------------------------------------------------------------------------ r5524 | inaam | 2009-07-20 12:23:15 -0400 (Mon, 20 Jul 2009) | 9 lines branches/zip Change the read ahead parameter name to innodb_read_ahead_threshold. Change the meaning of this parameter to signify the number of pages that must be sequentially accessed for InnoDB to trigger a readahead request. Suggested by: Ken ------------------------------------------------------------------------
16 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
branches/innodb+: Merge revisions 5144:5524 from branches/zip ------------------------------------------------------------------------ r5147 | marko | 2009-05-27 06:55:14 -0400 (Wed, 27 May 2009) | 1 line branches/zip: ibuf0ibuf.c: Improve a comment. ------------------------------------------------------------------------ r5149 | marko | 2009-05-27 07:46:42 -0400 (Wed, 27 May 2009) | 34 lines branches/zip: Merge revisions 4994:5148 from branches/5.1: ------------------------------------------------------------------------ r5126 | vasil | 2009-05-26 16:57:12 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Non-functional change: move FSP_* macros from fsp0fsp.h to a new file fsp0types.h. This is needed in order to be able to use FSP_EXTENT_SIZE in mtr0log.ic. ------------------------------------------------------------------------ r5127 | vasil | 2009-05-26 17:05:43 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not include unnecessary headers mtr0log.h and fut0lst.h in trx0sys.h and include fsp0fsp.h just before it is needed. This is needed in order to be able to use TRX_SYS_SPACE in mtr0log.ic. ------------------------------------------------------------------------ r5128 | vasil | 2009-05-26 17:26:37 +0300 (Tue, 26 May 2009) | 7 lines branches/5.1: Fix Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not write redo log for the pages in the doublewrite buffer. Also, do not make a dummy change to the page because this is not needed. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5169 | marko | 2009-05-28 03:21:55 -0400 (Thu, 28 May 2009) | 1 line branches/zip: mtr0mtr.h: Add Doxygen comments for the redo log entry types. ------------------------------------------------------------------------ r5176 | marko | 2009-05-28 07:14:02 -0400 (Thu, 28 May 2009) | 1 line branches/zip: Correct a debug assertion that was added in r5125. ------------------------------------------------------------------------ r5201 | marko | 2009-06-01 06:35:25 -0400 (Mon, 01 Jun 2009) | 2 lines branches/zip: Clean up some comments. Make the rec parameter of mlog_open_and_write_index() const. ------------------------------------------------------------------------ r5234 | marko | 2009-06-03 08:26:41 -0400 (Wed, 03 Jun 2009) | 44 lines branches/zip: Merge revisions 5148:5233 from branches/5.1: ------------------------------------------------------------------------ r5150 | vasil | 2009-05-27 18:56:03 +0300 (Wed, 27 May 2009) | 4 lines branches/5.1: Whitespace fixup. ------------------------------------------------------------------------ r5191 | vasil | 2009-05-30 17:46:05 +0300 (Sat, 30 May 2009) | 19 lines branches/5.1: Merge a change from MySQL (this fixes the failing innodb_mysql test): ------------------------------------------------------------ revno: 1810.3894.10 committer: Sergey Glukhov <Sergey.Glukhov@sun.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-05-19 11:32:21 +0500 message: Bug#39793 Foreign keys not constructed when column has a '#' in a comment or default value Internal InnoDN FK parser does not recognize '\'' as quotation symbol. Suggested fix is to add '\'' symbol check for quotation condition (dict_strip_comments() function). modified: innobase/dict/dict0dict.c mysql-test/r/innodb_mysql.result mysql-test/t/innodb_mysql.test ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5250 | marko | 2009-06-04 02:58:23 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add Doxygen comments to the rest of buf0*. ------------------------------------------------------------------------ r5251 | marko | 2009-06-04 02:59:51 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Replace <= in a function comment. ------------------------------------------------------------------------ r5253 | marko | 2009-06-04 06:37:35 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add missing Doxygen comments for page0zip. ------------------------------------------------------------------------ r5261 | vasil | 2009-06-05 11:13:31 -0400 (Fri, 05 Jun 2009) | 15 lines branches/zip: Fix Mantis Issue#244 fix bug in linear read ahead (no check on access pattern) The changes are: 1) Take into account access pattern when deciding whether or not to do linear read ahead. 2) Expose a knob innodb_read_ahead_factor = [0-64] default (8), dynamic, global to control linear read ahead behvior 3) Disable random read ahead. Keep the code for now. Submitted by: Inaam (rb://122) Approved by: Heikki (rb://122) ------------------------------------------------------------------------ r5262 | vasil | 2009-06-05 12:04:25 -0400 (Fri, 05 Jun 2009) | 22 lines branches/zip: Enable functionality to have multiple background io helper threads. This patch is based on percona contributions. More details about this patch will be written at: https://svn.innodb.com/innobase/MultipleBackgroundThreads The patch essentially does the following: expose following knobs: innodb_read_io_threads = [1 - 64] default 1 innodb_write_io_threads = [1 - 64] default 1 deprecate innodb_file_io_threads (this parameter was relevant only on windows) Internally it allows multiple segments for read and write IO request arrays where one thread works on one segement. Submitted by: Inaam (rb://124) Approved by: Heikki (rb://124) ------------------------------------------------------------------------ r5263 | vasil | 2009-06-05 12:19:37 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Whitespace cleanup. ------------------------------------------------------------------------ r5264 | vasil | 2009-06-05 12:26:58 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5261. ------------------------------------------------------------------------ r5265 | vasil | 2009-06-05 12:34:11 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5262. ------------------------------------------------------------------------ r5268 | inaam | 2009-06-08 12:18:21 -0400 (Mon, 08 Jun 2009) | 7 lines branches/zip Non functional change: Added legal notices acknowledging percona contribution to the multiple IO helper threads patch i.e.: r5262 ------------------------------------------------------------------------ r5283 | inaam | 2009-06-09 13:46:29 -0400 (Tue, 09 Jun 2009) | 9 lines branches/zip rb://130 Enable Group Commit functionality that was broken in 5.0 when distributed transactions were introduced. Reviewed by: Heikki ------------------------------------------------------------------------ r5319 | marko | 2009-06-11 04:40:33 -0400 (Thu, 11 Jun 2009) | 3 lines branches/zip: Declare os_thread_id_t as unsigned long, because ulint is wrong on Win64. Pointed out by Vladislav Vaintroub <wlad@sun.com>. ------------------------------------------------------------------------ r5320 | inaam | 2009-06-11 09:15:41 -0400 (Thu, 11 Jun 2009) | 14 lines branches/zip rb://131 This patch changes the following defaults: max_dirty_pages_pct: default from 90 to 75. max allowed from 100 to 99 additional_mem_pool_size: default from 1 to 8 MB buffer_pool_size: default from 8 to 128 MB log_buffer_size: default from 1 to 8 MB read_io_threads/write_io_threads: default from 1 to 4 The log file sizes are untouched because of upgrade issues Reviewed by: Heikki ------------------------------------------------------------------------ r5330 | marko | 2009-06-16 04:08:59 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_gen(): Reduce mutex holding time by adjusting buf_pool->n_pend_unzip while only holding buf_pool_mutex. ------------------------------------------------------------------------ r5331 | marko | 2009-06-16 05:00:48 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Eliminate a buf_page_get_mutex() call. The function must switch on the block state anyway. ------------------------------------------------------------------------ r5332 | vasil | 2009-06-16 05:03:27 -0400 (Tue, 16 Jun 2009) | 4 lines branches/zip: Add ChangeLog entries for r5283 and r5320. ------------------------------------------------------------------------ r5333 | marko | 2009-06-16 05:27:46 -0400 (Tue, 16 Jun 2009) | 1 line branches/zip: buf_page_io_query(): Remove unused function. ------------------------------------------------------------------------ r5335 | marko | 2009-06-16 09:23:10 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: innodb.test: Adjust the tolerance of innodb_buffer_pool_pages_total for r5320. ------------------------------------------------------------------------ r5342 | marko | 2009-06-17 06:15:32 -0400 (Wed, 17 Jun 2009) | 60 lines branches/zip: Merge revisions 5233:5341 from branches/5.1: ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5243 | sunny | 2009-06-04 03:17:14 +0300 (Thu, 04 Jun 2009) | 14 lines branches/5.1: When the InnoDB and MySQL data dictionaries go out of sync, before the bug fix we would assert on missing autoinc columns. With this fix we allow MySQL to open the table but set the next autoinc value for the column to the MAX value. This effectively disables the next value generation. INSERTs will fail with a generic AUTOINC failure. However, the user should be able to read/dump the table, set the column values explicitly, use ALTER TABLE to set the next autoinc value and/or sync the two data dictionaries to resume normal operations. Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value from the index (PRIMARY) rb://118 ------------------------------------------------------------------------ r5252 | sunny | 2009-06-04 10:16:24 +0300 (Thu, 04 Jun 2009) | 2 lines branches/5.1: The version of the result file checked in was broken in r5243. ------------------------------------------------------------------------ r5259 | vasil | 2009-06-05 10:29:16 +0300 (Fri, 05 Jun 2009) | 7 lines branches/5.1: Remove the word "Error" from the printout because the mysqltest suite interprets it as an error and thus the innodb-autoinc test fails. Approved by: Sunny (via IM) ------------------------------------------------------------------------ r5339 | marko | 2009-06-17 11:01:37 +0300 (Wed, 17 Jun 2009) | 2 lines branches/5.1: Add missing #include "mtr0log.h" so that the code compiles with -DUNIV_MUST_NOT_INLINE. (null merge; this had already been committed in branches/zip) ------------------------------------------------------------------------ r5340 | marko | 2009-06-17 12:11:49 +0300 (Wed, 17 Jun 2009) | 4 lines branches/5.1: row_unlock_for_mysql(): When the clustered index is unknown, refuse to unlock the record. (Bug #45357, caused by the fix of Bug #39320). rb://132 approved by Sunny Bains. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5343 | vasil | 2009-06-17 08:56:12 -0400 (Wed, 17 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5342. ------------------------------------------------------------------------ r5344 | marko | 2009-06-17 09:03:45 -0400 (Wed, 17 Jun 2009) | 1 line branches/zip: row_merge_read_rec(): Fix a UNIV_DEBUG bug (Bug #45426) ------------------------------------------------------------------------ r5391 | marko | 2009-06-22 05:31:35 -0400 (Mon, 22 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Fix a bogus warning about block_mutex being possibly uninitialized. ------------------------------------------------------------------------ r5392 | marko | 2009-06-22 07:58:20 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: ha_innobase::check_if_incompatible_data(): When ROW_FORMAT=DEFAULT, do not compare to get_row_type(). Without this change, fast index creation will be disabled in recent versions of MySQL 5.1. ------------------------------------------------------------------------ r5393 | pekka | 2009-06-22 09:27:55 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Minor changes for Hot Backup to build correctly. (The code bracketed between #ifdef UNIV_HOTBACKUP and #endif /* UNIV_HOTBACKUP */). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5394 | pekka | 2009-06-22 09:46:34 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Add functions for checking the format of tablespaces for Hot Backup build (UNIV_HOTBACKUP defined). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5397 | calvin | 2009-06-23 16:59:42 -0400 (Tue, 23 Jun 2009) | 7 lines branches/zip: change the header file path. Change the header file path from ../storage/innobase/include/ to ../include/. In the planned 5.1 + plugin release, the source directory of the plugin will not be in storage/innobase. Approved by: Heikki (IM) ------------------------------------------------------------------------ r5407 | calvin | 2009-06-24 09:51:08 -0400 (Wed, 24 Jun 2009) | 4 lines branches/zip: remove relative path of header files. Suggested by Marko. ------------------------------------------------------------------------ r5412 | marko | 2009-06-25 06:27:08 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: Replace a DBUG_ASSERT with ut_a to track down Issue #290. ------------------------------------------------------------------------ r5415 | marko | 2009-06-25 06:45:57 -0400 (Thu, 25 Jun 2009) | 3 lines branches/zip: dict_index_find_cols(): Print diagnostic on name mismatch. This addresses Bug #44571 but does not fix it. rb://135 approved by Sunny Bains. ------------------------------------------------------------------------ r5417 | marko | 2009-06-25 08:20:56 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: ha_innodb.cc: Move the misplaced Doxygen @file comment. ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 08:55:52 -0400 (Thu, 25 Jun 2009) | 5 lines branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ r5423 | calvin | 2009-06-26 16:52:52 -0400 (Fri, 26 Jun 2009) | 2 lines branches/zip: Fix typos. ------------------------------------------------------------------------ r5425 | marko | 2009-06-29 04:52:30 -0400 (Mon, 29 Jun 2009) | 4 lines branches/zip: ha_innobase::add_index(), ha_innobase::final_drop_index(): Start prebuilt->trx before locking the table. This should fix Issue #293 and could fix Issue #229. Approved by Sunny (over IM). ------------------------------------------------------------------------ r5426 | marko | 2009-06-29 05:24:27 -0400 (Mon, 29 Jun 2009) | 3 lines branches/zip: buf_page_get_gen(): Fix a race condition when reading buf_fix_count. This could explain Issue #156. Tested by Michael. ------------------------------------------------------------------------ r5427 | marko | 2009-06-29 05:54:53 -0400 (Mon, 29 Jun 2009) | 5 lines branches/zip: lock_print_info_all_transactions(), buf_read_recv_pages(): Tolerate missing tablespaces (zip_size==ULINT_UNDEFINED). buf_page_get_gen(): Add ut_ad(ut_is_2pow(zip_size)). Issue #289, rb://136 approved by Sunny Bains ------------------------------------------------------------------------ r5428 | marko | 2009-06-29 07:06:29 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: row_sel_store_mysql_rec(): Add missing pointer cast. Do not do arithmetics on void pointers. ------------------------------------------------------------------------ r5429 | marko | 2009-06-29 09:49:54 -0400 (Mon, 29 Jun 2009) | 13 lines branches/zip: Do not crash on SET GLOBAL innodb_file_format=DEFAULT or SET GLOBAL innodb_file_format_check=DEFAULT. innodb_file_format.test: New test for innodb_file_format and innodb_file_format_check. innodb_file_format_name_validate(): Store the string in *save. innodb_file_format_name_update(): Check the string again. innodb_file_format_check_validate(): Store the string in *save. innodb_file_format_check_update(): Check the string again. Issue #282, rb://140 approved by Heikki Tuuri ------------------------------------------------------------------------ r5430 | marko | 2009-06-29 09:58:07 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: lock_rec_validate_page(): Add another assertion to track down Issue #289. ------------------------------------------------------------------------ r5431 | marko | 2009-06-29 09:58:40 -0400 (Mon, 29 Jun 2009) | 1 line branches/zip: Revert an accidentally made change in r5430 to univ.i. ------------------------------------------------------------------------ r5437 | marko | 2009-06-30 05:10:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: ibuf_dummy_index_free(): Beautify the comment. ------------------------------------------------------------------------ r5438 | marko | 2009-06-30 05:10:32 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: fseg_free(): Remove this unused function. ------------------------------------------------------------------------ r5439 | marko | 2009-06-30 05:15:22 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: fseg_validate(): Enclose in #ifdef UNIV_DEBUG. This function is unused, but it could turn out to be a useful debugging aid. ------------------------------------------------------------------------ r5441 | marko | 2009-06-30 06:30:14 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: ha_delete(): Remove this unused function that was very similar to ha_search_and_delete_if_found(). ------------------------------------------------------------------------ r5442 | marko | 2009-06-30 06:45:41 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: lock_is_on_table(), lock_table_unlock(): Unused, remove. ------------------------------------------------------------------------ r5443 | marko | 2009-06-30 07:03:00 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_event_create_auto(): Unused, remove. ------------------------------------------------------------------------ r5444 | marko | 2009-06-30 07:19:49 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: que_graph_try_free(): Unused, remove. ------------------------------------------------------------------------ r5445 | marko | 2009-06-30 07:28:11 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: row_build_row_ref_from_row(): Unused, remove. ------------------------------------------------------------------------ r5446 | marko | 2009-06-30 07:35:45 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_round_robin(), srv_que_task_enqueue(): Unused, remove. ------------------------------------------------------------------------ r5447 | marko | 2009-06-30 07:37:58 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_task_queue_check(): Unused, remove. ------------------------------------------------------------------------ r5448 | marko | 2009-06-30 07:56:36 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: mem_heap_cat(): Unused, remove. ------------------------------------------------------------------------ r5449 | marko | 2009-06-30 08:00:50 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: innobase_start_or_create_for_mysql(): Invoke os_get_os_version() at most once. ------------------------------------------------------------------------ r5450 | marko | 2009-06-30 08:02:20 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_file_close_no_error_handling(): Unused, remove. ------------------------------------------------------------------------ r5451 | marko | 2009-06-30 08:09:49 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: page_set_max_trx_id(): Make the code compile with UNIV_HOTBACKUP. ------------------------------------------------------------------------ r5452 | marko | 2009-06-30 08:10:26 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: os_file_close_no_error_handling(): Restore, as this function is used within InnoDB Hot Backup. ------------------------------------------------------------------------ r5453 | marko | 2009-06-30 08:14:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_process_set_priority_boost(): Unused, remove. ------------------------------------------------------------------------ r5454 | marko | 2009-06-30 08:42:52 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: Replace a non-ASCII character (ISO 8859-1 encoded U+00AD SOFT HYPHEN) with a cheap ASCII substitute. ------------------------------------------------------------------------ r5456 | inaam | 2009-06-30 14:21:09 -0400 (Tue, 30 Jun 2009) | 4 lines branches/zip Non functional change. s/Percona/Percona Inc./ ------------------------------------------------------------------------ r5470 | vasil | 2009-07-02 09:12:36 -0400 (Thu, 02 Jul 2009) | 16 lines branches/zip: Use PAUSE instruction inside spinloop if it is available. The patch was originally developed by Mikael Ronstrom <mikael@mysql.com> and can be found here: http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2768 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2771 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2772 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2774 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2777 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2799 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2800 Approved by: Heikki (rb://137) ------------------------------------------------------------------------ r5481 | vasil | 2009-07-06 13:16:32 -0400 (Mon, 06 Jul 2009) | 4 lines branches/zip: Remove unnecessary quotes and simplify plug.in. ------------------------------------------------------------------------ r5482 | calvin | 2009-07-06 18:36:35 -0400 (Mon, 06 Jul 2009) | 5 lines branches/zip: add COPYING files for Percona and Sun Micro. 1.0.4 contains patches based on contributions from Percona and Sun Microsystems. ------------------------------------------------------------------------ r5483 | calvin | 2009-07-07 05:36:43 -0400 (Tue, 07 Jul 2009) | 3 lines branches/zip: add IB_HAVE_PAUSE_INSTRUCTION to CMake. Windows will support PAUSE instruction by default. ------------------------------------------------------------------------ r5484 | inaam | 2009-07-07 18:57:14 -0400 (Tue, 07 Jul 2009) | 13 lines branches/zip rb://126 Based on contribution from Google Inc. This patch introduces a new parameter innodb_io_capacity to control the rate at which master threads performs various tasks. The default value is 200 and higher values imply more aggressive flushing and ibuf merges from within the master thread. This patch also changes the ibuf merge from synchronous to asynchronous. Another minor change is not to force the master thread to wait for a log flush to complete every second. Approved by: Heikki ------------------------------------------------------------------------ r5485 | inaam | 2009-07-07 19:00:49 -0400 (Tue, 07 Jul 2009) | 18 lines branches/zip rb://138 The current implementation is to try to flush the neighbors of every page that we flush. This patch makes the following distinction: 1) If the flush is from flush_list AND 2) If the flush is intended to move the oldest_modification LSN ahead (this happens when a user thread sees little space in the log file and attempts to flush pages from the buffer pool so that a checkpoint can be made) THEN Do not try to flush the neighbors. Just focus on flushing dirty pages at the end of flush_list Approved by: Heikki ------------------------------------------------------------------------ r5486 | inaam | 2009-07-08 12:11:40 -0400 (Wed, 08 Jul 2009) | 29 lines branches/zip rb://133 This patch introduces heuristics based flushing rate of dirty pages to avoid IO bursts at checkpoint. 1) log_capacity / log_generated per second gives us number of seconds in which ALL dirty pages need to be flushed. Based on this rough assumption we can say that n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate 2) We use weighted averages (hard coded to 20 seconds) of log_generation_rate to avoid resonance. 3) From the desired_flush_rate we subtract the number of pages that have been flushed due to LRU flushing. That gives us pages that we should flush as part of flush_list cleanup. And that is the number (capped by maximum io_capacity) that we try to flush from the master thread. Knobs: ====== innodb_adaptive_flushing: boolean, global, dynamic, default TRUE. Since this heuristic is very experimental and has the potential to dramatically change the IO pattern I think it is a good idea to leave a knob to turn it off. Approved by: Heikki ------------------------------------------------------------------------ r5487 | calvin | 2009-07-08 12:42:28 -0400 (Wed, 08 Jul 2009) | 7 lines branches/zip: fix PAUSE instruction patch on Windows The original PAUSE instruction patch (r5470) does not compile on Windows. Also, there is an elegant way of doing it on Windows - YieldProcessor(). Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5489 | vasil | 2009-07-10 05:02:22 -0400 (Fri, 10 Jul 2009) | 9 lines branches/zip: Change the defaults for innodb_sync_spin_loops: 20 -> 30 innodb_spin_wait_delay: 5 -> 6 This change was proposed by Sun/MySQL based on their performance testing, see https://svn.innodb.com/innobase/Release_tasks_for_InnoDB_Plugin_V1.0.4 ------------------------------------------------------------------------ r5490 | vasil | 2009-07-10 05:04:20 -0400 (Fri, 10 Jul 2009) | 4 lines branches/zip: Add ChangeLog entry for 5489. ------------------------------------------------------------------------ r5491 | calvin | 2009-07-10 12:19:17 -0400 (Fri, 10 Jul 2009) | 6 lines branches/zip: add copyright info to files related to PAUSE instruction patch, contributed by Sun Microsystems. ------------------------------------------------------------------------ r5492 | calvin | 2009-07-10 17:47:34 -0400 (Fri, 10 Jul 2009) | 5 lines branches/zip: add ChangeLog entries for r5484-r5486. ------------------------------------------------------------------------ r5494 | vasil | 2009-07-13 03:37:35 -0400 (Mon, 13 Jul 2009) | 6 lines branches/zip: Restore the original value of innodb_sync_spin_loops at the end, previously the test assumed that setting it to 20 will do this, but now the default is 30 and MTR's internal check failed. ------------------------------------------------------------------------ r5495 | inaam | 2009-07-13 11:48:45 -0400 (Mon, 13 Jul 2009) | 5 lines branches/zip rb://138 (REVERT) Revert the flush neighbors patch as it shows regression in the benchmarks run by Michael. ------------------------------------------------------------------------ r5496 | inaam | 2009-07-13 14:04:57 -0400 (Mon, 13 Jul 2009) | 4 lines branches/zip Fixed warnings on windows where ulint != ib_uint64_t ------------------------------------------------------------------------ r5497 | calvin | 2009-07-13 15:01:00 -0400 (Mon, 13 Jul 2009) | 9 lines branches/zip: fix run-time symbols clash on Solaris. This patch is from Sergey Vojtovich of Sun Microsystems, to fix run-time symbols clash on Solaris with older C++ compiler: - when finding out a way to hide symbols, make decision basing on compiler, not operating system. - Sun Studio supports __hidden declaration specifier for this purpose. ------------------------------------------------------------------------ r5498 | vasil | 2009-07-14 03:16:18 -0400 (Tue, 14 Jul 2009) | 92 lines branches/zip: Merge r5341:5497 from branches/5.1, skipping: c5419 because it is merge from branches/zip into branches/5.1 c5466 because the source code has been adjusted to match the MySQL behavior and the innodb-autoinc test does not fail in branches/zip, if c5466 is merged, then innodb-autoinc starts failing, Sunny suggested not to merge c5466. and resolving conflicts in c5410, c5440, c5488: ------------------------------------------------------------------------ r5410 | marko | 2009-06-24 22:26:34 +0300 (Wed, 24 Jun 2009) | 2 lines Changed paths: M /branches/5.1/include/trx0sys.ic M /branches/5.1/trx/trx0purge.c M /branches/5.1/trx/trx0sys.c M /branches/5.1/trx/trx0undo.c branches/5.1: Add missing #include "mtr0log.h" to avoid warnings when compiling with -DUNIV_MUST_NOT_INLINE. ------------------------------------------------------------------------ r5419 | marko | 2009-06-25 16:11:57 +0300 (Thu, 25 Jun 2009) | 18 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug42101-nonzero.result M /branches/5.1/mysql-test/innodb_bug42101-nonzero.test M /branches/5.1/mysql-test/innodb_bug42101.result M /branches/5.1/mysql-test/innodb_bug42101.test branches/5.1: Merge r5418 from branches/zip: ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 15:55:52 +0300 (Thu, 25 Jun 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5440 | vasil | 2009-06-30 13:04:29 +0300 (Tue, 30 Jun 2009) | 8 lines Changed paths: M /branches/5.1/fil/fil0fil.c branches/5.1: Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to match documentation by changing the URL from http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html to http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting-datadict.html ------------------------------------------------------------------------ r5466 | vasil | 2009-07-02 10:46:45 +0300 (Thu, 02 Jul 2009) | 6 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Adjust the failing innodb-autoinc test to conform to the latest behavior of the MySQL code. The idea and the comment in innodb-autoinc.test come from Sunny. ------------------------------------------------------------------------ r5488 | vasil | 2009-07-09 19:16:44 +0300 (Thu, 09 Jul 2009) | 13 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug21704.result A /branches/5.1/mysql-test/innodb_bug21704.test branches/5.1: Fix Bug#21704 Renaming column does not update FK definition by checking whether a column that participates in a FK definition is being renamed and denying the ALTER in this case. The patch was originally developed by Davi Arnaut <Davi.Arnaut@Sun.COM>: http://lists.mysql.com/commits/77714 and was later adjusted to conform to InnoDB coding style by me (Vasil), I also added some more comments and moved the bug specific mysql-test to a separate file to make it more manageable and flexible. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5499 | calvin | 2009-07-14 12:55:10 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: add a missing file in Makefile.am This change was suggested by MySQL. ------------------------------------------------------------------------ r5500 | calvin | 2009-07-14 13:03:26 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: minor change Remove an extra "with". ------------------------------------------------------------------------ r5501 | vasil | 2009-07-14 13:58:15 -0400 (Tue, 14 Jul 2009) | 5 lines branches/zip: Add @ZLIB_INCLUDES@ so that the InnoDB Plugin picks up the same zlib.h header file that is eventually used by mysqld. ------------------------------------------------------------------------ r5502 | vasil | 2009-07-14 13:59:59 -0400 (Tue, 14 Jul 2009) | 4 lines branches/zip: Add include/ut0auxconf.h to noinst_HEADERS ------------------------------------------------------------------------ r5503 | vasil | 2009-07-14 14:16:11 -0400 (Tue, 14 Jul 2009) | 8 lines branches/zip: Non-functional change: put files in noinst_HEADERS and libinnobase_a_SOURCES one per line and sort alphabetically, so it is easier to find if a file is there or not and also diffs show exactly the added or removed file instead of surrounding lines too. ------------------------------------------------------------------------ r5504 | calvin | 2009-07-15 04:58:44 -0400 (Wed, 15 Jul 2009) | 6 lines branches/zip: fix compile errors on Win64 Both srv_read_ahead_factor and srv_io_capacity should be defined as ulong. Approved by: Sunny ------------------------------------------------------------------------ r5508 | calvin | 2009-07-16 09:40:47 -0400 (Thu, 16 Jul 2009) | 16 lines branches/zip: Support inlining of functions and prefetch with Sun Studio Those changes are contributed by Sun/MySQL. Two sets of changes in this patch when Sun Studio is used: - Explicit inlining of functions - Prefetch Support This patch has been tested by Sunny with the plugin statically built in. Since we've never built the plugin as a dynamically loaded module on Solaris, it is a separate task to change plug.in. rb://142 Approved by: Heikki ------------------------------------------------------------------------ r5509 | calvin | 2009-07-16 09:45:28 -0400 (Thu, 16 Jul 2009) | 2 lines branches/zip: add ChangeLog entry for r5508. ------------------------------------------------------------------------ r5512 | sunny | 2009-07-19 19:52:48 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Remove unused extern ref to timed_mutexes. ------------------------------------------------------------------------ r5513 | sunny | 2009-07-19 19:58:43 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Undo r5512 ------------------------------------------------------------------------ r5514 | sunny | 2009-07-19 20:08:49 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Only use my_bool when UNIV_HOTBACKUP is not defined. ------------------------------------------------------------------------ r5515 | sunny | 2009-07-20 03:29:14 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: The dict_table_t::autoinc_mutex field is not used in HotBackup. ------------------------------------------------------------------------ r5516 | sunny | 2009-07-20 03:46:05 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Make this file usable from within HotBackup. A new file has been introduced called hb_univ.i. This file should have all the HotBackup specific configuration. ------------------------------------------------------------------------ r5517 | sunny | 2009-07-20 03:55:11 -0400 (Mon, 20 Jul 2009) | 2 lines Add /* UNIV_HOTBACK */ ------------------------------------------------------------------------ r5519 | vasil | 2009-07-20 04:45:18 -0400 (Mon, 20 Jul 2009) | 31 lines branches/zip: Merge r5497:5518 from branches/5.1: ------------------------------------------------------------------------ r5518 | vasil | 2009-07-20 11:29:47 +0300 (Mon, 20 Jul 2009) | 22 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2874.2.1 committer: Anurag Shekhar <anurag.shekhar@sun.com> branch nick: mysql-5.1-bugteam-windows-warning timestamp: Wed 2009-05-13 15:41:24 +0530 message: Bug #39802 On Windows, 32-bit time_t should be enforced This patch fixes compilation warning, "conversion from 'time_t' to 'ulong', possible loss of data". The fix is to typecast time_t to ulong before assigning it to ulong. Backported this from 6.0-bugteam tree. modified: storage/archive/ha_archive.cc storage/federated/ha_federated.cc storage/innobase/handler/ha_innodb.cc storage/myisam/ha_myisam.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5520 | vasil | 2009-07-20 04:51:47 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Add ChangeLog entries for r5498 and r5519. ------------------------------------------------------------------------ r5524 | inaam | 2009-07-20 12:23:15 -0400 (Mon, 20 Jul 2009) | 9 lines branches/zip Change the read ahead parameter name to innodb_read_ahead_threshold. Change the meaning of this parameter to signify the number of pages that must be sequentially accessed for InnoDB to trigger a readahead request. Suggested by: Ken ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 5144:5524 from branches/zip ------------------------------------------------------------------------ r5147 | marko | 2009-05-27 06:55:14 -0400 (Wed, 27 May 2009) | 1 line branches/zip: ibuf0ibuf.c: Improve a comment. ------------------------------------------------------------------------ r5149 | marko | 2009-05-27 07:46:42 -0400 (Wed, 27 May 2009) | 34 lines branches/zip: Merge revisions 4994:5148 from branches/5.1: ------------------------------------------------------------------------ r5126 | vasil | 2009-05-26 16:57:12 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Non-functional change: move FSP_* macros from fsp0fsp.h to a new file fsp0types.h. This is needed in order to be able to use FSP_EXTENT_SIZE in mtr0log.ic. ------------------------------------------------------------------------ r5127 | vasil | 2009-05-26 17:05:43 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not include unnecessary headers mtr0log.h and fut0lst.h in trx0sys.h and include fsp0fsp.h just before it is needed. This is needed in order to be able to use TRX_SYS_SPACE in mtr0log.ic. ------------------------------------------------------------------------ r5128 | vasil | 2009-05-26 17:26:37 +0300 (Tue, 26 May 2009) | 7 lines branches/5.1: Fix Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not write redo log for the pages in the doublewrite buffer. Also, do not make a dummy change to the page because this is not needed. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5169 | marko | 2009-05-28 03:21:55 -0400 (Thu, 28 May 2009) | 1 line branches/zip: mtr0mtr.h: Add Doxygen comments for the redo log entry types. ------------------------------------------------------------------------ r5176 | marko | 2009-05-28 07:14:02 -0400 (Thu, 28 May 2009) | 1 line branches/zip: Correct a debug assertion that was added in r5125. ------------------------------------------------------------------------ r5201 | marko | 2009-06-01 06:35:25 -0400 (Mon, 01 Jun 2009) | 2 lines branches/zip: Clean up some comments. Make the rec parameter of mlog_open_and_write_index() const. ------------------------------------------------------------------------ r5234 | marko | 2009-06-03 08:26:41 -0400 (Wed, 03 Jun 2009) | 44 lines branches/zip: Merge revisions 5148:5233 from branches/5.1: ------------------------------------------------------------------------ r5150 | vasil | 2009-05-27 18:56:03 +0300 (Wed, 27 May 2009) | 4 lines branches/5.1: Whitespace fixup. ------------------------------------------------------------------------ r5191 | vasil | 2009-05-30 17:46:05 +0300 (Sat, 30 May 2009) | 19 lines branches/5.1: Merge a change from MySQL (this fixes the failing innodb_mysql test): ------------------------------------------------------------ revno: 1810.3894.10 committer: Sergey Glukhov <Sergey.Glukhov@sun.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-05-19 11:32:21 +0500 message: Bug#39793 Foreign keys not constructed when column has a '#' in a comment or default value Internal InnoDN FK parser does not recognize '\'' as quotation symbol. Suggested fix is to add '\'' symbol check for quotation condition (dict_strip_comments() function). modified: innobase/dict/dict0dict.c mysql-test/r/innodb_mysql.result mysql-test/t/innodb_mysql.test ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5250 | marko | 2009-06-04 02:58:23 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add Doxygen comments to the rest of buf0*. ------------------------------------------------------------------------ r5251 | marko | 2009-06-04 02:59:51 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Replace <= in a function comment. ------------------------------------------------------------------------ r5253 | marko | 2009-06-04 06:37:35 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add missing Doxygen comments for page0zip. ------------------------------------------------------------------------ r5261 | vasil | 2009-06-05 11:13:31 -0400 (Fri, 05 Jun 2009) | 15 lines branches/zip: Fix Mantis Issue#244 fix bug in linear read ahead (no check on access pattern) The changes are: 1) Take into account access pattern when deciding whether or not to do linear read ahead. 2) Expose a knob innodb_read_ahead_factor = [0-64] default (8), dynamic, global to control linear read ahead behvior 3) Disable random read ahead. Keep the code for now. Submitted by: Inaam (rb://122) Approved by: Heikki (rb://122) ------------------------------------------------------------------------ r5262 | vasil | 2009-06-05 12:04:25 -0400 (Fri, 05 Jun 2009) | 22 lines branches/zip: Enable functionality to have multiple background io helper threads. This patch is based on percona contributions. More details about this patch will be written at: https://svn.innodb.com/innobase/MultipleBackgroundThreads The patch essentially does the following: expose following knobs: innodb_read_io_threads = [1 - 64] default 1 innodb_write_io_threads = [1 - 64] default 1 deprecate innodb_file_io_threads (this parameter was relevant only on windows) Internally it allows multiple segments for read and write IO request arrays where one thread works on one segement. Submitted by: Inaam (rb://124) Approved by: Heikki (rb://124) ------------------------------------------------------------------------ r5263 | vasil | 2009-06-05 12:19:37 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Whitespace cleanup. ------------------------------------------------------------------------ r5264 | vasil | 2009-06-05 12:26:58 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5261. ------------------------------------------------------------------------ r5265 | vasil | 2009-06-05 12:34:11 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5262. ------------------------------------------------------------------------ r5268 | inaam | 2009-06-08 12:18:21 -0400 (Mon, 08 Jun 2009) | 7 lines branches/zip Non functional change: Added legal notices acknowledging percona contribution to the multiple IO helper threads patch i.e.: r5262 ------------------------------------------------------------------------ r5283 | inaam | 2009-06-09 13:46:29 -0400 (Tue, 09 Jun 2009) | 9 lines branches/zip rb://130 Enable Group Commit functionality that was broken in 5.0 when distributed transactions were introduced. Reviewed by: Heikki ------------------------------------------------------------------------ r5319 | marko | 2009-06-11 04:40:33 -0400 (Thu, 11 Jun 2009) | 3 lines branches/zip: Declare os_thread_id_t as unsigned long, because ulint is wrong on Win64. Pointed out by Vladislav Vaintroub <wlad@sun.com>. ------------------------------------------------------------------------ r5320 | inaam | 2009-06-11 09:15:41 -0400 (Thu, 11 Jun 2009) | 14 lines branches/zip rb://131 This patch changes the following defaults: max_dirty_pages_pct: default from 90 to 75. max allowed from 100 to 99 additional_mem_pool_size: default from 1 to 8 MB buffer_pool_size: default from 8 to 128 MB log_buffer_size: default from 1 to 8 MB read_io_threads/write_io_threads: default from 1 to 4 The log file sizes are untouched because of upgrade issues Reviewed by: Heikki ------------------------------------------------------------------------ r5330 | marko | 2009-06-16 04:08:59 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_gen(): Reduce mutex holding time by adjusting buf_pool->n_pend_unzip while only holding buf_pool_mutex. ------------------------------------------------------------------------ r5331 | marko | 2009-06-16 05:00:48 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Eliminate a buf_page_get_mutex() call. The function must switch on the block state anyway. ------------------------------------------------------------------------ r5332 | vasil | 2009-06-16 05:03:27 -0400 (Tue, 16 Jun 2009) | 4 lines branches/zip: Add ChangeLog entries for r5283 and r5320. ------------------------------------------------------------------------ r5333 | marko | 2009-06-16 05:27:46 -0400 (Tue, 16 Jun 2009) | 1 line branches/zip: buf_page_io_query(): Remove unused function. ------------------------------------------------------------------------ r5335 | marko | 2009-06-16 09:23:10 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: innodb.test: Adjust the tolerance of innodb_buffer_pool_pages_total for r5320. ------------------------------------------------------------------------ r5342 | marko | 2009-06-17 06:15:32 -0400 (Wed, 17 Jun 2009) | 60 lines branches/zip: Merge revisions 5233:5341 from branches/5.1: ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5243 | sunny | 2009-06-04 03:17:14 +0300 (Thu, 04 Jun 2009) | 14 lines branches/5.1: When the InnoDB and MySQL data dictionaries go out of sync, before the bug fix we would assert on missing autoinc columns. With this fix we allow MySQL to open the table but set the next autoinc value for the column to the MAX value. This effectively disables the next value generation. INSERTs will fail with a generic AUTOINC failure. However, the user should be able to read/dump the table, set the column values explicitly, use ALTER TABLE to set the next autoinc value and/or sync the two data dictionaries to resume normal operations. Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value from the index (PRIMARY) rb://118 ------------------------------------------------------------------------ r5252 | sunny | 2009-06-04 10:16:24 +0300 (Thu, 04 Jun 2009) | 2 lines branches/5.1: The version of the result file checked in was broken in r5243. ------------------------------------------------------------------------ r5259 | vasil | 2009-06-05 10:29:16 +0300 (Fri, 05 Jun 2009) | 7 lines branches/5.1: Remove the word "Error" from the printout because the mysqltest suite interprets it as an error and thus the innodb-autoinc test fails. Approved by: Sunny (via IM) ------------------------------------------------------------------------ r5339 | marko | 2009-06-17 11:01:37 +0300 (Wed, 17 Jun 2009) | 2 lines branches/5.1: Add missing #include "mtr0log.h" so that the code compiles with -DUNIV_MUST_NOT_INLINE. (null merge; this had already been committed in branches/zip) ------------------------------------------------------------------------ r5340 | marko | 2009-06-17 12:11:49 +0300 (Wed, 17 Jun 2009) | 4 lines branches/5.1: row_unlock_for_mysql(): When the clustered index is unknown, refuse to unlock the record. (Bug #45357, caused by the fix of Bug #39320). rb://132 approved by Sunny Bains. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5343 | vasil | 2009-06-17 08:56:12 -0400 (Wed, 17 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5342. ------------------------------------------------------------------------ r5344 | marko | 2009-06-17 09:03:45 -0400 (Wed, 17 Jun 2009) | 1 line branches/zip: row_merge_read_rec(): Fix a UNIV_DEBUG bug (Bug #45426) ------------------------------------------------------------------------ r5391 | marko | 2009-06-22 05:31:35 -0400 (Mon, 22 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Fix a bogus warning about block_mutex being possibly uninitialized. ------------------------------------------------------------------------ r5392 | marko | 2009-06-22 07:58:20 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: ha_innobase::check_if_incompatible_data(): When ROW_FORMAT=DEFAULT, do not compare to get_row_type(). Without this change, fast index creation will be disabled in recent versions of MySQL 5.1. ------------------------------------------------------------------------ r5393 | pekka | 2009-06-22 09:27:55 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Minor changes for Hot Backup to build correctly. (The code bracketed between #ifdef UNIV_HOTBACKUP and #endif /* UNIV_HOTBACKUP */). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5394 | pekka | 2009-06-22 09:46:34 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Add functions for checking the format of tablespaces for Hot Backup build (UNIV_HOTBACKUP defined). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5397 | calvin | 2009-06-23 16:59:42 -0400 (Tue, 23 Jun 2009) | 7 lines branches/zip: change the header file path. Change the header file path from ../storage/innobase/include/ to ../include/. In the planned 5.1 + plugin release, the source directory of the plugin will not be in storage/innobase. Approved by: Heikki (IM) ------------------------------------------------------------------------ r5407 | calvin | 2009-06-24 09:51:08 -0400 (Wed, 24 Jun 2009) | 4 lines branches/zip: remove relative path of header files. Suggested by Marko. ------------------------------------------------------------------------ r5412 | marko | 2009-06-25 06:27:08 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: Replace a DBUG_ASSERT with ut_a to track down Issue #290. ------------------------------------------------------------------------ r5415 | marko | 2009-06-25 06:45:57 -0400 (Thu, 25 Jun 2009) | 3 lines branches/zip: dict_index_find_cols(): Print diagnostic on name mismatch. This addresses Bug #44571 but does not fix it. rb://135 approved by Sunny Bains. ------------------------------------------------------------------------ r5417 | marko | 2009-06-25 08:20:56 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: ha_innodb.cc: Move the misplaced Doxygen @file comment. ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 08:55:52 -0400 (Thu, 25 Jun 2009) | 5 lines branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ r5423 | calvin | 2009-06-26 16:52:52 -0400 (Fri, 26 Jun 2009) | 2 lines branches/zip: Fix typos. ------------------------------------------------------------------------ r5425 | marko | 2009-06-29 04:52:30 -0400 (Mon, 29 Jun 2009) | 4 lines branches/zip: ha_innobase::add_index(), ha_innobase::final_drop_index(): Start prebuilt->trx before locking the table. This should fix Issue #293 and could fix Issue #229. Approved by Sunny (over IM). ------------------------------------------------------------------------ r5426 | marko | 2009-06-29 05:24:27 -0400 (Mon, 29 Jun 2009) | 3 lines branches/zip: buf_page_get_gen(): Fix a race condition when reading buf_fix_count. This could explain Issue #156. Tested by Michael. ------------------------------------------------------------------------ r5427 | marko | 2009-06-29 05:54:53 -0400 (Mon, 29 Jun 2009) | 5 lines branches/zip: lock_print_info_all_transactions(), buf_read_recv_pages(): Tolerate missing tablespaces (zip_size==ULINT_UNDEFINED). buf_page_get_gen(): Add ut_ad(ut_is_2pow(zip_size)). Issue #289, rb://136 approved by Sunny Bains ------------------------------------------------------------------------ r5428 | marko | 2009-06-29 07:06:29 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: row_sel_store_mysql_rec(): Add missing pointer cast. Do not do arithmetics on void pointers. ------------------------------------------------------------------------ r5429 | marko | 2009-06-29 09:49:54 -0400 (Mon, 29 Jun 2009) | 13 lines branches/zip: Do not crash on SET GLOBAL innodb_file_format=DEFAULT or SET GLOBAL innodb_file_format_check=DEFAULT. innodb_file_format.test: New test for innodb_file_format and innodb_file_format_check. innodb_file_format_name_validate(): Store the string in *save. innodb_file_format_name_update(): Check the string again. innodb_file_format_check_validate(): Store the string in *save. innodb_file_format_check_update(): Check the string again. Issue #282, rb://140 approved by Heikki Tuuri ------------------------------------------------------------------------ r5430 | marko | 2009-06-29 09:58:07 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: lock_rec_validate_page(): Add another assertion to track down Issue #289. ------------------------------------------------------------------------ r5431 | marko | 2009-06-29 09:58:40 -0400 (Mon, 29 Jun 2009) | 1 line branches/zip: Revert an accidentally made change in r5430 to univ.i. ------------------------------------------------------------------------ r5437 | marko | 2009-06-30 05:10:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: ibuf_dummy_index_free(): Beautify the comment. ------------------------------------------------------------------------ r5438 | marko | 2009-06-30 05:10:32 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: fseg_free(): Remove this unused function. ------------------------------------------------------------------------ r5439 | marko | 2009-06-30 05:15:22 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: fseg_validate(): Enclose in #ifdef UNIV_DEBUG. This function is unused, but it could turn out to be a useful debugging aid. ------------------------------------------------------------------------ r5441 | marko | 2009-06-30 06:30:14 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: ha_delete(): Remove this unused function that was very similar to ha_search_and_delete_if_found(). ------------------------------------------------------------------------ r5442 | marko | 2009-06-30 06:45:41 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: lock_is_on_table(), lock_table_unlock(): Unused, remove. ------------------------------------------------------------------------ r5443 | marko | 2009-06-30 07:03:00 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_event_create_auto(): Unused, remove. ------------------------------------------------------------------------ r5444 | marko | 2009-06-30 07:19:49 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: que_graph_try_free(): Unused, remove. ------------------------------------------------------------------------ r5445 | marko | 2009-06-30 07:28:11 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: row_build_row_ref_from_row(): Unused, remove. ------------------------------------------------------------------------ r5446 | marko | 2009-06-30 07:35:45 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_round_robin(), srv_que_task_enqueue(): Unused, remove. ------------------------------------------------------------------------ r5447 | marko | 2009-06-30 07:37:58 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_task_queue_check(): Unused, remove. ------------------------------------------------------------------------ r5448 | marko | 2009-06-30 07:56:36 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: mem_heap_cat(): Unused, remove. ------------------------------------------------------------------------ r5449 | marko | 2009-06-30 08:00:50 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: innobase_start_or_create_for_mysql(): Invoke os_get_os_version() at most once. ------------------------------------------------------------------------ r5450 | marko | 2009-06-30 08:02:20 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_file_close_no_error_handling(): Unused, remove. ------------------------------------------------------------------------ r5451 | marko | 2009-06-30 08:09:49 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: page_set_max_trx_id(): Make the code compile with UNIV_HOTBACKUP. ------------------------------------------------------------------------ r5452 | marko | 2009-06-30 08:10:26 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: os_file_close_no_error_handling(): Restore, as this function is used within InnoDB Hot Backup. ------------------------------------------------------------------------ r5453 | marko | 2009-06-30 08:14:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_process_set_priority_boost(): Unused, remove. ------------------------------------------------------------------------ r5454 | marko | 2009-06-30 08:42:52 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: Replace a non-ASCII character (ISO 8859-1 encoded U+00AD SOFT HYPHEN) with a cheap ASCII substitute. ------------------------------------------------------------------------ r5456 | inaam | 2009-06-30 14:21:09 -0400 (Tue, 30 Jun 2009) | 4 lines branches/zip Non functional change. s/Percona/Percona Inc./ ------------------------------------------------------------------------ r5470 | vasil | 2009-07-02 09:12:36 -0400 (Thu, 02 Jul 2009) | 16 lines branches/zip: Use PAUSE instruction inside spinloop if it is available. The patch was originally developed by Mikael Ronstrom <mikael@mysql.com> and can be found here: http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2768 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2771 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2772 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2774 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2777 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2799 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2800 Approved by: Heikki (rb://137) ------------------------------------------------------------------------ r5481 | vasil | 2009-07-06 13:16:32 -0400 (Mon, 06 Jul 2009) | 4 lines branches/zip: Remove unnecessary quotes and simplify plug.in. ------------------------------------------------------------------------ r5482 | calvin | 2009-07-06 18:36:35 -0400 (Mon, 06 Jul 2009) | 5 lines branches/zip: add COPYING files for Percona and Sun Micro. 1.0.4 contains patches based on contributions from Percona and Sun Microsystems. ------------------------------------------------------------------------ r5483 | calvin | 2009-07-07 05:36:43 -0400 (Tue, 07 Jul 2009) | 3 lines branches/zip: add IB_HAVE_PAUSE_INSTRUCTION to CMake. Windows will support PAUSE instruction by default. ------------------------------------------------------------------------ r5484 | inaam | 2009-07-07 18:57:14 -0400 (Tue, 07 Jul 2009) | 13 lines branches/zip rb://126 Based on contribution from Google Inc. This patch introduces a new parameter innodb_io_capacity to control the rate at which master threads performs various tasks. The default value is 200 and higher values imply more aggressive flushing and ibuf merges from within the master thread. This patch also changes the ibuf merge from synchronous to asynchronous. Another minor change is not to force the master thread to wait for a log flush to complete every second. Approved by: Heikki ------------------------------------------------------------------------ r5485 | inaam | 2009-07-07 19:00:49 -0400 (Tue, 07 Jul 2009) | 18 lines branches/zip rb://138 The current implementation is to try to flush the neighbors of every page that we flush. This patch makes the following distinction: 1) If the flush is from flush_list AND 2) If the flush is intended to move the oldest_modification LSN ahead (this happens when a user thread sees little space in the log file and attempts to flush pages from the buffer pool so that a checkpoint can be made) THEN Do not try to flush the neighbors. Just focus on flushing dirty pages at the end of flush_list Approved by: Heikki ------------------------------------------------------------------------ r5486 | inaam | 2009-07-08 12:11:40 -0400 (Wed, 08 Jul 2009) | 29 lines branches/zip rb://133 This patch introduces heuristics based flushing rate of dirty pages to avoid IO bursts at checkpoint. 1) log_capacity / log_generated per second gives us number of seconds in which ALL dirty pages need to be flushed. Based on this rough assumption we can say that n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate 2) We use weighted averages (hard coded to 20 seconds) of log_generation_rate to avoid resonance. 3) From the desired_flush_rate we subtract the number of pages that have been flushed due to LRU flushing. That gives us pages that we should flush as part of flush_list cleanup. And that is the number (capped by maximum io_capacity) that we try to flush from the master thread. Knobs: ====== innodb_adaptive_flushing: boolean, global, dynamic, default TRUE. Since this heuristic is very experimental and has the potential to dramatically change the IO pattern I think it is a good idea to leave a knob to turn it off. Approved by: Heikki ------------------------------------------------------------------------ r5487 | calvin | 2009-07-08 12:42:28 -0400 (Wed, 08 Jul 2009) | 7 lines branches/zip: fix PAUSE instruction patch on Windows The original PAUSE instruction patch (r5470) does not compile on Windows. Also, there is an elegant way of doing it on Windows - YieldProcessor(). Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5489 | vasil | 2009-07-10 05:02:22 -0400 (Fri, 10 Jul 2009) | 9 lines branches/zip: Change the defaults for innodb_sync_spin_loops: 20 -> 30 innodb_spin_wait_delay: 5 -> 6 This change was proposed by Sun/MySQL based on their performance testing, see https://svn.innodb.com/innobase/Release_tasks_for_InnoDB_Plugin_V1.0.4 ------------------------------------------------------------------------ r5490 | vasil | 2009-07-10 05:04:20 -0400 (Fri, 10 Jul 2009) | 4 lines branches/zip: Add ChangeLog entry for 5489. ------------------------------------------------------------------------ r5491 | calvin | 2009-07-10 12:19:17 -0400 (Fri, 10 Jul 2009) | 6 lines branches/zip: add copyright info to files related to PAUSE instruction patch, contributed by Sun Microsystems. ------------------------------------------------------------------------ r5492 | calvin | 2009-07-10 17:47:34 -0400 (Fri, 10 Jul 2009) | 5 lines branches/zip: add ChangeLog entries for r5484-r5486. ------------------------------------------------------------------------ r5494 | vasil | 2009-07-13 03:37:35 -0400 (Mon, 13 Jul 2009) | 6 lines branches/zip: Restore the original value of innodb_sync_spin_loops at the end, previously the test assumed that setting it to 20 will do this, but now the default is 30 and MTR's internal check failed. ------------------------------------------------------------------------ r5495 | inaam | 2009-07-13 11:48:45 -0400 (Mon, 13 Jul 2009) | 5 lines branches/zip rb://138 (REVERT) Revert the flush neighbors patch as it shows regression in the benchmarks run by Michael. ------------------------------------------------------------------------ r5496 | inaam | 2009-07-13 14:04:57 -0400 (Mon, 13 Jul 2009) | 4 lines branches/zip Fixed warnings on windows where ulint != ib_uint64_t ------------------------------------------------------------------------ r5497 | calvin | 2009-07-13 15:01:00 -0400 (Mon, 13 Jul 2009) | 9 lines branches/zip: fix run-time symbols clash on Solaris. This patch is from Sergey Vojtovich of Sun Microsystems, to fix run-time symbols clash on Solaris with older C++ compiler: - when finding out a way to hide symbols, make decision basing on compiler, not operating system. - Sun Studio supports __hidden declaration specifier for this purpose. ------------------------------------------------------------------------ r5498 | vasil | 2009-07-14 03:16:18 -0400 (Tue, 14 Jul 2009) | 92 lines branches/zip: Merge r5341:5497 from branches/5.1, skipping: c5419 because it is merge from branches/zip into branches/5.1 c5466 because the source code has been adjusted to match the MySQL behavior and the innodb-autoinc test does not fail in branches/zip, if c5466 is merged, then innodb-autoinc starts failing, Sunny suggested not to merge c5466. and resolving conflicts in c5410, c5440, c5488: ------------------------------------------------------------------------ r5410 | marko | 2009-06-24 22:26:34 +0300 (Wed, 24 Jun 2009) | 2 lines Changed paths: M /branches/5.1/include/trx0sys.ic M /branches/5.1/trx/trx0purge.c M /branches/5.1/trx/trx0sys.c M /branches/5.1/trx/trx0undo.c branches/5.1: Add missing #include "mtr0log.h" to avoid warnings when compiling with -DUNIV_MUST_NOT_INLINE. ------------------------------------------------------------------------ r5419 | marko | 2009-06-25 16:11:57 +0300 (Thu, 25 Jun 2009) | 18 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug42101-nonzero.result M /branches/5.1/mysql-test/innodb_bug42101-nonzero.test M /branches/5.1/mysql-test/innodb_bug42101.result M /branches/5.1/mysql-test/innodb_bug42101.test branches/5.1: Merge r5418 from branches/zip: ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 15:55:52 +0300 (Thu, 25 Jun 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5440 | vasil | 2009-06-30 13:04:29 +0300 (Tue, 30 Jun 2009) | 8 lines Changed paths: M /branches/5.1/fil/fil0fil.c branches/5.1: Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to match documentation by changing the URL from http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html to http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting-datadict.html ------------------------------------------------------------------------ r5466 | vasil | 2009-07-02 10:46:45 +0300 (Thu, 02 Jul 2009) | 6 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Adjust the failing innodb-autoinc test to conform to the latest behavior of the MySQL code. The idea and the comment in innodb-autoinc.test come from Sunny. ------------------------------------------------------------------------ r5488 | vasil | 2009-07-09 19:16:44 +0300 (Thu, 09 Jul 2009) | 13 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug21704.result A /branches/5.1/mysql-test/innodb_bug21704.test branches/5.1: Fix Bug#21704 Renaming column does not update FK definition by checking whether a column that participates in a FK definition is being renamed and denying the ALTER in this case. The patch was originally developed by Davi Arnaut <Davi.Arnaut@Sun.COM>: http://lists.mysql.com/commits/77714 and was later adjusted to conform to InnoDB coding style by me (Vasil), I also added some more comments and moved the bug specific mysql-test to a separate file to make it more manageable and flexible. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5499 | calvin | 2009-07-14 12:55:10 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: add a missing file in Makefile.am This change was suggested by MySQL. ------------------------------------------------------------------------ r5500 | calvin | 2009-07-14 13:03:26 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: minor change Remove an extra "with". ------------------------------------------------------------------------ r5501 | vasil | 2009-07-14 13:58:15 -0400 (Tue, 14 Jul 2009) | 5 lines branches/zip: Add @ZLIB_INCLUDES@ so that the InnoDB Plugin picks up the same zlib.h header file that is eventually used by mysqld. ------------------------------------------------------------------------ r5502 | vasil | 2009-07-14 13:59:59 -0400 (Tue, 14 Jul 2009) | 4 lines branches/zip: Add include/ut0auxconf.h to noinst_HEADERS ------------------------------------------------------------------------ r5503 | vasil | 2009-07-14 14:16:11 -0400 (Tue, 14 Jul 2009) | 8 lines branches/zip: Non-functional change: put files in noinst_HEADERS and libinnobase_a_SOURCES one per line and sort alphabetically, so it is easier to find if a file is there or not and also diffs show exactly the added or removed file instead of surrounding lines too. ------------------------------------------------------------------------ r5504 | calvin | 2009-07-15 04:58:44 -0400 (Wed, 15 Jul 2009) | 6 lines branches/zip: fix compile errors on Win64 Both srv_read_ahead_factor and srv_io_capacity should be defined as ulong. Approved by: Sunny ------------------------------------------------------------------------ r5508 | calvin | 2009-07-16 09:40:47 -0400 (Thu, 16 Jul 2009) | 16 lines branches/zip: Support inlining of functions and prefetch with Sun Studio Those changes are contributed by Sun/MySQL. Two sets of changes in this patch when Sun Studio is used: - Explicit inlining of functions - Prefetch Support This patch has been tested by Sunny with the plugin statically built in. Since we've never built the plugin as a dynamically loaded module on Solaris, it is a separate task to change plug.in. rb://142 Approved by: Heikki ------------------------------------------------------------------------ r5509 | calvin | 2009-07-16 09:45:28 -0400 (Thu, 16 Jul 2009) | 2 lines branches/zip: add ChangeLog entry for r5508. ------------------------------------------------------------------------ r5512 | sunny | 2009-07-19 19:52:48 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Remove unused extern ref to timed_mutexes. ------------------------------------------------------------------------ r5513 | sunny | 2009-07-19 19:58:43 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Undo r5512 ------------------------------------------------------------------------ r5514 | sunny | 2009-07-19 20:08:49 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Only use my_bool when UNIV_HOTBACKUP is not defined. ------------------------------------------------------------------------ r5515 | sunny | 2009-07-20 03:29:14 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: The dict_table_t::autoinc_mutex field is not used in HotBackup. ------------------------------------------------------------------------ r5516 | sunny | 2009-07-20 03:46:05 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Make this file usable from within HotBackup. A new file has been introduced called hb_univ.i. This file should have all the HotBackup specific configuration. ------------------------------------------------------------------------ r5517 | sunny | 2009-07-20 03:55:11 -0400 (Mon, 20 Jul 2009) | 2 lines Add /* UNIV_HOTBACK */ ------------------------------------------------------------------------ r5519 | vasil | 2009-07-20 04:45:18 -0400 (Mon, 20 Jul 2009) | 31 lines branches/zip: Merge r5497:5518 from branches/5.1: ------------------------------------------------------------------------ r5518 | vasil | 2009-07-20 11:29:47 +0300 (Mon, 20 Jul 2009) | 22 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2874.2.1 committer: Anurag Shekhar <anurag.shekhar@sun.com> branch nick: mysql-5.1-bugteam-windows-warning timestamp: Wed 2009-05-13 15:41:24 +0530 message: Bug #39802 On Windows, 32-bit time_t should be enforced This patch fixes compilation warning, "conversion from 'time_t' to 'ulong', possible loss of data". The fix is to typecast time_t to ulong before assigning it to ulong. Backported this from 6.0-bugteam tree. modified: storage/archive/ha_archive.cc storage/federated/ha_federated.cc storage/innobase/handler/ha_innodb.cc storage/myisam/ha_myisam.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5520 | vasil | 2009-07-20 04:51:47 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Add ChangeLog entries for r5498 and r5519. ------------------------------------------------------------------------ r5524 | inaam | 2009-07-20 12:23:15 -0400 (Mon, 20 Jul 2009) | 9 lines branches/zip Change the read ahead parameter name to innodb_read_ahead_threshold. Change the meaning of this parameter to signify the number of pages that must be sequentially accessed for InnoDB to trigger a readahead request. Suggested by: Ken ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 4150:4528 from branches/zip: ------------------------------------------------------------------------ r4152 | marko | 2009-02-10 12:52:27 +0200 (Tue, 10 Feb 2009) | 12 lines branches/zip: When innodb_use_sys_malloc is set, ignore innodb_additional_mem_pool_size, because nothing will be allocated from mem_comm_pool. mem_pool_create(): Remove the assertion about size. The function will work with any size. However, an assertion would fail in ut_malloc_low() when size==0. mem_init(): When srv_use_sys_malloc is set, pass size=1 to mem_pool_create(). mem0mem.c: Add #include "srv0srv.h" that is needed by mem0dbg.c. ------------------------------------------------------------------------ r4153 | vasil | 2009-02-10 22:58:17 +0200 (Tue, 10 Feb 2009) | 14 lines branches/zip: (followup to r4145) Non-functional change: Change the os_atomic_increment() and os_compare_and_swap() functions to macros to avoid artificial limitations on the types of those functions' arguments. As a consequence typecasts from the source code can be removed. Also remove Google's copyright from os0sync.ic because that file no longer contains code from Google. Approved by: Marko (rb://88), also ok from Inaam via IM ------------------------------------------------------------------------ r4163 | marko | 2009-02-12 00:14:19 +0200 (Thu, 12 Feb 2009) | 4 lines branches/zip: Make innodb_thread_concurrency=0 the default. The old default was 8. ------------------------------------------------------------------------ r4169 | calvin | 2009-02-12 10:37:10 +0200 (Thu, 12 Feb 2009) | 3 lines branches/zip: Adjust the result file of innodb_thread_concurrency_basic test. The default value of innodb_thread_concurrency is changed to 0 (from 8) via r4163. ------------------------------------------------------------------------ r4174 | vasil | 2009-02-12 17:38:27 +0200 (Thu, 12 Feb 2009) | 4 lines branches/zip: Fix pathname of the file to patch. ------------------------------------------------------------------------ r4176 | vasil | 2009-02-13 10:06:31 +0200 (Fri, 13 Feb 2009) | 7 lines branches/zip: Fix the failing mysql-test partition_innodb, which failed only if run after innodb_trx_weight (or other test that would leave LATEST DEADLOCK ERROR into the output of SHOW ENGINE INNODB STATUS). Find further explanation for the failure at the top of the added patch partition_innodb.diff. ------------------------------------------------------------------------ r4198 | vasil | 2009-02-17 09:06:07 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: Add the full text of the GPLv2 license into the root directory of the plugin. In previous releases this file was copied from an external source (https://svn.innodb.com/svn/plugin/trunk/support/COPYING) "manually" when creating the source and binary archives. It is less confusing to have this present in the root directory of the SVN branch. ------------------------------------------------------------------------ r4199 | vasil | 2009-02-17 09:11:58 +0200 (Tue, 17 Feb 2009) | 4 lines branches/zip: Add Google's license into COPYING.Google. ------------------------------------------------------------------------ r4200 | vasil | 2009-02-17 09:56:33 +0200 (Tue, 17 Feb 2009) | 11 lines branches/zip: To the files touched by the Google patch from c4144 (excluding include/os0sync.ic because later we removed Google code from that file): * Remove the Google license * Remove old Innobase copyright lines * Add a reference to the Google license and to the GPLv2 license at the top, as recommended by the lawyers at Oracle Legal. ------------------------------------------------------------------------ r4201 | vasil | 2009-02-17 10:12:02 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 1/28] ------------------------------------------------------------------------ r4202 | vasil | 2009-02-17 10:15:06 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 2/28] ------------------------------------------------------------------------ r4203 | vasil | 2009-02-17 10:25:45 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 3/28] ------------------------------------------------------------------------ r4204 | vasil | 2009-02-17 10:55:41 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 4/28] ------------------------------------------------------------------------ r4205 | vasil | 2009-02-17 10:59:22 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 5/28] ------------------------------------------------------------------------ r4206 | vasil | 2009-02-17 11:02:27 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 6/28] ------------------------------------------------------------------------ r4207 | vasil | 2009-02-17 11:04:28 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 7/28] ------------------------------------------------------------------------ r4208 | vasil | 2009-02-17 11:06:49 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 8/28] ------------------------------------------------------------------------ r4209 | vasil | 2009-02-17 11:10:18 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 9/28] ------------------------------------------------------------------------ r4210 | vasil | 2009-02-17 11:12:41 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 10/28] ------------------------------------------------------------------------ r4211 | vasil | 2009-02-17 11:14:40 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 11/28] ------------------------------------------------------------------------ r4212 | vasil | 2009-02-17 11:18:35 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 12/28] ------------------------------------------------------------------------ r4213 | vasil | 2009-02-17 11:24:40 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 13/28] ------------------------------------------------------------------------ r4214 | vasil | 2009-02-17 11:27:31 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 13/28] ------------------------------------------------------------------------ r4215 | vasil | 2009-02-17 11:29:55 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 15/28] ------------------------------------------------------------------------ r4216 | vasil | 2009-02-17 11:33:38 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 16/28] ------------------------------------------------------------------------ r4217 | vasil | 2009-02-17 11:36:44 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 17/28] ------------------------------------------------------------------------ r4218 | vasil | 2009-02-17 11:39:11 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 18/28] ------------------------------------------------------------------------ r4219 | vasil | 2009-02-17 11:41:24 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 19/28] ------------------------------------------------------------------------ r4220 | vasil | 2009-02-17 11:43:50 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 20/28] ------------------------------------------------------------------------ r4221 | vasil | 2009-02-17 11:46:52 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 21/28] ------------------------------------------------------------------------ r4222 | vasil | 2009-02-17 11:50:12 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 22/28] ------------------------------------------------------------------------ r4223 | vasil | 2009-02-17 11:53:58 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 23/28] ------------------------------------------------------------------------ r4224 | vasil | 2009-02-17 12:01:41 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 24/28] ------------------------------------------------------------------------ r4225 | vasil | 2009-02-17 12:05:45 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 25/28] ------------------------------------------------------------------------ r4226 | vasil | 2009-02-17 12:09:16 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 26/28] ------------------------------------------------------------------------ r4227 | vasil | 2009-02-17 12:12:56 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 27/28] ------------------------------------------------------------------------ r4228 | vasil | 2009-02-17 12:14:04 +0200 (Tue, 17 Feb 2009) | 8 lines branches/zip: * Remove old Innobase copyright lines from C source files * Add a reference to the GPLv2 license as recommended by the lawyers at Oracle Legal [Step 28/28] ------------------------------------------------------------------------ r4229 | vasil | 2009-02-17 12:30:55 +0200 (Tue, 17 Feb 2009) | 4 lines branches/zip: Add the copyright notice to the non C files. ------------------------------------------------------------------------ r4231 | marko | 2009-02-17 14:26:53 +0200 (Tue, 17 Feb 2009) | 12 lines Minor cleanup of the Google SMP patch. sync_array_object_signalled(): Add a (void) cast to eliminate a gcc warning about the return value of os_atomic_increment() being ignored. rw_lock_create_func(): Properly indent the preprocessor directives. rw_lock_x_lock_low(), rw_lock_x_lock_func_nowait(): Split lines correctly. rw_lock_set_writer_id_and_recursion_flag(): Silence a Valgrind warning. Do not mix statements and variable declarations. ------------------------------------------------------------------------ r4232 | marko | 2009-02-17 14:59:54 +0200 (Tue, 17 Feb 2009) | 3 lines branches/zip: When assigning lock->recursive = FALSE, also flag lock->writer_thread invalid, so that Valgrind will catch more errors. This is related to Issue #175. ------------------------------------------------------------------------ r4242 | marko | 2009-02-18 17:01:09 +0200 (Wed, 18 Feb 2009) | 2 lines branches/zip: UT_DBG_STOP: Use do{} while(0) to silence a g++-4.3.2 warning about a while(0); statement. This should fix (part of) Issue #176. ------------------------------------------------------------------------ r4243 | marko | 2009-02-18 17:04:03 +0200 (Wed, 18 Feb 2009) | 3 lines branches/zip: buf_buddy_get_slot(): Fix a gcc 4.3.2 warning about an empty body of a "for" statement. This fixes part of Issue #176. ------------------------------------------------------------------------ r4244 | marko | 2009-02-18 17:25:45 +0200 (Wed, 18 Feb 2009) | 11 lines branches/zip: Protect ut_total_allocated_memory with ut_list_mutex. Unprotected updates to ut_total_allocated_memory in os_mem_alloc_large() and os_mem_free_large(), called during fast index creation, may corrupt the variable and cause assertion failures. Also, add UNIV_MEM_ALLOC() and UNIV_MEM_FREE() instrumentation around os_mem_alloc_large() and os_mem_free_large(), so that Valgrind can detect more errors. rb://90 approved by Heikki Tuuri. This addresses Issue #177. ------------------------------------------------------------------------ r4248 | marko | 2009-02-19 11:52:39 +0200 (Thu, 19 Feb 2009) | 2 lines branches/zip: page_zip_set_size(): Fix a g++ 4.3.2 warning about an empty body in a "for" statement. This closes Issue #176. ------------------------------------------------------------------------ r4251 | inaam | 2009-02-19 15:46:27 +0200 (Thu, 19 Feb 2009) | 8 lines branches/zip: Issue #178 rb://91 Change plug.in to have same CXXFLAGS as CFLAGS. This is to ensure that both .c and .cc files get compiled with same flags. To fix the issue where UNIV_LINUX was defined only in .c files. Approved by: Marko ------------------------------------------------------------------------ r4258 | vasil | 2009-02-20 11:52:19 +0200 (Fri, 20 Feb 2009) | 7 lines branches/zip: Cleanup in ChangeLog: * Wrap lines at 78 characters * Changed files are listed alphabetically * White-space cleanup ------------------------------------------------------------------------ r4259 | vasil | 2009-02-20 11:59:42 +0200 (Fri, 20 Feb 2009) | 6 lines branches/zip: ChangeLog: Remove include/os0sync.ic from the entry about the google patch, this file was modified later to not include Google's code. ------------------------------------------------------------------------ r4262 | vasil | 2009-02-20 14:56:59 +0200 (Fri, 20 Feb 2009) | 373 lines branches/zip: Merge revisions 4035:4261 from branches/5.1: ------------------------------------------------------------------------ r4065 | sunny | 2009-01-29 16:01:36 +0200 (Thu, 29 Jan 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: In the last round of AUTOINC cleanup we assumed that AUTOINC is only defined for integer columns. This caused an assertion failure when we checked for the maximum value of a column type. We now calculate the max value for floating-point autoinc columns too. Fix Bug#42400 - InnoDB autoinc code can't handle floating-point columns rb://84 and Mantis issue://162 ------------------------------------------------------------------------ r4111 | sunny | 2009-02-03 22:06:52 +0200 (Tue, 03 Feb 2009) | 2 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Add the ULL suffix otherwise there is an overflow. ------------------------------------------------------------------------ r4128 | vasil | 2009-02-08 21:36:45 +0200 (Sun, 08 Feb 2009) | 18 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2709.20.31 committer: Timothy Smith <timothy.smith@sun.com> branch nick: 51 timestamp: Fri 2008-12-19 01:28:51 +0100 message: Disable part of innodb-autoinc.test, because the MySQL server asserts when compiled --with-debug, due to bug 39828, "autoinc wraps around when offset and increment > 1". This change should be reverted when that bug is fixed (and a a few other minor changes to the test as described in comments). modified: mysql-test/r/innodb-autoinc.result mysql-test/t/innodb-autoinc.test ------------------------------------------------------------------------ r4129 | vasil | 2009-02-08 21:54:25 +0200 (Sun, 08 Feb 2009) | 310 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Merge a change from MySQL: [looks like the changes to innodb-autoinc.test were made as part of the following huge merge, but we are merging only changes to that file] ------------------------------------------------------------ revno: 2546.47.1 committer: Luis Soares <luis.soares@sun.com> branch nick: 5.1-rpl timestamp: Fri 2009-01-23 13:22:05 +0100 message: merge: 5.1 -> 5.1-rpl conflicts: Text conflict in client/mysqltest.cc Text conflict in mysql-test/include/wait_until_connected_again.inc Text conflict in mysql-test/lib/mtr_report.pm Text conflict in mysql-test/mysql-test-run.pl Text conflict in mysql-test/r/events_bugs.result Text conflict in mysql-test/r/log_state.result Text conflict in mysql-test/r/myisam_data_pointer_size_func.result Text conflict in mysql-test/r/mysqlcheck.result Text conflict in mysql-test/r/query_cache.result Text conflict in mysql-test/r/status.result Text conflict in mysql-test/suite/binlog/r/binlog_index.result Text conflict in mysql-test/suite/binlog/r/binlog_innodb.result Text conflict in mysql-test/suite/rpl/r/rpl_packet.result Text conflict in mysql-test/suite/rpl/t/rpl_packet.test Text conflict in mysql-test/t/disabled.def Text conflict in mysql-test/t/events_bugs.test Text conflict in mysql-test/t/log_state.test Text conflict in mysql-test/t/myisam_data_pointer_size_func.test Text conflict in mysql-test/t/mysqlcheck.test Text conflict in mysql-test/t/query_cache.test Text conflict in mysql-test/t/rpl_init_slave_func.test Text conflict in mysql-test/t/status.test removed: mysql-test/suite/parts/r/partition_bit_ndb.result mysql-test/suite/parts/t/partition_bit_ndb.test mysql-test/suite/parts/t/partition_sessions.test mysql-test/suite/sys_vars/inc/tmp_table_size_basic.inc mysql-test/suite/sys_vars/r/tmp_table_size_basic_32.result mysql-test/suite/sys_vars/r/tmp_table_size_basic_64.result mysql-test/suite/sys_vars/t/tmp_table_size_basic_32.test mysql-test/suite/sys_vars/t/tmp_table_size_basic_64.test mysql-test/t/log_bin_trust_function_creators_func-master.opt mysql-test/t/rpl_init_slave_func-slave.opt added: mysql-test/include/check_events_off.inc mysql-test/include/cleanup_fake_relay_log.inc mysql-test/include/have_simple_parser.inc mysql-test/include/no_running_event_scheduler.inc mysql-test/include/no_running_events.inc mysql-test/include/running_event_scheduler.inc mysql-test/include/setup_fake_relay_log.inc mysql-test/include/wait_condition_sp.inc mysql-test/r/fulltext_plugin.result mysql-test/r/have_simple_parser.require mysql-test/r/innodb_bug38231.result mysql-test/r/innodb_bug39438.result mysql-test/r/innodb_mysql_rbk.result mysql-test/r/partition_innodb_semi_consistent.result mysql-test/r/query_cache_28249.result mysql-test/r/status2.result mysql-test/std_data/bug40482-bin.000001 mysql-test/suite/binlog/r/binlog_innodb_row.result mysql-test/suite/binlog/t/binlog_innodb_row.test mysql-test/suite/rpl/r/rpl_binlog_corruption.result mysql-test/suite/rpl/t/rpl_binlog_corruption-master.opt mysql-test/suite/rpl/t/rpl_binlog_corruption.test mysql-test/suite/sys_vars/r/tmp_table_size_basic.result mysql-test/suite/sys_vars/t/tmp_table_size_basic.test mysql-test/t/fulltext_plugin-master.opt mysql-test/t/fulltext_plugin.test mysql-test/t/innodb_bug38231.test mysql-test/t/innodb_bug39438-master.opt mysql-test/t/innodb_bug39438.test mysql-test/t/innodb_mysql_rbk-master.opt mysql-test/t/innodb_mysql_rbk.test mysql-test/t/partition_innodb_semi_consistent-master.opt mysql-test/t/partition_innodb_semi_consistent.test mysql-test/t/query_cache_28249.test mysql-test/t/status2.test renamed: mysql-test/suite/funcs_1/r/is_collation_character_set_applicability.result => mysql-test/suite/funcs_1/r/is_coll_char_set_appl.result mysql-test/suite/funcs_1/t/is_collation_character_set_applicability.test => mysql-test/suite/funcs_1/t/is_coll_char_set_appl.test modified: .bzr-mysql/default.conf CMakeLists.txt client/mysql.cc client/mysql_upgrade.c client/mysqlcheck.c client/mysqltest.cc configure.in extra/resolve_stack_dump.c extra/yassl/include/openssl/ssl.h include/config-win.h include/m_ctype.h include/my_global.h mysql-test/extra/binlog_tests/database.test mysql-test/extra/rpl_tests/rpl_auto_increment.test mysql-test/include/commit.inc mysql-test/include/have_32bit.inc mysql-test/include/have_64bit.inc mysql-test/include/index_merge1.inc mysql-test/include/linux_sys_vars.inc mysql-test/include/windows_sys_vars.inc mysql-test/lib/mtr_report.pm mysql-test/mysql-test-run.pl mysql-test/r/alter_table.result mysql-test/r/commit_1innodb.result mysql-test/r/create.result mysql-test/r/csv.result mysql-test/r/ctype_ucs.result mysql-test/r/date_formats.result mysql-test/r/events_bugs.result mysql-test/r/events_scheduling.result mysql-test/r/fulltext.result mysql-test/r/func_if.result mysql-test/r/func_in.result mysql-test/r/func_str.result mysql-test/r/func_time.result mysql-test/r/grant.result mysql-test/r/index_merge_myisam.result mysql-test/r/information_schema.result mysql-test/r/innodb-autoinc.result mysql-test/r/innodb.result mysql-test/r/innodb_mysql.result mysql-test/r/log_bin_trust_function_creators_func.result mysql-test/r/log_state.result mysql-test/r/myisampack.result mysql-test/r/mysql.result mysql-test/r/mysqlcheck.result mysql-test/r/partition_datatype.result mysql-test/r/partition_mgm.result mysql-test/r/partition_pruning.result mysql-test/r/query_cache.result mysql-test/r/read_buffer_size_basic.result mysql-test/r/read_rnd_buffer_size_basic.result mysql-test/r/rpl_init_slave_func.result mysql-test/r/select.result mysql-test/r/status.result mysql-test/r/strict.result mysql-test/r/temp_table.result mysql-test/r/type_bit.result mysql-test/r/type_date.result mysql-test/r/type_float.result mysql-test/r/warnings_engine_disabled.result mysql-test/r/xml.result mysql-test/suite/binlog/r/binlog_database.result mysql-test/suite/binlog/r/binlog_index.result mysql-test/suite/binlog/r/binlog_innodb.result mysql-test/suite/binlog/r/binlog_row_mix_innodb_myisam.result mysql-test/suite/binlog/t/binlog_innodb.test mysql-test/suite/funcs_1/r/is_columns_is.result mysql-test/suite/funcs_1/r/is_engines.result mysql-test/suite/funcs_1/r/storedproc.result mysql-test/suite/funcs_1/storedproc/param_check.inc mysql-test/suite/funcs_2/t/disabled.def mysql-test/suite/ndb/t/disabled.def mysql-test/suite/parts/r/partition_bit_innodb.result mysql-test/suite/parts/r/partition_bit_myisam.result mysql-test/suite/parts/r/partition_special_innodb.result mysql-test/suite/parts/t/disabled.def mysql-test/suite/parts/t/partition_special_innodb.test mysql-test/suite/parts/t/partition_value_innodb.test mysql-test/suite/parts/t/partition_value_myisam.test mysql-test/suite/parts/t/partition_value_ndb.test mysql-test/suite/rpl/r/rpl_auto_increment.result mysql-test/suite/rpl/r/rpl_packet.result mysql-test/suite/rpl/r/rpl_row_create_table.result mysql-test/suite/rpl/r/rpl_slave_skip.result mysql-test/suite/rpl/r/rpl_trigger.result mysql-test/suite/rpl/t/disabled.def mysql-test/suite/rpl/t/rpl_packet.test mysql-test/suite/rpl/t/rpl_row_create_table.test mysql-test/suite/rpl/t/rpl_slave_skip.test mysql-test/suite/rpl/t/rpl_trigger.test mysql-test/suite/rpl_ndb/t/disabled.def mysql-test/suite/sys_vars/inc/key_buffer_size_basic.inc mysql-test/suite/sys_vars/inc/sort_buffer_size_basic.inc mysql-test/suite/sys_vars/r/key_buffer_size_basic_32.result mysql-test/suite/sys_vars/r/key_buffer_size_basic_64.result mysql-test/suite/sys_vars/r/sort_buffer_size_basic_32.result mysql-test/suite/sys_vars/r/sort_buffer_size_basic_64.result mysql-test/t/alter_table.test mysql-test/t/create.test mysql-test/t/csv.test mysql-test/t/ctype_ucs.test mysql-test/t/date_formats.test mysql-test/t/disabled.def mysql-test/t/events_bugs.test mysql-test/t/events_scheduling.test mysql-test/t/fulltext.test mysql-test/t/func_if.test mysql-test/t/func_in.test mysql-test/t/func_str.test mysql-test/t/func_time.test mysql-test/t/grant.test mysql-test/t/information_schema.test mysql-test/t/innodb-autoinc.test mysql-test/t/innodb.test mysql-test/t/innodb_mysql.test mysql-test/t/log_bin_trust_function_creators_func.test mysql-test/t/log_state.test mysql-test/t/myisam_data_pointer_size_func.test mysql-test/t/myisampack.test mysql-test/t/mysql.test mysql-test/t/mysqlcheck.test mysql-test/t/partition_innodb_stmt.test mysql-test/t/partition_mgm.test mysql-test/t/partition_pruning.test mysql-test/t/query_cache.test mysql-test/t/rpl_init_slave_func.test mysql-test/t/select.test mysql-test/t/status.test mysql-test/t/strict.test mysql-test/t/temp_table.test mysql-test/t/type_bit.test mysql-test/t/type_date.test mysql-test/t/type_float.test mysql-test/t/warnings_engine_disabled.test mysql-test/t/xml.test mysys/my_getopt.c mysys/my_init.c scripts/mysql_install_db.sh sql-common/my_time.c sql/field.cc sql/field.h sql/filesort.cc sql/ha_partition.cc sql/ha_partition.h sql/item.cc sql/item_cmpfunc.cc sql/item_func.h sql/item_strfunc.cc sql/item_sum.cc sql/item_timefunc.cc sql/item_timefunc.h sql/log.cc sql/log.h sql/log_event.cc sql/log_event.h sql/mysql_priv.h sql/mysqld.cc sql/opt_range.cc sql/partition_info.cc sql/repl_failsafe.cc sql/rpl_constants.h sql/set_var.cc sql/slave.cc sql/spatial.h sql/sql_acl.cc sql/sql_base.cc sql/sql_binlog.cc sql/sql_class.h sql/sql_cursor.cc sql/sql_delete.cc sql/sql_lex.cc sql/sql_lex.h sql/sql_locale.cc sql/sql_parse.cc sql/sql_partition.cc sql/sql_plugin.cc sql/sql_plugin.h sql/sql_profile.cc sql/sql_repl.cc sql/sql_select.cc sql/sql_select.h sql/sql_show.cc sql/sql_table.cc sql/sql_trigger.cc sql/sql_trigger.h sql/table.cc sql/table.h sql/unireg.cc storage/csv/ha_tina.cc storage/federated/ha_federated.cc storage/heap/ha_heap.cc storage/innobase/Makefile.am storage/innobase/btr/btr0sea.c storage/innobase/buf/buf0lru.c storage/innobase/dict/dict0dict.c storage/innobase/dict/dict0mem.c storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innobase/include/btr0sea.h storage/innobase/include/dict0dict.h storage/innobase/include/dict0mem.h storage/innobase/include/ha_prototypes.h storage/innobase/include/lock0lock.h storage/innobase/include/row0mysql.h storage/innobase/include/sync0sync.ic storage/innobase/include/ut0ut.h storage/innobase/lock/lock0lock.c storage/innobase/os/os0file.c storage/innobase/plug.in storage/innobase/row/row0mysql.c storage/innobase/row/row0sel.c storage/innobase/srv/srv0srv.c storage/innobase/srv/srv0start.c storage/innobase/ut/ut0ut.c storage/myisam/ft_boolean_search.c strings/ctype.c strings/xml.c tests/mysql_client_test.c win/configure.js mysql-test/suite/funcs_1/t/is_coll_char_set_appl.test ------------------------------------------------------------------------ r4165 | calvin | 2009-02-12 01:34:27 +0200 (Thu, 12 Feb 2009) | 1 line Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: minor non-functional changes. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r4263 | vasil | 2009-02-20 15:00:46 +0200 (Fri, 20 Feb 2009) | 4 lines branches/zip: Add a ChangeLog entry for a change in r4262. ------------------------------------------------------------------------ r4265 | marko | 2009-02-20 22:31:03 +0200 (Fri, 20 Feb 2009) | 5 lines branches/zip: Make innodb_use_sys_malloc=ON the default. Replace srv_use_sys_malloc with UNIV_LIKELY(srv_use_sys_malloc) to improve branch prediction in the default case. Approved by Ken over the IM. ------------------------------------------------------------------------ r4266 | vasil | 2009-02-20 23:29:32 +0200 (Fri, 20 Feb 2009) | 7 lines branches/zip: Add a sentence at the top of COPYING.Google to clarify that this license does not apply to the whole InnoDB. Suggested by: Ken ------------------------------------------------------------------------ r4268 | marko | 2009-02-23 12:43:51 +0200 (Mon, 23 Feb 2009) | 9 lines branches/zip: Initialize ut_list_mutex at startup. Without this fix, ut_list_mutex would be used uninitialized when innodb_use_sys_malloc=1. This fix addresses Issue #181. ut_mem_block_list_init(): Rename to ut_mem_init() and make public. ut_malloc_low(), ut_free_all_mem(): Add ut_a(ut_mem_block_list_inited). mem_init(): Call ut_mem_init(). ------------------------------------------------------------------------ r4269 | marko | 2009-02-23 15:09:49 +0200 (Mon, 23 Feb 2009) | 7 lines branches/zip: When freeing an uncompressed BLOB page, tolerate garbage in FIL_PAGE_TYPE. (Bug #43043, Issue #182) btr_check_blob_fil_page_type(): New function. btr_free_externally_stored_field(), btr_copy_blob_prefix(): Call btr_check_blob_fil_page_type() to check FIL_PAGE_TYPE. ------------------------------------------------------------------------ r4272 | marko | 2009-02-23 23:10:18 +0200 (Mon, 23 Feb 2009) | 8 lines branches/zip: Adjust the fix of Issue #182 in r4269 per Inaam's suggestion. btr_check_blob_fil_page_type(): Replace the parameter const char* op with ibool read. Do not print anything about page type mismatch when reading a BLOB page in Antelope format. Print space id before page number. ------------------------------------------------------------------------ r4273 | marko | 2009-02-24 00:11:11 +0200 (Tue, 24 Feb 2009) | 1 line branches/zip: ut_mem_init(): Add the assertion !ut_mem_block_list_inited. ------------------------------------------------------------------------ r4274 | marko | 2009-02-24 00:14:38 +0200 (Tue, 24 Feb 2009) | 12 lines branches/zip: Fix bugs in the fix of Issue #181. Tested inside and outside Valgrind, with innodb_use_sys_malloc set to 0 and 1. mem_init(): Invoke ut_mem_init() before mem_pool_create(), because the latter one will invoke ut_malloc(). srv_general_init(): Do not initialize the memory subsystem (mem_init()). innobase_init(): Initialize the memory subsystem (mem_init()) before calling srv_parse_data_file_paths_and_sizes(), which needs ut_malloc(). Call ut_free_all_mem() in error handling to clean up after the mem_init(). ------------------------------------------------------------------------ r4280 | marko | 2009-02-24 15:14:59 +0200 (Tue, 24 Feb 2009) | 1 line branches/zip: Remove unused function os_mem_alloc_nocache(). ------------------------------------------------------------------------ r4281 | marko | 2009-02-24 16:02:48 +0200 (Tue, 24 Feb 2009) | 1 line branches/zip: Remove the unused function dict_index_get_type(). ------------------------------------------------------------------------ r4283 | marko | 2009-02-24 23:06:56 +0200 (Tue, 24 Feb 2009) | 1 line branches/zip: srv0start.c: Remove unnecessary #include "mem0pool.h". ------------------------------------------------------------------------ r4284 | marko | 2009-02-24 23:26:38 +0200 (Tue, 24 Feb 2009) | 1 line branches/zip: mem0mem.c: Remove unnecessary #include "mach0data.h". ------------------------------------------------------------------------ r4288 | vasil | 2009-02-25 10:48:07 +0200 (Wed, 25 Feb 2009) | 21 lines branches/zip: Merge revisions 4261:4287 from branches/5.1: ------------------------------------------------------------------------ r4287 | sunny | 2009-02-25 05:32:01 +0200 (Wed, 25 Feb 2009) | 10 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Fix Bug#42714 AUTO_INCREMENT errors in 5.1.31. There are two changes to the autoinc handling. 1. To fix the immediate problem from the bug report, we must ensure that the value written to the table is always less than the max value stored in dict_table_t. 2. The second related change is that according to MySQL documentation when the offset is greater than the increment, we should ignore the offset. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r4289 | vasil | 2009-02-25 10:53:51 +0200 (Wed, 25 Feb 2009) | 4 lines branches/zip: Add ChangeLog entry for the fix in r4288. ------------------------------------------------------------------------ r4290 | vasil | 2009-02-25 11:05:44 +0200 (Wed, 25 Feb 2009) | 11 lines branches/zip: Make ChangeLog entries for bugs in bugs.mysql.com in the form: Fix Bug#12345 bug title (for bugs after 1.0.2 was released and the ChangeLog published) There is no need to bloat the ChangeLog with information that is available via bugs.mysql.com. Discussed with: Marko ------------------------------------------------------------------------ r4291 | vasil | 2009-02-25 11:08:32 +0200 (Wed, 25 Feb 2009) | 4 lines branches/zip: Fix Bug synopsis and remove explanation ------------------------------------------------------------------------ r4292 | marko | 2009-02-25 12:09:15 +0200 (Wed, 25 Feb 2009) | 25 lines branches/zip: Correct the initialization of the memory subsystem once again, to finally put Issue #181 to rest. Revert some parts of r4274. It is best not to call ut_malloc() before srv_general_init(). mem_init(): Do not call ut_mem_init(). srv_general_init(): Initialize the memory subsystem in two phases: first ut_mem_init(), then mem_init(). This is because os_sync_init() and sync_init() depend on ut_mem_init() and mem_init() depends on os_sync_init() or sync_init(). srv_parse_data_file_paths_and_sizes(), srv_parse_log_group_home_dirs(): Remove the output parameters. Assign to the global variables directly. Allocate memory with malloc() instead of ut_malloc(), because these functions will be called before srv_general_init(). srv_free_paths_and_sizes(): New function, for cleaning up after srv_parse_data_file_paths_and_sizes() and srv_parse_log_group_home_dirs(). rb://92 approved by Sunny Bains ------------------------------------------------------------------------ r4297 | vasil | 2009-02-25 17:19:19 +0200 (Wed, 25 Feb 2009) | 4 lines branches/zip: White-space cleanup in the ChangeLog ------------------------------------------------------------------------ r4301 | vasil | 2009-02-25 21:33:32 +0200 (Wed, 25 Feb 2009) | 5 lines branches/zip: Do not output the commands that restore the environment because they depend on the state of the environment before the test starts executing. ------------------------------------------------------------------------ r4315 | vasil | 2009-02-26 09:21:20 +0200 (Thu, 26 Feb 2009) | 5 lines branches/zip: Apply any necessary patches to the mysql tree at the end of setup.sh This step was previously done manually (and sometimes forgotten). ------------------------------------------------------------------------ r4319 | marko | 2009-02-26 23:27:51 +0200 (Thu, 26 Feb 2009) | 6 lines branches/zip: btr_check_blob_fil_page_type(): Do not report FIL_PAGE_TYPE mismatch even when purging a BLOB. Heavy users may have large data files created with MySQL 5.0 or earlier, and they don not want to have the error log flooded with such messages. This fixes Issue #182. ------------------------------------------------------------------------ r4320 | inaam | 2009-02-27 02:13:19 +0200 (Fri, 27 Feb 2009) | 8 lines branches/zip This is to revert the changes made to the plug.in (r4251) as a fix for issue# 178. Changes to plug.in will not propogate to a plugin installation unless autotools are rerun which is unacceptable. A fix for issue# 178 will be committed in a separate commit. ------------------------------------------------------------------------ r4321 | inaam | 2009-02-27 02:16:46 +0200 (Fri, 27 Feb 2009) | 6 lines branches/zip This is a fix for issue#178. Instead of using UNIV_LINUX which is defined through CFLAGS we use compiler generated define __linux__ that is effective for both .c and .cc files. ------------------------------------------------------------------------ r4324 | vasil | 2009-02-27 13:27:18 +0200 (Fri, 27 Feb 2009) | 39 lines branches/zip: Add FreeBSD to the list of the operating systems that have sizeof(pthread_t) == sizeof(void*) (i.e. word size). On FreeBSD pthread_t is defined like: /usr/include/sys/_pthreadtypes.h: typedef struct pthread *pthread_t; I did the following tests (per Inaam's recommendation): a) appropriate version of GCC is available on that platform (4.1.2 or higher for atomics to be available) On FreeBSD 6.x the default compiler is 3.4.6, on FreeBSD 7.x the default one is 4.2.1. One can always install the version of choice from the ports collection. If gcc 3.x is used then HAVE_GCC_ATOMIC_BUILTINS will not be defined and thus the change I am committing will make no difference. b) find out if sizeof(pthread_t) == sizeof(long) On 32 bit both are 4 bytes, on 64 bit both are 8 bytes. c) find out the compiler generated platform define (e.g.: __aix, __sunos__ etc.) The macro is __FreeBSD__. d) patch univ.i with the appropriate platform define e) build the mysql f) ensure it is using atomic builtins (look at the err.log message at system startup. It should say we are using atomics for both mutexes and rw-locks) g) do sanity testing (keeping in view the smp changes) I ran the mysql-test suite. All tests pass. ------------------------------------------------------------------------ r4353 | vasil | 2009-03-05 09:27:29 +0200 (Thu, 05 Mar 2009) | 6 lines branches/zip: As suggested by Ken, print a message that says that the Google SMP patch (GCC atomics) is disabled if it is. Also extend the message when the patch is partially enabled to make it clear that it is partially enabled. ------------------------------------------------------------------------ r4356 | vasil | 2009-03-05 13:49:51 +0200 (Thu, 05 Mar 2009) | 4 lines branches/zip: Fix typo made in r4353. ------------------------------------------------------------------------ r4357 | vasil | 2009-03-05 16:38:59 +0200 (Thu, 05 Mar 2009) | 23 lines branches/zip: Implement a check whether pthread_t objects can be used by GCC atomic builtin functions. This check is implemented in plug.in and defines the macro HAVE_ATOMIC_PTHREAD_T. This macro is checked in univ.i and the relevant part of the code enabled (the one that uses GCC atomics against pthread_t objects). In addition to this, the same program that is compiled as part of the plug.in check is added in ut/ut0auxconf.c. In the InnoDB Plugin source archives that are shipped to the users, a generated Makefile.in is added. That Makefile.in will be modified to compile ut/ut0auxconf.c and define the macro HAVE_ATOMIC_PTHREAD_T if the compilation succeeds. I.e. Makefile.in will emulate the work that is done by plug.in. This is done in order to make the check happen and HAVE_ATOMIC_PTHREAD_T eventually defined without regenerating MySQL's ./configure from ./storage/innobase/plug.in. The point is not to ask users to install the autotools and regenerate ./configure. rb://95 Approved by: Marko ------------------------------------------------------------------------ r4360 | vasil | 2009-03-05 22:23:17 +0200 (Thu, 05 Mar 2009) | 21 lines branches/zip: Merge revisions 4287:4357 from branches/5.1: ------------------------------------------------------------------------ r4325 | sunny | 2009-03-02 02:28:52 +0200 (Mon, 02 Mar 2009) | 10 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Bug#43203: Overflow from auto incrementing causes server segv It was not a SIGSEGV but an assertion failure. The assertion was checking the invariant that *first_value passed in by MySQL doesn't contain a value that is greater than the max value for that type. The assertion has been changed to a check and if the value is greater than the max we report a generic AUTOINC failure. rb://93 Approved by Heikki ------------------------------------------------------------------------ ------------------------------------------------------------------------ r4361 | vasil | 2009-03-05 22:27:54 +0200 (Thu, 05 Mar 2009) | 30 lines branches/zip: Merge revision 4358 from branches/5.1 (resolving a conflict): ------------------------------------------------------------------------ r4358 | vasil | 2009-03-05 21:21:10 +0200 (Thu, 05 Mar 2009) | 21 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2728.19.1 committer: Alfranio Correia <alfranio.correia@sun.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-02-03 11:36:46 +0000 message: BUG#42445 Warning messages in innobase/handler/ha_innodb.cc There was a type casting problem in the storage/innobase/handler/ha_innodb.cc, (int ha_innobase::write_row(...)). Innobase uses has an internal error variable of type 'ulint' while mysql uses an 'int'. To fix the problem the function manipulates an error variable of type 'ulint' and only casts it into 'int' when needs to return the value. modified: storage/innobase/handler/ha_innodb.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r4362 | vasil | 2009-03-05 22:29:07 +0200 (Thu, 05 Mar 2009) | 23 lines branches/zip: Merge revision 4359 from branches/5.1: ------------------------------------------------------------------------ r4359 | vasil | 2009-03-05 21:42:01 +0200 (Thu, 05 Mar 2009) | 14 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2747 committer: Timothy Smith <timothy.smith@sun.com> branch nick: 51 timestamp: Fri 2009-01-16 17:49:07 +0100 message: Add another cast to ignore int/ulong difference in error types, silence warning on Win64 modified: storage/innobase/handler/ha_innodb.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r4363 | vasil | 2009-03-05 22:31:37 +0200 (Thu, 05 Mar 2009) | 4 lines branches/zip: Add ChangeLog entry for the bugfix in c4360. ------------------------------------------------------------------------ r4378 | calvin | 2009-03-09 10:10:17 +0200 (Mon, 09 Mar 2009) | 7 lines branches/zip: remove compile flag MYSQL_SERVER for dynamic plugin The dynamic plugin on Windows used to be built with MYSQL_SERVER compile flag, while it is not the case for other platforms. r3797 assumed MYSQL_SERVER was not defined for dynamic plugin, which introduced the engine crash during dropping a database. ------------------------------------------------------------------------ r4396 | marko | 2009-03-12 09:22:27 +0200 (Thu, 12 Mar 2009) | 3 lines branches/zip: btr_store_big_rec_extern_fields(): Initialize FIL_PAGE_TYPE in a separate redo log entry. This will make ibbackup --apply-log debugging easier. ------------------------------------------------------------------------ r4397 | marko | 2009-03-12 09:26:11 +0200 (Thu, 12 Mar 2009) | 3 lines branches/zip: trx_sys_create_doublewrite_buf(): As the dummy change, initialize FIL_PAGE_TYPE. This will make it easier to write the debug assertions for ibbackup --apply-log. ------------------------------------------------------------------------ r4401 | marko | 2009-03-12 10:26:40 +0200 (Thu, 12 Mar 2009) | 19 lines branches/zip: Merge revisions 4359:4400 from branches/5.1: ------------------------------------------------------------------------ r4399 | marko | 2009-03-12 09:38:05 +0200 (Thu, 12 Mar 2009) | 2 lines branches/5.1: row_sel_get_clust_rec_for_mysql(): Store the cursor position also for unlock_row(). (Bug #39320) ------------------------------------------------------------------------ r4400 | marko | 2009-03-12 10:06:44 +0200 (Thu, 12 Mar 2009) | 5 lines branches/5.1: Fix a bug in multi-table semi-consistent reads. Remember the acquired record locks per table handle (row_prebuilt_t) rather than per transaction (trx_t), so that unlock_row should successfully unlock all non-matching rows in multi-table operations. This deficiency was found while investigating Bug #39320. ------------------------------------------------------------------------ These were submitted as rb://94 and rb://96 and approved by Heikki Tuuri. ------------------------------------------------------------------------ r4455 | marko | 2009-03-16 11:43:34 +0200 (Mon, 16 Mar 2009) | 2 lines branches/zip: UT_LIST_VALIDATE(): Add the parameter ASSERTION and adjust all callers. ------------------------------------------------------------------------ r4456 | marko | 2009-03-16 12:59:25 +0200 (Mon, 16 Mar 2009) | 6 lines branches/zip: UT_LIST_VALIDATE(): Assert that the link is non-NULL before dereferencing it. In this way, ut_list_node_313 will be pointing to the last non-NULL list item at the time of the assertion failure. (gcc-4.3.2 -O3 seems to optimize the common subexpressions and make the variable NULL, though.) ------------------------------------------------------------------------ r4457 | marko | 2009-03-16 14:12:02 +0200 (Mon, 16 Mar 2009) | 2 lines branches/zip: sync_thread_add_level(): Make the assertions about level == SYNC_BUF_BLOCK more readable. ------------------------------------------------------------------------ r4461 | vasil | 2009-03-17 09:38:19 +0200 (Tue, 17 Mar 2009) | 6 lines branches/zip: Remove mysql-test/patches/bug32625.diff because that bug was fixed in the mysql repository (1 year and 4 months after sending them the simple patch!). See http://bugs.mysql.com/32625 ------------------------------------------------------------------------ r4465 | marko | 2009-03-17 12:34:19 +0200 (Tue, 17 Mar 2009) | 1 line branches/zip: buf0buddy.c: Add and adjust some debug assertions. ------------------------------------------------------------------------ r4473 | vasil | 2009-03-17 15:50:30 +0200 (Tue, 17 Mar 2009) | 5 lines branches/zip: Increment the InnoDB Plugin version from 1.0.3 to 1.0.4 now that 1.0.3 has been released. ------------------------------------------------------------------------ r4478 | vasil | 2009-03-18 11:53:53 +0200 (Wed, 18 Mar 2009) | 5 lines branches/zip: Remove mysql-test/patches/bug41893.diff because that bug has been fixed in the MySQL repository, see http://bugs.mysql.com/41893. ------------------------------------------------------------------------ r4479 | marko | 2009-03-18 12:43:54 +0200 (Wed, 18 Mar 2009) | 2 lines branches/zip: buf_LRU_block_remove_hashed_page(): Add some debug assertions. ------------------------------------------------------------------------ r4480 | marko | 2009-03-18 14:32:13 +0200 (Wed, 18 Mar 2009) | 1 line branches/zip: buf_buddy_free_low(): Correct the function comment. ------------------------------------------------------------------------ r4482 | marko | 2009-03-19 15:23:32 +0200 (Thu, 19 Mar 2009) | 12 lines branches/zip: Merge revisions 4400:4481 from branches/5.1: ------------------------------------------------------------------------ r4481 | marko | 2009-03-19 15:01:48 +0200 (Thu, 19 Mar 2009) | 6 lines branches/5.1: row_unlock_for_mysql(): Do not unlock records that were modified by the current transaction. This bug was introduced or unmasked in r4400. rb://97 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ r4490 | marko | 2009-03-20 12:33:33 +0200 (Fri, 20 Mar 2009) | 4 lines branches/zip: Non-functional change for reducing dependencies in InnoDB Hot Backup: Replace srv_sys->dummy_ind1 and srv_sys->dummy_ind2 with dict_ind_redundant and dict_ind_compact, initialized in dict_init(). ------------------------------------------------------------------------ r4491 | marko | 2009-03-20 12:45:18 +0200 (Fri, 20 Mar 2009) | 2 lines branches/zip: Add const qualifiers or in/out comments to some function parameters in log0log. ------------------------------------------------------------------------ r4492 | marko | 2009-03-20 12:52:14 +0200 (Fri, 20 Mar 2009) | 5 lines branches/zip: page_validate(): Always report the space id and the name of the index. In Hot Backup, do not invoke comparison functions, as MySQL collations will be unavailable. ------------------------------------------------------------------------ r4493 | marko | 2009-03-20 13:24:06 +0200 (Fri, 20 Mar 2009) | 1 line branches/zip: Replace fil_get_space_for_id_low() with fil_space_get_by_id(). ------------------------------------------------------------------------ r4494 | marko | 2009-03-20 13:51:35 +0200 (Fri, 20 Mar 2009) | 3 lines branches/zip: fil0fil.c: Refer to fil_system directly, not via local vars. This eliminates some "unused variable" warnings when building InnoDB Hot Backup in such a way that all mutex operations are no-ops. ------------------------------------------------------------------------ r4495 | marko | 2009-03-20 14:15:52 +0200 (Fri, 20 Mar 2009) | 1 line branches/zip: innobase_get_at_most_n_mbchars(): Declare in ha_prototypes.h. ------------------------------------------------------------------------ r4496 | marko | 2009-03-20 14:48:26 +0200 (Fri, 20 Mar 2009) | 1 line branches/zip: recv_recover_page(): Remove compile-time constant parameters. ------------------------------------------------------------------------ r4497 | marko | 2009-03-20 14:56:19 +0200 (Fri, 20 Mar 2009) | 1 line branches/zip: recv_sys_init(): Remove a compile-time constant parameter. ------------------------------------------------------------------------ r4498 | marko | 2009-03-20 15:08:05 +0200 (Fri, 20 Mar 2009) | 4 lines branches/zip: Non-functional change: Add const qualifiers. log_block_checksum_is_ok_or_old_format(), recv_sys_add_to_parsing_buf(): The log block is read-only. Make it const. ------------------------------------------------------------------------ r4499 | marko | 2009-03-20 15:10:25 +0200 (Fri, 20 Mar 2009) | 1 line branches/zip: recv_scan_log_recs(): Remove a compile-time constant parameter. ------------------------------------------------------------------------ r4500 | marko | 2009-03-20 15:47:17 +0200 (Fri, 20 Mar 2009) | 1 line branches/zip: fil_init(): Add the parameter hash_size. ------------------------------------------------------------------------ r4501 | vasil | 2009-03-20 16:50:41 +0200 (Fri, 20 Mar 2009) | 4 lines branches/zip: Add any entry about the release of 1.0.3 in the ChangeLog. ------------------------------------------------------------------------ r4515 | marko | 2009-03-23 10:49:53 +0200 (Mon, 23 Mar 2009) | 1 line branches/zip: hash_table_t: adaptive: Remove from UNIV_HOTBACKUP builds. ------------------------------------------------------------------------ r4516 | marko | 2009-03-23 10:57:16 +0200 (Mon, 23 Mar 2009) | 2 lines branches/zip: Define and use ASSERT_HASH_MUTEX_OWN. Make it a no-op in UNIV_HOTBACKUP builds. ------------------------------------------------------------------------ r4517 | marko | 2009-03-23 11:07:20 +0200 (Mon, 23 Mar 2009) | 2 lines branches/zip: Define and use PAGE_ZIP_MATCH. In UNIV_HOTBACKUP builds, assume fixed allocation. ------------------------------------------------------------------------ r4521 | marko | 2009-03-23 12:05:47 +0200 (Mon, 23 Mar 2009) | 1 line branches/zip: buf_page_print(): Clean up the code #ifdef UNIV_HOTBACKUP. ------------------------------------------------------------------------ r4522 | marko | 2009-03-23 12:20:50 +0200 (Mon, 23 Mar 2009) | 2 lines branches/zip: Exclude some operating system interface code from UNIV_HOTBACKUP builds. ------------------------------------------------------------------------ r4523 | marko | 2009-03-23 13:00:43 +0200 (Mon, 23 Mar 2009) | 2 lines branches/zip: Remove the remaining references to hash_table_t::adapive from UNIV_HOTBACKUP builds. This should have been done in r4515. ------------------------------------------------------------------------ r4524 | marko | 2009-03-23 14:05:18 +0200 (Mon, 23 Mar 2009) | 2 lines branches/zip: Enclose recv_recovery_from_backup_on and recv_recovery_from_backup_is_on() in #ifdef UNIV_LOG_ARCHIVE. ------------------------------------------------------------------------ r4525 | marko | 2009-03-23 14:57:45 +0200 (Mon, 23 Mar 2009) | 2 lines branches/zip: recv_parse_or_apply_log_rec_body(): Add debug assertions ensuring that FIL_PAGE_TYPE makes sense when applying log records. ------------------------------------------------------------------------ r4526 | marko | 2009-03-23 16:21:34 +0200 (Mon, 23 Mar 2009) | 2 lines branches/zip: Remove unneeded definitions and dependencies from UNIV_HOTBACKUP builds. ------------------------------------------------------------------------ r4527 | calvin | 2009-03-23 23:15:33 +0200 (Mon, 23 Mar 2009) | 5 lines branches/zip: adjust build files on Windows Adjust the patch positions based on the latest MySQL source. Also add the patches to the .bat files for vs9. ------------------------------------------------------------------------
17 years ago
branches/innodb+: Merge revisions 5091:5143 from branches/zip: ------------------------------------------------------------------------ r5092 | marko | 2009-05-25 09:54:17 +0300 (Mon, 25 May 2009) | 1 line branches/zip: Adjust some function comments after r5091. ------------------------------------------------------------------------ r5100 | marko | 2009-05-25 12:09:45 +0300 (Mon, 25 May 2009) | 1 line branches/zip: Split some long lines that were introduced in r5091. ------------------------------------------------------------------------ r5101 | marko | 2009-05-25 12:42:47 +0300 (Mon, 25 May 2009) | 2 lines branches/zip: Introduce the macro TEMP_INDEX_PREFIX_STR. This is to avoid triggering an error in Doxygen. ------------------------------------------------------------------------ r5102 | marko | 2009-05-25 13:47:14 +0300 (Mon, 25 May 2009) | 1 line branches/zip: Add missing file comments. ------------------------------------------------------------------------ r5103 | marko | 2009-05-25 13:52:29 +0300 (Mon, 25 May 2009) | 10 lines branches/zip: Add @file comments, and convert decorative /********************************* comments to Doxygen /** style like this: /*****************************//** This conversion was performed by the following command: perl -i -e 'while(<ARGV>){if (m|^/\*{30}\**$|) { s|\*{4}$|//**| if ++$com>1; $_ .= "\@file $ARGV\n" if $com==2} print; if(eof){$.=0;undef $com}}' */*[ch] include/univ.i ------------------------------------------------------------------------ r5104 | marko | 2009-05-25 14:39:07 +0300 (Mon, 25 May 2009) | 2 lines branches/zip: Revert ut0auxconf_* to r5102, that is, make Doxygen ignore these test programs. ------------------------------------------------------------------------ r5105 | marko | 2009-05-25 14:52:20 +0300 (Mon, 25 May 2009) | 2 lines branches/zip: Enclose some #error checks inside #ifndef DOXYGEN to prevent bogus Doxygen errors. ------------------------------------------------------------------------ r5106 | marko | 2009-05-25 16:09:24 +0300 (Mon, 25 May 2009) | 2 lines branches/zip: Add some Doxygen comments, mainly to structs, typedefs, macros and global variables. Many more to go. ------------------------------------------------------------------------ r5108 | marko | 2009-05-26 00:32:35 +0300 (Tue, 26 May 2009) | 2 lines branches/zip: lexyy.c: Remove the inadvertently added @file directive. There is nothing for Doxygen to see in this file, move along. ------------------------------------------------------------------------ r5125 | marko | 2009-05-26 16:28:49 +0300 (Tue, 26 May 2009) | 3 lines branches/zip: Add some Doxygen comments for many structs, typedefs, #defines and global variables. Many are still missing. ------------------------------------------------------------------------ r5134 | marko | 2009-05-27 09:08:43 +0300 (Wed, 27 May 2009) | 1 line branches/zip: Add some Doxygen @return comments. ------------------------------------------------------------------------ r5139 | marko | 2009-05-27 10:01:40 +0300 (Wed, 27 May 2009) | 1 line branches/zip: Add Doxyfile. ------------------------------------------------------------------------ r5143 | marko | 2009-05-27 10:57:25 +0300 (Wed, 27 May 2009) | 3 lines branches/zip: buf0buf.h, Doxyfile: Fix the Doxygen translation. @defgroup is for source code modules, not for field groups. Tell Doxygen to expand the UT_LIST declarations. ------------------------------------------------------------------------
17 years ago
20 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
20 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
Merge Google encryption commit 195158e9889365dc3298f8c1f3bcaa745992f27f Author: Minli Zhu <minliz@google.com> Date: Mon Nov 25 11:05:55 2013 -0800 Innodb redo log encryption/decryption. Use start lsn of a log block as part of AES CTR counter. Record key version with each checkpoint. Internally key version 0 means no encryption. Tests done (see test_innodb_log_encryption.sh for detail): - Verify flag innodb_encrypt_log on or off, combined with various key versions passed through CLI, and dynamically set after startup, will not corrupt database. This includes tests from being unencrypted to encrypted, and encrypted to unencrypted. - Verify start-up with no redo logs succeeds. - Verify fresh start-up succeeds. Change-Id: I4ce4c2afdf3076be2fce90ebbc2a7ce01184b612 commit c1b97273659f07866758c25f4a56f680a1fbad24 Author: Jonas Oreland <jonaso@google.com> Date: Tue Dec 3 18:47:27 2013 +0100 encryption of aria data&index files this patch implements encryption of aria data & index files. this is implemented as 1) add read/write hooks (renamed from callbacks) that does encrypt/decrypt (also add pre_read and post_write hooks) 2) modify page headers for data/index to contain key version (making the data-page header size different for with/without encryption) 3) modify index page 0 to contain IV (and crypt header) 4) AES CRT crypt functions 5) counter block is implemented using combination of page no, lsn and table specific id NOTE: 1) log files are not encrypted, this is not needed for if aria is only used for internal temporary tables and they are not transactional (i.e not logged) 2) all encrypted tables are using PAGE_CHECKSUM (crc) normal internal temporary tables are (currently) not CHECKSUM:ed 3) This patch adds insert-order semantics to aria block_format. The default behaviour of aria block-format is best-fit, meaning that rows gets allocated to page trying to fill the pages as much as possible. However, certain sql constructs materialize temporary result in tmp-tables, and expect that a table scan will later return the rows in the same order they were inserted. This implementation of insert-order is only enabled when explicitly requested by sql-layer. CHANGES: 1) found bug in ma_write that made code try to abort a record that was never written unsure why this is not exposed Change-Id: Ia82bbaa92e2c0629c08693c5add2f56b815c0509 commit 89dc1ab651fe0205d55b4eb588f62df550aa65fc Author: Jonas Oreland <jonaso@google.com> Date: Mon Feb 17 08:04:50 2014 -0800 Implement encryption of innodb datafiles. Pages are encrypted before written to disk and decrypted when read from disk. Each page except first page (page 0) in tablespace is encrypted. Page 0 is unencrypted and contains IV for the tablespace. FIL_PAGE_FILE_FLUSH_LSN on each page (except page 0) is used to store a 32-bit key-version, so that multiple keys can be active in a tablespace simultaneous. The other 32-bit of the FIL_PAGE_FILE_FLUSH_LSN field contains a checksum that is computed after encryption. This checksum is used by innochecksum and when restoring from double-write-buffer. The encryption is performed using AES CRT. Monitoring of encryption is enabled using new IS-table INNODB_TABLESPACES_ENCRYPTION. In addition to that new status variables innodb_encryption_rotation_{ pages_read_from_cache, pages_read_from_disk, pages_modified,pages_flushed } has been added. The following tunables are introduces - innodb_encrypt_tables - innodb_encryption_threads - innodb_encryption_rotate_key_age - innodb_encryption_rotation_iops Change-Id: I8f651795a30b52e71b16d6bc9cb7559be349d0b2 commit a17eef2f6948e58219c9e26fc35633d6fd4de1de Author: Andrew Ford <andrewford@google.com> Date: Thu Jan 2 15:43:09 2014 -0800 Key management skeleton with debug hooks. Change-Id: Ifd6aa3743d7ea291c70083f433a059c439aed866 commit 68a399838ad72264fd61b3dc67fecd29bbdb0af1 Author: Andrew Ford <andrewford@google.com> Date: Mon Oct 28 16:27:44 2013 -0700 Add AES-128 CTR and GCM encryption classes. Change-Id: I116305eced2a233db15306bc2ef5b9d398d1a3a2
11 years ago
20 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
TSAN: unprotected global variable WARNING: ThreadSanitizer: data race (pid=1510842) Write of size 8 at 0x0000067b1e98 by main thread: #0 os_file_pwrite(IORequest const&, int, unsigned char const*, unsigned long, unsigned long, dberr_t*) /storage/innobase/os/os0file.cc:2928:2 (mariadbd+0x234c5ac) #1 os_file_write_func(IORequest const&, char const*, int, void const*, unsigned long, unsigned long) /storage/innobase/os/os0file.cc:2963:20 (mariadbd+0x234c019) #2 file_os_io::write(char const*, unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:320:10 (mariadbd+0x22eaa50) #3 log_file_t::write(unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:434:18 (mariadbd+0x22eb1d8) #4 log_t::file::write(unsigned long, st_::span<unsigned char>) /storage/innobase/log/log0log.cc:496:29 (mariadbd+0x22ebb55) #5 log_write_buf(unsigned char*, unsigned long, unsigned long, unsigned long, unsigned long) /storage/innobase/log/log0log.cc:614:14 (mariadbd+0x22f1b51) #6 log_write(bool) /storage/innobase/log/log0log.cc:755:2 (mariadbd+0x22ed2ec) #7 log_write_up_to(unsigned long, bool, bool, completion_callback const*) /storage/innobase/log/log0log.cc:817:5 (mariadbd+0x22eca44) #8 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1734:5 (mariadbd+0x20d37c1) #9 log_checkpoint() /storage/innobase/buf/buf0flu.cc:1787:10 (mariadbd+0x20cd155) #10 buf_flush_wait_flushed(unsigned long) /storage/innobase/buf/buf0flu.cc:1867:5 (mariadbd+0x20ccf8f) #11 log_make_checkpoint() /storage/innobase/buf/buf0flu.cc:1793:3 (mariadbd+0x20cc4c9) #12 buf_dblwr_t::create() /storage/innobase/buf/buf0dblwr.cc:216:3 (mariadbd+0x209076a) #13 srv_start(bool) /storage/innobase/srv/srv0start.cc:1685:20 (mariadbd+0x256b4aa) #14 innodb_init(void*) /storage/innobase/handler/ha_innodb.cc:4188:8 (mariadbd+0x1ed40da) #15 ha_initialize_handlerton(st_plugin_int*) /sql/handler.cc:659:31 (mariadbd+0xf7c2b6) #16 plugin_initialize(st_mem_root*, st_plugin_int*, int*, char**, bool) /sql/sql_plugin.cc:1463:9 (mariadbd+0x160fedb) #17 plugin_init(int*, char**, int) /sql/sql_plugin.cc:1756:15 (mariadbd+0x160f53f) #18 init_server_components() /sql/mysqld.cc:5043:7 (mariadbd+0xd71462) #19 mysqld_main(int, char**) /sql/mysqld.cc:5655:7 (mariadbd+0xd6ae87) #20 main /sql/main.cc:34:10 (mariadbd+0xd661c8) Previous write of size 8 at 0x0000067b1e98 by thread T3: #0 os_file_pwrite(IORequest const&, int, unsigned char const*, unsigned long, unsigned long, dberr_t*) /storage/innobase/os/os0file.cc:2928:2 (mariadbd+0x234c5ac) #1 os_file_write_func(IORequest const&, char const*, int, void const*, unsigned long, unsigned long) /storage/innobase/os/os0file.cc:2963:20 (mariadbd+0x234c019) #2 file_os_io::write(char const*, unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:320:10 (mariadbd+0x22eaa50) #3 log_file_t::write(unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:434:18 (mariadbd+0x22eb1d8) #4 log_t::file::write(unsigned long, st_::span<unsigned char>) /storage/innobase/log/log0log.cc:496:29 (mariadbd+0x22ebb55) #5 log_write_checkpoint_info(unsigned long) /storage/innobase/log/log0log.cc:911:14 (mariadbd+0x22edd4e) #6 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1755:3 (mariadbd+0x20d3a3d) #7 buf_flush_sync_for_checkpoint(unsigned long) /storage/innobase/buf/buf0flu.cc:1947:7 (mariadbd+0x20d4163) #8 buf_flush_page_cleaner() /storage/innobase/buf/buf0flu.cc:2186:9 (mariadbd+0x20cdab1) #9 void std::__invoke_impl<void, void (*)()>(std::__invoke_other, void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:61:14 (mariadbd+0x20c3aaa) #10 std::__invoke_result<void (*)()>::type std::__invoke<void (*)()>(void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:96:14 (mariadbd+0x20c39bd) #11 void std::thread::_Invoker<std::tuple<void (*)()> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:253:13 (mariadbd+0x20c3965) #12 std::thread::_Invoker<std::tuple<void (*)()> >::operator()() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:260:11 (mariadbd+0x20c3905) #13 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)()> > >::_M_run() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:211:13 (mariadbd+0x20c37f9) #14 <null> <null> (libstdc++.so.6+0xd230f) Location is global 'os_n_file_writes' of size 8 at 0x0000067b1e98 (mariadbd+0x67b1e98) Make variable atomic.
4 years ago
TSAN: data race on a global counter WARNING: ThreadSanitizer: data race (pid=1503350) Write of size 8 at 0x0000067b1f20 by thread T3: #0 os_file_sync_posix(int) /storage/innobase/os/os0file.cc:895:5 (mariadbd+0x23493f6) #1 os_file_flush_func(int) /storage/innobase/os/os0file.cc:983:8 (mariadbd+0x2349204) #2 file_os_io::flush() /storage/innobase/log/log0log.cc:326:10 (mariadbd+0x22eaaa9) #3 log_file_t::flush() /storage/innobase/log/log0log.cc:440:18 (mariadbd+0x22eb2d0) #4 log_t::file::flush() /storage/innobase/log/log0log.cc:507:29 (mariadbd+0x22ebe69) #5 log_write_flush_to_disk_low(unsigned long) /storage/innobase/log/log0log.cc:629:17 (mariadbd+0x22ed3f3) #6 log_write_up_to(unsigned long, bool, bool, completion_callback const*) /storage/innobase/log/log0log.cc:829:3 (mariadbd+0x22ecb04) #7 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1734:5 (mariadbd+0x20d37f1) #8 buf_flush_sync_for_checkpoint(unsigned long) /storage/innobase/buf/buf0flu.cc:1947:7 (mariadbd+0x20d4193) #9 buf_flush_page_cleaner() /storage/innobase/buf/buf0flu.cc:2186:9 (mariadbd+0x20cdad7) #10 void std::__invoke_impl<void, void (*)()>(std::__invoke_other, void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:61:14 (mariadbd+0x20c3aaa) #11 std::__invoke_result<void (*)()>::type std::__invoke<void (*)()>(void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:96:14 (mariadbd+0x20c39bd) #12 void std::thread::_Invoker<std::tuple<void (*)()> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:253:13 (mariadbd+0x20c3965) #13 std::thread::_Invoker<std::tuple<void (*)()> >::operator()() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:260:11 (mariadbd+0x20c3905) #14 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)()> > >::_M_run() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:211:13 (mariadbd+0x20c37f9) #15 <null> <null> (libstdc++.so.6+0xd230f) Previous write of size 8 at 0x0000067b1f20 by main thread: #0 os_file_sync_posix(int) /storage/innobase/os/os0file.cc:895:5 (mariadbd+0x23493f6) #1 os_file_flush_func(int) /storage/innobase/os/os0file.cc:983:8 (mariadbd+0x2349204) #2 fil_space_t::flush_low() /storage/innobase/fil/fil0fil.cc:504:5 (mariadbd+0x205cad5) #3 fil_flush_file_spaces() /storage/innobase/fil/fil0fil.cc:2947:13 (mariadbd+0x206523f) #4 log_checkpoint() /storage/innobase/buf/buf0flu.cc:1777:5 (mariadbd+0x20cd069) #5 buf_flush_wait_flushed(unsigned long) /storage/innobase/buf/buf0flu.cc:1867:5 (mariadbd+0x20ccf95) #6 log_make_checkpoint() /storage/innobase/buf/buf0flu.cc:1793:3 (mariadbd+0x20cc4c9) #7 buf_dblwr_t::create() /storage/innobase/buf/buf0dblwr.cc:216:3 (mariadbd+0x209076a) #8 srv_start(bool) /storage/innobase/srv/srv0start.cc:1685:20 (mariadbd+0x256b514) #9 innodb_init(void*) /storage/innobase/handler/ha_innodb.cc:4188:8 (mariadbd+0x1ed406a) #10 ha_initialize_handlerton(st_plugin_int*) /sql/handler.cc:659:31 (mariadbd+0xf7c246) #11 plugin_initialize(st_mem_root*, st_plugin_int*, int*, char**, bool) /sql/sql_plugin.cc:1463:9 (mariadbd+0x160fe6b) #12 plugin_init(int*, char**, int) /sql/sql_plugin.cc:1756:15 (mariadbd+0x160f4cf) #13 init_server_components() /sql/mysqld.cc:5043:7 (mariadbd+0xd713f2) #14 mysqld_main(int, char**) /sql/mysqld.cc:5655:7 (mariadbd+0xd6ae17) #15 main /sql/main.cc:34:10 (mariadbd+0xd66158) This is a correct report by TSAN for an obvious case: unprotected global counter. Fix it by making counter std::atomic.
4 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
10 years ago
10 years ago
20 years ago
10 years ago
10 years ago
20 years ago
20 years ago
20 years ago
20 years ago
13 years ago
20 years ago
20 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
20 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
20 years ago
MDEV-33379 innodb_log_file_buffering=OFF causes corruption on bcachefs Apparently, invoking fcntl(fd, F_SETFL, O_DIRECT) will lead to unexpected behaviour on Linux bcachefs and possibly other file systems, depending on the operating system version. So, let us avoid doing that, and instead just attempt to pass the O_DIRECT flag to open(). This should make us compatible with NetBSD, IBM AIX, as well as Solaris and its derivatives. This fix does not change the fact that we had only implemented innodb_log_file_buffering=OFF on systems where we can determine the physical block size (typically 512 or 4096 bytes). Currently, those operating systems are Linux and Microsoft Windows. HAVE_FCNTL_DIRECT, os_file_set_nocache(): Remove. OS_FILE_OVERWRITE, OS_FILE_CREATE_PATH: Remove (never used parameters). os_file_log_buffered(), os_file_log_maybe_unbuffered(): Helper functions. os_file_create_simple_func(): When applicable, initially attempt to open files in O_DIRECT mode. os_file_create_func(): When applicable, initially attempt to open files in O_DIRECT mode. For type==OS_LOG_FILE && create_mode != OS_FILE_CREATE we will first invoke stat(2) on the file name to find out if the size is compatible with O_DIRECT. If create_mode == OS_FILE_CREATE, we will invoke fstat(2) on the created log file afterwards, and may close and reopen the file in O_DIRECT mode if applicable. create_temp_file(): Support O_DIRECT. This is only used if O_TMPFILE is available and innodb_disable_sort_file_cache=ON (non-default value). Notably, that setting never worked on Microsoft Windows. row_merge_file_create_mode(): Split from row_merge_file_create_low(). Create a temporary file in the specified mode. Reviewed by: Vladislav Vaintroub
2 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-25506 (3 of 3): Do not delete .ibd files before commit This is a complete rewrite of DROP TABLE, also as part of other DDL, such as ALTER TABLE, CREATE TABLE...SELECT, TRUNCATE TABLE. The background DROP TABLE queue hack is removed. If a transaction needs to drop and create a table by the same name (like TRUNCATE TABLE does), it must first rename the table to an internal #sql-ib name. No committed version of the data dictionary will include any #sql-ib tables, because whenever a transaction renames a table to a #sql-ib name, it will also drop that table. Either the rename will be rolled back, or the drop will be committed. Data files will be unlinked after the transaction has been committed and a FILE_RENAME record has been durably written. The file will actually be deleted when the detached file handle returned by fil_delete_tablespace() will be closed, after the latches have been released. It is possible that a purge of the delete of the SYS_INDEXES record for the clustered index will execute fil_delete_tablespace() concurrently with the DDL transaction. In that case, the thread that arrives later will wait for the other thread to finish. HTON_TRUNCATE_REQUIRES_EXCLUSIVE_USE: A new handler flag. ha_innobase::truncate() now requires that all other references to the table be released in advance. This was implemented by Monty. ha_innobase::delete_table(): If CREATE TABLE..SELECT is detected, we will "hijack" the current transaction, drop the table in the current transaction and commit the current transaction. This essentially fixes MDEV-21602. There is a FIXME comment about making the check less failure-prone. ha_innobase::truncate(), ha_innobase::delete_table(): Implement a fast path for temporary tables. We will no longer allow temporary tables to use the adaptive hash index. dict_table_t::mdl_name: The original table name for the purpose of acquiring MDL in purge, to prevent a race condition between a DDL transaction that is dropping a table, and purge processing undo log records of DML that had executed before the DDL operation. For #sql-backup- tables during ALTER TABLE...ALGORITHM=COPY, the dict_table_t::mdl_name will differ from dict_table_t::name. dict_table_t::parse_name(): Use mdl_name instead of name. dict_table_rename_in_cache(): Update mdl_name. For the internal FTS_ tables of FULLTEXT INDEX, purge would acquire MDL on the FTS_ table name, but not on the main table, and therefore it would be able to run concurrently with a DDL transaction that is dropping the table. Previously, the DROP TABLE queue hack prevented a race between purge and DDL. For now, we introduce purge_sys.stop_FTS() to prevent purge from opening any table, while a DDL transaction that may drop FTS_ tables is in progress. The function fts_lock_table(), which will be invoked before the dictionary is locked, will wait for purge to release any table handles. trx_t::drop_table_statistics(): Drop statistics for the table. This replaces dict_stats_drop_index(). We will drop or rename persistent statistics atomically as part of DDL transactions. On lock conflict for dropping statistics, we will fail instantly with DB_LOCK_WAIT_TIMEOUT, because we will be holding the exclusive data dictionary latch. trx_t::commit_cleanup(): Separated from trx_t::commit_in_memory(). Relax an assertion around fts_commit() and allow DB_LOCK_WAIT_TIMEOUT in addition to DB_DUPLICATE_KEY. The call to fts_commit() is entirely misplaced here and may obviously break the consistency of transactions that affect FULLTEXT INDEX. It needs to be fixed separately. dict_table_t::n_foreign_key_checks_running: Remove (MDEV-21175). The counter was a work-around for missing meta-data locking (MDL) on the SQL layer, and not really needed in MariaDB. ER_TABLE_IN_FK_CHECK: Replaced with ER_UNUSED_28. HA_ERR_TABLE_IN_FK_CHECK: Remove. row_ins_check_foreign_constraints(): Do not acquire dict_sys.latch either. The SQL-layer MDL will protect us. This was reviewed by Thirunarayanan Balathandayuthapani and tested by Matthias Leich.
4 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-28766: SET GLOBAL innodb_log_file_buffering In commit c4c88307091cb16886562e9e7b77f5fd077d34b5 (MDEV-28111) we disabled the file system cache on the InnoDB write-ahead log file (ib_logfile0) by default on Linux. It turns out that especially with innodb_flush_trx_log_at_commit=2, writing to the log via the file system cache typically improves throughput, especially on slow storage or at a small number of concurrent transactions. For other values of innodb_flush_log_at_trx_commit, direct writes were observed to be mostly but not always faster. Whether it pays off to disable the file system cache on the log may depend on the type of storage, the workload, and the operating system kernel version. On Linux and Microsoft Windows, we will introduce the settable Boolean global variable innodb_log_file_buffering that indicates whether the file system cache on the redo log file is enabled. The default value is innodb_log_file_buffering=OFF. If the server is started up with innodb_flush_log_at_trx_commit=2, the value will be changed to innodb_log_file_buffering=ON. When a persistent memory interface is being used for the log, the value cannot be changed from innodb_log_file_buffering=OFF. On Linux, when the physical block size cannot be determined to be a power of 2 between 64 and 4096 bytes, the file system cache cannot be disabled, and innodb_log_file_buffering=ON cannot be changed. Server log messages will indicate whether the file system cache is enabled for the redo log: [Note] InnoDB: Buffered log writes (block size=512 bytes) [Note] InnoDB: File system buffers for log disabled (block size=512 bytes) After this change, the startup parameter innodb_flush_method will no longer control whether O_DIRECT will be set on the redo log on Linux. On other operating systems that support O_DIRECT, no interface has been implemented for controlling the file system cache for the redo log. The innodb_flush_method values O_DIRECT, O_DIRECT_NO_FSYNC, O_DSYNC will enable O_DIRECT for data files, not the log. Tested by: Matthias Leich, Axel Schwenke
3 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
MDEV-28766: SET GLOBAL innodb_log_file_buffering In commit c4c88307091cb16886562e9e7b77f5fd077d34b5 (MDEV-28111) we disabled the file system cache on the InnoDB write-ahead log file (ib_logfile0) by default on Linux. It turns out that especially with innodb_flush_trx_log_at_commit=2, writing to the log via the file system cache typically improves throughput, especially on slow storage or at a small number of concurrent transactions. For other values of innodb_flush_log_at_trx_commit, direct writes were observed to be mostly but not always faster. Whether it pays off to disable the file system cache on the log may depend on the type of storage, the workload, and the operating system kernel version. On Linux and Microsoft Windows, we will introduce the settable Boolean global variable innodb_log_file_buffering that indicates whether the file system cache on the redo log file is enabled. The default value is innodb_log_file_buffering=OFF. If the server is started up with innodb_flush_log_at_trx_commit=2, the value will be changed to innodb_log_file_buffering=ON. When a persistent memory interface is being used for the log, the value cannot be changed from innodb_log_file_buffering=OFF. On Linux, when the physical block size cannot be determined to be a power of 2 between 64 and 4096 bytes, the file system cache cannot be disabled, and innodb_log_file_buffering=ON cannot be changed. Server log messages will indicate whether the file system cache is enabled for the redo log: [Note] InnoDB: Buffered log writes (block size=512 bytes) [Note] InnoDB: File system buffers for log disabled (block size=512 bytes) After this change, the startup parameter innodb_flush_method will no longer control whether O_DIRECT will be set on the redo log on Linux. On other operating systems that support O_DIRECT, no interface has been implemented for controlling the file system cache for the redo log. The innodb_flush_method values O_DIRECT, O_DIRECT_NO_FSYNC, O_DSYNC will enable O_DIRECT for data files, not the log. Tested by: Matthias Leich, Axel Schwenke
3 years ago
MDEV-28766: SET GLOBAL innodb_log_file_buffering In commit c4c88307091cb16886562e9e7b77f5fd077d34b5 (MDEV-28111) we disabled the file system cache on the InnoDB write-ahead log file (ib_logfile0) by default on Linux. It turns out that especially with innodb_flush_trx_log_at_commit=2, writing to the log via the file system cache typically improves throughput, especially on slow storage or at a small number of concurrent transactions. For other values of innodb_flush_log_at_trx_commit, direct writes were observed to be mostly but not always faster. Whether it pays off to disable the file system cache on the log may depend on the type of storage, the workload, and the operating system kernel version. On Linux and Microsoft Windows, we will introduce the settable Boolean global variable innodb_log_file_buffering that indicates whether the file system cache on the redo log file is enabled. The default value is innodb_log_file_buffering=OFF. If the server is started up with innodb_flush_log_at_trx_commit=2, the value will be changed to innodb_log_file_buffering=ON. When a persistent memory interface is being used for the log, the value cannot be changed from innodb_log_file_buffering=OFF. On Linux, when the physical block size cannot be determined to be a power of 2 between 64 and 4096 bytes, the file system cache cannot be disabled, and innodb_log_file_buffering=ON cannot be changed. Server log messages will indicate whether the file system cache is enabled for the redo log: [Note] InnoDB: Buffered log writes (block size=512 bytes) [Note] InnoDB: File system buffers for log disabled (block size=512 bytes) After this change, the startup parameter innodb_flush_method will no longer control whether O_DIRECT will be set on the redo log on Linux. On other operating systems that support O_DIRECT, no interface has been implemented for controlling the file system cache for the redo log. The innodb_flush_method values O_DIRECT, O_DIRECT_NO_FSYNC, O_DSYNC will enable O_DIRECT for data files, not the log. Tested by: Matthias Leich, Axel Schwenke
3 years ago
branches/innodb+: Merge revisions r5971:6130 from branches/zip. ------------------------------------------------------------------------ r5971 | marko | 2009-09-23 23:03:51 +1000 (Wed, 23 Sep 2009) | 2 lines branches/zip: os_file_pwrite(): Make the code compile in InnoDB Hot Backup when the pwrite system call is not available. ------------------------------------------------------------------------ r5972 | marko | 2009-09-24 05:44:52 +1000 (Thu, 24 Sep 2009) | 5 lines branches/zip: fil_node_open_file(): In InnoDB Hot Backup, determine the page size of single-file tablespaces before computing the file node size. Otherwise, the space->size of compressed tablespaces would be computed with UNIV_PAGE_SIZE instead of key_block_size. This should fix Issue #313. ------------------------------------------------------------------------ r5973 | marko | 2009-09-24 05:53:21 +1000 (Thu, 24 Sep 2009) | 2 lines branches/zip: recv_add_to_hash_table(): Simplify obfuscated pointer arithmetics. ------------------------------------------------------------------------ r5978 | marko | 2009-09-24 17:47:56 +1000 (Thu, 24 Sep 2009) | 1 line branches/zip: Fix warnings and errors when UNIV_HOTBACKUP is defined. ------------------------------------------------------------------------ r5979 | marko | 2009-09-24 20:16:10 +1000 (Thu, 24 Sep 2009) | 4 lines branches/zip: ha_innodb.cc: Define MYSQL_PLUGIN_IMPORT when necessary. This preprocessor symbol has been recently introduced in MySQL 5.1. The InnoDB Plugin should remain source compatible with MySQL 5.1.24 and later. ------------------------------------------------------------------------ r5988 | calvin | 2009-09-26 05:14:43 +1000 (Sat, 26 Sep 2009) | 8 lines branches/zip: fix bug#47055 unconditional exit(1) on ERROR_WORKING_SET_QUOTA 1453 (0x5AD) for InnoDB backend When error ERROR_WORKING_SET_QUOTA or ERROR_NO_SYSTEM_RESOURCES occurs, yields for 100ms and retries the operation. Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5992 | vasil | 2009-09-28 17:10:29 +1000 (Mon, 28 Sep 2009) | 4 lines branches/zip: Add ChangeLog entry for c5988. ------------------------------------------------------------------------ r5994 | marko | 2009-09-28 18:33:59 +1000 (Mon, 28 Sep 2009) | 17 lines branches/zip: Try to prevent the reuse of tablespace identifiers after InnoDB has crashed during table creation. Also, refuse to start if files with duplicate tablespace identifiers are encountered. fil_node_create(): Update fil_system->max_assigned_id. This should prevent the reuse of a space->id when InnoDB does a full crash recovery and invokes fil_load_single_table_tablespaces(). Normally, fil_system->max_assigned_id is initialized from SELECT MAX(ID) FROM SYS_TABLES. fil_open_single_table_tablespace(): Return FALSE when fil_space_create() fails. fil_load_single_table_tablespace(): Exit if fil_space_create() fails and innodb_force_recovery=0. rb://173 approved by Heikki Tuuri. This addresses Issue #335. ------------------------------------------------------------------------ r5995 | marko | 2009-09-28 18:52:25 +1000 (Mon, 28 Sep 2009) | 17 lines branches/zip: Do not write to PAGE_INDEX_ID after page creation, not even when restoring an uncompressed page after a compression failure. btr_page_reorganize_low(): On compression failure, do not restore those page header fields that should not be affected by the reorganization. Instead, compare the fields. page_zip_decompress(): Add the parameter ibool all, for copying all page header fields. Pass the parameter all=TRUE on block read completion, redo log application, and page_zip_validate(); pass all=FALSE in all other cases. page_zip_reorganize(): Do not restore the uncompressed page on failure. It will be restored (to pre-modification state) by the caller anyway. rb://167, Issue #346 ------------------------------------------------------------------------ r5996 | marko | 2009-09-28 22:46:02 +1000 (Mon, 28 Sep 2009) | 4 lines branches/zip: Address Issue #350 in comments. lock_rec_queue_validate(), lock_rec_queue_validate(): Note that this debug code may violate the latching order and cause deadlocks. ------------------------------------------------------------------------ r5997 | marko | 2009-09-28 23:03:58 +1000 (Mon, 28 Sep 2009) | 12 lines branches/zip: Remove an assertion failure when the InnoDB data dictionary is inconsistent with the MySQL .frm file. ha_innobase::index_read(): When the index cannot be found, return an error. ha_innobase::change_active_index(): When prebuilt->index == NULL, set also prebuilt->index_usable = FALSE. This is not needed for correctness, because prebuilt->index_usable is only checked by row_search_for_mysql(), which requires prebuilt->index != NULL. This addresses Issue #349. Approved by Heikki Tuuri over IM. ------------------------------------------------------------------------ r6005 | vasil | 2009-09-29 18:09:52 +1000 (Tue, 29 Sep 2009) | 4 lines branches/zip: ChangeLog: wrap around 78th column, not earlier. ------------------------------------------------------------------------ r6006 | vasil | 2009-09-29 20:15:25 +1000 (Tue, 29 Sep 2009) | 4 lines branches/zip: Add ChangeLog entry for the release of 1.0.4. ------------------------------------------------------------------------ r6007 | vasil | 2009-09-29 23:19:59 +1000 (Tue, 29 Sep 2009) | 6 lines branches/zip: Fix the year, should be 2009. Pointed by: Calvin ------------------------------------------------------------------------ r6026 | marko | 2009-09-30 17:18:24 +1000 (Wed, 30 Sep 2009) | 1 line branches/zip: Add some debug assertions for checking FSEG_MAGIC_N. ------------------------------------------------------------------------ r6028 | marko | 2009-09-30 23:55:23 +1000 (Wed, 30 Sep 2009) | 3 lines branches/zip: recv_no_log_write: New debug flag for tracking down Mantis Issue #347. No modifications should be made to the database while recv_apply_hashed_log_recs() is about to complete. ------------------------------------------------------------------------ r6029 | calvin | 2009-10-01 06:32:02 +1000 (Thu, 01 Oct 2009) | 4 lines branches/zip: non-functional changes Fix typo. ------------------------------------------------------------------------ r6031 | marko | 2009-10-01 21:24:33 +1000 (Thu, 01 Oct 2009) | 49 lines branches/zip: Clean up after a crash during DROP INDEX. When InnoDB crashes while dropping an index, ensure that the index will be completely dropped during crash recovery. row_merge_drop_index(): Before dropping an index, rename the index to start with TEMP_INDEX_PREFIX_STR and commit the change, so that row_merge_drop_temp_indexes() will drop the index after crash recovery if the server crashes while dropping the index. fseg_inode_try_get(): New function, forked from fseg_inode_get(). Return NULL if the file segment index node is free. fseg_inode_get(): Assert that the file segment index node is not free. fseg_free_step(): If the file segment index node is already free, print a diagnostic message and return TRUE. fsp_free_seg_inode(): Write a nonzero number to FSEG_MAGIC_N, so that allocated-and-freed file segment index nodes can be better distinguished from uninitialized ones. This is rb://174, addressing Issue #348. Tested by restarting mysqld upon the completion of the added log_write_up_to() invocation below, during DROP INDEX. The index was dropped after crash recovery, and re-issuing the DROP INDEX did not crash the server. Index: btr/btr0btr.c =================================================================== --- btr/btr0btr.c (revision 6026) +++ btr/btr0btr.c (working copy) @@ -42,6 +42,7 @@ Created 6/2/1994 Heikki Tuuri #include "ibuf0ibuf.h" #include "trx0trx.h" +#include "log0log.h" /* Latching strategy of the InnoDB B-tree -------------------------------------- @@ -873,6 +874,8 @@ leaf_loop: goto leaf_loop; } + + log_write_up_to(mtr.end_lsn, LOG_WAIT_ALL_GROUPS, TRUE); top_loop: mtr_start(&mtr); ------------------------------------------------------------------------ r6033 | calvin | 2009-10-02 06:19:46 +1000 (Fri, 02 Oct 2009) | 4 lines branches/zip: fix a typo in error message Reported as bug#47763. ------------------------------------------------------------------------ r6043 | inaam | 2009-10-06 01:45:35 +1100 (Tue, 06 Oct 2009) | 12 lines branches/zip rb://176 Do not invalidate buffer pool while an LRU batch is active. Added code to buf_pool_invalidate() to wait for the running batches to finish. This patch also resets the state of buf_pool struct at invalidation. This addresses the concern where buf_pool->freed_page_clock becomes non-zero because we read in a system tablespace page for file format info at startup. Approved by: Marko ------------------------------------------------------------------------ r6044 | pekka | 2009-10-07 01:44:54 +1100 (Wed, 07 Oct 2009) | 5 lines branches/zip: Add os_file_is_same() function for Hot Backup (inside ifdef UNIV_HOTBACKUP). This is part of the fix for Issue #186. Note! The Windows implementation is incomplete. ------------------------------------------------------------------------ r6046 | pekka | 2009-10-08 20:24:56 +1100 (Thu, 08 Oct 2009) | 3 lines branches/zip: Revert r6044 which added os_file_is_same() function (issue#186). This functionality is moved to Hot Backup source tree. ------------------------------------------------------------------------ r6048 | vasil | 2009-10-09 16:42:55 +1100 (Fri, 09 Oct 2009) | 16 lines branches/zip: When scanning a directory readdir() is called and stat() after it, if a file is deleted between the two calls stat will fail and the whole precedure will fail. Change this behavior to continue with the next entry if stat() fails because of nonexistent file. This is transparent change as it will make it look as if the file was deleted before the readdir() call. This change is needed in order to fix https://svn.innodb.com/mantis/view.php?id=174 in which we need to abort if os_file_readdir_next_file() encounters "real" errors. Approved by: Marko, Pekka (rb://177) ------------------------------------------------------------------------ r6049 | vasil | 2009-10-10 03:05:26 +1100 (Sat, 10 Oct 2009) | 7 lines branches/zip: Fix compilation warning in Hot Backup: innodb/fil/fil0fil.c: In function 'fil_load_single_table_tablespace': innodb/fil/fil0fil.c:3253: warning: format '%lld' expects type 'long long int', but argument 6 has type 'ib_int64_t' ------------------------------------------------------------------------ r6064 | calvin | 2009-10-14 02:23:35 +1100 (Wed, 14 Oct 2009) | 4 lines branches/zip: non-functional changes Changes from MySQL to fix build issue. ------------------------------------------------------------------------ r6065 | inaam | 2009-10-14 04:43:13 +1100 (Wed, 14 Oct 2009) | 7 lines branches/zip rb://182 Call fsync() on datafiles after a batch of pages is written to disk even when skip_innodb_doublewrite is set. Approved by: Heikki ------------------------------------------------------------------------ r6080 | sunny | 2009-10-15 09:29:01 +1100 (Thu, 15 Oct 2009) | 3 lines branches/zip: Change page_mem_alloc_free() to inline. Fix Bug #47058 - Failure to compile innodb_plugin on solaris 10u7 + spro cc/CC 5.10 ------------------------------------------------------------------------ r6084 | vasil | 2009-10-15 16:21:17 +1100 (Thu, 15 Oct 2009) | 4 lines branches/zip: Add ChangeLog entry for r6080. ------------------------------------------------------------------------ r6095 | vasil | 2009-10-20 00:04:59 +1100 (Tue, 20 Oct 2009) | 7 lines branches/zip: Fix Bug#47808 innodb_information_schema.test fails when run under valgrind by using the wait_until_rows_count macro that loops until the number of rows becomes 14 instead of sleep 0.1, which is obviously very fragile. ------------------------------------------------------------------------ r6096 | vasil | 2009-10-20 00:06:09 +1100 (Tue, 20 Oct 2009) | 4 lines branches/zip: Add ChangeLog entry for r6095. ------------------------------------------------------------------------ r6099 | jyang | 2009-10-22 13:58:39 +1100 (Thu, 22 Oct 2009) | 7 lines branches/zip: Port bug #46000 related changes from 5.1 to zip branch. Due to different code path for creating index in zip branch comparing to 5.1), the index reserved name check function is extended to be used in ha_innobase::add_index(). rb://190 Approved by: Marko ------------------------------------------------------------------------ r6100 | jyang | 2009-10-22 14:51:07 +1100 (Thu, 22 Oct 2009) | 6 lines branches/zip: As a request from mysql, WARN_LEVEL_ERROR cannot be used for push_warning_* call any more. Switch to WARN_LEVEL_WARN. Bug #47233. rb://172 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6101 | jyang | 2009-10-23 19:45:50 +1100 (Fri, 23 Oct 2009) | 7 lines branches/zip: Update test result with the WARN_LEVEL_ERROR to WARN_LEVEL_WARN change. This is the same result as submitted in rb://172 review, which approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6102 | marko | 2009-10-26 18:32:23 +1100 (Mon, 26 Oct 2009) | 1 line branches/zip: row_prebuilt_struct::prebuilts: Unused field, remove. ------------------------------------------------------------------------ r6103 | marko | 2009-10-27 00:46:18 +1100 (Tue, 27 Oct 2009) | 4 lines branches/zip: row_ins_alloc_sys_fields(): Zero out the system columns DB_TRX_ID, DB_ROLL_PTR and DB_ROW_ID, in order to avoid harmless Valgrind warnings about uninitialized data. (The warnings were harmless, because the fields would be initialized at a later stage.) ------------------------------------------------------------------------ r6105 | calvin | 2009-10-28 09:05:52 +1100 (Wed, 28 Oct 2009) | 6 lines branches/zip: backport r3848 from 6.0 branch ---- branches/6.0: innobase_start_or_create_for_mysql(): Make the 10 MB minimum tablespace limit independent of UNIV_PAGE_SIZE. (Bug #41490) ------------------------------------------------------------------------ r6107 | marko | 2009-10-29 01:10:34 +1100 (Thu, 29 Oct 2009) | 5 lines branches/zip: buf_page_set_old(): Improve UNIV_LRU_DEBUG diagnostics in order to catch the buf_pool->LRU_old corruption reported in Issue #381. buf_LRU_old_init(): Set the property from the tail towards the front of the buf_pool->LRU list, in order not to trip the debug check. ------------------------------------------------------------------------ r6108 | calvin | 2009-10-29 16:58:04 +1100 (Thu, 29 Oct 2009) | 5 lines branches/zip: close file handle when building with UNIV_HOTBACKUP The change does not affect regular InnoDB engine. Confirmed by Marko. ------------------------------------------------------------------------ r6109 | jyang | 2009-10-29 19:37:32 +1100 (Thu, 29 Oct 2009) | 7 lines branches/zip: In os_mem_alloc_large(), if we fail to attach the shared memory, reset memory pointer ptr to NULL, and allocate memory from conventional pool. Bug #48237 Error handling in os_mem_alloc_large appears to be incorrect rb://198 Approved by: Marko ------------------------------------------------------------------------ r6110 | marko | 2009-10-29 21:44:57 +1100 (Thu, 29 Oct 2009) | 2 lines branches/zip: Makefile.am (INCLUDES): Merge a change from MySQL: Use $(srcdir)/include instead of $(top_srcdir)/storage/innobase/include. ------------------------------------------------------------------------ r6111 | marko | 2009-10-29 22:04:11 +1100 (Thu, 29 Oct 2009) | 33 lines branches/zip: Fix corruption of buf_pool->LRU_old and improve debug assertions. This was reported as Issue #381. buf_page_set_old(): Assert that blocks may only be set old if buf_pool->LRU_old is initialized and buf_pool->LRU_old_len is nonzero. Assert that buf_pool->LRU_old points to the block at the old/new boundary. buf_LRU_old_adjust_len(): Invoke buf_page_set_old() after adjusting buf_pool->LRU_old and buf_pool->LRU_old_len, in order not to violate the added assertions. buf_LRU_old_init(): Replace buf_page_set_old() with a direct assignment to bpage->old, because these loops that initialize all the blocks would temporarily violate the assertions about buf_pool->LRU_old. buf_LRU_remove_block(): When setting buf_pool->LRU_old = NULL, also clear all bpage->old flags and set buf_pool->LRU_old_len = 0. buf_LRU_add_block_to_end_low(), buf_LRU_add_block_low(): Move the buf_page_set_old() call later in order not to violate the debug assertions. If buf_pool->LRU_old is NULL, set old=FALSE. buf_LRU_free_block(): Replace the UNIV_LRU_DEBUG assertion with a dummy buf_page_set_old() call that performs more thorough checks. buf_LRU_validate(): Do not tolerate garbage in buf_pool->LRU_old_len even if buf_pool->LRU_old is NULL. Check that bpage->old is monotonic. buf_relocate(): Make the UNIV_LRU_DEBUG checks stricter. buf0buf.h: Revise the documentation of buf_page_t::old and buf_pool_t::LRU_old_len. ------------------------------------------------------------------------ r6112 | calvin | 2009-10-30 01:21:15 +1100 (Fri, 30 Oct 2009) | 4 lines branches/zip: consideration for icc compilers Proposed by MySQL, and approved by Marko. ------------------------------------------------------------------------ r6113 | vasil | 2009-10-30 03:15:50 +1100 (Fri, 30 Oct 2009) | 93 lines branches/zip: Merge r5912:6112 from branches/5.1: (after this merge the innodb-autoinc test starts to fail, but I commit anyway because it would be easier to investigate the failure this way) ------------------------------------------------------------------------ r5952 | calvin | 2009-09-22 19:45:07 +0300 (Tue, 22 Sep 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: fix bug#42383: Can't create table 'test.bug39438' For embedded server, MySQL may pass in full path, which is currently disallowed. It is needed to relax the condition by accepting full paths in the embedded case. Approved by: Heikki (on IM) ------------------------------------------------------------------------ r6032 | vasil | 2009-10-01 15:55:49 +0300 (Thu, 01 Oct 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix Bug#38996 Race condition in ANALYZE TABLE by serializing ANALYZE TABLE inside InnoDB. Approved by: Heikki (rb://175) ------------------------------------------------------------------------ r6045 | jyang | 2009-10-08 02:27:08 +0300 (Thu, 08 Oct 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug47777.result A /branches/5.1/mysql-test/innodb_bug47777.test branches/5.1: Fix bug #47777. Treat the Geometry data same as Binary BLOB in ha_innobase::store_key_val_for_row(), since the Geometry data is stored as Binary BLOB in Innodb. Review: rb://180 approved by Marko Makela. ------------------------------------------------------------------------ r6051 | sunny | 2009-10-12 07:05:00 +0300 (Mon, 12 Oct 2009) | 6 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Ignore negative values supplied by the user when calculating the next value to store in dict_table_t. Setting autoincrement columns top negative values is undefined behavior and this change should bring the behavior of InnoDB closer to what users expect. Added several tests to check. rb://162 ------------------------------------------------------------------------ r6052 | sunny | 2009-10-12 07:09:56 +0300 (Mon, 12 Oct 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Reset the statement level autoinc counter on ROLLBACK. Fix the test results too. rb://164 ------------------------------------------------------------------------ r6053 | sunny | 2009-10-12 07:37:49 +0300 (Mon, 12 Oct 2009) | 6 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Copy the maximum AUTOINC value from the old table to the new table when MySQL does a CREATE INDEX ON T. This is required because MySQL does a table copy, rename and drops the old table. Fix Bug#47125: auto_increment start value is ignored if an index is created and engine=innodb rb://168 ------------------------------------------------------------------------ r6076 | vasil | 2009-10-14 19:30:12 +0300 (Wed, 14 Oct 2009) | 4 lines Changed paths: M /branches/5.1/row/row0mysql.c branches/5.1: Fix typo. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6114 | vasil | 2009-10-30 03:43:51 +1100 (Fri, 30 Oct 2009) | 6 lines branches/zip: * Add ChangeLog entries for latest changes * Obey alphabetical order in the list of the files * White-space fixup ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 10:42:11 +1100 (Fri, 30 Oct 2009) | 7 lines branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ r6124 | jyang | 2009-10-30 19:02:31 +1100 (Fri, 30 Oct 2009) | 5 lines branches/zip: Correct the bug number for -r6109 change from # 48273 to #48237 ------------------------------------------------------------------------ r6126 | vasil | 2009-10-30 19:36:07 +1100 (Fri, 30 Oct 2009) | 45 lines branches/zip: Merge r6112:6125 from branches/5.1: (skipping r6122 and r6123, Jimmy says these are already present and need not be merged): ------------------------------------------------------------------------ r6122 | jyang | 2009-10-30 05:18:38 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug44369.result M /branches/5.1/mysql-test/innodb_bug44369.test M /branches/5.1/mysql-test/innodb_bug46000.result M /branches/5.1/mysql-test/innodb_bug46000.test branches/5.1: Chnage WARN_LEVEL_ERROR to WARN_LEVEL_WARN for push_warning_printf() call in innodb. Fix Bug#47233: Innodb calls push_warning(MYSQL_ERROR::WARN_LEVEL_ERROR) rb://170 approved by Marko. ------------------------------------------------------------------------ r6123 | jyang | 2009-10-30 05:43:06 +0200 (Fri, 30 Oct 2009) | 8 lines Changed paths: M /branches/5.1/os/os0proc.c branches/5.1: In os_mem_alloc_large(), if we fail to attach the shared memory, reset memory pointer ptr to NULL, and allocate memory from conventional pool. This is a port from branches/zip. Bug #48237 Error handling in os_mem_alloc_large appears to be incorrect rb://198 Approved by: Marko ------------------------------------------------------------------------ r6125 | vasil | 2009-10-30 10:31:23 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White-space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 20:42:56 +1100 (Mon, 02 Nov 2009) | 9 lines branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------
16 years ago
MDEV-28766: SET GLOBAL innodb_log_file_buffering In commit c4c88307091cb16886562e9e7b77f5fd077d34b5 (MDEV-28111) we disabled the file system cache on the InnoDB write-ahead log file (ib_logfile0) by default on Linux. It turns out that especially with innodb_flush_trx_log_at_commit=2, writing to the log via the file system cache typically improves throughput, especially on slow storage or at a small number of concurrent transactions. For other values of innodb_flush_log_at_trx_commit, direct writes were observed to be mostly but not always faster. Whether it pays off to disable the file system cache on the log may depend on the type of storage, the workload, and the operating system kernel version. On Linux and Microsoft Windows, we will introduce the settable Boolean global variable innodb_log_file_buffering that indicates whether the file system cache on the redo log file is enabled. The default value is innodb_log_file_buffering=OFF. If the server is started up with innodb_flush_log_at_trx_commit=2, the value will be changed to innodb_log_file_buffering=ON. When a persistent memory interface is being used for the log, the value cannot be changed from innodb_log_file_buffering=OFF. On Linux, when the physical block size cannot be determined to be a power of 2 between 64 and 4096 bytes, the file system cache cannot be disabled, and innodb_log_file_buffering=ON cannot be changed. Server log messages will indicate whether the file system cache is enabled for the redo log: [Note] InnoDB: Buffered log writes (block size=512 bytes) [Note] InnoDB: File system buffers for log disabled (block size=512 bytes) After this change, the startup parameter innodb_flush_method will no longer control whether O_DIRECT will be set on the redo log on Linux. On other operating systems that support O_DIRECT, no interface has been implemented for controlling the file system cache for the redo log. The innodb_flush_method values O_DIRECT, O_DIRECT_NO_FSYNC, O_DSYNC will enable O_DIRECT for data files, not the log. Tested by: Matthias Leich, Axel Schwenke
3 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
branches/innodb+: Merge revisions r5971:6130 from branches/zip. ------------------------------------------------------------------------ r5971 | marko | 2009-09-23 23:03:51 +1000 (Wed, 23 Sep 2009) | 2 lines branches/zip: os_file_pwrite(): Make the code compile in InnoDB Hot Backup when the pwrite system call is not available. ------------------------------------------------------------------------ r5972 | marko | 2009-09-24 05:44:52 +1000 (Thu, 24 Sep 2009) | 5 lines branches/zip: fil_node_open_file(): In InnoDB Hot Backup, determine the page size of single-file tablespaces before computing the file node size. Otherwise, the space->size of compressed tablespaces would be computed with UNIV_PAGE_SIZE instead of key_block_size. This should fix Issue #313. ------------------------------------------------------------------------ r5973 | marko | 2009-09-24 05:53:21 +1000 (Thu, 24 Sep 2009) | 2 lines branches/zip: recv_add_to_hash_table(): Simplify obfuscated pointer arithmetics. ------------------------------------------------------------------------ r5978 | marko | 2009-09-24 17:47:56 +1000 (Thu, 24 Sep 2009) | 1 line branches/zip: Fix warnings and errors when UNIV_HOTBACKUP is defined. ------------------------------------------------------------------------ r5979 | marko | 2009-09-24 20:16:10 +1000 (Thu, 24 Sep 2009) | 4 lines branches/zip: ha_innodb.cc: Define MYSQL_PLUGIN_IMPORT when necessary. This preprocessor symbol has been recently introduced in MySQL 5.1. The InnoDB Plugin should remain source compatible with MySQL 5.1.24 and later. ------------------------------------------------------------------------ r5988 | calvin | 2009-09-26 05:14:43 +1000 (Sat, 26 Sep 2009) | 8 lines branches/zip: fix bug#47055 unconditional exit(1) on ERROR_WORKING_SET_QUOTA 1453 (0x5AD) for InnoDB backend When error ERROR_WORKING_SET_QUOTA or ERROR_NO_SYSTEM_RESOURCES occurs, yields for 100ms and retries the operation. Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5992 | vasil | 2009-09-28 17:10:29 +1000 (Mon, 28 Sep 2009) | 4 lines branches/zip: Add ChangeLog entry for c5988. ------------------------------------------------------------------------ r5994 | marko | 2009-09-28 18:33:59 +1000 (Mon, 28 Sep 2009) | 17 lines branches/zip: Try to prevent the reuse of tablespace identifiers after InnoDB has crashed during table creation. Also, refuse to start if files with duplicate tablespace identifiers are encountered. fil_node_create(): Update fil_system->max_assigned_id. This should prevent the reuse of a space->id when InnoDB does a full crash recovery and invokes fil_load_single_table_tablespaces(). Normally, fil_system->max_assigned_id is initialized from SELECT MAX(ID) FROM SYS_TABLES. fil_open_single_table_tablespace(): Return FALSE when fil_space_create() fails. fil_load_single_table_tablespace(): Exit if fil_space_create() fails and innodb_force_recovery=0. rb://173 approved by Heikki Tuuri. This addresses Issue #335. ------------------------------------------------------------------------ r5995 | marko | 2009-09-28 18:52:25 +1000 (Mon, 28 Sep 2009) | 17 lines branches/zip: Do not write to PAGE_INDEX_ID after page creation, not even when restoring an uncompressed page after a compression failure. btr_page_reorganize_low(): On compression failure, do not restore those page header fields that should not be affected by the reorganization. Instead, compare the fields. page_zip_decompress(): Add the parameter ibool all, for copying all page header fields. Pass the parameter all=TRUE on block read completion, redo log application, and page_zip_validate(); pass all=FALSE in all other cases. page_zip_reorganize(): Do not restore the uncompressed page on failure. It will be restored (to pre-modification state) by the caller anyway. rb://167, Issue #346 ------------------------------------------------------------------------ r5996 | marko | 2009-09-28 22:46:02 +1000 (Mon, 28 Sep 2009) | 4 lines branches/zip: Address Issue #350 in comments. lock_rec_queue_validate(), lock_rec_queue_validate(): Note that this debug code may violate the latching order and cause deadlocks. ------------------------------------------------------------------------ r5997 | marko | 2009-09-28 23:03:58 +1000 (Mon, 28 Sep 2009) | 12 lines branches/zip: Remove an assertion failure when the InnoDB data dictionary is inconsistent with the MySQL .frm file. ha_innobase::index_read(): When the index cannot be found, return an error. ha_innobase::change_active_index(): When prebuilt->index == NULL, set also prebuilt->index_usable = FALSE. This is not needed for correctness, because prebuilt->index_usable is only checked by row_search_for_mysql(), which requires prebuilt->index != NULL. This addresses Issue #349. Approved by Heikki Tuuri over IM. ------------------------------------------------------------------------ r6005 | vasil | 2009-09-29 18:09:52 +1000 (Tue, 29 Sep 2009) | 4 lines branches/zip: ChangeLog: wrap around 78th column, not earlier. ------------------------------------------------------------------------ r6006 | vasil | 2009-09-29 20:15:25 +1000 (Tue, 29 Sep 2009) | 4 lines branches/zip: Add ChangeLog entry for the release of 1.0.4. ------------------------------------------------------------------------ r6007 | vasil | 2009-09-29 23:19:59 +1000 (Tue, 29 Sep 2009) | 6 lines branches/zip: Fix the year, should be 2009. Pointed by: Calvin ------------------------------------------------------------------------ r6026 | marko | 2009-09-30 17:18:24 +1000 (Wed, 30 Sep 2009) | 1 line branches/zip: Add some debug assertions for checking FSEG_MAGIC_N. ------------------------------------------------------------------------ r6028 | marko | 2009-09-30 23:55:23 +1000 (Wed, 30 Sep 2009) | 3 lines branches/zip: recv_no_log_write: New debug flag for tracking down Mantis Issue #347. No modifications should be made to the database while recv_apply_hashed_log_recs() is about to complete. ------------------------------------------------------------------------ r6029 | calvin | 2009-10-01 06:32:02 +1000 (Thu, 01 Oct 2009) | 4 lines branches/zip: non-functional changes Fix typo. ------------------------------------------------------------------------ r6031 | marko | 2009-10-01 21:24:33 +1000 (Thu, 01 Oct 2009) | 49 lines branches/zip: Clean up after a crash during DROP INDEX. When InnoDB crashes while dropping an index, ensure that the index will be completely dropped during crash recovery. row_merge_drop_index(): Before dropping an index, rename the index to start with TEMP_INDEX_PREFIX_STR and commit the change, so that row_merge_drop_temp_indexes() will drop the index after crash recovery if the server crashes while dropping the index. fseg_inode_try_get(): New function, forked from fseg_inode_get(). Return NULL if the file segment index node is free. fseg_inode_get(): Assert that the file segment index node is not free. fseg_free_step(): If the file segment index node is already free, print a diagnostic message and return TRUE. fsp_free_seg_inode(): Write a nonzero number to FSEG_MAGIC_N, so that allocated-and-freed file segment index nodes can be better distinguished from uninitialized ones. This is rb://174, addressing Issue #348. Tested by restarting mysqld upon the completion of the added log_write_up_to() invocation below, during DROP INDEX. The index was dropped after crash recovery, and re-issuing the DROP INDEX did not crash the server. Index: btr/btr0btr.c =================================================================== --- btr/btr0btr.c (revision 6026) +++ btr/btr0btr.c (working copy) @@ -42,6 +42,7 @@ Created 6/2/1994 Heikki Tuuri #include "ibuf0ibuf.h" #include "trx0trx.h" +#include "log0log.h" /* Latching strategy of the InnoDB B-tree -------------------------------------- @@ -873,6 +874,8 @@ leaf_loop: goto leaf_loop; } + + log_write_up_to(mtr.end_lsn, LOG_WAIT_ALL_GROUPS, TRUE); top_loop: mtr_start(&mtr); ------------------------------------------------------------------------ r6033 | calvin | 2009-10-02 06:19:46 +1000 (Fri, 02 Oct 2009) | 4 lines branches/zip: fix a typo in error message Reported as bug#47763. ------------------------------------------------------------------------ r6043 | inaam | 2009-10-06 01:45:35 +1100 (Tue, 06 Oct 2009) | 12 lines branches/zip rb://176 Do not invalidate buffer pool while an LRU batch is active. Added code to buf_pool_invalidate() to wait for the running batches to finish. This patch also resets the state of buf_pool struct at invalidation. This addresses the concern where buf_pool->freed_page_clock becomes non-zero because we read in a system tablespace page for file format info at startup. Approved by: Marko ------------------------------------------------------------------------ r6044 | pekka | 2009-10-07 01:44:54 +1100 (Wed, 07 Oct 2009) | 5 lines branches/zip: Add os_file_is_same() function for Hot Backup (inside ifdef UNIV_HOTBACKUP). This is part of the fix for Issue #186. Note! The Windows implementation is incomplete. ------------------------------------------------------------------------ r6046 | pekka | 2009-10-08 20:24:56 +1100 (Thu, 08 Oct 2009) | 3 lines branches/zip: Revert r6044 which added os_file_is_same() function (issue#186). This functionality is moved to Hot Backup source tree. ------------------------------------------------------------------------ r6048 | vasil | 2009-10-09 16:42:55 +1100 (Fri, 09 Oct 2009) | 16 lines branches/zip: When scanning a directory readdir() is called and stat() after it, if a file is deleted between the two calls stat will fail and the whole precedure will fail. Change this behavior to continue with the next entry if stat() fails because of nonexistent file. This is transparent change as it will make it look as if the file was deleted before the readdir() call. This change is needed in order to fix https://svn.innodb.com/mantis/view.php?id=174 in which we need to abort if os_file_readdir_next_file() encounters "real" errors. Approved by: Marko, Pekka (rb://177) ------------------------------------------------------------------------ r6049 | vasil | 2009-10-10 03:05:26 +1100 (Sat, 10 Oct 2009) | 7 lines branches/zip: Fix compilation warning in Hot Backup: innodb/fil/fil0fil.c: In function 'fil_load_single_table_tablespace': innodb/fil/fil0fil.c:3253: warning: format '%lld' expects type 'long long int', but argument 6 has type 'ib_int64_t' ------------------------------------------------------------------------ r6064 | calvin | 2009-10-14 02:23:35 +1100 (Wed, 14 Oct 2009) | 4 lines branches/zip: non-functional changes Changes from MySQL to fix build issue. ------------------------------------------------------------------------ r6065 | inaam | 2009-10-14 04:43:13 +1100 (Wed, 14 Oct 2009) | 7 lines branches/zip rb://182 Call fsync() on datafiles after a batch of pages is written to disk even when skip_innodb_doublewrite is set. Approved by: Heikki ------------------------------------------------------------------------ r6080 | sunny | 2009-10-15 09:29:01 +1100 (Thu, 15 Oct 2009) | 3 lines branches/zip: Change page_mem_alloc_free() to inline. Fix Bug #47058 - Failure to compile innodb_plugin on solaris 10u7 + spro cc/CC 5.10 ------------------------------------------------------------------------ r6084 | vasil | 2009-10-15 16:21:17 +1100 (Thu, 15 Oct 2009) | 4 lines branches/zip: Add ChangeLog entry for r6080. ------------------------------------------------------------------------ r6095 | vasil | 2009-10-20 00:04:59 +1100 (Tue, 20 Oct 2009) | 7 lines branches/zip: Fix Bug#47808 innodb_information_schema.test fails when run under valgrind by using the wait_until_rows_count macro that loops until the number of rows becomes 14 instead of sleep 0.1, which is obviously very fragile. ------------------------------------------------------------------------ r6096 | vasil | 2009-10-20 00:06:09 +1100 (Tue, 20 Oct 2009) | 4 lines branches/zip: Add ChangeLog entry for r6095. ------------------------------------------------------------------------ r6099 | jyang | 2009-10-22 13:58:39 +1100 (Thu, 22 Oct 2009) | 7 lines branches/zip: Port bug #46000 related changes from 5.1 to zip branch. Due to different code path for creating index in zip branch comparing to 5.1), the index reserved name check function is extended to be used in ha_innobase::add_index(). rb://190 Approved by: Marko ------------------------------------------------------------------------ r6100 | jyang | 2009-10-22 14:51:07 +1100 (Thu, 22 Oct 2009) | 6 lines branches/zip: As a request from mysql, WARN_LEVEL_ERROR cannot be used for push_warning_* call any more. Switch to WARN_LEVEL_WARN. Bug #47233. rb://172 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6101 | jyang | 2009-10-23 19:45:50 +1100 (Fri, 23 Oct 2009) | 7 lines branches/zip: Update test result with the WARN_LEVEL_ERROR to WARN_LEVEL_WARN change. This is the same result as submitted in rb://172 review, which approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6102 | marko | 2009-10-26 18:32:23 +1100 (Mon, 26 Oct 2009) | 1 line branches/zip: row_prebuilt_struct::prebuilts: Unused field, remove. ------------------------------------------------------------------------ r6103 | marko | 2009-10-27 00:46:18 +1100 (Tue, 27 Oct 2009) | 4 lines branches/zip: row_ins_alloc_sys_fields(): Zero out the system columns DB_TRX_ID, DB_ROLL_PTR and DB_ROW_ID, in order to avoid harmless Valgrind warnings about uninitialized data. (The warnings were harmless, because the fields would be initialized at a later stage.) ------------------------------------------------------------------------ r6105 | calvin | 2009-10-28 09:05:52 +1100 (Wed, 28 Oct 2009) | 6 lines branches/zip: backport r3848 from 6.0 branch ---- branches/6.0: innobase_start_or_create_for_mysql(): Make the 10 MB minimum tablespace limit independent of UNIV_PAGE_SIZE. (Bug #41490) ------------------------------------------------------------------------ r6107 | marko | 2009-10-29 01:10:34 +1100 (Thu, 29 Oct 2009) | 5 lines branches/zip: buf_page_set_old(): Improve UNIV_LRU_DEBUG diagnostics in order to catch the buf_pool->LRU_old corruption reported in Issue #381. buf_LRU_old_init(): Set the property from the tail towards the front of the buf_pool->LRU list, in order not to trip the debug check. ------------------------------------------------------------------------ r6108 | calvin | 2009-10-29 16:58:04 +1100 (Thu, 29 Oct 2009) | 5 lines branches/zip: close file handle when building with UNIV_HOTBACKUP The change does not affect regular InnoDB engine. Confirmed by Marko. ------------------------------------------------------------------------ r6109 | jyang | 2009-10-29 19:37:32 +1100 (Thu, 29 Oct 2009) | 7 lines branches/zip: In os_mem_alloc_large(), if we fail to attach the shared memory, reset memory pointer ptr to NULL, and allocate memory from conventional pool. Bug #48237 Error handling in os_mem_alloc_large appears to be incorrect rb://198 Approved by: Marko ------------------------------------------------------------------------ r6110 | marko | 2009-10-29 21:44:57 +1100 (Thu, 29 Oct 2009) | 2 lines branches/zip: Makefile.am (INCLUDES): Merge a change from MySQL: Use $(srcdir)/include instead of $(top_srcdir)/storage/innobase/include. ------------------------------------------------------------------------ r6111 | marko | 2009-10-29 22:04:11 +1100 (Thu, 29 Oct 2009) | 33 lines branches/zip: Fix corruption of buf_pool->LRU_old and improve debug assertions. This was reported as Issue #381. buf_page_set_old(): Assert that blocks may only be set old if buf_pool->LRU_old is initialized and buf_pool->LRU_old_len is nonzero. Assert that buf_pool->LRU_old points to the block at the old/new boundary. buf_LRU_old_adjust_len(): Invoke buf_page_set_old() after adjusting buf_pool->LRU_old and buf_pool->LRU_old_len, in order not to violate the added assertions. buf_LRU_old_init(): Replace buf_page_set_old() with a direct assignment to bpage->old, because these loops that initialize all the blocks would temporarily violate the assertions about buf_pool->LRU_old. buf_LRU_remove_block(): When setting buf_pool->LRU_old = NULL, also clear all bpage->old flags and set buf_pool->LRU_old_len = 0. buf_LRU_add_block_to_end_low(), buf_LRU_add_block_low(): Move the buf_page_set_old() call later in order not to violate the debug assertions. If buf_pool->LRU_old is NULL, set old=FALSE. buf_LRU_free_block(): Replace the UNIV_LRU_DEBUG assertion with a dummy buf_page_set_old() call that performs more thorough checks. buf_LRU_validate(): Do not tolerate garbage in buf_pool->LRU_old_len even if buf_pool->LRU_old is NULL. Check that bpage->old is monotonic. buf_relocate(): Make the UNIV_LRU_DEBUG checks stricter. buf0buf.h: Revise the documentation of buf_page_t::old and buf_pool_t::LRU_old_len. ------------------------------------------------------------------------ r6112 | calvin | 2009-10-30 01:21:15 +1100 (Fri, 30 Oct 2009) | 4 lines branches/zip: consideration for icc compilers Proposed by MySQL, and approved by Marko. ------------------------------------------------------------------------ r6113 | vasil | 2009-10-30 03:15:50 +1100 (Fri, 30 Oct 2009) | 93 lines branches/zip: Merge r5912:6112 from branches/5.1: (after this merge the innodb-autoinc test starts to fail, but I commit anyway because it would be easier to investigate the failure this way) ------------------------------------------------------------------------ r5952 | calvin | 2009-09-22 19:45:07 +0300 (Tue, 22 Sep 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: fix bug#42383: Can't create table 'test.bug39438' For embedded server, MySQL may pass in full path, which is currently disallowed. It is needed to relax the condition by accepting full paths in the embedded case. Approved by: Heikki (on IM) ------------------------------------------------------------------------ r6032 | vasil | 2009-10-01 15:55:49 +0300 (Thu, 01 Oct 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix Bug#38996 Race condition in ANALYZE TABLE by serializing ANALYZE TABLE inside InnoDB. Approved by: Heikki (rb://175) ------------------------------------------------------------------------ r6045 | jyang | 2009-10-08 02:27:08 +0300 (Thu, 08 Oct 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug47777.result A /branches/5.1/mysql-test/innodb_bug47777.test branches/5.1: Fix bug #47777. Treat the Geometry data same as Binary BLOB in ha_innobase::store_key_val_for_row(), since the Geometry data is stored as Binary BLOB in Innodb. Review: rb://180 approved by Marko Makela. ------------------------------------------------------------------------ r6051 | sunny | 2009-10-12 07:05:00 +0300 (Mon, 12 Oct 2009) | 6 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Ignore negative values supplied by the user when calculating the next value to store in dict_table_t. Setting autoincrement columns top negative values is undefined behavior and this change should bring the behavior of InnoDB closer to what users expect. Added several tests to check. rb://162 ------------------------------------------------------------------------ r6052 | sunny | 2009-10-12 07:09:56 +0300 (Mon, 12 Oct 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Reset the statement level autoinc counter on ROLLBACK. Fix the test results too. rb://164 ------------------------------------------------------------------------ r6053 | sunny | 2009-10-12 07:37:49 +0300 (Mon, 12 Oct 2009) | 6 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Copy the maximum AUTOINC value from the old table to the new table when MySQL does a CREATE INDEX ON T. This is required because MySQL does a table copy, rename and drops the old table. Fix Bug#47125: auto_increment start value is ignored if an index is created and engine=innodb rb://168 ------------------------------------------------------------------------ r6076 | vasil | 2009-10-14 19:30:12 +0300 (Wed, 14 Oct 2009) | 4 lines Changed paths: M /branches/5.1/row/row0mysql.c branches/5.1: Fix typo. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6114 | vasil | 2009-10-30 03:43:51 +1100 (Fri, 30 Oct 2009) | 6 lines branches/zip: * Add ChangeLog entries for latest changes * Obey alphabetical order in the list of the files * White-space fixup ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 10:42:11 +1100 (Fri, 30 Oct 2009) | 7 lines branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ r6124 | jyang | 2009-10-30 19:02:31 +1100 (Fri, 30 Oct 2009) | 5 lines branches/zip: Correct the bug number for -r6109 change from # 48273 to #48237 ------------------------------------------------------------------------ r6126 | vasil | 2009-10-30 19:36:07 +1100 (Fri, 30 Oct 2009) | 45 lines branches/zip: Merge r6112:6125 from branches/5.1: (skipping r6122 and r6123, Jimmy says these are already present and need not be merged): ------------------------------------------------------------------------ r6122 | jyang | 2009-10-30 05:18:38 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug44369.result M /branches/5.1/mysql-test/innodb_bug44369.test M /branches/5.1/mysql-test/innodb_bug46000.result M /branches/5.1/mysql-test/innodb_bug46000.test branches/5.1: Chnage WARN_LEVEL_ERROR to WARN_LEVEL_WARN for push_warning_printf() call in innodb. Fix Bug#47233: Innodb calls push_warning(MYSQL_ERROR::WARN_LEVEL_ERROR) rb://170 approved by Marko. ------------------------------------------------------------------------ r6123 | jyang | 2009-10-30 05:43:06 +0200 (Fri, 30 Oct 2009) | 8 lines Changed paths: M /branches/5.1/os/os0proc.c branches/5.1: In os_mem_alloc_large(), if we fail to attach the shared memory, reset memory pointer ptr to NULL, and allocate memory from conventional pool. This is a port from branches/zip. Bug #48237 Error handling in os_mem_alloc_large appears to be incorrect rb://198 Approved by: Marko ------------------------------------------------------------------------ r6125 | vasil | 2009-10-30 10:31:23 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White-space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 20:42:56 +1100 (Mon, 02 Nov 2009) | 9 lines branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions r5971:6130 from branches/zip. ------------------------------------------------------------------------ r5971 | marko | 2009-09-23 23:03:51 +1000 (Wed, 23 Sep 2009) | 2 lines branches/zip: os_file_pwrite(): Make the code compile in InnoDB Hot Backup when the pwrite system call is not available. ------------------------------------------------------------------------ r5972 | marko | 2009-09-24 05:44:52 +1000 (Thu, 24 Sep 2009) | 5 lines branches/zip: fil_node_open_file(): In InnoDB Hot Backup, determine the page size of single-file tablespaces before computing the file node size. Otherwise, the space->size of compressed tablespaces would be computed with UNIV_PAGE_SIZE instead of key_block_size. This should fix Issue #313. ------------------------------------------------------------------------ r5973 | marko | 2009-09-24 05:53:21 +1000 (Thu, 24 Sep 2009) | 2 lines branches/zip: recv_add_to_hash_table(): Simplify obfuscated pointer arithmetics. ------------------------------------------------------------------------ r5978 | marko | 2009-09-24 17:47:56 +1000 (Thu, 24 Sep 2009) | 1 line branches/zip: Fix warnings and errors when UNIV_HOTBACKUP is defined. ------------------------------------------------------------------------ r5979 | marko | 2009-09-24 20:16:10 +1000 (Thu, 24 Sep 2009) | 4 lines branches/zip: ha_innodb.cc: Define MYSQL_PLUGIN_IMPORT when necessary. This preprocessor symbol has been recently introduced in MySQL 5.1. The InnoDB Plugin should remain source compatible with MySQL 5.1.24 and later. ------------------------------------------------------------------------ r5988 | calvin | 2009-09-26 05:14:43 +1000 (Sat, 26 Sep 2009) | 8 lines branches/zip: fix bug#47055 unconditional exit(1) on ERROR_WORKING_SET_QUOTA 1453 (0x5AD) for InnoDB backend When error ERROR_WORKING_SET_QUOTA or ERROR_NO_SYSTEM_RESOURCES occurs, yields for 100ms and retries the operation. Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5992 | vasil | 2009-09-28 17:10:29 +1000 (Mon, 28 Sep 2009) | 4 lines branches/zip: Add ChangeLog entry for c5988. ------------------------------------------------------------------------ r5994 | marko | 2009-09-28 18:33:59 +1000 (Mon, 28 Sep 2009) | 17 lines branches/zip: Try to prevent the reuse of tablespace identifiers after InnoDB has crashed during table creation. Also, refuse to start if files with duplicate tablespace identifiers are encountered. fil_node_create(): Update fil_system->max_assigned_id. This should prevent the reuse of a space->id when InnoDB does a full crash recovery and invokes fil_load_single_table_tablespaces(). Normally, fil_system->max_assigned_id is initialized from SELECT MAX(ID) FROM SYS_TABLES. fil_open_single_table_tablespace(): Return FALSE when fil_space_create() fails. fil_load_single_table_tablespace(): Exit if fil_space_create() fails and innodb_force_recovery=0. rb://173 approved by Heikki Tuuri. This addresses Issue #335. ------------------------------------------------------------------------ r5995 | marko | 2009-09-28 18:52:25 +1000 (Mon, 28 Sep 2009) | 17 lines branches/zip: Do not write to PAGE_INDEX_ID after page creation, not even when restoring an uncompressed page after a compression failure. btr_page_reorganize_low(): On compression failure, do not restore those page header fields that should not be affected by the reorganization. Instead, compare the fields. page_zip_decompress(): Add the parameter ibool all, for copying all page header fields. Pass the parameter all=TRUE on block read completion, redo log application, and page_zip_validate(); pass all=FALSE in all other cases. page_zip_reorganize(): Do not restore the uncompressed page on failure. It will be restored (to pre-modification state) by the caller anyway. rb://167, Issue #346 ------------------------------------------------------------------------ r5996 | marko | 2009-09-28 22:46:02 +1000 (Mon, 28 Sep 2009) | 4 lines branches/zip: Address Issue #350 in comments. lock_rec_queue_validate(), lock_rec_queue_validate(): Note that this debug code may violate the latching order and cause deadlocks. ------------------------------------------------------------------------ r5997 | marko | 2009-09-28 23:03:58 +1000 (Mon, 28 Sep 2009) | 12 lines branches/zip: Remove an assertion failure when the InnoDB data dictionary is inconsistent with the MySQL .frm file. ha_innobase::index_read(): When the index cannot be found, return an error. ha_innobase::change_active_index(): When prebuilt->index == NULL, set also prebuilt->index_usable = FALSE. This is not needed for correctness, because prebuilt->index_usable is only checked by row_search_for_mysql(), which requires prebuilt->index != NULL. This addresses Issue #349. Approved by Heikki Tuuri over IM. ------------------------------------------------------------------------ r6005 | vasil | 2009-09-29 18:09:52 +1000 (Tue, 29 Sep 2009) | 4 lines branches/zip: ChangeLog: wrap around 78th column, not earlier. ------------------------------------------------------------------------ r6006 | vasil | 2009-09-29 20:15:25 +1000 (Tue, 29 Sep 2009) | 4 lines branches/zip: Add ChangeLog entry for the release of 1.0.4. ------------------------------------------------------------------------ r6007 | vasil | 2009-09-29 23:19:59 +1000 (Tue, 29 Sep 2009) | 6 lines branches/zip: Fix the year, should be 2009. Pointed by: Calvin ------------------------------------------------------------------------ r6026 | marko | 2009-09-30 17:18:24 +1000 (Wed, 30 Sep 2009) | 1 line branches/zip: Add some debug assertions for checking FSEG_MAGIC_N. ------------------------------------------------------------------------ r6028 | marko | 2009-09-30 23:55:23 +1000 (Wed, 30 Sep 2009) | 3 lines branches/zip: recv_no_log_write: New debug flag for tracking down Mantis Issue #347. No modifications should be made to the database while recv_apply_hashed_log_recs() is about to complete. ------------------------------------------------------------------------ r6029 | calvin | 2009-10-01 06:32:02 +1000 (Thu, 01 Oct 2009) | 4 lines branches/zip: non-functional changes Fix typo. ------------------------------------------------------------------------ r6031 | marko | 2009-10-01 21:24:33 +1000 (Thu, 01 Oct 2009) | 49 lines branches/zip: Clean up after a crash during DROP INDEX. When InnoDB crashes while dropping an index, ensure that the index will be completely dropped during crash recovery. row_merge_drop_index(): Before dropping an index, rename the index to start with TEMP_INDEX_PREFIX_STR and commit the change, so that row_merge_drop_temp_indexes() will drop the index after crash recovery if the server crashes while dropping the index. fseg_inode_try_get(): New function, forked from fseg_inode_get(). Return NULL if the file segment index node is free. fseg_inode_get(): Assert that the file segment index node is not free. fseg_free_step(): If the file segment index node is already free, print a diagnostic message and return TRUE. fsp_free_seg_inode(): Write a nonzero number to FSEG_MAGIC_N, so that allocated-and-freed file segment index nodes can be better distinguished from uninitialized ones. This is rb://174, addressing Issue #348. Tested by restarting mysqld upon the completion of the added log_write_up_to() invocation below, during DROP INDEX. The index was dropped after crash recovery, and re-issuing the DROP INDEX did not crash the server. Index: btr/btr0btr.c =================================================================== --- btr/btr0btr.c (revision 6026) +++ btr/btr0btr.c (working copy) @@ -42,6 +42,7 @@ Created 6/2/1994 Heikki Tuuri #include "ibuf0ibuf.h" #include "trx0trx.h" +#include "log0log.h" /* Latching strategy of the InnoDB B-tree -------------------------------------- @@ -873,6 +874,8 @@ leaf_loop: goto leaf_loop; } + + log_write_up_to(mtr.end_lsn, LOG_WAIT_ALL_GROUPS, TRUE); top_loop: mtr_start(&mtr); ------------------------------------------------------------------------ r6033 | calvin | 2009-10-02 06:19:46 +1000 (Fri, 02 Oct 2009) | 4 lines branches/zip: fix a typo in error message Reported as bug#47763. ------------------------------------------------------------------------ r6043 | inaam | 2009-10-06 01:45:35 +1100 (Tue, 06 Oct 2009) | 12 lines branches/zip rb://176 Do not invalidate buffer pool while an LRU batch is active. Added code to buf_pool_invalidate() to wait for the running batches to finish. This patch also resets the state of buf_pool struct at invalidation. This addresses the concern where buf_pool->freed_page_clock becomes non-zero because we read in a system tablespace page for file format info at startup. Approved by: Marko ------------------------------------------------------------------------ r6044 | pekka | 2009-10-07 01:44:54 +1100 (Wed, 07 Oct 2009) | 5 lines branches/zip: Add os_file_is_same() function for Hot Backup (inside ifdef UNIV_HOTBACKUP). This is part of the fix for Issue #186. Note! The Windows implementation is incomplete. ------------------------------------------------------------------------ r6046 | pekka | 2009-10-08 20:24:56 +1100 (Thu, 08 Oct 2009) | 3 lines branches/zip: Revert r6044 which added os_file_is_same() function (issue#186). This functionality is moved to Hot Backup source tree. ------------------------------------------------------------------------ r6048 | vasil | 2009-10-09 16:42:55 +1100 (Fri, 09 Oct 2009) | 16 lines branches/zip: When scanning a directory readdir() is called and stat() after it, if a file is deleted between the two calls stat will fail and the whole precedure will fail. Change this behavior to continue with the next entry if stat() fails because of nonexistent file. This is transparent change as it will make it look as if the file was deleted before the readdir() call. This change is needed in order to fix https://svn.innodb.com/mantis/view.php?id=174 in which we need to abort if os_file_readdir_next_file() encounters "real" errors. Approved by: Marko, Pekka (rb://177) ------------------------------------------------------------------------ r6049 | vasil | 2009-10-10 03:05:26 +1100 (Sat, 10 Oct 2009) | 7 lines branches/zip: Fix compilation warning in Hot Backup: innodb/fil/fil0fil.c: In function 'fil_load_single_table_tablespace': innodb/fil/fil0fil.c:3253: warning: format '%lld' expects type 'long long int', but argument 6 has type 'ib_int64_t' ------------------------------------------------------------------------ r6064 | calvin | 2009-10-14 02:23:35 +1100 (Wed, 14 Oct 2009) | 4 lines branches/zip: non-functional changes Changes from MySQL to fix build issue. ------------------------------------------------------------------------ r6065 | inaam | 2009-10-14 04:43:13 +1100 (Wed, 14 Oct 2009) | 7 lines branches/zip rb://182 Call fsync() on datafiles after a batch of pages is written to disk even when skip_innodb_doublewrite is set. Approved by: Heikki ------------------------------------------------------------------------ r6080 | sunny | 2009-10-15 09:29:01 +1100 (Thu, 15 Oct 2009) | 3 lines branches/zip: Change page_mem_alloc_free() to inline. Fix Bug #47058 - Failure to compile innodb_plugin on solaris 10u7 + spro cc/CC 5.10 ------------------------------------------------------------------------ r6084 | vasil | 2009-10-15 16:21:17 +1100 (Thu, 15 Oct 2009) | 4 lines branches/zip: Add ChangeLog entry for r6080. ------------------------------------------------------------------------ r6095 | vasil | 2009-10-20 00:04:59 +1100 (Tue, 20 Oct 2009) | 7 lines branches/zip: Fix Bug#47808 innodb_information_schema.test fails when run under valgrind by using the wait_until_rows_count macro that loops until the number of rows becomes 14 instead of sleep 0.1, which is obviously very fragile. ------------------------------------------------------------------------ r6096 | vasil | 2009-10-20 00:06:09 +1100 (Tue, 20 Oct 2009) | 4 lines branches/zip: Add ChangeLog entry for r6095. ------------------------------------------------------------------------ r6099 | jyang | 2009-10-22 13:58:39 +1100 (Thu, 22 Oct 2009) | 7 lines branches/zip: Port bug #46000 related changes from 5.1 to zip branch. Due to different code path for creating index in zip branch comparing to 5.1), the index reserved name check function is extended to be used in ha_innobase::add_index(). rb://190 Approved by: Marko ------------------------------------------------------------------------ r6100 | jyang | 2009-10-22 14:51:07 +1100 (Thu, 22 Oct 2009) | 6 lines branches/zip: As a request from mysql, WARN_LEVEL_ERROR cannot be used for push_warning_* call any more. Switch to WARN_LEVEL_WARN. Bug #47233. rb://172 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6101 | jyang | 2009-10-23 19:45:50 +1100 (Fri, 23 Oct 2009) | 7 lines branches/zip: Update test result with the WARN_LEVEL_ERROR to WARN_LEVEL_WARN change. This is the same result as submitted in rb://172 review, which approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6102 | marko | 2009-10-26 18:32:23 +1100 (Mon, 26 Oct 2009) | 1 line branches/zip: row_prebuilt_struct::prebuilts: Unused field, remove. ------------------------------------------------------------------------ r6103 | marko | 2009-10-27 00:46:18 +1100 (Tue, 27 Oct 2009) | 4 lines branches/zip: row_ins_alloc_sys_fields(): Zero out the system columns DB_TRX_ID, DB_ROLL_PTR and DB_ROW_ID, in order to avoid harmless Valgrind warnings about uninitialized data. (The warnings were harmless, because the fields would be initialized at a later stage.) ------------------------------------------------------------------------ r6105 | calvin | 2009-10-28 09:05:52 +1100 (Wed, 28 Oct 2009) | 6 lines branches/zip: backport r3848 from 6.0 branch ---- branches/6.0: innobase_start_or_create_for_mysql(): Make the 10 MB minimum tablespace limit independent of UNIV_PAGE_SIZE. (Bug #41490) ------------------------------------------------------------------------ r6107 | marko | 2009-10-29 01:10:34 +1100 (Thu, 29 Oct 2009) | 5 lines branches/zip: buf_page_set_old(): Improve UNIV_LRU_DEBUG diagnostics in order to catch the buf_pool->LRU_old corruption reported in Issue #381. buf_LRU_old_init(): Set the property from the tail towards the front of the buf_pool->LRU list, in order not to trip the debug check. ------------------------------------------------------------------------ r6108 | calvin | 2009-10-29 16:58:04 +1100 (Thu, 29 Oct 2009) | 5 lines branches/zip: close file handle when building with UNIV_HOTBACKUP The change does not affect regular InnoDB engine. Confirmed by Marko. ------------------------------------------------------------------------ r6109 | jyang | 2009-10-29 19:37:32 +1100 (Thu, 29 Oct 2009) | 7 lines branches/zip: In os_mem_alloc_large(), if we fail to attach the shared memory, reset memory pointer ptr to NULL, and allocate memory from conventional pool. Bug #48237 Error handling in os_mem_alloc_large appears to be incorrect rb://198 Approved by: Marko ------------------------------------------------------------------------ r6110 | marko | 2009-10-29 21:44:57 +1100 (Thu, 29 Oct 2009) | 2 lines branches/zip: Makefile.am (INCLUDES): Merge a change from MySQL: Use $(srcdir)/include instead of $(top_srcdir)/storage/innobase/include. ------------------------------------------------------------------------ r6111 | marko | 2009-10-29 22:04:11 +1100 (Thu, 29 Oct 2009) | 33 lines branches/zip: Fix corruption of buf_pool->LRU_old and improve debug assertions. This was reported as Issue #381. buf_page_set_old(): Assert that blocks may only be set old if buf_pool->LRU_old is initialized and buf_pool->LRU_old_len is nonzero. Assert that buf_pool->LRU_old points to the block at the old/new boundary. buf_LRU_old_adjust_len(): Invoke buf_page_set_old() after adjusting buf_pool->LRU_old and buf_pool->LRU_old_len, in order not to violate the added assertions. buf_LRU_old_init(): Replace buf_page_set_old() with a direct assignment to bpage->old, because these loops that initialize all the blocks would temporarily violate the assertions about buf_pool->LRU_old. buf_LRU_remove_block(): When setting buf_pool->LRU_old = NULL, also clear all bpage->old flags and set buf_pool->LRU_old_len = 0. buf_LRU_add_block_to_end_low(), buf_LRU_add_block_low(): Move the buf_page_set_old() call later in order not to violate the debug assertions. If buf_pool->LRU_old is NULL, set old=FALSE. buf_LRU_free_block(): Replace the UNIV_LRU_DEBUG assertion with a dummy buf_page_set_old() call that performs more thorough checks. buf_LRU_validate(): Do not tolerate garbage in buf_pool->LRU_old_len even if buf_pool->LRU_old is NULL. Check that bpage->old is monotonic. buf_relocate(): Make the UNIV_LRU_DEBUG checks stricter. buf0buf.h: Revise the documentation of buf_page_t::old and buf_pool_t::LRU_old_len. ------------------------------------------------------------------------ r6112 | calvin | 2009-10-30 01:21:15 +1100 (Fri, 30 Oct 2009) | 4 lines branches/zip: consideration for icc compilers Proposed by MySQL, and approved by Marko. ------------------------------------------------------------------------ r6113 | vasil | 2009-10-30 03:15:50 +1100 (Fri, 30 Oct 2009) | 93 lines branches/zip: Merge r5912:6112 from branches/5.1: (after this merge the innodb-autoinc test starts to fail, but I commit anyway because it would be easier to investigate the failure this way) ------------------------------------------------------------------------ r5952 | calvin | 2009-09-22 19:45:07 +0300 (Tue, 22 Sep 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: fix bug#42383: Can't create table 'test.bug39438' For embedded server, MySQL may pass in full path, which is currently disallowed. It is needed to relax the condition by accepting full paths in the embedded case. Approved by: Heikki (on IM) ------------------------------------------------------------------------ r6032 | vasil | 2009-10-01 15:55:49 +0300 (Thu, 01 Oct 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix Bug#38996 Race condition in ANALYZE TABLE by serializing ANALYZE TABLE inside InnoDB. Approved by: Heikki (rb://175) ------------------------------------------------------------------------ r6045 | jyang | 2009-10-08 02:27:08 +0300 (Thu, 08 Oct 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug47777.result A /branches/5.1/mysql-test/innodb_bug47777.test branches/5.1: Fix bug #47777. Treat the Geometry data same as Binary BLOB in ha_innobase::store_key_val_for_row(), since the Geometry data is stored as Binary BLOB in Innodb. Review: rb://180 approved by Marko Makela. ------------------------------------------------------------------------ r6051 | sunny | 2009-10-12 07:05:00 +0300 (Mon, 12 Oct 2009) | 6 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Ignore negative values supplied by the user when calculating the next value to store in dict_table_t. Setting autoincrement columns top negative values is undefined behavior and this change should bring the behavior of InnoDB closer to what users expect. Added several tests to check. rb://162 ------------------------------------------------------------------------ r6052 | sunny | 2009-10-12 07:09:56 +0300 (Mon, 12 Oct 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Reset the statement level autoinc counter on ROLLBACK. Fix the test results too. rb://164 ------------------------------------------------------------------------ r6053 | sunny | 2009-10-12 07:37:49 +0300 (Mon, 12 Oct 2009) | 6 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Copy the maximum AUTOINC value from the old table to the new table when MySQL does a CREATE INDEX ON T. This is required because MySQL does a table copy, rename and drops the old table. Fix Bug#47125: auto_increment start value is ignored if an index is created and engine=innodb rb://168 ------------------------------------------------------------------------ r6076 | vasil | 2009-10-14 19:30:12 +0300 (Wed, 14 Oct 2009) | 4 lines Changed paths: M /branches/5.1/row/row0mysql.c branches/5.1: Fix typo. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6114 | vasil | 2009-10-30 03:43:51 +1100 (Fri, 30 Oct 2009) | 6 lines branches/zip: * Add ChangeLog entries for latest changes * Obey alphabetical order in the list of the files * White-space fixup ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 10:42:11 +1100 (Fri, 30 Oct 2009) | 7 lines branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ r6124 | jyang | 2009-10-30 19:02:31 +1100 (Fri, 30 Oct 2009) | 5 lines branches/zip: Correct the bug number for -r6109 change from # 48273 to #48237 ------------------------------------------------------------------------ r6126 | vasil | 2009-10-30 19:36:07 +1100 (Fri, 30 Oct 2009) | 45 lines branches/zip: Merge r6112:6125 from branches/5.1: (skipping r6122 and r6123, Jimmy says these are already present and need not be merged): ------------------------------------------------------------------------ r6122 | jyang | 2009-10-30 05:18:38 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug44369.result M /branches/5.1/mysql-test/innodb_bug44369.test M /branches/5.1/mysql-test/innodb_bug46000.result M /branches/5.1/mysql-test/innodb_bug46000.test branches/5.1: Chnage WARN_LEVEL_ERROR to WARN_LEVEL_WARN for push_warning_printf() call in innodb. Fix Bug#47233: Innodb calls push_warning(MYSQL_ERROR::WARN_LEVEL_ERROR) rb://170 approved by Marko. ------------------------------------------------------------------------ r6123 | jyang | 2009-10-30 05:43:06 +0200 (Fri, 30 Oct 2009) | 8 lines Changed paths: M /branches/5.1/os/os0proc.c branches/5.1: In os_mem_alloc_large(), if we fail to attach the shared memory, reset memory pointer ptr to NULL, and allocate memory from conventional pool. This is a port from branches/zip. Bug #48237 Error handling in os_mem_alloc_large appears to be incorrect rb://198 Approved by: Marko ------------------------------------------------------------------------ r6125 | vasil | 2009-10-30 10:31:23 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White-space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 20:42:56 +1100 (Mon, 02 Nov 2009) | 9 lines branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------
16 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
branches/innodb+: Merge revisions 5144:5524 from branches/zip ------------------------------------------------------------------------ r5147 | marko | 2009-05-27 06:55:14 -0400 (Wed, 27 May 2009) | 1 line branches/zip: ibuf0ibuf.c: Improve a comment. ------------------------------------------------------------------------ r5149 | marko | 2009-05-27 07:46:42 -0400 (Wed, 27 May 2009) | 34 lines branches/zip: Merge revisions 4994:5148 from branches/5.1: ------------------------------------------------------------------------ r5126 | vasil | 2009-05-26 16:57:12 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Non-functional change: move FSP_* macros from fsp0fsp.h to a new file fsp0types.h. This is needed in order to be able to use FSP_EXTENT_SIZE in mtr0log.ic. ------------------------------------------------------------------------ r5127 | vasil | 2009-05-26 17:05:43 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not include unnecessary headers mtr0log.h and fut0lst.h in trx0sys.h and include fsp0fsp.h just before it is needed. This is needed in order to be able to use TRX_SYS_SPACE in mtr0log.ic. ------------------------------------------------------------------------ r5128 | vasil | 2009-05-26 17:26:37 +0300 (Tue, 26 May 2009) | 7 lines branches/5.1: Fix Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not write redo log for the pages in the doublewrite buffer. Also, do not make a dummy change to the page because this is not needed. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5169 | marko | 2009-05-28 03:21:55 -0400 (Thu, 28 May 2009) | 1 line branches/zip: mtr0mtr.h: Add Doxygen comments for the redo log entry types. ------------------------------------------------------------------------ r5176 | marko | 2009-05-28 07:14:02 -0400 (Thu, 28 May 2009) | 1 line branches/zip: Correct a debug assertion that was added in r5125. ------------------------------------------------------------------------ r5201 | marko | 2009-06-01 06:35:25 -0400 (Mon, 01 Jun 2009) | 2 lines branches/zip: Clean up some comments. Make the rec parameter of mlog_open_and_write_index() const. ------------------------------------------------------------------------ r5234 | marko | 2009-06-03 08:26:41 -0400 (Wed, 03 Jun 2009) | 44 lines branches/zip: Merge revisions 5148:5233 from branches/5.1: ------------------------------------------------------------------------ r5150 | vasil | 2009-05-27 18:56:03 +0300 (Wed, 27 May 2009) | 4 lines branches/5.1: Whitespace fixup. ------------------------------------------------------------------------ r5191 | vasil | 2009-05-30 17:46:05 +0300 (Sat, 30 May 2009) | 19 lines branches/5.1: Merge a change from MySQL (this fixes the failing innodb_mysql test): ------------------------------------------------------------ revno: 1810.3894.10 committer: Sergey Glukhov <Sergey.Glukhov@sun.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-05-19 11:32:21 +0500 message: Bug#39793 Foreign keys not constructed when column has a '#' in a comment or default value Internal InnoDN FK parser does not recognize '\'' as quotation symbol. Suggested fix is to add '\'' symbol check for quotation condition (dict_strip_comments() function). modified: innobase/dict/dict0dict.c mysql-test/r/innodb_mysql.result mysql-test/t/innodb_mysql.test ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5250 | marko | 2009-06-04 02:58:23 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add Doxygen comments to the rest of buf0*. ------------------------------------------------------------------------ r5251 | marko | 2009-06-04 02:59:51 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Replace <= in a function comment. ------------------------------------------------------------------------ r5253 | marko | 2009-06-04 06:37:35 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add missing Doxygen comments for page0zip. ------------------------------------------------------------------------ r5261 | vasil | 2009-06-05 11:13:31 -0400 (Fri, 05 Jun 2009) | 15 lines branches/zip: Fix Mantis Issue#244 fix bug in linear read ahead (no check on access pattern) The changes are: 1) Take into account access pattern when deciding whether or not to do linear read ahead. 2) Expose a knob innodb_read_ahead_factor = [0-64] default (8), dynamic, global to control linear read ahead behvior 3) Disable random read ahead. Keep the code for now. Submitted by: Inaam (rb://122) Approved by: Heikki (rb://122) ------------------------------------------------------------------------ r5262 | vasil | 2009-06-05 12:04:25 -0400 (Fri, 05 Jun 2009) | 22 lines branches/zip: Enable functionality to have multiple background io helper threads. This patch is based on percona contributions. More details about this patch will be written at: https://svn.innodb.com/innobase/MultipleBackgroundThreads The patch essentially does the following: expose following knobs: innodb_read_io_threads = [1 - 64] default 1 innodb_write_io_threads = [1 - 64] default 1 deprecate innodb_file_io_threads (this parameter was relevant only on windows) Internally it allows multiple segments for read and write IO request arrays where one thread works on one segement. Submitted by: Inaam (rb://124) Approved by: Heikki (rb://124) ------------------------------------------------------------------------ r5263 | vasil | 2009-06-05 12:19:37 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Whitespace cleanup. ------------------------------------------------------------------------ r5264 | vasil | 2009-06-05 12:26:58 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5261. ------------------------------------------------------------------------ r5265 | vasil | 2009-06-05 12:34:11 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5262. ------------------------------------------------------------------------ r5268 | inaam | 2009-06-08 12:18:21 -0400 (Mon, 08 Jun 2009) | 7 lines branches/zip Non functional change: Added legal notices acknowledging percona contribution to the multiple IO helper threads patch i.e.: r5262 ------------------------------------------------------------------------ r5283 | inaam | 2009-06-09 13:46:29 -0400 (Tue, 09 Jun 2009) | 9 lines branches/zip rb://130 Enable Group Commit functionality that was broken in 5.0 when distributed transactions were introduced. Reviewed by: Heikki ------------------------------------------------------------------------ r5319 | marko | 2009-06-11 04:40:33 -0400 (Thu, 11 Jun 2009) | 3 lines branches/zip: Declare os_thread_id_t as unsigned long, because ulint is wrong on Win64. Pointed out by Vladislav Vaintroub <wlad@sun.com>. ------------------------------------------------------------------------ r5320 | inaam | 2009-06-11 09:15:41 -0400 (Thu, 11 Jun 2009) | 14 lines branches/zip rb://131 This patch changes the following defaults: max_dirty_pages_pct: default from 90 to 75. max allowed from 100 to 99 additional_mem_pool_size: default from 1 to 8 MB buffer_pool_size: default from 8 to 128 MB log_buffer_size: default from 1 to 8 MB read_io_threads/write_io_threads: default from 1 to 4 The log file sizes are untouched because of upgrade issues Reviewed by: Heikki ------------------------------------------------------------------------ r5330 | marko | 2009-06-16 04:08:59 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_gen(): Reduce mutex holding time by adjusting buf_pool->n_pend_unzip while only holding buf_pool_mutex. ------------------------------------------------------------------------ r5331 | marko | 2009-06-16 05:00:48 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Eliminate a buf_page_get_mutex() call. The function must switch on the block state anyway. ------------------------------------------------------------------------ r5332 | vasil | 2009-06-16 05:03:27 -0400 (Tue, 16 Jun 2009) | 4 lines branches/zip: Add ChangeLog entries for r5283 and r5320. ------------------------------------------------------------------------ r5333 | marko | 2009-06-16 05:27:46 -0400 (Tue, 16 Jun 2009) | 1 line branches/zip: buf_page_io_query(): Remove unused function. ------------------------------------------------------------------------ r5335 | marko | 2009-06-16 09:23:10 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: innodb.test: Adjust the tolerance of innodb_buffer_pool_pages_total for r5320. ------------------------------------------------------------------------ r5342 | marko | 2009-06-17 06:15:32 -0400 (Wed, 17 Jun 2009) | 60 lines branches/zip: Merge revisions 5233:5341 from branches/5.1: ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5243 | sunny | 2009-06-04 03:17:14 +0300 (Thu, 04 Jun 2009) | 14 lines branches/5.1: When the InnoDB and MySQL data dictionaries go out of sync, before the bug fix we would assert on missing autoinc columns. With this fix we allow MySQL to open the table but set the next autoinc value for the column to the MAX value. This effectively disables the next value generation. INSERTs will fail with a generic AUTOINC failure. However, the user should be able to read/dump the table, set the column values explicitly, use ALTER TABLE to set the next autoinc value and/or sync the two data dictionaries to resume normal operations. Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value from the index (PRIMARY) rb://118 ------------------------------------------------------------------------ r5252 | sunny | 2009-06-04 10:16:24 +0300 (Thu, 04 Jun 2009) | 2 lines branches/5.1: The version of the result file checked in was broken in r5243. ------------------------------------------------------------------------ r5259 | vasil | 2009-06-05 10:29:16 +0300 (Fri, 05 Jun 2009) | 7 lines branches/5.1: Remove the word "Error" from the printout because the mysqltest suite interprets it as an error and thus the innodb-autoinc test fails. Approved by: Sunny (via IM) ------------------------------------------------------------------------ r5339 | marko | 2009-06-17 11:01:37 +0300 (Wed, 17 Jun 2009) | 2 lines branches/5.1: Add missing #include "mtr0log.h" so that the code compiles with -DUNIV_MUST_NOT_INLINE. (null merge; this had already been committed in branches/zip) ------------------------------------------------------------------------ r5340 | marko | 2009-06-17 12:11:49 +0300 (Wed, 17 Jun 2009) | 4 lines branches/5.1: row_unlock_for_mysql(): When the clustered index is unknown, refuse to unlock the record. (Bug #45357, caused by the fix of Bug #39320). rb://132 approved by Sunny Bains. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5343 | vasil | 2009-06-17 08:56:12 -0400 (Wed, 17 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5342. ------------------------------------------------------------------------ r5344 | marko | 2009-06-17 09:03:45 -0400 (Wed, 17 Jun 2009) | 1 line branches/zip: row_merge_read_rec(): Fix a UNIV_DEBUG bug (Bug #45426) ------------------------------------------------------------------------ r5391 | marko | 2009-06-22 05:31:35 -0400 (Mon, 22 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Fix a bogus warning about block_mutex being possibly uninitialized. ------------------------------------------------------------------------ r5392 | marko | 2009-06-22 07:58:20 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: ha_innobase::check_if_incompatible_data(): When ROW_FORMAT=DEFAULT, do not compare to get_row_type(). Without this change, fast index creation will be disabled in recent versions of MySQL 5.1. ------------------------------------------------------------------------ r5393 | pekka | 2009-06-22 09:27:55 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Minor changes for Hot Backup to build correctly. (The code bracketed between #ifdef UNIV_HOTBACKUP and #endif /* UNIV_HOTBACKUP */). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5394 | pekka | 2009-06-22 09:46:34 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Add functions for checking the format of tablespaces for Hot Backup build (UNIV_HOTBACKUP defined). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5397 | calvin | 2009-06-23 16:59:42 -0400 (Tue, 23 Jun 2009) | 7 lines branches/zip: change the header file path. Change the header file path from ../storage/innobase/include/ to ../include/. In the planned 5.1 + plugin release, the source directory of the plugin will not be in storage/innobase. Approved by: Heikki (IM) ------------------------------------------------------------------------ r5407 | calvin | 2009-06-24 09:51:08 -0400 (Wed, 24 Jun 2009) | 4 lines branches/zip: remove relative path of header files. Suggested by Marko. ------------------------------------------------------------------------ r5412 | marko | 2009-06-25 06:27:08 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: Replace a DBUG_ASSERT with ut_a to track down Issue #290. ------------------------------------------------------------------------ r5415 | marko | 2009-06-25 06:45:57 -0400 (Thu, 25 Jun 2009) | 3 lines branches/zip: dict_index_find_cols(): Print diagnostic on name mismatch. This addresses Bug #44571 but does not fix it. rb://135 approved by Sunny Bains. ------------------------------------------------------------------------ r5417 | marko | 2009-06-25 08:20:56 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: ha_innodb.cc: Move the misplaced Doxygen @file comment. ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 08:55:52 -0400 (Thu, 25 Jun 2009) | 5 lines branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ r5423 | calvin | 2009-06-26 16:52:52 -0400 (Fri, 26 Jun 2009) | 2 lines branches/zip: Fix typos. ------------------------------------------------------------------------ r5425 | marko | 2009-06-29 04:52:30 -0400 (Mon, 29 Jun 2009) | 4 lines branches/zip: ha_innobase::add_index(), ha_innobase::final_drop_index(): Start prebuilt->trx before locking the table. This should fix Issue #293 and could fix Issue #229. Approved by Sunny (over IM). ------------------------------------------------------------------------ r5426 | marko | 2009-06-29 05:24:27 -0400 (Mon, 29 Jun 2009) | 3 lines branches/zip: buf_page_get_gen(): Fix a race condition when reading buf_fix_count. This could explain Issue #156. Tested by Michael. ------------------------------------------------------------------------ r5427 | marko | 2009-06-29 05:54:53 -0400 (Mon, 29 Jun 2009) | 5 lines branches/zip: lock_print_info_all_transactions(), buf_read_recv_pages(): Tolerate missing tablespaces (zip_size==ULINT_UNDEFINED). buf_page_get_gen(): Add ut_ad(ut_is_2pow(zip_size)). Issue #289, rb://136 approved by Sunny Bains ------------------------------------------------------------------------ r5428 | marko | 2009-06-29 07:06:29 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: row_sel_store_mysql_rec(): Add missing pointer cast. Do not do arithmetics on void pointers. ------------------------------------------------------------------------ r5429 | marko | 2009-06-29 09:49:54 -0400 (Mon, 29 Jun 2009) | 13 lines branches/zip: Do not crash on SET GLOBAL innodb_file_format=DEFAULT or SET GLOBAL innodb_file_format_check=DEFAULT. innodb_file_format.test: New test for innodb_file_format and innodb_file_format_check. innodb_file_format_name_validate(): Store the string in *save. innodb_file_format_name_update(): Check the string again. innodb_file_format_check_validate(): Store the string in *save. innodb_file_format_check_update(): Check the string again. Issue #282, rb://140 approved by Heikki Tuuri ------------------------------------------------------------------------ r5430 | marko | 2009-06-29 09:58:07 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: lock_rec_validate_page(): Add another assertion to track down Issue #289. ------------------------------------------------------------------------ r5431 | marko | 2009-06-29 09:58:40 -0400 (Mon, 29 Jun 2009) | 1 line branches/zip: Revert an accidentally made change in r5430 to univ.i. ------------------------------------------------------------------------ r5437 | marko | 2009-06-30 05:10:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: ibuf_dummy_index_free(): Beautify the comment. ------------------------------------------------------------------------ r5438 | marko | 2009-06-30 05:10:32 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: fseg_free(): Remove this unused function. ------------------------------------------------------------------------ r5439 | marko | 2009-06-30 05:15:22 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: fseg_validate(): Enclose in #ifdef UNIV_DEBUG. This function is unused, but it could turn out to be a useful debugging aid. ------------------------------------------------------------------------ r5441 | marko | 2009-06-30 06:30:14 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: ha_delete(): Remove this unused function that was very similar to ha_search_and_delete_if_found(). ------------------------------------------------------------------------ r5442 | marko | 2009-06-30 06:45:41 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: lock_is_on_table(), lock_table_unlock(): Unused, remove. ------------------------------------------------------------------------ r5443 | marko | 2009-06-30 07:03:00 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_event_create_auto(): Unused, remove. ------------------------------------------------------------------------ r5444 | marko | 2009-06-30 07:19:49 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: que_graph_try_free(): Unused, remove. ------------------------------------------------------------------------ r5445 | marko | 2009-06-30 07:28:11 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: row_build_row_ref_from_row(): Unused, remove. ------------------------------------------------------------------------ r5446 | marko | 2009-06-30 07:35:45 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_round_robin(), srv_que_task_enqueue(): Unused, remove. ------------------------------------------------------------------------ r5447 | marko | 2009-06-30 07:37:58 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_task_queue_check(): Unused, remove. ------------------------------------------------------------------------ r5448 | marko | 2009-06-30 07:56:36 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: mem_heap_cat(): Unused, remove. ------------------------------------------------------------------------ r5449 | marko | 2009-06-30 08:00:50 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: innobase_start_or_create_for_mysql(): Invoke os_get_os_version() at most once. ------------------------------------------------------------------------ r5450 | marko | 2009-06-30 08:02:20 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_file_close_no_error_handling(): Unused, remove. ------------------------------------------------------------------------ r5451 | marko | 2009-06-30 08:09:49 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: page_set_max_trx_id(): Make the code compile with UNIV_HOTBACKUP. ------------------------------------------------------------------------ r5452 | marko | 2009-06-30 08:10:26 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: os_file_close_no_error_handling(): Restore, as this function is used within InnoDB Hot Backup. ------------------------------------------------------------------------ r5453 | marko | 2009-06-30 08:14:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_process_set_priority_boost(): Unused, remove. ------------------------------------------------------------------------ r5454 | marko | 2009-06-30 08:42:52 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: Replace a non-ASCII character (ISO 8859-1 encoded U+00AD SOFT HYPHEN) with a cheap ASCII substitute. ------------------------------------------------------------------------ r5456 | inaam | 2009-06-30 14:21:09 -0400 (Tue, 30 Jun 2009) | 4 lines branches/zip Non functional change. s/Percona/Percona Inc./ ------------------------------------------------------------------------ r5470 | vasil | 2009-07-02 09:12:36 -0400 (Thu, 02 Jul 2009) | 16 lines branches/zip: Use PAUSE instruction inside spinloop if it is available. The patch was originally developed by Mikael Ronstrom <mikael@mysql.com> and can be found here: http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2768 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2771 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2772 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2774 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2777 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2799 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2800 Approved by: Heikki (rb://137) ------------------------------------------------------------------------ r5481 | vasil | 2009-07-06 13:16:32 -0400 (Mon, 06 Jul 2009) | 4 lines branches/zip: Remove unnecessary quotes and simplify plug.in. ------------------------------------------------------------------------ r5482 | calvin | 2009-07-06 18:36:35 -0400 (Mon, 06 Jul 2009) | 5 lines branches/zip: add COPYING files for Percona and Sun Micro. 1.0.4 contains patches based on contributions from Percona and Sun Microsystems. ------------------------------------------------------------------------ r5483 | calvin | 2009-07-07 05:36:43 -0400 (Tue, 07 Jul 2009) | 3 lines branches/zip: add IB_HAVE_PAUSE_INSTRUCTION to CMake. Windows will support PAUSE instruction by default. ------------------------------------------------------------------------ r5484 | inaam | 2009-07-07 18:57:14 -0400 (Tue, 07 Jul 2009) | 13 lines branches/zip rb://126 Based on contribution from Google Inc. This patch introduces a new parameter innodb_io_capacity to control the rate at which master threads performs various tasks. The default value is 200 and higher values imply more aggressive flushing and ibuf merges from within the master thread. This patch also changes the ibuf merge from synchronous to asynchronous. Another minor change is not to force the master thread to wait for a log flush to complete every second. Approved by: Heikki ------------------------------------------------------------------------ r5485 | inaam | 2009-07-07 19:00:49 -0400 (Tue, 07 Jul 2009) | 18 lines branches/zip rb://138 The current implementation is to try to flush the neighbors of every page that we flush. This patch makes the following distinction: 1) If the flush is from flush_list AND 2) If the flush is intended to move the oldest_modification LSN ahead (this happens when a user thread sees little space in the log file and attempts to flush pages from the buffer pool so that a checkpoint can be made) THEN Do not try to flush the neighbors. Just focus on flushing dirty pages at the end of flush_list Approved by: Heikki ------------------------------------------------------------------------ r5486 | inaam | 2009-07-08 12:11:40 -0400 (Wed, 08 Jul 2009) | 29 lines branches/zip rb://133 This patch introduces heuristics based flushing rate of dirty pages to avoid IO bursts at checkpoint. 1) log_capacity / log_generated per second gives us number of seconds in which ALL dirty pages need to be flushed. Based on this rough assumption we can say that n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate 2) We use weighted averages (hard coded to 20 seconds) of log_generation_rate to avoid resonance. 3) From the desired_flush_rate we subtract the number of pages that have been flushed due to LRU flushing. That gives us pages that we should flush as part of flush_list cleanup. And that is the number (capped by maximum io_capacity) that we try to flush from the master thread. Knobs: ====== innodb_adaptive_flushing: boolean, global, dynamic, default TRUE. Since this heuristic is very experimental and has the potential to dramatically change the IO pattern I think it is a good idea to leave a knob to turn it off. Approved by: Heikki ------------------------------------------------------------------------ r5487 | calvin | 2009-07-08 12:42:28 -0400 (Wed, 08 Jul 2009) | 7 lines branches/zip: fix PAUSE instruction patch on Windows The original PAUSE instruction patch (r5470) does not compile on Windows. Also, there is an elegant way of doing it on Windows - YieldProcessor(). Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5489 | vasil | 2009-07-10 05:02:22 -0400 (Fri, 10 Jul 2009) | 9 lines branches/zip: Change the defaults for innodb_sync_spin_loops: 20 -> 30 innodb_spin_wait_delay: 5 -> 6 This change was proposed by Sun/MySQL based on their performance testing, see https://svn.innodb.com/innobase/Release_tasks_for_InnoDB_Plugin_V1.0.4 ------------------------------------------------------------------------ r5490 | vasil | 2009-07-10 05:04:20 -0400 (Fri, 10 Jul 2009) | 4 lines branches/zip: Add ChangeLog entry for 5489. ------------------------------------------------------------------------ r5491 | calvin | 2009-07-10 12:19:17 -0400 (Fri, 10 Jul 2009) | 6 lines branches/zip: add copyright info to files related to PAUSE instruction patch, contributed by Sun Microsystems. ------------------------------------------------------------------------ r5492 | calvin | 2009-07-10 17:47:34 -0400 (Fri, 10 Jul 2009) | 5 lines branches/zip: add ChangeLog entries for r5484-r5486. ------------------------------------------------------------------------ r5494 | vasil | 2009-07-13 03:37:35 -0400 (Mon, 13 Jul 2009) | 6 lines branches/zip: Restore the original value of innodb_sync_spin_loops at the end, previously the test assumed that setting it to 20 will do this, but now the default is 30 and MTR's internal check failed. ------------------------------------------------------------------------ r5495 | inaam | 2009-07-13 11:48:45 -0400 (Mon, 13 Jul 2009) | 5 lines branches/zip rb://138 (REVERT) Revert the flush neighbors patch as it shows regression in the benchmarks run by Michael. ------------------------------------------------------------------------ r5496 | inaam | 2009-07-13 14:04:57 -0400 (Mon, 13 Jul 2009) | 4 lines branches/zip Fixed warnings on windows where ulint != ib_uint64_t ------------------------------------------------------------------------ r5497 | calvin | 2009-07-13 15:01:00 -0400 (Mon, 13 Jul 2009) | 9 lines branches/zip: fix run-time symbols clash on Solaris. This patch is from Sergey Vojtovich of Sun Microsystems, to fix run-time symbols clash on Solaris with older C++ compiler: - when finding out a way to hide symbols, make decision basing on compiler, not operating system. - Sun Studio supports __hidden declaration specifier for this purpose. ------------------------------------------------------------------------ r5498 | vasil | 2009-07-14 03:16:18 -0400 (Tue, 14 Jul 2009) | 92 lines branches/zip: Merge r5341:5497 from branches/5.1, skipping: c5419 because it is merge from branches/zip into branches/5.1 c5466 because the source code has been adjusted to match the MySQL behavior and the innodb-autoinc test does not fail in branches/zip, if c5466 is merged, then innodb-autoinc starts failing, Sunny suggested not to merge c5466. and resolving conflicts in c5410, c5440, c5488: ------------------------------------------------------------------------ r5410 | marko | 2009-06-24 22:26:34 +0300 (Wed, 24 Jun 2009) | 2 lines Changed paths: M /branches/5.1/include/trx0sys.ic M /branches/5.1/trx/trx0purge.c M /branches/5.1/trx/trx0sys.c M /branches/5.1/trx/trx0undo.c branches/5.1: Add missing #include "mtr0log.h" to avoid warnings when compiling with -DUNIV_MUST_NOT_INLINE. ------------------------------------------------------------------------ r5419 | marko | 2009-06-25 16:11:57 +0300 (Thu, 25 Jun 2009) | 18 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug42101-nonzero.result M /branches/5.1/mysql-test/innodb_bug42101-nonzero.test M /branches/5.1/mysql-test/innodb_bug42101.result M /branches/5.1/mysql-test/innodb_bug42101.test branches/5.1: Merge r5418 from branches/zip: ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 15:55:52 +0300 (Thu, 25 Jun 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5440 | vasil | 2009-06-30 13:04:29 +0300 (Tue, 30 Jun 2009) | 8 lines Changed paths: M /branches/5.1/fil/fil0fil.c branches/5.1: Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to match documentation by changing the URL from http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html to http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting-datadict.html ------------------------------------------------------------------------ r5466 | vasil | 2009-07-02 10:46:45 +0300 (Thu, 02 Jul 2009) | 6 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Adjust the failing innodb-autoinc test to conform to the latest behavior of the MySQL code. The idea and the comment in innodb-autoinc.test come from Sunny. ------------------------------------------------------------------------ r5488 | vasil | 2009-07-09 19:16:44 +0300 (Thu, 09 Jul 2009) | 13 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug21704.result A /branches/5.1/mysql-test/innodb_bug21704.test branches/5.1: Fix Bug#21704 Renaming column does not update FK definition by checking whether a column that participates in a FK definition is being renamed and denying the ALTER in this case. The patch was originally developed by Davi Arnaut <Davi.Arnaut@Sun.COM>: http://lists.mysql.com/commits/77714 and was later adjusted to conform to InnoDB coding style by me (Vasil), I also added some more comments and moved the bug specific mysql-test to a separate file to make it more manageable and flexible. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5499 | calvin | 2009-07-14 12:55:10 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: add a missing file in Makefile.am This change was suggested by MySQL. ------------------------------------------------------------------------ r5500 | calvin | 2009-07-14 13:03:26 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: minor change Remove an extra "with". ------------------------------------------------------------------------ r5501 | vasil | 2009-07-14 13:58:15 -0400 (Tue, 14 Jul 2009) | 5 lines branches/zip: Add @ZLIB_INCLUDES@ so that the InnoDB Plugin picks up the same zlib.h header file that is eventually used by mysqld. ------------------------------------------------------------------------ r5502 | vasil | 2009-07-14 13:59:59 -0400 (Tue, 14 Jul 2009) | 4 lines branches/zip: Add include/ut0auxconf.h to noinst_HEADERS ------------------------------------------------------------------------ r5503 | vasil | 2009-07-14 14:16:11 -0400 (Tue, 14 Jul 2009) | 8 lines branches/zip: Non-functional change: put files in noinst_HEADERS and libinnobase_a_SOURCES one per line and sort alphabetically, so it is easier to find if a file is there or not and also diffs show exactly the added or removed file instead of surrounding lines too. ------------------------------------------------------------------------ r5504 | calvin | 2009-07-15 04:58:44 -0400 (Wed, 15 Jul 2009) | 6 lines branches/zip: fix compile errors on Win64 Both srv_read_ahead_factor and srv_io_capacity should be defined as ulong. Approved by: Sunny ------------------------------------------------------------------------ r5508 | calvin | 2009-07-16 09:40:47 -0400 (Thu, 16 Jul 2009) | 16 lines branches/zip: Support inlining of functions and prefetch with Sun Studio Those changes are contributed by Sun/MySQL. Two sets of changes in this patch when Sun Studio is used: - Explicit inlining of functions - Prefetch Support This patch has been tested by Sunny with the plugin statically built in. Since we've never built the plugin as a dynamically loaded module on Solaris, it is a separate task to change plug.in. rb://142 Approved by: Heikki ------------------------------------------------------------------------ r5509 | calvin | 2009-07-16 09:45:28 -0400 (Thu, 16 Jul 2009) | 2 lines branches/zip: add ChangeLog entry for r5508. ------------------------------------------------------------------------ r5512 | sunny | 2009-07-19 19:52:48 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Remove unused extern ref to timed_mutexes. ------------------------------------------------------------------------ r5513 | sunny | 2009-07-19 19:58:43 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Undo r5512 ------------------------------------------------------------------------ r5514 | sunny | 2009-07-19 20:08:49 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Only use my_bool when UNIV_HOTBACKUP is not defined. ------------------------------------------------------------------------ r5515 | sunny | 2009-07-20 03:29:14 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: The dict_table_t::autoinc_mutex field is not used in HotBackup. ------------------------------------------------------------------------ r5516 | sunny | 2009-07-20 03:46:05 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Make this file usable from within HotBackup. A new file has been introduced called hb_univ.i. This file should have all the HotBackup specific configuration. ------------------------------------------------------------------------ r5517 | sunny | 2009-07-20 03:55:11 -0400 (Mon, 20 Jul 2009) | 2 lines Add /* UNIV_HOTBACK */ ------------------------------------------------------------------------ r5519 | vasil | 2009-07-20 04:45:18 -0400 (Mon, 20 Jul 2009) | 31 lines branches/zip: Merge r5497:5518 from branches/5.1: ------------------------------------------------------------------------ r5518 | vasil | 2009-07-20 11:29:47 +0300 (Mon, 20 Jul 2009) | 22 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2874.2.1 committer: Anurag Shekhar <anurag.shekhar@sun.com> branch nick: mysql-5.1-bugteam-windows-warning timestamp: Wed 2009-05-13 15:41:24 +0530 message: Bug #39802 On Windows, 32-bit time_t should be enforced This patch fixes compilation warning, "conversion from 'time_t' to 'ulong', possible loss of data". The fix is to typecast time_t to ulong before assigning it to ulong. Backported this from 6.0-bugteam tree. modified: storage/archive/ha_archive.cc storage/federated/ha_federated.cc storage/innobase/handler/ha_innodb.cc storage/myisam/ha_myisam.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5520 | vasil | 2009-07-20 04:51:47 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Add ChangeLog entries for r5498 and r5519. ------------------------------------------------------------------------ r5524 | inaam | 2009-07-20 12:23:15 -0400 (Mon, 20 Jul 2009) | 9 lines branches/zip Change the read ahead parameter name to innodb_read_ahead_threshold. Change the meaning of this parameter to signify the number of pages that must be sequentially accessed for InnoDB to trigger a readahead request. Suggested by: Ken ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 5144:5524 from branches/zip ------------------------------------------------------------------------ r5147 | marko | 2009-05-27 06:55:14 -0400 (Wed, 27 May 2009) | 1 line branches/zip: ibuf0ibuf.c: Improve a comment. ------------------------------------------------------------------------ r5149 | marko | 2009-05-27 07:46:42 -0400 (Wed, 27 May 2009) | 34 lines branches/zip: Merge revisions 4994:5148 from branches/5.1: ------------------------------------------------------------------------ r5126 | vasil | 2009-05-26 16:57:12 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Non-functional change: move FSP_* macros from fsp0fsp.h to a new file fsp0types.h. This is needed in order to be able to use FSP_EXTENT_SIZE in mtr0log.ic. ------------------------------------------------------------------------ r5127 | vasil | 2009-05-26 17:05:43 +0300 (Tue, 26 May 2009) | 9 lines branches/5.1: Preparation for the fix of Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not include unnecessary headers mtr0log.h and fut0lst.h in trx0sys.h and include fsp0fsp.h just before it is needed. This is needed in order to be able to use TRX_SYS_SPACE in mtr0log.ic. ------------------------------------------------------------------------ r5128 | vasil | 2009-05-26 17:26:37 +0300 (Tue, 26 May 2009) | 7 lines branches/5.1: Fix Bug#45097 Hang during recovery, redo logs for doublewrite buffer pages Do not write redo log for the pages in the doublewrite buffer. Also, do not make a dummy change to the page because this is not needed. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5169 | marko | 2009-05-28 03:21:55 -0400 (Thu, 28 May 2009) | 1 line branches/zip: mtr0mtr.h: Add Doxygen comments for the redo log entry types. ------------------------------------------------------------------------ r5176 | marko | 2009-05-28 07:14:02 -0400 (Thu, 28 May 2009) | 1 line branches/zip: Correct a debug assertion that was added in r5125. ------------------------------------------------------------------------ r5201 | marko | 2009-06-01 06:35:25 -0400 (Mon, 01 Jun 2009) | 2 lines branches/zip: Clean up some comments. Make the rec parameter of mlog_open_and_write_index() const. ------------------------------------------------------------------------ r5234 | marko | 2009-06-03 08:26:41 -0400 (Wed, 03 Jun 2009) | 44 lines branches/zip: Merge revisions 5148:5233 from branches/5.1: ------------------------------------------------------------------------ r5150 | vasil | 2009-05-27 18:56:03 +0300 (Wed, 27 May 2009) | 4 lines branches/5.1: Whitespace fixup. ------------------------------------------------------------------------ r5191 | vasil | 2009-05-30 17:46:05 +0300 (Sat, 30 May 2009) | 19 lines branches/5.1: Merge a change from MySQL (this fixes the failing innodb_mysql test): ------------------------------------------------------------ revno: 1810.3894.10 committer: Sergey Glukhov <Sergey.Glukhov@sun.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-05-19 11:32:21 +0500 message: Bug#39793 Foreign keys not constructed when column has a '#' in a comment or default value Internal InnoDN FK parser does not recognize '\'' as quotation symbol. Suggested fix is to add '\'' symbol check for quotation condition (dict_strip_comments() function). modified: innobase/dict/dict0dict.c mysql-test/r/innodb_mysql.result mysql-test/t/innodb_mysql.test ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5250 | marko | 2009-06-04 02:58:23 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add Doxygen comments to the rest of buf0*. ------------------------------------------------------------------------ r5251 | marko | 2009-06-04 02:59:51 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Replace <= in a function comment. ------------------------------------------------------------------------ r5253 | marko | 2009-06-04 06:37:35 -0400 (Thu, 04 Jun 2009) | 1 line branches/zip: Add missing Doxygen comments for page0zip. ------------------------------------------------------------------------ r5261 | vasil | 2009-06-05 11:13:31 -0400 (Fri, 05 Jun 2009) | 15 lines branches/zip: Fix Mantis Issue#244 fix bug in linear read ahead (no check on access pattern) The changes are: 1) Take into account access pattern when deciding whether or not to do linear read ahead. 2) Expose a knob innodb_read_ahead_factor = [0-64] default (8), dynamic, global to control linear read ahead behvior 3) Disable random read ahead. Keep the code for now. Submitted by: Inaam (rb://122) Approved by: Heikki (rb://122) ------------------------------------------------------------------------ r5262 | vasil | 2009-06-05 12:04:25 -0400 (Fri, 05 Jun 2009) | 22 lines branches/zip: Enable functionality to have multiple background io helper threads. This patch is based on percona contributions. More details about this patch will be written at: https://svn.innodb.com/innobase/MultipleBackgroundThreads The patch essentially does the following: expose following knobs: innodb_read_io_threads = [1 - 64] default 1 innodb_write_io_threads = [1 - 64] default 1 deprecate innodb_file_io_threads (this parameter was relevant only on windows) Internally it allows multiple segments for read and write IO request arrays where one thread works on one segement. Submitted by: Inaam (rb://124) Approved by: Heikki (rb://124) ------------------------------------------------------------------------ r5263 | vasil | 2009-06-05 12:19:37 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Whitespace cleanup. ------------------------------------------------------------------------ r5264 | vasil | 2009-06-05 12:26:58 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5261. ------------------------------------------------------------------------ r5265 | vasil | 2009-06-05 12:34:11 -0400 (Fri, 05 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5262. ------------------------------------------------------------------------ r5268 | inaam | 2009-06-08 12:18:21 -0400 (Mon, 08 Jun 2009) | 7 lines branches/zip Non functional change: Added legal notices acknowledging percona contribution to the multiple IO helper threads patch i.e.: r5262 ------------------------------------------------------------------------ r5283 | inaam | 2009-06-09 13:46:29 -0400 (Tue, 09 Jun 2009) | 9 lines branches/zip rb://130 Enable Group Commit functionality that was broken in 5.0 when distributed transactions were introduced. Reviewed by: Heikki ------------------------------------------------------------------------ r5319 | marko | 2009-06-11 04:40:33 -0400 (Thu, 11 Jun 2009) | 3 lines branches/zip: Declare os_thread_id_t as unsigned long, because ulint is wrong on Win64. Pointed out by Vladislav Vaintroub <wlad@sun.com>. ------------------------------------------------------------------------ r5320 | inaam | 2009-06-11 09:15:41 -0400 (Thu, 11 Jun 2009) | 14 lines branches/zip rb://131 This patch changes the following defaults: max_dirty_pages_pct: default from 90 to 75. max allowed from 100 to 99 additional_mem_pool_size: default from 1 to 8 MB buffer_pool_size: default from 8 to 128 MB log_buffer_size: default from 1 to 8 MB read_io_threads/write_io_threads: default from 1 to 4 The log file sizes are untouched because of upgrade issues Reviewed by: Heikki ------------------------------------------------------------------------ r5330 | marko | 2009-06-16 04:08:59 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_gen(): Reduce mutex holding time by adjusting buf_pool->n_pend_unzip while only holding buf_pool_mutex. ------------------------------------------------------------------------ r5331 | marko | 2009-06-16 05:00:48 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Eliminate a buf_page_get_mutex() call. The function must switch on the block state anyway. ------------------------------------------------------------------------ r5332 | vasil | 2009-06-16 05:03:27 -0400 (Tue, 16 Jun 2009) | 4 lines branches/zip: Add ChangeLog entries for r5283 and r5320. ------------------------------------------------------------------------ r5333 | marko | 2009-06-16 05:27:46 -0400 (Tue, 16 Jun 2009) | 1 line branches/zip: buf_page_io_query(): Remove unused function. ------------------------------------------------------------------------ r5335 | marko | 2009-06-16 09:23:10 -0400 (Tue, 16 Jun 2009) | 2 lines branches/zip: innodb.test: Adjust the tolerance of innodb_buffer_pool_pages_total for r5320. ------------------------------------------------------------------------ r5342 | marko | 2009-06-17 06:15:32 -0400 (Wed, 17 Jun 2009) | 60 lines branches/zip: Merge revisions 5233:5341 from branches/5.1: ------------------------------------------------------------------------ r5233 | marko | 2009-06-03 15:12:44 +0300 (Wed, 03 Jun 2009) | 11 lines branches/5.1: Merge the test case from r5232 from branches/5.0: ------------------------------------------------------------------------ r5232 | marko | 2009-06-03 14:31:04 +0300 (Wed, 03 Jun 2009) | 21 lines branches/5.0: Merge r3590 from branches/5.1 in order to fix Bug #40565 (Update Query Results in "1 Row Affected" But Should Be "Zero Rows"). Also, add a test case for Bug #40565. rb://128 approved by Heikki Tuuri ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5243 | sunny | 2009-06-04 03:17:14 +0300 (Thu, 04 Jun 2009) | 14 lines branches/5.1: When the InnoDB and MySQL data dictionaries go out of sync, before the bug fix we would assert on missing autoinc columns. With this fix we allow MySQL to open the table but set the next autoinc value for the column to the MAX value. This effectively disables the next value generation. INSERTs will fail with a generic AUTOINC failure. However, the user should be able to read/dump the table, set the column values explicitly, use ALTER TABLE to set the next autoinc value and/or sync the two data dictionaries to resume normal operations. Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value from the index (PRIMARY) rb://118 ------------------------------------------------------------------------ r5252 | sunny | 2009-06-04 10:16:24 +0300 (Thu, 04 Jun 2009) | 2 lines branches/5.1: The version of the result file checked in was broken in r5243. ------------------------------------------------------------------------ r5259 | vasil | 2009-06-05 10:29:16 +0300 (Fri, 05 Jun 2009) | 7 lines branches/5.1: Remove the word "Error" from the printout because the mysqltest suite interprets it as an error and thus the innodb-autoinc test fails. Approved by: Sunny (via IM) ------------------------------------------------------------------------ r5339 | marko | 2009-06-17 11:01:37 +0300 (Wed, 17 Jun 2009) | 2 lines branches/5.1: Add missing #include "mtr0log.h" so that the code compiles with -DUNIV_MUST_NOT_INLINE. (null merge; this had already been committed in branches/zip) ------------------------------------------------------------------------ r5340 | marko | 2009-06-17 12:11:49 +0300 (Wed, 17 Jun 2009) | 4 lines branches/5.1: row_unlock_for_mysql(): When the clustered index is unknown, refuse to unlock the record. (Bug #45357, caused by the fix of Bug #39320). rb://132 approved by Sunny Bains. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5343 | vasil | 2009-06-17 08:56:12 -0400 (Wed, 17 Jun 2009) | 4 lines branches/zip: Add ChangeLog entry for r5342. ------------------------------------------------------------------------ r5344 | marko | 2009-06-17 09:03:45 -0400 (Wed, 17 Jun 2009) | 1 line branches/zip: row_merge_read_rec(): Fix a UNIV_DEBUG bug (Bug #45426) ------------------------------------------------------------------------ r5391 | marko | 2009-06-22 05:31:35 -0400 (Mon, 22 Jun 2009) | 2 lines branches/zip: buf_page_get_zip(): Fix a bogus warning about block_mutex being possibly uninitialized. ------------------------------------------------------------------------ r5392 | marko | 2009-06-22 07:58:20 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: ha_innobase::check_if_incompatible_data(): When ROW_FORMAT=DEFAULT, do not compare to get_row_type(). Without this change, fast index creation will be disabled in recent versions of MySQL 5.1. ------------------------------------------------------------------------ r5393 | pekka | 2009-06-22 09:27:55 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Minor changes for Hot Backup to build correctly. (The code bracketed between #ifdef UNIV_HOTBACKUP and #endif /* UNIV_HOTBACKUP */). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5394 | pekka | 2009-06-22 09:46:34 -0400 (Mon, 22 Jun 2009) | 4 lines branches/zip: Add functions for checking the format of tablespaces for Hot Backup build (UNIV_HOTBACKUP defined). This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r5397 | calvin | 2009-06-23 16:59:42 -0400 (Tue, 23 Jun 2009) | 7 lines branches/zip: change the header file path. Change the header file path from ../storage/innobase/include/ to ../include/. In the planned 5.1 + plugin release, the source directory of the plugin will not be in storage/innobase. Approved by: Heikki (IM) ------------------------------------------------------------------------ r5407 | calvin | 2009-06-24 09:51:08 -0400 (Wed, 24 Jun 2009) | 4 lines branches/zip: remove relative path of header files. Suggested by Marko. ------------------------------------------------------------------------ r5412 | marko | 2009-06-25 06:27:08 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: Replace a DBUG_ASSERT with ut_a to track down Issue #290. ------------------------------------------------------------------------ r5415 | marko | 2009-06-25 06:45:57 -0400 (Thu, 25 Jun 2009) | 3 lines branches/zip: dict_index_find_cols(): Print diagnostic on name mismatch. This addresses Bug #44571 but does not fix it. rb://135 approved by Sunny Bains. ------------------------------------------------------------------------ r5417 | marko | 2009-06-25 08:20:56 -0400 (Thu, 25 Jun 2009) | 1 line branches/zip: ha_innodb.cc: Move the misplaced Doxygen @file comment. ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 08:55:52 -0400 (Thu, 25 Jun 2009) | 5 lines branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ r5423 | calvin | 2009-06-26 16:52:52 -0400 (Fri, 26 Jun 2009) | 2 lines branches/zip: Fix typos. ------------------------------------------------------------------------ r5425 | marko | 2009-06-29 04:52:30 -0400 (Mon, 29 Jun 2009) | 4 lines branches/zip: ha_innobase::add_index(), ha_innobase::final_drop_index(): Start prebuilt->trx before locking the table. This should fix Issue #293 and could fix Issue #229. Approved by Sunny (over IM). ------------------------------------------------------------------------ r5426 | marko | 2009-06-29 05:24:27 -0400 (Mon, 29 Jun 2009) | 3 lines branches/zip: buf_page_get_gen(): Fix a race condition when reading buf_fix_count. This could explain Issue #156. Tested by Michael. ------------------------------------------------------------------------ r5427 | marko | 2009-06-29 05:54:53 -0400 (Mon, 29 Jun 2009) | 5 lines branches/zip: lock_print_info_all_transactions(), buf_read_recv_pages(): Tolerate missing tablespaces (zip_size==ULINT_UNDEFINED). buf_page_get_gen(): Add ut_ad(ut_is_2pow(zip_size)). Issue #289, rb://136 approved by Sunny Bains ------------------------------------------------------------------------ r5428 | marko | 2009-06-29 07:06:29 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: row_sel_store_mysql_rec(): Add missing pointer cast. Do not do arithmetics on void pointers. ------------------------------------------------------------------------ r5429 | marko | 2009-06-29 09:49:54 -0400 (Mon, 29 Jun 2009) | 13 lines branches/zip: Do not crash on SET GLOBAL innodb_file_format=DEFAULT or SET GLOBAL innodb_file_format_check=DEFAULT. innodb_file_format.test: New test for innodb_file_format and innodb_file_format_check. innodb_file_format_name_validate(): Store the string in *save. innodb_file_format_name_update(): Check the string again. innodb_file_format_check_validate(): Store the string in *save. innodb_file_format_check_update(): Check the string again. Issue #282, rb://140 approved by Heikki Tuuri ------------------------------------------------------------------------ r5430 | marko | 2009-06-29 09:58:07 -0400 (Mon, 29 Jun 2009) | 2 lines branches/zip: lock_rec_validate_page(): Add another assertion to track down Issue #289. ------------------------------------------------------------------------ r5431 | marko | 2009-06-29 09:58:40 -0400 (Mon, 29 Jun 2009) | 1 line branches/zip: Revert an accidentally made change in r5430 to univ.i. ------------------------------------------------------------------------ r5437 | marko | 2009-06-30 05:10:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: ibuf_dummy_index_free(): Beautify the comment. ------------------------------------------------------------------------ r5438 | marko | 2009-06-30 05:10:32 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: fseg_free(): Remove this unused function. ------------------------------------------------------------------------ r5439 | marko | 2009-06-30 05:15:22 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: fseg_validate(): Enclose in #ifdef UNIV_DEBUG. This function is unused, but it could turn out to be a useful debugging aid. ------------------------------------------------------------------------ r5441 | marko | 2009-06-30 06:30:14 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: ha_delete(): Remove this unused function that was very similar to ha_search_and_delete_if_found(). ------------------------------------------------------------------------ r5442 | marko | 2009-06-30 06:45:41 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: lock_is_on_table(), lock_table_unlock(): Unused, remove. ------------------------------------------------------------------------ r5443 | marko | 2009-06-30 07:03:00 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_event_create_auto(): Unused, remove. ------------------------------------------------------------------------ r5444 | marko | 2009-06-30 07:19:49 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: que_graph_try_free(): Unused, remove. ------------------------------------------------------------------------ r5445 | marko | 2009-06-30 07:28:11 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: row_build_row_ref_from_row(): Unused, remove. ------------------------------------------------------------------------ r5446 | marko | 2009-06-30 07:35:45 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_round_robin(), srv_que_task_enqueue(): Unused, remove. ------------------------------------------------------------------------ r5447 | marko | 2009-06-30 07:37:58 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: srv_que_task_queue_check(): Unused, remove. ------------------------------------------------------------------------ r5448 | marko | 2009-06-30 07:56:36 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: mem_heap_cat(): Unused, remove. ------------------------------------------------------------------------ r5449 | marko | 2009-06-30 08:00:50 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: innobase_start_or_create_for_mysql(): Invoke os_get_os_version() at most once. ------------------------------------------------------------------------ r5450 | marko | 2009-06-30 08:02:20 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_file_close_no_error_handling(): Unused, remove. ------------------------------------------------------------------------ r5451 | marko | 2009-06-30 08:09:49 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: page_set_max_trx_id(): Make the code compile with UNIV_HOTBACKUP. ------------------------------------------------------------------------ r5452 | marko | 2009-06-30 08:10:26 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: os_file_close_no_error_handling(): Restore, as this function is used within InnoDB Hot Backup. ------------------------------------------------------------------------ r5453 | marko | 2009-06-30 08:14:01 -0400 (Tue, 30 Jun 2009) | 1 line branches/zip: os_process_set_priority_boost(): Unused, remove. ------------------------------------------------------------------------ r5454 | marko | 2009-06-30 08:42:52 -0400 (Tue, 30 Jun 2009) | 2 lines branches/zip: Replace a non-ASCII character (ISO 8859-1 encoded U+00AD SOFT HYPHEN) with a cheap ASCII substitute. ------------------------------------------------------------------------ r5456 | inaam | 2009-06-30 14:21:09 -0400 (Tue, 30 Jun 2009) | 4 lines branches/zip Non functional change. s/Percona/Percona Inc./ ------------------------------------------------------------------------ r5470 | vasil | 2009-07-02 09:12:36 -0400 (Thu, 02 Jul 2009) | 16 lines branches/zip: Use PAUSE instruction inside spinloop if it is available. The patch was originally developed by Mikael Ronstrom <mikael@mysql.com> and can be found here: http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2768 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2771 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2772 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2774 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2777 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2799 http://bazaar.launchpad.net/%7Emysql/mysql-server/mysql-5.4/revision/2800 Approved by: Heikki (rb://137) ------------------------------------------------------------------------ r5481 | vasil | 2009-07-06 13:16:32 -0400 (Mon, 06 Jul 2009) | 4 lines branches/zip: Remove unnecessary quotes and simplify plug.in. ------------------------------------------------------------------------ r5482 | calvin | 2009-07-06 18:36:35 -0400 (Mon, 06 Jul 2009) | 5 lines branches/zip: add COPYING files for Percona and Sun Micro. 1.0.4 contains patches based on contributions from Percona and Sun Microsystems. ------------------------------------------------------------------------ r5483 | calvin | 2009-07-07 05:36:43 -0400 (Tue, 07 Jul 2009) | 3 lines branches/zip: add IB_HAVE_PAUSE_INSTRUCTION to CMake. Windows will support PAUSE instruction by default. ------------------------------------------------------------------------ r5484 | inaam | 2009-07-07 18:57:14 -0400 (Tue, 07 Jul 2009) | 13 lines branches/zip rb://126 Based on contribution from Google Inc. This patch introduces a new parameter innodb_io_capacity to control the rate at which master threads performs various tasks. The default value is 200 and higher values imply more aggressive flushing and ibuf merges from within the master thread. This patch also changes the ibuf merge from synchronous to asynchronous. Another minor change is not to force the master thread to wait for a log flush to complete every second. Approved by: Heikki ------------------------------------------------------------------------ r5485 | inaam | 2009-07-07 19:00:49 -0400 (Tue, 07 Jul 2009) | 18 lines branches/zip rb://138 The current implementation is to try to flush the neighbors of every page that we flush. This patch makes the following distinction: 1) If the flush is from flush_list AND 2) If the flush is intended to move the oldest_modification LSN ahead (this happens when a user thread sees little space in the log file and attempts to flush pages from the buffer pool so that a checkpoint can be made) THEN Do not try to flush the neighbors. Just focus on flushing dirty pages at the end of flush_list Approved by: Heikki ------------------------------------------------------------------------ r5486 | inaam | 2009-07-08 12:11:40 -0400 (Wed, 08 Jul 2009) | 29 lines branches/zip rb://133 This patch introduces heuristics based flushing rate of dirty pages to avoid IO bursts at checkpoint. 1) log_capacity / log_generated per second gives us number of seconds in which ALL dirty pages need to be flushed. Based on this rough assumption we can say that n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate 2) We use weighted averages (hard coded to 20 seconds) of log_generation_rate to avoid resonance. 3) From the desired_flush_rate we subtract the number of pages that have been flushed due to LRU flushing. That gives us pages that we should flush as part of flush_list cleanup. And that is the number (capped by maximum io_capacity) that we try to flush from the master thread. Knobs: ====== innodb_adaptive_flushing: boolean, global, dynamic, default TRUE. Since this heuristic is very experimental and has the potential to dramatically change the IO pattern I think it is a good idea to leave a knob to turn it off. Approved by: Heikki ------------------------------------------------------------------------ r5487 | calvin | 2009-07-08 12:42:28 -0400 (Wed, 08 Jul 2009) | 7 lines branches/zip: fix PAUSE instruction patch on Windows The original PAUSE instruction patch (r5470) does not compile on Windows. Also, there is an elegant way of doing it on Windows - YieldProcessor(). Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5489 | vasil | 2009-07-10 05:02:22 -0400 (Fri, 10 Jul 2009) | 9 lines branches/zip: Change the defaults for innodb_sync_spin_loops: 20 -> 30 innodb_spin_wait_delay: 5 -> 6 This change was proposed by Sun/MySQL based on their performance testing, see https://svn.innodb.com/innobase/Release_tasks_for_InnoDB_Plugin_V1.0.4 ------------------------------------------------------------------------ r5490 | vasil | 2009-07-10 05:04:20 -0400 (Fri, 10 Jul 2009) | 4 lines branches/zip: Add ChangeLog entry for 5489. ------------------------------------------------------------------------ r5491 | calvin | 2009-07-10 12:19:17 -0400 (Fri, 10 Jul 2009) | 6 lines branches/zip: add copyright info to files related to PAUSE instruction patch, contributed by Sun Microsystems. ------------------------------------------------------------------------ r5492 | calvin | 2009-07-10 17:47:34 -0400 (Fri, 10 Jul 2009) | 5 lines branches/zip: add ChangeLog entries for r5484-r5486. ------------------------------------------------------------------------ r5494 | vasil | 2009-07-13 03:37:35 -0400 (Mon, 13 Jul 2009) | 6 lines branches/zip: Restore the original value of innodb_sync_spin_loops at the end, previously the test assumed that setting it to 20 will do this, but now the default is 30 and MTR's internal check failed. ------------------------------------------------------------------------ r5495 | inaam | 2009-07-13 11:48:45 -0400 (Mon, 13 Jul 2009) | 5 lines branches/zip rb://138 (REVERT) Revert the flush neighbors patch as it shows regression in the benchmarks run by Michael. ------------------------------------------------------------------------ r5496 | inaam | 2009-07-13 14:04:57 -0400 (Mon, 13 Jul 2009) | 4 lines branches/zip Fixed warnings on windows where ulint != ib_uint64_t ------------------------------------------------------------------------ r5497 | calvin | 2009-07-13 15:01:00 -0400 (Mon, 13 Jul 2009) | 9 lines branches/zip: fix run-time symbols clash on Solaris. This patch is from Sergey Vojtovich of Sun Microsystems, to fix run-time symbols clash on Solaris with older C++ compiler: - when finding out a way to hide symbols, make decision basing on compiler, not operating system. - Sun Studio supports __hidden declaration specifier for this purpose. ------------------------------------------------------------------------ r5498 | vasil | 2009-07-14 03:16:18 -0400 (Tue, 14 Jul 2009) | 92 lines branches/zip: Merge r5341:5497 from branches/5.1, skipping: c5419 because it is merge from branches/zip into branches/5.1 c5466 because the source code has been adjusted to match the MySQL behavior and the innodb-autoinc test does not fail in branches/zip, if c5466 is merged, then innodb-autoinc starts failing, Sunny suggested not to merge c5466. and resolving conflicts in c5410, c5440, c5488: ------------------------------------------------------------------------ r5410 | marko | 2009-06-24 22:26:34 +0300 (Wed, 24 Jun 2009) | 2 lines Changed paths: M /branches/5.1/include/trx0sys.ic M /branches/5.1/trx/trx0purge.c M /branches/5.1/trx/trx0sys.c M /branches/5.1/trx/trx0undo.c branches/5.1: Add missing #include "mtr0log.h" to avoid warnings when compiling with -DUNIV_MUST_NOT_INLINE. ------------------------------------------------------------------------ r5419 | marko | 2009-06-25 16:11:57 +0300 (Thu, 25 Jun 2009) | 18 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb_bug42101-nonzero.result M /branches/5.1/mysql-test/innodb_bug42101-nonzero.test M /branches/5.1/mysql-test/innodb_bug42101.result M /branches/5.1/mysql-test/innodb_bug42101.test branches/5.1: Merge r5418 from branches/zip: ------------------------------------------------------------------------ r5418 | marko | 2009-06-25 15:55:52 +0300 (Thu, 25 Jun 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test branches/zip: Fix a race condition caused by SET GLOBAL innodb_commit_concurrency=DEFAULT. (Bug #45749) When innodb_commit_concurrency is initially set nonzero, DEFAULT would change it back to 0, triggering Bug #42101. rb://139 approved by Heikki Tuuri. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5440 | vasil | 2009-06-30 13:04:29 +0300 (Tue, 30 Jun 2009) | 8 lines Changed paths: M /branches/5.1/fil/fil0fil.c branches/5.1: Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to match documentation by changing the URL from http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html to http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting-datadict.html ------------------------------------------------------------------------ r5466 | vasil | 2009-07-02 10:46:45 +0300 (Thu, 02 Jul 2009) | 6 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Adjust the failing innodb-autoinc test to conform to the latest behavior of the MySQL code. The idea and the comment in innodb-autoinc.test come from Sunny. ------------------------------------------------------------------------ r5488 | vasil | 2009-07-09 19:16:44 +0300 (Thu, 09 Jul 2009) | 13 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug21704.result A /branches/5.1/mysql-test/innodb_bug21704.test branches/5.1: Fix Bug#21704 Renaming column does not update FK definition by checking whether a column that participates in a FK definition is being renamed and denying the ALTER in this case. The patch was originally developed by Davi Arnaut <Davi.Arnaut@Sun.COM>: http://lists.mysql.com/commits/77714 and was later adjusted to conform to InnoDB coding style by me (Vasil), I also added some more comments and moved the bug specific mysql-test to a separate file to make it more manageable and flexible. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5499 | calvin | 2009-07-14 12:55:10 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: add a missing file in Makefile.am This change was suggested by MySQL. ------------------------------------------------------------------------ r5500 | calvin | 2009-07-14 13:03:26 -0400 (Tue, 14 Jul 2009) | 3 lines branches/zip: minor change Remove an extra "with". ------------------------------------------------------------------------ r5501 | vasil | 2009-07-14 13:58:15 -0400 (Tue, 14 Jul 2009) | 5 lines branches/zip: Add @ZLIB_INCLUDES@ so that the InnoDB Plugin picks up the same zlib.h header file that is eventually used by mysqld. ------------------------------------------------------------------------ r5502 | vasil | 2009-07-14 13:59:59 -0400 (Tue, 14 Jul 2009) | 4 lines branches/zip: Add include/ut0auxconf.h to noinst_HEADERS ------------------------------------------------------------------------ r5503 | vasil | 2009-07-14 14:16:11 -0400 (Tue, 14 Jul 2009) | 8 lines branches/zip: Non-functional change: put files in noinst_HEADERS and libinnobase_a_SOURCES one per line and sort alphabetically, so it is easier to find if a file is there or not and also diffs show exactly the added or removed file instead of surrounding lines too. ------------------------------------------------------------------------ r5504 | calvin | 2009-07-15 04:58:44 -0400 (Wed, 15 Jul 2009) | 6 lines branches/zip: fix compile errors on Win64 Both srv_read_ahead_factor and srv_io_capacity should be defined as ulong. Approved by: Sunny ------------------------------------------------------------------------ r5508 | calvin | 2009-07-16 09:40:47 -0400 (Thu, 16 Jul 2009) | 16 lines branches/zip: Support inlining of functions and prefetch with Sun Studio Those changes are contributed by Sun/MySQL. Two sets of changes in this patch when Sun Studio is used: - Explicit inlining of functions - Prefetch Support This patch has been tested by Sunny with the plugin statically built in. Since we've never built the plugin as a dynamically loaded module on Solaris, it is a separate task to change plug.in. rb://142 Approved by: Heikki ------------------------------------------------------------------------ r5509 | calvin | 2009-07-16 09:45:28 -0400 (Thu, 16 Jul 2009) | 2 lines branches/zip: add ChangeLog entry for r5508. ------------------------------------------------------------------------ r5512 | sunny | 2009-07-19 19:52:48 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Remove unused extern ref to timed_mutexes. ------------------------------------------------------------------------ r5513 | sunny | 2009-07-19 19:58:43 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Undo r5512 ------------------------------------------------------------------------ r5514 | sunny | 2009-07-19 20:08:49 -0400 (Sun, 19 Jul 2009) | 2 lines branches/zip: Only use my_bool when UNIV_HOTBACKUP is not defined. ------------------------------------------------------------------------ r5515 | sunny | 2009-07-20 03:29:14 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: The dict_table_t::autoinc_mutex field is not used in HotBackup. ------------------------------------------------------------------------ r5516 | sunny | 2009-07-20 03:46:05 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Make this file usable from within HotBackup. A new file has been introduced called hb_univ.i. This file should have all the HotBackup specific configuration. ------------------------------------------------------------------------ r5517 | sunny | 2009-07-20 03:55:11 -0400 (Mon, 20 Jul 2009) | 2 lines Add /* UNIV_HOTBACK */ ------------------------------------------------------------------------ r5519 | vasil | 2009-07-20 04:45:18 -0400 (Mon, 20 Jul 2009) | 31 lines branches/zip: Merge r5497:5518 from branches/5.1: ------------------------------------------------------------------------ r5518 | vasil | 2009-07-20 11:29:47 +0300 (Mon, 20 Jul 2009) | 22 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2874.2.1 committer: Anurag Shekhar <anurag.shekhar@sun.com> branch nick: mysql-5.1-bugteam-windows-warning timestamp: Wed 2009-05-13 15:41:24 +0530 message: Bug #39802 On Windows, 32-bit time_t should be enforced This patch fixes compilation warning, "conversion from 'time_t' to 'ulong', possible loss of data". The fix is to typecast time_t to ulong before assigning it to ulong. Backported this from 6.0-bugteam tree. modified: storage/archive/ha_archive.cc storage/federated/ha_federated.cc storage/innobase/handler/ha_innodb.cc storage/myisam/ha_myisam.cc ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5520 | vasil | 2009-07-20 04:51:47 -0400 (Mon, 20 Jul 2009) | 4 lines branches/zip: Add ChangeLog entries for r5498 and r5519. ------------------------------------------------------------------------ r5524 | inaam | 2009-07-20 12:23:15 -0400 (Mon, 20 Jul 2009) | 9 lines branches/zip Change the read ahead parameter name to innodb_read_ahead_threshold. Change the meaning of this parameter to signify the number of pages that must be sequentially accessed for InnoDB to trigger a readahead request. Suggested by: Ken ------------------------------------------------------------------------
16 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-25506 (3 of 3): Do not delete .ibd files before commit This is a complete rewrite of DROP TABLE, also as part of other DDL, such as ALTER TABLE, CREATE TABLE...SELECT, TRUNCATE TABLE. The background DROP TABLE queue hack is removed. If a transaction needs to drop and create a table by the same name (like TRUNCATE TABLE does), it must first rename the table to an internal #sql-ib name. No committed version of the data dictionary will include any #sql-ib tables, because whenever a transaction renames a table to a #sql-ib name, it will also drop that table. Either the rename will be rolled back, or the drop will be committed. Data files will be unlinked after the transaction has been committed and a FILE_RENAME record has been durably written. The file will actually be deleted when the detached file handle returned by fil_delete_tablespace() will be closed, after the latches have been released. It is possible that a purge of the delete of the SYS_INDEXES record for the clustered index will execute fil_delete_tablespace() concurrently with the DDL transaction. In that case, the thread that arrives later will wait for the other thread to finish. HTON_TRUNCATE_REQUIRES_EXCLUSIVE_USE: A new handler flag. ha_innobase::truncate() now requires that all other references to the table be released in advance. This was implemented by Monty. ha_innobase::delete_table(): If CREATE TABLE..SELECT is detected, we will "hijack" the current transaction, drop the table in the current transaction and commit the current transaction. This essentially fixes MDEV-21602. There is a FIXME comment about making the check less failure-prone. ha_innobase::truncate(), ha_innobase::delete_table(): Implement a fast path for temporary tables. We will no longer allow temporary tables to use the adaptive hash index. dict_table_t::mdl_name: The original table name for the purpose of acquiring MDL in purge, to prevent a race condition between a DDL transaction that is dropping a table, and purge processing undo log records of DML that had executed before the DDL operation. For #sql-backup- tables during ALTER TABLE...ALGORITHM=COPY, the dict_table_t::mdl_name will differ from dict_table_t::name. dict_table_t::parse_name(): Use mdl_name instead of name. dict_table_rename_in_cache(): Update mdl_name. For the internal FTS_ tables of FULLTEXT INDEX, purge would acquire MDL on the FTS_ table name, but not on the main table, and therefore it would be able to run concurrently with a DDL transaction that is dropping the table. Previously, the DROP TABLE queue hack prevented a race between purge and DDL. For now, we introduce purge_sys.stop_FTS() to prevent purge from opening any table, while a DDL transaction that may drop FTS_ tables is in progress. The function fts_lock_table(), which will be invoked before the dictionary is locked, will wait for purge to release any table handles. trx_t::drop_table_statistics(): Drop statistics for the table. This replaces dict_stats_drop_index(). We will drop or rename persistent statistics atomically as part of DDL transactions. On lock conflict for dropping statistics, we will fail instantly with DB_LOCK_WAIT_TIMEOUT, because we will be holding the exclusive data dictionary latch. trx_t::commit_cleanup(): Separated from trx_t::commit_in_memory(). Relax an assertion around fts_commit() and allow DB_LOCK_WAIT_TIMEOUT in addition to DB_DUPLICATE_KEY. The call to fts_commit() is entirely misplaced here and may obviously break the consistency of transactions that affect FULLTEXT INDEX. It needs to be fixed separately. dict_table_t::n_foreign_key_checks_running: Remove (MDEV-21175). The counter was a work-around for missing meta-data locking (MDL) on the SQL layer, and not really needed in MariaDB. ER_TABLE_IN_FK_CHECK: Replaced with ER_UNUSED_28. HA_ERR_TABLE_IN_FK_CHECK: Remove. row_ins_check_foreign_constraints(): Do not acquire dict_sys.latch either. The SQL-layer MDL will protect us. This was reviewed by Thirunarayanan Balathandayuthapani and tested by Matthias Leich.
4 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
branches/innodb+: Merged revisions 5525:5971 from branches/zip ------------------------------------------------------------------------ r5971 | marko | 2009-09-23 09:03:51 -0400 (Wed, 23 Sep 2009) | 2 lines branches/zip: os_file_pwrite(): Make the code compile in InnoDB Hot Backup when the pwrite system call is not available. ------------------------------------------------------------------------ r5956 | calvin | 2009-09-22 19:30:10 -0400 (Tue, 22 Sep 2009) | 4 lines branches/zip: remove handler0vars.h from Makefile.am Left over from r5950. ------------------------------------------------------------------------ r5951 | calvin | 2009-09-22 11:17:01 -0400 (Tue, 22 Sep 2009) | 4 lines branches/zip: adjust CMake file to work with old versions of MySQL Tested with MySQL 5.1.38 and 5.1.30. ------------------------------------------------------------------------ r5950 | calvin | 2009-09-22 02:42:46 -0400 (Tue, 22 Sep 2009) | 17 lines branches/zip: adjust Windows loading method for 5.1.38 Starting at 5.1.38, MySQL server exports symbols needed for dynamic plugin on Windows. There is no need for Windows specific loading. Also, the CMake files are simplified in 5.1.38. When WITH_INNOBASE_STORAGE_ENGINE is specified during configuration (win\configure.js), InnoDB is built as a static library. Otherwise, a dynamic InnoDB will be built (ha_innodb.dll). CMakeLists.txt requires minor changes in order to work with MySQL prior to 5.1.38. The changes will be in a separate patch. This patch addresses Mantis issue#286. ------------------------------------------------------------------------ r5945 | calvin | 2009-09-21 10:53:22 -0400 (Mon, 21 Sep 2009) | 4 lines branches/zip: fix a type in r5935 Should be innodb_open_files, spotted by Michael. ------------------------------------------------------------------------ r5940 | vasil | 2009-09-21 01:26:04 -0400 (Mon, 21 Sep 2009) | 4 lines branches/zip: Add ChangeLog entries for c5938. ------------------------------------------------------------------------ r5938 | calvin | 2009-09-19 03:14:25 -0400 (Sat, 19 Sep 2009) | 41 lines branches/zip: Merge revisions 2584:2956 from branches/6.0, except c2932. Bug#37232 and bug#31183 were fixed in the 6.0 branch only. They should be fixed in the plugin too, specially MySQL 6.0 is discontinued at this point. ------------------------------------------------------------------------ r2604 | inaam | 2008-08-21 09:37:06 -0500 (Thu, 21 Aug 2008) | 8 lines branches/6.0 bug#37232 Relax locking behaviour for REPLACE INTO t SELECT ... FROM t1. Now SELECT on t1 is performed as a consistent read when the isolation level is set to READ COMMITTED. Reviewed by: Heikki ------------------------------------------------------------------------ r2605 | inaam | 2008-08-21 09:59:33 -0500 (Thu, 21 Aug 2008) | 7 lines branches/6.0 Added a comment to clarify why distinct calls to read MySQL binary log file name and log position do not entail any race condition. Suggested by: Heikki ------------------------------------------------------------------------ r2956 | inaam | 2008-11-04 04:47:30 -0600 (Tue, 04 Nov 2008) | 11 lines branches/6.0 bug#31183 If the system tablespace runs out of space because 'autoextend' is not specified with innodb_data_file_path there was no error message printed to the error log. The client would get 'table full' error. This patch prints an appropriate error message to the error log. rb://43 Approved by: Marko ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5935 | calvin | 2009-09-18 17:08:02 -0400 (Fri, 18 Sep 2009) | 6 lines branches/zip: fix bug#44338; minor non-functional changes Bug#44338 innodb has message about non-existing option innodb_max_files_open. Change the option to innodb_open_files. The fix was committed into 6.0 branch. ------------------------------------------------------------------------ r5934 | vasil | 2009-09-18 13:06:46 -0400 (Fri, 18 Sep 2009) | 4 lines branches/zip: Fix typo. ------------------------------------------------------------------------ r5924 | vasil | 2009-09-18 00:59:30 -0400 (Fri, 18 Sep 2009) | 4 lines branches/zip: White space and formatting cleanup in the ChangeLog ------------------------------------------------------------------------ r5922 | marko | 2009-09-17 02:32:08 -0400 (Thu, 17 Sep 2009) | 4 lines branches/zip: innodb-zip.test: Make the test work with zlib 1.2.3.3. Apparently, the definition of compressBound() has slightly changed. This has been filed as Mantis Issue #345. ------------------------------------------------------------------------ r5920 | vasil | 2009-09-16 14:47:22 -0400 (Wed, 16 Sep 2009) | 4 lines branches/zip: Add ChangeLog entries for r5916. ------------------------------------------------------------------------ r5919 | vasil | 2009-09-16 14:37:13 -0400 (Wed, 16 Sep 2009) | 4 lines branches/zip: Whitespace cleanup in the ChangeLog. ------------------------------------------------------------------------ r5917 | marko | 2009-09-16 05:56:23 -0400 (Wed, 16 Sep 2009) | 1 line branches/zip: innobase_get_cset_width(): Cache the value of current_thd. ------------------------------------------------------------------------ r5916 | marko | 2009-09-16 05:54:43 -0400 (Wed, 16 Sep 2009) | 128 lines branches/zip: Merge revisions 5622:5912 from branches/5.1, except r5700 (changes to CMakeLists.txt) ------------------------------------------------------------------------ r5622 | vasil | 2009-08-03 15:27:00 +0300 (Mon, 03 Aug 2009) | 20 lines Changed paths: M /branches/5.1/Makefile.am branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2988 committer: Satya B <satya.bn@sun.com> branch nick: mysql-5.1-bugteam timestamp: Wed 2009-07-01 11:06:05 +0530 message: Fix build failure after applying Innodb snapshot 5.1-ss5282 After applying Innodb snapshot 5.1-ss5282, build was broken because of missing header file. Adding the header file to Makefile.am after informing the innodb developers. modified: storage/innobase/Makefile.am ------------------------------------------------------------------------ r5740 | jyang | 2009-09-03 06:33:47 +0300 (Thu, 03 Sep 2009) | 5 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/db0err.h A /branches/5.1/mysql-test/innodb_bug46000.result A /branches/5.1/mysql-test/innodb_bug46000.test branches/5.1: Disallow creating index with the name of "GEN_CLUST_INDEX" which is reserved for the default system primary index. (Bug #46000) rb://149 approved by Sunny Bains. ------------------------------------------------------------------------ r5741 | jyang | 2009-09-03 07:16:01 +0300 (Thu, 03 Sep 2009) | 5 lines Changed paths: M /branches/5.1/dict/dict0dict.c M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug44369.result A /branches/5.1/mysql-test/innodb_bug44369.test M /branches/5.1/row/row0mysql.c branches/5.1: Block creating table with column name conflicting with Innodb reserved key words. (Bug #44369) rb://151 approved by Sunny Bains. ------------------------------------------------------------------------ r5757 | jyang | 2009-09-04 04:26:13 +0300 (Fri, 04 Sep 2009) | 3 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/db0err.h D /branches/5.1/mysql-test/innodb_bug46000.result D /branches/5.1/mysql-test/innodb_bug46000.test branches/5.1: Revert change in 5740. Making the fix in a subsequent check in. ------------------------------------------------------------------------ r5760 | jyang | 2009-09-04 07:07:34 +0300 (Fri, 04 Sep 2009) | 3 lines Changed paths: M /branches/5.1/dict/dict0dict.c M /branches/5.1/handler/ha_innodb.cc D /branches/5.1/mysql-test/innodb_bug44369.result D /branches/5.1/mysql-test/innodb_bug44369.test M /branches/5.1/row/row0mysql.c branches/5.1: This is to revert change 5741. A return status for create_table_def() needs to be fixed. ------------------------------------------------------------------------ r5797 | calvin | 2009-09-09 18:26:29 +0300 (Wed, 09 Sep 2009) | 3 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: merge change from 5.1.38 HA_ERR_TOO_MANY_CONCURRENT_TRXS is added in 5.1.38. ------------------------------------------------------------------------ r5799 | calvin | 2009-09-09 20:47:31 +0300 (Wed, 09 Sep 2009) | 10 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: fix bug#46256 Allow tables to be dropped even if the collation is not found, but issue a warning. Could not find an easy way to add mysql-test since it requires changes to charsets and restarting the server. Tests were executed manually. Approved by: Heikki (on IM) ------------------------------------------------------------------------ r5805 | vasil | 2009-09-10 08:41:48 +0300 (Thu, 10 Sep 2009) | 7 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix a compilation warning caused by c5799: handler/ha_innodb.cc: In function 'void innobase_get_cset_width(ulint, ulint*, ulint*)': handler/ha_innodb.cc:830: warning: format '%d' expects type 'int', but argument 2 has type 'ulint' ------------------------------------------------------------------------ r5834 | jyang | 2009-09-11 00:43:05 +0300 (Fri, 11 Sep 2009) | 5 lines Changed paths: M /branches/5.1/dict/dict0dict.c M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug44369.result A /branches/5.1/mysql-test/innodb_bug44369.test M /branches/5.1/row/row0mysql.c branches/5.1: Block creating table with column name conflicting with Innodb reserved key words. (Bug #44369) rb://151 approved by Sunny Bains. ------------------------------------------------------------------------ r5895 | jyang | 2009-09-15 03:39:21 +0300 (Tue, 15 Sep 2009) | 5 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc A /branches/5.1/mysql-test/innodb_bug46000.result A /branches/5.1/mysql-test/innodb_bug46000.test branches/5.1: Disallow creating index with the name of "GEN_CLUST_INDEX" which is reserved for the default system primary index. (Bug #46000) rb://149 approved by Marko Makela. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5910 | marko | 2009-09-16 04:07:21 -0400 (Wed, 16 Sep 2009) | 9 lines branches/zip: Introduce UNIV_LOG_LSN_DEBUG and MLOG_LSN for redo log diagnostics. This was written in order to better track down Issue #313 in InnoDB Hot Backup. MLOG_LSN: A new redo log entry type, for recording the current log sequence number (LSN). This will be checked in an assertion in recv_parse_log_rec(). rb://161, discussed with Sunny and Vasil. ------------------------------------------------------------------------ r5899 | marko | 2009-09-15 07:26:01 -0400 (Tue, 15 Sep 2009) | 4 lines branches/zip: ut0ut.h: Do not #include "os0sync.h" #ifdef UNIV_HOTBACKUP. Since r5872, the InnoDB Hot Backup build was broken. Fix it by not defining any thread synchronization primitives in ut0ut.h. InnoDB Hot Backup is a single-threaded program. ------------------------------------------------------------------------ r5898 | marko | 2009-09-15 06:18:50 -0400 (Tue, 15 Sep 2009) | 2 lines branches/zip: Add */.dirstamp to svn:ignore, for https://svn.innodb.com/svn/hotbackup/branches/3.5 ------------------------------------------------------------------------ r5897 | marko | 2009-09-15 04:29:00 -0400 (Tue, 15 Sep 2009) | 8 lines branches/zip: Avoid bogus messages about latching order violations when UNIV_SYNC_DEBUG is defined. sync_thread_levels_g(): Add the parameter "warn". Do not print anything unless it is set. sync_thread_add_level(): Pass warn=TRUE to sync_thread_levels_g() when the check is within an assertion; FALSE if it is not. ------------------------------------------------------------------------ r5893 | inaam | 2009-09-14 11:20:48 -0400 (Mon, 14 Sep 2009) | 10 lines branches/zip rb://159 In case of pages that are not made young the counter is incremented only when the page in question is 'old'. In case of pages that are made young the counter is incremented in case of all pages. For apple to apple comparison this patch changes the 'young-making' counter to consider only 'old' blocks. Approved by: Marko ------------------------------------------------------------------------ r5889 | vasil | 2009-09-14 05:17:18 -0400 (Mon, 14 Sep 2009) | 5 lines branches/zip: Add missing return statement in the test program that could have caused a warning. ------------------------------------------------------------------------ r5888 | vasil | 2009-09-14 04:38:45 -0400 (Mon, 14 Sep 2009) | 40 lines branches/zip: Back-merge c5880 and c5881 from branches/embedded-1.0: ------------------------------------------------------------------------ r5880 | vasil | 2009-09-12 17:28:44 +0300 (Sat, 12 Sep 2009) | 18 lines Changed paths: M /branches/embedded-1.0/configure.in M /branches/embedded-1.0/include/os0sync.h M /branches/embedded-1.0/srv/srv0start.c branches/embedded-1.0: Clean up and simplify the code that surrounds the atomic ops: * Simplify the code that prints what atomics are used: Instead of repeating the same conditions on which each atomics are used use just one printf that prints a variable defined by the code which chooses what atomics to use. * In os0sync.h pick up each atomic variant only if it has been selected by autoconf (based on IB_ATOMIC_MODE_* macros). Define the startup message to be printed. * In configure.in: check what user has chosen and if he has chosen something that is not available, emit an error. If nothing has been chosen explicitly by the user, auto select an option according to the described logic in configure.in. ------------------------------------------------------------------------ r5881 | vasil | 2009-09-12 20:08:27 +0300 (Sat, 12 Sep 2009) | 4 lines Changed paths: M /branches/embedded-1.0/configure.in branches/embedded-1.0: Fix syntax error in test program. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5875 | vasil | 2009-09-12 08:11:25 -0400 (Sat, 12 Sep 2009) | 4 lines branches/zip: Remove unnecessary macro. ------------------------------------------------------------------------ r5872 | vasil | 2009-09-12 05:35:17 -0400 (Sat, 12 Sep 2009) | 5 lines branches/zip: Explicitly include os0sync.h to the places where HAVE_ATOMIC_BUILTINS and INNODB_RW_LOCKS_USE_ATOMICS are used to avoid potential problems. ------------------------------------------------------------------------ r5871 | vasil | 2009-09-12 05:25:44 -0400 (Sat, 12 Sep 2009) | 6 lines branches/zip: Rename HAVE_SOLARIS_ATOMICS to HAVE_IB_SOLARIS_ATOMICS and IB_HAVE_PAUSE_INSTRUCTION to HAVE_IB_PAUSE_INSTRUCTION so they all follow the same HAVE_IB_* convention. ------------------------------------------------------------------------ r5870 | vasil | 2009-09-12 05:13:44 -0400 (Sat, 12 Sep 2009) | 7 lines branches/zip: Define HAVE_ATOMIC_BUILTINS and INNODB_RW_LOCKS_USE_ATOMICS in os0sync.h instead of in univ.i. The code expects os_*() macros to be present if HAVE_ATOMIC_BUILTINS and INNODB_RW_LOCKS_USE_ATOMICS are defined. So define them next to defining the os_*() macros. ------------------------------------------------------------------------ r5869 | vasil | 2009-09-12 04:33:11 -0400 (Sat, 12 Sep 2009) | 15 lines branches/zip: Include ut0auxconf.h only if none of the macros it would define is defined. The check when to include this header was outdated from the time when there was only one macro involved. Move the atomics checks that are in univ.i outside of #if windows ... #else ... #endif This simplifies the code and removes some duplicates like defining HAVE_ATOMIC_BUILTINS if HAVE_WINDOWS_ATOMICS is defined in both branches. Do not define the same macro HAVE_ATOMIC_PTHREAD_T for different events. Instead define HAVE_IB_ATOMIC_PTHREAD_T_GCC and HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS. ------------------------------------------------------------------------ r5868 | vasil | 2009-09-12 04:01:17 -0400 (Sat, 12 Sep 2009) | 6 lines branches/zip: Move the check whether to include ut0auxconf.h before everything because we are now even checking for GCC atomics, we relied on MySQL to define this macro before. ------------------------------------------------------------------------ r5867 | vasil | 2009-09-12 03:43:45 -0400 (Sat, 12 Sep 2009) | 4 lines branches/zip: Update comment to reflect reality. ------------------------------------------------------------------------ r5866 | vasil | 2009-09-12 03:30:08 -0400 (Sat, 12 Sep 2009) | 5 lines branches/zip: Add the check for GCC atomics to ut0auxconf* (copied from plug.in) because we no longer rely on MySQL's HAVE_GCC_ATOMIC_BUILTINS. ------------------------------------------------------------------------ r5865 | vasil | 2009-09-12 03:26:03 -0400 (Sat, 12 Sep 2009) | 10 lines branches/zip: Simplify the compile time checks by splittig them into 5 independent checks: * Whether GCC atomics are available * Whether pthread_t can be used by GCC atomics * Whether Solaris libc atomics are available * Whether pthread_t can be used by Solaris libs atomics * Checking the size of pthread_t ------------------------------------------------------------------------ r5864 | vasil | 2009-09-12 03:22:55 -0400 (Sat, 12 Sep 2009) | 4 lines branches/zip: Include string.h which is needed for memset(). ------------------------------------------------------------------------ r5863 | vasil | 2009-09-12 03:07:08 -0400 (Sat, 12 Sep 2009) | 10 lines branches/zip: Check that pthread_t can indeed be passed to Solaris atomic functions, instead of assuming that it can be passed if 0 can be assigned to it. It could be that: * 0 can be assigned, but pthread_t cannot be passed and * 0 cannot be assigned but pthread_t can be passed Better to check what we are interested in, not something else and make assumptions. ------------------------------------------------------------------------ r5858 | vasil | 2009-09-11 13:46:47 -0400 (Fri, 11 Sep 2009) | 4 lines branches/zip: Fix the indentation of the closing bracket. ------------------------------------------------------------------------ r5826 | marko | 2009-09-10 07:29:46 -0400 (Thu, 10 Sep 2009) | 12 lines branches/zip: Roll back recovered dictionary transactions before dropping incomplete indexes (Issue #337). trx_rollback_or_clean_recovered(ibool all): New function, split from trx_rollback_or_clean_all_recovered(). all==FALSE will only roll back dictionary transactions. recv_recovery_from_checkpoint_finish(): Call trx_rollback_or_clean_recovered(FALSE) before row_merge_drop_temp_indexes(). rb://158 approved by Sunny Bains ------------------------------------------------------------------------ r5825 | marko | 2009-09-10 06:47:09 -0400 (Thu, 10 Sep 2009) | 20 lines branches/zip: Reduce mutex contention that was introduced when addressing Bug #45015 (Issue #316), in r5703. buf_page_set_accessed_make_young(): New auxiliary function, called by buf_page_get_zip(), buf_page_get_gen(), buf_page_optimistic_get_func(). Call ut_time_ms() outside of buf_pool_mutex. Use cached access_time. buf_page_set_accessed(): Add the parameter time_ms, so that ut_time_ms() need not be called while holding buf_pool_mutex. buf_page_optimistic_get_func(), buf_page_get_known_nowait(): Read buf_page_t::access_time without holding buf_pool_mutex. This should be OK, because the field is only used for heuristic purposes. buf_page_peek_if_too_old(): If buf_pool->freed_page_clock == 0, return FALSE, so that we will not waste time moving blocks in the LRU list in the warm-up phase or when the workload fits in the buffer pool. rb://156 approved by Sunny Bains ------------------------------------------------------------------------ r5822 | marko | 2009-09-10 06:10:20 -0400 (Thu, 10 Sep 2009) | 1 line branches/zip: buf_page_release(): De-stutter the function comment. ------------------------------------------------------------------------ r5804 | marko | 2009-09-10 01:29:31 -0400 (Thu, 10 Sep 2009) | 1 line branches/zip: trx_cleanup_at_db_startup(): Fix a typo in comment. ------------------------------------------------------------------------ r5798 | calvin | 2009-09-09 11:28:10 -0400 (Wed, 09 Sep 2009) | 5 lines branches/zip: HA_ERR_TOO_MANY_CONCURRENT_TRXS is added in 5.1.38. But the plugin should still work with previous versions of MySQL. ------------------------------------------------------------------------ r5792 | vasil | 2009-09-09 09:35:58 -0400 (Wed, 09 Sep 2009) | 32 lines branches/zip: Fix a bug in manipulating the variable innodb_old_blocks_pct: for any value assigned it got that value -1, except for 75. When assigned 75, it got 75. mysql> set global innodb_old_blocks_pct=15; Query OK, 0 rows affected (0.00 sec) mysql> show variables like 'innodb_old_blocks_pct'; +-----------------------+-------+ | Variable_name | Value | +-----------------------+-------+ | innodb_old_blocks_pct | 14 | +-----------------------+-------+ 1 row in set (0.00 sec) mysql> set global innodb_old_blocks_pct=75; Query OK, 0 rows affected (0.00 sec) mysql> show variables like 'innodb_old_blocks_pct'; +-----------------------+-------+ | Variable_name | Value | +-----------------------+-------+ | innodb_old_blocks_pct | 75 | +-----------------------+-------+ After the fix it gets exactly what was assigned. Approved by: Marko (via IM) ------------------------------------------------------------------------ r5783 | marko | 2009-09-09 03:25:00 -0400 (Wed, 09 Sep 2009) | 1 line branches/zip: buf_page_is_accessed(): Correct the function comment. ------------------------------------------------------------------------ r5782 | marko | 2009-09-09 03:00:59 -0400 (Wed, 09 Sep 2009) | 2 lines branches/zip: buf_page_peek_if_too_old(): Silence a compiler warning that was introduced in r5779 on 32-bit systems. ------------------------------------------------------------------------ r5780 | marko | 2009-09-09 02:50:50 -0400 (Wed, 09 Sep 2009) | 1 line branches/zip: ut_time_ms(): Return ulint, not uint. ------------------------------------------------------------------------ r5779 | marko | 2009-09-09 02:17:19 -0400 (Wed, 09 Sep 2009) | 2 lines branches/zip: buf_page_peek_if_too_old(): Make the bitmasking work when buf_pool->freed_page_clock is wider than 32 bits. ------------------------------------------------------------------------ r5777 | marko | 2009-09-08 11:50:25 -0400 (Tue, 08 Sep 2009) | 2 lines branches/zip: Remove BUF_LRU_INITIAL_RATIO, which should have been removed together with buf_LRU_get_recent_limit(). ------------------------------------------------------------------------ r5775 | calvin | 2009-09-07 17:15:05 -0400 (Mon, 07 Sep 2009) | 13 lines branches/zip: Build InnoDB on Windows with UNIV_HOTBACKUP The changes are non-functional changes for normal InnoDB, but needed for building the Hot Backup on Windows (with UNIV_HOTBACKUP defined). - Define os_aio_use_native_aio for HB. - Do not acquire seek mutexes for backup since HB is single threaded. - Do not use srv_flush_log_at_trx_commit for HB build rb://155 Approved by: Marko ------------------------------------------------------------------------ r5752 | marko | 2009-09-03 10:55:51 -0400 (Thu, 03 Sep 2009) | 10 lines branches/zip: recv_recover_page_func(): Write the log sequence number to the compressed page, if there is one. Previously, the function only wrote the LSN to the uncompressed page. It is not clear why recv_recover_page_func() is updating FIL_PAGE_LSN in the buffer pool. The log sequence number will be stamped on the page when it is flushed to disk, in buf_flush_init_for_writing(). I noticed this inconsistency when analyzing Issue #313, but this patch does not fix it. That is no surprise, since FIL_PAGE_LSN should only matter on disk files, not in the buffer pool. ------------------------------------------------------------------------ r5751 | marko | 2009-09-03 10:36:15 -0400 (Thu, 03 Sep 2009) | 7 lines branches/zip: row_merge(): Remove a bogus debug assertion that was triggered when creating an index on an empty table. row_merge_sort(): Add debug assertions and comments that justify the loop termination condition. The bogus assertion ut_ad(ihalf > 0) was reported by Michael. ------------------------------------------------------------------------ r5748 | marko | 2009-09-03 07:05:44 -0400 (Thu, 03 Sep 2009) | 1 line branches/zip: MLOG_MULTI_REC_END: Correct the comment. ------------------------------------------------------------------------ r5747 | marko | 2009-09-03 06:46:38 -0400 (Thu, 03 Sep 2009) | 2 lines branches/zip: recv_scan_log_recs(): Replace while with do...while, because the termination condition will always hold on the first iteration. ------------------------------------------------------------------------ r5746 | marko | 2009-09-03 04:55:36 -0400 (Thu, 03 Sep 2009) | 2 lines branches/zip: log_reserve_and_write_fast(): Do not cache the log_sys pointer in a local variable. ------------------------------------------------------------------------ r5745 | marko | 2009-09-03 04:38:22 -0400 (Thu, 03 Sep 2009) | 2 lines branches/zip: log_check_log_recs(): Enclose in #ifdef UNIV_LOG_DEBUG. Add const qualifiers. ------------------------------------------------------------------------ r5744 | marko | 2009-09-03 04:28:35 -0400 (Thu, 03 Sep 2009) | 1 line branches/zip: ut_align(): Make ptr const, like in ut_align_down(). ------------------------------------------------------------------------ r5743 | marko | 2009-09-03 02:36:12 -0400 (Thu, 03 Sep 2009) | 3 lines branches/zip: log_reserve_and_write_fast(): Remove the redundant output parameter "success". Success is also indicated by a nonzero return value. ------------------------------------------------------------------------ r5736 | marko | 2009-09-02 03:53:19 -0400 (Wed, 02 Sep 2009) | 1 line branches/zip: Enclose some timestamp functions in #ifndef UNIV_HOTBACKUP. ------------------------------------------------------------------------ r5735 | marko | 2009-09-02 03:43:09 -0400 (Wed, 02 Sep 2009) | 2 lines branches/zip: univ.i: Do not undefine PACKAGE or VERSION. InnoDB source code does not refer to these macros. ------------------------------------------------------------------------ r5734 | sunny | 2009-09-02 03:08:45 -0400 (Wed, 02 Sep 2009) | 2 lines branches/zip: Update ChangeLog with r5733 changes. ------------------------------------------------------------------------ r5733 | sunny | 2009-09-02 03:05:15 -0400 (Wed, 02 Sep 2009) | 6 lines branches/zip: Fix a regression introduced by the fix for bug#26316. We check whether a transaction holds any AUTOINC locks before we acquire the kernel mutex and release those locks. Fix for rb://153. Approved by Marko. ------------------------------------------------------------------------ r5716 | vasil | 2009-08-31 03:47:49 -0400 (Mon, 31 Aug 2009) | 9 lines branches/zip: Fix Bug#46718 InnoDB plugin incompatible with gcc 4.1 (at least: on PPC): "Undefined symbol" by implementing our own check in plug.in instead of using the result from the check from MySQL because it is insufficient. Approved by: Marko (rb://154) ------------------------------------------------------------------------ r5714 | marko | 2009-08-31 02:10:10 -0400 (Mon, 31 Aug 2009) | 5 lines branches/zip: buf_chunk_not_freed(): Do not acquire block->mutex unless block->page.state == BUF_BLOCK_FILE_PAGE. Check that block->page.state makes sense. Approved by Sunny Bains over the IM. ------------------------------------------------------------------------ r5709 | inaam | 2009-08-28 02:22:46 -0400 (Fri, 28 Aug 2009) | 5 lines branches/zip rb://152 Disable display of deprecated parameter innodb_file_io_threads in 'show variables'. ------------------------------------------------------------------------ r5708 | inaam | 2009-08-27 18:43:32 -0400 (Thu, 27 Aug 2009) | 4 lines branches/zip Remove redundant TRUE : FALSE from the return statement ------------------------------------------------------------------------ r5707 | inaam | 2009-08-27 12:20:35 -0400 (Thu, 27 Aug 2009) | 6 lines branches/zip Remove unused macros as we erased the random readahead code in r5703. Also fixed some comments. ------------------------------------------------------------------------ r5706 | inaam | 2009-08-27 12:00:27 -0400 (Thu, 27 Aug 2009) | 20 lines branches/zip rb://147 Done away with following two status variables: innodb_buffer_pool_read_ahead_rnd innodb_buffer_pool_read_ahead_seq Introduced two new status variables: innodb_buffer_pool_read_ahead = number of pages read as part of readahead since server startup innodb_buffer_pool_read_ahead_evicted = number of pages that are read in as readahead but were evicted before ever being accessed since server startup i.e.: a measure of how badly our readahead is performing SHOW INNODB STATUS will show two extra numbers in buffer pool section: pages read ahead/sec and pages evicted without access/sec Approved by: Marko ------------------------------------------------------------------------ r5705 | marko | 2009-08-27 07:56:24 -0400 (Thu, 27 Aug 2009) | 11 lines branches/zip: dict_index_find_cols(): On column name lookup failure, return DB_CORRUPTION (HA_ERR_CRASHED) instead of abnormally terminating the server. Also, disable the previously added diagnostic output to the error log, because mysql-test-run does not like extra output in the error log. (Bug #44571) dict_index_add_to_cache(): Handle errors from dict_index_find_cols(). mysql-test/innodb_bug44571.test: A test case for triggering the bug. rb://135 approved by Sunny Bains. ------------------------------------------------------------------------ r5704 | marko | 2009-08-27 04:31:17 -0400 (Thu, 27 Aug 2009) | 32 lines branches/zip: Fix a critical bug in fast index creation that could corrupt the created indexes. row_merge(): Make "half" an in/out parameter. Determine the offset of half the output file. Copy the last blocks record-by-record instead of block-by-block, so that the records can be counted. Check that the input and output have matching n_rec. row_merge_sort(): Do not assume that two blocks of size N are merged into a block of size 2*N. The output block can be shorter than the input if the last page of each input block is almost empty. Use an accurate termination condition, based on the "half" computed by row_merge(). row_merge_read(), row_merge_write(), row_merge_blocks(): Add debug output. merge_file_t, row_merge_file_create(): Add n_rec, the number of records in the merge file. row_merge_read_clustered_index(): Update n_rec. row_merge_blocks(): Update and check n_rec. row_merge_blocks_copy(): New function, for copying the last blocks in row_merge(). Update and check n_rec. This bug was discovered with a user-supplied test case that creates an index where the initial temporary file is 249 one-megabyte blocks and the merged files become smaller. In the test, possible merge record sizes are 10, 18, and 26 bytes. rb://150 approved by Sunny Bains. This addresses Issue #320. ------------------------------------------------------------------------ r5703 | marko | 2009-08-27 03:25:00 -0400 (Thu, 27 Aug 2009) | 41 lines branches/zip: Replace the constant 3/8 ratio that controls the LRU_old size with the settable global variable innodb_old_blocks_pct. The minimum and maximum values are 5 and 95 per cent, respectively. The default is 100*3/8, in line with the old behavior. ut_time_ms(): New utility function, to return the current time in milliseconds. TODO: Is there a more efficient timestamp function, such as rdtsc divided by a power of two? buf_LRU_old_threshold_ms: New variable, corresponding to innodb_old_blocks_time. The value 0 is the default behaviour: no timeout before making blocks 'new'. bpage->accessed, bpage->LRU_position, buf_pool->ulint_clock: Remove. bpage->access_time: New field, replacing bpage->accessed. Protected by buf_pool_mutex instead of bpage->mutex. Updated when a page is created or accessed the first time in the buffer pool. buf_LRU_old_ratio, innobase_old_blocks_pct: New variables, corresponding to innodb_old_blocks_pct buf_LRU_old_ratio_update(), innobase_old_blocks_pct_update(): Update functions for buf_LRU_old_ratio, innobase_old_blocks_pct. buf_page_peek_if_too_old(): Compare ut_time_ms() to bpage->access_time if buf_LRU_old_threshold_ms && bpage->old. Else observe buf_LRU_old_ratio and bpage->freed_page_clock. buf_pool_t: Add n_pages_made_young, n_pages_not_made_young, n_pages_made_young_old, n_pages_not_made_young, for statistics. buf_print(): Display buf_pool->n_pages_made_young, buf_pool->n_pages_not_made_young. This function is only for crash diagnostics. buf_print_io(): Display buf_pool->LRU_old_len and quantities derived from buf_pool->n_pages_made_young, buf_pool->n_pages_not_made_young. This function is invoked by SHOW ENGINE INNODB STATUS. rb://129 approved by Heikki Tuuri. This addresses Bug #45015. ------------------------------------------------------------------------ r5702 | marko | 2009-08-27 03:03:15 -0400 (Thu, 27 Aug 2009) | 1 line branches/zip: Document also the files affected by r5698 in the ChangeLog. ------------------------------------------------------------------------ r5701 | marko | 2009-08-27 03:01:42 -0400 (Thu, 27 Aug 2009) | 1 line branches/zip: Document r5698 in the ChangeLog. ------------------------------------------------------------------------ r5698 | inaam | 2009-08-26 10:34:35 -0400 (Wed, 26 Aug 2009) | 13 lines branches/zip bug#42885 rb://148 The call to put IO threads to sleep was most probably meant for Windows only as the comment in buf0rea.c suggests. However it was enabled on all platforms. This patch restricts the sleep call to windows. This approach of not putting threads to sleep makes even more sense because now we have multiple threads working in the background and it probably is not a good idea to put all of them to sleep because a user thread wants to post a batch for readahead. Approved by: Marko ------------------------------------------------------------------------ r5697 | vasil | 2009-08-26 09:44:40 -0400 (Wed, 26 Aug 2009) | 4 lines branches/zip: Fix typo. ------------------------------------------------------------------------ r5696 | vasil | 2009-08-26 09:15:59 -0400 (Wed, 26 Aug 2009) | 14 lines branches/zip: Merge a change from MySQL: http://lists.mysql.com/commits/80832 2968 Jonathan Perkin 2009-08-14 Build fixes for Windows, AIX, HP/UX and Sun Studio11, from Timothy Smith. modified: CMakeLists.txt cmd-line-utils/readline/util.c storage/innodb_plugin/handler/i_s.cc storage/innodb_plugin/include/univ.i ------------------------------------------------------------------------ r5695 | marko | 2009-08-26 09:14:59 -0400 (Wed, 26 Aug 2009) | 1 line branches/zip: UNIV_DEBUG_LOCK_VALIDATE: Move the definition to univ.i. ------------------------------------------------------------------------ r5694 | marko | 2009-08-26 07:25:26 -0400 (Wed, 26 Aug 2009) | 2 lines branches/zip: buf_page_t: Clarify that bpage->list may contain garbage. This comment was provoked by Inaam. ------------------------------------------------------------------------ r5687 | vasil | 2009-08-20 05:20:22 -0400 (Thu, 20 Aug 2009) | 8 lines branches/zip: ChangeLog: Follow the convention from the rest of the ChangeLog: for bugfixes from bugs.mysql.com only the bug number and title goes in the ChangeLog. Detailed explanation on what is the problem and how it was fixed is present in the bugs database. ------------------------------------------------------------------------ r5686 | vasil | 2009-08-20 05:15:05 -0400 (Thu, 20 Aug 2009) | 4 lines branches/zip: White-space fixup. ------------------------------------------------------------------------ r5685 | sunny | 2009-08-20 04:18:29 -0400 (Thu, 20 Aug 2009) | 2 lines branches/zip: Update the ChangeLog with r5684 change. ------------------------------------------------------------------------ r5684 | sunny | 2009-08-20 04:05:30 -0400 (Thu, 20 Aug 2009) | 10 lines branches/zip: Fix bug# 46650: Innodb assertion autoinc_lock == lock in lock_table_remove_low on INSERT SELECT We only store the autoinc locks that are granted in the transaction's autoinc lock vector. A transacton, that has been rolled back due to a deadlock because of an AUTOINC lock attempt, will not have added that lock to the vector. We need to check for that when we remove that lock. rb://145 Approved by Marko. ------------------------------------------------------------------------ r5681 | sunny | 2009-08-14 02:16:24 -0400 (Fri, 14 Aug 2009) | 3 lines branches/zip: When building HotBackup srv_use_sys_malloc is #ifdef out. We move access to the this variable within a !UNIV_HOTBACKUP block. ------------------------------------------------------------------------ r5671 | marko | 2009-08-13 04:46:33 -0400 (Thu, 13 Aug 2009) | 5 lines branches/zip: ha_innobase::add_index(): Fix Bug #46557: after a successful operation, read innodb_table->flags from the newly created table object, not from the old one that was just freed. Approved by Sunny. ------------------------------------------------------------------------ r5670 | marko | 2009-08-12 09:16:37 -0400 (Wed, 12 Aug 2009) | 2 lines branches/zip: trx_undo_rec_copy(): Add const qualifier to undo_rec. This is a non-functional change. ------------------------------------------------------------------------ r5663 | marko | 2009-08-11 07:42:37 -0400 (Tue, 11 Aug 2009) | 2 lines branches/zip: trx_general_rollback_for_mysql(): Remove the redundant parameter partial. If savept==NULL, partial==FALSE. ------------------------------------------------------------------------ r5662 | marko | 2009-08-11 05:54:16 -0400 (Tue, 11 Aug 2009) | 1 line branches/zip: Bump the version number to 1.0.5 after releasing 1.0.4. ------------------------------------------------------------------------ r5642 | calvin | 2009-08-06 19:04:03 -0400 (Thu, 06 Aug 2009) | 2 lines branches/zip: remove duplicate "the" in comments. ------------------------------------------------------------------------ r5639 | marko | 2009-08-06 06:39:34 -0400 (Thu, 06 Aug 2009) | 3 lines branches/zip: mem_heap_block_free(): If innodb_use_sys_malloc is set, do not tell Valgrind that the memory is free, to avoid a bogus warning in Valgrind's built-in free() hook. ------------------------------------------------------------------------ r5636 | marko | 2009-08-05 08:27:30 -0400 (Wed, 05 Aug 2009) | 2 lines branches/zip: lock_rec_validate_page(): Add the parameter zip_size. This should help track down Mantis Issue #289. ------------------------------------------------------------------------ r5635 | marko | 2009-08-05 07:06:55 -0400 (Wed, 05 Aug 2009) | 2 lines branches/zip: Replace <number> with NUMBER in some comments, to avoid problems with Doxygen XML output. ------------------------------------------------------------------------ r5629 | marko | 2009-08-04 07:42:44 -0400 (Tue, 04 Aug 2009) | 1 line branches/zip: mysql-test: Pass MTR's internal checks. ------------------------------------------------------------------------ r5626 | vasil | 2009-08-04 01:53:31 -0400 (Tue, 04 Aug 2009) | 4 lines branches/zip: Revert the dummy change from c5625. ------------------------------------------------------------------------ r5625 | vasil | 2009-08-04 01:52:48 -0400 (Tue, 04 Aug 2009) | 32 lines branches/zip: Merge 5518:5622 from branches/5.1, resolving conflict in r5622 (after resolving the conflict Makefile.am was not changed so I have made a dummy change so I can commit and thus record that branches/5.1 has been merged in branches/zip up to 5622): ------------------------------------------------------------------------ r5622 | vasil | 2009-08-03 15:27:00 +0300 (Mon, 03 Aug 2009) | 20 lines Changed paths: M /branches/5.1/Makefile.am branches/5.1: Merge a change from MySQL: ------------------------------------------------------------ revno: 2988 committer: Satya B <satya.bn@sun.com> branch nick: mysql-5.1-bugteam timestamp: Wed 2009-07-01 11:06:05 +0530 message: Fix build failure after applying Innodb snapshot 5.1-ss5282 After applying Innodb snapshot 5.1-ss5282, build was broken because of missing header file. Adding the header file to Makefile.am after informing the innodb developers. modified: storage/innobase/Makefile.am ------------------------------------------------------------------------ ------------------------------------------------------------------------ r5614 | vasil | 2009-07-31 11:09:07 -0400 (Fri, 31 Jul 2009) | 6 lines branches/zip: Add fsp0types.h to the list of noinst_HEADERS Suggested by: Sergey Vojtovich <svoj@sun.com> ------------------------------------------------------------------------ r5539 | vasil | 2009-07-21 06:28:27 -0400 (Tue, 21 Jul 2009) | 4 lines branches/zip: Add a test program to check whether the PAUSE instruction is available. ------------------------------------------------------------------------ r5537 | vasil | 2009-07-21 05:31:26 -0400 (Tue, 21 Jul 2009) | 5 lines branches/zip: Fixups in ChangeLog: sort filenames alphabetically and wrap to 78 chars per line. ------------------------------------------------------------------------ r5527 | sunny | 2009-07-20 17:56:30 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: For HotBackup builds we don't want to hide the symbols. ------------------------------------------------------------------------ r5525 | calvin | 2009-07-20 13:14:30 -0400 (Mon, 20 Jul 2009) | 2 lines branches/zip: add ChangeLog entry for r5524. ------------------------------------------------------------------------
16 years ago
20 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
20 years ago
20 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
MDEV-11254: innodb-use-trim has no effect in 10.2 Problem was that implementation merged from 10.1 was incompatible with InnoDB 5.7. buf0buf.cc: Add functions to return should we punch hole and how big. buf0flu.cc: Add written page to IORequest fil0fil.cc: Remove unneeded status call and add test is sparse files and punch hole supported by file system when tablespace is created. Add call to get file system block size. Used file node is added to IORequest. Added functions to check is punch hole supported and setting punch hole. ha_innodb.cc: Remove unneeded status variables (trim512-32768) and trim_op_saved. Deprecate innodb_use_trim and set it ON by default. Add function to set innodb-use-trim dynamically. dberr.h: Add error code DB_IO_NO_PUNCH_HOLE if punch hole operation fails. fil0fil.h: Add punch_hole variable to fil_space_t and block size to fil_node_t. os0api.h: Header to helper functions on buf0buf.cc and fil0fil.cc for os0file.h os0file.h: Remove unneeded m_block_size from IORequest and add bpage to IORequest to know actual size of the block and m_fil_node to know tablespace file system block size and does it support punch hole. os0file.cc: Add function punch_hole() to IORequest to do punch_hole operation, get the file system block size and determine does file system support sparse files (for punch hole). page0size.h: remove implicit copy disable and use this implicit copy to implement copy_from() function. buf0dblwr.cc, buf0flu.cc, buf0rea.cc, fil0fil.cc, fil0fil.h, os0file.h, os0file.cc, log0log.cc, log0recv.cc: Remove unneeded write_size parameter from fil_io calls. srv0mon.h, srv0srv.h, srv0mon.cc: Remove unneeded trim512-trim32678 status variables. Removed these from monitor tests.
9 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
20 years ago
branches/innodb+: Merge revisions 6130:6364 from branches/zip: ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 11:42:56 +0200 (Mon, 02 Nov 2009) | 9 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0sea.c M /branches/zip/buf/buf0buf.c M /branches/zip/dict/dict0dict.c M /branches/zip/fil/fil0fil.c M /branches/zip/ibuf/ibuf0ibuf.c M /branches/zip/include/btr0sea.h M /branches/zip/include/dict0dict.h M /branches/zip/include/fil0fil.h M /branches/zip/include/ibuf0ibuf.h M /branches/zip/include/lock0lock.h M /branches/zip/include/log0log.h M /branches/zip/include/log0recv.h M /branches/zip/include/mem0mem.h M /branches/zip/include/mem0pool.h M /branches/zip/include/os0file.h M /branches/zip/include/pars0pars.h M /branches/zip/include/srv0srv.h M /branches/zip/include/thr0loc.h M /branches/zip/include/trx0i_s.h M /branches/zip/include/trx0purge.h M /branches/zip/include/trx0rseg.h M /branches/zip/include/trx0sys.h M /branches/zip/include/trx0undo.h M /branches/zip/include/usr0sess.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/log/log0recv.c M /branches/zip/mem/mem0dbg.c M /branches/zip/mem/mem0pool.c M /branches/zip/os/os0file.c M /branches/zip/os/os0sync.c M /branches/zip/os/os0thread.c M /branches/zip/pars/lexyy.c M /branches/zip/pars/pars0lex.l M /branches/zip/que/que0que.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c M /branches/zip/sync/sync0arr.c M /branches/zip/sync/sync0sync.c M /branches/zip/thr/thr0loc.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0purge.c M /branches/zip/trx/trx0rseg.c M /branches/zip/trx/trx0sys.c M /branches/zip/trx/trx0undo.c M /branches/zip/usr/usr0sess.c M /branches/zip/ut/ut0mem.c branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 09:57:29 +0200 (Wed, 04 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ r6137 | marko | 2009-11-04 15:24:28 +0200 (Wed, 04 Nov 2009) | 1 line Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_index_too_big_for_undo(): Correct a typo. ------------------------------------------------------------------------ r6153 | vasil | 2009-11-10 15:33:22 +0200 (Tue, 10 Nov 2009) | 145 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6125:6152 from branches/5.1: (everything except the last white-space change was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6127 | vasil | 2009-10-30 11:18:25 +0200 (Fri, 30 Oct 2009) | 18 lines Changed paths: M /branches/5.1/Makefile.am M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Backport c6121 from branches/zip: ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 01:42:11 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/zip/mysql-test/innodb-autoinc.result branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6129 | vasil | 2009-10-30 17:14:22 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/Makefile.am branches/5.1: Revert a change to Makefile.am that sneaked unnoticed in c6127. ------------------------------------------------------------------------ r6136 | marko | 2009-11-04 12:28:10 +0200 (Wed, 04 Nov 2009) | 15 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/ha_prototypes.h M /branches/5.1/ut/ut0ut.c branches/5.1: Port r6134 from branches/zip: ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 07:57:29 +0000 (Wed, 04 Nov 2009) | 5 lines branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ innobase_print_identifier(): Replace with innobase_convert_name(). innobase_convert_identifier(): New function, called by innobase_convert_name(). ------------------------------------------------------------------------ r6149 | vasil | 2009-11-09 11:15:01 +0200 (Mon, 09 Nov 2009) | 5 lines Changed paths: M /branches/5.1/CMakeLists.txt branches/5.1: Followup to r5700: Adjust the changes so they are the same as in the BZR repository. ------------------------------------------------------------------------ r6150 | vasil | 2009-11-09 11:43:31 +0200 (Mon, 09 Nov 2009) | 58 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a part of r2911.5.5 from MySQL: (the other part of this was merged in c5700) ------------------------------------------------------------ revno: 2911.5.5 committer: Vladislav Vaintroub <vvaintroub@mysql.com> branch nick: 5.1-innodb_plugin timestamp: Wed 2009-06-10 10:59:49 +0200 message: Backport WL#3653 to 5.1 to enable bundled innodb plugin. Remove custom DLL loader code from innodb plugin code, use symbols exported from mysqld. removed: storage/innodb_plugin/handler/handler0vars.h storage/innodb_plugin/handler/win_delay_loader.cc added: storage/mysql_storage_engine.cmake win/create_def_file.js modified: CMakeLists.txt include/m_ctype.h include/my_global.h include/my_sys.h include/mysql/plugin.h libmysqld/CMakeLists.txt mysql-test/mysql-test-run.pl mysql-test/t/plugin.test mysql-test/t/plugin_load-master.opt mysys/charset.c sql/CMakeLists.txt sql/handler.h sql/mysql_priv.h sql/mysqld.cc sql/sql_class.cc sql/sql_class.h sql/sql_list.h sql/sql_profile.h storage/Makefile.am storage/archive/CMakeLists.txt storage/blackhole/CMakeLists.txt storage/csv/CMakeLists.txt storage/example/CMakeLists.txt storage/federated/CMakeLists.txt storage/heap/CMakeLists.txt storage/innobase/CMakeLists.txt storage/innobase/handler/ha_innodb.cc storage/innodb_plugin/CMakeLists.txt storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/handler0alter.cc storage/innodb_plugin/handler/i_s.cc storage/innodb_plugin/plug.in storage/myisam/CMakeLists.txt storage/myisammrg/CMakeLists.txt win/Makefile.am win/configure.js ------------------------------------------------------------------------ r6152 | vasil | 2009-11-10 15:30:20 +0200 (Tue, 10 Nov 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6157 | jyang | 2009-11-11 14:27:09 +0200 (Wed, 11 Nov 2009) | 10 lines Changed paths: M /branches/zip/handler/ha_innodb.cc A /branches/zip/mysql-test/innodb_bug47167.result A /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_file_format.result branches/zip: Fix an issue that a local variable defined in innodb_file_format_check_validate() is being referenced across function in innodb_file_format_check_update(). In addition, fix "set global innodb_file_format_check = DEFAULT" call. Bug #47167: "set global innodb_file_format_check" cannot set value by User-Defined Variable." rb://169 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6159 | vasil | 2009-11-11 15:13:01 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ r6160 | vasil | 2009-11-11 15:33:49 +0200 (Wed, 11 Nov 2009) | 72 lines Changed paths: M /branches/zip/include/os0file.h M /branches/zip/os/os0file.c branches/zip: Merge r6152:6159 from branches/5.1: (r6158 was skipped as an equivallent change has already been merged from MySQL) ------------------------------------------------------------------------ r6154 | calvin | 2009-11-11 02:51:17 +0200 (Wed, 11 Nov 2009) | 17 lines Changed paths: M /branches/5.1/include/os0file.h M /branches/5.1/os/os0file.c branches/5.1: fix bug#3139: Mysql crashes: 'windows error 995' after several selects on a large DB During stress environment, Windows AIO may fail with error code ERROR_OPERATION_ABORTED. InnoDB does not handle the error, rather crashes. The cause of the error is unknown, but likely due to faulty hardware or driver. This patch introduces a new error code OS_FILE_OPERATION_ABORTED, which maps to Windows ERROR_OPERATION_ABORTED (995). When the error is detected during AIO, the InnoDB will issue a synchronous retry (read/write). This patch has been extensively tested by MySQL support. Approved by: Marko rb://196 ------------------------------------------------------------------------ r6158 | vasil | 2009-11-11 14:52:14 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/handler/ha_innodb.h branches/5.1: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6161 | vasil | 2009-11-11 15:36:16 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add changelog entry for r6160. ------------------------------------------------------------------------ r6162 | vasil | 2009-11-11 16:00:12 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog for r6157. ------------------------------------------------------------------------ r6163 | calvin | 2009-11-11 17:53:20 +0200 (Wed, 11 Nov 2009) | 8 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Exclude thd_binlog_filter_ok() when building with older version of MySQL. thd_binlog_filter_ok() is introduced in MySQL 5.1.41. But the plugin can be built with MySQL prior to 5.1.41. Approved by Heikki (on IM). ------------------------------------------------------------------------ r6169 | calvin | 2009-11-12 14:40:43 +0200 (Thu, 12 Nov 2009) | 6 lines Changed paths: A /branches/zip/mysql-test/innodb_bug46676.result A /branches/zip/mysql-test/innodb_bug46676.test branches/zip: add test case for bug#46676 This crash is reproducible with InnoDB plugin 1.0.4 + MySQL 5.1.37. But no longer reproducible after MySQL 5.1.38 (with plugin 1.0.5). Add test case to catch future regression. ------------------------------------------------------------------------ r6170 | marko | 2009-11-12 15:49:08 +0200 (Thu, 12 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/db0err.h M /branches/zip/row/row0merge.c M /branches/zip/row/row0mysql.c branches/zip: Allow CREATE INDEX to be interrupted. (Issue #354) rb://183 approved by Heikki Tuuri ------------------------------------------------------------------------ r6175 | vasil | 2009-11-16 20:07:39 +0200 (Mon, 16 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Wrap line at 78th char in the ChangeLog ------------------------------------------------------------------------ r6177 | calvin | 2009-11-16 20:20:38 +0200 (Mon, 16 Nov 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog branches/zip: add an entry to ChangeLog for r6065 ------------------------------------------------------------------------ r6179 | marko | 2009-11-17 10:19:34 +0200 (Tue, 17 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: ha_innobase::change_active_index(): When the history is missing, report it to the client, not to the error log. ------------------------------------------------------------------------ r6181 | vasil | 2009-11-17 12:21:41 +0200 (Tue, 17 Nov 2009) | 33 lines Changed paths: M /branches/zip/mysql-test/innodb-index.test branches/zip: At the end of innodb-index.test: restore the environment as it was before the test was started to silence this warning: MTR's internal check of the test case 'main.innodb-index' failed. This means that the test case does not preserve the state that existed before the test case was executed. Most likely the test case did not do a proper clean-up. This is the diff of the states of the servers before and after the test case was executed: mysqltest: Logging to '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.log'. mysqltest: Results saved in '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result'. mysqltest: Connecting to server localhost:13000 (socket /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/mysqld.1.sock) as 'root', connection 'default', attempt 0 ... mysqltest: ... Connected. mysqltest: Start processing test commands from './include/check-testcase.test' ... mysqltest: ... Done processing test commands. --- /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result 2009-11-17 13:10:40.000000000 +0300 +++ /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.reject 2009-11-17 13:10:54.000000000 +0300 @@ -84,7 +84,7 @@ INNODB_DOUBLEWRITE ON INNODB_FAST_SHUTDOWN 1 INNODB_FILE_FORMAT Antelope -INNODB_FILE_FORMAT_CHECK Antelope +INNODB_FILE_FORMAT_CHECK Barracuda INNODB_FILE_PER_TABLE OFF INNODB_FLUSH_LOG_AT_TRX_COMMIT 1 INNODB_FLUSH_METHOD mysqltest: Result content mismatch not ok ------------------------------------------------------------------------ r6182 | marko | 2009-11-17 13:49:15 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-consistent.result M /branches/zip/mysql-test/innodb-consistent.test M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc.result M /branches/zip/mysql-test/innodb-use-sys-malloc.test M /branches/zip/mysql-test/innodb_bug21704.result M /branches/zip/mysql-test/innodb_bug21704.test M /branches/zip/mysql-test/innodb_bug40360.test M /branches/zip/mysql-test/innodb_bug40565.result M /branches/zip/mysql-test/innodb_bug40565.test M /branches/zip/mysql-test/innodb_bug41904.result M /branches/zip/mysql-test/innodb_bug41904.test M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test M /branches/zip/mysql-test/innodb_bug44032.result M /branches/zip/mysql-test/innodb_bug44032.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test M /branches/zip/mysql-test/innodb_bug44571.result M /branches/zip/mysql-test/innodb_bug44571.test M /branches/zip/mysql-test/innodb_bug45357.test M /branches/zip/mysql-test/innodb_bug46000.result M /branches/zip/mysql-test/innodb_bug46000.test M /branches/zip/mysql-test/innodb_bug46676.result M /branches/zip/mysql-test/innodb_bug46676.test M /branches/zip/mysql-test/innodb_bug47167.result M /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_bug47777.result M /branches/zip/mysql-test/innodb_bug47777.test M /branches/zip/mysql-test/innodb_file_format.result M /branches/zip/mysql-test/innodb_file_format.test branches/zip: Set svn:eol-style on mysql-test files. ------------------------------------------------------------------------ r6183 | marko | 2009-11-17 13:51:16 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-master.opt M /branches/zip/mysql-test/innodb-semi-consistent-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt branches/zip: Prepend loose_ to plugin-only mysql-test options. ------------------------------------------------------------------------ r6184 | marko | 2009-11-17 13:52:01 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-index.result M /branches/zip/mysql-test/innodb-index.test branches/zip: innodb-index.test: Restore innodb_file_format_check. ------------------------------------------------------------------------ r6185 | marko | 2009-11-17 16:44:20 +0200 (Tue, 17 Nov 2009) | 16 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb.result M /branches/zip/mysql-test/innodb.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test D /branches/zip/mysql-test/patches/innodb-index.diff M /branches/zip/row/row0mysql.c branches/zip: Report duplicate table names to the client connection, not to the error log. This change will allow innodb-index.test to be re-enabled. It was previously disabled, because mysql-test-run does not like output in the error log. row_create_table_for_mysql(): Do not output anything to the error log when reporting DB_DUPLICATE_KEY. Let the caller report the error. Add a TODO comment that the dict_table_t object is apparently not freed when an error occurs. create_table_def(): Convert InnoDB table names to the character set of the client connection for reporting. Use my_error(ER_WRONG_COLUMN_NAME) for reporting reserved column names. Report my_error(ER_TABLE_EXISTS_ERROR) when row_create_table_for_mysql() returns DB_DUPLICATE_KEY. rb://206 ------------------------------------------------------------------------ r6186 | vasil | 2009-11-17 16:48:14 +0200 (Tue, 17 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6185. ------------------------------------------------------------------------ r6189 | marko | 2009-11-18 11:36:18 +0200 (Wed, 18 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): When creating the primary key and the table is being locked by another transaction, do not attempt to drop the table. (Bug #48782) Approved by Sunny Bains over IM ------------------------------------------------------------------------ r6194 | vasil | 2009-11-19 09:24:45 +0200 (Thu, 19 Nov 2009) | 5 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.5 to 1.0.6 since 1.0.5 was just released by MySQL and we will soon release 1.0.6. ------------------------------------------------------------------------ r6197 | calvin | 2009-11-19 09:32:55 +0200 (Thu, 19 Nov 2009) | 6 lines Changed paths: M /branches/zip/CMakeLists.txt branches/zip: merge the fix of bug#48317 (CMake file) Due to MySQL changes to the CMake, it is no longer able to build InnoDB plugin as a static library on Windows. The fix is proposed by Vlad of MySQL. ------------------------------------------------------------------------ r6198 | vasil | 2009-11-19 09:44:31 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6197. ------------------------------------------------------------------------ r6199 | vasil | 2009-11-19 12:10:12 +0200 (Thu, 19 Nov 2009) | 31 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0btr.c M /branches/zip/data/data0type.c branches/zip: Merge r6159:6198 from branches/5.1: ------------------------------------------------------------------------ r6187 | jyang | 2009-11-18 05:27:30 +0200 (Wed, 18 Nov 2009) | 9 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Fix bug #48469 "when innodb tablespace is configured too small, crash and corruption!". Function btr_create() did not check the return status of fseg_create(), and continue the index creation even there is no sufficient space. rb://205 Approved by Marko ------------------------------------------------------------------------ r6188 | jyang | 2009-11-18 07:14:23 +0200 (Wed, 18 Nov 2009) | 8 lines Changed paths: M /branches/5.1/data/data0type.c branches/5.1: Fix bug #48526 "Data type for float and double is incorrectly reported in InnoDB table monitor". Certain datatypes are not printed correctly in dtype_print(). rb://204 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6201 | marko | 2009-11-19 14:09:11 +0200 (Thu, 19 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): Clarify the comment on orphaned tables when creating a primary key. ------------------------------------------------------------------------ r6202 | jyang | 2009-11-19 15:01:00 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/zip/btr/btr0btr.c branches/zip: Function fseg_free() is no longer defined in branches/zip. To port fix for bug #48469 to zip, we can use btr_free_root() which frees the page, and also does not require mini-transaction. Approved by Marko. ------------------------------------------------------------------------ r6207 | vasil | 2009-11-20 10:19:14 +0200 (Fri, 20 Nov 2009) | 54 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6198:6206 from branches/5.1: (r6203 was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6200 | vasil | 2009-11-19 12:14:23 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: White space fixup - indent under the opening ( ------------------------------------------------------------------------ r6203 | jyang | 2009-11-19 15:12:22 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Use btr_free_root() instead of fseg_free() for the fix of bug #48469, because fseg_free() is not defined in the zip branch. And we could save one mini-trasaction started by fseg_free(). Approved by Marko. ------------------------------------------------------------------------ r6205 | jyang | 2009-11-20 07:55:48 +0200 (Fri, 20 Nov 2009) | 11 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Add a special case to handle the Duplicated Key error and return DB_ERROR instead. This is to avoid a possible SIGSEGV by mysql error handling re-entering the storage layer for dup key info without proper table handle. This is to prevent a server crash when error situation in bug #45961 "DDL on partitioned innodb tables leaves data dictionary in an inconsistent state" happens. rb://157 approved by Sunny Bains. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 5 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix a minor code formating issue for the parenthesis iplacement of the if condition in rename_table(). ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6208 | vasil | 2009-11-20 10:49:24 +0200 (Fri, 20 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for c6207. ------------------------------------------------------------------------ r6210 | vasil | 2009-11-20 23:39:48 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/zip/trx/trx0i_s.c branches/zip: Whitespace fixup. ------------------------------------------------------------------------ r6248 | marko | 2009-11-30 12:19:50 +0200 (Mon, 30 Nov 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: ChangeLog: Document r4922 that was forgotten. ------------------------------------------------------------------------ r6252 | marko | 2009-11-30 12:50:11 +0200 (Mon, 30 Nov 2009) | 23 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/dict/dict0boot.c M /branches/zip/dict/dict0crea.c M /branches/zip/dict/dict0load.c M /branches/zip/dict/dict0mem.c M /branches/zip/fil/fil0fil.c M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/dict0mem.h M /branches/zip/row/row0mysql.c branches/zip: Suppress errors about non-found temporary tables. Write the is_temp flag to SYS_TABLES.MIX_LEN. dict_table_t::flags: Add a flag for is_temporary, DICT_TF2_TEMPORARY. Unlike other flags, this will not be written to the tablespace flags or SYS_TABLES.TYPE, but only to SYS_TABLES.MIX_LEN. dict_build_table_def_step(): Only pass DICT_TF_BITS to tablespaces. dict_check_tablespaces_and_store_max_id(), dict_load_table(): Suppress errors about temporary tables not being found. dict_create_sys_tables_tuple(): Write the DICT_TF2_TEMPORARY flag to SYS_TABLES.MIX_LEN. fil_space_create(), fil_create_new_single_table_tablespace(): Add assertions about space->flags. row_drop_table_for_mysql(): Do not complain about non-found temporary tables. rb://160 approved by Heikki Tuuri. This addresses the second part of Bug #41609 Crash recovery does not work for InnoDB temporary tables. ------------------------------------------------------------------------ r6263 | vasil | 2009-12-01 14:49:05 +0200 (Tue, 01 Dec 2009) | 4 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.6 to 1.0.7 1.0.6 has been released ------------------------------------------------------------------------ r6264 | vasil | 2009-12-01 16:19:44 +0200 (Tue, 01 Dec 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for the release of 1.0.6. ------------------------------------------------------------------------ r6269 | marko | 2009-12-02 11:35:22 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): UNIV_IBUF_DEBUG should not break crash recovery, but UNIV_IBUF_COUNT_DEBUG will. ------------------------------------------------------------------------ r6270 | marko | 2009-12-02 11:36:47 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): Log the zlib version. ------------------------------------------------------------------------ r6271 | marko | 2009-12-02 11:43:49 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: ChangeLog: Document that since r6270, the zlib version number will be displayed at start-up. ------------------------------------------------------------------------ r6272 | marko | 2009-12-02 11:46:05 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: Revert changes that were accidentally committed in r6271. ------------------------------------------------------------------------ r6274 | marko | 2009-12-03 14:47:12 +0200 (Thu, 03 Dec 2009) | 6 lines Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_table_check_for_dup_indexes(): Assert that the data dictionary mutex is being held while table->indexes is accessed. This is already the case. Currently, only dict_table_get_next_index() and dict_table_get_first_index() are being invoked without holding dict_sys->mutex. ------------------------------------------------------------------------ r6275 | pekka | 2009-12-03 18:32:47 +0200 (Thu, 03 Dec 2009) | 10 lines Changed paths: M /branches/zip/include/log0recv.h M /branches/zip/include/trx0sys.h M /branches/zip/log/log0recv.c M /branches/zip/trx/trx0sys.c branches/zip: Minor changes which allow build with UNIV_HOTBACKUP defined to succeed: include/trx0sys.h: Allow Hot Backup build to see some TRX_SYS_DOUBLEWRITE_... macros. trx/trx0sys.c: Exclude trx_sys_close() function from Hot Backup build. log/log0recv.[ch]: Exclude recv_sys_var_init() function from Hot Backup build. This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r6277 | marko | 2009-12-08 11:13:36 +0200 (Tue, 08 Dec 2009) | 1 line Changed paths: M /branches/zip/fsp/fsp0fsp.c branches/zip: fsp0fsp.c: Add some missing in/out and const qualifiers. ------------------------------------------------------------------------ r6285 | marko | 2009-12-09 09:24:50 +0200 (Wed, 09 Dec 2009) | 13 lines Changed paths: M /branches/zip/row/row0sel.c branches/zip: row_sel_fetch_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns, Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This redundant code was noticed by Sergey Petrunya on the MySQL internals list. ------------------------------------------------------------------------ r6288 | marko | 2009-12-09 09:51:00 +0200 (Wed, 09 Dec 2009) | 15 lines Changed paths: M /branches/zip/row/row0upd.c branches/zip: row_upd_copy_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns. Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This is similar to the redundant code in row_sel_fetch_columns() that was noticed by Sergey Petrunya on the MySQL internals list and removed in r6285. As far as I can tell, there are no redundant UNIV_SQL_NULL assignments remaining after this change. ------------------------------------------------------------------------ r6305 | marko | 2009-12-14 13:03:57 +0200 (Mon, 14 Dec 2009) | 2 lines Changed paths: M /branches/zip/row/row0umod.c branches/zip: row_undo_mod_del_unmark_sec_and_undo_update(): Add a missing const qualifier. ------------------------------------------------------------------------ r6309 | marko | 2009-12-15 14:05:50 +0200 (Tue, 15 Dec 2009) | 3 lines Changed paths: M /branches/zip/lock/lock0lock.c branches/zip: lock_rec_insert_check_and_lock(): Avoid casting away constness. Use page_rec_get_next_const() instead. This silences a gcc 4.2.4 warning. Reported by Sunny Bains. ------------------------------------------------------------------------ r6312 | marko | 2009-12-16 10:10:36 +0200 (Wed, 16 Dec 2009) | 6 lines Changed paths: M /branches/zip/fil/fil0fil.c branches/zip: fil_close(): Add #ifndef UNIV_HOTBACKUP around a debug assertion on mutex.magic_n. InnoDB Hot Backup is a single-threaded program and does not contain mutexes. This change allows InnoDB Hot Backup to be compiled with UNIV_DEBUG. Suggested by Michael Izioumtchenko. ------------------------------------------------------------------------ r6321 | marko | 2009-12-16 16:16:33 +0200 (Wed, 16 Dec 2009) | 4 lines Changed paths: M /branches/zip/row/row0merge.c branches/zip: row_merge_drop_temp_indexes(): Revert a hack to transaction isolation level that was made unnecessary by r5826 (Issue #337). When this function is called, any active data dictionary transaction should have been rolled back. ------------------------------------------------------------------------ r6345 | marko | 2009-12-21 10:46:14 +0200 (Mon, 21 Dec 2009) | 7 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_scan_log_recs(): Non-functional change: Replace a debug assertion ut_ad(len > 0) with ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE). This change is only for readability, for Issue #428. Another assertion on len being an integer multiple of OS_FILE_LOG_BLOCK_SIZE already ensured together with the old ut_ad(len > 0) that actually len must be at least OS_FILE_LOG_BLOCK_SIZE. ------------------------------------------------------------------------ r6346 | marko | 2009-12-21 12:03:25 +0200 (Mon, 21 Dec 2009) | 2 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_recovery_from_checkpoint_finish(): Revert a change that was accidentally committed in r6345. ------------------------------------------------------------------------ r6348 | marko | 2009-12-22 11:04:34 +0200 (Tue, 22 Dec 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/ha_prototypes.h M /branches/zip/include/trx0trx.h M /branches/zip/lock/lock0lock.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0trx.c branches/zip: Merge a change from MySQL: ------------------------------------------------------------ revno: 3236 committer: Satya B <satya.bn@sun.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 17:48:57 +0530 message: merge to mysql-5.1-bugteam ------------------------------------------------------------ revno: 3234.1.1 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 14:38:40 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? manual merge 5.0-->5.1, updating InnoDB plugin. ------------------------------------------------------------ revno: 1810.3968.13 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-12-01 14:24:44 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? The bug 38816 changed the lock that protects THD::query from LOCK_thread_count to LOCK_thd_data, but didn't update the associated InnoDB functions. 1. The innobase_mysql_prepare_print_arbitrary_thd and the innobase_mysql_end_print_arbitrary_thd InnoDB functions have been removed, since now we have a per-thread mutex: now we don't need to wrap several inter-thread access tries to THD::query with a single global LOCK_thread_count lock, so we can simplify the code. 2. The innobase_mysql_print_thd function has been modified to lock LOCK_thd_data in direct way. ------------------------------------------------------------------------ r6351 | marko | 2009-12-22 11:11:18 +0200 (Tue, 22 Dec 2009) | 1 line Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Remove an obsolete declaration of LOCK_thread_count. ------------------------------------------------------------------------ r6352 | marko | 2009-12-22 12:33:01 +0200 (Tue, 22 Dec 2009) | 104 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/lock0lock.h M /branches/zip/include/srv0srv.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/mysql-test/innodb-autoinc.result M /branches/zip/mysql-test/innodb-autoinc.test M /branches/zip/row/row0sel.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c branches/zip: Merge revisions 6206:6350 from branches/5.1, except r6347, r6349, r6350 which were committed separately to both branches, and r6310, which was backported from zip to 5.1. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Non-functional change, fix formatting. ------------------------------------------------------------------------ r6230 | sunny | 2009-11-24 23:52:43 +0200 (Tue, 24 Nov 2009) | 3 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result branches/5.1: Fix autoinc failing test results. (this should be skipped when merging 5.1 into zip) ------------------------------------------------------------------------ r6231 | sunny | 2009-11-25 10:26:27 +0200 (Wed, 25 Nov 2009) | 7 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: Fix BUG#49032 - auto_increment field does not initialize to last value in InnoDB Storage Engine. We use the appropriate function to read the column value for non-integer autoinc column types, namely float and double. rb://208. Approved by Marko. ------------------------------------------------------------------------ r6232 | sunny | 2009-11-25 10:27:39 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix white space errors. ------------------------------------------------------------------------ r6233 | sunny | 2009-11-25 10:28:35 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/include/mach0data.h M /branches/5.1/include/mach0data.ic M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix tests and make read float/double arg const. ------------------------------------------------------------------------ r6234 | sunny | 2009-11-25 10:29:03 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix whitepsace issues. ------------------------------------------------------------------------ r6235 | sunny | 2009-11-26 01:14:42 +0200 (Thu, 26 Nov 2009) | 9 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Fix Bug#47720 - REPLACE INTO Autoincrement column with negative values. This bug is similiar to the negative autoinc filter patch from earlier, with the additional handling of filtering out the negative column values set explicitly by the user. rb://184 Approved by Heikki. ------------------------------------------------------------------------ r6242 | vasil | 2009-11-27 22:07:12 +0200 (Fri, 27 Nov 2009) | 4 lines Changed paths: M /branches/5.1/export.sh branches/5.1: Minor changes to support plugin snapshots. ------------------------------------------------------------------------ r6306 | calvin | 2009-12-14 15:12:46 +0200 (Mon, 14 Dec 2009) | 5 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: fix bug#49267: innodb-autoinc.test fails on windows because of different case mode There is no change to the InnoDB code, only to fix test case by changing "T1" to "t1". ------------------------------------------------------------------------ r6324 | jyang | 2009-12-17 06:54:24 +0200 (Thu, 17 Dec 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/lock0lock.h M /branches/5.1/include/srv0srv.h M /branches/5.1/lock/lock0lock.c M /branches/5.1/log/log0log.c M /branches/5.1/srv/srv0srv.c M /branches/5.1/srv/srv0start.c branches/5.1: Fix bug #47814 - Diagnostics are frequently not printed after a long lock wait in InnoDB. Separate out the lock wait timeout check thread from monitor information printing thread. rb://200 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6364 | marko | 2009-12-26 21:06:31 +0200 (Sat, 26 Dec 2009) | 4 lines Changed paths: M /branches/zip/ibuf/ibuf0ibuf.c branches/zip: ibuf_bitmap_get_map_page(): Define a wrapper macro that passes __FILE__, __LINE__ of the caller to buf_page_get_gen(). This will ease the diagnosis of the likes of Issue #135. ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 6130:6364 from branches/zip: ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 11:42:56 +0200 (Mon, 02 Nov 2009) | 9 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0sea.c M /branches/zip/buf/buf0buf.c M /branches/zip/dict/dict0dict.c M /branches/zip/fil/fil0fil.c M /branches/zip/ibuf/ibuf0ibuf.c M /branches/zip/include/btr0sea.h M /branches/zip/include/dict0dict.h M /branches/zip/include/fil0fil.h M /branches/zip/include/ibuf0ibuf.h M /branches/zip/include/lock0lock.h M /branches/zip/include/log0log.h M /branches/zip/include/log0recv.h M /branches/zip/include/mem0mem.h M /branches/zip/include/mem0pool.h M /branches/zip/include/os0file.h M /branches/zip/include/pars0pars.h M /branches/zip/include/srv0srv.h M /branches/zip/include/thr0loc.h M /branches/zip/include/trx0i_s.h M /branches/zip/include/trx0purge.h M /branches/zip/include/trx0rseg.h M /branches/zip/include/trx0sys.h M /branches/zip/include/trx0undo.h M /branches/zip/include/usr0sess.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/log/log0recv.c M /branches/zip/mem/mem0dbg.c M /branches/zip/mem/mem0pool.c M /branches/zip/os/os0file.c M /branches/zip/os/os0sync.c M /branches/zip/os/os0thread.c M /branches/zip/pars/lexyy.c M /branches/zip/pars/pars0lex.l M /branches/zip/que/que0que.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c M /branches/zip/sync/sync0arr.c M /branches/zip/sync/sync0sync.c M /branches/zip/thr/thr0loc.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0purge.c M /branches/zip/trx/trx0rseg.c M /branches/zip/trx/trx0sys.c M /branches/zip/trx/trx0undo.c M /branches/zip/usr/usr0sess.c M /branches/zip/ut/ut0mem.c branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 09:57:29 +0200 (Wed, 04 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ r6137 | marko | 2009-11-04 15:24:28 +0200 (Wed, 04 Nov 2009) | 1 line Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_index_too_big_for_undo(): Correct a typo. ------------------------------------------------------------------------ r6153 | vasil | 2009-11-10 15:33:22 +0200 (Tue, 10 Nov 2009) | 145 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6125:6152 from branches/5.1: (everything except the last white-space change was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6127 | vasil | 2009-10-30 11:18:25 +0200 (Fri, 30 Oct 2009) | 18 lines Changed paths: M /branches/5.1/Makefile.am M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Backport c6121 from branches/zip: ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 01:42:11 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/zip/mysql-test/innodb-autoinc.result branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6129 | vasil | 2009-10-30 17:14:22 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/Makefile.am branches/5.1: Revert a change to Makefile.am that sneaked unnoticed in c6127. ------------------------------------------------------------------------ r6136 | marko | 2009-11-04 12:28:10 +0200 (Wed, 04 Nov 2009) | 15 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/ha_prototypes.h M /branches/5.1/ut/ut0ut.c branches/5.1: Port r6134 from branches/zip: ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 07:57:29 +0000 (Wed, 04 Nov 2009) | 5 lines branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ innobase_print_identifier(): Replace with innobase_convert_name(). innobase_convert_identifier(): New function, called by innobase_convert_name(). ------------------------------------------------------------------------ r6149 | vasil | 2009-11-09 11:15:01 +0200 (Mon, 09 Nov 2009) | 5 lines Changed paths: M /branches/5.1/CMakeLists.txt branches/5.1: Followup to r5700: Adjust the changes so they are the same as in the BZR repository. ------------------------------------------------------------------------ r6150 | vasil | 2009-11-09 11:43:31 +0200 (Mon, 09 Nov 2009) | 58 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a part of r2911.5.5 from MySQL: (the other part of this was merged in c5700) ------------------------------------------------------------ revno: 2911.5.5 committer: Vladislav Vaintroub <vvaintroub@mysql.com> branch nick: 5.1-innodb_plugin timestamp: Wed 2009-06-10 10:59:49 +0200 message: Backport WL#3653 to 5.1 to enable bundled innodb plugin. Remove custom DLL loader code from innodb plugin code, use symbols exported from mysqld. removed: storage/innodb_plugin/handler/handler0vars.h storage/innodb_plugin/handler/win_delay_loader.cc added: storage/mysql_storage_engine.cmake win/create_def_file.js modified: CMakeLists.txt include/m_ctype.h include/my_global.h include/my_sys.h include/mysql/plugin.h libmysqld/CMakeLists.txt mysql-test/mysql-test-run.pl mysql-test/t/plugin.test mysql-test/t/plugin_load-master.opt mysys/charset.c sql/CMakeLists.txt sql/handler.h sql/mysql_priv.h sql/mysqld.cc sql/sql_class.cc sql/sql_class.h sql/sql_list.h sql/sql_profile.h storage/Makefile.am storage/archive/CMakeLists.txt storage/blackhole/CMakeLists.txt storage/csv/CMakeLists.txt storage/example/CMakeLists.txt storage/federated/CMakeLists.txt storage/heap/CMakeLists.txt storage/innobase/CMakeLists.txt storage/innobase/handler/ha_innodb.cc storage/innodb_plugin/CMakeLists.txt storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/handler0alter.cc storage/innodb_plugin/handler/i_s.cc storage/innodb_plugin/plug.in storage/myisam/CMakeLists.txt storage/myisammrg/CMakeLists.txt win/Makefile.am win/configure.js ------------------------------------------------------------------------ r6152 | vasil | 2009-11-10 15:30:20 +0200 (Tue, 10 Nov 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6157 | jyang | 2009-11-11 14:27:09 +0200 (Wed, 11 Nov 2009) | 10 lines Changed paths: M /branches/zip/handler/ha_innodb.cc A /branches/zip/mysql-test/innodb_bug47167.result A /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_file_format.result branches/zip: Fix an issue that a local variable defined in innodb_file_format_check_validate() is being referenced across function in innodb_file_format_check_update(). In addition, fix "set global innodb_file_format_check = DEFAULT" call. Bug #47167: "set global innodb_file_format_check" cannot set value by User-Defined Variable." rb://169 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6159 | vasil | 2009-11-11 15:13:01 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ r6160 | vasil | 2009-11-11 15:33:49 +0200 (Wed, 11 Nov 2009) | 72 lines Changed paths: M /branches/zip/include/os0file.h M /branches/zip/os/os0file.c branches/zip: Merge r6152:6159 from branches/5.1: (r6158 was skipped as an equivallent change has already been merged from MySQL) ------------------------------------------------------------------------ r6154 | calvin | 2009-11-11 02:51:17 +0200 (Wed, 11 Nov 2009) | 17 lines Changed paths: M /branches/5.1/include/os0file.h M /branches/5.1/os/os0file.c branches/5.1: fix bug#3139: Mysql crashes: 'windows error 995' after several selects on a large DB During stress environment, Windows AIO may fail with error code ERROR_OPERATION_ABORTED. InnoDB does not handle the error, rather crashes. The cause of the error is unknown, but likely due to faulty hardware or driver. This patch introduces a new error code OS_FILE_OPERATION_ABORTED, which maps to Windows ERROR_OPERATION_ABORTED (995). When the error is detected during AIO, the InnoDB will issue a synchronous retry (read/write). This patch has been extensively tested by MySQL support. Approved by: Marko rb://196 ------------------------------------------------------------------------ r6158 | vasil | 2009-11-11 14:52:14 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/handler/ha_innodb.h branches/5.1: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6161 | vasil | 2009-11-11 15:36:16 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add changelog entry for r6160. ------------------------------------------------------------------------ r6162 | vasil | 2009-11-11 16:00:12 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog for r6157. ------------------------------------------------------------------------ r6163 | calvin | 2009-11-11 17:53:20 +0200 (Wed, 11 Nov 2009) | 8 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Exclude thd_binlog_filter_ok() when building with older version of MySQL. thd_binlog_filter_ok() is introduced in MySQL 5.1.41. But the plugin can be built with MySQL prior to 5.1.41. Approved by Heikki (on IM). ------------------------------------------------------------------------ r6169 | calvin | 2009-11-12 14:40:43 +0200 (Thu, 12 Nov 2009) | 6 lines Changed paths: A /branches/zip/mysql-test/innodb_bug46676.result A /branches/zip/mysql-test/innodb_bug46676.test branches/zip: add test case for bug#46676 This crash is reproducible with InnoDB plugin 1.0.4 + MySQL 5.1.37. But no longer reproducible after MySQL 5.1.38 (with plugin 1.0.5). Add test case to catch future regression. ------------------------------------------------------------------------ r6170 | marko | 2009-11-12 15:49:08 +0200 (Thu, 12 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/db0err.h M /branches/zip/row/row0merge.c M /branches/zip/row/row0mysql.c branches/zip: Allow CREATE INDEX to be interrupted. (Issue #354) rb://183 approved by Heikki Tuuri ------------------------------------------------------------------------ r6175 | vasil | 2009-11-16 20:07:39 +0200 (Mon, 16 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Wrap line at 78th char in the ChangeLog ------------------------------------------------------------------------ r6177 | calvin | 2009-11-16 20:20:38 +0200 (Mon, 16 Nov 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog branches/zip: add an entry to ChangeLog for r6065 ------------------------------------------------------------------------ r6179 | marko | 2009-11-17 10:19:34 +0200 (Tue, 17 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: ha_innobase::change_active_index(): When the history is missing, report it to the client, not to the error log. ------------------------------------------------------------------------ r6181 | vasil | 2009-11-17 12:21:41 +0200 (Tue, 17 Nov 2009) | 33 lines Changed paths: M /branches/zip/mysql-test/innodb-index.test branches/zip: At the end of innodb-index.test: restore the environment as it was before the test was started to silence this warning: MTR's internal check of the test case 'main.innodb-index' failed. This means that the test case does not preserve the state that existed before the test case was executed. Most likely the test case did not do a proper clean-up. This is the diff of the states of the servers before and after the test case was executed: mysqltest: Logging to '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.log'. mysqltest: Results saved in '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result'. mysqltest: Connecting to server localhost:13000 (socket /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/mysqld.1.sock) as 'root', connection 'default', attempt 0 ... mysqltest: ... Connected. mysqltest: Start processing test commands from './include/check-testcase.test' ... mysqltest: ... Done processing test commands. --- /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result 2009-11-17 13:10:40.000000000 +0300 +++ /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.reject 2009-11-17 13:10:54.000000000 +0300 @@ -84,7 +84,7 @@ INNODB_DOUBLEWRITE ON INNODB_FAST_SHUTDOWN 1 INNODB_FILE_FORMAT Antelope -INNODB_FILE_FORMAT_CHECK Antelope +INNODB_FILE_FORMAT_CHECK Barracuda INNODB_FILE_PER_TABLE OFF INNODB_FLUSH_LOG_AT_TRX_COMMIT 1 INNODB_FLUSH_METHOD mysqltest: Result content mismatch not ok ------------------------------------------------------------------------ r6182 | marko | 2009-11-17 13:49:15 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-consistent.result M /branches/zip/mysql-test/innodb-consistent.test M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc.result M /branches/zip/mysql-test/innodb-use-sys-malloc.test M /branches/zip/mysql-test/innodb_bug21704.result M /branches/zip/mysql-test/innodb_bug21704.test M /branches/zip/mysql-test/innodb_bug40360.test M /branches/zip/mysql-test/innodb_bug40565.result M /branches/zip/mysql-test/innodb_bug40565.test M /branches/zip/mysql-test/innodb_bug41904.result M /branches/zip/mysql-test/innodb_bug41904.test M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test M /branches/zip/mysql-test/innodb_bug44032.result M /branches/zip/mysql-test/innodb_bug44032.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test M /branches/zip/mysql-test/innodb_bug44571.result M /branches/zip/mysql-test/innodb_bug44571.test M /branches/zip/mysql-test/innodb_bug45357.test M /branches/zip/mysql-test/innodb_bug46000.result M /branches/zip/mysql-test/innodb_bug46000.test M /branches/zip/mysql-test/innodb_bug46676.result M /branches/zip/mysql-test/innodb_bug46676.test M /branches/zip/mysql-test/innodb_bug47167.result M /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_bug47777.result M /branches/zip/mysql-test/innodb_bug47777.test M /branches/zip/mysql-test/innodb_file_format.result M /branches/zip/mysql-test/innodb_file_format.test branches/zip: Set svn:eol-style on mysql-test files. ------------------------------------------------------------------------ r6183 | marko | 2009-11-17 13:51:16 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-master.opt M /branches/zip/mysql-test/innodb-semi-consistent-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt branches/zip: Prepend loose_ to plugin-only mysql-test options. ------------------------------------------------------------------------ r6184 | marko | 2009-11-17 13:52:01 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-index.result M /branches/zip/mysql-test/innodb-index.test branches/zip: innodb-index.test: Restore innodb_file_format_check. ------------------------------------------------------------------------ r6185 | marko | 2009-11-17 16:44:20 +0200 (Tue, 17 Nov 2009) | 16 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb.result M /branches/zip/mysql-test/innodb.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test D /branches/zip/mysql-test/patches/innodb-index.diff M /branches/zip/row/row0mysql.c branches/zip: Report duplicate table names to the client connection, not to the error log. This change will allow innodb-index.test to be re-enabled. It was previously disabled, because mysql-test-run does not like output in the error log. row_create_table_for_mysql(): Do not output anything to the error log when reporting DB_DUPLICATE_KEY. Let the caller report the error. Add a TODO comment that the dict_table_t object is apparently not freed when an error occurs. create_table_def(): Convert InnoDB table names to the character set of the client connection for reporting. Use my_error(ER_WRONG_COLUMN_NAME) for reporting reserved column names. Report my_error(ER_TABLE_EXISTS_ERROR) when row_create_table_for_mysql() returns DB_DUPLICATE_KEY. rb://206 ------------------------------------------------------------------------ r6186 | vasil | 2009-11-17 16:48:14 +0200 (Tue, 17 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6185. ------------------------------------------------------------------------ r6189 | marko | 2009-11-18 11:36:18 +0200 (Wed, 18 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): When creating the primary key and the table is being locked by another transaction, do not attempt to drop the table. (Bug #48782) Approved by Sunny Bains over IM ------------------------------------------------------------------------ r6194 | vasil | 2009-11-19 09:24:45 +0200 (Thu, 19 Nov 2009) | 5 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.5 to 1.0.6 since 1.0.5 was just released by MySQL and we will soon release 1.0.6. ------------------------------------------------------------------------ r6197 | calvin | 2009-11-19 09:32:55 +0200 (Thu, 19 Nov 2009) | 6 lines Changed paths: M /branches/zip/CMakeLists.txt branches/zip: merge the fix of bug#48317 (CMake file) Due to MySQL changes to the CMake, it is no longer able to build InnoDB plugin as a static library on Windows. The fix is proposed by Vlad of MySQL. ------------------------------------------------------------------------ r6198 | vasil | 2009-11-19 09:44:31 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6197. ------------------------------------------------------------------------ r6199 | vasil | 2009-11-19 12:10:12 +0200 (Thu, 19 Nov 2009) | 31 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0btr.c M /branches/zip/data/data0type.c branches/zip: Merge r6159:6198 from branches/5.1: ------------------------------------------------------------------------ r6187 | jyang | 2009-11-18 05:27:30 +0200 (Wed, 18 Nov 2009) | 9 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Fix bug #48469 "when innodb tablespace is configured too small, crash and corruption!". Function btr_create() did not check the return status of fseg_create(), and continue the index creation even there is no sufficient space. rb://205 Approved by Marko ------------------------------------------------------------------------ r6188 | jyang | 2009-11-18 07:14:23 +0200 (Wed, 18 Nov 2009) | 8 lines Changed paths: M /branches/5.1/data/data0type.c branches/5.1: Fix bug #48526 "Data type for float and double is incorrectly reported in InnoDB table monitor". Certain datatypes are not printed correctly in dtype_print(). rb://204 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6201 | marko | 2009-11-19 14:09:11 +0200 (Thu, 19 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): Clarify the comment on orphaned tables when creating a primary key. ------------------------------------------------------------------------ r6202 | jyang | 2009-11-19 15:01:00 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/zip/btr/btr0btr.c branches/zip: Function fseg_free() is no longer defined in branches/zip. To port fix for bug #48469 to zip, we can use btr_free_root() which frees the page, and also does not require mini-transaction. Approved by Marko. ------------------------------------------------------------------------ r6207 | vasil | 2009-11-20 10:19:14 +0200 (Fri, 20 Nov 2009) | 54 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6198:6206 from branches/5.1: (r6203 was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6200 | vasil | 2009-11-19 12:14:23 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: White space fixup - indent under the opening ( ------------------------------------------------------------------------ r6203 | jyang | 2009-11-19 15:12:22 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Use btr_free_root() instead of fseg_free() for the fix of bug #48469, because fseg_free() is not defined in the zip branch. And we could save one mini-trasaction started by fseg_free(). Approved by Marko. ------------------------------------------------------------------------ r6205 | jyang | 2009-11-20 07:55:48 +0200 (Fri, 20 Nov 2009) | 11 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Add a special case to handle the Duplicated Key error and return DB_ERROR instead. This is to avoid a possible SIGSEGV by mysql error handling re-entering the storage layer for dup key info without proper table handle. This is to prevent a server crash when error situation in bug #45961 "DDL on partitioned innodb tables leaves data dictionary in an inconsistent state" happens. rb://157 approved by Sunny Bains. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 5 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix a minor code formating issue for the parenthesis iplacement of the if condition in rename_table(). ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6208 | vasil | 2009-11-20 10:49:24 +0200 (Fri, 20 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for c6207. ------------------------------------------------------------------------ r6210 | vasil | 2009-11-20 23:39:48 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/zip/trx/trx0i_s.c branches/zip: Whitespace fixup. ------------------------------------------------------------------------ r6248 | marko | 2009-11-30 12:19:50 +0200 (Mon, 30 Nov 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: ChangeLog: Document r4922 that was forgotten. ------------------------------------------------------------------------ r6252 | marko | 2009-11-30 12:50:11 +0200 (Mon, 30 Nov 2009) | 23 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/dict/dict0boot.c M /branches/zip/dict/dict0crea.c M /branches/zip/dict/dict0load.c M /branches/zip/dict/dict0mem.c M /branches/zip/fil/fil0fil.c M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/dict0mem.h M /branches/zip/row/row0mysql.c branches/zip: Suppress errors about non-found temporary tables. Write the is_temp flag to SYS_TABLES.MIX_LEN. dict_table_t::flags: Add a flag for is_temporary, DICT_TF2_TEMPORARY. Unlike other flags, this will not be written to the tablespace flags or SYS_TABLES.TYPE, but only to SYS_TABLES.MIX_LEN. dict_build_table_def_step(): Only pass DICT_TF_BITS to tablespaces. dict_check_tablespaces_and_store_max_id(), dict_load_table(): Suppress errors about temporary tables not being found. dict_create_sys_tables_tuple(): Write the DICT_TF2_TEMPORARY flag to SYS_TABLES.MIX_LEN. fil_space_create(), fil_create_new_single_table_tablespace(): Add assertions about space->flags. row_drop_table_for_mysql(): Do not complain about non-found temporary tables. rb://160 approved by Heikki Tuuri. This addresses the second part of Bug #41609 Crash recovery does not work for InnoDB temporary tables. ------------------------------------------------------------------------ r6263 | vasil | 2009-12-01 14:49:05 +0200 (Tue, 01 Dec 2009) | 4 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.6 to 1.0.7 1.0.6 has been released ------------------------------------------------------------------------ r6264 | vasil | 2009-12-01 16:19:44 +0200 (Tue, 01 Dec 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for the release of 1.0.6. ------------------------------------------------------------------------ r6269 | marko | 2009-12-02 11:35:22 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): UNIV_IBUF_DEBUG should not break crash recovery, but UNIV_IBUF_COUNT_DEBUG will. ------------------------------------------------------------------------ r6270 | marko | 2009-12-02 11:36:47 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): Log the zlib version. ------------------------------------------------------------------------ r6271 | marko | 2009-12-02 11:43:49 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: ChangeLog: Document that since r6270, the zlib version number will be displayed at start-up. ------------------------------------------------------------------------ r6272 | marko | 2009-12-02 11:46:05 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: Revert changes that were accidentally committed in r6271. ------------------------------------------------------------------------ r6274 | marko | 2009-12-03 14:47:12 +0200 (Thu, 03 Dec 2009) | 6 lines Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_table_check_for_dup_indexes(): Assert that the data dictionary mutex is being held while table->indexes is accessed. This is already the case. Currently, only dict_table_get_next_index() and dict_table_get_first_index() are being invoked without holding dict_sys->mutex. ------------------------------------------------------------------------ r6275 | pekka | 2009-12-03 18:32:47 +0200 (Thu, 03 Dec 2009) | 10 lines Changed paths: M /branches/zip/include/log0recv.h M /branches/zip/include/trx0sys.h M /branches/zip/log/log0recv.c M /branches/zip/trx/trx0sys.c branches/zip: Minor changes which allow build with UNIV_HOTBACKUP defined to succeed: include/trx0sys.h: Allow Hot Backup build to see some TRX_SYS_DOUBLEWRITE_... macros. trx/trx0sys.c: Exclude trx_sys_close() function from Hot Backup build. log/log0recv.[ch]: Exclude recv_sys_var_init() function from Hot Backup build. This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r6277 | marko | 2009-12-08 11:13:36 +0200 (Tue, 08 Dec 2009) | 1 line Changed paths: M /branches/zip/fsp/fsp0fsp.c branches/zip: fsp0fsp.c: Add some missing in/out and const qualifiers. ------------------------------------------------------------------------ r6285 | marko | 2009-12-09 09:24:50 +0200 (Wed, 09 Dec 2009) | 13 lines Changed paths: M /branches/zip/row/row0sel.c branches/zip: row_sel_fetch_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns, Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This redundant code was noticed by Sergey Petrunya on the MySQL internals list. ------------------------------------------------------------------------ r6288 | marko | 2009-12-09 09:51:00 +0200 (Wed, 09 Dec 2009) | 15 lines Changed paths: M /branches/zip/row/row0upd.c branches/zip: row_upd_copy_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns. Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This is similar to the redundant code in row_sel_fetch_columns() that was noticed by Sergey Petrunya on the MySQL internals list and removed in r6285. As far as I can tell, there are no redundant UNIV_SQL_NULL assignments remaining after this change. ------------------------------------------------------------------------ r6305 | marko | 2009-12-14 13:03:57 +0200 (Mon, 14 Dec 2009) | 2 lines Changed paths: M /branches/zip/row/row0umod.c branches/zip: row_undo_mod_del_unmark_sec_and_undo_update(): Add a missing const qualifier. ------------------------------------------------------------------------ r6309 | marko | 2009-12-15 14:05:50 +0200 (Tue, 15 Dec 2009) | 3 lines Changed paths: M /branches/zip/lock/lock0lock.c branches/zip: lock_rec_insert_check_and_lock(): Avoid casting away constness. Use page_rec_get_next_const() instead. This silences a gcc 4.2.4 warning. Reported by Sunny Bains. ------------------------------------------------------------------------ r6312 | marko | 2009-12-16 10:10:36 +0200 (Wed, 16 Dec 2009) | 6 lines Changed paths: M /branches/zip/fil/fil0fil.c branches/zip: fil_close(): Add #ifndef UNIV_HOTBACKUP around a debug assertion on mutex.magic_n. InnoDB Hot Backup is a single-threaded program and does not contain mutexes. This change allows InnoDB Hot Backup to be compiled with UNIV_DEBUG. Suggested by Michael Izioumtchenko. ------------------------------------------------------------------------ r6321 | marko | 2009-12-16 16:16:33 +0200 (Wed, 16 Dec 2009) | 4 lines Changed paths: M /branches/zip/row/row0merge.c branches/zip: row_merge_drop_temp_indexes(): Revert a hack to transaction isolation level that was made unnecessary by r5826 (Issue #337). When this function is called, any active data dictionary transaction should have been rolled back. ------------------------------------------------------------------------ r6345 | marko | 2009-12-21 10:46:14 +0200 (Mon, 21 Dec 2009) | 7 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_scan_log_recs(): Non-functional change: Replace a debug assertion ut_ad(len > 0) with ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE). This change is only for readability, for Issue #428. Another assertion on len being an integer multiple of OS_FILE_LOG_BLOCK_SIZE already ensured together with the old ut_ad(len > 0) that actually len must be at least OS_FILE_LOG_BLOCK_SIZE. ------------------------------------------------------------------------ r6346 | marko | 2009-12-21 12:03:25 +0200 (Mon, 21 Dec 2009) | 2 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_recovery_from_checkpoint_finish(): Revert a change that was accidentally committed in r6345. ------------------------------------------------------------------------ r6348 | marko | 2009-12-22 11:04:34 +0200 (Tue, 22 Dec 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/ha_prototypes.h M /branches/zip/include/trx0trx.h M /branches/zip/lock/lock0lock.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0trx.c branches/zip: Merge a change from MySQL: ------------------------------------------------------------ revno: 3236 committer: Satya B <satya.bn@sun.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 17:48:57 +0530 message: merge to mysql-5.1-bugteam ------------------------------------------------------------ revno: 3234.1.1 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 14:38:40 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? manual merge 5.0-->5.1, updating InnoDB plugin. ------------------------------------------------------------ revno: 1810.3968.13 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-12-01 14:24:44 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? The bug 38816 changed the lock that protects THD::query from LOCK_thread_count to LOCK_thd_data, but didn't update the associated InnoDB functions. 1. The innobase_mysql_prepare_print_arbitrary_thd and the innobase_mysql_end_print_arbitrary_thd InnoDB functions have been removed, since now we have a per-thread mutex: now we don't need to wrap several inter-thread access tries to THD::query with a single global LOCK_thread_count lock, so we can simplify the code. 2. The innobase_mysql_print_thd function has been modified to lock LOCK_thd_data in direct way. ------------------------------------------------------------------------ r6351 | marko | 2009-12-22 11:11:18 +0200 (Tue, 22 Dec 2009) | 1 line Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Remove an obsolete declaration of LOCK_thread_count. ------------------------------------------------------------------------ r6352 | marko | 2009-12-22 12:33:01 +0200 (Tue, 22 Dec 2009) | 104 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/lock0lock.h M /branches/zip/include/srv0srv.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/mysql-test/innodb-autoinc.result M /branches/zip/mysql-test/innodb-autoinc.test M /branches/zip/row/row0sel.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c branches/zip: Merge revisions 6206:6350 from branches/5.1, except r6347, r6349, r6350 which were committed separately to both branches, and r6310, which was backported from zip to 5.1. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Non-functional change, fix formatting. ------------------------------------------------------------------------ r6230 | sunny | 2009-11-24 23:52:43 +0200 (Tue, 24 Nov 2009) | 3 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result branches/5.1: Fix autoinc failing test results. (this should be skipped when merging 5.1 into zip) ------------------------------------------------------------------------ r6231 | sunny | 2009-11-25 10:26:27 +0200 (Wed, 25 Nov 2009) | 7 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: Fix BUG#49032 - auto_increment field does not initialize to last value in InnoDB Storage Engine. We use the appropriate function to read the column value for non-integer autoinc column types, namely float and double. rb://208. Approved by Marko. ------------------------------------------------------------------------ r6232 | sunny | 2009-11-25 10:27:39 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix white space errors. ------------------------------------------------------------------------ r6233 | sunny | 2009-11-25 10:28:35 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/include/mach0data.h M /branches/5.1/include/mach0data.ic M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix tests and make read float/double arg const. ------------------------------------------------------------------------ r6234 | sunny | 2009-11-25 10:29:03 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix whitepsace issues. ------------------------------------------------------------------------ r6235 | sunny | 2009-11-26 01:14:42 +0200 (Thu, 26 Nov 2009) | 9 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Fix Bug#47720 - REPLACE INTO Autoincrement column with negative values. This bug is similiar to the negative autoinc filter patch from earlier, with the additional handling of filtering out the negative column values set explicitly by the user. rb://184 Approved by Heikki. ------------------------------------------------------------------------ r6242 | vasil | 2009-11-27 22:07:12 +0200 (Fri, 27 Nov 2009) | 4 lines Changed paths: M /branches/5.1/export.sh branches/5.1: Minor changes to support plugin snapshots. ------------------------------------------------------------------------ r6306 | calvin | 2009-12-14 15:12:46 +0200 (Mon, 14 Dec 2009) | 5 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: fix bug#49267: innodb-autoinc.test fails on windows because of different case mode There is no change to the InnoDB code, only to fix test case by changing "T1" to "t1". ------------------------------------------------------------------------ r6324 | jyang | 2009-12-17 06:54:24 +0200 (Thu, 17 Dec 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/lock0lock.h M /branches/5.1/include/srv0srv.h M /branches/5.1/lock/lock0lock.c M /branches/5.1/log/log0log.c M /branches/5.1/srv/srv0srv.c M /branches/5.1/srv/srv0start.c branches/5.1: Fix bug #47814 - Diagnostics are frequently not printed after a long lock wait in InnoDB. Separate out the lock wait timeout check thread from monitor information printing thread. rb://200 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6364 | marko | 2009-12-26 21:06:31 +0200 (Sat, 26 Dec 2009) | 4 lines Changed paths: M /branches/zip/ibuf/ibuf0ibuf.c branches/zip: ibuf_bitmap_get_map_page(): Define a wrapper macro that passes __FILE__, __LINE__ of the caller to buf_page_get_gen(). This will ease the diagnosis of the likes of Issue #135. ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 6130:6364 from branches/zip: ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 11:42:56 +0200 (Mon, 02 Nov 2009) | 9 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0sea.c M /branches/zip/buf/buf0buf.c M /branches/zip/dict/dict0dict.c M /branches/zip/fil/fil0fil.c M /branches/zip/ibuf/ibuf0ibuf.c M /branches/zip/include/btr0sea.h M /branches/zip/include/dict0dict.h M /branches/zip/include/fil0fil.h M /branches/zip/include/ibuf0ibuf.h M /branches/zip/include/lock0lock.h M /branches/zip/include/log0log.h M /branches/zip/include/log0recv.h M /branches/zip/include/mem0mem.h M /branches/zip/include/mem0pool.h M /branches/zip/include/os0file.h M /branches/zip/include/pars0pars.h M /branches/zip/include/srv0srv.h M /branches/zip/include/thr0loc.h M /branches/zip/include/trx0i_s.h M /branches/zip/include/trx0purge.h M /branches/zip/include/trx0rseg.h M /branches/zip/include/trx0sys.h M /branches/zip/include/trx0undo.h M /branches/zip/include/usr0sess.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/log/log0recv.c M /branches/zip/mem/mem0dbg.c M /branches/zip/mem/mem0pool.c M /branches/zip/os/os0file.c M /branches/zip/os/os0sync.c M /branches/zip/os/os0thread.c M /branches/zip/pars/lexyy.c M /branches/zip/pars/pars0lex.l M /branches/zip/que/que0que.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c M /branches/zip/sync/sync0arr.c M /branches/zip/sync/sync0sync.c M /branches/zip/thr/thr0loc.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0purge.c M /branches/zip/trx/trx0rseg.c M /branches/zip/trx/trx0sys.c M /branches/zip/trx/trx0undo.c M /branches/zip/usr/usr0sess.c M /branches/zip/ut/ut0mem.c branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 09:57:29 +0200 (Wed, 04 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ r6137 | marko | 2009-11-04 15:24:28 +0200 (Wed, 04 Nov 2009) | 1 line Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_index_too_big_for_undo(): Correct a typo. ------------------------------------------------------------------------ r6153 | vasil | 2009-11-10 15:33:22 +0200 (Tue, 10 Nov 2009) | 145 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6125:6152 from branches/5.1: (everything except the last white-space change was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6127 | vasil | 2009-10-30 11:18:25 +0200 (Fri, 30 Oct 2009) | 18 lines Changed paths: M /branches/5.1/Makefile.am M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Backport c6121 from branches/zip: ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 01:42:11 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/zip/mysql-test/innodb-autoinc.result branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6129 | vasil | 2009-10-30 17:14:22 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/Makefile.am branches/5.1: Revert a change to Makefile.am that sneaked unnoticed in c6127. ------------------------------------------------------------------------ r6136 | marko | 2009-11-04 12:28:10 +0200 (Wed, 04 Nov 2009) | 15 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/ha_prototypes.h M /branches/5.1/ut/ut0ut.c branches/5.1: Port r6134 from branches/zip: ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 07:57:29 +0000 (Wed, 04 Nov 2009) | 5 lines branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ innobase_print_identifier(): Replace with innobase_convert_name(). innobase_convert_identifier(): New function, called by innobase_convert_name(). ------------------------------------------------------------------------ r6149 | vasil | 2009-11-09 11:15:01 +0200 (Mon, 09 Nov 2009) | 5 lines Changed paths: M /branches/5.1/CMakeLists.txt branches/5.1: Followup to r5700: Adjust the changes so they are the same as in the BZR repository. ------------------------------------------------------------------------ r6150 | vasil | 2009-11-09 11:43:31 +0200 (Mon, 09 Nov 2009) | 58 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a part of r2911.5.5 from MySQL: (the other part of this was merged in c5700) ------------------------------------------------------------ revno: 2911.5.5 committer: Vladislav Vaintroub <vvaintroub@mysql.com> branch nick: 5.1-innodb_plugin timestamp: Wed 2009-06-10 10:59:49 +0200 message: Backport WL#3653 to 5.1 to enable bundled innodb plugin. Remove custom DLL loader code from innodb plugin code, use symbols exported from mysqld. removed: storage/innodb_plugin/handler/handler0vars.h storage/innodb_plugin/handler/win_delay_loader.cc added: storage/mysql_storage_engine.cmake win/create_def_file.js modified: CMakeLists.txt include/m_ctype.h include/my_global.h include/my_sys.h include/mysql/plugin.h libmysqld/CMakeLists.txt mysql-test/mysql-test-run.pl mysql-test/t/plugin.test mysql-test/t/plugin_load-master.opt mysys/charset.c sql/CMakeLists.txt sql/handler.h sql/mysql_priv.h sql/mysqld.cc sql/sql_class.cc sql/sql_class.h sql/sql_list.h sql/sql_profile.h storage/Makefile.am storage/archive/CMakeLists.txt storage/blackhole/CMakeLists.txt storage/csv/CMakeLists.txt storage/example/CMakeLists.txt storage/federated/CMakeLists.txt storage/heap/CMakeLists.txt storage/innobase/CMakeLists.txt storage/innobase/handler/ha_innodb.cc storage/innodb_plugin/CMakeLists.txt storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/handler0alter.cc storage/innodb_plugin/handler/i_s.cc storage/innodb_plugin/plug.in storage/myisam/CMakeLists.txt storage/myisammrg/CMakeLists.txt win/Makefile.am win/configure.js ------------------------------------------------------------------------ r6152 | vasil | 2009-11-10 15:30:20 +0200 (Tue, 10 Nov 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6157 | jyang | 2009-11-11 14:27:09 +0200 (Wed, 11 Nov 2009) | 10 lines Changed paths: M /branches/zip/handler/ha_innodb.cc A /branches/zip/mysql-test/innodb_bug47167.result A /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_file_format.result branches/zip: Fix an issue that a local variable defined in innodb_file_format_check_validate() is being referenced across function in innodb_file_format_check_update(). In addition, fix "set global innodb_file_format_check = DEFAULT" call. Bug #47167: "set global innodb_file_format_check" cannot set value by User-Defined Variable." rb://169 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6159 | vasil | 2009-11-11 15:13:01 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ r6160 | vasil | 2009-11-11 15:33:49 +0200 (Wed, 11 Nov 2009) | 72 lines Changed paths: M /branches/zip/include/os0file.h M /branches/zip/os/os0file.c branches/zip: Merge r6152:6159 from branches/5.1: (r6158 was skipped as an equivallent change has already been merged from MySQL) ------------------------------------------------------------------------ r6154 | calvin | 2009-11-11 02:51:17 +0200 (Wed, 11 Nov 2009) | 17 lines Changed paths: M /branches/5.1/include/os0file.h M /branches/5.1/os/os0file.c branches/5.1: fix bug#3139: Mysql crashes: 'windows error 995' after several selects on a large DB During stress environment, Windows AIO may fail with error code ERROR_OPERATION_ABORTED. InnoDB does not handle the error, rather crashes. The cause of the error is unknown, but likely due to faulty hardware or driver. This patch introduces a new error code OS_FILE_OPERATION_ABORTED, which maps to Windows ERROR_OPERATION_ABORTED (995). When the error is detected during AIO, the InnoDB will issue a synchronous retry (read/write). This patch has been extensively tested by MySQL support. Approved by: Marko rb://196 ------------------------------------------------------------------------ r6158 | vasil | 2009-11-11 14:52:14 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/handler/ha_innodb.h branches/5.1: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6161 | vasil | 2009-11-11 15:36:16 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add changelog entry for r6160. ------------------------------------------------------------------------ r6162 | vasil | 2009-11-11 16:00:12 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog for r6157. ------------------------------------------------------------------------ r6163 | calvin | 2009-11-11 17:53:20 +0200 (Wed, 11 Nov 2009) | 8 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Exclude thd_binlog_filter_ok() when building with older version of MySQL. thd_binlog_filter_ok() is introduced in MySQL 5.1.41. But the plugin can be built with MySQL prior to 5.1.41. Approved by Heikki (on IM). ------------------------------------------------------------------------ r6169 | calvin | 2009-11-12 14:40:43 +0200 (Thu, 12 Nov 2009) | 6 lines Changed paths: A /branches/zip/mysql-test/innodb_bug46676.result A /branches/zip/mysql-test/innodb_bug46676.test branches/zip: add test case for bug#46676 This crash is reproducible with InnoDB plugin 1.0.4 + MySQL 5.1.37. But no longer reproducible after MySQL 5.1.38 (with plugin 1.0.5). Add test case to catch future regression. ------------------------------------------------------------------------ r6170 | marko | 2009-11-12 15:49:08 +0200 (Thu, 12 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/db0err.h M /branches/zip/row/row0merge.c M /branches/zip/row/row0mysql.c branches/zip: Allow CREATE INDEX to be interrupted. (Issue #354) rb://183 approved by Heikki Tuuri ------------------------------------------------------------------------ r6175 | vasil | 2009-11-16 20:07:39 +0200 (Mon, 16 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Wrap line at 78th char in the ChangeLog ------------------------------------------------------------------------ r6177 | calvin | 2009-11-16 20:20:38 +0200 (Mon, 16 Nov 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog branches/zip: add an entry to ChangeLog for r6065 ------------------------------------------------------------------------ r6179 | marko | 2009-11-17 10:19:34 +0200 (Tue, 17 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: ha_innobase::change_active_index(): When the history is missing, report it to the client, not to the error log. ------------------------------------------------------------------------ r6181 | vasil | 2009-11-17 12:21:41 +0200 (Tue, 17 Nov 2009) | 33 lines Changed paths: M /branches/zip/mysql-test/innodb-index.test branches/zip: At the end of innodb-index.test: restore the environment as it was before the test was started to silence this warning: MTR's internal check of the test case 'main.innodb-index' failed. This means that the test case does not preserve the state that existed before the test case was executed. Most likely the test case did not do a proper clean-up. This is the diff of the states of the servers before and after the test case was executed: mysqltest: Logging to '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.log'. mysqltest: Results saved in '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result'. mysqltest: Connecting to server localhost:13000 (socket /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/mysqld.1.sock) as 'root', connection 'default', attempt 0 ... mysqltest: ... Connected. mysqltest: Start processing test commands from './include/check-testcase.test' ... mysqltest: ... Done processing test commands. --- /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result 2009-11-17 13:10:40.000000000 +0300 +++ /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.reject 2009-11-17 13:10:54.000000000 +0300 @@ -84,7 +84,7 @@ INNODB_DOUBLEWRITE ON INNODB_FAST_SHUTDOWN 1 INNODB_FILE_FORMAT Antelope -INNODB_FILE_FORMAT_CHECK Antelope +INNODB_FILE_FORMAT_CHECK Barracuda INNODB_FILE_PER_TABLE OFF INNODB_FLUSH_LOG_AT_TRX_COMMIT 1 INNODB_FLUSH_METHOD mysqltest: Result content mismatch not ok ------------------------------------------------------------------------ r6182 | marko | 2009-11-17 13:49:15 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-consistent.result M /branches/zip/mysql-test/innodb-consistent.test M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc.result M /branches/zip/mysql-test/innodb-use-sys-malloc.test M /branches/zip/mysql-test/innodb_bug21704.result M /branches/zip/mysql-test/innodb_bug21704.test M /branches/zip/mysql-test/innodb_bug40360.test M /branches/zip/mysql-test/innodb_bug40565.result M /branches/zip/mysql-test/innodb_bug40565.test M /branches/zip/mysql-test/innodb_bug41904.result M /branches/zip/mysql-test/innodb_bug41904.test M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test M /branches/zip/mysql-test/innodb_bug44032.result M /branches/zip/mysql-test/innodb_bug44032.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test M /branches/zip/mysql-test/innodb_bug44571.result M /branches/zip/mysql-test/innodb_bug44571.test M /branches/zip/mysql-test/innodb_bug45357.test M /branches/zip/mysql-test/innodb_bug46000.result M /branches/zip/mysql-test/innodb_bug46000.test M /branches/zip/mysql-test/innodb_bug46676.result M /branches/zip/mysql-test/innodb_bug46676.test M /branches/zip/mysql-test/innodb_bug47167.result M /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_bug47777.result M /branches/zip/mysql-test/innodb_bug47777.test M /branches/zip/mysql-test/innodb_file_format.result M /branches/zip/mysql-test/innodb_file_format.test branches/zip: Set svn:eol-style on mysql-test files. ------------------------------------------------------------------------ r6183 | marko | 2009-11-17 13:51:16 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-master.opt M /branches/zip/mysql-test/innodb-semi-consistent-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt branches/zip: Prepend loose_ to plugin-only mysql-test options. ------------------------------------------------------------------------ r6184 | marko | 2009-11-17 13:52:01 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-index.result M /branches/zip/mysql-test/innodb-index.test branches/zip: innodb-index.test: Restore innodb_file_format_check. ------------------------------------------------------------------------ r6185 | marko | 2009-11-17 16:44:20 +0200 (Tue, 17 Nov 2009) | 16 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb.result M /branches/zip/mysql-test/innodb.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test D /branches/zip/mysql-test/patches/innodb-index.diff M /branches/zip/row/row0mysql.c branches/zip: Report duplicate table names to the client connection, not to the error log. This change will allow innodb-index.test to be re-enabled. It was previously disabled, because mysql-test-run does not like output in the error log. row_create_table_for_mysql(): Do not output anything to the error log when reporting DB_DUPLICATE_KEY. Let the caller report the error. Add a TODO comment that the dict_table_t object is apparently not freed when an error occurs. create_table_def(): Convert InnoDB table names to the character set of the client connection for reporting. Use my_error(ER_WRONG_COLUMN_NAME) for reporting reserved column names. Report my_error(ER_TABLE_EXISTS_ERROR) when row_create_table_for_mysql() returns DB_DUPLICATE_KEY. rb://206 ------------------------------------------------------------------------ r6186 | vasil | 2009-11-17 16:48:14 +0200 (Tue, 17 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6185. ------------------------------------------------------------------------ r6189 | marko | 2009-11-18 11:36:18 +0200 (Wed, 18 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): When creating the primary key and the table is being locked by another transaction, do not attempt to drop the table. (Bug #48782) Approved by Sunny Bains over IM ------------------------------------------------------------------------ r6194 | vasil | 2009-11-19 09:24:45 +0200 (Thu, 19 Nov 2009) | 5 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.5 to 1.0.6 since 1.0.5 was just released by MySQL and we will soon release 1.0.6. ------------------------------------------------------------------------ r6197 | calvin | 2009-11-19 09:32:55 +0200 (Thu, 19 Nov 2009) | 6 lines Changed paths: M /branches/zip/CMakeLists.txt branches/zip: merge the fix of bug#48317 (CMake file) Due to MySQL changes to the CMake, it is no longer able to build InnoDB plugin as a static library on Windows. The fix is proposed by Vlad of MySQL. ------------------------------------------------------------------------ r6198 | vasil | 2009-11-19 09:44:31 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6197. ------------------------------------------------------------------------ r6199 | vasil | 2009-11-19 12:10:12 +0200 (Thu, 19 Nov 2009) | 31 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0btr.c M /branches/zip/data/data0type.c branches/zip: Merge r6159:6198 from branches/5.1: ------------------------------------------------------------------------ r6187 | jyang | 2009-11-18 05:27:30 +0200 (Wed, 18 Nov 2009) | 9 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Fix bug #48469 "when innodb tablespace is configured too small, crash and corruption!". Function btr_create() did not check the return status of fseg_create(), and continue the index creation even there is no sufficient space. rb://205 Approved by Marko ------------------------------------------------------------------------ r6188 | jyang | 2009-11-18 07:14:23 +0200 (Wed, 18 Nov 2009) | 8 lines Changed paths: M /branches/5.1/data/data0type.c branches/5.1: Fix bug #48526 "Data type for float and double is incorrectly reported in InnoDB table monitor". Certain datatypes are not printed correctly in dtype_print(). rb://204 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6201 | marko | 2009-11-19 14:09:11 +0200 (Thu, 19 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): Clarify the comment on orphaned tables when creating a primary key. ------------------------------------------------------------------------ r6202 | jyang | 2009-11-19 15:01:00 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/zip/btr/btr0btr.c branches/zip: Function fseg_free() is no longer defined in branches/zip. To port fix for bug #48469 to zip, we can use btr_free_root() which frees the page, and also does not require mini-transaction. Approved by Marko. ------------------------------------------------------------------------ r6207 | vasil | 2009-11-20 10:19:14 +0200 (Fri, 20 Nov 2009) | 54 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6198:6206 from branches/5.1: (r6203 was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6200 | vasil | 2009-11-19 12:14:23 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: White space fixup - indent under the opening ( ------------------------------------------------------------------------ r6203 | jyang | 2009-11-19 15:12:22 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Use btr_free_root() instead of fseg_free() for the fix of bug #48469, because fseg_free() is not defined in the zip branch. And we could save one mini-trasaction started by fseg_free(). Approved by Marko. ------------------------------------------------------------------------ r6205 | jyang | 2009-11-20 07:55:48 +0200 (Fri, 20 Nov 2009) | 11 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Add a special case to handle the Duplicated Key error and return DB_ERROR instead. This is to avoid a possible SIGSEGV by mysql error handling re-entering the storage layer for dup key info without proper table handle. This is to prevent a server crash when error situation in bug #45961 "DDL on partitioned innodb tables leaves data dictionary in an inconsistent state" happens. rb://157 approved by Sunny Bains. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 5 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix a minor code formating issue for the parenthesis iplacement of the if condition in rename_table(). ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6208 | vasil | 2009-11-20 10:49:24 +0200 (Fri, 20 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for c6207. ------------------------------------------------------------------------ r6210 | vasil | 2009-11-20 23:39:48 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/zip/trx/trx0i_s.c branches/zip: Whitespace fixup. ------------------------------------------------------------------------ r6248 | marko | 2009-11-30 12:19:50 +0200 (Mon, 30 Nov 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: ChangeLog: Document r4922 that was forgotten. ------------------------------------------------------------------------ r6252 | marko | 2009-11-30 12:50:11 +0200 (Mon, 30 Nov 2009) | 23 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/dict/dict0boot.c M /branches/zip/dict/dict0crea.c M /branches/zip/dict/dict0load.c M /branches/zip/dict/dict0mem.c M /branches/zip/fil/fil0fil.c M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/dict0mem.h M /branches/zip/row/row0mysql.c branches/zip: Suppress errors about non-found temporary tables. Write the is_temp flag to SYS_TABLES.MIX_LEN. dict_table_t::flags: Add a flag for is_temporary, DICT_TF2_TEMPORARY. Unlike other flags, this will not be written to the tablespace flags or SYS_TABLES.TYPE, but only to SYS_TABLES.MIX_LEN. dict_build_table_def_step(): Only pass DICT_TF_BITS to tablespaces. dict_check_tablespaces_and_store_max_id(), dict_load_table(): Suppress errors about temporary tables not being found. dict_create_sys_tables_tuple(): Write the DICT_TF2_TEMPORARY flag to SYS_TABLES.MIX_LEN. fil_space_create(), fil_create_new_single_table_tablespace(): Add assertions about space->flags. row_drop_table_for_mysql(): Do not complain about non-found temporary tables. rb://160 approved by Heikki Tuuri. This addresses the second part of Bug #41609 Crash recovery does not work for InnoDB temporary tables. ------------------------------------------------------------------------ r6263 | vasil | 2009-12-01 14:49:05 +0200 (Tue, 01 Dec 2009) | 4 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.6 to 1.0.7 1.0.6 has been released ------------------------------------------------------------------------ r6264 | vasil | 2009-12-01 16:19:44 +0200 (Tue, 01 Dec 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for the release of 1.0.6. ------------------------------------------------------------------------ r6269 | marko | 2009-12-02 11:35:22 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): UNIV_IBUF_DEBUG should not break crash recovery, but UNIV_IBUF_COUNT_DEBUG will. ------------------------------------------------------------------------ r6270 | marko | 2009-12-02 11:36:47 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): Log the zlib version. ------------------------------------------------------------------------ r6271 | marko | 2009-12-02 11:43:49 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: ChangeLog: Document that since r6270, the zlib version number will be displayed at start-up. ------------------------------------------------------------------------ r6272 | marko | 2009-12-02 11:46:05 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: Revert changes that were accidentally committed in r6271. ------------------------------------------------------------------------ r6274 | marko | 2009-12-03 14:47:12 +0200 (Thu, 03 Dec 2009) | 6 lines Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_table_check_for_dup_indexes(): Assert that the data dictionary mutex is being held while table->indexes is accessed. This is already the case. Currently, only dict_table_get_next_index() and dict_table_get_first_index() are being invoked without holding dict_sys->mutex. ------------------------------------------------------------------------ r6275 | pekka | 2009-12-03 18:32:47 +0200 (Thu, 03 Dec 2009) | 10 lines Changed paths: M /branches/zip/include/log0recv.h M /branches/zip/include/trx0sys.h M /branches/zip/log/log0recv.c M /branches/zip/trx/trx0sys.c branches/zip: Minor changes which allow build with UNIV_HOTBACKUP defined to succeed: include/trx0sys.h: Allow Hot Backup build to see some TRX_SYS_DOUBLEWRITE_... macros. trx/trx0sys.c: Exclude trx_sys_close() function from Hot Backup build. log/log0recv.[ch]: Exclude recv_sys_var_init() function from Hot Backup build. This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r6277 | marko | 2009-12-08 11:13:36 +0200 (Tue, 08 Dec 2009) | 1 line Changed paths: M /branches/zip/fsp/fsp0fsp.c branches/zip: fsp0fsp.c: Add some missing in/out and const qualifiers. ------------------------------------------------------------------------ r6285 | marko | 2009-12-09 09:24:50 +0200 (Wed, 09 Dec 2009) | 13 lines Changed paths: M /branches/zip/row/row0sel.c branches/zip: row_sel_fetch_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns, Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This redundant code was noticed by Sergey Petrunya on the MySQL internals list. ------------------------------------------------------------------------ r6288 | marko | 2009-12-09 09:51:00 +0200 (Wed, 09 Dec 2009) | 15 lines Changed paths: M /branches/zip/row/row0upd.c branches/zip: row_upd_copy_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns. Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This is similar to the redundant code in row_sel_fetch_columns() that was noticed by Sergey Petrunya on the MySQL internals list and removed in r6285. As far as I can tell, there are no redundant UNIV_SQL_NULL assignments remaining after this change. ------------------------------------------------------------------------ r6305 | marko | 2009-12-14 13:03:57 +0200 (Mon, 14 Dec 2009) | 2 lines Changed paths: M /branches/zip/row/row0umod.c branches/zip: row_undo_mod_del_unmark_sec_and_undo_update(): Add a missing const qualifier. ------------------------------------------------------------------------ r6309 | marko | 2009-12-15 14:05:50 +0200 (Tue, 15 Dec 2009) | 3 lines Changed paths: M /branches/zip/lock/lock0lock.c branches/zip: lock_rec_insert_check_and_lock(): Avoid casting away constness. Use page_rec_get_next_const() instead. This silences a gcc 4.2.4 warning. Reported by Sunny Bains. ------------------------------------------------------------------------ r6312 | marko | 2009-12-16 10:10:36 +0200 (Wed, 16 Dec 2009) | 6 lines Changed paths: M /branches/zip/fil/fil0fil.c branches/zip: fil_close(): Add #ifndef UNIV_HOTBACKUP around a debug assertion on mutex.magic_n. InnoDB Hot Backup is a single-threaded program and does not contain mutexes. This change allows InnoDB Hot Backup to be compiled with UNIV_DEBUG. Suggested by Michael Izioumtchenko. ------------------------------------------------------------------------ r6321 | marko | 2009-12-16 16:16:33 +0200 (Wed, 16 Dec 2009) | 4 lines Changed paths: M /branches/zip/row/row0merge.c branches/zip: row_merge_drop_temp_indexes(): Revert a hack to transaction isolation level that was made unnecessary by r5826 (Issue #337). When this function is called, any active data dictionary transaction should have been rolled back. ------------------------------------------------------------------------ r6345 | marko | 2009-12-21 10:46:14 +0200 (Mon, 21 Dec 2009) | 7 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_scan_log_recs(): Non-functional change: Replace a debug assertion ut_ad(len > 0) with ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE). This change is only for readability, for Issue #428. Another assertion on len being an integer multiple of OS_FILE_LOG_BLOCK_SIZE already ensured together with the old ut_ad(len > 0) that actually len must be at least OS_FILE_LOG_BLOCK_SIZE. ------------------------------------------------------------------------ r6346 | marko | 2009-12-21 12:03:25 +0200 (Mon, 21 Dec 2009) | 2 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_recovery_from_checkpoint_finish(): Revert a change that was accidentally committed in r6345. ------------------------------------------------------------------------ r6348 | marko | 2009-12-22 11:04:34 +0200 (Tue, 22 Dec 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/ha_prototypes.h M /branches/zip/include/trx0trx.h M /branches/zip/lock/lock0lock.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0trx.c branches/zip: Merge a change from MySQL: ------------------------------------------------------------ revno: 3236 committer: Satya B <satya.bn@sun.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 17:48:57 +0530 message: merge to mysql-5.1-bugteam ------------------------------------------------------------ revno: 3234.1.1 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 14:38:40 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? manual merge 5.0-->5.1, updating InnoDB plugin. ------------------------------------------------------------ revno: 1810.3968.13 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-12-01 14:24:44 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? The bug 38816 changed the lock that protects THD::query from LOCK_thread_count to LOCK_thd_data, but didn't update the associated InnoDB functions. 1. The innobase_mysql_prepare_print_arbitrary_thd and the innobase_mysql_end_print_arbitrary_thd InnoDB functions have been removed, since now we have a per-thread mutex: now we don't need to wrap several inter-thread access tries to THD::query with a single global LOCK_thread_count lock, so we can simplify the code. 2. The innobase_mysql_print_thd function has been modified to lock LOCK_thd_data in direct way. ------------------------------------------------------------------------ r6351 | marko | 2009-12-22 11:11:18 +0200 (Tue, 22 Dec 2009) | 1 line Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Remove an obsolete declaration of LOCK_thread_count. ------------------------------------------------------------------------ r6352 | marko | 2009-12-22 12:33:01 +0200 (Tue, 22 Dec 2009) | 104 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/lock0lock.h M /branches/zip/include/srv0srv.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/mysql-test/innodb-autoinc.result M /branches/zip/mysql-test/innodb-autoinc.test M /branches/zip/row/row0sel.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c branches/zip: Merge revisions 6206:6350 from branches/5.1, except r6347, r6349, r6350 which were committed separately to both branches, and r6310, which was backported from zip to 5.1. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Non-functional change, fix formatting. ------------------------------------------------------------------------ r6230 | sunny | 2009-11-24 23:52:43 +0200 (Tue, 24 Nov 2009) | 3 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result branches/5.1: Fix autoinc failing test results. (this should be skipped when merging 5.1 into zip) ------------------------------------------------------------------------ r6231 | sunny | 2009-11-25 10:26:27 +0200 (Wed, 25 Nov 2009) | 7 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: Fix BUG#49032 - auto_increment field does not initialize to last value in InnoDB Storage Engine. We use the appropriate function to read the column value for non-integer autoinc column types, namely float and double. rb://208. Approved by Marko. ------------------------------------------------------------------------ r6232 | sunny | 2009-11-25 10:27:39 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix white space errors. ------------------------------------------------------------------------ r6233 | sunny | 2009-11-25 10:28:35 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/include/mach0data.h M /branches/5.1/include/mach0data.ic M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix tests and make read float/double arg const. ------------------------------------------------------------------------ r6234 | sunny | 2009-11-25 10:29:03 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix whitepsace issues. ------------------------------------------------------------------------ r6235 | sunny | 2009-11-26 01:14:42 +0200 (Thu, 26 Nov 2009) | 9 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Fix Bug#47720 - REPLACE INTO Autoincrement column with negative values. This bug is similiar to the negative autoinc filter patch from earlier, with the additional handling of filtering out the negative column values set explicitly by the user. rb://184 Approved by Heikki. ------------------------------------------------------------------------ r6242 | vasil | 2009-11-27 22:07:12 +0200 (Fri, 27 Nov 2009) | 4 lines Changed paths: M /branches/5.1/export.sh branches/5.1: Minor changes to support plugin snapshots. ------------------------------------------------------------------------ r6306 | calvin | 2009-12-14 15:12:46 +0200 (Mon, 14 Dec 2009) | 5 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: fix bug#49267: innodb-autoinc.test fails on windows because of different case mode There is no change to the InnoDB code, only to fix test case by changing "T1" to "t1". ------------------------------------------------------------------------ r6324 | jyang | 2009-12-17 06:54:24 +0200 (Thu, 17 Dec 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/lock0lock.h M /branches/5.1/include/srv0srv.h M /branches/5.1/lock/lock0lock.c M /branches/5.1/log/log0log.c M /branches/5.1/srv/srv0srv.c M /branches/5.1/srv/srv0start.c branches/5.1: Fix bug #47814 - Diagnostics are frequently not printed after a long lock wait in InnoDB. Separate out the lock wait timeout check thread from monitor information printing thread. rb://200 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6364 | marko | 2009-12-26 21:06:31 +0200 (Sat, 26 Dec 2009) | 4 lines Changed paths: M /branches/zip/ibuf/ibuf0ibuf.c branches/zip: ibuf_bitmap_get_map_page(): Define a wrapper macro that passes __FILE__, __LINE__ of the caller to buf_page_get_gen(). This will ease the diagnosis of the likes of Issue #135. ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 6130:6364 from branches/zip: ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 11:42:56 +0200 (Mon, 02 Nov 2009) | 9 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0sea.c M /branches/zip/buf/buf0buf.c M /branches/zip/dict/dict0dict.c M /branches/zip/fil/fil0fil.c M /branches/zip/ibuf/ibuf0ibuf.c M /branches/zip/include/btr0sea.h M /branches/zip/include/dict0dict.h M /branches/zip/include/fil0fil.h M /branches/zip/include/ibuf0ibuf.h M /branches/zip/include/lock0lock.h M /branches/zip/include/log0log.h M /branches/zip/include/log0recv.h M /branches/zip/include/mem0mem.h M /branches/zip/include/mem0pool.h M /branches/zip/include/os0file.h M /branches/zip/include/pars0pars.h M /branches/zip/include/srv0srv.h M /branches/zip/include/thr0loc.h M /branches/zip/include/trx0i_s.h M /branches/zip/include/trx0purge.h M /branches/zip/include/trx0rseg.h M /branches/zip/include/trx0sys.h M /branches/zip/include/trx0undo.h M /branches/zip/include/usr0sess.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/log/log0recv.c M /branches/zip/mem/mem0dbg.c M /branches/zip/mem/mem0pool.c M /branches/zip/os/os0file.c M /branches/zip/os/os0sync.c M /branches/zip/os/os0thread.c M /branches/zip/pars/lexyy.c M /branches/zip/pars/pars0lex.l M /branches/zip/que/que0que.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c M /branches/zip/sync/sync0arr.c M /branches/zip/sync/sync0sync.c M /branches/zip/thr/thr0loc.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0purge.c M /branches/zip/trx/trx0rseg.c M /branches/zip/trx/trx0sys.c M /branches/zip/trx/trx0undo.c M /branches/zip/usr/usr0sess.c M /branches/zip/ut/ut0mem.c branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 09:57:29 +0200 (Wed, 04 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ r6137 | marko | 2009-11-04 15:24:28 +0200 (Wed, 04 Nov 2009) | 1 line Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_index_too_big_for_undo(): Correct a typo. ------------------------------------------------------------------------ r6153 | vasil | 2009-11-10 15:33:22 +0200 (Tue, 10 Nov 2009) | 145 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6125:6152 from branches/5.1: (everything except the last white-space change was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6127 | vasil | 2009-10-30 11:18:25 +0200 (Fri, 30 Oct 2009) | 18 lines Changed paths: M /branches/5.1/Makefile.am M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Backport c6121 from branches/zip: ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 01:42:11 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/zip/mysql-test/innodb-autoinc.result branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6129 | vasil | 2009-10-30 17:14:22 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/Makefile.am branches/5.1: Revert a change to Makefile.am that sneaked unnoticed in c6127. ------------------------------------------------------------------------ r6136 | marko | 2009-11-04 12:28:10 +0200 (Wed, 04 Nov 2009) | 15 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/ha_prototypes.h M /branches/5.1/ut/ut0ut.c branches/5.1: Port r6134 from branches/zip: ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 07:57:29 +0000 (Wed, 04 Nov 2009) | 5 lines branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ innobase_print_identifier(): Replace with innobase_convert_name(). innobase_convert_identifier(): New function, called by innobase_convert_name(). ------------------------------------------------------------------------ r6149 | vasil | 2009-11-09 11:15:01 +0200 (Mon, 09 Nov 2009) | 5 lines Changed paths: M /branches/5.1/CMakeLists.txt branches/5.1: Followup to r5700: Adjust the changes so they are the same as in the BZR repository. ------------------------------------------------------------------------ r6150 | vasil | 2009-11-09 11:43:31 +0200 (Mon, 09 Nov 2009) | 58 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a part of r2911.5.5 from MySQL: (the other part of this was merged in c5700) ------------------------------------------------------------ revno: 2911.5.5 committer: Vladislav Vaintroub <vvaintroub@mysql.com> branch nick: 5.1-innodb_plugin timestamp: Wed 2009-06-10 10:59:49 +0200 message: Backport WL#3653 to 5.1 to enable bundled innodb plugin. Remove custom DLL loader code from innodb plugin code, use symbols exported from mysqld. removed: storage/innodb_plugin/handler/handler0vars.h storage/innodb_plugin/handler/win_delay_loader.cc added: storage/mysql_storage_engine.cmake win/create_def_file.js modified: CMakeLists.txt include/m_ctype.h include/my_global.h include/my_sys.h include/mysql/plugin.h libmysqld/CMakeLists.txt mysql-test/mysql-test-run.pl mysql-test/t/plugin.test mysql-test/t/plugin_load-master.opt mysys/charset.c sql/CMakeLists.txt sql/handler.h sql/mysql_priv.h sql/mysqld.cc sql/sql_class.cc sql/sql_class.h sql/sql_list.h sql/sql_profile.h storage/Makefile.am storage/archive/CMakeLists.txt storage/blackhole/CMakeLists.txt storage/csv/CMakeLists.txt storage/example/CMakeLists.txt storage/federated/CMakeLists.txt storage/heap/CMakeLists.txt storage/innobase/CMakeLists.txt storage/innobase/handler/ha_innodb.cc storage/innodb_plugin/CMakeLists.txt storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/handler0alter.cc storage/innodb_plugin/handler/i_s.cc storage/innodb_plugin/plug.in storage/myisam/CMakeLists.txt storage/myisammrg/CMakeLists.txt win/Makefile.am win/configure.js ------------------------------------------------------------------------ r6152 | vasil | 2009-11-10 15:30:20 +0200 (Tue, 10 Nov 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6157 | jyang | 2009-11-11 14:27:09 +0200 (Wed, 11 Nov 2009) | 10 lines Changed paths: M /branches/zip/handler/ha_innodb.cc A /branches/zip/mysql-test/innodb_bug47167.result A /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_file_format.result branches/zip: Fix an issue that a local variable defined in innodb_file_format_check_validate() is being referenced across function in innodb_file_format_check_update(). In addition, fix "set global innodb_file_format_check = DEFAULT" call. Bug #47167: "set global innodb_file_format_check" cannot set value by User-Defined Variable." rb://169 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6159 | vasil | 2009-11-11 15:13:01 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ r6160 | vasil | 2009-11-11 15:33:49 +0200 (Wed, 11 Nov 2009) | 72 lines Changed paths: M /branches/zip/include/os0file.h M /branches/zip/os/os0file.c branches/zip: Merge r6152:6159 from branches/5.1: (r6158 was skipped as an equivallent change has already been merged from MySQL) ------------------------------------------------------------------------ r6154 | calvin | 2009-11-11 02:51:17 +0200 (Wed, 11 Nov 2009) | 17 lines Changed paths: M /branches/5.1/include/os0file.h M /branches/5.1/os/os0file.c branches/5.1: fix bug#3139: Mysql crashes: 'windows error 995' after several selects on a large DB During stress environment, Windows AIO may fail with error code ERROR_OPERATION_ABORTED. InnoDB does not handle the error, rather crashes. The cause of the error is unknown, but likely due to faulty hardware or driver. This patch introduces a new error code OS_FILE_OPERATION_ABORTED, which maps to Windows ERROR_OPERATION_ABORTED (995). When the error is detected during AIO, the InnoDB will issue a synchronous retry (read/write). This patch has been extensively tested by MySQL support. Approved by: Marko rb://196 ------------------------------------------------------------------------ r6158 | vasil | 2009-11-11 14:52:14 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/handler/ha_innodb.h branches/5.1: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6161 | vasil | 2009-11-11 15:36:16 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add changelog entry for r6160. ------------------------------------------------------------------------ r6162 | vasil | 2009-11-11 16:00:12 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog for r6157. ------------------------------------------------------------------------ r6163 | calvin | 2009-11-11 17:53:20 +0200 (Wed, 11 Nov 2009) | 8 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Exclude thd_binlog_filter_ok() when building with older version of MySQL. thd_binlog_filter_ok() is introduced in MySQL 5.1.41. But the plugin can be built with MySQL prior to 5.1.41. Approved by Heikki (on IM). ------------------------------------------------------------------------ r6169 | calvin | 2009-11-12 14:40:43 +0200 (Thu, 12 Nov 2009) | 6 lines Changed paths: A /branches/zip/mysql-test/innodb_bug46676.result A /branches/zip/mysql-test/innodb_bug46676.test branches/zip: add test case for bug#46676 This crash is reproducible with InnoDB plugin 1.0.4 + MySQL 5.1.37. But no longer reproducible after MySQL 5.1.38 (with plugin 1.0.5). Add test case to catch future regression. ------------------------------------------------------------------------ r6170 | marko | 2009-11-12 15:49:08 +0200 (Thu, 12 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/db0err.h M /branches/zip/row/row0merge.c M /branches/zip/row/row0mysql.c branches/zip: Allow CREATE INDEX to be interrupted. (Issue #354) rb://183 approved by Heikki Tuuri ------------------------------------------------------------------------ r6175 | vasil | 2009-11-16 20:07:39 +0200 (Mon, 16 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Wrap line at 78th char in the ChangeLog ------------------------------------------------------------------------ r6177 | calvin | 2009-11-16 20:20:38 +0200 (Mon, 16 Nov 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog branches/zip: add an entry to ChangeLog for r6065 ------------------------------------------------------------------------ r6179 | marko | 2009-11-17 10:19:34 +0200 (Tue, 17 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: ha_innobase::change_active_index(): When the history is missing, report it to the client, not to the error log. ------------------------------------------------------------------------ r6181 | vasil | 2009-11-17 12:21:41 +0200 (Tue, 17 Nov 2009) | 33 lines Changed paths: M /branches/zip/mysql-test/innodb-index.test branches/zip: At the end of innodb-index.test: restore the environment as it was before the test was started to silence this warning: MTR's internal check of the test case 'main.innodb-index' failed. This means that the test case does not preserve the state that existed before the test case was executed. Most likely the test case did not do a proper clean-up. This is the diff of the states of the servers before and after the test case was executed: mysqltest: Logging to '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.log'. mysqltest: Results saved in '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result'. mysqltest: Connecting to server localhost:13000 (socket /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/mysqld.1.sock) as 'root', connection 'default', attempt 0 ... mysqltest: ... Connected. mysqltest: Start processing test commands from './include/check-testcase.test' ... mysqltest: ... Done processing test commands. --- /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result 2009-11-17 13:10:40.000000000 +0300 +++ /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.reject 2009-11-17 13:10:54.000000000 +0300 @@ -84,7 +84,7 @@ INNODB_DOUBLEWRITE ON INNODB_FAST_SHUTDOWN 1 INNODB_FILE_FORMAT Antelope -INNODB_FILE_FORMAT_CHECK Antelope +INNODB_FILE_FORMAT_CHECK Barracuda INNODB_FILE_PER_TABLE OFF INNODB_FLUSH_LOG_AT_TRX_COMMIT 1 INNODB_FLUSH_METHOD mysqltest: Result content mismatch not ok ------------------------------------------------------------------------ r6182 | marko | 2009-11-17 13:49:15 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-consistent.result M /branches/zip/mysql-test/innodb-consistent.test M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc.result M /branches/zip/mysql-test/innodb-use-sys-malloc.test M /branches/zip/mysql-test/innodb_bug21704.result M /branches/zip/mysql-test/innodb_bug21704.test M /branches/zip/mysql-test/innodb_bug40360.test M /branches/zip/mysql-test/innodb_bug40565.result M /branches/zip/mysql-test/innodb_bug40565.test M /branches/zip/mysql-test/innodb_bug41904.result M /branches/zip/mysql-test/innodb_bug41904.test M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test M /branches/zip/mysql-test/innodb_bug44032.result M /branches/zip/mysql-test/innodb_bug44032.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test M /branches/zip/mysql-test/innodb_bug44571.result M /branches/zip/mysql-test/innodb_bug44571.test M /branches/zip/mysql-test/innodb_bug45357.test M /branches/zip/mysql-test/innodb_bug46000.result M /branches/zip/mysql-test/innodb_bug46000.test M /branches/zip/mysql-test/innodb_bug46676.result M /branches/zip/mysql-test/innodb_bug46676.test M /branches/zip/mysql-test/innodb_bug47167.result M /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_bug47777.result M /branches/zip/mysql-test/innodb_bug47777.test M /branches/zip/mysql-test/innodb_file_format.result M /branches/zip/mysql-test/innodb_file_format.test branches/zip: Set svn:eol-style on mysql-test files. ------------------------------------------------------------------------ r6183 | marko | 2009-11-17 13:51:16 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-master.opt M /branches/zip/mysql-test/innodb-semi-consistent-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt branches/zip: Prepend loose_ to plugin-only mysql-test options. ------------------------------------------------------------------------ r6184 | marko | 2009-11-17 13:52:01 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-index.result M /branches/zip/mysql-test/innodb-index.test branches/zip: innodb-index.test: Restore innodb_file_format_check. ------------------------------------------------------------------------ r6185 | marko | 2009-11-17 16:44:20 +0200 (Tue, 17 Nov 2009) | 16 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb.result M /branches/zip/mysql-test/innodb.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test D /branches/zip/mysql-test/patches/innodb-index.diff M /branches/zip/row/row0mysql.c branches/zip: Report duplicate table names to the client connection, not to the error log. This change will allow innodb-index.test to be re-enabled. It was previously disabled, because mysql-test-run does not like output in the error log. row_create_table_for_mysql(): Do not output anything to the error log when reporting DB_DUPLICATE_KEY. Let the caller report the error. Add a TODO comment that the dict_table_t object is apparently not freed when an error occurs. create_table_def(): Convert InnoDB table names to the character set of the client connection for reporting. Use my_error(ER_WRONG_COLUMN_NAME) for reporting reserved column names. Report my_error(ER_TABLE_EXISTS_ERROR) when row_create_table_for_mysql() returns DB_DUPLICATE_KEY. rb://206 ------------------------------------------------------------------------ r6186 | vasil | 2009-11-17 16:48:14 +0200 (Tue, 17 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6185. ------------------------------------------------------------------------ r6189 | marko | 2009-11-18 11:36:18 +0200 (Wed, 18 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): When creating the primary key and the table is being locked by another transaction, do not attempt to drop the table. (Bug #48782) Approved by Sunny Bains over IM ------------------------------------------------------------------------ r6194 | vasil | 2009-11-19 09:24:45 +0200 (Thu, 19 Nov 2009) | 5 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.5 to 1.0.6 since 1.0.5 was just released by MySQL and we will soon release 1.0.6. ------------------------------------------------------------------------ r6197 | calvin | 2009-11-19 09:32:55 +0200 (Thu, 19 Nov 2009) | 6 lines Changed paths: M /branches/zip/CMakeLists.txt branches/zip: merge the fix of bug#48317 (CMake file) Due to MySQL changes to the CMake, it is no longer able to build InnoDB plugin as a static library on Windows. The fix is proposed by Vlad of MySQL. ------------------------------------------------------------------------ r6198 | vasil | 2009-11-19 09:44:31 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6197. ------------------------------------------------------------------------ r6199 | vasil | 2009-11-19 12:10:12 +0200 (Thu, 19 Nov 2009) | 31 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0btr.c M /branches/zip/data/data0type.c branches/zip: Merge r6159:6198 from branches/5.1: ------------------------------------------------------------------------ r6187 | jyang | 2009-11-18 05:27:30 +0200 (Wed, 18 Nov 2009) | 9 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Fix bug #48469 "when innodb tablespace is configured too small, crash and corruption!". Function btr_create() did not check the return status of fseg_create(), and continue the index creation even there is no sufficient space. rb://205 Approved by Marko ------------------------------------------------------------------------ r6188 | jyang | 2009-11-18 07:14:23 +0200 (Wed, 18 Nov 2009) | 8 lines Changed paths: M /branches/5.1/data/data0type.c branches/5.1: Fix bug #48526 "Data type for float and double is incorrectly reported in InnoDB table monitor". Certain datatypes are not printed correctly in dtype_print(). rb://204 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6201 | marko | 2009-11-19 14:09:11 +0200 (Thu, 19 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): Clarify the comment on orphaned tables when creating a primary key. ------------------------------------------------------------------------ r6202 | jyang | 2009-11-19 15:01:00 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/zip/btr/btr0btr.c branches/zip: Function fseg_free() is no longer defined in branches/zip. To port fix for bug #48469 to zip, we can use btr_free_root() which frees the page, and also does not require mini-transaction. Approved by Marko. ------------------------------------------------------------------------ r6207 | vasil | 2009-11-20 10:19:14 +0200 (Fri, 20 Nov 2009) | 54 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6198:6206 from branches/5.1: (r6203 was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6200 | vasil | 2009-11-19 12:14:23 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: White space fixup - indent under the opening ( ------------------------------------------------------------------------ r6203 | jyang | 2009-11-19 15:12:22 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Use btr_free_root() instead of fseg_free() for the fix of bug #48469, because fseg_free() is not defined in the zip branch. And we could save one mini-trasaction started by fseg_free(). Approved by Marko. ------------------------------------------------------------------------ r6205 | jyang | 2009-11-20 07:55:48 +0200 (Fri, 20 Nov 2009) | 11 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Add a special case to handle the Duplicated Key error and return DB_ERROR instead. This is to avoid a possible SIGSEGV by mysql error handling re-entering the storage layer for dup key info without proper table handle. This is to prevent a server crash when error situation in bug #45961 "DDL on partitioned innodb tables leaves data dictionary in an inconsistent state" happens. rb://157 approved by Sunny Bains. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 5 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix a minor code formating issue for the parenthesis iplacement of the if condition in rename_table(). ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6208 | vasil | 2009-11-20 10:49:24 +0200 (Fri, 20 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for c6207. ------------------------------------------------------------------------ r6210 | vasil | 2009-11-20 23:39:48 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/zip/trx/trx0i_s.c branches/zip: Whitespace fixup. ------------------------------------------------------------------------ r6248 | marko | 2009-11-30 12:19:50 +0200 (Mon, 30 Nov 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: ChangeLog: Document r4922 that was forgotten. ------------------------------------------------------------------------ r6252 | marko | 2009-11-30 12:50:11 +0200 (Mon, 30 Nov 2009) | 23 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/dict/dict0boot.c M /branches/zip/dict/dict0crea.c M /branches/zip/dict/dict0load.c M /branches/zip/dict/dict0mem.c M /branches/zip/fil/fil0fil.c M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/dict0mem.h M /branches/zip/row/row0mysql.c branches/zip: Suppress errors about non-found temporary tables. Write the is_temp flag to SYS_TABLES.MIX_LEN. dict_table_t::flags: Add a flag for is_temporary, DICT_TF2_TEMPORARY. Unlike other flags, this will not be written to the tablespace flags or SYS_TABLES.TYPE, but only to SYS_TABLES.MIX_LEN. dict_build_table_def_step(): Only pass DICT_TF_BITS to tablespaces. dict_check_tablespaces_and_store_max_id(), dict_load_table(): Suppress errors about temporary tables not being found. dict_create_sys_tables_tuple(): Write the DICT_TF2_TEMPORARY flag to SYS_TABLES.MIX_LEN. fil_space_create(), fil_create_new_single_table_tablespace(): Add assertions about space->flags. row_drop_table_for_mysql(): Do not complain about non-found temporary tables. rb://160 approved by Heikki Tuuri. This addresses the second part of Bug #41609 Crash recovery does not work for InnoDB temporary tables. ------------------------------------------------------------------------ r6263 | vasil | 2009-12-01 14:49:05 +0200 (Tue, 01 Dec 2009) | 4 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.6 to 1.0.7 1.0.6 has been released ------------------------------------------------------------------------ r6264 | vasil | 2009-12-01 16:19:44 +0200 (Tue, 01 Dec 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for the release of 1.0.6. ------------------------------------------------------------------------ r6269 | marko | 2009-12-02 11:35:22 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): UNIV_IBUF_DEBUG should not break crash recovery, but UNIV_IBUF_COUNT_DEBUG will. ------------------------------------------------------------------------ r6270 | marko | 2009-12-02 11:36:47 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): Log the zlib version. ------------------------------------------------------------------------ r6271 | marko | 2009-12-02 11:43:49 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: ChangeLog: Document that since r6270, the zlib version number will be displayed at start-up. ------------------------------------------------------------------------ r6272 | marko | 2009-12-02 11:46:05 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: Revert changes that were accidentally committed in r6271. ------------------------------------------------------------------------ r6274 | marko | 2009-12-03 14:47:12 +0200 (Thu, 03 Dec 2009) | 6 lines Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_table_check_for_dup_indexes(): Assert that the data dictionary mutex is being held while table->indexes is accessed. This is already the case. Currently, only dict_table_get_next_index() and dict_table_get_first_index() are being invoked without holding dict_sys->mutex. ------------------------------------------------------------------------ r6275 | pekka | 2009-12-03 18:32:47 +0200 (Thu, 03 Dec 2009) | 10 lines Changed paths: M /branches/zip/include/log0recv.h M /branches/zip/include/trx0sys.h M /branches/zip/log/log0recv.c M /branches/zip/trx/trx0sys.c branches/zip: Minor changes which allow build with UNIV_HOTBACKUP defined to succeed: include/trx0sys.h: Allow Hot Backup build to see some TRX_SYS_DOUBLEWRITE_... macros. trx/trx0sys.c: Exclude trx_sys_close() function from Hot Backup build. log/log0recv.[ch]: Exclude recv_sys_var_init() function from Hot Backup build. This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r6277 | marko | 2009-12-08 11:13:36 +0200 (Tue, 08 Dec 2009) | 1 line Changed paths: M /branches/zip/fsp/fsp0fsp.c branches/zip: fsp0fsp.c: Add some missing in/out and const qualifiers. ------------------------------------------------------------------------ r6285 | marko | 2009-12-09 09:24:50 +0200 (Wed, 09 Dec 2009) | 13 lines Changed paths: M /branches/zip/row/row0sel.c branches/zip: row_sel_fetch_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns, Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This redundant code was noticed by Sergey Petrunya on the MySQL internals list. ------------------------------------------------------------------------ r6288 | marko | 2009-12-09 09:51:00 +0200 (Wed, 09 Dec 2009) | 15 lines Changed paths: M /branches/zip/row/row0upd.c branches/zip: row_upd_copy_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns. Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This is similar to the redundant code in row_sel_fetch_columns() that was noticed by Sergey Petrunya on the MySQL internals list and removed in r6285. As far as I can tell, there are no redundant UNIV_SQL_NULL assignments remaining after this change. ------------------------------------------------------------------------ r6305 | marko | 2009-12-14 13:03:57 +0200 (Mon, 14 Dec 2009) | 2 lines Changed paths: M /branches/zip/row/row0umod.c branches/zip: row_undo_mod_del_unmark_sec_and_undo_update(): Add a missing const qualifier. ------------------------------------------------------------------------ r6309 | marko | 2009-12-15 14:05:50 +0200 (Tue, 15 Dec 2009) | 3 lines Changed paths: M /branches/zip/lock/lock0lock.c branches/zip: lock_rec_insert_check_and_lock(): Avoid casting away constness. Use page_rec_get_next_const() instead. This silences a gcc 4.2.4 warning. Reported by Sunny Bains. ------------------------------------------------------------------------ r6312 | marko | 2009-12-16 10:10:36 +0200 (Wed, 16 Dec 2009) | 6 lines Changed paths: M /branches/zip/fil/fil0fil.c branches/zip: fil_close(): Add #ifndef UNIV_HOTBACKUP around a debug assertion on mutex.magic_n. InnoDB Hot Backup is a single-threaded program and does not contain mutexes. This change allows InnoDB Hot Backup to be compiled with UNIV_DEBUG. Suggested by Michael Izioumtchenko. ------------------------------------------------------------------------ r6321 | marko | 2009-12-16 16:16:33 +0200 (Wed, 16 Dec 2009) | 4 lines Changed paths: M /branches/zip/row/row0merge.c branches/zip: row_merge_drop_temp_indexes(): Revert a hack to transaction isolation level that was made unnecessary by r5826 (Issue #337). When this function is called, any active data dictionary transaction should have been rolled back. ------------------------------------------------------------------------ r6345 | marko | 2009-12-21 10:46:14 +0200 (Mon, 21 Dec 2009) | 7 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_scan_log_recs(): Non-functional change: Replace a debug assertion ut_ad(len > 0) with ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE). This change is only for readability, for Issue #428. Another assertion on len being an integer multiple of OS_FILE_LOG_BLOCK_SIZE already ensured together with the old ut_ad(len > 0) that actually len must be at least OS_FILE_LOG_BLOCK_SIZE. ------------------------------------------------------------------------ r6346 | marko | 2009-12-21 12:03:25 +0200 (Mon, 21 Dec 2009) | 2 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_recovery_from_checkpoint_finish(): Revert a change that was accidentally committed in r6345. ------------------------------------------------------------------------ r6348 | marko | 2009-12-22 11:04:34 +0200 (Tue, 22 Dec 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/ha_prototypes.h M /branches/zip/include/trx0trx.h M /branches/zip/lock/lock0lock.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0trx.c branches/zip: Merge a change from MySQL: ------------------------------------------------------------ revno: 3236 committer: Satya B <satya.bn@sun.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 17:48:57 +0530 message: merge to mysql-5.1-bugteam ------------------------------------------------------------ revno: 3234.1.1 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 14:38:40 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? manual merge 5.0-->5.1, updating InnoDB plugin. ------------------------------------------------------------ revno: 1810.3968.13 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-12-01 14:24:44 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? The bug 38816 changed the lock that protects THD::query from LOCK_thread_count to LOCK_thd_data, but didn't update the associated InnoDB functions. 1. The innobase_mysql_prepare_print_arbitrary_thd and the innobase_mysql_end_print_arbitrary_thd InnoDB functions have been removed, since now we have a per-thread mutex: now we don't need to wrap several inter-thread access tries to THD::query with a single global LOCK_thread_count lock, so we can simplify the code. 2. The innobase_mysql_print_thd function has been modified to lock LOCK_thd_data in direct way. ------------------------------------------------------------------------ r6351 | marko | 2009-12-22 11:11:18 +0200 (Tue, 22 Dec 2009) | 1 line Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Remove an obsolete declaration of LOCK_thread_count. ------------------------------------------------------------------------ r6352 | marko | 2009-12-22 12:33:01 +0200 (Tue, 22 Dec 2009) | 104 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/lock0lock.h M /branches/zip/include/srv0srv.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/mysql-test/innodb-autoinc.result M /branches/zip/mysql-test/innodb-autoinc.test M /branches/zip/row/row0sel.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c branches/zip: Merge revisions 6206:6350 from branches/5.1, except r6347, r6349, r6350 which were committed separately to both branches, and r6310, which was backported from zip to 5.1. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Non-functional change, fix formatting. ------------------------------------------------------------------------ r6230 | sunny | 2009-11-24 23:52:43 +0200 (Tue, 24 Nov 2009) | 3 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result branches/5.1: Fix autoinc failing test results. (this should be skipped when merging 5.1 into zip) ------------------------------------------------------------------------ r6231 | sunny | 2009-11-25 10:26:27 +0200 (Wed, 25 Nov 2009) | 7 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: Fix BUG#49032 - auto_increment field does not initialize to last value in InnoDB Storage Engine. We use the appropriate function to read the column value for non-integer autoinc column types, namely float and double. rb://208. Approved by Marko. ------------------------------------------------------------------------ r6232 | sunny | 2009-11-25 10:27:39 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix white space errors. ------------------------------------------------------------------------ r6233 | sunny | 2009-11-25 10:28:35 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/include/mach0data.h M /branches/5.1/include/mach0data.ic M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix tests and make read float/double arg const. ------------------------------------------------------------------------ r6234 | sunny | 2009-11-25 10:29:03 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix whitepsace issues. ------------------------------------------------------------------------ r6235 | sunny | 2009-11-26 01:14:42 +0200 (Thu, 26 Nov 2009) | 9 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Fix Bug#47720 - REPLACE INTO Autoincrement column with negative values. This bug is similiar to the negative autoinc filter patch from earlier, with the additional handling of filtering out the negative column values set explicitly by the user. rb://184 Approved by Heikki. ------------------------------------------------------------------------ r6242 | vasil | 2009-11-27 22:07:12 +0200 (Fri, 27 Nov 2009) | 4 lines Changed paths: M /branches/5.1/export.sh branches/5.1: Minor changes to support plugin snapshots. ------------------------------------------------------------------------ r6306 | calvin | 2009-12-14 15:12:46 +0200 (Mon, 14 Dec 2009) | 5 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: fix bug#49267: innodb-autoinc.test fails on windows because of different case mode There is no change to the InnoDB code, only to fix test case by changing "T1" to "t1". ------------------------------------------------------------------------ r6324 | jyang | 2009-12-17 06:54:24 +0200 (Thu, 17 Dec 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/lock0lock.h M /branches/5.1/include/srv0srv.h M /branches/5.1/lock/lock0lock.c M /branches/5.1/log/log0log.c M /branches/5.1/srv/srv0srv.c M /branches/5.1/srv/srv0start.c branches/5.1: Fix bug #47814 - Diagnostics are frequently not printed after a long lock wait in InnoDB. Separate out the lock wait timeout check thread from monitor information printing thread. rb://200 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6364 | marko | 2009-12-26 21:06:31 +0200 (Sat, 26 Dec 2009) | 4 lines Changed paths: M /branches/zip/ibuf/ibuf0ibuf.c branches/zip: ibuf_bitmap_get_map_page(): Define a wrapper macro that passes __FILE__, __LINE__ of the caller to buf_page_get_gen(). This will ease the diagnosis of the likes of Issue #135. ------------------------------------------------------------------------
16 years ago
branches/innodb+: Merge revisions 6130:6364 from branches/zip: ------------------------------------------------------------------------ r6130 | marko | 2009-11-02 11:42:56 +0200 (Mon, 02 Nov 2009) | 9 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0sea.c M /branches/zip/buf/buf0buf.c M /branches/zip/dict/dict0dict.c M /branches/zip/fil/fil0fil.c M /branches/zip/ibuf/ibuf0ibuf.c M /branches/zip/include/btr0sea.h M /branches/zip/include/dict0dict.h M /branches/zip/include/fil0fil.h M /branches/zip/include/ibuf0ibuf.h M /branches/zip/include/lock0lock.h M /branches/zip/include/log0log.h M /branches/zip/include/log0recv.h M /branches/zip/include/mem0mem.h M /branches/zip/include/mem0pool.h M /branches/zip/include/os0file.h M /branches/zip/include/pars0pars.h M /branches/zip/include/srv0srv.h M /branches/zip/include/thr0loc.h M /branches/zip/include/trx0i_s.h M /branches/zip/include/trx0purge.h M /branches/zip/include/trx0rseg.h M /branches/zip/include/trx0sys.h M /branches/zip/include/trx0undo.h M /branches/zip/include/usr0sess.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/log/log0recv.c M /branches/zip/mem/mem0dbg.c M /branches/zip/mem/mem0pool.c M /branches/zip/os/os0file.c M /branches/zip/os/os0sync.c M /branches/zip/os/os0thread.c M /branches/zip/pars/lexyy.c M /branches/zip/pars/pars0lex.l M /branches/zip/que/que0que.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c M /branches/zip/sync/sync0arr.c M /branches/zip/sync/sync0sync.c M /branches/zip/thr/thr0loc.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0purge.c M /branches/zip/trx/trx0rseg.c M /branches/zip/trx/trx0sys.c M /branches/zip/trx/trx0undo.c M /branches/zip/usr/usr0sess.c M /branches/zip/ut/ut0mem.c branches/zip: Free all resources at shutdown. Set pointers to NULL, so that Valgrind will not complain about freed data structures that are reachable via pointers. This addresses Bug #45992 and Bug #46656. This patch is mostly based on changes copied from branches/embedded-1.0, mainly c5432, c3439, c3134, c2994, c2978, but also some other code was copied. Some added cleanup code is specific to MySQL/InnoDB. rb://199 approved by Sunny Bains ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 09:57:29 +0200 (Wed, 04 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ r6137 | marko | 2009-11-04 15:24:28 +0200 (Wed, 04 Nov 2009) | 1 line Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_index_too_big_for_undo(): Correct a typo. ------------------------------------------------------------------------ r6153 | vasil | 2009-11-10 15:33:22 +0200 (Tue, 10 Nov 2009) | 145 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6125:6152 from branches/5.1: (everything except the last white-space change was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6127 | vasil | 2009-10-30 11:18:25 +0200 (Fri, 30 Oct 2009) | 18 lines Changed paths: M /branches/5.1/Makefile.am M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Backport c6121 from branches/zip: ------------------------------------------------------------------------ r6121 | sunny | 2009-10-30 01:42:11 +0200 (Fri, 30 Oct 2009) | 7 lines Changed paths: M /branches/zip/mysql-test/innodb-autoinc.result branches/zip: This test has been problematic for sometime now. The underlying bug is that the data dictionaries get out of sync. In the AUTOINC code we try and apply salve to the symptoms. In the past MySQL made some unrelated change and the dictionaries stopped getting out of sync and this test started to fail. Now, it seems they have reverted that changed and the test is passing again. I suspect this is not he last time that this test will change. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6129 | vasil | 2009-10-30 17:14:22 +0200 (Fri, 30 Oct 2009) | 4 lines Changed paths: M /branches/5.1/Makefile.am branches/5.1: Revert a change to Makefile.am that sneaked unnoticed in c6127. ------------------------------------------------------------------------ r6136 | marko | 2009-11-04 12:28:10 +0200 (Wed, 04 Nov 2009) | 15 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/ha_prototypes.h M /branches/5.1/ut/ut0ut.c branches/5.1: Port r6134 from branches/zip: ------------------------------------------------------------------------ r6134 | marko | 2009-11-04 07:57:29 +0000 (Wed, 04 Nov 2009) | 5 lines branches/zip: innobase_convert_identifier(): Convert table names with explain_filename() to address Bug #32430: 'show innodb status' causes errors Invalid (old?) table or database name in logs. rb://134 approved by Sunny Bains ------------------------------------------------------------------------ innobase_print_identifier(): Replace with innobase_convert_name(). innobase_convert_identifier(): New function, called by innobase_convert_name(). ------------------------------------------------------------------------ r6149 | vasil | 2009-11-09 11:15:01 +0200 (Mon, 09 Nov 2009) | 5 lines Changed paths: M /branches/5.1/CMakeLists.txt branches/5.1: Followup to r5700: Adjust the changes so they are the same as in the BZR repository. ------------------------------------------------------------------------ r6150 | vasil | 2009-11-09 11:43:31 +0200 (Mon, 09 Nov 2009) | 58 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Merge a part of r2911.5.5 from MySQL: (the other part of this was merged in c5700) ------------------------------------------------------------ revno: 2911.5.5 committer: Vladislav Vaintroub <vvaintroub@mysql.com> branch nick: 5.1-innodb_plugin timestamp: Wed 2009-06-10 10:59:49 +0200 message: Backport WL#3653 to 5.1 to enable bundled innodb plugin. Remove custom DLL loader code from innodb plugin code, use symbols exported from mysqld. removed: storage/innodb_plugin/handler/handler0vars.h storage/innodb_plugin/handler/win_delay_loader.cc added: storage/mysql_storage_engine.cmake win/create_def_file.js modified: CMakeLists.txt include/m_ctype.h include/my_global.h include/my_sys.h include/mysql/plugin.h libmysqld/CMakeLists.txt mysql-test/mysql-test-run.pl mysql-test/t/plugin.test mysql-test/t/plugin_load-master.opt mysys/charset.c sql/CMakeLists.txt sql/handler.h sql/mysql_priv.h sql/mysqld.cc sql/sql_class.cc sql/sql_class.h sql/sql_list.h sql/sql_profile.h storage/Makefile.am storage/archive/CMakeLists.txt storage/blackhole/CMakeLists.txt storage/csv/CMakeLists.txt storage/example/CMakeLists.txt storage/federated/CMakeLists.txt storage/heap/CMakeLists.txt storage/innobase/CMakeLists.txt storage/innobase/handler/ha_innodb.cc storage/innodb_plugin/CMakeLists.txt storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/handler0alter.cc storage/innodb_plugin/handler/i_s.cc storage/innodb_plugin/plug.in storage/myisam/CMakeLists.txt storage/myisammrg/CMakeLists.txt win/Makefile.am win/configure.js ------------------------------------------------------------------------ r6152 | vasil | 2009-11-10 15:30:20 +0200 (Tue, 10 Nov 2009) | 4 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: White space fixup. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6157 | jyang | 2009-11-11 14:27:09 +0200 (Wed, 11 Nov 2009) | 10 lines Changed paths: M /branches/zip/handler/ha_innodb.cc A /branches/zip/mysql-test/innodb_bug47167.result A /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_file_format.result branches/zip: Fix an issue that a local variable defined in innodb_file_format_check_validate() is being referenced across function in innodb_file_format_check_update(). In addition, fix "set global innodb_file_format_check = DEFAULT" call. Bug #47167: "set global innodb_file_format_check" cannot set value by User-Defined Variable." rb://169 approved by Sunny Bains and Marko. ------------------------------------------------------------------------ r6159 | vasil | 2009-11-11 15:13:01 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ r6160 | vasil | 2009-11-11 15:33:49 +0200 (Wed, 11 Nov 2009) | 72 lines Changed paths: M /branches/zip/include/os0file.h M /branches/zip/os/os0file.c branches/zip: Merge r6152:6159 from branches/5.1: (r6158 was skipped as an equivallent change has already been merged from MySQL) ------------------------------------------------------------------------ r6154 | calvin | 2009-11-11 02:51:17 +0200 (Wed, 11 Nov 2009) | 17 lines Changed paths: M /branches/5.1/include/os0file.h M /branches/5.1/os/os0file.c branches/5.1: fix bug#3139: Mysql crashes: 'windows error 995' after several selects on a large DB During stress environment, Windows AIO may fail with error code ERROR_OPERATION_ABORTED. InnoDB does not handle the error, rather crashes. The cause of the error is unknown, but likely due to faulty hardware or driver. This patch introduces a new error code OS_FILE_OPERATION_ABORTED, which maps to Windows ERROR_OPERATION_ABORTED (995). When the error is detected during AIO, the InnoDB will issue a synchronous retry (read/write). This patch has been extensively tested by MySQL support. Approved by: Marko rb://196 ------------------------------------------------------------------------ r6158 | vasil | 2009-11-11 14:52:14 +0200 (Wed, 11 Nov 2009) | 37 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/handler/ha_innodb.h branches/5.1: Merge a change from MySQL: (this has been reviewed by Calvin and Marko, and Calvin says Luis has incorporated Marko's suggestions) ------------------------------------------------------------ revno: 3092.5.1 committer: Luis Soares <luis.soares@sun.com> branch nick: mysql-5.1-bugteam timestamp: Thu 2009-09-24 15:52:52 +0100 message: BUG#42829: binlogging enabled for all schemas regardless of binlog-db-db / binlog-ignore-db InnoDB will return an error if statement based replication is used along with transaction isolation level READ-COMMITTED (or weaker), even if the statement in question is filtered out according to the binlog-do-db rules set. In this case, an error should not be printed. This patch addresses this issue by extending the existing check in external_lock to take into account the filter rules before deciding to print an error. Furthermore, it also changes decide_logging_format to take into consideration whether the statement is filtered out from binlog before decision is made. added: mysql-test/suite/binlog/r/binlog_stm_do_db.result mysql-test/suite/binlog/t/binlog_stm_do_db-master.opt mysql-test/suite/binlog/t/binlog_stm_do_db.test modified: sql/sql_base.cc sql/sql_class.cc storage/innobase/handler/ha_innodb.cc storage/innobase/handler/ha_innodb.h storage/innodb_plugin/handler/ha_innodb.cc storage/innodb_plugin/handler/ha_innodb.h ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6161 | vasil | 2009-11-11 15:36:16 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add changelog entry for r6160. ------------------------------------------------------------------------ r6162 | vasil | 2009-11-11 16:00:12 +0200 (Wed, 11 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog for r6157. ------------------------------------------------------------------------ r6163 | calvin | 2009-11-11 17:53:20 +0200 (Wed, 11 Nov 2009) | 8 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/handler/ha_innodb.h branches/zip: Exclude thd_binlog_filter_ok() when building with older version of MySQL. thd_binlog_filter_ok() is introduced in MySQL 5.1.41. But the plugin can be built with MySQL prior to 5.1.41. Approved by Heikki (on IM). ------------------------------------------------------------------------ r6169 | calvin | 2009-11-12 14:40:43 +0200 (Thu, 12 Nov 2009) | 6 lines Changed paths: A /branches/zip/mysql-test/innodb_bug46676.result A /branches/zip/mysql-test/innodb_bug46676.test branches/zip: add test case for bug#46676 This crash is reproducible with InnoDB plugin 1.0.4 + MySQL 5.1.37. But no longer reproducible after MySQL 5.1.38 (with plugin 1.0.5). Add test case to catch future regression. ------------------------------------------------------------------------ r6170 | marko | 2009-11-12 15:49:08 +0200 (Thu, 12 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/db0err.h M /branches/zip/row/row0merge.c M /branches/zip/row/row0mysql.c branches/zip: Allow CREATE INDEX to be interrupted. (Issue #354) rb://183 approved by Heikki Tuuri ------------------------------------------------------------------------ r6175 | vasil | 2009-11-16 20:07:39 +0200 (Mon, 16 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Wrap line at 78th char in the ChangeLog ------------------------------------------------------------------------ r6177 | calvin | 2009-11-16 20:20:38 +0200 (Mon, 16 Nov 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog branches/zip: add an entry to ChangeLog for r6065 ------------------------------------------------------------------------ r6179 | marko | 2009-11-17 10:19:34 +0200 (Tue, 17 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: ha_innobase::change_active_index(): When the history is missing, report it to the client, not to the error log. ------------------------------------------------------------------------ r6181 | vasil | 2009-11-17 12:21:41 +0200 (Tue, 17 Nov 2009) | 33 lines Changed paths: M /branches/zip/mysql-test/innodb-index.test branches/zip: At the end of innodb-index.test: restore the environment as it was before the test was started to silence this warning: MTR's internal check of the test case 'main.innodb-index' failed. This means that the test case does not preserve the state that existed before the test case was executed. Most likely the test case did not do a proper clean-up. This is the diff of the states of the servers before and after the test case was executed: mysqltest: Logging to '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.log'. mysqltest: Results saved in '/tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result'. mysqltest: Connecting to server localhost:13000 (socket /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/mysqld.1.sock) as 'root', connection 'default', attempt 0 ... mysqltest: ... Connected. mysqltest: Start processing test commands from './include/check-testcase.test' ... mysqltest: ... Done processing test commands. --- /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.result 2009-11-17 13:10:40.000000000 +0300 +++ /tmp/autotest.sh-20091117_033000-zip.btyZwu/mysql-5.1/mysql-test/var/tmp/check-mysqld_1.reject 2009-11-17 13:10:54.000000000 +0300 @@ -84,7 +84,7 @@ INNODB_DOUBLEWRITE ON INNODB_FAST_SHUTDOWN 1 INNODB_FILE_FORMAT Antelope -INNODB_FILE_FORMAT_CHECK Antelope +INNODB_FILE_FORMAT_CHECK Barracuda INNODB_FILE_PER_TABLE OFF INNODB_FLUSH_LOG_AT_TRX_COMMIT 1 INNODB_FLUSH_METHOD mysqltest: Result content mismatch not ok ------------------------------------------------------------------------ r6182 | marko | 2009-11-17 13:49:15 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-consistent.result M /branches/zip/mysql-test/innodb-consistent.test M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc.result M /branches/zip/mysql-test/innodb-use-sys-malloc.test M /branches/zip/mysql-test/innodb_bug21704.result M /branches/zip/mysql-test/innodb_bug21704.test M /branches/zip/mysql-test/innodb_bug40360.test M /branches/zip/mysql-test/innodb_bug40565.result M /branches/zip/mysql-test/innodb_bug40565.test M /branches/zip/mysql-test/innodb_bug41904.result M /branches/zip/mysql-test/innodb_bug41904.test M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero.result M /branches/zip/mysql-test/innodb_bug42101-nonzero.test M /branches/zip/mysql-test/innodb_bug42101.result M /branches/zip/mysql-test/innodb_bug42101.test M /branches/zip/mysql-test/innodb_bug44032.result M /branches/zip/mysql-test/innodb_bug44032.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test M /branches/zip/mysql-test/innodb_bug44571.result M /branches/zip/mysql-test/innodb_bug44571.test M /branches/zip/mysql-test/innodb_bug45357.test M /branches/zip/mysql-test/innodb_bug46000.result M /branches/zip/mysql-test/innodb_bug46000.test M /branches/zip/mysql-test/innodb_bug46676.result M /branches/zip/mysql-test/innodb_bug46676.test M /branches/zip/mysql-test/innodb_bug47167.result M /branches/zip/mysql-test/innodb_bug47167.test M /branches/zip/mysql-test/innodb_bug47777.result M /branches/zip/mysql-test/innodb_bug47777.test M /branches/zip/mysql-test/innodb_file_format.result M /branches/zip/mysql-test/innodb_file_format.test branches/zip: Set svn:eol-style on mysql-test files. ------------------------------------------------------------------------ r6183 | marko | 2009-11-17 13:51:16 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-consistent-master.opt M /branches/zip/mysql-test/innodb-master.opt M /branches/zip/mysql-test/innodb-semi-consistent-master.opt M /branches/zip/mysql-test/innodb-use-sys-malloc-master.opt M /branches/zip/mysql-test/innodb_bug42101-nonzero-master.opt branches/zip: Prepend loose_ to plugin-only mysql-test options. ------------------------------------------------------------------------ r6184 | marko | 2009-11-17 13:52:01 +0200 (Tue, 17 Nov 2009) | 1 line Changed paths: M /branches/zip/mysql-test/innodb-index.result M /branches/zip/mysql-test/innodb-index.test branches/zip: innodb-index.test: Restore innodb_file_format_check. ------------------------------------------------------------------------ r6185 | marko | 2009-11-17 16:44:20 +0200 (Tue, 17 Nov 2009) | 16 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/mysql-test/innodb.result M /branches/zip/mysql-test/innodb.test M /branches/zip/mysql-test/innodb_bug44369.result M /branches/zip/mysql-test/innodb_bug44369.test D /branches/zip/mysql-test/patches/innodb-index.diff M /branches/zip/row/row0mysql.c branches/zip: Report duplicate table names to the client connection, not to the error log. This change will allow innodb-index.test to be re-enabled. It was previously disabled, because mysql-test-run does not like output in the error log. row_create_table_for_mysql(): Do not output anything to the error log when reporting DB_DUPLICATE_KEY. Let the caller report the error. Add a TODO comment that the dict_table_t object is apparently not freed when an error occurs. create_table_def(): Convert InnoDB table names to the character set of the client connection for reporting. Use my_error(ER_WRONG_COLUMN_NAME) for reporting reserved column names. Report my_error(ER_TABLE_EXISTS_ERROR) when row_create_table_for_mysql() returns DB_DUPLICATE_KEY. rb://206 ------------------------------------------------------------------------ r6186 | vasil | 2009-11-17 16:48:14 +0200 (Tue, 17 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6185. ------------------------------------------------------------------------ r6189 | marko | 2009-11-18 11:36:18 +0200 (Wed, 18 Nov 2009) | 5 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): When creating the primary key and the table is being locked by another transaction, do not attempt to drop the table. (Bug #48782) Approved by Sunny Bains over IM ------------------------------------------------------------------------ r6194 | vasil | 2009-11-19 09:24:45 +0200 (Thu, 19 Nov 2009) | 5 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.5 to 1.0.6 since 1.0.5 was just released by MySQL and we will soon release 1.0.6. ------------------------------------------------------------------------ r6197 | calvin | 2009-11-19 09:32:55 +0200 (Thu, 19 Nov 2009) | 6 lines Changed paths: M /branches/zip/CMakeLists.txt branches/zip: merge the fix of bug#48317 (CMake file) Due to MySQL changes to the CMake, it is no longer able to build InnoDB plugin as a static library on Windows. The fix is proposed by Vlad of MySQL. ------------------------------------------------------------------------ r6198 | vasil | 2009-11-19 09:44:31 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for r6197. ------------------------------------------------------------------------ r6199 | vasil | 2009-11-19 12:10:12 +0200 (Thu, 19 Nov 2009) | 31 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/btr/btr0btr.c M /branches/zip/data/data0type.c branches/zip: Merge r6159:6198 from branches/5.1: ------------------------------------------------------------------------ r6187 | jyang | 2009-11-18 05:27:30 +0200 (Wed, 18 Nov 2009) | 9 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Fix bug #48469 "when innodb tablespace is configured too small, crash and corruption!". Function btr_create() did not check the return status of fseg_create(), and continue the index creation even there is no sufficient space. rb://205 Approved by Marko ------------------------------------------------------------------------ r6188 | jyang | 2009-11-18 07:14:23 +0200 (Wed, 18 Nov 2009) | 8 lines Changed paths: M /branches/5.1/data/data0type.c branches/5.1: Fix bug #48526 "Data type for float and double is incorrectly reported in InnoDB table monitor". Certain datatypes are not printed correctly in dtype_print(). rb://204 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6201 | marko | 2009-11-19 14:09:11 +0200 (Thu, 19 Nov 2009) | 2 lines Changed paths: M /branches/zip/handler/handler0alter.cc branches/zip: ha_innobase::add_index(): Clarify the comment on orphaned tables when creating a primary key. ------------------------------------------------------------------------ r6202 | jyang | 2009-11-19 15:01:00 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/zip/btr/btr0btr.c branches/zip: Function fseg_free() is no longer defined in branches/zip. To port fix for bug #48469 to zip, we can use btr_free_root() which frees the page, and also does not require mini-transaction. Approved by Marko. ------------------------------------------------------------------------ r6207 | vasil | 2009-11-20 10:19:14 +0200 (Fri, 20 Nov 2009) | 54 lines Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Merge r6198:6206 from branches/5.1: (r6203 was skipped as it is already in branches/zip) ------------------------------------------------------------------------ r6200 | vasil | 2009-11-19 12:14:23 +0200 (Thu, 19 Nov 2009) | 4 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: White space fixup - indent under the opening ( ------------------------------------------------------------------------ r6203 | jyang | 2009-11-19 15:12:22 +0200 (Thu, 19 Nov 2009) | 8 lines Changed paths: M /branches/5.1/btr/btr0btr.c branches/5.1: Use btr_free_root() instead of fseg_free() for the fix of bug #48469, because fseg_free() is not defined in the zip branch. And we could save one mini-trasaction started by fseg_free(). Approved by Marko. ------------------------------------------------------------------------ r6205 | jyang | 2009-11-20 07:55:48 +0200 (Fri, 20 Nov 2009) | 11 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Add a special case to handle the Duplicated Key error and return DB_ERROR instead. This is to avoid a possible SIGSEGV by mysql error handling re-entering the storage layer for dup key info without proper table handle. This is to prevent a server crash when error situation in bug #45961 "DDL on partitioned innodb tables leaves data dictionary in an inconsistent state" happens. rb://157 approved by Sunny Bains. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 5 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Fix a minor code formating issue for the parenthesis iplacement of the if condition in rename_table(). ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6208 | vasil | 2009-11-20 10:49:24 +0200 (Fri, 20 Nov 2009) | 4 lines Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for c6207. ------------------------------------------------------------------------ r6210 | vasil | 2009-11-20 23:39:48 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/zip/trx/trx0i_s.c branches/zip: Whitespace fixup. ------------------------------------------------------------------------ r6248 | marko | 2009-11-30 12:19:50 +0200 (Mon, 30 Nov 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: ChangeLog: Document r4922 that was forgotten. ------------------------------------------------------------------------ r6252 | marko | 2009-11-30 12:50:11 +0200 (Mon, 30 Nov 2009) | 23 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/dict/dict0boot.c M /branches/zip/dict/dict0crea.c M /branches/zip/dict/dict0load.c M /branches/zip/dict/dict0mem.c M /branches/zip/fil/fil0fil.c M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/dict0mem.h M /branches/zip/row/row0mysql.c branches/zip: Suppress errors about non-found temporary tables. Write the is_temp flag to SYS_TABLES.MIX_LEN. dict_table_t::flags: Add a flag for is_temporary, DICT_TF2_TEMPORARY. Unlike other flags, this will not be written to the tablespace flags or SYS_TABLES.TYPE, but only to SYS_TABLES.MIX_LEN. dict_build_table_def_step(): Only pass DICT_TF_BITS to tablespaces. dict_check_tablespaces_and_store_max_id(), dict_load_table(): Suppress errors about temporary tables not being found. dict_create_sys_tables_tuple(): Write the DICT_TF2_TEMPORARY flag to SYS_TABLES.MIX_LEN. fil_space_create(), fil_create_new_single_table_tablespace(): Add assertions about space->flags. row_drop_table_for_mysql(): Do not complain about non-found temporary tables. rb://160 approved by Heikki Tuuri. This addresses the second part of Bug #41609 Crash recovery does not work for InnoDB temporary tables. ------------------------------------------------------------------------ r6263 | vasil | 2009-12-01 14:49:05 +0200 (Tue, 01 Dec 2009) | 4 lines Changed paths: M /branches/zip/include/univ.i branches/zip: Increment version number from 1.0.6 to 1.0.7 1.0.6 has been released ------------------------------------------------------------------------ r6264 | vasil | 2009-12-01 16:19:44 +0200 (Tue, 01 Dec 2009) | 1 line Changed paths: M /branches/zip/ChangeLog branches/zip: Add ChangeLog entry for the release of 1.0.6. ------------------------------------------------------------------------ r6269 | marko | 2009-12-02 11:35:22 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): UNIV_IBUF_DEBUG should not break crash recovery, but UNIV_IBUF_COUNT_DEBUG will. ------------------------------------------------------------------------ r6270 | marko | 2009-12-02 11:36:47 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/srv/srv0start.c branches/zip: innobase_start_or_create_for_mysql(): Log the zlib version. ------------------------------------------------------------------------ r6271 | marko | 2009-12-02 11:43:49 +0200 (Wed, 02 Dec 2009) | 2 lines Changed paths: M /branches/zip/ChangeLog M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: ChangeLog: Document that since r6270, the zlib version number will be displayed at start-up. ------------------------------------------------------------------------ r6272 | marko | 2009-12-02 11:46:05 +0200 (Wed, 02 Dec 2009) | 1 line Changed paths: M /branches/zip/Makefile.am M /branches/zip/include/univ.i M /branches/zip/plug.in branches/zip: Revert changes that were accidentally committed in r6271. ------------------------------------------------------------------------ r6274 | marko | 2009-12-03 14:47:12 +0200 (Thu, 03 Dec 2009) | 6 lines Changed paths: M /branches/zip/dict/dict0dict.c branches/zip: dict_table_check_for_dup_indexes(): Assert that the data dictionary mutex is being held while table->indexes is accessed. This is already the case. Currently, only dict_table_get_next_index() and dict_table_get_first_index() are being invoked without holding dict_sys->mutex. ------------------------------------------------------------------------ r6275 | pekka | 2009-12-03 18:32:47 +0200 (Thu, 03 Dec 2009) | 10 lines Changed paths: M /branches/zip/include/log0recv.h M /branches/zip/include/trx0sys.h M /branches/zip/log/log0recv.c M /branches/zip/trx/trx0sys.c branches/zip: Minor changes which allow build with UNIV_HOTBACKUP defined to succeed: include/trx0sys.h: Allow Hot Backup build to see some TRX_SYS_DOUBLEWRITE_... macros. trx/trx0sys.c: Exclude trx_sys_close() function from Hot Backup build. log/log0recv.[ch]: Exclude recv_sys_var_init() function from Hot Backup build. This change should not affect !UNIV_HOTBACKUP build. ------------------------------------------------------------------------ r6277 | marko | 2009-12-08 11:13:36 +0200 (Tue, 08 Dec 2009) | 1 line Changed paths: M /branches/zip/fsp/fsp0fsp.c branches/zip: fsp0fsp.c: Add some missing in/out and const qualifiers. ------------------------------------------------------------------------ r6285 | marko | 2009-12-09 09:24:50 +0200 (Wed, 09 Dec 2009) | 13 lines Changed paths: M /branches/zip/row/row0sel.c branches/zip: row_sel_fetch_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns, Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This redundant code was noticed by Sergey Petrunya on the MySQL internals list. ------------------------------------------------------------------------ r6288 | marko | 2009-12-09 09:51:00 +0200 (Wed, 09 Dec 2009) | 15 lines Changed paths: M /branches/zip/row/row0upd.c branches/zip: row_upd_copy_columns(): Remove redundant code that was accidentally added in r1591, which introduced dfield_t::ext in order to make the merge sort of fast index creation support externally stored columns. Initially, I tried to allocate the bit for dfield_t::ext from dfield_t::len by making the length 31 bits and mapping UNIV_SQL_NULL to something that would fit in it. Then I decided that it would be too risky. The redundant check was part of the mapping. The condition may have been dfield_is_null() initially. This is similar to the redundant code in row_sel_fetch_columns() that was noticed by Sergey Petrunya on the MySQL internals list and removed in r6285. As far as I can tell, there are no redundant UNIV_SQL_NULL assignments remaining after this change. ------------------------------------------------------------------------ r6305 | marko | 2009-12-14 13:03:57 +0200 (Mon, 14 Dec 2009) | 2 lines Changed paths: M /branches/zip/row/row0umod.c branches/zip: row_undo_mod_del_unmark_sec_and_undo_update(): Add a missing const qualifier. ------------------------------------------------------------------------ r6309 | marko | 2009-12-15 14:05:50 +0200 (Tue, 15 Dec 2009) | 3 lines Changed paths: M /branches/zip/lock/lock0lock.c branches/zip: lock_rec_insert_check_and_lock(): Avoid casting away constness. Use page_rec_get_next_const() instead. This silences a gcc 4.2.4 warning. Reported by Sunny Bains. ------------------------------------------------------------------------ r6312 | marko | 2009-12-16 10:10:36 +0200 (Wed, 16 Dec 2009) | 6 lines Changed paths: M /branches/zip/fil/fil0fil.c branches/zip: fil_close(): Add #ifndef UNIV_HOTBACKUP around a debug assertion on mutex.magic_n. InnoDB Hot Backup is a single-threaded program and does not contain mutexes. This change allows InnoDB Hot Backup to be compiled with UNIV_DEBUG. Suggested by Michael Izioumtchenko. ------------------------------------------------------------------------ r6321 | marko | 2009-12-16 16:16:33 +0200 (Wed, 16 Dec 2009) | 4 lines Changed paths: M /branches/zip/row/row0merge.c branches/zip: row_merge_drop_temp_indexes(): Revert a hack to transaction isolation level that was made unnecessary by r5826 (Issue #337). When this function is called, any active data dictionary transaction should have been rolled back. ------------------------------------------------------------------------ r6345 | marko | 2009-12-21 10:46:14 +0200 (Mon, 21 Dec 2009) | 7 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_scan_log_recs(): Non-functional change: Replace a debug assertion ut_ad(len > 0) with ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE). This change is only for readability, for Issue #428. Another assertion on len being an integer multiple of OS_FILE_LOG_BLOCK_SIZE already ensured together with the old ut_ad(len > 0) that actually len must be at least OS_FILE_LOG_BLOCK_SIZE. ------------------------------------------------------------------------ r6346 | marko | 2009-12-21 12:03:25 +0200 (Mon, 21 Dec 2009) | 2 lines Changed paths: M /branches/zip/log/log0recv.c branches/zip: recv_recovery_from_checkpoint_finish(): Revert a change that was accidentally committed in r6345. ------------------------------------------------------------------------ r6348 | marko | 2009-12-22 11:04:34 +0200 (Tue, 22 Dec 2009) | 37 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/ha_prototypes.h M /branches/zip/include/trx0trx.h M /branches/zip/lock/lock0lock.c M /branches/zip/trx/trx0i_s.c M /branches/zip/trx/trx0trx.c branches/zip: Merge a change from MySQL: ------------------------------------------------------------ revno: 3236 committer: Satya B <satya.bn@sun.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 17:48:57 +0530 message: merge to mysql-5.1-bugteam ------------------------------------------------------------ revno: 3234.1.1 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.1-bugteam timestamp: Tue 2009-12-01 14:38:40 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? manual merge 5.0-->5.1, updating InnoDB plugin. ------------------------------------------------------------ revno: 1810.3968.13 committer: Gleb Shchepa <gshchepa@mysql.com> branch nick: mysql-5.0-bugteam timestamp: Tue 2009-12-01 14:24:44 +0400 message: Bug #38883 (reopened): thd_security_context is not thread safe, crashes? The bug 38816 changed the lock that protects THD::query from LOCK_thread_count to LOCK_thd_data, but didn't update the associated InnoDB functions. 1. The innobase_mysql_prepare_print_arbitrary_thd and the innobase_mysql_end_print_arbitrary_thd InnoDB functions have been removed, since now we have a per-thread mutex: now we don't need to wrap several inter-thread access tries to THD::query with a single global LOCK_thread_count lock, so we can simplify the code. 2. The innobase_mysql_print_thd function has been modified to lock LOCK_thd_data in direct way. ------------------------------------------------------------------------ r6351 | marko | 2009-12-22 11:11:18 +0200 (Tue, 22 Dec 2009) | 1 line Changed paths: M /branches/zip/handler/ha_innodb.cc branches/zip: Remove an obsolete declaration of LOCK_thread_count. ------------------------------------------------------------------------ r6352 | marko | 2009-12-22 12:33:01 +0200 (Tue, 22 Dec 2009) | 104 lines Changed paths: M /branches/zip/handler/ha_innodb.cc M /branches/zip/include/lock0lock.h M /branches/zip/include/srv0srv.h M /branches/zip/lock/lock0lock.c M /branches/zip/log/log0log.c M /branches/zip/mysql-test/innodb-autoinc.result M /branches/zip/mysql-test/innodb-autoinc.test M /branches/zip/row/row0sel.c M /branches/zip/srv/srv0srv.c M /branches/zip/srv/srv0start.c branches/zip: Merge revisions 6206:6350 from branches/5.1, except r6347, r6349, r6350 which were committed separately to both branches, and r6310, which was backported from zip to 5.1. ------------------------------------------------------------------------ r6206 | jyang | 2009-11-20 09:38:43 +0200 (Fri, 20 Nov 2009) | 3 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc branches/5.1: Non-functional change, fix formatting. ------------------------------------------------------------------------ r6230 | sunny | 2009-11-24 23:52:43 +0200 (Tue, 24 Nov 2009) | 3 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result branches/5.1: Fix autoinc failing test results. (this should be skipped when merging 5.1 into zip) ------------------------------------------------------------------------ r6231 | sunny | 2009-11-25 10:26:27 +0200 (Wed, 25 Nov 2009) | 7 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: Fix BUG#49032 - auto_increment field does not initialize to last value in InnoDB Storage Engine. We use the appropriate function to read the column value for non-integer autoinc column types, namely float and double. rb://208. Approved by Marko. ------------------------------------------------------------------------ r6232 | sunny | 2009-11-25 10:27:39 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix white space errors. ------------------------------------------------------------------------ r6233 | sunny | 2009-11-25 10:28:35 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/include/mach0data.h M /branches/5.1/include/mach0data.ic M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix tests and make read float/double arg const. ------------------------------------------------------------------------ r6234 | sunny | 2009-11-25 10:29:03 +0200 (Wed, 25 Nov 2009) | 2 lines Changed paths: M /branches/5.1/row/row0sel.c branches/5.1: This is an interim fix, fix whitepsace issues. ------------------------------------------------------------------------ r6235 | sunny | 2009-11-26 01:14:42 +0200 (Thu, 26 Nov 2009) | 9 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: Fix Bug#47720 - REPLACE INTO Autoincrement column with negative values. This bug is similiar to the negative autoinc filter patch from earlier, with the additional handling of filtering out the negative column values set explicitly by the user. rb://184 Approved by Heikki. ------------------------------------------------------------------------ r6242 | vasil | 2009-11-27 22:07:12 +0200 (Fri, 27 Nov 2009) | 4 lines Changed paths: M /branches/5.1/export.sh branches/5.1: Minor changes to support plugin snapshots. ------------------------------------------------------------------------ r6306 | calvin | 2009-12-14 15:12:46 +0200 (Mon, 14 Dec 2009) | 5 lines Changed paths: M /branches/5.1/mysql-test/innodb-autoinc.result M /branches/5.1/mysql-test/innodb-autoinc.test branches/5.1: fix bug#49267: innodb-autoinc.test fails on windows because of different case mode There is no change to the InnoDB code, only to fix test case by changing "T1" to "t1". ------------------------------------------------------------------------ r6324 | jyang | 2009-12-17 06:54:24 +0200 (Thu, 17 Dec 2009) | 8 lines Changed paths: M /branches/5.1/handler/ha_innodb.cc M /branches/5.1/include/lock0lock.h M /branches/5.1/include/srv0srv.h M /branches/5.1/lock/lock0lock.c M /branches/5.1/log/log0log.c M /branches/5.1/srv/srv0srv.c M /branches/5.1/srv/srv0start.c branches/5.1: Fix bug #47814 - Diagnostics are frequently not printed after a long lock wait in InnoDB. Separate out the lock wait timeout check thread from monitor information printing thread. rb://200 Approved by Marko. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r6364 | marko | 2009-12-26 21:06:31 +0200 (Sat, 26 Dec 2009) | 4 lines Changed paths: M /branches/zip/ibuf/ibuf0ibuf.c branches/zip: ibuf_bitmap_get_map_page(): Define a wrapper macro that passes __FILE__, __LINE__ of the caller to buf_page_get_gen(). This will ease the diagnosis of the likes of Issue #135. ------------------------------------------------------------------------
16 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
20 years ago
20 years ago
20 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
20 years ago
6 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
6 years ago
20 years ago
20 years ago
6 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
6 years ago
6 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
20 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
20 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
20 years ago
MDEV-29911 InnoDB recovery and mariadb-backup --prepare fail to report detailed progress The progress reporting of InnoDB crash recovery was rather intermittent. Nothing was reported during the single-threaded log record parsing, which could consume minutes when parsing a large log. During log application, there only was progress reporting in background threads that would be invoked on data page read completion. The progress reporting here will be detailed like this: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1990840177; to recover: 124806 pages InnoDB: Parsed redo log up to LSN=2729777071; to recover: 186123 pages InnoDB: Parsed redo log up to LSN=3488599173; to recover: 248397 pages InnoDB: Parsed redo log up to LSN=4177856618; to recover: 306469 pages InnoDB: Multi-batch recovery needed at LSN 4189599815 InnoDB: End of log at LSN=4483551634 InnoDB: To recover: LSN 4189599815/4483551634; 307490 pages InnoDB: To recover: LSN 4189599815/4483551634; 197159 pages InnoDB: To recover: LSN 4189599815/4483551634; 67623 pages InnoDB: Parsed redo log up to LSN=4353924218; to recover: 102083 pages ... InnoDB: log sequence number 4483551634 ... The previous messages "Starting a batch to recover" or "Starting a final batch to recover" will be replaced by "To recover: ... pages" messages. If a batch lasts longer than 15 seconds, then there will be progress reports every 15 seconds, showing the number of remaining pages. For the non-final batch, the "To recover:" message includes two end LSN: that of the batch, and of the recovered log. This is the primary measure of progress. The batch will end once the number of pages to recover reaches 0. If recovery is possible in a single batch, the output will look like this, with a shorter "To recover:" message that counts only the remaining pages: InnoDB: Starting crash recovery from checkpoint LSN=503549688 InnoDB: Parsed redo log up to LSN=1998701027; to recover: 125560 pages InnoDB: Parsed redo log up to LSN=2734136874; to recover: 186446 pages InnoDB: Parsed redo log up to LSN=3499505504; to recover: 249378 pages InnoDB: Parsed redo log up to LSN=4183247844; to recover: 306964 pages InnoDB: End of log at LSN=4483551634 ... InnoDB: To recover: 331797 pages ... InnoDB: log sequence number 4483551634 ... We will also speed up recovery by improving the memory management and implementing multi-threaded recovery of data pages that will not need to be read into the buffer pool ("fake read"). Log application in the "fake read" threads will be protected by an atomic being_recovered field and exclusive buf_page_t::latch. Recovery will reserve for data pages two thirds of the buffer pool, or 256 pages, whichever is smaller. Previously, we could only use at most one third of the buffer pool for buffered log records. This would typically mean that with large buffer pools, recovery unnecessary consisted of multiple batches. If recovery runs out of memory, it will "roll back" or "rewind" the current mini-transaction. The recv_sys.lsn and recv_sys.pages will correspond to the "out of memory LSN", at the end of the previous complete mini-transaction. If recovery runs out of memory while executing the final recovery batch, we can simply invoke recv_sys.apply(false) to make room, and resume parsing. If recovery runs out of memory before the final batch, we will scan the redo log to the end (recv_sys.scanned_lsn) and check for any missing or inconsistent files. If recv_init_crash_recovery_spaces() does not report any potentially missing tablespaces, we can make use of the already stored recv_sys.pages and only rewind to the "out of memory LSN". Else, we must keep parsing and invoking recv_validate_tablespace() until an error has been found or everything has been resolved, and ultimatily rewind to to the checkpoint LSN. recv_sys_t::pages_it: A cached iterator to recv_sys.pages recv_sys_t::parse_mtr(): Remove an ATTRIBUTE_NOINLINE that would prevent tail call optimization in recv_sys_t::parse_pmem(). recv_sys_t::parse(), recv_sys_t::parse_mtr(), recv_sys_t::parse_pmem(): Add template<bool store> parameter. Redo log record parsing (store=false) is better specialized from store=true (with bool if_exists) so that we can avoid some conditional branches in frequently invoked low-level code. recv_sys_t::is_memory_exhausted(): Remove. The special parse() status GOT_OOM will report out-of-memory situation at the low level. recv_sys_t::rewind(), page_recv_t::recs_t::rewind(): Remove all log starting with a specific LSN. recv_scan_log(): Separate some code for only parsing, not storing log. In rewound_lsn, remember the LSN at which last_phase=false recovery ran out of memory. This is where the next call to recv_scan_log() will resume storing the log. This replaces recv_sys.last_stored_lsn. recv_sys_t::parse(): Evaluate the template parameter store in a few more cases, to allow dead code to be eliminated at compile time. recv_sys_t::scanned_lsn: The end of the log found by recv_scan_log(). The special value 1 means that recv_sys has been initialized but no log has been parsed. IORequest::write_complete(), IORequest::read_complete(): Replaces fil_aio_callback(). read_io_callback(), write_io_callback(): Replaces io_callback(). IORequest::fake_read_complete(), fake_io_callback(), os_fake_read(): Process a "fake read" request for concurrent recovery. recv_sys_t::apply_batch(): Choose a number of successive pages for a recovery batch. recv_sys_t::erase(recv_sys_t::map::iterator): Remove log records for a page whose recovery is not in progress. Log application threads will not invoke this; they will only set being_recovered=-1 to indicate that the entry is no longer needed. recv_sys_t::garbage_collect(): Remove all being_recovered=-1 entries. recv_sys_t::wait_for_pool(): Wait for some space to become available in the buffer pool. mlog_init_t::mark_ibuf_exist(): Avoid calls to recv_sys::recover_low() via ibuf_page_exists() and buf_page_get_low(). Such calls would lead to double locking of recv_sys.mutex, which depending on implementation could cause a deadlock. We will use lower-level calls to look up index pages. buf_LRU_block_remove_hashed(): Disable consistency checks for freed ROW_FORMAT=COMPRESSED pages. Their contents could be uninitialized garbage. This fixes an occasional failure of the test innodb.innodb_bulk_create_index_debug. Tested by: Matthias Leich
2 years ago
20 years ago
20 years ago
MDEV-14425 Improve the redo log for concurrency The InnoDB redo log used to be formatted in blocks of 512 bytes. The log blocks were encrypted and the checksum was calculated while holding log_sys.mutex, creating a serious scalability bottleneck. We remove the fixed-size redo log block structure altogether and essentially turn every mini-transaction into a log block of its own. This allows encryption and checksum calculations to be performed on local mtr_t::m_log buffers, before acquiring log_sys.mutex. The mutex only protects a memcpy() of the data to the shared log_sys.buf, as well as the padding of the log, in case the to-be-written part of the log would not end in a block boundary of the underlying storage. For now, the "padding" consists of writing a single NUL byte, to allow recovery and mariadb-backup to detect the end of the circular log faster. Like the previous implementation, we will overwrite the last log block over and over again, until it has been completely filled. It would be possible to write only up to the last completed block (if no more recent write was requested), or to write dummy FILE_CHECKPOINT records to fill the incomplete block, by invoking the currently disabled function log_pad(). This would require adjustments to some logic around log checkpoints, page flushing, and shutdown. An upgrade after a crash of any previous version is not supported. Logically empty log files from a previous version will be upgraded. An attempt to start up InnoDB without a valid ib_logfile0 will be refused. Previously, the redo log used to be created automatically if it was missing. Only with with innodb_force_recovery=6, it is possible to start InnoDB in read-only mode even if the log file does not exist. This allows the contents of a possibly corrupted database to be dumped. Because a prepared backup from an earlier version of mariadb-backup will create a 0-sized log file, we will allow an upgrade from such log files, provided that the FIL_PAGE_FILE_FLUSH_LSN in the system tablespace looks valid. The 512-byte log checkpoint blocks at 0x200 and 0x600 will be replaced with 64-byte log checkpoint blocks at 0x1000 and 0x2000. The start of log records will move from 0x800 to 0x3000. This allows us to use 4096-byte aligned blocks for all I/O in a future revision. We extend the MDEV-12353 redo log record format as follows. (1) Empty mini-transactions or extra NUL bytes will not be allowed. (2) The end-of-minitransaction marker (a NUL byte) will be replaced with a 1-bit sequence number, which will be toggled each time when the circular log file wraps back to the beginning. (3) After the sequence bit, a CRC-32C checksum of all data (excluding the sequence bit) will written. (4) If the log is encrypted, 8 bytes will be written before the checksum and included in it. This is part of the initialization vector (IV) of encrypted log data. (5) File names, page numbers, and checkpoint information will not be encrypted. Only the payload bytes of page-level log will be encrypted. The tablespace ID and page number will form part of the IV. (6) For padding, arbitrary-length FILE_CHECKPOINT records may be written, with all-zero payload, and with the normal end marker and checksum. The minimum size is 7 bytes, or 7+8 with innodb_encrypt_log=ON. In mariadb-backup and in Galera snapshot transfer (SST) scripts, we will no longer remove ib_logfile0 or create an empty ib_logfile0. Server startup will require a valid log file. When resizing the log, we will create a logically empty ib_logfile101 at the current LSN and use an atomic rename to replace ib_logfile0 with it. See the test innodb.log_file_size. Because there is no mandatory padding in the log file, we are able to create a dummy log file as of an arbitrary log sequence number. See the test mariabackup.huge_lsn. The parameter innodb_log_write_ahead_size and the INFORMATION_SCHEMA.INNODB_METRICS counter log_padded will be removed. The minimum value of innodb_log_buffer_size will be increased to 2MiB (because log_sys.buf will replace recv_sys.buf) and the increment adjusted to 4096 bytes (the maximum log block size). The following INFORMATION_SCHEMA.INNODB_METRICS counters will be removed: os_log_fsyncs os_log_pending_fsyncs log_pending_log_flushes log_pending_checkpoint_writes The following status variables will be removed: Innodb_os_log_fsyncs (this is included in Innodb_data_fsyncs) Innodb_os_log_pending_fsyncs (this was limited to at most 1 by design) log_sys.get_block_size(): Return the physical block size of the log file. This is only implemented on Linux and Microsoft Windows for now, and for the power-of-2 block sizes between 64 and 4096 bytes (the minimum and maximum size of a checkpoint block). If the block size is anything else, the traditional 512-byte size will be used via normal file system buffering. If the file system buffers can be bypassed, a message like the following will be issued: InnoDB: File system buffers for log disabled (block size=512 bytes) InnoDB: File system buffers for log disabled (block size=4096 bytes) This has been tested on Linux and Microsoft Windows with both sizes. On Linux, only enable O_DIRECT on the log for innodb_flush_method=O_DSYNC. Tests in 3 different environments where the log is stored in a device with a physical block size of 512 bytes are yielding better throughput without O_DIRECT. This could be due to the fact that in the event the last log block is being overwritten (if multiple transactions would become durable at the same time, and each of will write a small number of bytes to the last log block), it should be faster to re-copy data from log_sys.buf or log_sys.flush_buf to the kernel buffer, to be finally written at fdatasync() time. The parameter innodb_flush_method=O_DSYNC will imply O_DIRECT for data files. This option will enable O_DIRECT on the log file on Linux. It may be unsafe to use when the storage device does not support FUA (Force Unit Access) mode. When the server is compiled WITH_PMEM=ON, we will use memory-mapped I/O for the log file if the log resides on a "mount -o dax" device. We will identify PMEM in a start-up message: InnoDB: log sequence number 0 (memory-mapped); transaction id 3 On Linux, we will also invoke mmap() on any ib_logfile0 that resides in /dev/shm, effectively treating the log file as persistent memory. This should speed up "./mtr --mem" and increase the test coverage of PMEM on non-PMEM hardware. It also allows users to estimate how much the performance would be improved by installing persistent memory. On other tmpfs file systems such as /run, we will not use mmap(). mariadb-backup: Eliminated several variables. We will refer directly to recv_sys and log_sys. backup_wait_for_lsn(): Detect non-progress of xtrabackup_copy_logfile(). In this new log format with arbitrary-sized blocks, we can only detect log file overrun indirectly, by observing that the scanned log sequence number is not advancing. xtrabackup_copy_logfile(): On PMEM, do not modify the sequence bit, because we are not allowed to modify the server's log file, and our memory mapping is read-only. trx_flush_log_if_needed_low(): Do not use the callback on pmem. Using neither flush_lock nor write_lock around PMEM writes seems to yield the best performance. The pmem_persist() calls may still be somewhat slower than the pwrite() and fdatasync() based interface (PMEM mounted without -o dax). recv_sys_t::buf: Remove. We will use log_sys.buf for parsing. recv_sys_t::MTR_SIZE_MAX: Replaces RECV_SCAN_SIZE. recv_sys_t::file_checkpoint: Renamed from mlog_checkpoint_lsn. recv_sys_t, log_sys_t: Removed many data members. recv_sys.lsn: Renamed from recv_sys.recovered_lsn. recv_sys.offset: Renamed from recv_sys.recovered_offset. log_sys.buf_size: Replaces srv_log_buffer_size. recv_buf: A smart pointer that wraps log_sys.buf[recv_sys.offset] when the buffer is being allocated from the memory heap. recv_ring: A smart pointer that wraps a circular log_sys.buf[] that is backed by ib_logfile0. The pointer will wrap from recv_sys.len (log_sys.file_size) to log_sys.START_OFFSET. For the record that wraps around, we may copy file name or record payload data to the auxiliary buffer decrypt_buf in order to have a contiguous block of memory. The maximum size of a record is less than innodb_page_size bytes. recv_sys_t::parse(): Take the smart pointer as a template parameter. Do not temporarily add a trailing NUL byte to FILE_ records, because we are not supposed to modify the memory-mapped log file. (It is attached in read-write mode already during recovery.) recv_sys_t::parse_mtr(): Wrapper for recv_sys_t::parse(). recv_sys_t::parse_pmem(): Like parse_mtr(), but if PREMATURE_EOF would be returned on PMEM, use recv_ring to wrap around the buffer to the start. mtr_t::finish_write(), log_close(): Do not enforce log_sys.max_buf_free on PMEM, because it has no meaning on the mmap-based log. log_sys.write_to_buf: Count writes to log_sys.buf. Replaces srv_stats.log_write_requests and export_vars.innodb_log_write_requests. Protected by log_sys.mutex. Updated consistently in log_close(). Previously, mtr_t::commit() conditionally updated the count, which was inconsistent. log_sys.write_to_log: Count swaps of log_sys.buf and log_sys.flush_buf, for writing to log_sys.log (the ib_logfile0). Replaces srv_stats.log_writes and export_vars.innodb_log_writes. Protected by log_sys.mutex. log_sys.waits: Count waits in append_prepare(). Replaces srv_stats.log_waits and export_vars.innodb_log_waits. recv_recover_page(): Do not unnecessarily acquire log_sys.flush_order_mutex. We are inserting the blocks in arbitary order anyway, to be adjusted in recv_sys.apply(true). We will change the definition of flush_lock and write_lock to avoid potential false sharing. Depending on sizeof(log_sys) and CPU_LEVEL1_DCACHE_LINESIZE, the flush_lock and write_lock could share a cache line with each other or with the last data members of log_sys. Thanks to Matthias Leich for providing https://rr-project.org traces for various failures during the development, and to Thirunarayanan Balathandayuthapani for his help in debugging some of the recovery code. And thanks to the developers of the rr debugger for a tool without which extensive changes to InnoDB would be very challenging to get right. Thanks to Vladislav Vaintroub for useful feedback and to him, Axel Schwenke and Krunal Bauskar for testing the performance.
4 years ago
TSAN: unprotected global variable WARNING: ThreadSanitizer: data race (pid=1510842) Write of size 8 at 0x0000067b1e98 by main thread: #0 os_file_pwrite(IORequest const&, int, unsigned char const*, unsigned long, unsigned long, dberr_t*) /storage/innobase/os/os0file.cc:2928:2 (mariadbd+0x234c5ac) #1 os_file_write_func(IORequest const&, char const*, int, void const*, unsigned long, unsigned long) /storage/innobase/os/os0file.cc:2963:20 (mariadbd+0x234c019) #2 file_os_io::write(char const*, unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:320:10 (mariadbd+0x22eaa50) #3 log_file_t::write(unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:434:18 (mariadbd+0x22eb1d8) #4 log_t::file::write(unsigned long, st_::span<unsigned char>) /storage/innobase/log/log0log.cc:496:29 (mariadbd+0x22ebb55) #5 log_write_buf(unsigned char*, unsigned long, unsigned long, unsigned long, unsigned long) /storage/innobase/log/log0log.cc:614:14 (mariadbd+0x22f1b51) #6 log_write(bool) /storage/innobase/log/log0log.cc:755:2 (mariadbd+0x22ed2ec) #7 log_write_up_to(unsigned long, bool, bool, completion_callback const*) /storage/innobase/log/log0log.cc:817:5 (mariadbd+0x22eca44) #8 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1734:5 (mariadbd+0x20d37c1) #9 log_checkpoint() /storage/innobase/buf/buf0flu.cc:1787:10 (mariadbd+0x20cd155) #10 buf_flush_wait_flushed(unsigned long) /storage/innobase/buf/buf0flu.cc:1867:5 (mariadbd+0x20ccf8f) #11 log_make_checkpoint() /storage/innobase/buf/buf0flu.cc:1793:3 (mariadbd+0x20cc4c9) #12 buf_dblwr_t::create() /storage/innobase/buf/buf0dblwr.cc:216:3 (mariadbd+0x209076a) #13 srv_start(bool) /storage/innobase/srv/srv0start.cc:1685:20 (mariadbd+0x256b4aa) #14 innodb_init(void*) /storage/innobase/handler/ha_innodb.cc:4188:8 (mariadbd+0x1ed40da) #15 ha_initialize_handlerton(st_plugin_int*) /sql/handler.cc:659:31 (mariadbd+0xf7c2b6) #16 plugin_initialize(st_mem_root*, st_plugin_int*, int*, char**, bool) /sql/sql_plugin.cc:1463:9 (mariadbd+0x160fedb) #17 plugin_init(int*, char**, int) /sql/sql_plugin.cc:1756:15 (mariadbd+0x160f53f) #18 init_server_components() /sql/mysqld.cc:5043:7 (mariadbd+0xd71462) #19 mysqld_main(int, char**) /sql/mysqld.cc:5655:7 (mariadbd+0xd6ae87) #20 main /sql/main.cc:34:10 (mariadbd+0xd661c8) Previous write of size 8 at 0x0000067b1e98 by thread T3: #0 os_file_pwrite(IORequest const&, int, unsigned char const*, unsigned long, unsigned long, dberr_t*) /storage/innobase/os/os0file.cc:2928:2 (mariadbd+0x234c5ac) #1 os_file_write_func(IORequest const&, char const*, int, void const*, unsigned long, unsigned long) /storage/innobase/os/os0file.cc:2963:20 (mariadbd+0x234c019) #2 file_os_io::write(char const*, unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:320:10 (mariadbd+0x22eaa50) #3 log_file_t::write(unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:434:18 (mariadbd+0x22eb1d8) #4 log_t::file::write(unsigned long, st_::span<unsigned char>) /storage/innobase/log/log0log.cc:496:29 (mariadbd+0x22ebb55) #5 log_write_checkpoint_info(unsigned long) /storage/innobase/log/log0log.cc:911:14 (mariadbd+0x22edd4e) #6 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1755:3 (mariadbd+0x20d3a3d) #7 buf_flush_sync_for_checkpoint(unsigned long) /storage/innobase/buf/buf0flu.cc:1947:7 (mariadbd+0x20d4163) #8 buf_flush_page_cleaner() /storage/innobase/buf/buf0flu.cc:2186:9 (mariadbd+0x20cdab1) #9 void std::__invoke_impl<void, void (*)()>(std::__invoke_other, void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:61:14 (mariadbd+0x20c3aaa) #10 std::__invoke_result<void (*)()>::type std::__invoke<void (*)()>(void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:96:14 (mariadbd+0x20c39bd) #11 void std::thread::_Invoker<std::tuple<void (*)()> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:253:13 (mariadbd+0x20c3965) #12 std::thread::_Invoker<std::tuple<void (*)()> >::operator()() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:260:11 (mariadbd+0x20c3905) #13 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)()> > >::_M_run() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:211:13 (mariadbd+0x20c37f9) #14 <null> <null> (libstdc++.so.6+0xd230f) Location is global 'os_n_file_writes' of size 8 at 0x0000067b1e98 (mariadbd+0x67b1e98) Make variable atomic.
4 years ago
TSAN: unprotected global variable WARNING: ThreadSanitizer: data race (pid=1510842) Write of size 8 at 0x0000067b1e98 by main thread: #0 os_file_pwrite(IORequest const&, int, unsigned char const*, unsigned long, unsigned long, dberr_t*) /storage/innobase/os/os0file.cc:2928:2 (mariadbd+0x234c5ac) #1 os_file_write_func(IORequest const&, char const*, int, void const*, unsigned long, unsigned long) /storage/innobase/os/os0file.cc:2963:20 (mariadbd+0x234c019) #2 file_os_io::write(char const*, unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:320:10 (mariadbd+0x22eaa50) #3 log_file_t::write(unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:434:18 (mariadbd+0x22eb1d8) #4 log_t::file::write(unsigned long, st_::span<unsigned char>) /storage/innobase/log/log0log.cc:496:29 (mariadbd+0x22ebb55) #5 log_write_buf(unsigned char*, unsigned long, unsigned long, unsigned long, unsigned long) /storage/innobase/log/log0log.cc:614:14 (mariadbd+0x22f1b51) #6 log_write(bool) /storage/innobase/log/log0log.cc:755:2 (mariadbd+0x22ed2ec) #7 log_write_up_to(unsigned long, bool, bool, completion_callback const*) /storage/innobase/log/log0log.cc:817:5 (mariadbd+0x22eca44) #8 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1734:5 (mariadbd+0x20d37c1) #9 log_checkpoint() /storage/innobase/buf/buf0flu.cc:1787:10 (mariadbd+0x20cd155) #10 buf_flush_wait_flushed(unsigned long) /storage/innobase/buf/buf0flu.cc:1867:5 (mariadbd+0x20ccf8f) #11 log_make_checkpoint() /storage/innobase/buf/buf0flu.cc:1793:3 (mariadbd+0x20cc4c9) #12 buf_dblwr_t::create() /storage/innobase/buf/buf0dblwr.cc:216:3 (mariadbd+0x209076a) #13 srv_start(bool) /storage/innobase/srv/srv0start.cc:1685:20 (mariadbd+0x256b4aa) #14 innodb_init(void*) /storage/innobase/handler/ha_innodb.cc:4188:8 (mariadbd+0x1ed40da) #15 ha_initialize_handlerton(st_plugin_int*) /sql/handler.cc:659:31 (mariadbd+0xf7c2b6) #16 plugin_initialize(st_mem_root*, st_plugin_int*, int*, char**, bool) /sql/sql_plugin.cc:1463:9 (mariadbd+0x160fedb) #17 plugin_init(int*, char**, int) /sql/sql_plugin.cc:1756:15 (mariadbd+0x160f53f) #18 init_server_components() /sql/mysqld.cc:5043:7 (mariadbd+0xd71462) #19 mysqld_main(int, char**) /sql/mysqld.cc:5655:7 (mariadbd+0xd6ae87) #20 main /sql/main.cc:34:10 (mariadbd+0xd661c8) Previous write of size 8 at 0x0000067b1e98 by thread T3: #0 os_file_pwrite(IORequest const&, int, unsigned char const*, unsigned long, unsigned long, dberr_t*) /storage/innobase/os/os0file.cc:2928:2 (mariadbd+0x234c5ac) #1 os_file_write_func(IORequest const&, char const*, int, void const*, unsigned long, unsigned long) /storage/innobase/os/os0file.cc:2963:20 (mariadbd+0x234c019) #2 file_os_io::write(char const*, unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:320:10 (mariadbd+0x22eaa50) #3 log_file_t::write(unsigned long, st_::span<unsigned char const>) /storage/innobase/log/log0log.cc:434:18 (mariadbd+0x22eb1d8) #4 log_t::file::write(unsigned long, st_::span<unsigned char>) /storage/innobase/log/log0log.cc:496:29 (mariadbd+0x22ebb55) #5 log_write_checkpoint_info(unsigned long) /storage/innobase/log/log0log.cc:911:14 (mariadbd+0x22edd4e) #6 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1755:3 (mariadbd+0x20d3a3d) #7 buf_flush_sync_for_checkpoint(unsigned long) /storage/innobase/buf/buf0flu.cc:1947:7 (mariadbd+0x20d4163) #8 buf_flush_page_cleaner() /storage/innobase/buf/buf0flu.cc:2186:9 (mariadbd+0x20cdab1) #9 void std::__invoke_impl<void, void (*)()>(std::__invoke_other, void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:61:14 (mariadbd+0x20c3aaa) #10 std::__invoke_result<void (*)()>::type std::__invoke<void (*)()>(void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:96:14 (mariadbd+0x20c39bd) #11 void std::thread::_Invoker<std::tuple<void (*)()> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:253:13 (mariadbd+0x20c3965) #12 std::thread::_Invoker<std::tuple<void (*)()> >::operator()() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:260:11 (mariadbd+0x20c3905) #13 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)()> > >::_M_run() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:211:13 (mariadbd+0x20c37f9) #14 <null> <null> (libstdc++.so.6+0xd230f) Location is global 'os_n_file_writes' of size 8 at 0x0000067b1e98 (mariadbd+0x67b1e98) Make variable atomic.
4 years ago
TSAN: data race on a global counter WARNING: ThreadSanitizer: data race (pid=1503350) Write of size 8 at 0x0000067b1f20 by thread T3: #0 os_file_sync_posix(int) /storage/innobase/os/os0file.cc:895:5 (mariadbd+0x23493f6) #1 os_file_flush_func(int) /storage/innobase/os/os0file.cc:983:8 (mariadbd+0x2349204) #2 file_os_io::flush() /storage/innobase/log/log0log.cc:326:10 (mariadbd+0x22eaaa9) #3 log_file_t::flush() /storage/innobase/log/log0log.cc:440:18 (mariadbd+0x22eb2d0) #4 log_t::file::flush() /storage/innobase/log/log0log.cc:507:29 (mariadbd+0x22ebe69) #5 log_write_flush_to_disk_low(unsigned long) /storage/innobase/log/log0log.cc:629:17 (mariadbd+0x22ed3f3) #6 log_write_up_to(unsigned long, bool, bool, completion_callback const*) /storage/innobase/log/log0log.cc:829:3 (mariadbd+0x22ecb04) #7 log_checkpoint_low(unsigned long, unsigned long) /storage/innobase/buf/buf0flu.cc:1734:5 (mariadbd+0x20d37f1) #8 buf_flush_sync_for_checkpoint(unsigned long) /storage/innobase/buf/buf0flu.cc:1947:7 (mariadbd+0x20d4193) #9 buf_flush_page_cleaner() /storage/innobase/buf/buf0flu.cc:2186:9 (mariadbd+0x20cdad7) #10 void std::__invoke_impl<void, void (*)()>(std::__invoke_other, void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:61:14 (mariadbd+0x20c3aaa) #11 std::__invoke_result<void (*)()>::type std::__invoke<void (*)()>(void (*&&)()) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/invoke.h:96:14 (mariadbd+0x20c39bd) #12 void std::thread::_Invoker<std::tuple<void (*)()> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:253:13 (mariadbd+0x20c3965) #13 std::thread::_Invoker<std::tuple<void (*)()> >::operator()() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:260:11 (mariadbd+0x20c3905) #14 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)()> > >::_M_run() /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_thread.h:211:13 (mariadbd+0x20c37f9) #15 <null> <null> (libstdc++.so.6+0xd230f) Previous write of size 8 at 0x0000067b1f20 by main thread: #0 os_file_sync_posix(int) /storage/innobase/os/os0file.cc:895:5 (mariadbd+0x23493f6) #1 os_file_flush_func(int) /storage/innobase/os/os0file.cc:983:8 (mariadbd+0x2349204) #2 fil_space_t::flush_low() /storage/innobase/fil/fil0fil.cc:504:5 (mariadbd+0x205cad5) #3 fil_flush_file_spaces() /storage/innobase/fil/fil0fil.cc:2947:13 (mariadbd+0x206523f) #4 log_checkpoint() /storage/innobase/buf/buf0flu.cc:1777:5 (mariadbd+0x20cd069) #5 buf_flush_wait_flushed(unsigned long) /storage/innobase/buf/buf0flu.cc:1867:5 (mariadbd+0x20ccf95) #6 log_make_checkpoint() /storage/innobase/buf/buf0flu.cc:1793:3 (mariadbd+0x20cc4c9) #7 buf_dblwr_t::create() /storage/innobase/buf/buf0dblwr.cc:216:3 (mariadbd+0x209076a) #8 srv_start(bool) /storage/innobase/srv/srv0start.cc:1685:20 (mariadbd+0x256b514) #9 innodb_init(void*) /storage/innobase/handler/ha_innodb.cc:4188:8 (mariadbd+0x1ed406a) #10 ha_initialize_handlerton(st_plugin_int*) /sql/handler.cc:659:31 (mariadbd+0xf7c246) #11 plugin_initialize(st_mem_root*, st_plugin_int*, int*, char**, bool) /sql/sql_plugin.cc:1463:9 (mariadbd+0x160fe6b) #12 plugin_init(int*, char**, int) /sql/sql_plugin.cc:1756:15 (mariadbd+0x160f4cf) #13 init_server_components() /sql/mysqld.cc:5043:7 (mariadbd+0xd713f2) #14 mysqld_main(int, char**) /sql/mysqld.cc:5655:7 (mariadbd+0xd6ae17) #15 main /sql/main.cc:34:10 (mariadbd+0xd66158) This is a correct report by TSAN for an obvious case: unprotected global counter. Fix it by making counter std::atomic.
4 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored for system tablespace on SSD When the maximum configured number of file is exceeded, InnoDB will close data files. We used to maintain a fil_system.LRU list and a counter fil_node_t::n_pending to achieve this, at the huge cost of multiple fil_system.mutex operations per I/O operation. fil_node_open_file_low(): Implement a FIFO replacement policy: The last opened file will be moved to the end of fil_system.space_list, and files will be closed from the start of the list. However, we will not move tablespaces in fil_system.space_list while i_s_tablespaces_encryption_fill_table() is executing (producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION) because it may cause information of some tablespaces to go missing. We also avoid this in mariabackup --backup because datafiles_iter_next() assumes that the ordering is not changed. IORequest: Fold more parameters to IORequest::type. fil_space_t::io(): Replaces fil_io(). fil_space_t::flush(): Replaces fil_flush(). OS_AIO_IBUF: Remove. We will always issue synchronous reads of the change buffer pages in buf_read_page_low(). We will always ignore some errors for background reads. This should reduce fil_system.mutex contention a little. fil_node_t::complete_write(): Replaces fil_node_t::complete_io(). On both read and write completion, fil_space_t::release_for_io() will have to be called. fil_space_t::io(): Do not acquire fil_system.mutex in the normal code path. xb_delta_open_matching_space(): Do not try to open the system tablespace which was already opened. This fixes a file sharing violation in mariabackup --prepare --incremental. Reviewed by: Vladislav Vaintroub
5 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
MDEV-24626 Remove synchronous write of page0 file during file creation During data file creation, InnoDB holds dict_sys mutex, tries to write page 0 of the file and flushes the file. This not only causing unnecessary contention but also a deviation from the write-ahead logging protocol. The clean sequence of operations is that we first start a dictionary transaction and write SYS_TABLES and SYS_INDEXES records that identify the tablespace. Then, we durably write a FILE_CREATE record to the write-ahead log and create the file. Recovery should not unnecessarily insist that the first page of each data file that is referred to by the redo log is valid. It must be enough that page 0 of the tablespace can be initialized based on the redo log contents. We introduce a new data structure deferred_spaces that keeps track of corrupted-looking files during recovery. The data structure holds the last LSN of a FILE_ record referring to the data file, the tablespace identifier, and the last known file name. There are two scenarios can happen during recovery: i) Sufficient memory: InnoDB can reconstruct the tablespace after parsing all redo log records. ii) Insufficient memory(multiple apply phase): InnoDB should store the deferred tablespace redo logs even though tablespace is not present. InnoDB should start constructing the tablespace when it first encounters deferred tablespace id. Mariabackup copies the zero filled ibd file in backup_fix_ddl() as the extension of .new file. Mariabackup test case does page flushing when it deals with DDL operation during backup operation. fil_ibd_create(): Remove the write of page0 and flushing of file fil_ibd_load(): Return FIL_LOAD_DEFER if the tablespace has zero filled page0 Datafile: Clean up the error handling, and do not report errors if we are in the middle of recovery. The caller will check Datafile::m_defer. fil_node_t::deferred: Indicates whether the tablespace loading was deferred during recovery FIL_LOAD_DEFER: Returned by fil_ibd_load() to indicate that tablespace file was cannot be loaded. recv_sys_t::recover_deferred(): Invoke deferred_spaces.create() to initialize fil_space_t based on buffered metadata and records to initialize page 0. Ignore the flags in fil_name_t, because they are intentionally invalid. fil_name_process(): Update deferred_spaces. recv_sys_t::parse(): Store the redo log if the tablespace id is present in deferred spaces recv_sys_t::recover_low(): Should recover the first page of the tablespace even though the tablespace instance is not present recv_sys_t::apply(): Initialize the deferred tablespace before applying the deferred tablespace records recv_validate_tablespace(): Skip the validation for deferred_spaces. recv_rename_files(): Moved and revised from recv_sys_t::apply(). For deferred-recovery tablespaces, do not attempt to rename the file if a deferred-recovery tablespace is associated with the name. recv_recovery_from_checkpoint_start(): Invoke recv_rename_files() and initialize all deferred tablespaces before applying redo log. fil_node_t::read_page0(): Skip page0 validation if the tablespace is deferred buf_page_create_deferred(): A variant of buf_page_create() when the fil_space_t is not available yet This is joint work with Thirunarayanan Balathandayuthapani, who implemented an initial prototype.
4 years ago
MDEV-17380 innodb_flush_neighbors=ON should be ignored on SSD For tablespaces that do not reside on spinning storage, it does not make sense to attempt to write nearby pages when writing out dirty pages from the InnoDB buffer pool. It is actually detrimental to performance and to the life span of flash ROM storage. With this change, MariaDB will detect whether an InnoDB file resides on solid-state storage. The detection has been implemented for Linux and Microsoft Windows. For other systems, we will err on the safe side and assume that files reside on SSD. As part of this change, we will reduce the number of fstat() calls when opening data files on POSIX systems and slightly clean up some file I/O code. FIXME: os_is_sparse_file_supported() on POSIX works in a destructive manner. Thus, we can only invoke it when creating files, not when opening them. For diagnostics, we introduce the column ON_SSD to the table INFORMATION_SCHEMA.INNODB_TABLESPACES_SCRUBBING. The table INNODB_SYS_TABLESPACES might seem more appropriate, but its purpose is to reflect the contents of the InnoDB system table SYS_TABLESPACES, which we would like to remove at some point. On Microsoft Windows, querying StorageDeviceSeekPenaltyProperty sometimes returns ERROR_GEN_FAILURE instead of ERROR_INVALID_FUNCTION or ERROR_NOT_SUPPORTED. We will silently ignore also this error, and assume that the file does not reside on SSD. On Linux, the detection will be based on the files /sys/block/*/queue/rotational and /sys/block/*/dev. Especially for USB storage, it is possible that /sys/block/*/queue/rotational will wrongly report 1 instead of 0. fil_node_t::on_ssd: Whether the InnoDB data file resides on solid-state storage. fil_system_t::ssd: Collection of Linux block devices that reside on non-rotational storage. fil_system_t::create(): Detect ssd on Linux based on the contents of /sys/block/*/queue/rotational and /sys/block/*/dev. fil_system_t::is_ssd(dev_t): Determine if a Linux block device is non-rotational. Partitions will be identified with the containing block device by assuming that the least significant 4 bits of the minor number identify a partition, and that the "partition number" of the entire device is 0.
7 years ago
  1. /***********************************************************************
  2. Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
  3. Copyright (c) 2009, Percona Inc.
  4. Copyright (c) 2013, 2022, MariaDB Corporation.
  5. Portions of this file contain modifications contributed and copyrighted
  6. by Percona Inc.. Those modifications are
  7. gratefully acknowledged and are described briefly in the InnoDB
  8. documentation. The contributions by Percona Inc. are incorporated with
  9. their permission, and subject to the conditions contained in the file
  10. COPYING.Percona.
  11. This program is free software; you can redistribute it and/or modify it
  12. under the terms of the GNU General Public License as published by the
  13. Free Software Foundation; version 2 of the License.
  14. This program is distributed in the hope that it will be useful, but
  15. WITHOUT ANY WARRANTY; without even the implied warranty of
  16. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
  17. Public License for more details.
  18. You should have received a copy of the GNU General Public License along with
  19. this program; if not, write to the Free Software Foundation, Inc.,
  20. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
  21. ***********************************************************************/
  22. /**************************************************//**
  23. @file os/os0file.cc
  24. The interface to the operating system file i/o primitives
  25. Created 10/21/1995 Heikki Tuuri
  26. *******************************************************/
  27. #include "os0file.h"
  28. #include "sql_const.h"
  29. #include "log.h"
  30. #ifdef __linux__
  31. # include <sys/types.h>
  32. # include <sys/stat.h>
  33. # include <sys/sysmacros.h>
  34. #endif
  35. #include "srv0mon.h"
  36. #include "srv0srv.h"
  37. #include "srv0start.h"
  38. #include "fil0fil.h"
  39. #include "fsp0fsp.h"
  40. #ifdef HAVE_LINUX_UNISTD_H
  41. #include "unistd.h"
  42. #endif
  43. #include "buf0dblwr.h"
  44. #include <tpool_structs.h>
  45. #ifdef LINUX_NATIVE_AIO
  46. #include <libaio.h>
  47. #endif /* LINUX_NATIVE_AIO */
  48. #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
  49. # include <fcntl.h>
  50. # include <linux/falloc.h>
  51. #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
  52. #ifdef _WIN32
  53. #include <winioctl.h>
  54. #endif
  55. // my_test_if_atomic_write() , my_win_secattr()
  56. #include <my_sys.h>
  57. #include <thread>
  58. #include <chrono>
  59. /* Per-IO operation environment*/
  60. class io_slots
  61. {
  62. private:
  63. tpool::cache<tpool::aiocb> m_cache;
  64. tpool::task_group m_group;
  65. int m_max_aio;
  66. public:
  67. io_slots(int max_submitted_io, int max_callback_concurrency) :
  68. m_cache(max_submitted_io), m_group(max_callback_concurrency, false),
  69. m_max_aio(max_submitted_io)
  70. {
  71. }
  72. /* Get cached AIO control block */
  73. tpool::aiocb* acquire()
  74. {
  75. return m_cache.get();
  76. }
  77. /* Release AIO control block back to cache */
  78. void release(tpool::aiocb* aiocb)
  79. {
  80. m_cache.put(aiocb);
  81. }
  82. bool contains(tpool::aiocb* aiocb)
  83. {
  84. return m_cache.contains(aiocb);
  85. }
  86. /* Wait for completions of all AIO operations */
  87. void wait(mysql_mutex_t &m)
  88. {
  89. m_cache.wait(m);
  90. }
  91. void wait()
  92. {
  93. m_cache.wait();
  94. }
  95. size_t pending_io_count()
  96. {
  97. return m_cache.pos();
  98. }
  99. tpool::task_group* get_task_group()
  100. {
  101. return &m_group;
  102. }
  103. ~io_slots()
  104. {
  105. wait();
  106. }
  107. mysql_mutex_t& mutex()
  108. {
  109. return m_cache.mutex();
  110. }
  111. void resize(int max_submitted_io, int max_callback_concurrency)
  112. {
  113. m_cache.resize(max_submitted_io);
  114. m_group.set_max_tasks(max_callback_concurrency);
  115. m_max_aio = max_submitted_io;
  116. }
  117. tpool::task_group& task_group()
  118. {
  119. return m_group;
  120. }
  121. };
  122. static io_slots *read_slots;
  123. static io_slots *write_slots;
  124. /** Number of retries for partial I/O's */
  125. constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
  126. /* This specifies the file permissions InnoDB uses when it creates files in
  127. Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
  128. my_umask */
  129. #ifndef _WIN32
  130. /** Umask for creating files */
  131. static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
  132. #else
  133. /** Umask for creating files */
  134. static ulint os_innodb_umask = 0;
  135. #endif /* _WIN32 */
  136. Atomic_counter<ulint> os_n_file_reads;
  137. static ulint os_bytes_read_since_printout;
  138. Atomic_counter<size_t> os_n_file_writes;
  139. Atomic_counter<size_t> os_n_fsyncs;
  140. static ulint os_n_file_reads_old;
  141. static ulint os_n_file_writes_old;
  142. static ulint os_n_fsyncs_old;
  143. static time_t os_last_printout;
  144. bool os_has_said_disk_full;
  145. /** Default Zip compression level */
  146. extern uint page_zip_level;
  147. #ifdef UNIV_PFS_IO
  148. /* Keys to register InnoDB I/O with performance schema */
  149. mysql_pfs_key_t innodb_data_file_key;
  150. mysql_pfs_key_t innodb_temp_file_key;
  151. #endif
  152. /** Handle errors for file operations.
  153. @param[in] name name of a file or NULL
  154. @param[in] operation operation
  155. @param[in] should_abort whether to abort on an unknown error
  156. @param[in] on_error_silent whether to suppress reports of non-fatal errors
  157. @return true if we should retry the operation */
  158. static
  159. bool
  160. os_file_handle_error_cond_exit(
  161. const char* name,
  162. const char* operation,
  163. bool should_abort,
  164. bool on_error_silent);
  165. /** Does error handling when a file operation fails.
  166. @param operation name of operation that failed */
  167. static void os_file_handle_error(const char *operation)
  168. {
  169. os_file_handle_error_cond_exit(nullptr, operation, true, false);
  170. }
  171. /** Does error handling when a file operation fails.
  172. @param[in] name name of a file or NULL
  173. @param[in] operation operation name that failed
  174. @param[in] on_error_silent if true then don't print any message to the log.
  175. @return true if we should retry the operation */
  176. static
  177. bool
  178. os_file_handle_error_no_exit(
  179. const char* name,
  180. const char* operation,
  181. bool on_error_silent)
  182. {
  183. /* Don't exit in case of unknown error */
  184. return(os_file_handle_error_cond_exit(
  185. name, operation, false, on_error_silent));
  186. }
  187. /** Handle RENAME error.
  188. @param name old name of the file
  189. @param new_name new name of the file */
  190. static void os_file_handle_rename_error(const char* name, const char* new_name)
  191. {
  192. if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) {
  193. ib::error() << "Cannot rename file '" << name << "' to '"
  194. << new_name << "'";
  195. } else if (!os_has_said_disk_full) {
  196. os_has_said_disk_full = true;
  197. /* Disk full error is reported irrespective of the
  198. on_error_silent setting. */
  199. ib::error() << "Full disk prevents renaming file '"
  200. << name << "' to '" << new_name << "'";
  201. }
  202. }
  203. #ifdef _WIN32
  204. /**
  205. Wrapper around Windows DeviceIoControl() function.
  206. Works synchronously, also in case for handle opened
  207. for async access (i.e with FILE_FLAG_OVERLAPPED).
  208. Accepts the same parameters as DeviceIoControl(),except
  209. last parameter (OVERLAPPED).
  210. */
  211. static
  212. BOOL
  213. os_win32_device_io_control(
  214. HANDLE handle,
  215. DWORD code,
  216. LPVOID inbuf,
  217. DWORD inbuf_size,
  218. LPVOID outbuf,
  219. DWORD outbuf_size,
  220. LPDWORD bytes_returned
  221. )
  222. {
  223. OVERLAPPED overlapped = { 0 };
  224. overlapped.hEvent = tpool::win_get_syncio_event();
  225. BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf,
  226. outbuf_size, NULL, &overlapped);
  227. if (result || (GetLastError() == ERROR_IO_PENDING)) {
  228. /* Wait for async io to complete */
  229. result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE);
  230. }
  231. return result;
  232. }
  233. #endif
  234. /** Helper class for doing synchronous file IO. Currently, the objective
  235. is to hide the OS specific code, so that the higher level functions aren't
  236. peppered with #ifdef. Makes the code flow difficult to follow. */
  237. class SyncFileIO
  238. {
  239. public:
  240. /** Constructor
  241. @param[in] fh File handle
  242. @param[in,out] buf Buffer to read/write
  243. @param[in] n Number of bytes to read/write
  244. @param[in] offset Offset where to read or write */
  245. SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset) :
  246. m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset)
  247. { ut_ad(m_n > 0); }
  248. /** Do the read/write
  249. @param[in] request The IO context and type
  250. @return the number of bytes read/written or negative value on error */
  251. ssize_t execute(const IORequest &request);
  252. /** Move the read/write offset up to where the partial IO succeeded.
  253. @param[in] n_bytes The number of bytes to advance */
  254. void advance(ssize_t n_bytes)
  255. {
  256. m_offset+= n_bytes;
  257. ut_ad(m_n >= n_bytes);
  258. m_n-= n_bytes;
  259. m_buf= reinterpret_cast<uchar*>(m_buf) + n_bytes;
  260. }
  261. private:
  262. /** Open file handle */
  263. const os_file_t m_fh;
  264. /** Buffer to read/write */
  265. void *m_buf;
  266. /** Number of bytes to read/write */
  267. ssize_t m_n;
  268. /** Offset from where to read/write */
  269. os_offset_t m_offset;
  270. /** Do the read/write
  271. @param request The IO context and type
  272. @param n Number of bytes to read/write
  273. @return the number of bytes read/written or negative value on error */
  274. ssize_t execute_low(const IORequest& request, ssize_t n);
  275. };
  276. #ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
  277. /** Obtain an exclusive lock on a file.
  278. @param fd file descriptor
  279. @param name file name
  280. @return 0 on success */
  281. int os_file_lock(int fd, const char *name)
  282. {
  283. struct flock lk;
  284. lk.l_type = F_WRLCK;
  285. lk.l_whence = SEEK_SET;
  286. lk.l_start = lk.l_len = 0;
  287. if (fcntl(fd, F_SETLK, &lk) == -1) {
  288. ib::error()
  289. << "Unable to lock " << name
  290. << " error: " << errno;
  291. if (errno == EAGAIN || errno == EACCES) {
  292. ib::info()
  293. << "Check that you do not already have"
  294. " another mariadbd process using the"
  295. " same InnoDB data or log files.";
  296. }
  297. return(-1);
  298. }
  299. return(0);
  300. }
  301. #endif /* !_WIN32 */
  302. /** Create a temporary file. This function is like tmpfile(3), but
  303. the temporary file is created in the in the mysql server configuration
  304. parameter (--tmpdir).
  305. @return temporary file handle, or NULL on error */
  306. FILE*
  307. os_file_create_tmpfile()
  308. {
  309. FILE* file = NULL;
  310. File fd = mysql_tmpfile("ib");
  311. if (fd >= 0) {
  312. file = my_fdopen(fd, 0, O_RDWR|O_TRUNC|O_CREAT|FILE_BINARY,
  313. MYF(MY_WME));
  314. if (!file) {
  315. my_close(fd, MYF(MY_WME));
  316. }
  317. }
  318. if (file == NULL) {
  319. ib::error()
  320. << "Unable to create temporary file; errno: "
  321. << errno;
  322. }
  323. return(file);
  324. }
  325. /** Rewind file to its start, read at most size - 1 bytes from it to str, and
  326. NUL-terminate str. All errors are silently ignored. This function is
  327. mostly meant to be used with temporary files.
  328. @param[in,out] file File to read from
  329. @param[in,out] str Buffer where to read
  330. @param[in] size Size of buffer */
  331. void
  332. os_file_read_string(
  333. FILE* file,
  334. char* str,
  335. ulint size)
  336. {
  337. if (size != 0) {
  338. rewind(file);
  339. size_t flen = fread(str, 1, size - 1, file);
  340. str[flen] = '\0';
  341. }
  342. }
  343. /** This function reduces a null-terminated full remote path name into
  344. the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
  345. the 'databasename/tablename.ibd' found at the end of the path with just
  346. 'tablename'.
  347. Since the result is always smaller than the path sent in, no new memory
  348. is allocated. The caller should allocate memory for the path sent in.
  349. This function manipulates that path in place.
  350. If the path format is not as expected, just return. The result is used
  351. to inform a SHOW CREATE TABLE command.
  352. @param[in,out] data_dir_path Full path/data_dir_path */
  353. void
  354. os_file_make_data_dir_path(
  355. char* data_dir_path)
  356. {
  357. /* Replace the period before the extension with a null byte. */
  358. char* ptr = strrchr(data_dir_path, '.');
  359. if (ptr == NULL) {
  360. return;
  361. }
  362. ptr[0] = '\0';
  363. /* The tablename starts after the last slash. */
  364. ptr = strrchr(data_dir_path, '/');
  365. if (ptr == NULL) {
  366. return;
  367. }
  368. ptr[0] = '\0';
  369. char* tablename = ptr + 1;
  370. /* The databasename starts after the next to last slash. */
  371. ptr = strrchr(data_dir_path, '/');
  372. #ifdef _WIN32
  373. if (char *aptr = strrchr(data_dir_path, '\\')) {
  374. if (aptr > ptr) {
  375. ptr = aptr;
  376. }
  377. }
  378. #endif
  379. if (ptr == NULL) {
  380. return;
  381. }
  382. ulint tablename_len = strlen(tablename);
  383. memmove(++ptr, tablename, tablename_len);
  384. ptr[tablename_len] = '\0';
  385. }
  386. /** Check if the path refers to the root of a drive using a pointer
  387. to the last directory separator that the caller has fixed.
  388. @param[in] path path name
  389. @param[in] path last directory separator in the path
  390. @return true if this path is a drive root, false if not */
  391. UNIV_INLINE
  392. bool
  393. os_file_is_root(
  394. const char* path,
  395. const char* last_slash)
  396. {
  397. return(
  398. #ifdef _WIN32
  399. (last_slash == path + 2 && path[1] == ':') ||
  400. #endif /* _WIN32 */
  401. last_slash == path);
  402. }
  403. /** Return the parent directory component of a null-terminated path.
  404. Return a new buffer containing the string up to, but not including,
  405. the final component of the path.
  406. The path returned will not contain a trailing separator.
  407. Do not return a root path, return NULL instead.
  408. The final component trimmed off may be a filename or a directory name.
  409. If the final component is the only component of the path, return NULL.
  410. It is the caller's responsibility to free the returned string after it
  411. is no longer needed.
  412. @param[in] path Path name
  413. @return own: parent directory of the path */
  414. static
  415. char*
  416. os_file_get_parent_dir(
  417. const char* path)
  418. {
  419. /* Find the offset of the last slash */
  420. const char* last_slash = strrchr(path, '/');
  421. #ifdef _WIN32
  422. if (const char *last = strrchr(path, '\\')) {
  423. if (last > last_slash) {
  424. last_slash = last;
  425. }
  426. }
  427. #endif
  428. if (!last_slash) {
  429. /* No slash in the path, return NULL */
  430. return(NULL);
  431. }
  432. /* Ok, there is a slash. Is there anything after it? */
  433. const bool has_trailing_slash = last_slash[1] == '\0';
  434. /* Reduce repetitive slashes. */
  435. while (last_slash > path
  436. && (IF_WIN(last_slash[-1] == '\\' ||,) last_slash[-1] == '/')) {
  437. last_slash--;
  438. }
  439. /* Check for the root of a drive. */
  440. if (os_file_is_root(path, last_slash)) {
  441. return(NULL);
  442. }
  443. /* If a trailing slash prevented the first strrchr() from trimming
  444. the last component of the path, trim that component now. */
  445. if (has_trailing_slash) {
  446. /* Back up to the previous slash. */
  447. last_slash--;
  448. while (last_slash > path
  449. && (IF_WIN(last_slash[0] != '\\' &&,)
  450. last_slash[0] != '/')) {
  451. last_slash--;
  452. }
  453. /* Reduce repetitive slashes. */
  454. while (last_slash > path
  455. && (IF_WIN(last_slash[-1] == '\\' ||,)
  456. last_slash[-1] == '/')) {
  457. last_slash--;
  458. }
  459. }
  460. /* Check for the root of a drive. */
  461. if (os_file_is_root(path, last_slash)) {
  462. return(NULL);
  463. }
  464. if (last_slash - path < 0) {
  465. /* Sanity check, it prevents gcc from trying to handle this case which
  466. * results in warnings for some optimized builds */
  467. return (NULL);
  468. }
  469. /* Non-trivial directory component */
  470. return(mem_strdupl(path, ulint(last_slash - path)));
  471. }
  472. #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
  473. /* Test the function os_file_get_parent_dir. */
  474. void
  475. test_os_file_get_parent_dir(
  476. const char* child_dir,
  477. const char* expected_dir)
  478. {
  479. char* child = mem_strdup(child_dir);
  480. char* expected = expected_dir == NULL ? NULL
  481. : mem_strdup(expected_dir);
  482. char* parent = os_file_get_parent_dir(child);
  483. bool unexpected = (expected == NULL
  484. ? (parent != NULL)
  485. : (0 != strcmp(parent, expected)));
  486. if (unexpected) {
  487. ib::fatal() << "os_file_get_parent_dir('" << child
  488. << "') returned '" << parent
  489. << "', instead of '" << expected << "'.";
  490. }
  491. ut_free(parent);
  492. ut_free(child);
  493. ut_free(expected);
  494. }
  495. /* Test the function os_file_get_parent_dir. */
  496. void
  497. unit_test_os_file_get_parent_dir()
  498. {
  499. test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
  500. test_os_file_get_parent_dir("/usr/", NULL);
  501. test_os_file_get_parent_dir("//usr//", NULL);
  502. test_os_file_get_parent_dir("usr", NULL);
  503. test_os_file_get_parent_dir("usr//", NULL);
  504. test_os_file_get_parent_dir("/", NULL);
  505. test_os_file_get_parent_dir("//", NULL);
  506. test_os_file_get_parent_dir(".", NULL);
  507. test_os_file_get_parent_dir("..", NULL);
  508. # ifdef _WIN32
  509. test_os_file_get_parent_dir("D:", NULL);
  510. test_os_file_get_parent_dir("D:/", NULL);
  511. test_os_file_get_parent_dir("D:\\", NULL);
  512. test_os_file_get_parent_dir("D:/data", NULL);
  513. test_os_file_get_parent_dir("D:/data/", NULL);
  514. test_os_file_get_parent_dir("D:\\data\\", NULL);
  515. test_os_file_get_parent_dir("D:///data/////", NULL);
  516. test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
  517. test_os_file_get_parent_dir("D:/data//a", "D:/data");
  518. test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
  519. test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
  520. test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
  521. #endif /* _WIN32 */
  522. }
  523. #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
  524. /** Creates all missing subdirectories along the given path.
  525. @param[in] path Path name
  526. @return DB_SUCCESS if OK, otherwise error code. */
  527. dberr_t
  528. os_file_create_subdirs_if_needed(
  529. const char* path)
  530. {
  531. if (srv_read_only_mode) {
  532. ib::error()
  533. << "read only mode set. Can't create "
  534. << "subdirectories '" << path << "'";
  535. return(DB_READ_ONLY);
  536. }
  537. char* subdir = os_file_get_parent_dir(path);
  538. if (subdir == NULL) {
  539. /* subdir is root or cwd, nothing to do */
  540. return(DB_SUCCESS);
  541. }
  542. /* Test if subdir exists */
  543. os_file_type_t type;
  544. bool subdir_exists;
  545. bool success = os_file_status(subdir, &subdir_exists, &type);
  546. if (success && !subdir_exists) {
  547. /* Subdir does not exist, create it */
  548. dberr_t err = os_file_create_subdirs_if_needed(subdir);
  549. if (err != DB_SUCCESS) {
  550. ut_free(subdir);
  551. return(err);
  552. }
  553. success = os_file_create_directory(subdir, false);
  554. }
  555. ut_free(subdir);
  556. return(success ? DB_SUCCESS : DB_ERROR);
  557. }
  558. /** Do the read/write
  559. @param[in] request The IO context and type
  560. @param[in] n Number of bytes to read/write
  561. @return the number of bytes read/written or negative value on error */
  562. ssize_t
  563. SyncFileIO::execute_low(const IORequest& request, ssize_t n)
  564. {
  565. ut_ad(n > 0);
  566. ut_ad(size_t(n) <= os_file_request_size_max);
  567. if (request.is_read())
  568. return IF_WIN(tpool::pread(m_fh, m_buf, n, m_offset), pread(m_fh, m_buf, n, m_offset));
  569. return IF_WIN(tpool::pwrite(m_fh, m_buf, n, m_offset), pwrite(m_fh, m_buf, n, m_offset));
  570. }
  571. /** Do the read/write
  572. @param[in] request The IO context and type
  573. @return the number of bytes read/written or negative value on error */
  574. ssize_t
  575. SyncFileIO::execute(const IORequest& request)
  576. {
  577. ssize_t n_bytes= 0;
  578. ut_ad(m_n > 0);
  579. while (size_t(m_n) > os_file_request_size_max)
  580. {
  581. ssize_t n_partial_bytes= execute_low(request, os_file_request_size_max);
  582. if (n_partial_bytes < 0)
  583. return n_partial_bytes;
  584. n_bytes+= n_partial_bytes;
  585. if (n_partial_bytes != os_file_request_size_max)
  586. return n_bytes;
  587. advance(os_file_request_size_max);
  588. }
  589. if (ssize_t n= execute_low(request, m_n))
  590. {
  591. if (n < 0)
  592. return n;
  593. n_bytes += n;
  594. }
  595. return n_bytes;
  596. }
  597. #ifndef _WIN32
  598. /** Free storage space associated with a section of the file.
  599. @param[in] fh Open file handle
  600. @param[in] off Starting offset (SEEK_SET)
  601. @param[in] len Size of the hole
  602. @return DB_SUCCESS or error code */
  603. static
  604. dberr_t
  605. os_file_punch_hole_posix(
  606. os_file_t fh,
  607. os_offset_t off,
  608. os_offset_t len)
  609. {
  610. #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
  611. const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
  612. int ret = fallocate(fh, mode, off, len);
  613. if (ret == 0) {
  614. return(DB_SUCCESS);
  615. }
  616. if (errno == ENOTSUP) {
  617. return(DB_IO_NO_PUNCH_HOLE);
  618. }
  619. ib::warn()
  620. << "fallocate("
  621. <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
  622. << off << ", " << len << ") returned errno: "
  623. << errno;
  624. return(DB_IO_ERROR);
  625. #elif defined __sun__
  626. // Use F_FREESP
  627. #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
  628. return(DB_IO_NO_PUNCH_HOLE);
  629. }
  630. /** Retrieves the last error number if an error occurs in a file io function.
  631. The number should be retrieved before any other OS calls (because they may
  632. overwrite the error number). If the number is not known to this program,
  633. the OS error number + 100 is returned.
  634. @param[in] report_all_errors true if we want an error message
  635. printed of all errors
  636. @param[in] on_error_silent true then don't print any diagnostic
  637. to the log
  638. @return error number, or OS error number + 100 */
  639. ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent)
  640. {
  641. int err = errno;
  642. if (err == 0) {
  643. return(0);
  644. }
  645. if (report_all_errors
  646. || (err != ENOSPC && err != EEXIST && err != ENOENT
  647. && !on_error_silent)) {
  648. ib::error()
  649. << "Operating system error number "
  650. << err
  651. << " in a file operation.";
  652. if (err == EACCES) {
  653. ib::error()
  654. << "The error means mariadbd does not have"
  655. " the access rights to the directory.";
  656. } else {
  657. if (strerror(err) != NULL) {
  658. ib::error()
  659. << "Error number " << err << " means '"
  660. << strerror(err) << "'";
  661. }
  662. ib::info() << OPERATING_SYSTEM_ERROR_MSG;
  663. }
  664. }
  665. switch (err) {
  666. case ENOSPC:
  667. return(OS_FILE_DISK_FULL);
  668. case ENOENT:
  669. return(OS_FILE_NOT_FOUND);
  670. case EEXIST:
  671. return(OS_FILE_ALREADY_EXISTS);
  672. case EXDEV:
  673. case ENOTDIR:
  674. case EISDIR:
  675. case EPERM:
  676. return(OS_FILE_PATH_ERROR);
  677. case EAGAIN:
  678. if (srv_use_native_aio) {
  679. return(OS_FILE_AIO_RESOURCES_RESERVED);
  680. }
  681. break;
  682. case EINTR:
  683. if (srv_use_native_aio) {
  684. return(OS_FILE_AIO_INTERRUPTED);
  685. }
  686. break;
  687. case EACCES:
  688. return(OS_FILE_ACCESS_VIOLATION);
  689. }
  690. return(OS_FILE_ERROR_MAX + err);
  691. }
  692. /** Wrapper to fsync() or fdatasync() that retries the call on some errors.
  693. Returns the value 0 if successful; otherwise the value -1 is returned and
  694. the global variable errno is set to indicate the error.
  695. @param[in] file open file handle
  696. @return 0 if success, -1 otherwise */
  697. static int os_file_sync_posix(os_file_t file)
  698. {
  699. #if !defined(HAVE_FDATASYNC) || HAVE_DECL_FDATASYNC == 0
  700. auto func= fsync;
  701. auto func_name= "fsync()";
  702. #else
  703. auto func= fdatasync;
  704. auto func_name= "fdatasync()";
  705. #endif
  706. ulint failures= 0;
  707. for (;;)
  708. {
  709. ++os_n_fsyncs;
  710. int ret= func(file);
  711. if (ret == 0)
  712. return ret;
  713. switch (errno)
  714. {
  715. case ENOLCK:
  716. ++failures;
  717. ut_a(failures < 1000);
  718. if (!(failures % 100))
  719. ib::warn() << func_name << ": No locks available; retrying";
  720. std::this_thread::sleep_for(std::chrono::milliseconds(200));
  721. break;
  722. case EINTR:
  723. ++failures;
  724. ut_a(failures < 2000);
  725. break;
  726. default:
  727. ib::fatal() << func_name << " returned " << errno;
  728. }
  729. }
  730. }
  731. /** Check the existence and type of the given file.
  732. @param[in] path path name of file
  733. @param[out] exists true if the file exists
  734. @param[out] type Type of the file, if it exists
  735. @return true if call succeeded */
  736. static
  737. bool
  738. os_file_status_posix(
  739. const char* path,
  740. bool* exists,
  741. os_file_type_t* type)
  742. {
  743. struct stat statinfo;
  744. int ret = stat(path, &statinfo);
  745. *exists = !ret;
  746. if (!ret) {
  747. /* file exists, everything OK */
  748. MSAN_STAT_WORKAROUND(&statinfo);
  749. } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
  750. /* file does not exist */
  751. return(true);
  752. } else {
  753. /* file exists, but stat call failed */
  754. os_file_handle_error_no_exit(path, "stat", false);
  755. return(false);
  756. }
  757. if (S_ISDIR(statinfo.st_mode)) {
  758. *type = OS_FILE_TYPE_DIR;
  759. } else if (S_ISLNK(statinfo.st_mode)) {
  760. *type = OS_FILE_TYPE_LINK;
  761. } else if (S_ISREG(statinfo.st_mode)) {
  762. *type = OS_FILE_TYPE_FILE;
  763. } else {
  764. *type = OS_FILE_TYPE_UNKNOWN;
  765. }
  766. return(true);
  767. }
  768. /** NOTE! Use the corresponding macro os_file_flush(), not directly this
  769. function!
  770. Flushes the write buffers of a given file to the disk.
  771. @param[in] file handle to a file
  772. @return true if success */
  773. bool
  774. os_file_flush_func(
  775. os_file_t file)
  776. {
  777. int ret;
  778. ret = os_file_sync_posix(file);
  779. if (ret == 0) {
  780. return(true);
  781. }
  782. /* Since Linux returns EINVAL if the 'file' is actually a raw device,
  783. we choose to ignore that error if we are using raw disks */
  784. if (srv_start_raw_disk_in_use && errno == EINVAL) {
  785. return(true);
  786. }
  787. ib::error() << "The OS said file flush did not succeed";
  788. os_file_handle_error("flush");
  789. /* It is a fatal error if a file flush does not succeed, because then
  790. the database can get corrupt on disk */
  791. ut_error;
  792. return(false);
  793. }
  794. /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
  795. this function!
  796. A simple function to open or create a file.
  797. @param[in] name name of the file or path as a null-terminated
  798. string
  799. @param[in] create_mode create mode
  800. @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
  801. @param[in] read_only if true, read only checks are enforced
  802. @param[out] success true if succeed, false if error
  803. @return handle to the file, not defined if error, error number
  804. can be retrieved with os_file_get_last_error */
  805. pfs_os_file_t
  806. os_file_create_simple_func(
  807. const char* name,
  808. os_file_create_t create_mode,
  809. ulint access_type,
  810. bool read_only,
  811. bool* success)
  812. {
  813. pfs_os_file_t file;
  814. *success = false;
  815. int create_flag = O_RDONLY | O_CLOEXEC;
  816. if (read_only) {
  817. } else if (create_mode == OS_FILE_CREATE) {
  818. create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
  819. } else {
  820. ut_ad(create_mode == OS_FILE_OPEN);
  821. if (access_type != OS_FILE_READ_ONLY) {
  822. create_flag = O_RDWR | O_CLOEXEC;
  823. }
  824. }
  825. bool retry;
  826. #ifdef O_DIRECT
  827. int direct_flag = 0;
  828. /* This function is always called for data files, we should disable
  829. OS caching (O_DIRECT) here as we do in os_file_create_func(), so
  830. we open the same file in the same mode, see man page of open(2). */
  831. switch (srv_file_flush_method) {
  832. case SRV_O_DSYNC:
  833. case SRV_O_DIRECT:
  834. case SRV_O_DIRECT_NO_FSYNC:
  835. direct_flag = O_DIRECT;
  836. break;
  837. }
  838. #else
  839. constexpr int direct_flag = 0;
  840. #endif
  841. do {
  842. file = open(name, create_flag | direct_flag, os_innodb_umask);
  843. if (file == -1) {
  844. #ifdef O_DIRECT
  845. if (direct_flag && errno == EINVAL) {
  846. direct_flag = 0;
  847. retry = true;
  848. continue;
  849. }
  850. #endif
  851. *success = false;
  852. retry = os_file_handle_error_no_exit(
  853. name,
  854. create_mode == OS_FILE_CREATE
  855. ? "create" : "open", false);
  856. } else {
  857. *success = true;
  858. retry = false;
  859. }
  860. } while (retry);
  861. if (!read_only
  862. && *success
  863. && access_type == OS_FILE_READ_WRITE
  864. && !my_disable_locking
  865. && os_file_lock(file, name)) {
  866. *success = false;
  867. close(file);
  868. file = -1;
  869. }
  870. return(file);
  871. }
  872. /** This function attempts to create a directory named pathname. The new
  873. directory gets default permissions. On Unix the permissions are
  874. (0770 & ~umask). If the directory exists already, nothing is done and
  875. the call succeeds, unless the fail_if_exists arguments is true.
  876. If another error occurs, such as a permission error, this does not crash,
  877. but reports the error and returns false.
  878. @param[in] pathname directory name as null-terminated string
  879. @param[in] fail_if_exists if true, pre-existing directory is treated as
  880. an error.
  881. @return true if call succeeds, false on error */
  882. bool
  883. os_file_create_directory(
  884. const char* pathname,
  885. bool fail_if_exists)
  886. {
  887. int rcode;
  888. rcode = mkdir(pathname, 0770);
  889. if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
  890. /* failure */
  891. os_file_handle_error_no_exit(pathname, "mkdir", false);
  892. return(false);
  893. }
  894. return(true);
  895. }
  896. #ifdef O_DIRECT
  897. # if defined __linux
  898. /** Note that the log file uses buffered I/O. */
  899. static ATTRIBUTE_COLD void os_file_log_buffered()
  900. {
  901. log_sys.log_maybe_unbuffered= false;
  902. log_sys.log_buffered= true;
  903. }
  904. # endif
  905. /** @return whether the log file may work with unbuffered I/O. */
  906. static ATTRIBUTE_COLD bool os_file_log_maybe_unbuffered(const struct stat &st)
  907. {
  908. MSAN_STAT_WORKAROUND(&st);
  909. # ifdef __linux__
  910. char b[20 + sizeof "/sys/dev/block/" ":" "/../queue/physical_block_size"];
  911. if (snprintf(b, sizeof b, "/sys/dev/block/%u:%u/queue/physical_block_size",
  912. major(st.st_dev), minor(st.st_dev)) >=
  913. static_cast<int>(sizeof b))
  914. {
  915. fallback:
  916. log_sys.set_block_size(512);
  917. return false;
  918. }
  919. int f= open(b, O_RDONLY);
  920. if (f == -1)
  921. {
  922. if (snprintf(b, sizeof b, "/sys/dev/block/%u:%u/../queue/"
  923. "physical_block_size",
  924. major(st.st_dev), minor(st.st_dev)) >=
  925. static_cast<int>(sizeof b))
  926. goto fallback;
  927. f= open(b, O_RDONLY);
  928. }
  929. unsigned long s= 0;
  930. if (f != -1)
  931. {
  932. ssize_t l= read(f, b, sizeof b);
  933. if (l > 0 && size_t(l) < sizeof b && b[l - 1] == '\n')
  934. {
  935. char *end= b;
  936. s= strtoul(b, &end, 10);
  937. if (b == end || *end != '\n')
  938. s = 0;
  939. }
  940. close(f);
  941. }
  942. if (s > 4096 || s < 64 || !ut_is_2pow(s))
  943. goto fallback;
  944. log_sys.set_block_size(uint32_t(s));
  945. # else
  946. constexpr unsigned long s= 4096;
  947. # endif
  948. return !(st.st_size & (s - 1));
  949. }
  950. #endif
  951. /** NOTE! Use the corresponding macro os_file_create(), not directly
  952. this function!
  953. Opens an existing file or creates a new.
  954. @param[in] name name of the file or path as a null-terminated
  955. string
  956. @param[in] create_mode create mode
  957. @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
  958. is desired, OS_FILE_NORMAL, if any normal file;
  959. NOTE that it also depends on type, os_aio_..
  960. and srv_.. variables whether we really use async
  961. I/O or unbuffered I/O: look in the function
  962. source code for the exact rules
  963. @param[in] type OS_DATA_FILE or OS_LOG_FILE
  964. @param[in] read_only true, if read only checks should be enforcedm
  965. @param[in] success true if succeeded
  966. @return handle to the file, not defined if error, error number
  967. can be retrieved with os_file_get_last_error */
  968. pfs_os_file_t
  969. os_file_create_func(
  970. const char* name,
  971. os_file_create_t create_mode,
  972. ulint purpose,
  973. ulint type,
  974. bool read_only,
  975. bool* success)
  976. {
  977. *success = false;
  978. DBUG_EXECUTE_IF(
  979. "ib_create_table_fail_disk_full",
  980. errno = ENOSPC;
  981. return(OS_FILE_CLOSED);
  982. );
  983. int create_flag;
  984. if (read_only) {
  985. create_flag = O_RDONLY | O_CLOEXEC;
  986. } else if (create_mode == OS_FILE_CREATE
  987. || create_mode == OS_FILE_CREATE_SILENT) {
  988. create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
  989. } else {
  990. ut_ad(create_mode == OS_FILE_OPEN
  991. || create_mode == OS_FILE_OPEN_SILENT
  992. || create_mode == OS_FILE_OPEN_RETRY
  993. || create_mode == OS_FILE_OPEN_RETRY_SILENT
  994. || create_mode == OS_FILE_OPEN_RAW);
  995. create_flag = O_RDWR | O_CLOEXEC;
  996. }
  997. #ifdef O_DIRECT
  998. struct stat st;
  999. ut_a(type == OS_LOG_FILE
  1000. || type == OS_DATA_FILE || type == OS_DATA_FILE_NO_O_DIRECT);
  1001. int direct_flag = 0;
  1002. if (type == OS_DATA_FILE) {
  1003. switch (srv_file_flush_method) {
  1004. case SRV_O_DSYNC:
  1005. case SRV_O_DIRECT:
  1006. case SRV_O_DIRECT_NO_FSYNC:
  1007. direct_flag = O_DIRECT;
  1008. break;
  1009. default:
  1010. break;
  1011. }
  1012. # ifdef __linux__
  1013. } else if (type == OS_LOG_FILE && create_mode != OS_FILE_CREATE
  1014. && create_mode != OS_FILE_CREATE_SILENT
  1015. && !log_sys.is_opened()) {
  1016. if (stat(name, &st)) {
  1017. if (errno == ENOENT) {
  1018. if (create_mode & OS_FILE_ON_ERROR_SILENT) {
  1019. goto not_found;
  1020. }
  1021. sql_print_error(
  1022. "InnoDB: File %s was not found", name);
  1023. goto not_found;
  1024. }
  1025. log_sys.set_block_size(512);
  1026. goto skip_o_direct;
  1027. } else if (!os_file_log_maybe_unbuffered(st)
  1028. || log_sys.log_buffered) {
  1029. skip_o_direct:
  1030. os_file_log_buffered();
  1031. } else {
  1032. direct_flag = O_DIRECT;
  1033. log_sys.log_maybe_unbuffered = true;
  1034. }
  1035. # endif
  1036. }
  1037. #else
  1038. ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
  1039. constexpr int direct_flag = 0;
  1040. #endif
  1041. ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
  1042. /* We let O_DSYNC only affect log files */
  1043. if (!read_only
  1044. && type == OS_LOG_FILE
  1045. && srv_file_flush_method == SRV_O_DSYNC) {
  1046. #ifdef O_DSYNC
  1047. create_flag |= O_DSYNC;
  1048. #else
  1049. create_flag |= O_SYNC;
  1050. #endif
  1051. }
  1052. os_file_t file;
  1053. for (;;) {
  1054. file = open(name, create_flag | direct_flag, os_innodb_umask);
  1055. if (file == -1) {
  1056. #ifdef O_DIRECT
  1057. if (direct_flag && errno == EINVAL) {
  1058. direct_flag = 0;
  1059. # ifdef __linux__
  1060. if (type == OS_LOG_FILE) {
  1061. os_file_log_buffered();
  1062. }
  1063. # endif
  1064. if (create_mode == OS_FILE_CREATE
  1065. || create_mode == OS_FILE_CREATE_SILENT) {
  1066. /* Linux may create the file
  1067. before rejecting the O_DIRECT. */
  1068. unlink(name);
  1069. }
  1070. continue;
  1071. }
  1072. #endif
  1073. if (!os_file_handle_error_no_exit(
  1074. name, (create_flag & O_CREAT)
  1075. ? "create" : "open",
  1076. create_mode & OS_FILE_ON_ERROR_SILENT)) {
  1077. break;
  1078. }
  1079. } else {
  1080. *success = true;
  1081. break;
  1082. }
  1083. }
  1084. if (!*success) {
  1085. #ifdef __linux__
  1086. not_found:
  1087. #endif
  1088. return OS_FILE_CLOSED;
  1089. }
  1090. #ifdef __linux__
  1091. if ((create_flag & O_CREAT) && type == OS_LOG_FILE) {
  1092. if (fstat(file, &st) || !os_file_log_maybe_unbuffered(st)) {
  1093. os_file_log_buffered();
  1094. } else {
  1095. close(file);
  1096. return os_file_create_func(name, OS_FILE_OPEN, purpose,
  1097. type, false, success);
  1098. }
  1099. }
  1100. #endif
  1101. if (!read_only
  1102. && create_mode != OS_FILE_OPEN_RAW
  1103. && !my_disable_locking
  1104. && os_file_lock(file, name)) {
  1105. if (create_mode == OS_FILE_OPEN_RETRY
  1106. || create_mode == OS_FILE_OPEN_RETRY_SILENT) {
  1107. ib::info()
  1108. << "Retrying to lock the first data file";
  1109. for (int i = 0; i < 100; i++) {
  1110. std::this_thread::sleep_for(
  1111. std::chrono::seconds(1));
  1112. if (!os_file_lock(file, name)) {
  1113. *success = true;
  1114. return(file);
  1115. }
  1116. }
  1117. ib::info()
  1118. << "Unable to open the first data file";
  1119. }
  1120. *success = false;
  1121. close(file);
  1122. file = -1;
  1123. }
  1124. return(file);
  1125. }
  1126. /** NOTE! Use the corresponding macro
  1127. os_file_create_simple_no_error_handling(), not directly this function!
  1128. A simple function to open or create a file.
  1129. @param[in] name name of the file or path as a null-terminated
  1130. string
  1131. @param[in] create_mode OS_FILE_CREATE or OS_FILE_OPEN
  1132. @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
  1133. OS_FILE_READ_ALLOW_DELETE; the last option
  1134. is used by a backup program reading the file
  1135. @param[in] read_only if true read only mode checks are enforced
  1136. @param[out] success true if succeeded
  1137. @return own: handle to the file, not defined if error, error number
  1138. can be retrieved with os_file_get_last_error */
  1139. pfs_os_file_t
  1140. os_file_create_simple_no_error_handling_func(
  1141. const char* name,
  1142. os_file_create_t create_mode,
  1143. ulint access_type,
  1144. bool read_only,
  1145. bool* success)
  1146. {
  1147. os_file_t file;
  1148. int create_flag = O_RDONLY | O_CLOEXEC;
  1149. *success = false;
  1150. if (read_only) {
  1151. } else if (create_mode == OS_FILE_CREATE) {
  1152. create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
  1153. } else {
  1154. ut_ad(create_mode == OS_FILE_OPEN);
  1155. if (access_type != OS_FILE_READ_ONLY) {
  1156. ut_a(access_type == OS_FILE_READ_WRITE
  1157. || access_type == OS_FILE_READ_ALLOW_DELETE);
  1158. create_flag = O_RDWR;
  1159. }
  1160. }
  1161. file = open(name, create_flag, os_innodb_umask);
  1162. *success = (file != -1);
  1163. if (!read_only
  1164. && *success
  1165. && access_type == OS_FILE_READ_WRITE
  1166. && !my_disable_locking
  1167. && os_file_lock(file, name)) {
  1168. *success = false;
  1169. close(file);
  1170. file = -1;
  1171. }
  1172. return(file);
  1173. }
  1174. /** Deletes a file if it exists. The file has to be closed before calling this.
  1175. @param[in] name file path as a null-terminated string
  1176. @param[out] exist indicate if file pre-exist
  1177. @return true if success */
  1178. bool
  1179. os_file_delete_if_exists_func(
  1180. const char* name,
  1181. bool* exist)
  1182. {
  1183. if (exist != NULL) {
  1184. *exist = true;
  1185. }
  1186. int ret;
  1187. ret = unlink(name);
  1188. if (ret != 0 && errno == ENOENT) {
  1189. if (exist != NULL) {
  1190. *exist = false;
  1191. }
  1192. } else if (ret != 0 && errno != ENOENT) {
  1193. os_file_handle_error_no_exit(name, "delete", false);
  1194. return(false);
  1195. }
  1196. return(true);
  1197. }
  1198. /** Deletes a file. The file has to be closed before calling this.
  1199. @param[in] name file path as a null-terminated string
  1200. @return true if success */
  1201. bool
  1202. os_file_delete_func(
  1203. const char* name)
  1204. {
  1205. int ret;
  1206. ret = unlink(name);
  1207. if (ret != 0) {
  1208. os_file_handle_error_no_exit(name, "delete", FALSE);
  1209. return(false);
  1210. }
  1211. return(true);
  1212. }
  1213. /** NOTE! Use the corresponding macro os_file_rename(), not directly this
  1214. function!
  1215. Renames a file (can also move it to another directory). It is safest that the
  1216. file is closed before calling this function.
  1217. @param[in] oldpath old file path as a null-terminated string
  1218. @param[in] newpath new file path
  1219. @return true if success */
  1220. bool
  1221. os_file_rename_func(
  1222. const char* oldpath,
  1223. const char* newpath)
  1224. {
  1225. #ifdef UNIV_DEBUG
  1226. os_file_type_t type;
  1227. bool exists;
  1228. /* New path must not exist. */
  1229. ut_ad(os_file_status(newpath, &exists, &type));
  1230. ut_ad(!exists);
  1231. /* Old path must exist. */
  1232. ut_ad(os_file_status(oldpath, &exists, &type));
  1233. ut_ad(exists);
  1234. #endif /* UNIV_DEBUG */
  1235. int ret;
  1236. ret = rename(oldpath, newpath);
  1237. if (ret != 0) {
  1238. os_file_handle_rename_error(oldpath, newpath);
  1239. return(false);
  1240. }
  1241. return(true);
  1242. }
  1243. /** NOTE! Use the corresponding macro os_file_close(), not directly this
  1244. function!
  1245. Closes a file handle. In case of error, error number can be retrieved with
  1246. os_file_get_last_error.
  1247. @param[in] file Handle to close
  1248. @return true if success */
  1249. bool os_file_close_func(os_file_t file)
  1250. {
  1251. int ret= close(file);
  1252. if (!ret)
  1253. return true;
  1254. os_file_handle_error("close");
  1255. return false;
  1256. }
  1257. /** Gets a file size.
  1258. @param[in] file handle to an open file
  1259. @return file size, or (os_offset_t) -1 on failure */
  1260. os_offset_t
  1261. os_file_get_size(os_file_t file)
  1262. {
  1263. struct stat statbuf;
  1264. if (fstat(file, &statbuf)) return os_offset_t(-1);
  1265. MSAN_STAT_WORKAROUND(&statbuf);
  1266. return statbuf.st_size;
  1267. }
  1268. /** Gets a file size.
  1269. @param[in] filename Full path to the filename to check
  1270. @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
  1271. errno */
  1272. os_file_size_t
  1273. os_file_get_size(
  1274. const char* filename)
  1275. {
  1276. struct stat s;
  1277. os_file_size_t file_size;
  1278. int ret = stat(filename, &s);
  1279. if (ret == 0) {
  1280. MSAN_STAT_WORKAROUND(&s);
  1281. file_size.m_total_size = s.st_size;
  1282. /* st_blocks is in 512 byte sized blocks */
  1283. file_size.m_alloc_size = s.st_blocks * 512;
  1284. } else {
  1285. file_size.m_total_size = ~0U;
  1286. file_size.m_alloc_size = (os_offset_t) errno;
  1287. }
  1288. return(file_size);
  1289. }
  1290. /** This function returns information about the specified file
  1291. @param[in] path pathname of the file
  1292. @param[out] stat_info information of a file in a directory
  1293. @param[in,out] statinfo information of a file in a directory
  1294. @param[in] check_rw_perm for testing whether the file can be opened
  1295. in RW mode
  1296. @param[in] read_only if true read only mode checks are enforced
  1297. @return DB_SUCCESS if all OK */
  1298. static
  1299. dberr_t
  1300. os_file_get_status_posix(
  1301. const char* path,
  1302. os_file_stat_t* stat_info,
  1303. struct stat* statinfo,
  1304. bool check_rw_perm,
  1305. bool read_only)
  1306. {
  1307. int ret = stat(path, statinfo);
  1308. if (ret && (errno == ENOENT || errno == ENOTDIR
  1309. || errno == ENAMETOOLONG)) {
  1310. /* file does not exist */
  1311. return(DB_NOT_FOUND);
  1312. } else if (ret) {
  1313. /* file exists, but stat call failed */
  1314. os_file_handle_error_no_exit(path, "stat", false);
  1315. return(DB_FAIL);
  1316. }
  1317. MSAN_STAT_WORKAROUND(statinfo);
  1318. switch (statinfo->st_mode & S_IFMT) {
  1319. case S_IFDIR:
  1320. stat_info->type = OS_FILE_TYPE_DIR;
  1321. break;
  1322. case S_IFLNK:
  1323. stat_info->type = OS_FILE_TYPE_LINK;
  1324. break;
  1325. case S_IFBLK:
  1326. /* Handle block device as regular file. */
  1327. case S_IFCHR:
  1328. /* Handle character device as regular file. */
  1329. case S_IFREG:
  1330. stat_info->type = OS_FILE_TYPE_FILE;
  1331. break;
  1332. default:
  1333. stat_info->type = OS_FILE_TYPE_UNKNOWN;
  1334. }
  1335. stat_info->size = statinfo->st_size;
  1336. stat_info->block_size = statinfo->st_blksize;
  1337. stat_info->alloc_size = statinfo->st_blocks * 512;
  1338. if (check_rw_perm
  1339. && (stat_info->type == OS_FILE_TYPE_FILE
  1340. || stat_info->type == OS_FILE_TYPE_BLOCK)) {
  1341. stat_info->rw_perm = !access(path, read_only
  1342. ? R_OK : R_OK | W_OK);
  1343. }
  1344. return(DB_SUCCESS);
  1345. }
  1346. /** Truncates a file to a specified size in bytes.
  1347. Do nothing if the size to preserve is greater or equal to the current
  1348. size of the file.
  1349. @param[in] pathname file path
  1350. @param[in] file file to be truncated
  1351. @param[in] size size to preserve in bytes
  1352. @return true if success */
  1353. static
  1354. bool
  1355. os_file_truncate_posix(
  1356. const char* pathname,
  1357. os_file_t file,
  1358. os_offset_t size)
  1359. {
  1360. int res = ftruncate(file, size);
  1361. if (res == -1) {
  1362. bool retry;
  1363. retry = os_file_handle_error_no_exit(
  1364. pathname, "truncate", false);
  1365. if (retry) {
  1366. ib::warn()
  1367. << "Truncate failed for '"
  1368. << pathname << "'";
  1369. }
  1370. }
  1371. return(res == 0);
  1372. }
  1373. /** Truncates a file at its current position.
  1374. @return true if success */
  1375. bool
  1376. os_file_set_eof(
  1377. FILE* file) /*!< in: file to be truncated */
  1378. {
  1379. return(!ftruncate(fileno(file), ftell(file)));
  1380. }
  1381. #else /* !_WIN32 */
  1382. #include <WinIoCtl.h>
  1383. /** Free storage space associated with a section of the file.
  1384. @param[in] fh Open file handle
  1385. @param[in] off Starting offset (SEEK_SET)
  1386. @param[in] len Size of the hole
  1387. @return 0 on success or errno */
  1388. static
  1389. dberr_t
  1390. os_file_punch_hole_win32(
  1391. os_file_t fh,
  1392. os_offset_t off,
  1393. os_offset_t len)
  1394. {
  1395. FILE_ZERO_DATA_INFORMATION punch;
  1396. punch.FileOffset.QuadPart = off;
  1397. punch.BeyondFinalZero.QuadPart = off + len;
  1398. /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
  1399. therefore we pass a dummy parameter. */
  1400. DWORD temp;
  1401. BOOL success = os_win32_device_io_control(
  1402. fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
  1403. NULL, 0, &temp);
  1404. return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE);
  1405. }
  1406. /** Check the existence and type of the given file.
  1407. @param[in] path path name of file
  1408. @param[out] exists true if the file exists
  1409. @param[out] type Type of the file, if it exists
  1410. @return true if call succeeded */
  1411. static
  1412. bool
  1413. os_file_status_win32(
  1414. const char* path,
  1415. bool* exists,
  1416. os_file_type_t* type)
  1417. {
  1418. int ret;
  1419. struct _stat64 statinfo;
  1420. ret = _stat64(path, &statinfo);
  1421. *exists = !ret;
  1422. if (!ret) {
  1423. /* file exists, everything OK */
  1424. } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
  1425. /* file does not exist */
  1426. return(true);
  1427. } else {
  1428. /* file exists, but stat call failed */
  1429. os_file_handle_error_no_exit(path, "stat", false);
  1430. return(false);
  1431. }
  1432. if (_S_IFDIR & statinfo.st_mode) {
  1433. *type = OS_FILE_TYPE_DIR;
  1434. } else if (_S_IFREG & statinfo.st_mode) {
  1435. *type = OS_FILE_TYPE_FILE;
  1436. } else {
  1437. *type = OS_FILE_TYPE_UNKNOWN;
  1438. }
  1439. return(true);
  1440. }
  1441. /* Dynamically load NtFlushBuffersFileEx, used in os_file_flush_func */
  1442. #include <winternl.h>
  1443. typedef NTSTATUS(WINAPI* pNtFlushBuffersFileEx)(
  1444. HANDLE FileHandle, ULONG Flags, PVOID Parameters, ULONG ParametersSize,
  1445. PIO_STATUS_BLOCK IoStatusBlock);
  1446. static pNtFlushBuffersFileEx my_NtFlushBuffersFileEx
  1447. = (pNtFlushBuffersFileEx)GetProcAddress(GetModuleHandle("ntdll"),
  1448. "NtFlushBuffersFileEx");
  1449. /** NOTE! Use the corresponding macro os_file_flush(), not directly this
  1450. function!
  1451. Flushes the write buffers of a given file to the disk.
  1452. @param[in] file handle to a file
  1453. @return true if success */
  1454. bool os_file_flush_func(os_file_t file)
  1455. {
  1456. ++os_n_fsyncs;
  1457. static bool disable_datasync;
  1458. if (my_NtFlushBuffersFileEx && !disable_datasync)
  1459. {
  1460. IO_STATUS_BLOCK iosb{};
  1461. NTSTATUS status= my_NtFlushBuffersFileEx(
  1462. file, FLUSH_FLAGS_FILE_DATA_SYNC_ONLY, nullptr, 0, &iosb);
  1463. if (!status)
  1464. return true;
  1465. /*
  1466. NtFlushBuffersFileEx(FLUSH_FLAGS_FILE_DATA_SYNC_ONLY) might fail
  1467. unless on Win10+, and maybe non-NTFS. Switch to using FlushFileBuffers().
  1468. */
  1469. disable_datasync= true;
  1470. }
  1471. if (FlushFileBuffers(file))
  1472. return true;
  1473. /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
  1474. actually a raw device, we choose to ignore that error if we are using
  1475. raw disks */
  1476. if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION)
  1477. return true;
  1478. os_file_handle_error("flush");
  1479. /* It is a fatal error if a file flush does not succeed, because then
  1480. the database can get corrupt on disk */
  1481. ut_error;
  1482. return false;
  1483. }
  1484. /** Retrieves the last error number if an error occurs in a file io function.
  1485. The number should be retrieved before any other OS calls (because they may
  1486. overwrite the error number). If the number is not known to this program,
  1487. then OS error number + OS_FILE_ERROR_MAX is returned.
  1488. @param[in] report_all_errors true if we want an error message
  1489. printed of all errors
  1490. @param[in] on_error_silent true then don't print any diagnostic
  1491. to the log
  1492. @return error number, or OS error number + OS_FILE_ERROR_MAX */
  1493. ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent)
  1494. {
  1495. ulint err = (ulint) GetLastError();
  1496. if (err == ERROR_SUCCESS) {
  1497. return(0);
  1498. }
  1499. if (report_all_errors
  1500. || (!on_error_silent
  1501. && err != ERROR_DISK_FULL
  1502. && err != ERROR_FILE_NOT_FOUND
  1503. && err != ERROR_FILE_EXISTS)) {
  1504. ib::error()
  1505. << "Operating system error number " << err
  1506. << " in a file operation.";
  1507. switch (err) {
  1508. case ERROR_PATH_NOT_FOUND:
  1509. break;
  1510. case ERROR_ACCESS_DENIED:
  1511. ib::error()
  1512. << "The error means mariadbd does not have"
  1513. " the access rights to"
  1514. " the directory. It may also be"
  1515. " you have created a subdirectory"
  1516. " of the same name as a data file.";
  1517. break;
  1518. case ERROR_SHARING_VIOLATION:
  1519. case ERROR_LOCK_VIOLATION:
  1520. ib::error()
  1521. << "The error means that another program"
  1522. " is using InnoDB's files."
  1523. " This might be a backup or antivirus"
  1524. " software or another instance"
  1525. " of MariaDB."
  1526. " Please close it to get rid of this error.";
  1527. break;
  1528. case ERROR_WORKING_SET_QUOTA:
  1529. case ERROR_NO_SYSTEM_RESOURCES:
  1530. ib::error()
  1531. << "The error means that there are no"
  1532. " sufficient system resources or quota to"
  1533. " complete the operation.";
  1534. break;
  1535. case ERROR_OPERATION_ABORTED:
  1536. ib::error()
  1537. << "The error means that the I/O"
  1538. " operation has been aborted"
  1539. " because of either a thread exit"
  1540. " or an application request."
  1541. " Retry attempt is made.";
  1542. break;
  1543. default:
  1544. ib::info() << OPERATING_SYSTEM_ERROR_MSG;
  1545. }
  1546. }
  1547. if (err == ERROR_FILE_NOT_FOUND) {
  1548. return(OS_FILE_NOT_FOUND);
  1549. } else if (err == ERROR_DISK_FULL) {
  1550. return(OS_FILE_DISK_FULL);
  1551. } else if (err == ERROR_FILE_EXISTS) {
  1552. return(OS_FILE_ALREADY_EXISTS);
  1553. } else if (err == ERROR_SHARING_VIOLATION
  1554. || err == ERROR_LOCK_VIOLATION) {
  1555. return(OS_FILE_SHARING_VIOLATION);
  1556. } else if (err == ERROR_WORKING_SET_QUOTA
  1557. || err == ERROR_NO_SYSTEM_RESOURCES) {
  1558. return(OS_FILE_INSUFFICIENT_RESOURCE);
  1559. } else if (err == ERROR_OPERATION_ABORTED) {
  1560. return(OS_FILE_OPERATION_ABORTED);
  1561. } else if (err == ERROR_ACCESS_DENIED) {
  1562. return(OS_FILE_ACCESS_VIOLATION);
  1563. }
  1564. return(OS_FILE_ERROR_MAX + err);
  1565. }
  1566. /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
  1567. this function!
  1568. A simple function to open or create a file.
  1569. @param[in] name name of the file or path as a null-terminated
  1570. string
  1571. @param[in] create_mode create mode
  1572. @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
  1573. @param[in] read_only if true read only mode checks are enforced
  1574. @param[out] success true if succeed, false if error
  1575. @return handle to the file, not defined if error, error number
  1576. can be retrieved with os_file_get_last_error */
  1577. pfs_os_file_t
  1578. os_file_create_simple_func(
  1579. const char* name,
  1580. os_file_create_t create_mode,
  1581. ulint access_type,
  1582. bool read_only,
  1583. bool* success)
  1584. {
  1585. os_file_t file;
  1586. *success = false;
  1587. DWORD access = GENERIC_READ;
  1588. DWORD create_flag;
  1589. DWORD attributes = 0;
  1590. ut_ad(srv_operation == SRV_OPERATION_NORMAL);
  1591. if (read_only || create_mode == OS_FILE_OPEN) {
  1592. create_flag = OPEN_EXISTING;
  1593. } else {
  1594. ut_ad(create_mode == OS_FILE_CREATE);
  1595. create_flag = CREATE_NEW;
  1596. }
  1597. if (access_type == OS_FILE_READ_ONLY) {
  1598. } else if (read_only) {
  1599. ib::info()
  1600. << "Read only mode set. Unable to"
  1601. " open file '" << name << "' in RW mode, "
  1602. << "trying RO mode";
  1603. } else {
  1604. ut_ad(access_type == OS_FILE_READ_WRITE);
  1605. access = GENERIC_READ | GENERIC_WRITE;
  1606. }
  1607. for (;;) {
  1608. /* Use default security attributes and no template file. */
  1609. file = CreateFile(
  1610. (LPCTSTR) name, access,
  1611. FILE_SHARE_READ | FILE_SHARE_DELETE,
  1612. my_win_file_secattr(), create_flag, attributes, NULL);
  1613. if (file != INVALID_HANDLE_VALUE) {
  1614. *success = true;
  1615. break;
  1616. }
  1617. if (!os_file_handle_error_no_exit(name,
  1618. create_flag == CREATE_NEW
  1619. ? "create" : "open",
  1620. false)) {
  1621. break;
  1622. }
  1623. }
  1624. return(file);
  1625. }
  1626. /** This function attempts to create a directory named pathname. The new
  1627. directory gets default permissions. On Unix the permissions are
  1628. (0770 & ~umask). If the directory exists already, nothing is done and
  1629. the call succeeds, unless the fail_if_exists arguments is true.
  1630. If another error occurs, such as a permission error, this does not crash,
  1631. but reports the error and returns false.
  1632. @param[in] pathname directory name as null-terminated string
  1633. @param[in] fail_if_exists if true, pre-existing directory is treated
  1634. as an error.
  1635. @return true if call succeeds, false on error */
  1636. bool
  1637. os_file_create_directory(
  1638. const char* pathname,
  1639. bool fail_if_exists)
  1640. {
  1641. BOOL rcode;
  1642. rcode = CreateDirectory((LPCTSTR) pathname, NULL);
  1643. if (!(rcode != 0
  1644. || (GetLastError() == ERROR_ALREADY_EXISTS
  1645. && !fail_if_exists))) {
  1646. os_file_handle_error_no_exit(
  1647. pathname, "CreateDirectory", false);
  1648. return(false);
  1649. }
  1650. return(true);
  1651. }
  1652. /** Get disk sector size for a file. */
  1653. static size_t get_sector_size(HANDLE file)
  1654. {
  1655. FILE_STORAGE_INFO fsi;
  1656. ULONG s= 4096;
  1657. if (GetFileInformationByHandleEx(file, FileStorageInfo, &fsi, sizeof fsi))
  1658. {
  1659. s= fsi.PhysicalBytesPerSectorForPerformance;
  1660. if (s > 4096 || s < 64 || !ut_is_2pow(s))
  1661. return 4096;
  1662. }
  1663. return s;
  1664. }
  1665. /** NOTE! Use the corresponding macro os_file_create(), not directly
  1666. this function!
  1667. Opens an existing file or creates a new.
  1668. @param[in] name name of the file or path as a null-terminated
  1669. string
  1670. @param[in] create_mode create mode
  1671. @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
  1672. is desired, OS_FILE_NORMAL, if any normal file;
  1673. NOTE that it also depends on type, os_aio_..
  1674. and srv_.. variables whether we really use async
  1675. I/O or unbuffered I/O: look in the function
  1676. source code for the exact rules
  1677. @param[in] type OS_DATA_FILE or OS_LOG_FILE
  1678. @param[in] success true if succeeded
  1679. @return handle to the file, not defined if error, error number
  1680. can be retrieved with os_file_get_last_error */
  1681. pfs_os_file_t
  1682. os_file_create_func(
  1683. const char* name,
  1684. os_file_create_t create_mode,
  1685. ulint purpose,
  1686. ulint type,
  1687. bool read_only,
  1688. bool* success)
  1689. {
  1690. os_file_t file;
  1691. *success = false;
  1692. DBUG_EXECUTE_IF(
  1693. "ib_create_table_fail_disk_full",
  1694. *success = false;
  1695. SetLastError(ERROR_DISK_FULL);
  1696. return(OS_FILE_CLOSED);
  1697. );
  1698. DWORD create_flag = OPEN_EXISTING;
  1699. DWORD share_mode = read_only
  1700. ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
  1701. : FILE_SHARE_READ | FILE_SHARE_DELETE;
  1702. switch (create_mode) {
  1703. case OS_FILE_OPEN_RAW:
  1704. ut_a(!read_only);
  1705. /* On Windows Physical devices require admin privileges and
  1706. have to have the write-share mode set. See the remarks
  1707. section for the CreateFile() function documentation in MSDN. */
  1708. share_mode |= FILE_SHARE_WRITE;
  1709. break;
  1710. case OS_FILE_CREATE_SILENT:
  1711. case OS_FILE_CREATE:
  1712. create_flag = CREATE_NEW;
  1713. break;
  1714. default:
  1715. ut_ad(create_mode == OS_FILE_OPEN
  1716. || create_mode == OS_FILE_OPEN_SILENT
  1717. || create_mode == OS_FILE_OPEN_RETRY_SILENT
  1718. || create_mode == OS_FILE_OPEN_RETRY);
  1719. break;
  1720. }
  1721. DWORD attributes = (purpose == OS_FILE_AIO && srv_use_native_aio)
  1722. ? FILE_FLAG_OVERLAPPED : 0;
  1723. if (type == OS_LOG_FILE) {
  1724. if (!log_sys.is_opened() && !log_sys.log_buffered) {
  1725. attributes|= FILE_FLAG_NO_BUFFERING;
  1726. }
  1727. if (srv_file_flush_method == SRV_O_DSYNC)
  1728. attributes|= FILE_FLAG_WRITE_THROUGH;
  1729. }
  1730. else if (type == OS_DATA_FILE) {
  1731. switch (srv_file_flush_method) {
  1732. case SRV_FSYNC:
  1733. case SRV_LITTLESYNC:
  1734. case SRV_NOSYNC:
  1735. break;
  1736. default:
  1737. attributes|= FILE_FLAG_NO_BUFFERING;
  1738. }
  1739. }
  1740. DWORD access = GENERIC_READ;
  1741. if (!read_only) {
  1742. access |= GENERIC_WRITE;
  1743. }
  1744. for (;;) {
  1745. const char *operation;
  1746. /* Use default security attributes and no template file. */
  1747. file = CreateFile(
  1748. name, access, share_mode, my_win_file_secattr(),
  1749. create_flag, attributes, NULL);
  1750. *success = file != INVALID_HANDLE_VALUE;
  1751. if (*success && type == OS_LOG_FILE) {
  1752. uint32_t s = uint32_t(get_sector_size(file));
  1753. log_sys.set_block_size(s);
  1754. if (attributes & FILE_FLAG_NO_BUFFERING) {
  1755. if (os_file_get_size(file) % s) {
  1756. attributes &= ~FILE_FLAG_NO_BUFFERING;
  1757. create_flag = OPEN_ALWAYS;
  1758. CloseHandle(file);
  1759. continue;
  1760. }
  1761. log_sys.log_buffered = false;
  1762. }
  1763. }
  1764. if (*success) {
  1765. break;
  1766. }
  1767. operation = create_flag == CREATE_NEW ? "create" : "open";
  1768. if (!os_file_handle_error_no_exit(name, operation,
  1769. create_mode
  1770. & OS_FILE_ON_ERROR_SILENT)) {
  1771. break;
  1772. }
  1773. }
  1774. if (*success && (attributes & FILE_FLAG_OVERLAPPED) && srv_thread_pool) {
  1775. srv_thread_pool->bind(file);
  1776. }
  1777. return(file);
  1778. }
  1779. /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
  1780. not directly this function!
  1781. A simple function to open or create a file.
  1782. @param[in] name name of the file or path as a null-terminated
  1783. string
  1784. @param[in] create_mode create mode
  1785. @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
  1786. OS_FILE_READ_ALLOW_DELETE; the last option is
  1787. used by a backup program reading the file
  1788. @param[out] success true if succeeded
  1789. @return own: handle to the file, not defined if error, error number
  1790. can be retrieved with os_file_get_last_error */
  1791. pfs_os_file_t
  1792. os_file_create_simple_no_error_handling_func(
  1793. const char* name,
  1794. os_file_create_t create_mode,
  1795. ulint access_type,
  1796. bool read_only,
  1797. bool* success)
  1798. {
  1799. os_file_t file;
  1800. DWORD access = GENERIC_READ;
  1801. DWORD create_flag = OPEN_EXISTING;
  1802. DWORD attributes = 0;
  1803. DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_DELETE;
  1804. ut_a(name);
  1805. if (read_only) {
  1806. share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE
  1807. | FILE_SHARE_DELETE;
  1808. } else {
  1809. if (create_mode == OS_FILE_CREATE) {
  1810. create_flag = CREATE_NEW;
  1811. } else {
  1812. ut_ad(create_mode == OS_FILE_OPEN);
  1813. }
  1814. switch (access_type) {
  1815. case OS_FILE_READ_ONLY: break;
  1816. case OS_FILE_READ_WRITE:
  1817. access = GENERIC_READ | GENERIC_WRITE;
  1818. break;
  1819. default:
  1820. ut_ad(access_type == OS_FILE_READ_ALLOW_DELETE);
  1821. /* A backup program has to give mariadbd the maximum
  1822. freedom to do what it likes with the file */
  1823. share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
  1824. | FILE_SHARE_READ;
  1825. }
  1826. }
  1827. file = CreateFile((LPCTSTR) name,
  1828. access,
  1829. share_mode,
  1830. my_win_file_secattr(),
  1831. create_flag,
  1832. attributes,
  1833. NULL); // No template file
  1834. *success = (file != INVALID_HANDLE_VALUE);
  1835. return(file);
  1836. }
  1837. /** Deletes a file if it exists. The file has to be closed before calling this.
  1838. @param[in] name file path as a null-terminated string
  1839. @param[out] exist indicate if file pre-exist
  1840. @return true if success */
  1841. bool
  1842. os_file_delete_if_exists_func(
  1843. const char* name,
  1844. bool* exist)
  1845. {
  1846. ulint count = 0;
  1847. if (exist != NULL) {
  1848. *exist = true;
  1849. }
  1850. for (;;) {
  1851. /* In Windows, deleting an .ibd file may fail if
  1852. the file is being accessed by an external program,
  1853. such as a backup tool. */
  1854. bool ret = DeleteFile((LPCTSTR) name);
  1855. if (ret) {
  1856. return(true);
  1857. }
  1858. switch (GetLastError()) {
  1859. case ERROR_FILE_NOT_FOUND:
  1860. case ERROR_PATH_NOT_FOUND:
  1861. /* the file does not exist, this not an error */
  1862. if (exist != NULL) {
  1863. *exist = false;
  1864. }
  1865. /* fall through */
  1866. case ERROR_ACCESS_DENIED:
  1867. return(true);
  1868. }
  1869. ++count;
  1870. if (count > 100 && 0 == (count % 10)) {
  1871. /* Print error information */
  1872. os_file_get_last_error(true);
  1873. ib::warn() << "Delete of file '" << name << "' failed.";
  1874. }
  1875. std::this_thread::sleep_for(std::chrono::seconds(1));
  1876. if (count > 2000) {
  1877. return(false);
  1878. }
  1879. }
  1880. }
  1881. /** Deletes a file. The file has to be closed before calling this.
  1882. @param[in] name File path as NUL terminated string
  1883. @return true if success */
  1884. bool
  1885. os_file_delete_func(
  1886. const char* name)
  1887. {
  1888. ulint count = 0;
  1889. for (;;) {
  1890. /* In Windows, deleting an .ibd file may fail if
  1891. the file is being accessed by an external program,
  1892. such as a backup tool. */
  1893. BOOL ret = DeleteFile((LPCTSTR) name);
  1894. if (ret) {
  1895. return(true);
  1896. }
  1897. if (GetLastError() == ERROR_FILE_NOT_FOUND) {
  1898. /* If the file does not exist, we classify this as
  1899. a 'mild' error and return */
  1900. return(false);
  1901. }
  1902. ++count;
  1903. if (count > 100 && 0 == (count % 10)) {
  1904. /* print error information */
  1905. os_file_get_last_error(true);
  1906. ib::warn()
  1907. << "Cannot delete file '" << name << "'. Is "
  1908. << "another program accessing it?";
  1909. }
  1910. std::this_thread::sleep_for(std::chrono::seconds(1));
  1911. if (count > 2000) {
  1912. return(false);
  1913. }
  1914. }
  1915. ut_error;
  1916. return(false);
  1917. }
  1918. /** NOTE! Use the corresponding macro os_file_rename(), not directly this
  1919. function!
  1920. Renames a file (can also move it to another directory). It is safest that the
  1921. file is closed before calling this function.
  1922. @param[in] oldpath old file path as a null-terminated string
  1923. @param[in] newpath new file path
  1924. @return true if success */
  1925. bool
  1926. os_file_rename_func(
  1927. const char* oldpath,
  1928. const char* newpath)
  1929. {
  1930. #ifdef UNIV_DEBUG
  1931. os_file_type_t type;
  1932. bool exists;
  1933. /* New path must not exist. */
  1934. ut_ad(os_file_status(newpath, &exists, &type));
  1935. ut_ad(!exists);
  1936. /* Old path must exist. */
  1937. ut_ad(os_file_status(oldpath, &exists, &type));
  1938. ut_ad(exists);
  1939. #endif /* UNIV_DEBUG */
  1940. if (MoveFileEx(oldpath, newpath, MOVEFILE_REPLACE_EXISTING)) {
  1941. return(true);
  1942. }
  1943. os_file_handle_rename_error(oldpath, newpath);
  1944. return(false);
  1945. }
  1946. /** NOTE! Use the corresponding macro os_file_close(), not directly
  1947. this function!
  1948. Closes a file handle. In case of error, error number can be retrieved with
  1949. os_file_get_last_error.
  1950. @param[in,own] file Handle to a file
  1951. @return true if success */
  1952. bool os_file_close_func(os_file_t file)
  1953. {
  1954. ut_ad(file);
  1955. if (!CloseHandle(file))
  1956. {
  1957. os_file_handle_error("close");
  1958. return false;
  1959. }
  1960. if(srv_thread_pool)
  1961. srv_thread_pool->unbind(file);
  1962. return true;
  1963. }
  1964. /** Gets a file size.
  1965. @param[in] file Handle to a file
  1966. @return file size, or (os_offset_t) -1 on failure */
  1967. os_offset_t os_file_get_size(os_file_t file)
  1968. {
  1969. LARGE_INTEGER li;
  1970. if (GetFileSizeEx(file, &li))
  1971. return li.QuadPart;
  1972. return ((os_offset_t) -1);
  1973. }
  1974. /** Gets a file size.
  1975. @param[in] filename Full path to the filename to check
  1976. @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
  1977. errno */
  1978. os_file_size_t
  1979. os_file_get_size(
  1980. const char* filename)
  1981. {
  1982. struct __stat64 s;
  1983. os_file_size_t file_size;
  1984. int ret = _stat64(filename, &s);
  1985. if (ret == 0) {
  1986. file_size.m_total_size = s.st_size;
  1987. DWORD low_size;
  1988. DWORD high_size;
  1989. low_size = GetCompressedFileSize(filename, &high_size);
  1990. if (low_size != INVALID_FILE_SIZE) {
  1991. file_size.m_alloc_size = high_size;
  1992. file_size.m_alloc_size <<= 32;
  1993. file_size.m_alloc_size |= low_size;
  1994. } else {
  1995. ib::error()
  1996. << "GetCompressedFileSize("
  1997. << filename << ", ..) failed.";
  1998. file_size.m_alloc_size = (os_offset_t) -1;
  1999. }
  2000. } else {
  2001. file_size.m_total_size = ~0;
  2002. file_size.m_alloc_size = (os_offset_t) ret;
  2003. }
  2004. return(file_size);
  2005. }
  2006. /** This function returns information about the specified file
  2007. @param[in] path pathname of the file
  2008. @param[out] stat_info information of a file in a directory
  2009. @param[in,out] statinfo information of a file in a directory
  2010. @param[in] check_rw_perm for testing whether the file can be opened
  2011. in RW mode
  2012. @param[in] read_only true if the file is opened in read-only mode
  2013. @return DB_SUCCESS if all OK */
  2014. static
  2015. dberr_t
  2016. os_file_get_status_win32(
  2017. const char* path,
  2018. os_file_stat_t* stat_info,
  2019. struct _stat64* statinfo,
  2020. bool check_rw_perm,
  2021. bool read_only)
  2022. {
  2023. int ret = _stat64(path, statinfo);
  2024. if (ret && (errno == ENOENT || errno == ENOTDIR
  2025. || errno == ENAMETOOLONG)) {
  2026. /* file does not exist */
  2027. return(DB_NOT_FOUND);
  2028. } else if (ret) {
  2029. /* file exists, but stat call failed */
  2030. os_file_handle_error_no_exit(path, "STAT", false);
  2031. return(DB_FAIL);
  2032. } else if (_S_IFDIR & statinfo->st_mode) {
  2033. stat_info->type = OS_FILE_TYPE_DIR;
  2034. } else if (_S_IFREG & statinfo->st_mode) {
  2035. DWORD access = GENERIC_READ;
  2036. if (!read_only) {
  2037. access |= GENERIC_WRITE;
  2038. }
  2039. stat_info->type = OS_FILE_TYPE_FILE;
  2040. /* Check if we can open it in read-only mode. */
  2041. if (check_rw_perm) {
  2042. HANDLE fh;
  2043. fh = CreateFile(
  2044. (LPCTSTR) path, // File to open
  2045. access,
  2046. FILE_SHARE_READ | FILE_SHARE_WRITE
  2047. | FILE_SHARE_DELETE, // Full sharing
  2048. my_win_file_secattr(),
  2049. OPEN_EXISTING, // Existing file only
  2050. FILE_ATTRIBUTE_NORMAL, // Normal file
  2051. NULL); // No attr. template
  2052. if (fh == INVALID_HANDLE_VALUE) {
  2053. stat_info->rw_perm = false;
  2054. } else {
  2055. stat_info->rw_perm = true;
  2056. CloseHandle(fh);
  2057. }
  2058. }
  2059. } else {
  2060. stat_info->type = OS_FILE_TYPE_UNKNOWN;
  2061. }
  2062. return(DB_SUCCESS);
  2063. }
  2064. /**
  2065. Sets a sparse flag on Windows file.
  2066. @param[in] file file handle
  2067. @return true on success, false on error
  2068. */
  2069. #include <versionhelpers.h>
  2070. bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
  2071. {
  2072. if (!is_sparse && !IsWindows8OrGreater()) {
  2073. /* Cannot unset sparse flag on older Windows.
  2074. Until Windows8 it is documented to produce unpredictable results,
  2075. if there are unallocated ranges in file.*/
  2076. return false;
  2077. }
  2078. DWORD temp;
  2079. FILE_SET_SPARSE_BUFFER sparse_buffer;
  2080. sparse_buffer.SetSparse = is_sparse;
  2081. return os_win32_device_io_control(file,
  2082. FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
  2083. }
  2084. /**
  2085. Change file size on Windows.
  2086. If file is extended, the bytes between old and new EOF
  2087. are zeros.
  2088. If file is sparse, "virtual" block is added at the end of
  2089. allocated area.
  2090. If file is normal, file system allocates storage.
  2091. @param[in] pathname file path
  2092. @param[in] file file handle
  2093. @param[in] size size to preserve in bytes
  2094. @return true if success */
  2095. bool
  2096. os_file_change_size_win32(
  2097. const char* pathname,
  2098. os_file_t file,
  2099. os_offset_t size)
  2100. {
  2101. LARGE_INTEGER length;
  2102. length.QuadPart = size;
  2103. BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
  2104. if (!success) {
  2105. os_file_handle_error_no_exit(
  2106. pathname, "SetFilePointerEx", false);
  2107. } else {
  2108. success = SetEndOfFile(file);
  2109. if (!success) {
  2110. os_file_handle_error_no_exit(
  2111. pathname, "SetEndOfFile", false);
  2112. }
  2113. }
  2114. return(success);
  2115. }
  2116. /** Truncates a file at its current position.
  2117. @param[in] file Handle to be truncated
  2118. @return true if success */
  2119. bool
  2120. os_file_set_eof(
  2121. FILE* file)
  2122. {
  2123. HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
  2124. return(SetEndOfFile(h));
  2125. }
  2126. #endif /* !_WIN32*/
  2127. /** Does a synchronous read or write depending upon the type specified
  2128. In case of partial reads/writes the function tries
  2129. NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
  2130. @param[in] type, IO flags
  2131. @param[in] file handle to an open file
  2132. @param[out] buf buffer where to read
  2133. @param[in] offset file offset from the start where to read
  2134. @param[in] n number of bytes to read, starting from offset
  2135. @param[out] err DB_SUCCESS or error code
  2136. @return number of bytes read/written, -1 if error */
  2137. static MY_ATTRIBUTE((warn_unused_result))
  2138. ssize_t
  2139. os_file_io(
  2140. const IORequest&in_type,
  2141. os_file_t file,
  2142. void* buf,
  2143. ulint n,
  2144. os_offset_t offset,
  2145. dberr_t* err)
  2146. {
  2147. ssize_t original_n = ssize_t(n);
  2148. IORequest type = in_type;
  2149. ssize_t bytes_returned = 0;
  2150. SyncFileIO sync_file_io(file, buf, n, offset);
  2151. for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
  2152. ssize_t n_bytes = sync_file_io.execute(type);
  2153. /* Check for a hard error. Not much we can do now. */
  2154. if (n_bytes < 0) {
  2155. break;
  2156. } else if (n_bytes + bytes_returned == ssize_t(n)) {
  2157. bytes_returned += n_bytes;
  2158. *err = type.maybe_punch_hole(offset, n);
  2159. return(original_n);
  2160. }
  2161. /* Handle partial read/write. */
  2162. ut_ad(ulint(n_bytes + bytes_returned) < n);
  2163. bytes_returned += n_bytes;
  2164. if (type.type != IORequest::READ_MAYBE_PARTIAL) {
  2165. sql_print_warning("InnoDB: %zu bytes should have been"
  2166. " %s at %llu from %s,"
  2167. " but got only %zd."
  2168. " Retrying.",
  2169. n, type.is_read()
  2170. ? "read" : "written", offset,
  2171. type.node
  2172. ? type.node->name
  2173. : "(unknown file)", bytes_returned);
  2174. }
  2175. /* Advance the offset and buffer by n_bytes */
  2176. sync_file_io.advance(n_bytes);
  2177. }
  2178. *err = DB_IO_ERROR;
  2179. if (type.type != IORequest::READ_MAYBE_PARTIAL) {
  2180. ib::warn()
  2181. << "Retry attempts for "
  2182. << (type.is_read() ? "reading" : "writing")
  2183. << " partial data failed.";
  2184. }
  2185. return(bytes_returned);
  2186. }
  2187. /** Does a synchronous write operation in Posix.
  2188. @param[in] type IO context
  2189. @param[in] file handle to an open file
  2190. @param[out] buf buffer from which to write
  2191. @param[in] n number of bytes to write, starting from offset
  2192. @param[in] offset file offset from the start where to write
  2193. @param[out] err DB_SUCCESS or error code
  2194. @return number of bytes written
  2195. @retval -1 on error */
  2196. static MY_ATTRIBUTE((warn_unused_result))
  2197. ssize_t
  2198. os_file_pwrite(
  2199. const IORequest& type,
  2200. os_file_t file,
  2201. const byte* buf,
  2202. ulint n,
  2203. os_offset_t offset,
  2204. dberr_t* err)
  2205. {
  2206. ut_ad(type.is_write());
  2207. ++os_n_file_writes;
  2208. const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
  2209. MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
  2210. ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
  2211. n, offset, err);
  2212. MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
  2213. return(n_bytes);
  2214. }
  2215. /** NOTE! Use the corresponding macro os_file_write(), not directly
  2216. Requests a synchronous write operation.
  2217. @param[in] type IO flags
  2218. @param[in] file handle to an open file
  2219. @param[out] buf buffer from which to write
  2220. @param[in] offset file offset from the start where to read
  2221. @param[in] n number of bytes to read, starting from offset
  2222. @return error code
  2223. @retval DB_SUCCESS if the operation succeeded */
  2224. dberr_t
  2225. os_file_write_func(
  2226. const IORequest& type,
  2227. const char* name,
  2228. os_file_t file,
  2229. const void* buf,
  2230. os_offset_t offset,
  2231. ulint n)
  2232. {
  2233. dberr_t err;
  2234. ut_ad(n > 0);
  2235. ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
  2236. if ((ulint) n_bytes != n && !os_has_said_disk_full) {
  2237. ib::error()
  2238. << "Write to file " << name << " failed at offset "
  2239. << offset << ", " << n
  2240. << " bytes should have been written,"
  2241. " only " << n_bytes << " were written."
  2242. " Operating system error number " << IF_WIN(GetLastError(),errno) << "."
  2243. " Check that your OS and file system"
  2244. " support files of this size."
  2245. " Check also that the disk is not full"
  2246. " or a disk quota exceeded.";
  2247. #ifndef _WIN32
  2248. if (strerror(errno) != NULL) {
  2249. ib::error()
  2250. << "Error number " << errno
  2251. << " means '" << strerror(errno) << "'";
  2252. }
  2253. ib::info() << OPERATING_SYSTEM_ERROR_MSG;
  2254. #endif
  2255. os_has_said_disk_full = true;
  2256. }
  2257. return(err);
  2258. }
  2259. /** Does a synchronous read operation in Posix.
  2260. @param[in] type IO flags
  2261. @param[in] file handle to an open file
  2262. @param[out] buf buffer where to read
  2263. @param[in] offset file offset from the start where to read
  2264. @param[in] n number of bytes to read, starting from offset
  2265. @param[out] err DB_SUCCESS or error code
  2266. @return number of bytes read, -1 if error */
  2267. static MY_ATTRIBUTE((warn_unused_result))
  2268. ssize_t
  2269. os_file_pread(
  2270. const IORequest& type,
  2271. os_file_t file,
  2272. void* buf,
  2273. ulint n,
  2274. os_offset_t offset,
  2275. dberr_t* err)
  2276. {
  2277. ut_ad(type.is_read());
  2278. ++os_n_file_reads;
  2279. const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
  2280. MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
  2281. ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
  2282. MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
  2283. return(n_bytes);
  2284. }
  2285. /** Requests a synchronous positioned read operation.
  2286. @return DB_SUCCESS if request was successful, false if fail
  2287. @param[in] type IO flags
  2288. @param[in] file handle to an open file
  2289. @param[out] buf buffer where to read
  2290. @param[in] offset file offset from the start where to read
  2291. @param[in] n number of bytes to read, starting from offset
  2292. @param[out] o number of bytes actually read
  2293. @return DB_SUCCESS or error code */
  2294. dberr_t
  2295. os_file_read_func(
  2296. const IORequest& type,
  2297. os_file_t file,
  2298. void* buf,
  2299. os_offset_t offset,
  2300. ulint n,
  2301. ulint* o)
  2302. {
  2303. ut_ad(!type.node || type.node->handle == file);
  2304. ut_ad(n);
  2305. os_bytes_read_since_printout+= n;
  2306. dberr_t err;
  2307. ssize_t n_bytes= os_file_pread(type, file, buf, n, offset, &err);
  2308. if (o)
  2309. *o= ulint(n_bytes);
  2310. if (ulint(n_bytes) == n || err != DB_SUCCESS)
  2311. return err;
  2312. os_file_handle_error_no_exit(type.node ? type.node->name : nullptr, "read",
  2313. false);
  2314. sql_print_error("InnoDB: Tried to read %zu bytes at offset %llu"
  2315. " of file %s, but was only able to read %zd",
  2316. n, offset, type.node ? type.node->name : "(unknown)",
  2317. n_bytes);
  2318. return err ? err : DB_IO_ERROR;
  2319. }
  2320. /** Handle errors for file operations.
  2321. @param[in] name name of a file or NULL
  2322. @param[in] operation operation
  2323. @param[in] should_abort whether to abort on an unknown error
  2324. @param[in] on_error_silent whether to suppress reports of non-fatal errors
  2325. @return true if we should retry the operation */
  2326. static MY_ATTRIBUTE((warn_unused_result))
  2327. bool
  2328. os_file_handle_error_cond_exit(
  2329. const char* name,
  2330. const char* operation,
  2331. bool should_abort,
  2332. bool on_error_silent)
  2333. {
  2334. ulint err;
  2335. err = os_file_get_last_error(false, on_error_silent);
  2336. switch (err) {
  2337. case OS_FILE_DISK_FULL:
  2338. /* We only print a warning about disk full once */
  2339. if (os_has_said_disk_full) {
  2340. return(false);
  2341. }
  2342. /* Disk full error is reported irrespective of the
  2343. on_error_silent setting. */
  2344. if (name) {
  2345. ib::error()
  2346. << "Encountered a problem with file '"
  2347. << name << "'";
  2348. }
  2349. ib::error()
  2350. << "Disk is full. Try to clean the disk to free space.";
  2351. os_has_said_disk_full = true;
  2352. return(false);
  2353. case OS_FILE_AIO_RESOURCES_RESERVED:
  2354. case OS_FILE_AIO_INTERRUPTED:
  2355. return(true);
  2356. case OS_FILE_PATH_ERROR:
  2357. case OS_FILE_ALREADY_EXISTS:
  2358. case OS_FILE_ACCESS_VIOLATION:
  2359. return(false);
  2360. case OS_FILE_NOT_FOUND:
  2361. if (!on_error_silent) {
  2362. sql_print_error("InnoDB: File %s was not found", name);
  2363. }
  2364. return false;
  2365. case OS_FILE_SHARING_VIOLATION:
  2366. std::this_thread::sleep_for(std::chrono::seconds(10));
  2367. return(true);
  2368. case OS_FILE_OPERATION_ABORTED:
  2369. case OS_FILE_INSUFFICIENT_RESOURCE:
  2370. std::this_thread::sleep_for(std::chrono::milliseconds(100));
  2371. return(true);
  2372. default:
  2373. /* If it is an operation that can crash on error then it
  2374. is better to ignore on_error_silent and print an error message
  2375. to the log. */
  2376. if (should_abort || !on_error_silent) {
  2377. ib::error() << "File "
  2378. << (name != NULL ? name : "(unknown)")
  2379. << ": '" << operation << "'"
  2380. " returned OS error " << err << "."
  2381. << (should_abort
  2382. ? " Cannot continue operation" : "");
  2383. }
  2384. if (should_abort) {
  2385. abort();
  2386. }
  2387. }
  2388. return(false);
  2389. }
  2390. /** Check if the file system supports sparse files.
  2391. @param fh file handle
  2392. @return true if the file system supports sparse files */
  2393. static bool os_is_sparse_file_supported(os_file_t fh)
  2394. {
  2395. #ifdef _WIN32
  2396. FILE_ATTRIBUTE_TAG_INFO info;
  2397. if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
  2398. &info, (DWORD)sizeof(info))) {
  2399. if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) {
  2400. return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
  2401. }
  2402. }
  2403. return false;
  2404. #else
  2405. /* We don't know the FS block size, use the sector size. The FS
  2406. will do the magic. */
  2407. return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size);
  2408. #endif /* _WIN32 */
  2409. }
  2410. /** Extend a file.
  2411. On Windows, extending a file allocates blocks for the file,
  2412. unless the file is sparse.
  2413. On Unix, we will extend the file with ftruncate(), if
  2414. file needs to be sparse. Otherwise posix_fallocate() is used
  2415. when available, and if not, binary zeroes are added to the end
  2416. of file.
  2417. @param[in] name file name
  2418. @param[in] file file handle
  2419. @param[in] size desired file size
  2420. @param[in] sparse whether to create a sparse file (no preallocating)
  2421. @return whether the operation succeeded */
  2422. bool
  2423. os_file_set_size(
  2424. const char* name,
  2425. os_file_t file,
  2426. os_offset_t size,
  2427. bool is_sparse)
  2428. {
  2429. ut_ad(!(size & 4095));
  2430. #ifdef _WIN32
  2431. /* On Windows, changing file size works well and as expected for both
  2432. sparse and normal files.
  2433. However, 10.2 up until 10.2.9 made every file sparse in innodb,
  2434. causing NTFS fragmentation issues(MDEV-13941). We try to undo
  2435. the damage, and unsparse the file.*/
  2436. if (!is_sparse && os_is_sparse_file_supported(file)) {
  2437. if (!os_file_set_sparse_win32(file, false))
  2438. /* Unsparsing file failed. Fallback to writing binary
  2439. zeros, to avoid even higher fragmentation.*/
  2440. goto fallback;
  2441. }
  2442. return os_file_change_size_win32(name, file, size);
  2443. fallback:
  2444. #else
  2445. struct stat statbuf;
  2446. if (is_sparse) {
  2447. bool success = !ftruncate(file, size);
  2448. if (!success) {
  2449. ib::error() << "ftruncate of file " << name << " to "
  2450. << size << " bytes failed with error "
  2451. << errno;
  2452. }
  2453. return(success);
  2454. }
  2455. # ifdef HAVE_POSIX_FALLOCATE
  2456. int err;
  2457. do {
  2458. if (fstat(file, &statbuf)) {
  2459. err = errno;
  2460. } else {
  2461. MSAN_STAT_WORKAROUND(&statbuf);
  2462. os_offset_t current_size = statbuf.st_size;
  2463. if (current_size >= size) {
  2464. return true;
  2465. }
  2466. current_size &= ~4095ULL;
  2467. # ifdef __linux__
  2468. if (!fallocate(file, 0, current_size,
  2469. size - current_size)) {
  2470. err = 0;
  2471. break;
  2472. }
  2473. err = errno;
  2474. # else
  2475. err = posix_fallocate(file, current_size,
  2476. size - current_size);
  2477. # endif
  2478. }
  2479. } while (err == EINTR
  2480. && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
  2481. switch (err) {
  2482. case 0:
  2483. return true;
  2484. default:
  2485. ib::error() << "preallocating "
  2486. << size << " bytes for file " << name
  2487. << " failed with error " << err;
  2488. /* fall through */
  2489. case EINTR:
  2490. errno = err;
  2491. return false;
  2492. case EINVAL:
  2493. case EOPNOTSUPP:
  2494. /* fall back to the code below */
  2495. break;
  2496. }
  2497. # endif /* HAVE_POSIX_ALLOCATE */
  2498. #endif /* _WIN32*/
  2499. #ifdef _WIN32
  2500. os_offset_t current_size = os_file_get_size(file);
  2501. FILE_STORAGE_INFO info;
  2502. if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
  2503. sizeof info)) {
  2504. if (info.LogicalBytesPerSector) {
  2505. current_size &= ~os_offset_t(info.LogicalBytesPerSector
  2506. - 1);
  2507. }
  2508. }
  2509. #else
  2510. if (fstat(file, &statbuf)) {
  2511. return false;
  2512. }
  2513. os_offset_t current_size = statbuf.st_size & ~4095ULL;
  2514. #endif
  2515. if (current_size >= size) {
  2516. return true;
  2517. }
  2518. /* Write up to 1 megabyte at a time. */
  2519. ulint buf_size = ut_min(ulint(64),
  2520. ulint(size >> srv_page_size_shift))
  2521. << srv_page_size_shift;
  2522. /* Align the buffer for possible raw i/o */
  2523. byte* buf = static_cast<byte*>(aligned_malloc(buf_size,
  2524. srv_page_size));
  2525. /* Write buffer full of zeros */
  2526. memset(buf, 0, buf_size);
  2527. while (current_size < size
  2528. && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
  2529. ulint n_bytes;
  2530. if (size - current_size < (os_offset_t) buf_size) {
  2531. n_bytes = (ulint) (size - current_size);
  2532. } else {
  2533. n_bytes = buf_size;
  2534. }
  2535. if (os_file_write(IORequestWrite, name,
  2536. file, buf, current_size, n_bytes) !=
  2537. DB_SUCCESS) {
  2538. break;
  2539. }
  2540. current_size += n_bytes;
  2541. }
  2542. aligned_free(buf);
  2543. return(current_size >= size && os_file_flush(file));
  2544. }
  2545. /** Truncate a file to a specified size in bytes.
  2546. @param[in] pathname file path
  2547. @param[in] file file to be truncated
  2548. @param[in] size size preserved in bytes
  2549. @param[in] allow_shrink whether to allow the file to become smaller
  2550. @return true if success */
  2551. bool
  2552. os_file_truncate(
  2553. const char* pathname,
  2554. os_file_t file,
  2555. os_offset_t size,
  2556. bool allow_shrink)
  2557. {
  2558. if (!allow_shrink) {
  2559. /* Do nothing if the size preserved is larger than or
  2560. equal to the current size of file */
  2561. os_offset_t size_bytes = os_file_get_size(file);
  2562. if (size >= size_bytes) {
  2563. return(true);
  2564. }
  2565. }
  2566. #ifdef _WIN32
  2567. return(os_file_change_size_win32(pathname, file, size));
  2568. #else /* _WIN32 */
  2569. return(os_file_truncate_posix(pathname, file, size));
  2570. #endif /* _WIN32 */
  2571. }
  2572. /** Check the existence and type of the given file.
  2573. @param[in] path path name of file
  2574. @param[out] exists true if the file exists
  2575. @param[out] type Type of the file, if it exists
  2576. @return true if call succeeded */
  2577. bool
  2578. os_file_status(
  2579. const char* path,
  2580. bool* exists,
  2581. os_file_type_t* type)
  2582. {
  2583. #ifdef _WIN32
  2584. return(os_file_status_win32(path, exists, type));
  2585. #else
  2586. return(os_file_status_posix(path, exists, type));
  2587. #endif /* _WIN32 */
  2588. }
  2589. /** Free storage space associated with a section of the file.
  2590. @param[in] fh Open file handle
  2591. @param[in] off Starting offset (SEEK_SET)
  2592. @param[in] len Size of the hole
  2593. @return DB_SUCCESS or error code */
  2594. dberr_t
  2595. os_file_punch_hole(
  2596. os_file_t fh,
  2597. os_offset_t off,
  2598. os_offset_t len)
  2599. {
  2600. #ifdef _WIN32
  2601. return os_file_punch_hole_win32(fh, off, len);
  2602. #else
  2603. return os_file_punch_hole_posix(fh, off, len);
  2604. #endif /* _WIN32 */
  2605. }
  2606. /** Free storage space associated with a section of the file.
  2607. @param off byte offset from the start (SEEK_SET)
  2608. @param len size of the hole in bytes
  2609. @return DB_SUCCESS or error code */
  2610. dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const
  2611. {
  2612. ulint trim_len = bpage ? bpage->physical_size() - len : 0;
  2613. if (trim_len == 0) {
  2614. return(DB_SUCCESS);
  2615. }
  2616. off += len;
  2617. /* Check does file system support punching holes for this
  2618. tablespace. */
  2619. if (!node->punch_hole) {
  2620. return DB_IO_NO_PUNCH_HOLE;
  2621. }
  2622. dberr_t err = os_file_punch_hole(node->handle, off, trim_len);
  2623. switch (err) {
  2624. case DB_SUCCESS:
  2625. srv_stats.page_compressed_trim_op.inc();
  2626. return err;
  2627. case DB_IO_NO_PUNCH_HOLE:
  2628. node->punch_hole = false;
  2629. err = DB_SUCCESS;
  2630. /* fall through */
  2631. default:
  2632. return err;
  2633. }
  2634. }
  2635. /*
  2636. Get file system block size, by path.
  2637. This is expensive on Windows, and not very useful in general,
  2638. (only shown in some I_S table), so we keep that out of usual
  2639. stat.
  2640. */
  2641. size_t os_file_get_fs_block_size(const char *path)
  2642. {
  2643. #ifdef _WIN32
  2644. char volname[MAX_PATH];
  2645. if (!GetVolumePathName(path, volname, MAX_PATH))
  2646. return 0;
  2647. DWORD sectorsPerCluster;
  2648. DWORD bytesPerSector;
  2649. DWORD numberOfFreeClusters;
  2650. DWORD totalNumberOfClusters;
  2651. if (GetDiskFreeSpace(volname, &sectorsPerCluster, &bytesPerSector,
  2652. &numberOfFreeClusters, &totalNumberOfClusters))
  2653. return ((size_t) bytesPerSector) * sectorsPerCluster;
  2654. #else
  2655. os_file_stat_t info;
  2656. if (os_file_get_status(path, &info, false, false) == DB_SUCCESS)
  2657. return info.block_size;
  2658. #endif
  2659. return 0;
  2660. }
  2661. /** This function returns information about the specified file
  2662. @param[in] path pathname of the file
  2663. @param[out] stat_info information of a file in a directory
  2664. @param[in] check_rw_perm for testing whether the file can be opened
  2665. in RW mode
  2666. @param[in] read_only true if file is opened in read-only mode
  2667. @return DB_SUCCESS if all OK */
  2668. dberr_t
  2669. os_file_get_status(
  2670. const char* path,
  2671. os_file_stat_t* stat_info,
  2672. bool check_rw_perm,
  2673. bool read_only)
  2674. {
  2675. dberr_t ret;
  2676. #ifdef _WIN32
  2677. struct _stat64 info;
  2678. ret = os_file_get_status_win32(
  2679. path, stat_info, &info, check_rw_perm, read_only);
  2680. #else
  2681. struct stat info;
  2682. ret = os_file_get_status_posix(
  2683. path, stat_info, &info, check_rw_perm, read_only);
  2684. #endif /* _WIN32 */
  2685. if (ret == DB_SUCCESS) {
  2686. stat_info->ctime = info.st_ctime;
  2687. stat_info->atime = info.st_atime;
  2688. stat_info->mtime = info.st_mtime;
  2689. stat_info->size = info.st_size;
  2690. }
  2691. return(ret);
  2692. }
  2693. static void fake_io_callback(void *c)
  2694. {
  2695. tpool::aiocb *cb= static_cast<tpool::aiocb*>(c);
  2696. ut_ad(read_slots->contains(cb));
  2697. static_cast<const IORequest*>(static_cast<const void*>(cb->m_userdata))->
  2698. fake_read_complete(cb->m_offset);
  2699. read_slots->release(cb);
  2700. }
  2701. static void read_io_callback(void *c)
  2702. {
  2703. tpool::aiocb *cb= static_cast<tpool::aiocb*>(c);
  2704. ut_ad(cb->m_opcode == tpool::aio_opcode::AIO_PREAD);
  2705. ut_ad(read_slots->contains(cb));
  2706. const IORequest &request= *static_cast<const IORequest*>
  2707. (static_cast<const void*>(cb->m_userdata));
  2708. request.read_complete(cb->m_err);
  2709. read_slots->release(cb);
  2710. }
  2711. static void write_io_callback(void *c)
  2712. {
  2713. tpool::aiocb *cb= static_cast<tpool::aiocb*>(c);
  2714. ut_ad(cb->m_opcode == tpool::aio_opcode::AIO_PWRITE);
  2715. ut_ad(write_slots->contains(cb));
  2716. const IORequest &request= *static_cast<const IORequest*>
  2717. (static_cast<const void*>(cb->m_userdata));
  2718. if (UNIV_UNLIKELY(cb->m_err != 0))
  2719. ib::info () << "IO Error: " << cb->m_err
  2720. << " during write of "
  2721. << cb->m_len << " bytes, for file "
  2722. << request.node->name << "(" << cb->m_fh << "), returned "
  2723. << cb->m_ret_len;
  2724. request.write_complete(cb->m_err);
  2725. write_slots->release(cb);
  2726. }
  2727. #ifdef LINUX_NATIVE_AIO
  2728. /** Checks if the system supports native linux aio. On some kernel
  2729. versions where native aio is supported it won't work on tmpfs. In such
  2730. cases we can't use native aio.
  2731. @return: true if supported, false otherwise. */
  2732. static bool is_linux_native_aio_supported()
  2733. {
  2734. File fd;
  2735. io_context_t io_ctx;
  2736. std::string log_file_path = get_log_file_path();
  2737. memset(&io_ctx, 0, sizeof(io_ctx));
  2738. if (io_setup(1, &io_ctx)) {
  2739. /* The platform does not support native aio. */
  2740. return(false);
  2741. }
  2742. else if (!srv_read_only_mode) {
  2743. /* Now check if tmpdir supports native aio ops. */
  2744. fd = mysql_tmpfile("ib");
  2745. if (fd < 0) {
  2746. ib::warn()
  2747. << "Unable to create temp file to check"
  2748. " native AIO support.";
  2749. int ret = io_destroy(io_ctx);
  2750. ut_a(ret != -EINVAL);
  2751. ut_ad(ret != -EFAULT);
  2752. return(false);
  2753. }
  2754. }
  2755. else {
  2756. fd = my_open(log_file_path.c_str(), O_RDONLY | O_CLOEXEC,
  2757. MYF(0));
  2758. if (fd == -1) {
  2759. ib::warn() << "Unable to open \"" << log_file_path
  2760. << "\" to check native"
  2761. << " AIO read support.";
  2762. int ret = io_destroy(io_ctx);
  2763. ut_a(ret != EINVAL);
  2764. ut_ad(ret != EFAULT);
  2765. return(false);
  2766. }
  2767. }
  2768. struct io_event io_event;
  2769. memset(&io_event, 0x0, sizeof(io_event));
  2770. byte* ptr = static_cast<byte*>(aligned_malloc(srv_page_size,
  2771. srv_page_size));
  2772. struct iocb iocb;
  2773. /* Suppress valgrind warning. */
  2774. memset(ptr, 0, srv_page_size);
  2775. memset(&iocb, 0x0, sizeof(iocb));
  2776. struct iocb* p_iocb = &iocb;
  2777. if (!srv_read_only_mode) {
  2778. io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
  2779. }
  2780. else {
  2781. ut_a(srv_page_size >= 512);
  2782. io_prep_pread(p_iocb, fd, ptr, 512, 0);
  2783. }
  2784. int err = io_submit(io_ctx, 1, &p_iocb);
  2785. if (err >= 1) {
  2786. /* Now collect the submitted IO request. */
  2787. err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
  2788. }
  2789. aligned_free(ptr);
  2790. my_close(fd, MYF(MY_WME));
  2791. switch (err) {
  2792. case 1:
  2793. {
  2794. int ret = io_destroy(io_ctx);
  2795. ut_a(ret != -EINVAL);
  2796. ut_ad(ret != -EFAULT);
  2797. return(true);
  2798. }
  2799. case -EINVAL:
  2800. case -ENOSYS:
  2801. ib::warn()
  2802. << "Linux Native AIO not supported. You can either"
  2803. " move "
  2804. << (srv_read_only_mode ? log_file_path : "tmpdir")
  2805. << " to a file system that supports native"
  2806. " AIO or you can set innodb_use_native_aio to"
  2807. " FALSE to avoid this message.";
  2808. /* fall through. */
  2809. default:
  2810. ib::warn()
  2811. << "Linux Native AIO check on "
  2812. << (srv_read_only_mode ? log_file_path : "tmpdir")
  2813. << "returned error[" << -err << "]";
  2814. }
  2815. int ret = io_destroy(io_ctx);
  2816. ut_a(ret != -EINVAL);
  2817. ut_ad(ret != -EFAULT);
  2818. return(false);
  2819. }
  2820. #endif
  2821. int os_aio_init()
  2822. {
  2823. int max_write_events= int(srv_n_write_io_threads *
  2824. OS_AIO_N_PENDING_IOS_PER_THREAD);
  2825. int max_read_events= int(srv_n_read_io_threads *
  2826. OS_AIO_N_PENDING_IOS_PER_THREAD);
  2827. int max_events= max_read_events + max_write_events;
  2828. int ret;
  2829. #if LINUX_NATIVE_AIO
  2830. if (srv_use_native_aio && !is_linux_native_aio_supported())
  2831. goto disable;
  2832. #endif
  2833. ret= srv_thread_pool->configure_aio(srv_use_native_aio, max_events);
  2834. #ifdef LINUX_NATIVE_AIO
  2835. if (ret)
  2836. {
  2837. ut_ad(srv_use_native_aio);
  2838. disable:
  2839. ib::warn() << "Linux Native AIO disabled.";
  2840. srv_use_native_aio= false;
  2841. ret= srv_thread_pool->configure_aio(false, max_events);
  2842. }
  2843. #endif
  2844. #ifdef HAVE_URING
  2845. if (ret)
  2846. {
  2847. ut_ad(srv_use_native_aio);
  2848. ib::warn()
  2849. << "liburing disabled: falling back to innodb_use_native_aio=OFF";
  2850. srv_use_native_aio= false;
  2851. ret= srv_thread_pool->configure_aio(false, max_events);
  2852. }
  2853. #endif
  2854. if (!ret)
  2855. {
  2856. read_slots= new io_slots(max_read_events, srv_n_read_io_threads);
  2857. write_slots= new io_slots(max_write_events, srv_n_write_io_threads);
  2858. }
  2859. return ret;
  2860. }
  2861. /**
  2862. Change reader or writer thread parameter on a running server.
  2863. This includes resizing the io slots, as we calculate
  2864. number of outstanding IOs based on the these variables.
  2865. It is trickier with when Linux AIO is involved (io_context
  2866. needs to be recreated to account for different number of
  2867. max_events). With Linux AIO, depending on fs-max-aio number
  2868. and user and system wide max-aio limitation, this can fail.
  2869. Otherwise, we just resize the slots, and allow for
  2870. more concurrent threads via thread_group setting.
  2871. @param[in] n_reader_threads - max number of concurrently
  2872. executing read callbacks
  2873. @param[in] n_writer_thread - max number of cuncurrently
  2874. executing write callbacks
  2875. @return 0 for success, !=0 for error.
  2876. */
  2877. int os_aio_resize(ulint n_reader_threads, ulint n_writer_threads)
  2878. {
  2879. /* Lock the slots, and wait until all current IOs finish.*/
  2880. auto &lk_read= read_slots->mutex(), &lk_write= write_slots->mutex();
  2881. mysql_mutex_lock(&lk_read);
  2882. mysql_mutex_lock(&lk_write);
  2883. read_slots->wait(lk_read);
  2884. write_slots->wait(lk_write);
  2885. /* Now, all IOs have finished and no new ones can start, due to locks. */
  2886. int max_read_events= int(n_reader_threads * OS_AIO_N_PENDING_IOS_PER_THREAD);
  2887. int max_write_events= int(n_writer_threads * OS_AIO_N_PENDING_IOS_PER_THREAD);
  2888. int events= max_read_events + max_write_events;
  2889. /** Do the Linux AIO dance (this will try to create a new
  2890. io context with changed max_events ,etc*/
  2891. int ret= srv_thread_pool->reconfigure_aio(srv_use_native_aio, events);
  2892. if (ret)
  2893. {
  2894. /** Do the best effort. We can't change the parallel io number,
  2895. but we still can adjust the number of concurrent completion handlers.*/
  2896. read_slots->task_group().set_max_tasks(static_cast<int>(n_reader_threads));
  2897. write_slots->task_group().set_max_tasks(static_cast<int>(n_writer_threads));
  2898. }
  2899. else
  2900. {
  2901. /* Allocation succeeded, resize the slots*/
  2902. read_slots->resize(max_read_events, static_cast<int>(n_reader_threads));
  2903. write_slots->resize(max_write_events, static_cast<int>(n_writer_threads));
  2904. }
  2905. mysql_mutex_unlock(&lk_read);
  2906. mysql_mutex_unlock(&lk_write);
  2907. return ret;
  2908. }
  2909. void os_aio_free()
  2910. {
  2911. srv_thread_pool->disable_aio();
  2912. delete read_slots;
  2913. delete write_slots;
  2914. read_slots= nullptr;
  2915. write_slots= nullptr;
  2916. }
  2917. /** Wait until there are no pending asynchronous writes. */
  2918. static void os_aio_wait_until_no_pending_writes_low(bool declare)
  2919. {
  2920. const bool notify_wait= declare && write_slots->pending_io_count();
  2921. if (notify_wait)
  2922. tpool::tpool_wait_begin();
  2923. write_slots->wait();
  2924. if (notify_wait)
  2925. tpool::tpool_wait_end();
  2926. }
  2927. /** Wait until there are no pending asynchronous writes.
  2928. @param declare whether the wait will be declared in tpool */
  2929. void os_aio_wait_until_no_pending_writes(bool declare)
  2930. {
  2931. os_aio_wait_until_no_pending_writes_low(declare);
  2932. buf_dblwr.wait_flush_buffered_writes();
  2933. }
  2934. /** @return number of pending reads */
  2935. size_t os_aio_pending_reads()
  2936. {
  2937. mysql_mutex_lock(&read_slots->mutex());
  2938. size_t pending= read_slots->pending_io_count();
  2939. mysql_mutex_unlock(&read_slots->mutex());
  2940. return pending;
  2941. }
  2942. /** @return approximate number of pending reads */
  2943. size_t os_aio_pending_reads_approx()
  2944. {
  2945. return read_slots->pending_io_count();
  2946. }
  2947. /** @return number of pending writes */
  2948. size_t os_aio_pending_writes()
  2949. {
  2950. mysql_mutex_lock(&write_slots->mutex());
  2951. size_t pending= write_slots->pending_io_count();
  2952. mysql_mutex_unlock(&write_slots->mutex());
  2953. return pending;
  2954. }
  2955. /** Wait until all pending asynchronous reads have completed.
  2956. @param declare whether the wait will be declared in tpool */
  2957. void os_aio_wait_until_no_pending_reads(bool declare)
  2958. {
  2959. const bool notify_wait= declare && read_slots->pending_io_count();
  2960. if (notify_wait)
  2961. tpool::tpool_wait_begin();
  2962. read_slots->wait();
  2963. if (notify_wait)
  2964. tpool::tpool_wait_end();
  2965. }
  2966. /** Submit a fake read request during crash recovery.
  2967. @param type fake read request
  2968. @param offset additional context */
  2969. void os_fake_read(const IORequest &type, os_offset_t offset)
  2970. {
  2971. tpool::aiocb *cb= read_slots->acquire();
  2972. cb->m_group= read_slots->get_task_group();
  2973. cb->m_fh= type.node->handle.m_file;
  2974. cb->m_buffer= nullptr;
  2975. cb->m_len= 0;
  2976. cb->m_offset= offset;
  2977. cb->m_opcode= tpool::aio_opcode::AIO_PREAD;
  2978. new (cb->m_userdata) IORequest{type};
  2979. cb->m_internal_task.m_func= fake_io_callback;
  2980. cb->m_internal_task.m_arg= cb;
  2981. cb->m_internal_task.m_group= cb->m_group;
  2982. srv_thread_pool->submit_task(&cb->m_internal_task);
  2983. }
  2984. /** Request a read or write.
  2985. @param type I/O request
  2986. @param buf buffer
  2987. @param offset file offset
  2988. @param n number of bytes
  2989. @retval DB_SUCCESS if request was queued successfully
  2990. @retval DB_IO_ERROR on I/O error */
  2991. dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n)
  2992. {
  2993. ut_ad(n > 0);
  2994. ut_ad(!(n & 511)); /* payload of page_compressed tables */
  2995. ut_ad((offset % UNIV_ZIP_SIZE_MIN) == 0);
  2996. ut_ad((reinterpret_cast<size_t>(buf) % UNIV_ZIP_SIZE_MIN) == 0);
  2997. ut_ad(type.is_read() || type.is_write());
  2998. ut_ad(type.node);
  2999. ut_ad(type.node->is_open());
  3000. #ifdef WIN_ASYNC_IO
  3001. ut_ad((n & 0xFFFFFFFFUL) == n);
  3002. #endif /* WIN_ASYNC_IO */
  3003. #ifdef UNIV_PFS_IO
  3004. PSI_file_locker_state state;
  3005. PSI_file_locker* locker= nullptr;
  3006. register_pfs_file_io_begin(&state, locker, type.node->handle, n,
  3007. type.is_write()
  3008. ? PSI_FILE_WRITE : PSI_FILE_READ,
  3009. __FILE__, __LINE__);
  3010. #endif /* UNIV_PFS_IO */
  3011. dberr_t err = DB_SUCCESS;
  3012. if (!type.is_async()) {
  3013. err = type.is_read()
  3014. ? os_file_read_func(type, type.node->handle,
  3015. buf, offset, n, nullptr)
  3016. : os_file_write_func(type, type.node->name,
  3017. type.node->handle,
  3018. buf, offset, n);
  3019. func_exit:
  3020. #ifdef UNIV_PFS_IO
  3021. register_pfs_file_io_end(locker, n);
  3022. #endif /* UNIV_PFS_IO */
  3023. return err;
  3024. }
  3025. io_slots* slots;
  3026. tpool::callback_func callback;
  3027. tpool::aio_opcode opcode;
  3028. if (type.is_read()) {
  3029. ++os_n_file_reads;
  3030. slots = read_slots;
  3031. callback = read_io_callback;
  3032. opcode = tpool::aio_opcode::AIO_PREAD;
  3033. } else {
  3034. ++os_n_file_writes;
  3035. slots = write_slots;
  3036. callback = write_io_callback;
  3037. opcode = tpool::aio_opcode::AIO_PWRITE;
  3038. }
  3039. compile_time_assert(sizeof(IORequest) <= tpool::MAX_AIO_USERDATA_LEN);
  3040. tpool::aiocb* cb = slots->acquire();
  3041. cb->m_buffer = buf;
  3042. cb->m_callback = callback;
  3043. cb->m_group = slots->get_task_group();
  3044. cb->m_fh = type.node->handle.m_file;
  3045. cb->m_len = (int)n;
  3046. cb->m_offset = offset;
  3047. cb->m_opcode = opcode;
  3048. new (cb->m_userdata) IORequest{type};
  3049. if (srv_thread_pool->submit_io(cb)) {
  3050. slots->release(cb);
  3051. os_file_handle_error_no_exit(type.node->name, type.is_read()
  3052. ? "aio read" : "aio write",
  3053. false);
  3054. err = DB_IO_ERROR;
  3055. type.node->space->release();
  3056. }
  3057. goto func_exit;
  3058. }
  3059. /** Prints info of the aio arrays.
  3060. @param[in,out] file file where to print */
  3061. void
  3062. os_aio_print(FILE* file)
  3063. {
  3064. time_t current_time;
  3065. double time_elapsed;
  3066. current_time = time(NULL);
  3067. time_elapsed = 0.001 + difftime(current_time, os_last_printout);
  3068. fprintf(file,
  3069. "Pending flushes (fsync): " ULINTPF "\n"
  3070. ULINTPF " OS file reads, %zu OS file writes, %zu OS fsyncs\n",
  3071. ulint{fil_n_pending_tablespace_flushes},
  3072. ulint{os_n_file_reads},
  3073. static_cast<size_t>(os_n_file_writes),
  3074. static_cast<size_t>(os_n_fsyncs));
  3075. const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
  3076. const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
  3077. if (n_reads != 0 || n_writes != 0) {
  3078. fprintf(file,
  3079. ULINTPF " pending reads, " ULINTPF " pending writes\n",
  3080. n_reads, n_writes);
  3081. }
  3082. ulint avg_bytes_read = (os_n_file_reads == os_n_file_reads_old)
  3083. ? 0
  3084. : os_bytes_read_since_printout
  3085. / (os_n_file_reads - os_n_file_reads_old);
  3086. fprintf(file,
  3087. "%.2f reads/s, " ULINTPF " avg bytes/read,"
  3088. " %.2f writes/s, %.2f fsyncs/s\n",
  3089. static_cast<double>(os_n_file_reads - os_n_file_reads_old)
  3090. / time_elapsed,
  3091. avg_bytes_read,
  3092. static_cast<double>(os_n_file_writes - os_n_file_writes_old)
  3093. / time_elapsed,
  3094. static_cast<double>(os_n_fsyncs - os_n_fsyncs_old)
  3095. / time_elapsed);
  3096. os_n_file_reads_old = os_n_file_reads;
  3097. os_n_file_writes_old = os_n_file_writes;
  3098. os_n_fsyncs_old = os_n_fsyncs;
  3099. os_bytes_read_since_printout = 0;
  3100. os_last_printout = current_time;
  3101. }
  3102. /** Refreshes the statistics used to print per-second averages. */
  3103. void
  3104. os_aio_refresh_stats()
  3105. {
  3106. os_n_fsyncs_old = os_n_fsyncs;
  3107. os_bytes_read_since_printout = 0;
  3108. os_n_file_reads_old = os_n_file_reads;
  3109. os_n_file_writes_old = os_n_file_writes;
  3110. os_n_fsyncs_old = os_n_fsyncs;
  3111. os_bytes_read_since_printout = 0;
  3112. os_last_printout = time(NULL);
  3113. }
  3114. /**
  3115. Set the file create umask
  3116. @param[in] umask The umask to use for file creation. */
  3117. void
  3118. os_file_set_umask(ulint umask)
  3119. {
  3120. os_innodb_umask = umask;
  3121. }
  3122. #ifdef _WIN32
  3123. /* Checks whether physical drive is on SSD.*/
  3124. static bool is_drive_on_ssd(DWORD nr)
  3125. {
  3126. char physical_drive_path[32];
  3127. snprintf(physical_drive_path, sizeof(physical_drive_path),
  3128. "\\\\.\\PhysicalDrive%lu", nr);
  3129. HANDLE h= CreateFile(physical_drive_path, 0,
  3130. FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
  3131. nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
  3132. if (h == INVALID_HANDLE_VALUE)
  3133. return false;
  3134. DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty;
  3135. STORAGE_PROPERTY_QUERY storage_query{};
  3136. storage_query.PropertyId= StorageDeviceSeekPenaltyProperty;
  3137. storage_query.QueryType= PropertyStandardQuery;
  3138. bool on_ssd= false;
  3139. DWORD bytes_written;
  3140. if (DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query,
  3141. sizeof storage_query, &seek_penalty, sizeof seek_penalty,
  3142. &bytes_written, nullptr))
  3143. {
  3144. on_ssd= !seek_penalty.IncursSeekPenalty;
  3145. }
  3146. else
  3147. {
  3148. on_ssd= false;
  3149. }
  3150. CloseHandle(h);
  3151. return on_ssd;
  3152. }
  3153. /*
  3154. Checks whether volume is on SSD, by checking all physical drives
  3155. in that volume.
  3156. */
  3157. static bool is_volume_on_ssd(const char *volume_mount_point)
  3158. {
  3159. char volume_name[MAX_PATH];
  3160. if (!GetVolumeNameForVolumeMountPoint(volume_mount_point, volume_name,
  3161. array_elements(volume_name)))
  3162. {
  3163. /* This can fail, e.g if file is on network share */
  3164. return false;
  3165. }
  3166. /* Chomp last backslash, this is needed to open volume.*/
  3167. size_t length= strlen(volume_name);
  3168. if (length && volume_name[length - 1] == '\\')
  3169. volume_name[length - 1]= 0;
  3170. /* Open volume handle */
  3171. HANDLE volume_handle= CreateFile(
  3172. volume_name, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
  3173. nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
  3174. if (volume_handle == INVALID_HANDLE_VALUE)
  3175. return false;
  3176. /*
  3177. Enumerate all volume extends, check whether all of them are on SSD
  3178. */
  3179. /* Anticipate common case where there is only one extent.*/
  3180. VOLUME_DISK_EXTENTS single_extent;
  3181. /* But also have a place to manage allocated data.*/
  3182. std::unique_ptr<BYTE[]> lifetime;
  3183. DWORD bytes_written;
  3184. VOLUME_DISK_EXTENTS *extents= nullptr;
  3185. if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
  3186. nullptr, 0, &single_extent, sizeof(single_extent),
  3187. &bytes_written, nullptr))
  3188. {
  3189. /* Worked on the first try. Use the preallocated buffer.*/
  3190. extents= &single_extent;
  3191. }
  3192. else
  3193. {
  3194. VOLUME_DISK_EXTENTS *last_query= &single_extent;
  3195. while (GetLastError() == ERROR_MORE_DATA)
  3196. {
  3197. DWORD extentCount= last_query->NumberOfDiskExtents;
  3198. DWORD allocatedSize=
  3199. FIELD_OFFSET(VOLUME_DISK_EXTENTS, Extents[extentCount]);
  3200. lifetime.reset(new BYTE[allocatedSize]);
  3201. last_query= (VOLUME_DISK_EXTENTS *) lifetime.get();
  3202. if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
  3203. nullptr, 0, last_query, allocatedSize,
  3204. &bytes_written, nullptr))
  3205. {
  3206. extents= last_query;
  3207. break;
  3208. }
  3209. }
  3210. }
  3211. CloseHandle(volume_handle);
  3212. if (!extents)
  3213. return false;
  3214. for (DWORD i= 0; i < extents->NumberOfDiskExtents; i++)
  3215. if (!is_drive_on_ssd(extents->Extents[i].DiskNumber))
  3216. return false;
  3217. return true;
  3218. }
  3219. #include <unordered_map>
  3220. static bool is_path_on_ssd(char *file_path)
  3221. {
  3222. /* Preset result, in case something fails, e.g we're on network drive.*/
  3223. char volume_path[MAX_PATH];
  3224. if (!GetVolumePathName(file_path, volume_path, array_elements(volume_path)))
  3225. return false;
  3226. return is_volume_on_ssd(volume_path);
  3227. }
  3228. static bool is_file_on_ssd(HANDLE handle, char *file_path)
  3229. {
  3230. ULONGLONG volume_serial_number;
  3231. FILE_ID_INFO info;
  3232. if(!GetFileInformationByHandleEx(handle, FileIdInfo, &info, sizeof(info)))
  3233. return false;
  3234. volume_serial_number= info.VolumeSerialNumber;
  3235. static std::unordered_map<ULONGLONG, bool> cache;
  3236. static SRWLOCK lock= SRWLOCK_INIT;
  3237. bool found;
  3238. bool result;
  3239. AcquireSRWLockShared(&lock);
  3240. auto e= cache.find(volume_serial_number);
  3241. if ((found= e != cache.end()))
  3242. result= e->second;
  3243. ReleaseSRWLockShared(&lock);
  3244. if (!found)
  3245. {
  3246. result= is_path_on_ssd(file_path);
  3247. /* Update cache */
  3248. AcquireSRWLockExclusive(&lock);
  3249. cache[volume_serial_number]= result;
  3250. ReleaseSRWLockExclusive(&lock);
  3251. }
  3252. return result;
  3253. }
  3254. #endif
  3255. void fil_node_t::find_metadata(os_file_t file
  3256. #ifndef _WIN32
  3257. , bool create, struct stat *statbuf
  3258. #endif
  3259. )
  3260. {
  3261. if (!is_open())
  3262. {
  3263. handle= file;
  3264. ut_ad(is_open());
  3265. }
  3266. if (!space->is_compressed())
  3267. punch_hole= 0;
  3268. else if (my_test_if_thinly_provisioned(file))
  3269. punch_hole= 2;
  3270. else
  3271. punch_hole= IF_WIN(, !create ||) os_is_sparse_file_supported(file);
  3272. #ifdef _WIN32
  3273. on_ssd= is_file_on_ssd(file, name);
  3274. FILE_STORAGE_INFO info;
  3275. if (GetFileInformationByHandleEx(file, FileStorageInfo, &info, sizeof info))
  3276. block_size= info.PhysicalBytesPerSectorForAtomicity;
  3277. else
  3278. block_size= 512;
  3279. #else
  3280. struct stat sbuf;
  3281. if (!statbuf && !fstat(file, &sbuf))
  3282. {
  3283. MSAN_STAT_WORKAROUND(&sbuf);
  3284. statbuf= &sbuf;
  3285. }
  3286. if (statbuf)
  3287. block_size= statbuf->st_blksize;
  3288. # ifdef __linux__
  3289. on_ssd= statbuf && fil_system.is_ssd(statbuf->st_dev);
  3290. # endif
  3291. #endif
  3292. if (space->purpose != FIL_TYPE_TABLESPACE)
  3293. {
  3294. /* For temporary tablespace or during IMPORT TABLESPACE, we
  3295. disable neighbour flushing and do not care about atomicity. */
  3296. on_ssd= true;
  3297. atomic_write= true;
  3298. }
  3299. else
  3300. /* On Windows, all single sector writes are atomic, as per
  3301. WriteFile() documentation on MSDN. */
  3302. atomic_write= srv_use_atomic_writes &&
  3303. IF_WIN(srv_page_size == block_size,
  3304. my_test_if_atomic_write(file, space->physical_size()));
  3305. }
  3306. /** Read the first page of a data file.
  3307. @return whether the page was found valid */
  3308. bool fil_node_t::read_page0()
  3309. {
  3310. mysql_mutex_assert_owner(&fil_system.mutex);
  3311. const unsigned psize= space->physical_size();
  3312. #ifndef _WIN32
  3313. struct stat statbuf;
  3314. if (fstat(handle, &statbuf))
  3315. return false;
  3316. MSAN_STAT_WORKAROUND(&statbuf);
  3317. os_offset_t size_bytes= statbuf.st_size;
  3318. #else
  3319. os_offset_t size_bytes= os_file_get_size(handle);
  3320. ut_a(size_bytes != (os_offset_t) -1);
  3321. #endif
  3322. const uint32_t min_size= FIL_IBD_FILE_INITIAL_SIZE * psize;
  3323. if (size_bytes < min_size)
  3324. {
  3325. ib::error() << "The size of the file " << name
  3326. << " is only " << size_bytes
  3327. << " bytes, should be at least " << min_size;
  3328. return false;
  3329. }
  3330. if (!deferred)
  3331. {
  3332. page_t *page= static_cast<byte*>(aligned_malloc(psize, psize));
  3333. if (os_file_read(IORequestRead, handle, page, 0, psize, nullptr)
  3334. != DB_SUCCESS)
  3335. {
  3336. sql_print_error("InnoDB: Unable to read first page of file %s", name);
  3337. aligned_free(page);
  3338. return false;
  3339. }
  3340. const ulint space_id= memcmp_aligned<2>
  3341. (FIL_PAGE_SPACE_ID + page,
  3342. FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4)
  3343. ? ULINT_UNDEFINED
  3344. : mach_read_from_4(FIL_PAGE_SPACE_ID + page);
  3345. uint32_t flags= fsp_header_get_flags(page);
  3346. const uint32_t size= fsp_header_get_field(page, FSP_SIZE);
  3347. const uint32_t free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT);
  3348. const uint32_t free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page);
  3349. if (!fil_space_t::is_valid_flags(flags, space->id))
  3350. {
  3351. uint32_t cflags= fsp_flags_convert_from_101(flags);
  3352. if (cflags != UINT32_MAX)
  3353. {
  3354. uint32_t cf= cflags & ~FSP_FLAGS_MEM_MASK;
  3355. uint32_t sf= space->flags & ~FSP_FLAGS_MEM_MASK;
  3356. if (fil_space_t::is_flags_equal(cf, sf) ||
  3357. fil_space_t::is_flags_equal(sf, cf))
  3358. {
  3359. flags= cflags;
  3360. goto flags_ok;
  3361. }
  3362. }
  3363. aligned_free(page);
  3364. goto invalid;
  3365. }
  3366. if (!fil_space_t::is_flags_equal((flags & ~FSP_FLAGS_MEM_MASK),
  3367. (space->flags & ~FSP_FLAGS_MEM_MASK)) &&
  3368. !fil_space_t::is_flags_equal((space->flags & ~FSP_FLAGS_MEM_MASK),
  3369. (flags & ~FSP_FLAGS_MEM_MASK)))
  3370. {
  3371. invalid:
  3372. sql_print_error("InnoDB: Expected tablespace flags 0x%zx but found 0x%zx"
  3373. " in the file %s", space->flags, flags, name);
  3374. return false;
  3375. }
  3376. flags_ok:
  3377. ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
  3378. /* Try to read crypt_data from page 0 if it is not yet read. */
  3379. if (!space->crypt_data)
  3380. space->crypt_data= fil_space_read_crypt_data(
  3381. fil_space_t::zip_size(flags), page);
  3382. aligned_free(page);
  3383. if (UNIV_UNLIKELY(space_id != space->id))
  3384. {
  3385. ib::error() << "Expected tablespace id " << space->id
  3386. << " but found " << space_id
  3387. << " in the file " << name;
  3388. return false;
  3389. }
  3390. space->flags= (space->flags & FSP_FLAGS_MEM_MASK) | flags;
  3391. ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
  3392. ut_ad(space->free_len == 0 || space->free_len == free_len);
  3393. space->size_in_header= size;
  3394. space->free_limit= free_limit;
  3395. space->free_len= free_len;
  3396. }
  3397. IF_WIN(find_metadata(), find_metadata(handle, false, &statbuf));
  3398. /* Truncate the size to a multiple of extent size. */
  3399. ulint mask= psize * FSP_EXTENT_SIZE - 1;
  3400. if (size_bytes <= mask);
  3401. /* .ibd files start smaller than an
  3402. extent size. Do not truncate valid data. */
  3403. else
  3404. size_bytes&= ~os_offset_t(mask);
  3405. this->size= uint32_t(size_bytes / psize);
  3406. space->set_sizes(this->size);
  3407. return true;
  3408. }