You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

4351 lines
102 KiB

20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
20 years ago
  1. /******************************************************
  2. The interface to the operating system file i/o primitives
  3. (c) 1995 Innobase Oy
  4. Created 10/21/1995 Heikki Tuuri
  5. *******************************************************/
  6. #include "os0file.h"
  7. #include "os0sync.h"
  8. #include "os0thread.h"
  9. #include "ut0mem.h"
  10. #include "srv0srv.h"
  11. #include "srv0start.h"
  12. #include "fil0fil.h"
  13. #include "buf0buf.h"
  14. #if defined(UNIV_HOTBACKUP) && defined(__WIN__)
  15. /* Add includes for the _stat() call to compile on Windows */
  16. #include <sys/types.h>
  17. #include <sys/stat.h>
  18. #include <errno.h>
  19. #endif /* UNIV_HOTBACKUP */
/* Make sure HAVE_FDATASYNC is not defined for the rest of this file.
NOTE(review): presumably this makes the flush code use fsync() instead of
fdatasync(); the flush code is outside this excerpt - confirm. */
#undef HAVE_FDATASYNC
#ifdef POSIX_ASYNC_IO
/* We assume in this case that the OS has standard Posix aio (at least SunOS
2.6, HP-UX 11i and AIX 4.3 have) */
#endif
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask. The initializer below is only the compiled-in default: read and
write permission for the owner and the group on Unix, and 0 on Windows. */
#ifndef __WIN__
ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
ulint os_innodb_umask = 0;
#endif
#ifdef UNIV_DO_FLUSH
/* If the following is set to TRUE, we do not call os_file_flush in every
os_file_write. We can set this TRUE when the doublewrite buffer is used.
The variable exists only in UNIV_DO_FLUSH builds. */
ibool os_do_not_call_flush_at_each_write = FALSE;
#else
/* We do not call os_file_flush in every os_file_write. */
#endif /* UNIV_DO_FLUSH */
/* We use these mutexes to protect lseek + file i/o operation, if the
OS does not provide an atomic pread or pwrite, or similar. The array is
filled in by os_io_init_simple(). */
#define OS_FILE_N_SEEK_MUTEXES 16
os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE 64
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads */
ibool os_aio_use_native_aio = FALSE;
/* NOTE(review): presumably enables extra aio diagnostics; the code that
reads this flag is outside this excerpt - confirm. */
ibool os_aio_print_debug = FALSE;
/* The aio array slot structure: the bookkeeping record of one i/o request
held in an aio array (see os_aio_array_struct). */
typedef struct os_aio_slot_struct os_aio_slot_t;
struct os_aio_slot_struct{
	ibool is_read; /* TRUE if a read operation */
	ulint pos; /* index of the slot in the aio
			array */
	ibool reserved; /* TRUE if this slot is reserved */
	time_t reservation_time;/* time when reserved */
	ulint len; /* length of the block to read or
			write */
	byte* buf; /* buffer used in i/o */
	ulint type; /* OS_FILE_READ or OS_FILE_WRITE */
	ulint offset; /* 32 low bits of file offset in
			bytes */
	ulint offset_high; /* 32 high bits of file offset */
	os_file_t file; /* file where to read or write */
	const char* name; /* file name or path */
	ibool io_already_done;/* used only in simulated aio:
			TRUE if the physical i/o already
			made and only the slot message
			needs to be passed to the caller
			of os_aio_simulated_handle */
	fil_node_t* message1; /* first message given by the
			requester of an aio operation */
	void* message2; /* second message given by the
			requester of an aio operation; the
			messages can be used to identify
			which pending aio operation was
			completed */
#ifdef WIN_ASYNC_IO
	os_event_t event; /* event object we need in the
			OVERLAPPED struct */
	OVERLAPPED control; /* Windows control block for the
			aio request */
#elif defined(POSIX_ASYNC_IO)
	struct aiocb control; /* Posix control block for aio
			request */
#endif
};
/* The aio array structure: a mutex-protected collection of aio slots,
divided into segments that can be waited for separately. */
typedef struct os_aio_array_struct os_aio_array_t;
struct os_aio_array_struct{
	os_mutex_t mutex; /* the mutex protecting the aio array */
	os_event_t not_full; /* The event which is set to the signaled
			state when there is space in the aio
			outside the ibuf segment */
	os_event_t is_empty; /* The event which is set to the signaled
			state when there are no pending i/os
			in this array */
	ulint n_slots; /* Total number of slots in the aio array.
			This must be divisible by n_threads. */
	ulint n_segments;/* Number of segments in the aio array of
			pending aio requests. A thread can wait
			separately for any one of the segments. */
	ulint n_reserved;/* Number of reserved slots in the
			aio array outside the ibuf segment */
	os_aio_slot_t* slots; /* Pointer to the slots in the array */
#ifdef __WIN__
	os_native_event_t* native_events;
			/* Pointer to an array of OS native event
			handles where we copied the handles from
			slots, in the same order. This can be used
			in WaitForMultipleObjects; used only in
			Windows */
#endif
};
/* Array of events used in simulated aio */
os_event_t* os_aio_segment_wait_events = NULL;
/* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
are NULL when the module has not yet been initialized. */
static os_aio_array_t* os_aio_read_array = NULL;
static os_aio_array_t* os_aio_write_array = NULL;
static os_aio_array_t* os_aio_ibuf_array = NULL;
static os_aio_array_t* os_aio_log_array = NULL;
static os_aio_array_t* os_aio_sync_array = NULL;
/* ULINT_UNDEFINED until a real value is assigned.
NOTE(review): presumably the total number of aio segments, set when the aio
system is created; the assignment is outside this excerpt - confirm. */
static ulint os_aio_n_segments = ULINT_UNDEFINED;
/* If the following is TRUE, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
/* I/O statistics. NOTE(review): going by the names, these count file reads,
bytes read since the last printout, file writes and fsyncs, with the *_old
variables and os_last_printout holding the state of the previous printout;
the code that maintains and prints them is outside this excerpt - confirm. */
ulint os_n_file_reads = 0;
ulint os_bytes_read_since_printout = 0;
ulint os_n_file_writes = 0;
ulint os_n_fsyncs = 0;
ulint os_n_file_reads_old = 0;
ulint os_n_file_writes_old = 0;
ulint os_n_fsyncs_old = 0;
time_t os_last_printout;
/* Set to TRUE by os_file_handle_error_cond_exit() once the disk full
warning has been printed, so that the warning is printed only once */
ibool os_has_said_disk_full = FALSE;
/* The mutex protecting the following counts of pending I/O operations;
it is created in os_io_init_simple() */
static os_mutex_t os_file_count_mutex;
ulint os_file_n_pending_preads = 0;
ulint os_file_n_pending_pwrites = 0;
ulint os_n_pending_writes = 0;
ulint os_n_pending_reads = 0;
  143. /***************************************************************************
  144. Gets the operating system version. Currently works only on Windows. */
  145. ulint
  146. os_get_os_version(void)
  147. /*===================*/
  148. /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
  149. {
  150. #ifdef __WIN__
  151. OSVERSIONINFO os_info;
  152. os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
  153. ut_a(GetVersionEx(&os_info));
  154. if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
  155. return(OS_WIN31);
  156. } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
  157. return(OS_WIN95);
  158. } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
  159. if (os_info.dwMajorVersion <= 4) {
  160. return(OS_WINNT);
  161. } else {
  162. return(OS_WIN2000);
  163. }
  164. } else {
  165. ut_error;
  166. return(0);
  167. }
  168. #else
  169. ut_error;
  170. return(0);
  171. #endif
  172. }
  173. /***************************************************************************
  174. Retrieves the last error number if an error occurs in a file io function.
  175. The number should be retrieved before any other OS calls (because they may
  176. overwrite the error number). If the number is not known to this program,
  177. the OS error number + 100 is returned. */
  178. ulint
  179. os_file_get_last_error(
  180. /*===================*/
  181. /* out: error number, or OS error
  182. number + 100 */
  183. ibool report_all_errors) /* in: TRUE if we want an error message
  184. printed of all errors */
  185. {
  186. ulint err;
  187. #ifdef __WIN__
  188. err = (ulint) GetLastError();
  189. if (report_all_errors
  190. || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
  191. ut_print_timestamp(stderr);
  192. fprintf(stderr,
  193. " InnoDB: Operating system error number %lu"
  194. " in a file operation.\n", (ulong) err);
  195. if (err == ERROR_PATH_NOT_FOUND) {
  196. fprintf(stderr,
  197. "InnoDB: The error means the system"
  198. " cannot find the path specified.\n");
  199. if (srv_is_being_started) {
  200. fprintf(stderr,
  201. "InnoDB: If you are installing InnoDB,"
  202. " remember that you must create\n"
  203. "InnoDB: directories yourself, InnoDB"
  204. " does not create them.\n");
  205. }
  206. } else if (err == ERROR_ACCESS_DENIED) {
  207. fprintf(stderr,
  208. "InnoDB: The error means mysqld does not have"
  209. " the access rights to\n"
  210. "InnoDB: the directory. It may also be"
  211. " you have created a subdirectory\n"
  212. "InnoDB: of the same name as a data file.\n");
  213. } else {
  214. fprintf(stderr,
  215. "InnoDB: Some operating system error numbers"
  216. " are described at\n"
  217. "InnoDB: "
  218. "http://dev.mysql.com/doc/refman/5.1/en/"
  219. "operating-system-error-codes.html\n");
  220. }
  221. }
  222. fflush(stderr);
  223. if (err == ERROR_FILE_NOT_FOUND) {
  224. return(OS_FILE_NOT_FOUND);
  225. } else if (err == ERROR_DISK_FULL) {
  226. return(OS_FILE_DISK_FULL);
  227. } else if (err == ERROR_FILE_EXISTS) {
  228. return(OS_FILE_ALREADY_EXISTS);
  229. } else {
  230. return(100 + err);
  231. }
  232. #else
  233. err = (ulint) errno;
  234. if (report_all_errors
  235. || (err != ENOSPC && err != EEXIST)) {
  236. ut_print_timestamp(stderr);
  237. fprintf(stderr,
  238. " InnoDB: Operating system error number %lu"
  239. " in a file operation.\n", (ulong) err);
  240. if (err == ENOENT) {
  241. fprintf(stderr,
  242. "InnoDB: The error means the system"
  243. " cannot find the path specified.\n");
  244. if (srv_is_being_started) {
  245. fprintf(stderr,
  246. "InnoDB: If you are installing InnoDB,"
  247. " remember that you must create\n"
  248. "InnoDB: directories yourself, InnoDB"
  249. " does not create them.\n");
  250. }
  251. } else if (err == EACCES) {
  252. fprintf(stderr,
  253. "InnoDB: The error means mysqld does not have"
  254. " the access rights to\n"
  255. "InnoDB: the directory.\n");
  256. } else {
  257. if (strerror((int)err) != NULL) {
  258. fprintf(stderr,
  259. "InnoDB: Error number %lu"
  260. " means '%s'.\n",
  261. err, strerror((int)err));
  262. }
  263. fprintf(stderr,
  264. "InnoDB: Some operating system"
  265. " error numbers are described at\n"
  266. "InnoDB: "
  267. "http://dev.mysql.com/doc/refman/5.1/en/"
  268. "operating-system-error-codes.html\n");
  269. }
  270. }
  271. fflush(stderr);
  272. if (err == ENOSPC) {
  273. return(OS_FILE_DISK_FULL);
  274. #ifdef POSIX_ASYNC_IO
  275. } else if (err == EAGAIN) {
  276. return(OS_FILE_AIO_RESOURCES_RESERVED);
  277. #endif
  278. } else if (err == ENOENT) {
  279. return(OS_FILE_NOT_FOUND);
  280. } else if (err == EEXIST) {
  281. return(OS_FILE_ALREADY_EXISTS);
  282. } else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
  283. return(OS_FILE_PATH_ERROR);
  284. } else {
  285. return(100 + err);
  286. }
  287. #endif
  288. }
  289. /********************************************************************
  290. Does error handling when a file operation fails.
  291. Conditionally exits (calling exit(3)) based on should_exit value and the
  292. error type */
  293. static
  294. ibool
  295. os_file_handle_error_cond_exit(
  296. /*===========================*/
  297. /* out: TRUE if we should retry the
  298. operation */
  299. const char* name, /* in: name of a file or NULL */
  300. const char* operation, /* in: operation */
  301. ibool should_exit) /* in: call exit(3) if unknown error
  302. and this parameter is TRUE */
  303. {
  304. ulint err;
  305. err = os_file_get_last_error(FALSE);
  306. if (err == OS_FILE_DISK_FULL) {
  307. /* We only print a warning about disk full once */
  308. if (os_has_said_disk_full) {
  309. return(FALSE);
  310. }
  311. if (name) {
  312. ut_print_timestamp(stderr);
  313. fprintf(stderr,
  314. " InnoDB: Encountered a problem with"
  315. " file %s\n", name);
  316. }
  317. ut_print_timestamp(stderr);
  318. fprintf(stderr,
  319. " InnoDB: Disk is full. Try to clean the disk"
  320. " to free space.\n");
  321. os_has_said_disk_full = TRUE;
  322. fflush(stderr);
  323. return(FALSE);
  324. } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
  325. return(TRUE);
  326. } else if (err == OS_FILE_ALREADY_EXISTS
  327. || err == OS_FILE_PATH_ERROR) {
  328. return(FALSE);
  329. } else {
  330. if (name) {
  331. fprintf(stderr, "InnoDB: File name %s\n", name);
  332. }
  333. fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
  334. operation);
  335. if (should_exit) {
  336. fprintf(stderr, "InnoDB: Cannot continue operation.\n");
  337. fflush(stderr);
  338. exit(1);
  339. }
  340. }
  341. return(FALSE);
  342. }
  343. /********************************************************************
  344. Does error handling when a file operation fails. */
  345. static
  346. ibool
  347. os_file_handle_error(
  348. /*=================*/
  349. /* out: TRUE if we should retry the
  350. operation */
  351. const char* name, /* in: name of a file or NULL */
  352. const char* operation)/* in: operation */
  353. {
  354. /* exit in case of unknown error */
  355. return(os_file_handle_error_cond_exit(name, operation, TRUE));
  356. }
  357. /********************************************************************
  358. Does error handling when a file operation fails. */
  359. static
  360. ibool
  361. os_file_handle_error_no_exit(
  362. /*=========================*/
  363. /* out: TRUE if we should retry the
  364. operation */
  365. const char* name, /* in: name of a file or NULL */
  366. const char* operation)/* in: operation */
  367. {
  368. /* don't exit in case of unknown error */
  369. return(os_file_handle_error_cond_exit(name, operation, FALSE));
  370. }
/* Decide whether os_file_lock() is compiled in: USE_FILE_LOCK is first
defined unconditionally (after removing any earlier definition) and then
removed again on the platforms listed below. */
#undef USE_FILE_LOCK
#define USE_FILE_LOCK
#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__FreeBSD__) || defined(__NETWARE__)
/* InnoDB Hot Backup does not lock the data files.
 * On Windows, mandatory locking is used.
 * On FreeBSD with LinuxThreads, advisory locking does not work properly.
 * NOTE(review): __NETWARE__ is excluded as well; the reason is not stated
 * in this file.
 */
# undef USE_FILE_LOCK
#endif
  380. #ifdef USE_FILE_LOCK
  381. /********************************************************************
  382. Obtain an exclusive lock on a file. */
  383. static
  384. int
  385. os_file_lock(
  386. /*=========*/
  387. /* out: 0 on success */
  388. int fd, /* in: file descriptor */
  389. const char* name) /* in: file name */
  390. {
  391. struct flock lk;
  392. lk.l_type = F_WRLCK;
  393. lk.l_whence = SEEK_SET;
  394. lk.l_start = lk.l_len = 0;
  395. if (fcntl(fd, F_SETLK, &lk) == -1) {
  396. fprintf(stderr,
  397. "InnoDB: Unable to lock %s, error: %d\n", name, errno);
  398. if (errno == EAGAIN || errno == EACCES) {
  399. fprintf(stderr,
  400. "InnoDB: Check that you do not already have"
  401. " another mysqld process\n"
  402. "InnoDB: using the same InnoDB data"
  403. " or log files.\n");
  404. }
  405. return(-1);
  406. }
  407. return(0);
  408. }
  409. #endif /* USE_FILE_LOCK */
  410. /********************************************************************
  411. Creates the seek mutexes used in positioned reads and writes. */
  412. void
  413. os_io_init_simple(void)
  414. /*===================*/
  415. {
  416. ulint i;
  417. os_file_count_mutex = os_mutex_create(NULL);
  418. for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
  419. os_file_seek_mutexes[i] = os_mutex_create(NULL);
  420. }
  421. }
#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
/*************************************************************************
Creates a temporary file that will be deleted on close.
This function is defined in ha_innodb.cc. Only the prototype is given here,
and only for builds other than Hot Backup and Netware; in those two builds
os_file_create_tmpfile() does not call this function. */
int
innobase_mysql_tmpfile(void);
/*========================*/
			/* out: temporary file descriptor, or < 0 on error */
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
  431. /***************************************************************************
  432. Creates a temporary file. This function is like tmpfile(3), but
  433. the temporary file is created in the MySQL temporary directory.
  434. On Netware, this function is like tmpfile(3), because the C run-time
  435. library of Netware does not expose the delete-on-close flag. */
  436. FILE*
  437. os_file_create_tmpfile(void)
  438. /*========================*/
  439. /* out: temporary file handle, or NULL on error */
  440. {
  441. #ifdef UNIV_HOTBACKUP
  442. ut_error;
  443. return(NULL);
  444. #else
  445. # ifdef __NETWARE__
  446. FILE* file = tmpfile();
  447. # else /* __NETWARE__ */
  448. FILE* file = NULL;
  449. int fd = innobase_mysql_tmpfile();
  450. if (fd >= 0) {
  451. file = fdopen(fd, "w+b");
  452. }
  453. # endif /* __NETWARE__ */
  454. if (!file) {
  455. ut_print_timestamp(stderr);
  456. fprintf(stderr,
  457. " InnoDB: Error: unable to create temporary file;"
  458. " errno: %d\n", errno);
  459. # ifndef __NETWARE__
  460. if (fd >= 0) {
  461. close(fd);
  462. }
  463. # endif /* !__NETWARE__ */
  464. }
  465. return(file);
  466. #endif /* UNIV_HOTBACKUP */
  467. }
  468. /***************************************************************************
  469. The os_file_opendir() function opens a directory stream corresponding to the
  470. directory named by the dirname argument. The directory stream is positioned
  471. at the first entry. In both Unix and Windows we automatically skip the '.'
  472. and '..' items at the start of the directory listing. */
  473. os_file_dir_t
  474. os_file_opendir(
  475. /*============*/
  476. /* out: directory stream, NULL if
  477. error */
  478. const char* dirname, /* in: directory name; it must not
  479. contain a trailing '\' or '/' */
  480. ibool error_is_fatal) /* in: TRUE if we should treat an
  481. error as a fatal error; if we try to
  482. open symlinks then we do not wish a
  483. fatal error if it happens not to be
  484. a directory */
  485. {
  486. os_file_dir_t dir;
  487. #ifdef __WIN__
  488. LPWIN32_FIND_DATA lpFindFileData;
  489. char path[OS_FILE_MAX_PATH + 3];
  490. ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
  491. strcpy(path, dirname);
  492. strcpy(path + strlen(path), "\\*");
  493. /* Note that in Windows opening the 'directory stream' also retrieves
  494. the first entry in the directory. Since it is '.', that is no problem,
  495. as we will skip over the '.' and '..' entries anyway. */
  496. lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
  497. dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
  498. ut_free(lpFindFileData);
  499. if (dir == INVALID_HANDLE_VALUE) {
  500. if (error_is_fatal) {
  501. os_file_handle_error(dirname, "opendir");
  502. }
  503. return(NULL);
  504. }
  505. return(dir);
  506. #else
  507. dir = opendir(dirname);
  508. if (dir == NULL && error_is_fatal) {
  509. os_file_handle_error(dirname, "opendir");
  510. }
  511. return(dir);
  512. #endif
  513. }
  514. /***************************************************************************
  515. Closes a directory stream. */
  516. int
  517. os_file_closedir(
  518. /*=============*/
  519. /* out: 0 if success, -1 if failure */
  520. os_file_dir_t dir) /* in: directory stream */
  521. {
  522. #ifdef __WIN__
  523. BOOL ret;
  524. ret = FindClose(dir);
  525. if (!ret) {
  526. os_file_handle_error_no_exit(NULL, "closedir");
  527. return(-1);
  528. }
  529. return(0);
  530. #else
  531. int ret;
  532. ret = closedir(dir);
  533. if (ret) {
  534. os_file_handle_error_no_exit(NULL, "closedir");
  535. }
  536. return(ret);
  537. #endif
  538. }
  539. /***************************************************************************
  540. This function returns information of the next file in the directory. We jump
  541. over the '.' and '..' entries in the directory. */
  542. int
  543. os_file_readdir_next_file(
  544. /*======================*/
  545. /* out: 0 if ok, -1 if error, 1 if at the end
  546. of the directory */
  547. const char* dirname,/* in: directory name or path */
  548. os_file_dir_t dir, /* in: directory stream */
  549. os_file_stat_t* info) /* in/out: buffer where the info is returned */
  550. {
  551. #ifdef __WIN__
  552. LPWIN32_FIND_DATA lpFindFileData;
  553. BOOL ret;
  554. lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
  555. next_file:
  556. ret = FindNextFile(dir, lpFindFileData);
  557. if (ret) {
  558. ut_a(strlen((char *) lpFindFileData->cFileName)
  559. < OS_FILE_MAX_PATH);
  560. if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
  561. || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
  562. goto next_file;
  563. }
  564. strcpy(info->name, (char *) lpFindFileData->cFileName);
  565. info->size = (ib_longlong)(lpFindFileData->nFileSizeLow)
  566. + (((ib_longlong)(lpFindFileData->nFileSizeHigh))
  567. << 32);
  568. if (lpFindFileData->dwFileAttributes
  569. & FILE_ATTRIBUTE_REPARSE_POINT) {
  570. /* TODO: test Windows symlinks */
  571. /* TODO: MySQL has apparently its own symlink
  572. implementation in Windows, dbname.sym can
  573. redirect a database directory:
  574. http://dev.mysql.com/doc/refman/5.1/en/
  575. windows-symbolic-links.html */
  576. info->type = OS_FILE_TYPE_LINK;
  577. } else if (lpFindFileData->dwFileAttributes
  578. & FILE_ATTRIBUTE_DIRECTORY) {
  579. info->type = OS_FILE_TYPE_DIR;
  580. } else {
  581. /* It is probably safest to assume that all other
  582. file types are normal. Better to check them rather
  583. than blindly skip them. */
  584. info->type = OS_FILE_TYPE_FILE;
  585. }
  586. }
  587. ut_free(lpFindFileData);
  588. if (ret) {
  589. return(0);
  590. } else if (GetLastError() == ERROR_NO_MORE_FILES) {
  591. return(1);
  592. } else {
  593. os_file_handle_error_no_exit(dirname,
  594. "readdir_next_file");
  595. return(-1);
  596. }
  597. #else
  598. struct dirent* ent;
  599. char* full_path;
  600. int ret;
  601. struct stat statinfo;
  602. #ifdef HAVE_READDIR_R
  603. char dirent_buf[sizeof(struct dirent)
  604. + _POSIX_PATH_MAX + 100];
  605. /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
  606. the max file name len; but in most standards, the
  607. length is NAME_MAX; we add 100 to be even safer */
  608. #endif
  609. next_file:
  610. #ifdef HAVE_READDIR_R
  611. ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
  612. if (ret != 0) {
  613. fprintf(stderr,
  614. "InnoDB: cannot read directory %s, error %lu\n",
  615. dirname, (ulong)ret);
  616. return(-1);
  617. }
  618. if (ent == NULL) {
  619. /* End of directory */
  620. return(1);
  621. }
  622. ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
  623. #else
  624. ent = readdir(dir);
  625. if (ent == NULL) {
  626. return(1);
  627. }
  628. #endif
  629. ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
  630. if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
  631. goto next_file;
  632. }
  633. strcpy(info->name, ent->d_name);
  634. full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
  635. sprintf(full_path, "%s/%s", dirname, ent->d_name);
  636. ret = stat(full_path, &statinfo);
  637. if (ret) {
  638. os_file_handle_error_no_exit(full_path, "stat");
  639. ut_free(full_path);
  640. return(-1);
  641. }
  642. info->size = (ib_longlong)statinfo.st_size;
  643. if (S_ISDIR(statinfo.st_mode)) {
  644. info->type = OS_FILE_TYPE_DIR;
  645. } else if (S_ISLNK(statinfo.st_mode)) {
  646. info->type = OS_FILE_TYPE_LINK;
  647. } else if (S_ISREG(statinfo.st_mode)) {
  648. info->type = OS_FILE_TYPE_FILE;
  649. } else {
  650. info->type = OS_FILE_TYPE_UNKNOWN;
  651. }
  652. ut_free(full_path);
  653. return(0);
  654. #endif
  655. }
  656. /*********************************************************************
  657. This function attempts to create a directory named pathname. The new directory
  658. gets default permissions. On Unix the permissions are (0770 & ~umask). If the
  659. directory exists already, nothing is done and the call succeeds, unless the
  660. fail_if_exists arguments is true. */
  661. ibool
  662. os_file_create_directory(
  663. /*=====================*/
  664. /* out: TRUE if call succeeds,
  665. FALSE on error */
  666. const char* pathname, /* in: directory name as
  667. null-terminated string */
  668. ibool fail_if_exists) /* in: if TRUE, pre-existing directory
  669. is treated as an error. */
  670. {
  671. #ifdef __WIN__
  672. BOOL rcode;
  673. rcode = CreateDirectory((LPCTSTR) pathname, NULL);
  674. if (!(rcode != 0
  675. || (GetLastError() == ERROR_ALREADY_EXISTS
  676. && !fail_if_exists))) {
  677. /* failure */
  678. os_file_handle_error(pathname, "CreateDirectory");
  679. return(FALSE);
  680. }
  681. return (TRUE);
  682. #else
  683. int rcode;
  684. rcode = mkdir(pathname, 0770);
  685. if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
  686. /* failure */
  687. os_file_handle_error(pathname, "mkdir");
  688. return(FALSE);
  689. }
  690. return (TRUE);
  691. #endif
  692. }
  693. /********************************************************************
  694. A simple function to open or create a file. */
  695. os_file_t
  696. os_file_create_simple(
  697. /*==================*/
  698. /* out, own: handle to the file, not defined
  699. if error, error number can be retrieved with
  700. os_file_get_last_error */
  701. const char* name, /* in: name of the file or path as a
  702. null-terminated string */
  703. ulint create_mode,/* in: OS_FILE_OPEN if an existing file is
  704. opened (if does not exist, error), or
  705. OS_FILE_CREATE if a new file is created
  706. (if exists, error), or
  707. OS_FILE_CREATE_PATH if new file
  708. (if exists, error) and subdirectories along
  709. its path are created (if needed)*/
  710. ulint access_type,/* in: OS_FILE_READ_ONLY or
  711. OS_FILE_READ_WRITE */
  712. ibool* success)/* out: TRUE if succeed, FALSE if error */
  713. {
  714. #ifdef __WIN__
  715. os_file_t file;
  716. DWORD create_flag;
  717. DWORD access;
  718. DWORD attributes = 0;
  719. ibool retry;
  720. try_again:
  721. ut_a(name);
  722. if (create_mode == OS_FILE_OPEN) {
  723. create_flag = OPEN_EXISTING;
  724. } else if (create_mode == OS_FILE_CREATE) {
  725. create_flag = CREATE_NEW;
  726. } else if (create_mode == OS_FILE_CREATE_PATH) {
  727. /* create subdirs along the path if needed */
  728. *success = os_file_create_subdirs_if_needed(name);
  729. if (!*success) {
  730. ut_error;
  731. }
  732. create_flag = CREATE_NEW;
  733. create_mode = OS_FILE_CREATE;
  734. } else {
  735. create_flag = 0;
  736. ut_error;
  737. }
  738. if (access_type == OS_FILE_READ_ONLY) {
  739. access = GENERIC_READ;
  740. } else if (access_type == OS_FILE_READ_WRITE) {
  741. access = GENERIC_READ | GENERIC_WRITE;
  742. } else {
  743. access = 0;
  744. ut_error;
  745. }
  746. file = CreateFile((LPCTSTR) name,
  747. access,
  748. FILE_SHARE_READ | FILE_SHARE_WRITE,
  749. /* file can be read and written also
  750. by other processes */
  751. NULL, /* default security attributes */
  752. create_flag,
  753. attributes,
  754. NULL); /* no template file */
  755. if (file == INVALID_HANDLE_VALUE) {
  756. *success = FALSE;
  757. retry = os_file_handle_error(name,
  758. create_mode == OS_FILE_OPEN ?
  759. "open" : "create");
  760. if (retry) {
  761. goto try_again;
  762. }
  763. } else {
  764. *success = TRUE;
  765. }
  766. return(file);
  767. #else /* __WIN__ */
  768. os_file_t file;
  769. int create_flag;
  770. ibool retry;
  771. try_again:
  772. ut_a(name);
  773. if (create_mode == OS_FILE_OPEN) {
  774. if (access_type == OS_FILE_READ_ONLY) {
  775. create_flag = O_RDONLY;
  776. } else {
  777. create_flag = O_RDWR;
  778. }
  779. } else if (create_mode == OS_FILE_CREATE) {
  780. create_flag = O_RDWR | O_CREAT | O_EXCL;
  781. } else if (create_mode == OS_FILE_CREATE_PATH) {
  782. /* create subdirs along the path if needed */
  783. *success = os_file_create_subdirs_if_needed(name);
  784. if (!*success) {
  785. return (-1);
  786. }
  787. create_flag = O_RDWR | O_CREAT | O_EXCL;
  788. create_mode = OS_FILE_CREATE;
  789. } else {
  790. create_flag = 0;
  791. ut_error;
  792. }
  793. if (create_mode == OS_FILE_CREATE) {
  794. file = open(name, create_flag, S_IRUSR | S_IWUSR
  795. | S_IRGRP | S_IWGRP);
  796. } else {
  797. file = open(name, create_flag);
  798. }
  799. if (file == -1) {
  800. *success = FALSE;
  801. retry = os_file_handle_error(name,
  802. create_mode == OS_FILE_OPEN ?
  803. "open" : "create");
  804. if (retry) {
  805. goto try_again;
  806. }
  807. #ifdef USE_FILE_LOCK
  808. } else if (access_type == OS_FILE_READ_WRITE
  809. && os_file_lock(file, name)) {
  810. *success = FALSE;
  811. close(file);
  812. file = -1;
  813. #endif
  814. } else {
  815. *success = TRUE;
  816. }
  817. return(file);
  818. #endif /* __WIN__ */
  819. }
  820. /********************************************************************
  821. A simple function to open or create a file. */
  822. os_file_t
  823. os_file_create_simple_no_error_handling(
  824. /*====================================*/
  825. /* out, own: handle to the file, not defined
  826. if error, error number can be retrieved with
  827. os_file_get_last_error */
  828. const char* name, /* in: name of the file or path as a
  829. null-terminated string */
  830. ulint create_mode,/* in: OS_FILE_OPEN if an existing file
  831. is opened (if does not exist, error), or
  832. OS_FILE_CREATE if a new file is created
  833. (if exists, error) */
  834. ulint access_type,/* in: OS_FILE_READ_ONLY,
  835. OS_FILE_READ_WRITE, or
  836. OS_FILE_READ_ALLOW_DELETE; the last option is
  837. used by a backup program reading the file */
  838. ibool* success)/* out: TRUE if succeed, FALSE if error */
  839. {
  840. #ifdef __WIN__
  841. os_file_t file;
  842. DWORD create_flag;
  843. DWORD access;
  844. DWORD attributes = 0;
  845. DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
  846. ut_a(name);
  847. if (create_mode == OS_FILE_OPEN) {
  848. create_flag = OPEN_EXISTING;
  849. } else if (create_mode == OS_FILE_CREATE) {
  850. create_flag = CREATE_NEW;
  851. } else {
  852. create_flag = 0;
  853. ut_error;
  854. }
  855. if (access_type == OS_FILE_READ_ONLY) {
  856. access = GENERIC_READ;
  857. } else if (access_type == OS_FILE_READ_WRITE) {
  858. access = GENERIC_READ | GENERIC_WRITE;
  859. } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
  860. access = GENERIC_READ;
  861. share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
  862. | FILE_SHARE_WRITE; /* A backup program has to give
  863. mysqld the maximum freedom to
  864. do what it likes with the
  865. file */
  866. } else {
  867. access = 0;
  868. ut_error;
  869. }
  870. file = CreateFile((LPCTSTR) name,
  871. access,
  872. share_mode,
  873. NULL, /* default security attributes */
  874. create_flag,
  875. attributes,
  876. NULL); /* no template file */
  877. if (file == INVALID_HANDLE_VALUE) {
  878. *success = FALSE;
  879. } else {
  880. *success = TRUE;
  881. }
  882. return(file);
  883. #else /* __WIN__ */
  884. os_file_t file;
  885. int create_flag;
  886. ut_a(name);
  887. if (create_mode == OS_FILE_OPEN) {
  888. if (access_type == OS_FILE_READ_ONLY) {
  889. create_flag = O_RDONLY;
  890. } else {
  891. create_flag = O_RDWR;
  892. }
  893. } else if (create_mode == OS_FILE_CREATE) {
  894. create_flag = O_RDWR | O_CREAT | O_EXCL;
  895. } else {
  896. create_flag = 0;
  897. ut_error;
  898. }
  899. if (create_mode == OS_FILE_CREATE) {
  900. file = open(name, create_flag, S_IRUSR | S_IWUSR
  901. | S_IRGRP | S_IWGRP);
  902. } else {
  903. file = open(name, create_flag);
  904. }
  905. if (file == -1) {
  906. *success = FALSE;
  907. #ifdef USE_FILE_LOCK
  908. } else if (access_type == OS_FILE_READ_WRITE
  909. && os_file_lock(file, name)) {
  910. *success = FALSE;
  911. close(file);
  912. file = -1;
  913. #endif
  914. } else {
  915. *success = TRUE;
  916. }
  917. return(file);
  918. #endif /* __WIN__ */
  919. }
  920. /********************************************************************
  921. Tries to disable OS caching on an opened file descriptor. */
  922. void
  923. os_file_set_nocache(
  924. /*================*/
  925. int fd, /* in: file descriptor to alter */
  926. const char* file_name, /* in: used in the diagnostic message */
  927. const char* operation_name) /* in: used in the diagnostic message,
  928. we call os_file_set_nocache()
  929. immediately after opening or creating
  930. a file, so this is either "open" or
  931. "create" */
  932. {
  933. /* some versions of Solaris may not have DIRECTIO_ON */
  934. #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
  935. if (directio(fd, DIRECTIO_ON) == -1) {
  936. int errno_save;
  937. errno_save = (int)errno;
  938. ut_print_timestamp(stderr);
  939. fprintf(stderr,
  940. " InnoDB: Failed to set DIRECTIO_ON "
  941. "on file %s: %s: %s, continuing anyway\n",
  942. file_name, operation_name, strerror(errno_save));
  943. }
  944. #elif defined(O_DIRECT)
  945. if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
  946. int errno_save;
  947. errno_save = (int)errno;
  948. ut_print_timestamp(stderr);
  949. fprintf(stderr,
  950. " InnoDB: Failed to set O_DIRECT "
  951. "on file %s: %s: %s, continuing anyway\n",
  952. file_name, operation_name, strerror(errno_save));
  953. if (errno_save == EINVAL) {
  954. ut_print_timestamp(stderr);
  955. fprintf(stderr,
  956. " InnoDB: O_DIRECT is known to result in "
  957. "'Invalid argument' on Linux on tmpfs, "
  958. "see MySQL Bug#26662\n");
  959. }
  960. }
  961. #endif
  962. }
  963. /********************************************************************
  964. Opens an existing file or creates a new. */
  965. os_file_t
  966. os_file_create(
  967. /*===========*/
  968. /* out, own: handle to the file, not defined
  969. if error, error number can be retrieved with
  970. os_file_get_last_error */
  971. const char* name, /* in: name of the file or path as a
  972. null-terminated string */
  973. ulint create_mode,/* in: OS_FILE_OPEN if an existing file
  974. is opened (if does not exist, error), or
  975. OS_FILE_CREATE if a new file is created
  976. (if exists, error),
  977. OS_FILE_OVERWRITE if a new file is created
  978. or an old overwritten;
  979. OS_FILE_OPEN_RAW, if a raw device or disk
  980. partition should be opened */
  981. ulint purpose,/* in: OS_FILE_AIO, if asynchronous,
  982. non-buffered i/o is desired,
  983. OS_FILE_NORMAL, if any normal file;
  984. NOTE that it also depends on type, os_aio_..
  985. and srv_.. variables whether we really use
  986. async i/o or unbuffered i/o: look in the
  987. function source code for the exact rules */
  988. ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
  989. ibool* success)/* out: TRUE if succeed, FALSE if error */
  990. {
  991. #ifdef __WIN__
  992. os_file_t file;
  993. DWORD share_mode = FILE_SHARE_READ;
  994. DWORD create_flag;
  995. DWORD attributes;
  996. ibool retry;
  997. try_again:
  998. ut_a(name);
  999. if (create_mode == OS_FILE_OPEN_RAW) {
  1000. create_flag = OPEN_EXISTING;
  1001. share_mode = FILE_SHARE_WRITE;
  1002. } else if (create_mode == OS_FILE_OPEN
  1003. || create_mode == OS_FILE_OPEN_RETRY) {
  1004. create_flag = OPEN_EXISTING;
  1005. } else if (create_mode == OS_FILE_CREATE) {
  1006. create_flag = CREATE_NEW;
  1007. } else if (create_mode == OS_FILE_OVERWRITE) {
  1008. create_flag = CREATE_ALWAYS;
  1009. } else {
  1010. create_flag = 0;
  1011. ut_error;
  1012. }
  1013. if (purpose == OS_FILE_AIO) {
  1014. /* If specified, use asynchronous (overlapped) io and no
  1015. buffering of writes in the OS */
  1016. attributes = 0;
  1017. #ifdef WIN_ASYNC_IO
  1018. if (os_aio_use_native_aio) {
  1019. attributes = attributes | FILE_FLAG_OVERLAPPED;
  1020. }
  1021. #endif
  1022. #ifdef UNIV_NON_BUFFERED_IO
  1023. if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
  1024. /* Do not use unbuffered i/o to log files because
  1025. value 2 denotes that we do not flush the log at every
  1026. commit, but only once per second */
  1027. } else if (srv_win_file_flush_method
  1028. == SRV_WIN_IO_UNBUFFERED) {
  1029. attributes = attributes | FILE_FLAG_NO_BUFFERING;
  1030. }
  1031. #endif
  1032. } else if (purpose == OS_FILE_NORMAL) {
  1033. attributes = 0;
  1034. #ifdef UNIV_NON_BUFFERED_IO
  1035. if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
  1036. /* Do not use unbuffered i/o to log files because
  1037. value 2 denotes that we do not flush the log at every
  1038. commit, but only once per second */
  1039. } else if (srv_win_file_flush_method
  1040. == SRV_WIN_IO_UNBUFFERED) {
  1041. attributes = attributes | FILE_FLAG_NO_BUFFERING;
  1042. }
  1043. #endif
  1044. } else {
  1045. attributes = 0;
  1046. ut_error;
  1047. }
  1048. file = CreateFile((LPCTSTR) name,
  1049. GENERIC_READ | GENERIC_WRITE, /* read and write
  1050. access */
  1051. share_mode, /* File can be read also by other
  1052. processes; we must give the read
  1053. permission because of ibbackup. We do
  1054. not give the write permission to
  1055. others because if one would succeed to
  1056. start 2 instances of mysqld on the
  1057. SAME files, that could cause severe
  1058. database corruption! When opening
  1059. raw disk partitions, Microsoft manuals
  1060. say that we must give also the write
  1061. permission. */
  1062. NULL, /* default security attributes */
  1063. create_flag,
  1064. attributes,
  1065. NULL); /* no template file */
  1066. if (file == INVALID_HANDLE_VALUE) {
  1067. *success = FALSE;
  1068. retry = os_file_handle_error(name,
  1069. create_mode == OS_FILE_CREATE ?
  1070. "create" : "open");
  1071. if (retry) {
  1072. goto try_again;
  1073. }
  1074. } else {
  1075. *success = TRUE;
  1076. }
  1077. return(file);
  1078. #else /* __WIN__ */
  1079. os_file_t file;
  1080. int create_flag;
  1081. ibool retry;
  1082. const char* mode_str = NULL;
  1083. const char* type_str = NULL;
  1084. const char* purpose_str = NULL;
  1085. try_again:
  1086. ut_a(name);
  1087. if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
  1088. || create_mode == OS_FILE_OPEN_RETRY) {
  1089. mode_str = "OPEN";
  1090. create_flag = O_RDWR;
  1091. } else if (create_mode == OS_FILE_CREATE) {
  1092. mode_str = "CREATE";
  1093. create_flag = O_RDWR | O_CREAT | O_EXCL;
  1094. } else if (create_mode == OS_FILE_OVERWRITE) {
  1095. mode_str = "OVERWRITE";
  1096. create_flag = O_RDWR | O_CREAT | O_TRUNC;
  1097. } else {
  1098. create_flag = 0;
  1099. ut_error;
  1100. }
  1101. if (type == OS_LOG_FILE) {
  1102. type_str = "LOG";
  1103. } else if (type == OS_DATA_FILE) {
  1104. type_str = "DATA";
  1105. } else {
  1106. ut_error;
  1107. }
  1108. if (purpose == OS_FILE_AIO) {
  1109. purpose_str = "AIO";
  1110. } else if (purpose == OS_FILE_NORMAL) {
  1111. purpose_str = "NORMAL";
  1112. } else {
  1113. ut_error;
  1114. }
  1115. #if 0
  1116. fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n",
  1117. name, mode_str, type_str, purpose_str);
  1118. #endif
  1119. #ifdef O_SYNC
  1120. /* We let O_SYNC only affect log files; note that we map O_DSYNC to
  1121. O_SYNC because the datasync options seemed to corrupt files in 2001
  1122. in both Linux and Solaris */
  1123. if (type == OS_LOG_FILE
  1124. && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
  1125. # if 0
  1126. fprintf(stderr, "Using O_SYNC for file %s\n", name);
  1127. # endif
  1128. create_flag = create_flag | O_SYNC;
  1129. }
  1130. #endif /* O_SYNC */
  1131. file = open(name, create_flag, os_innodb_umask);
  1132. if (file == -1) {
  1133. *success = FALSE;
  1134. retry = os_file_handle_error(name,
  1135. create_mode == OS_FILE_CREATE ?
  1136. "create" : "open");
  1137. if (retry) {
  1138. goto try_again;
  1139. } else {
  1140. return(file /* -1 */);
  1141. }
  1142. }
  1143. /* else */
  1144. *success = TRUE;
  1145. /* We disable OS caching (O_DIRECT) only on data files */
  1146. if (type != OS_LOG_FILE
  1147. && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
  1148. os_file_set_nocache(file, name, mode_str);
  1149. }
  1150. #ifdef USE_FILE_LOCK
  1151. if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
  1152. if (create_mode == OS_FILE_OPEN_RETRY) {
  1153. int i;
  1154. ut_print_timestamp(stderr);
  1155. fputs(" InnoDB: Retrying to lock"
  1156. " the first data file\n",
  1157. stderr);
  1158. for (i = 0; i < 100; i++) {
  1159. os_thread_sleep(1000000);
  1160. if (!os_file_lock(file, name)) {
  1161. *success = TRUE;
  1162. return(file);
  1163. }
  1164. }
  1165. ut_print_timestamp(stderr);
  1166. fputs(" InnoDB: Unable to open the first data file\n",
  1167. stderr);
  1168. }
  1169. *success = FALSE;
  1170. close(file);
  1171. file = -1;
  1172. }
  1173. #endif /* USE_FILE_LOCK */
  1174. return(file);
  1175. #endif /* __WIN__ */
  1176. }
  1177. /***************************************************************************
  1178. Deletes a file if it exists. The file has to be closed before calling this. */
  1179. ibool
  1180. os_file_delete_if_exists(
  1181. /*=====================*/
  1182. /* out: TRUE if success */
  1183. const char* name) /* in: file path as a null-terminated string */
  1184. {
  1185. #ifdef __WIN__
  1186. BOOL ret;
  1187. ulint count = 0;
  1188. loop:
  1189. /* In Windows, deleting an .ibd file may fail if ibbackup is copying
  1190. it */
  1191. ret = DeleteFile((LPCTSTR)name);
  1192. if (ret) {
  1193. return(TRUE);
  1194. }
  1195. if (GetLastError() == ERROR_FILE_NOT_FOUND) {
  1196. /* the file does not exist, this not an error */
  1197. return(TRUE);
  1198. }
  1199. count++;
  1200. if (count > 100 && 0 == (count % 10)) {
  1201. fprintf(stderr,
  1202. "InnoDB: Warning: cannot delete file %s\n"
  1203. "InnoDB: Are you running ibbackup"
  1204. " to back up the file?\n", name);
  1205. os_file_get_last_error(TRUE); /* print error information */
  1206. }
  1207. os_thread_sleep(1000000); /* sleep for a second */
  1208. if (count > 2000) {
  1209. return(FALSE);
  1210. }
  1211. goto loop;
  1212. #else
  1213. int ret;
  1214. ret = unlink((const char*)name);
  1215. if (ret != 0 && errno != ENOENT) {
  1216. os_file_handle_error_no_exit(name, "delete");
  1217. return(FALSE);
  1218. }
  1219. return(TRUE);
  1220. #endif
  1221. }
  1222. /***************************************************************************
  1223. Deletes a file. The file has to be closed before calling this. */
  1224. ibool
  1225. os_file_delete(
  1226. /*===========*/
  1227. /* out: TRUE if success */
  1228. const char* name) /* in: file path as a null-terminated string */
  1229. {
  1230. #ifdef __WIN__
  1231. BOOL ret;
  1232. ulint count = 0;
  1233. loop:
  1234. /* In Windows, deleting an .ibd file may fail if ibbackup is copying
  1235. it */
  1236. ret = DeleteFile((LPCTSTR)name);
  1237. if (ret) {
  1238. return(TRUE);
  1239. }
  1240. if (GetLastError() == ERROR_FILE_NOT_FOUND) {
  1241. /* If the file does not exist, we classify this as a 'mild'
  1242. error and return */
  1243. return(FALSE);
  1244. }
  1245. count++;
  1246. if (count > 100 && 0 == (count % 10)) {
  1247. fprintf(stderr,
  1248. "InnoDB: Warning: cannot delete file %s\n"
  1249. "InnoDB: Are you running ibbackup"
  1250. " to back up the file?\n", name);
  1251. os_file_get_last_error(TRUE); /* print error information */
  1252. }
  1253. os_thread_sleep(1000000); /* sleep for a second */
  1254. if (count > 2000) {
  1255. return(FALSE);
  1256. }
  1257. goto loop;
  1258. #else
  1259. int ret;
  1260. ret = unlink((const char*)name);
  1261. if (ret != 0) {
  1262. os_file_handle_error_no_exit(name, "delete");
  1263. return(FALSE);
  1264. }
  1265. return(TRUE);
  1266. #endif
  1267. }
  1268. /***************************************************************************
  1269. Renames a file (can also move it to another directory). It is safest that the
  1270. file is closed before calling this function. */
  1271. ibool
  1272. os_file_rename(
  1273. /*===========*/
  1274. /* out: TRUE if success */
  1275. const char* oldpath,/* in: old file path as a null-terminated
  1276. string */
  1277. const char* newpath)/* in: new file path */
  1278. {
  1279. #ifdef __WIN__
  1280. BOOL ret;
  1281. ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
  1282. if (ret) {
  1283. return(TRUE);
  1284. }
  1285. os_file_handle_error_no_exit(oldpath, "rename");
  1286. return(FALSE);
  1287. #else
  1288. int ret;
  1289. ret = rename((const char*)oldpath, (const char*)newpath);
  1290. if (ret != 0) {
  1291. os_file_handle_error_no_exit(oldpath, "rename");
  1292. return(FALSE);
  1293. }
  1294. return(TRUE);
  1295. #endif
  1296. }
  1297. /***************************************************************************
  1298. Closes a file handle. In case of error, error number can be retrieved with
  1299. os_file_get_last_error. */
  1300. ibool
  1301. os_file_close(
  1302. /*==========*/
  1303. /* out: TRUE if success */
  1304. os_file_t file) /* in, own: handle to a file */
  1305. {
  1306. #ifdef __WIN__
  1307. BOOL ret;
  1308. ut_a(file);
  1309. ret = CloseHandle(file);
  1310. if (ret) {
  1311. return(TRUE);
  1312. }
  1313. os_file_handle_error(NULL, "close");
  1314. return(FALSE);
  1315. #else
  1316. int ret;
  1317. ret = close(file);
  1318. if (ret == -1) {
  1319. os_file_handle_error(NULL, "close");
  1320. return(FALSE);
  1321. }
  1322. return(TRUE);
  1323. #endif
  1324. }
  1325. /***************************************************************************
  1326. Closes a file handle. */
  1327. ibool
  1328. os_file_close_no_error_handling(
  1329. /*============================*/
  1330. /* out: TRUE if success */
  1331. os_file_t file) /* in, own: handle to a file */
  1332. {
  1333. #ifdef __WIN__
  1334. BOOL ret;
  1335. ut_a(file);
  1336. ret = CloseHandle(file);
  1337. if (ret) {
  1338. return(TRUE);
  1339. }
  1340. return(FALSE);
  1341. #else
  1342. int ret;
  1343. ret = close(file);
  1344. if (ret == -1) {
  1345. return(FALSE);
  1346. }
  1347. return(TRUE);
  1348. #endif
  1349. }
  1350. /***************************************************************************
  1351. Gets a file size. */
  1352. ibool
  1353. os_file_get_size(
  1354. /*=============*/
  1355. /* out: TRUE if success */
  1356. os_file_t file, /* in: handle to a file */
  1357. ulint* size, /* out: least significant 32 bits of file
  1358. size */
  1359. ulint* size_high)/* out: most significant 32 bits of size */
  1360. {
  1361. #ifdef __WIN__
  1362. DWORD high;
  1363. DWORD low;
  1364. low = GetFileSize(file, &high);
  1365. if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
  1366. return(FALSE);
  1367. }
  1368. *size = low;
  1369. *size_high = high;
  1370. return(TRUE);
  1371. #else
  1372. off_t offs;
  1373. offs = lseek(file, 0, SEEK_END);
  1374. if (offs == ((off_t)-1)) {
  1375. return(FALSE);
  1376. }
  1377. if (sizeof(off_t) > 4) {
  1378. *size = (ulint)(offs & 0xFFFFFFFFUL);
  1379. *size_high = (ulint)(offs >> 32);
  1380. } else {
  1381. *size = (ulint) offs;
  1382. *size_high = 0;
  1383. }
  1384. return(TRUE);
  1385. #endif
  1386. }
  1387. /***************************************************************************
  1388. Gets file size as a 64-bit integer ib_longlong. */
  1389. ib_longlong
  1390. os_file_get_size_as_iblonglong(
  1391. /*===========================*/
  1392. /* out: size in bytes, -1 if error */
  1393. os_file_t file) /* in: handle to a file */
  1394. {
  1395. ulint size;
  1396. ulint size_high;
  1397. ibool success;
  1398. success = os_file_get_size(file, &size, &size_high);
  1399. if (!success) {
  1400. return(-1);
  1401. }
  1402. return((((ib_longlong)size_high) << 32) + (ib_longlong)size);
  1403. }
  1404. /***************************************************************************
  1405. Write the specified number of zeros to a newly created file. */
  1406. ibool
  1407. os_file_set_size(
  1408. /*=============*/
  1409. /* out: TRUE if success */
  1410. const char* name, /* in: name of the file or path as a
  1411. null-terminated string */
  1412. os_file_t file, /* in: handle to a file */
  1413. ulint size, /* in: least significant 32 bits of file
  1414. size */
  1415. ulint size_high)/* in: most significant 32 bits of size */
  1416. {
  1417. ib_longlong current_size;
  1418. ib_longlong desired_size;
  1419. ibool ret;
  1420. byte* buf;
  1421. byte* buf2;
  1422. ulint buf_size;
  1423. ut_a(size == (size & 0xFFFFFFFF));
  1424. current_size = 0;
  1425. desired_size = (ib_longlong)size + (((ib_longlong)size_high) << 32);
  1426. /* Write up to 1 megabyte at a time. */
  1427. buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
  1428. * UNIV_PAGE_SIZE;
  1429. buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
  1430. /* Align the buffer for possible raw i/o */
  1431. buf = ut_align(buf2, UNIV_PAGE_SIZE);
  1432. /* Write buffer full of zeros */
  1433. memset(buf, 0, buf_size);
  1434. if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
  1435. fprintf(stderr, "InnoDB: Progress in MB:");
  1436. }
  1437. while (current_size < desired_size) {
  1438. ulint n_bytes;
  1439. if (desired_size - current_size < (ib_longlong) buf_size) {
  1440. n_bytes = (ulint) (desired_size - current_size);
  1441. } else {
  1442. n_bytes = buf_size;
  1443. }
  1444. ret = os_file_write(name, file, buf,
  1445. (ulint)(current_size & 0xFFFFFFFF),
  1446. (ulint)(current_size >> 32),
  1447. n_bytes);
  1448. if (!ret) {
  1449. ut_free(buf2);
  1450. goto error_handling;
  1451. }
  1452. /* Print about progress for each 100 MB written */
  1453. if ((ib_longlong) (current_size + n_bytes) / (ib_longlong)(100 * 1024 * 1024)
  1454. != current_size / (ib_longlong)(100 * 1024 * 1024)) {
  1455. fprintf(stderr, " %lu00",
  1456. (ulong) ((current_size + n_bytes)
  1457. / (ib_longlong)(100 * 1024 * 1024)));
  1458. }
  1459. current_size += n_bytes;
  1460. }
  1461. if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
  1462. fprintf(stderr, "\n");
  1463. }
  1464. ut_free(buf2);
  1465. ret = os_file_flush(file);
  1466. if (ret) {
  1467. return(TRUE);
  1468. }
  1469. error_handling:
  1470. return(FALSE);
  1471. }
  1472. /***************************************************************************
  1473. Truncates a file at its current position. */
  1474. ibool
  1475. os_file_set_eof(
  1476. /*============*/
  1477. /* out: TRUE if success */
  1478. FILE* file) /* in: file to be truncated */
  1479. {
  1480. #ifdef __WIN__
  1481. HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
  1482. return(SetEndOfFile(h));
  1483. #else /* __WIN__ */
  1484. return(!ftruncate(fileno(file), ftell(file)));
  1485. #endif /* __WIN__ */
  1486. }
  1487. /***************************************************************************
  1488. Flushes the write buffers of a given file to the disk. */
  1489. ibool
  1490. os_file_flush(
  1491. /*==========*/
  1492. /* out: TRUE if success */
  1493. os_file_t file) /* in, own: handle to a file */
  1494. {
  1495. #ifdef __WIN__
  1496. BOOL ret;
  1497. ut_a(file);
  1498. os_n_fsyncs++;
  1499. ret = FlushFileBuffers(file);
  1500. if (ret) {
  1501. return(TRUE);
  1502. }
  1503. /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
  1504. actually a raw device, we choose to ignore that error if we are using
  1505. raw disks */
  1506. if (srv_start_raw_disk_in_use && GetLastError()
  1507. == ERROR_INVALID_FUNCTION) {
  1508. return(TRUE);
  1509. }
  1510. os_file_handle_error(NULL, "flush");
  1511. /* It is a fatal error if a file flush does not succeed, because then
  1512. the database can get corrupt on disk */
  1513. ut_error;
  1514. return(FALSE);
  1515. #else
  1516. int ret;
  1517. #if defined(HAVE_DARWIN_THREADS)
  1518. # ifndef F_FULLFSYNC
  1519. /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
  1520. # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
  1521. # elif F_FULLFSYNC != 51
  1522. # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
  1523. # endif
  1524. /* Apple has disabled fsync() for internal disk drives in OS X. That
  1525. caused corruption for a user when he tested a power outage. Let us in
  1526. OS X use a nonstandard flush method recommended by an Apple
  1527. engineer. */
  1528. if (!srv_have_fullfsync) {
  1529. /* If we are not on an operating system that supports this,
  1530. then fall back to a plain fsync. */
  1531. ret = fsync(file);
  1532. } else {
  1533. ret = fcntl(file, F_FULLFSYNC, NULL);
  1534. if (ret) {
  1535. /* If we are not on a file system that supports this,
  1536. then fall back to a plain fsync. */
  1537. ret = fsync(file);
  1538. }
  1539. }
  1540. #elif HAVE_FDATASYNC
  1541. ret = fdatasync(file);
  1542. #else
  1543. /* fprintf(stderr, "Flushing to file %p\n", file); */
  1544. ret = fsync(file);
  1545. #endif
  1546. os_n_fsyncs++;
  1547. if (ret == 0) {
  1548. return(TRUE);
  1549. }
  1550. /* Since Linux returns EINVAL if the 'file' is actually a raw device,
  1551. we choose to ignore that error if we are using raw disks */
  1552. if (srv_start_raw_disk_in_use && errno == EINVAL) {
  1553. return(TRUE);
  1554. }
  1555. ut_print_timestamp(stderr);
  1556. fprintf(stderr,
  1557. " InnoDB: Error: the OS said file flush did not succeed\n");
  1558. os_file_handle_error(NULL, "flush");
  1559. /* It is a fatal error if a file flush does not succeed, because then
  1560. the database can get corrupt on disk */
  1561. ut_error;
  1562. return(FALSE);
  1563. #endif
  1564. }
  1565. #ifndef __WIN__
  1566. /***********************************************************************
  1567. Does a synchronous read operation in Posix. */
  1568. static
  1569. ssize_t
  1570. os_file_pread(
  1571. /*==========*/
  1572. /* out: number of bytes read, -1 if error */
  1573. os_file_t file, /* in: handle to a file */
  1574. void* buf, /* in: buffer where to read */
  1575. ulint n, /* in: number of bytes to read */
  1576. ulint offset, /* in: least significant 32 bits of file
  1577. offset from where to read */
  1578. ulint offset_high) /* in: most significant 32 bits of
  1579. offset */
  1580. {
  1581. off_t offs;
  1582. ssize_t n_bytes;
  1583. ut_a((offset & 0xFFFFFFFFUL) == offset);
  1584. /* If off_t is > 4 bytes in size, then we assume we can pass a
  1585. 64-bit address */
  1586. if (sizeof(off_t) > 4) {
  1587. offs = (off_t)offset + (((off_t)offset_high) << 32);
  1588. } else {
  1589. offs = (off_t)offset;
  1590. if (offset_high > 0) {
  1591. fprintf(stderr,
  1592. "InnoDB: Error: file read at offset > 4 GB\n");
  1593. }
  1594. }
  1595. os_n_file_reads++;
  1596. #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
  1597. os_mutex_enter(os_file_count_mutex);
  1598. os_file_n_pending_preads++;
  1599. os_n_pending_reads++;
  1600. os_mutex_exit(os_file_count_mutex);
  1601. n_bytes = pread(file, buf, (ssize_t)n, offs);
  1602. os_mutex_enter(os_file_count_mutex);
  1603. os_file_n_pending_preads--;
  1604. os_n_pending_reads--;
  1605. os_mutex_exit(os_file_count_mutex);
  1606. return(n_bytes);
  1607. #else
  1608. {
  1609. off_t ret_offset;
  1610. ssize_t ret;
  1611. ulint i;
  1612. os_mutex_enter(os_file_count_mutex);
  1613. os_n_pending_reads++;
  1614. os_mutex_exit(os_file_count_mutex);
  1615. /* Protect the seek / read operation with a mutex */
  1616. i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
  1617. os_mutex_enter(os_file_seek_mutexes[i]);
  1618. ret_offset = lseek(file, offs, SEEK_SET);
  1619. if (ret_offset < 0) {
  1620. ret = -1;
  1621. } else {
  1622. ret = read(file, buf, (ssize_t)n);
  1623. }
  1624. os_mutex_exit(os_file_seek_mutexes[i]);
  1625. os_mutex_enter(os_file_count_mutex);
  1626. os_n_pending_reads--;
  1627. os_mutex_exit(os_file_count_mutex);
  1628. return(ret);
  1629. }
  1630. #endif
  1631. }
  1632. /***********************************************************************
  1633. Does a synchronous write operation in Posix. */
  1634. static
  1635. ssize_t
  1636. os_file_pwrite(
  1637. /*===========*/
  1638. /* out: number of bytes written, -1 if error */
  1639. os_file_t file, /* in: handle to a file */
  1640. const void* buf, /* in: buffer from where to write */
  1641. ulint n, /* in: number of bytes to write */
  1642. ulint offset, /* in: least significant 32 bits of file
  1643. offset where to write */
  1644. ulint offset_high) /* in: most significant 32 bits of
  1645. offset */
  1646. {
  1647. ssize_t ret;
  1648. off_t offs;
  1649. ut_a((offset & 0xFFFFFFFFUL) == offset);
  1650. /* If off_t is > 4 bytes in size, then we assume we can pass a
  1651. 64-bit address */
  1652. if (sizeof(off_t) > 4) {
  1653. offs = (off_t)offset + (((off_t)offset_high) << 32);
  1654. } else {
  1655. offs = (off_t)offset;
  1656. if (offset_high > 0) {
  1657. fprintf(stderr,
  1658. "InnoDB: Error: file write"
  1659. " at offset > 4 GB\n");
  1660. }
  1661. }
  1662. os_n_file_writes++;
  1663. #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
  1664. os_mutex_enter(os_file_count_mutex);
  1665. os_file_n_pending_pwrites++;
  1666. os_n_pending_writes++;
  1667. os_mutex_exit(os_file_count_mutex);
  1668. ret = pwrite(file, buf, (ssize_t)n, offs);
  1669. os_mutex_enter(os_file_count_mutex);
  1670. os_file_n_pending_pwrites--;
  1671. os_n_pending_writes--;
  1672. os_mutex_exit(os_file_count_mutex);
  1673. # ifdef UNIV_DO_FLUSH
  1674. if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
  1675. && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
  1676. && !os_do_not_call_flush_at_each_write) {
  1677. /* Always do fsync to reduce the probability that when
  1678. the OS crashes, a database page is only partially
  1679. physically written to disk. */
  1680. ut_a(TRUE == os_file_flush(file));
  1681. }
  1682. # endif /* UNIV_DO_FLUSH */
  1683. return(ret);
  1684. #else
  1685. {
  1686. off_t ret_offset;
  1687. ulint i;
  1688. os_mutex_enter(os_file_count_mutex);
  1689. os_n_pending_writes++;
  1690. os_mutex_exit(os_file_count_mutex);
  1691. /* Protect the seek / write operation with a mutex */
  1692. i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
  1693. os_mutex_enter(os_file_seek_mutexes[i]);
  1694. ret_offset = lseek(file, offs, SEEK_SET);
  1695. if (ret_offset < 0) {
  1696. ret = -1;
  1697. goto func_exit;
  1698. }
  1699. ret = write(file, buf, (ssize_t)n);
  1700. # ifdef UNIV_DO_FLUSH
  1701. if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
  1702. && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
  1703. && !os_do_not_call_flush_at_each_write) {
  1704. /* Always do fsync to reduce the probability that when
  1705. the OS crashes, a database page is only partially
  1706. physically written to disk. */
  1707. ut_a(TRUE == os_file_flush(file));
  1708. }
  1709. # endif /* UNIV_DO_FLUSH */
  1710. func_exit:
  1711. os_mutex_exit(os_file_seek_mutexes[i]);
  1712. os_mutex_enter(os_file_count_mutex);
  1713. os_n_pending_writes--;
  1714. os_mutex_exit(os_file_count_mutex);
  1715. return(ret);
  1716. }
  1717. #endif
  1718. }
  1719. #endif
  1720. /***********************************************************************
  1721. Requests a synchronous positioned read operation. */
  1722. ibool
  1723. os_file_read(
  1724. /*=========*/
  1725. /* out: TRUE if request was
  1726. successful, FALSE if fail */
  1727. os_file_t file, /* in: handle to a file */
  1728. void* buf, /* in: buffer where to read */
  1729. ulint offset, /* in: least significant 32 bits of file
  1730. offset where to read */
  1731. ulint offset_high, /* in: most significant 32 bits of
  1732. offset */
  1733. ulint n) /* in: number of bytes to read */
  1734. {
  1735. #ifdef __WIN__
  1736. BOOL ret;
  1737. DWORD len;
  1738. DWORD ret2;
  1739. DWORD low;
  1740. DWORD high;
  1741. ibool retry;
  1742. ulint i;
  1743. ut_a((offset & 0xFFFFFFFFUL) == offset);
  1744. os_n_file_reads++;
  1745. os_bytes_read_since_printout += n;
  1746. try_again:
  1747. ut_ad(file);
  1748. ut_ad(buf);
  1749. ut_ad(n > 0);
  1750. low = (DWORD) offset;
  1751. high = (DWORD) offset_high;
  1752. os_mutex_enter(os_file_count_mutex);
  1753. os_n_pending_reads++;
  1754. os_mutex_exit(os_file_count_mutex);
  1755. /* Protect the seek / read operation with a mutex */
  1756. i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
  1757. os_mutex_enter(os_file_seek_mutexes[i]);
  1758. ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
  1759. if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
  1760. os_mutex_exit(os_file_seek_mutexes[i]);
  1761. os_mutex_enter(os_file_count_mutex);
  1762. os_n_pending_reads--;
  1763. os_mutex_exit(os_file_count_mutex);
  1764. goto error_handling;
  1765. }
  1766. ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
  1767. os_mutex_exit(os_file_seek_mutexes[i]);
  1768. os_mutex_enter(os_file_count_mutex);
  1769. os_n_pending_reads--;
  1770. os_mutex_exit(os_file_count_mutex);
  1771. if (ret && len == n) {
  1772. return(TRUE);
  1773. }
  1774. #else
  1775. ibool retry;
  1776. ssize_t ret;
  1777. os_bytes_read_since_printout += n;
  1778. try_again:
  1779. ret = os_file_pread(file, buf, n, offset, offset_high);
  1780. if ((ulint)ret == n) {
  1781. return(TRUE);
  1782. }
  1783. fprintf(stderr,
  1784. "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
  1785. "InnoDB: Was only able to read %ld.\n",
  1786. (ulong)n, (ulong)offset_high,
  1787. (ulong)offset, (long)ret);
  1788. #endif
  1789. #ifdef __WIN__
  1790. error_handling:
  1791. #endif
  1792. retry = os_file_handle_error(NULL, "read");
  1793. if (retry) {
  1794. goto try_again;
  1795. }
  1796. fprintf(stderr,
  1797. "InnoDB: Fatal error: cannot read from file."
  1798. " OS error number %lu.\n",
  1799. #ifdef __WIN__
  1800. (ulong) GetLastError()
  1801. #else
  1802. (ulong) errno
  1803. #endif
  1804. );
  1805. fflush(stderr);
  1806. ut_error;
  1807. return(FALSE);
  1808. }
  1809. /***********************************************************************
  1810. Requests a synchronous positioned read operation. This function does not do
  1811. any error handling. In case of error it returns FALSE. */
  1812. ibool
  1813. os_file_read_no_error_handling(
  1814. /*===========================*/
  1815. /* out: TRUE if request was
  1816. successful, FALSE if fail */
  1817. os_file_t file, /* in: handle to a file */
  1818. void* buf, /* in: buffer where to read */
  1819. ulint offset, /* in: least significant 32 bits of file
  1820. offset where to read */
  1821. ulint offset_high, /* in: most significant 32 bits of
  1822. offset */
  1823. ulint n) /* in: number of bytes to read */
  1824. {
  1825. #ifdef __WIN__
  1826. BOOL ret;
  1827. DWORD len;
  1828. DWORD ret2;
  1829. DWORD low;
  1830. DWORD high;
  1831. ibool retry;
  1832. ulint i;
  1833. ut_a((offset & 0xFFFFFFFFUL) == offset);
  1834. os_n_file_reads++;
  1835. os_bytes_read_since_printout += n;
  1836. try_again:
  1837. ut_ad(file);
  1838. ut_ad(buf);
  1839. ut_ad(n > 0);
  1840. low = (DWORD) offset;
  1841. high = (DWORD) offset_high;
  1842. os_mutex_enter(os_file_count_mutex);
  1843. os_n_pending_reads++;
  1844. os_mutex_exit(os_file_count_mutex);
  1845. /* Protect the seek / read operation with a mutex */
  1846. i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
  1847. os_mutex_enter(os_file_seek_mutexes[i]);
  1848. ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
  1849. if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
  1850. os_mutex_exit(os_file_seek_mutexes[i]);
  1851. os_mutex_enter(os_file_count_mutex);
  1852. os_n_pending_reads--;
  1853. os_mutex_exit(os_file_count_mutex);
  1854. goto error_handling;
  1855. }
  1856. ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
  1857. os_mutex_exit(os_file_seek_mutexes[i]);
  1858. os_mutex_enter(os_file_count_mutex);
  1859. os_n_pending_reads--;
  1860. os_mutex_exit(os_file_count_mutex);
  1861. if (ret && len == n) {
  1862. return(TRUE);
  1863. }
  1864. #else
  1865. ibool retry;
  1866. ssize_t ret;
  1867. os_bytes_read_since_printout += n;
  1868. try_again:
  1869. ret = os_file_pread(file, buf, n, offset, offset_high);
  1870. if ((ulint)ret == n) {
  1871. return(TRUE);
  1872. }
  1873. #endif
  1874. #ifdef __WIN__
  1875. error_handling:
  1876. #endif
  1877. retry = os_file_handle_error_no_exit(NULL, "read");
  1878. if (retry) {
  1879. goto try_again;
  1880. }
  1881. return(FALSE);
  1882. }
  1883. /***********************************************************************
  1884. Rewind file to its start, read at most size - 1 bytes from it to str, and
  1885. NUL-terminate str. All errors are silently ignored. This function is
  1886. mostly meant to be used with temporary files. */
  1887. void
  1888. os_file_read_string(
  1889. /*================*/
  1890. FILE* file, /* in: file to read from */
  1891. char* str, /* in: buffer where to read */
  1892. ulint size) /* in: size of buffer */
  1893. {
  1894. size_t flen;
  1895. if (size == 0) {
  1896. return;
  1897. }
  1898. rewind(file);
  1899. flen = fread(str, 1, size - 1, file);
  1900. str[flen] = '\0';
  1901. }
  1902. /***********************************************************************
  1903. Requests a synchronous write operation. */
  1904. ibool
  1905. os_file_write(
  1906. /*==========*/
  1907. /* out: TRUE if request was
  1908. successful, FALSE if fail */
  1909. const char* name, /* in: name of the file or path as a
  1910. null-terminated string */
  1911. os_file_t file, /* in: handle to a file */
  1912. const void* buf, /* in: buffer from which to write */
  1913. ulint offset, /* in: least significant 32 bits of file
  1914. offset where to write */
  1915. ulint offset_high, /* in: most significant 32 bits of
  1916. offset */
  1917. ulint n) /* in: number of bytes to write */
  1918. {
  1919. #ifdef __WIN__
  1920. BOOL ret;
  1921. DWORD len;
  1922. DWORD ret2;
  1923. DWORD low;
  1924. DWORD high;
  1925. ulint i;
  1926. ulint n_retries = 0;
  1927. ulint err;
  1928. ut_a((offset & 0xFFFFFFFF) == offset);
  1929. os_n_file_writes++;
  1930. ut_ad(file);
  1931. ut_ad(buf);
  1932. ut_ad(n > 0);
  1933. retry:
  1934. low = (DWORD) offset;
  1935. high = (DWORD) offset_high;
  1936. os_mutex_enter(os_file_count_mutex);
  1937. os_n_pending_writes++;
  1938. os_mutex_exit(os_file_count_mutex);
  1939. /* Protect the seek / write operation with a mutex */
  1940. i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
  1941. os_mutex_enter(os_file_seek_mutexes[i]);
  1942. ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
  1943. if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
  1944. os_mutex_exit(os_file_seek_mutexes[i]);
  1945. os_mutex_enter(os_file_count_mutex);
  1946. os_n_pending_writes--;
  1947. os_mutex_exit(os_file_count_mutex);
  1948. ut_print_timestamp(stderr);
  1949. fprintf(stderr,
  1950. " InnoDB: Error: File pointer positioning to"
  1951. " file %s failed at\n"
  1952. "InnoDB: offset %lu %lu. Operating system"
  1953. " error number %lu.\n"
  1954. "InnoDB: Some operating system error numbers"
  1955. " are described at\n"
  1956. "InnoDB: "
  1957. "http://dev.mysql.com/doc/refman/5.1/en/"
  1958. "operating-system-error-codes.html\n",
  1959. name, (ulong) offset_high, (ulong) offset,
  1960. (ulong) GetLastError());
  1961. return(FALSE);
  1962. }
  1963. ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
  1964. /* Always do fsync to reduce the probability that when the OS crashes,
  1965. a database page is only partially physically written to disk. */
  1966. # ifdef UNIV_DO_FLUSH
  1967. if (!os_do_not_call_flush_at_each_write) {
  1968. ut_a(TRUE == os_file_flush(file));
  1969. }
  1970. # endif /* UNIV_DO_FLUSH */
  1971. os_mutex_exit(os_file_seek_mutexes[i]);
  1972. os_mutex_enter(os_file_count_mutex);
  1973. os_n_pending_writes--;
  1974. os_mutex_exit(os_file_count_mutex);
  1975. if (ret && len == n) {
  1976. return(TRUE);
  1977. }
  1978. /* If some background file system backup tool is running, then, at
  1979. least in Windows 2000, we may get here a specific error. Let us
  1980. retry the operation 100 times, with 1 second waits. */
  1981. if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
  1982. os_thread_sleep(1000000);
  1983. n_retries++;
  1984. goto retry;
  1985. }
  1986. if (!os_has_said_disk_full) {
  1987. err = (ulint)GetLastError();
  1988. ut_print_timestamp(stderr);
  1989. fprintf(stderr,
  1990. " InnoDB: Error: Write to file %s failed"
  1991. " at offset %lu %lu.\n"
  1992. "InnoDB: %lu bytes should have been written,"
  1993. " only %lu were written.\n"
  1994. "InnoDB: Operating system error number %lu.\n"
  1995. "InnoDB: Check that your OS and file system"
  1996. " support files of this size.\n"
  1997. "InnoDB: Check also that the disk is not full"
  1998. " or a disk quota exceeded.\n",
  1999. name, (ulong) offset_high, (ulong) offset,
  2000. (ulong) n, (ulong) len, (ulong) err);
  2001. if (strerror((int)err) != NULL) {
  2002. fprintf(stderr,
  2003. "InnoDB: Error number %lu means '%s'.\n",
  2004. (ulong) err, strerror((int)err));
  2005. }
  2006. fprintf(stderr,
  2007. "InnoDB: Some operating system error numbers"
  2008. " are described at\n"
  2009. "InnoDB: "
  2010. "http://dev.mysql.com/doc/refman/5.1/en/"
  2011. "operating-system-error-codes.html\n");
  2012. os_has_said_disk_full = TRUE;
  2013. }
  2014. return(FALSE);
  2015. #else
  2016. ssize_t ret;
  2017. ret = os_file_pwrite(file, buf, n, offset, offset_high);
  2018. if ((ulint)ret == n) {
  2019. return(TRUE);
  2020. }
  2021. if (!os_has_said_disk_full) {
  2022. ut_print_timestamp(stderr);
  2023. fprintf(stderr,
  2024. " InnoDB: Error: Write to file %s failed"
  2025. " at offset %lu %lu.\n"
  2026. "InnoDB: %lu bytes should have been written,"
  2027. " only %ld were written.\n"
  2028. "InnoDB: Operating system error number %lu.\n"
  2029. "InnoDB: Check that your OS and file system"
  2030. " support files of this size.\n"
  2031. "InnoDB: Check also that the disk is not full"
  2032. " or a disk quota exceeded.\n",
  2033. name, offset_high, offset, n, (long int)ret,
  2034. (ulint)errno);
  2035. if (strerror(errno) != NULL) {
  2036. fprintf(stderr,
  2037. "InnoDB: Error number %lu means '%s'.\n",
  2038. (ulint)errno, strerror(errno));
  2039. }
  2040. fprintf(stderr,
  2041. "InnoDB: Some operating system error numbers"
  2042. " are described at\n"
  2043. "InnoDB: "
  2044. "http://dev.mysql.com/doc/refman/5.1/en/"
  2045. "operating-system-error-codes.html\n");
  2046. os_has_said_disk_full = TRUE;
  2047. }
  2048. return(FALSE);
  2049. #endif
  2050. }
  2051. /***********************************************************************
  2052. Check the existence and type of the given file. */
  2053. ibool
  2054. os_file_status(
  2055. /*===========*/
  2056. /* out: TRUE if call succeeded */
  2057. const char* path, /* in: pathname of the file */
  2058. ibool* exists, /* out: TRUE if file exists */
  2059. os_file_type_t* type) /* out: type of the file (if it exists) */
  2060. {
  2061. #ifdef __WIN__
  2062. int ret;
  2063. struct _stat statinfo;
  2064. ret = _stat(path, &statinfo);
  2065. if (ret && (errno == ENOENT || errno == ENOTDIR)) {
  2066. /* file does not exist */
  2067. *exists = FALSE;
  2068. return(TRUE);
  2069. } else if (ret) {
  2070. /* file exists, but stat call failed */
  2071. os_file_handle_error_no_exit(path, "stat");
  2072. return(FALSE);
  2073. }
  2074. if (_S_IFDIR & statinfo.st_mode) {
  2075. *type = OS_FILE_TYPE_DIR;
  2076. } else if (_S_IFREG & statinfo.st_mode) {
  2077. *type = OS_FILE_TYPE_FILE;
  2078. } else {
  2079. *type = OS_FILE_TYPE_UNKNOWN;
  2080. }
  2081. *exists = TRUE;
  2082. return(TRUE);
  2083. #else
  2084. int ret;
  2085. struct stat statinfo;
  2086. ret = stat(path, &statinfo);
  2087. if (ret && (errno == ENOENT || errno == ENOTDIR)) {
  2088. /* file does not exist */
  2089. *exists = FALSE;
  2090. return(TRUE);
  2091. } else if (ret) {
  2092. /* file exists, but stat call failed */
  2093. os_file_handle_error_no_exit(path, "stat");
  2094. return(FALSE);
  2095. }
  2096. if (S_ISDIR(statinfo.st_mode)) {
  2097. *type = OS_FILE_TYPE_DIR;
  2098. } else if (S_ISLNK(statinfo.st_mode)) {
  2099. *type = OS_FILE_TYPE_LINK;
  2100. } else if (S_ISREG(statinfo.st_mode)) {
  2101. *type = OS_FILE_TYPE_FILE;
  2102. } else {
  2103. *type = OS_FILE_TYPE_UNKNOWN;
  2104. }
  2105. *exists = TRUE;
  2106. return(TRUE);
  2107. #endif
  2108. }
  2109. /***********************************************************************
  2110. This function returns information about the specified file */
  2111. ibool
  2112. os_file_get_status(
  2113. /*===============*/
  2114. /* out: TRUE if stat
  2115. information found */
  2116. const char* path, /* in: pathname of the file */
  2117. os_file_stat_t* stat_info) /* information of a file in a
  2118. directory */
  2119. {
  2120. #ifdef __WIN__
  2121. int ret;
  2122. struct _stat statinfo;
  2123. ret = _stat(path, &statinfo);
  2124. if (ret && (errno == ENOENT || errno == ENOTDIR)) {
  2125. /* file does not exist */
  2126. return(FALSE);
  2127. } else if (ret) {
  2128. /* file exists, but stat call failed */
  2129. os_file_handle_error_no_exit(path, "stat");
  2130. return(FALSE);
  2131. }
  2132. if (_S_IFDIR & statinfo.st_mode) {
  2133. stat_info->type = OS_FILE_TYPE_DIR;
  2134. } else if (_S_IFREG & statinfo.st_mode) {
  2135. stat_info->type = OS_FILE_TYPE_FILE;
  2136. } else {
  2137. stat_info->type = OS_FILE_TYPE_UNKNOWN;
  2138. }
  2139. stat_info->ctime = statinfo.st_ctime;
  2140. stat_info->atime = statinfo.st_atime;
  2141. stat_info->mtime = statinfo.st_mtime;
  2142. stat_info->size = statinfo.st_size;
  2143. return(TRUE);
  2144. #else
  2145. int ret;
  2146. struct stat statinfo;
  2147. ret = stat(path, &statinfo);
  2148. if (ret && (errno == ENOENT || errno == ENOTDIR)) {
  2149. /* file does not exist */
  2150. return(FALSE);
  2151. } else if (ret) {
  2152. /* file exists, but stat call failed */
  2153. os_file_handle_error_no_exit(path, "stat");
  2154. return(FALSE);
  2155. }
  2156. if (S_ISDIR(statinfo.st_mode)) {
  2157. stat_info->type = OS_FILE_TYPE_DIR;
  2158. } else if (S_ISLNK(statinfo.st_mode)) {
  2159. stat_info->type = OS_FILE_TYPE_LINK;
  2160. } else if (S_ISREG(statinfo.st_mode)) {
  2161. stat_info->type = OS_FILE_TYPE_FILE;
  2162. } else {
  2163. stat_info->type = OS_FILE_TYPE_UNKNOWN;
  2164. }
  2165. stat_info->ctime = statinfo.st_ctime;
  2166. stat_info->atime = statinfo.st_atime;
  2167. stat_info->mtime = statinfo.st_mtime;
  2168. stat_info->size = statinfo.st_size;
  2169. return(TRUE);
  2170. #endif
  2171. }
  /* Character that separates the components of a path name: a backslash
when __WIN__ is defined, a forward slash otherwise */
#ifndef __WIN__
#  define OS_FILE_PATH_SEPARATOR	'/'
#else
#  define OS_FILE_PATH_SEPARATOR	'\\'
#endif
  2178. /********************************************************************
  2179. The function os_file_dirname returns a directory component of a
  2180. null-terminated pathname string. In the usual case, dirname returns
  2181. the string up to, but not including, the final '/', and basename
  2182. is the component following the final '/'. Trailing '/' charac
  2183. ters are not counted as part of the pathname.
  2184. If path does not contain a slash, dirname returns the string ".".
  2185. Concatenating the string returned by dirname, a "/", and the basename
  2186. yields a complete pathname.
  2187. The return value is a copy of the directory component of the pathname.
  2188. The copy is allocated from heap. It is the caller responsibility
  2189. to free it after it is no longer needed.
  2190. The following list of examples (taken from SUSv2) shows the strings
  2191. returned by dirname and basename for different paths:
  2192. path dirname basename
  2193. "/usr/lib" "/usr" "lib"
  2194. "/usr/" "/" "usr"
  2195. "usr" "." "usr"
  2196. "/" "/" "/"
  2197. "." "." "."
  2198. ".." "." ".."
  2199. */
  2200. char*
  2201. os_file_dirname(
  2202. /*============*/
  2203. /* out, own: directory component of the
  2204. pathname */
  2205. const char* path) /* in: pathname */
  2206. {
  2207. /* Find the offset of the last slash */
  2208. const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
  2209. if (!last_slash) {
  2210. /* No slash in the path, return "." */
  2211. return(mem_strdup("."));
  2212. }
  2213. /* Ok, there is a slash */
  2214. if (last_slash == path) {
  2215. /* last slash is the first char of the path */
  2216. return(mem_strdup("/"));
  2217. }
  2218. /* Non-trivial directory component */
  2219. return(mem_strdupl(path, last_slash - path));
  2220. }
  2221. /********************************************************************
  2222. Creates all missing subdirectories along the given path. */
  2223. ibool
  2224. os_file_create_subdirs_if_needed(
  2225. /*=============================*/
  2226. /* out: TRUE if call succeeded
  2227. FALSE otherwise */
  2228. const char* path) /* in: path name */
  2229. {
  2230. char* subdir;
  2231. ibool success, subdir_exists;
  2232. os_file_type_t type;
  2233. subdir = os_file_dirname(path);
  2234. if (strlen(subdir) == 1
  2235. && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
  2236. /* subdir is root or cwd, nothing to do */
  2237. mem_free(subdir);
  2238. return(TRUE);
  2239. }
  2240. /* Test if subdir exists */
  2241. success = os_file_status(subdir, &subdir_exists, &type);
  2242. if (success && !subdir_exists) {
  2243. /* subdir does not exist, create it */
  2244. success = os_file_create_subdirs_if_needed(subdir);
  2245. if (!success) {
  2246. mem_free(subdir);
  2247. return(FALSE);
  2248. }
  2249. success = os_file_create_directory(subdir, FALSE);
  2250. }
  2251. mem_free(subdir);
  2252. return(success);
  2253. }
  2254. /********************************************************************
  2255. Returns a pointer to the nth slot in the aio array. */
  2256. static
  2257. os_aio_slot_t*
  2258. os_aio_array_get_nth_slot(
  2259. /*======================*/
  2260. /* out: pointer to slot */
  2261. os_aio_array_t* array, /* in: aio array */
  2262. ulint index) /* in: index of the slot */
  2263. {
  2264. ut_a(index < array->n_slots);
  2265. return((array->slots) + index);
  2266. }
  2267. /****************************************************************************
  2268. Creates an aio wait array. */
  2269. static
  2270. os_aio_array_t*
  2271. os_aio_array_create(
  2272. /*================*/
  2273. /* out, own: aio array */
  2274. ulint n, /* in: maximum number of pending aio operations
  2275. allowed; n must be divisible by n_segments */
  2276. ulint n_segments) /* in: number of segments in the aio array */
  2277. {
  2278. os_aio_array_t* array;
  2279. ulint i;
  2280. os_aio_slot_t* slot;
  2281. #ifdef WIN_ASYNC_IO
  2282. OVERLAPPED* over;
  2283. #endif
  2284. ut_a(n > 0);
  2285. ut_a(n_segments > 0);
  2286. array = ut_malloc(sizeof(os_aio_array_t));
  2287. array->mutex = os_mutex_create(NULL);
  2288. array->not_full = os_event_create(NULL);
  2289. array->is_empty = os_event_create(NULL);
  2290. os_event_set(array->is_empty);
  2291. array->n_slots = n;
  2292. array->n_segments = n_segments;
  2293. array->n_reserved = 0;
  2294. array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
  2295. #ifdef __WIN__
  2296. array->native_events = ut_malloc(n * sizeof(os_native_event_t));
  2297. #endif
  2298. for (i = 0; i < n; i++) {
  2299. slot = os_aio_array_get_nth_slot(array, i);
  2300. slot->pos = i;
  2301. slot->reserved = FALSE;
  2302. #ifdef WIN_ASYNC_IO
  2303. slot->event = os_event_create(NULL);
  2304. over = &(slot->control);
  2305. over->hEvent = slot->event->handle;
  2306. *((array->native_events) + i) = over->hEvent;
  2307. #endif
  2308. }
  2309. return(array);
  2310. }
  2311. /****************************************************************************
  2312. Initializes the asynchronous io system. Calls also os_io_init_simple.
  2313. Creates a separate aio array for
  2314. non-ibuf read and write, a third aio array for the ibuf i/o, with just one
  2315. segment, two aio arrays for log reads and writes with one segment, and a
  2316. synchronous aio array of the specified size. The combined number of segments
  2317. in the three first aio arrays is the parameter n_segments given to the
  2318. function. The caller must create an i/o handler thread for each segment in
  2319. the four first arrays, but not for the sync aio array. */
  2320. void
  2321. os_aio_init(
  2322. /*========*/
  2323. ulint n, /* in: maximum number of pending aio operations
  2324. allowed; n must be divisible by n_segments */
  2325. ulint n_segments, /* in: combined number of segments in the four
  2326. first aio arrays; must be >= 4 */
  2327. ulint n_slots_sync) /* in: number of slots in the sync aio array */
  2328. {
  2329. ulint n_read_segs;
  2330. ulint n_write_segs;
  2331. ulint n_per_seg;
  2332. ulint i;
  2333. #ifdef POSIX_ASYNC_IO
  2334. sigset_t sigset;
  2335. #endif
  2336. ut_ad(n % n_segments == 0);
  2337. ut_ad(n_segments >= 4);
  2338. os_io_init_simple();
  2339. for (i = 0; i < n_segments; i++) {
  2340. srv_set_io_thread_op_info(i, "not started yet");
  2341. }
  2342. n_per_seg = n / n_segments;
  2343. n_write_segs = (n_segments - 2) / 2;
  2344. n_read_segs = n_segments - 2 - n_write_segs;
  2345. /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
  2346. os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
  2347. srv_io_thread_function[0] = "insert buffer thread";
  2348. os_aio_log_array = os_aio_array_create(n_per_seg, 1);
  2349. srv_io_thread_function[1] = "log thread";
  2350. os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
  2351. n_read_segs);
  2352. for (i = 2; i < 2 + n_read_segs; i++) {
  2353. ut_a(i < SRV_MAX_N_IO_THREADS);
  2354. srv_io_thread_function[i] = "read thread";
  2355. }
  2356. os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
  2357. n_write_segs);
  2358. for (i = 2 + n_read_segs; i < n_segments; i++) {
  2359. ut_a(i < SRV_MAX_N_IO_THREADS);
  2360. srv_io_thread_function[i] = "write thread";
  2361. }
  2362. os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
  2363. os_aio_n_segments = n_segments;
  2364. os_aio_validate();
  2365. os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
  2366. for (i = 0; i < n_segments; i++) {
  2367. os_aio_segment_wait_events[i] = os_event_create(NULL);
  2368. }
  2369. os_last_printout = time(NULL);
  2370. #ifdef POSIX_ASYNC_IO
  2371. /* Block aio signals from the current thread and its children:
  2372. for this to work, the current thread must be the first created
  2373. in the database, so that all its children will inherit its
  2374. signal mask */
  2375. /* TODO: to work MySQL needs the SIGALARM signal; the following
  2376. will not work yet! */
  2377. sigemptyset(&sigset);
  2378. sigaddset(&sigset, SIGRTMIN + 1 + 0);
  2379. sigaddset(&sigset, SIGRTMIN + 1 + 1);
  2380. sigaddset(&sigset, SIGRTMIN + 1 + 2);
  2381. sigaddset(&sigset, SIGRTMIN + 1 + 3);
  2382. pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
  2383. #endif
  2384. }
  #ifdef WIN_ASYNC_IO
/****************************************************************************
Sets the event of every slot of the given aio array. Used at shutdown to
wake up the i/o threads that wait on these events in Windows async i/o. */
static
void
os_aio_array_wake_win_aio_at_shutdown(
/*==================================*/
	os_aio_array_t*	array)	/* in: aio array */
{
	ulint	pos = 0;

	while (pos < array->n_slots) {

		os_event_set(array->slots[pos].event);

		pos++;
	}
}
#endif
  2401. /****************************************************************************
  2402. Wakes up all async i/o threads so that they know to exit themselves in
  2403. shutdown. */
  2404. void
  2405. os_aio_wake_all_threads_at_shutdown(void)
  2406. /*=====================================*/
  2407. {
  2408. ulint i;
  2409. #ifdef WIN_ASYNC_IO
  2410. /* This code wakes up all ai/o threads in Windows native aio */
  2411. os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
  2412. os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
  2413. os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
  2414. os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
  2415. #endif
  2416. /* This loop wakes up all simulated ai/o threads */
  2417. for (i = 0; i < os_aio_n_segments; i++) {
  2418. os_event_set(os_aio_segment_wait_events[i]);
  2419. }
  2420. }
  2421. /****************************************************************************
  2422. Waits until there are no pending writes in os_aio_write_array. There can
  2423. be other, synchronous, pending writes. */
  2424. void
  2425. os_aio_wait_until_no_pending_writes(void)
  2426. /*=====================================*/
  2427. {
  2428. os_event_wait(os_aio_write_array->is_empty);
  2429. }
  2430. /**************************************************************************
  2431. Calculates segment number for a slot. */
  2432. static
  2433. ulint
  2434. os_aio_get_segment_no_from_slot(
  2435. /*============================*/
  2436. /* out: segment number (which is the number
  2437. used by, for example, i/o-handler threads) */
  2438. os_aio_array_t* array, /* in: aio wait array */
  2439. os_aio_slot_t* slot) /* in: slot in this array */
  2440. {
  2441. ulint segment;
  2442. ulint seg_len;
  2443. if (array == os_aio_ibuf_array) {
  2444. segment = 0;
  2445. } else if (array == os_aio_log_array) {
  2446. segment = 1;
  2447. } else if (array == os_aio_read_array) {
  2448. seg_len = os_aio_read_array->n_slots
  2449. / os_aio_read_array->n_segments;
  2450. segment = 2 + slot->pos / seg_len;
  2451. } else {
  2452. ut_a(array == os_aio_write_array);
  2453. seg_len = os_aio_write_array->n_slots
  2454. / os_aio_write_array->n_segments;
  2455. segment = os_aio_read_array->n_segments + 2
  2456. + slot->pos / seg_len;
  2457. }
  2458. return(segment);
  2459. }
  2460. /**************************************************************************
  2461. Calculates local segment number and aio array from global segment number. */
  2462. static
  2463. ulint
  2464. os_aio_get_array_and_local_segment(
  2465. /*===============================*/
  2466. /* out: local segment number within
  2467. the aio array */
  2468. os_aio_array_t** array, /* out: aio wait array */
  2469. ulint global_segment)/* in: global segment number */
  2470. {
  2471. ulint segment;
  2472. ut_a(global_segment < os_aio_n_segments);
  2473. if (global_segment == 0) {
  2474. *array = os_aio_ibuf_array;
  2475. segment = 0;
  2476. } else if (global_segment == 1) {
  2477. *array = os_aio_log_array;
  2478. segment = 0;
  2479. } else if (global_segment < os_aio_read_array->n_segments + 2) {
  2480. *array = os_aio_read_array;
  2481. segment = global_segment - 2;
  2482. } else {
  2483. *array = os_aio_write_array;
  2484. segment = global_segment - (os_aio_read_array->n_segments + 2);
  2485. }
  2486. return(segment);
  2487. }
  2488. /***********************************************************************
  2489. Gets an integer value designating a specified aio array. This is used
  2490. to give numbers to signals in Posix aio. */
  2491. #if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO)
  2492. static
  2493. ulint
  2494. os_aio_get_array_no(
  2495. /*================*/
  2496. os_aio_array_t* array) /* in: aio array */
  2497. {
  2498. if (array == os_aio_ibuf_array) {
  2499. return(0);
  2500. } else if (array == os_aio_log_array) {
  2501. return(1);
  2502. } else if (array == os_aio_read_array) {
  2503. return(2);
  2504. } else if (array == os_aio_write_array) {
  2505. return(3);
  2506. } else {
  2507. ut_error;
  2508. return(0);
  2509. }
  2510. }
  2511. /***********************************************************************
  2512. Gets the aio array for its number. */
  2513. static
  2514. os_aio_array_t*
  2515. os_aio_get_array_from_no(
  2516. /*=====================*/
  2517. /* out: aio array */
  2518. ulint n) /* in: array number */
  2519. {
  2520. if (n == 0) {
  2521. return(os_aio_ibuf_array);
  2522. } else if (n == 1) {
  2523. return(os_aio_log_array);
  2524. } else if (n == 2) {
  2525. return(os_aio_read_array);
  2526. } else if (n == 3) {
  2527. return(os_aio_write_array);
  2528. } else {
  2529. ut_error;
  2530. return(NULL);
  2531. }
  2532. }
  2533. #endif /* if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) */
  2534. /***********************************************************************
  2535. Requests for a slot in the aio array. If no slot is available, waits until
  2536. not_full-event becomes signaled. */
  2537. static
  2538. os_aio_slot_t*
  2539. os_aio_array_reserve_slot(
  2540. /*======================*/
  2541. /* out: pointer to slot */
  2542. ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
  2543. os_aio_array_t* array, /* in: aio array */
  2544. fil_node_t* message1,/* in: message to be passed along with
  2545. the aio operation */
  2546. void* message2,/* in: message to be passed along with
  2547. the aio operation */
  2548. os_file_t file, /* in: file handle */
  2549. const char* name, /* in: name of the file or path as a
  2550. null-terminated string */
  2551. void* buf, /* in: buffer where to read or from which
  2552. to write */
  2553. ulint offset, /* in: least significant 32 bits of file
  2554. offset */
  2555. ulint offset_high, /* in: most significant 32 bits of
  2556. offset */
  2557. ulint len) /* in: length of the block to read or write */
  2558. {
  2559. os_aio_slot_t* slot;
  2560. #ifdef WIN_ASYNC_IO
  2561. OVERLAPPED* control;
  2562. #elif defined(POSIX_ASYNC_IO)
  2563. struct aiocb* control;
  2564. #endif
  2565. ulint i;
  2566. loop:
  2567. os_mutex_enter(array->mutex);
  2568. if (array->n_reserved == array->n_slots) {
  2569. os_mutex_exit(array->mutex);
  2570. if (!os_aio_use_native_aio) {
  2571. /* If the handler threads are suspended, wake them
  2572. so that we get more slots */
  2573. os_aio_simulated_wake_handler_threads();
  2574. }
  2575. os_event_wait(array->not_full);
  2576. goto loop;
  2577. }
  2578. for (i = 0;; i++) {
  2579. slot = os_aio_array_get_nth_slot(array, i);
  2580. if (slot->reserved == FALSE) {
  2581. break;
  2582. }
  2583. }
  2584. array->n_reserved++;
  2585. if (array->n_reserved == 1) {
  2586. os_event_reset(array->is_empty);
  2587. }
  2588. if (array->n_reserved == array->n_slots) {
  2589. os_event_reset(array->not_full);
  2590. }
  2591. slot->reserved = TRUE;
  2592. slot->reservation_time = time(NULL);
  2593. slot->message1 = message1;
  2594. slot->message2 = message2;
  2595. slot->file = file;
  2596. slot->name = name;
  2597. slot->len = len;
  2598. slot->type = type;
  2599. slot->buf = buf;
  2600. slot->offset = offset;
  2601. slot->offset_high = offset_high;
  2602. slot->io_already_done = FALSE;
  2603. #ifdef WIN_ASYNC_IO
  2604. control = &(slot->control);
  2605. control->Offset = (DWORD)offset;
  2606. control->OffsetHigh = (DWORD)offset_high;
  2607. os_event_reset(slot->event);
  2608. #elif defined(POSIX_ASYNC_IO)
  2609. #if (UNIV_WORD_SIZE == 8)
  2610. offset = offset + (offset_high << 32);
  2611. #else
  2612. ut_a(offset_high == 0);
  2613. #endif
  2614. control = &(slot->control);
  2615. control->aio_fildes = file;
  2616. control->aio_buf = buf;
  2617. control->aio_nbytes = len;
  2618. control->aio_offset = offset;
  2619. control->aio_reqprio = 0;
  2620. control->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
  2621. control->aio_sigevent.sigev_signo
  2622. = SIGRTMIN + 1 + os_aio_get_array_no(array);
  2623. /* TODO: How to choose the signal numbers? */
  2624. /*
  2625. fprintf(stderr, "AIO signal number %lu\n",
  2626. (ulint) control->aio_sigevent.sigev_signo);
  2627. */
  2628. control->aio_sigevent.sigev_value.sival_ptr = slot;
  2629. #endif
  2630. os_mutex_exit(array->mutex);
  2631. return(slot);
  2632. }
  2633. /***********************************************************************
  2634. Frees a slot in the aio array. */
  2635. static
  2636. void
  2637. os_aio_array_free_slot(
  2638. /*===================*/
  2639. os_aio_array_t* array, /* in: aio array */
  2640. os_aio_slot_t* slot) /* in: pointer to slot */
  2641. {
  2642. ut_ad(array);
  2643. ut_ad(slot);
  2644. os_mutex_enter(array->mutex);
  2645. ut_ad(slot->reserved);
  2646. slot->reserved = FALSE;
  2647. array->n_reserved--;
  2648. if (array->n_reserved == array->n_slots - 1) {
  2649. os_event_set(array->not_full);
  2650. }
  2651. if (array->n_reserved == 0) {
  2652. os_event_set(array->is_empty);
  2653. }
  2654. #ifdef WIN_ASYNC_IO
  2655. os_event_reset(slot->event);
  2656. #endif
  2657. os_mutex_exit(array->mutex);
  2658. }
  2659. /**************************************************************************
  2660. Wakes up a simulated aio i/o-handler thread if it has something to do. */
  2661. static
  2662. void
  2663. os_aio_simulated_wake_handler_thread(
  2664. /*=================================*/
  2665. ulint global_segment) /* in: the number of the segment in the aio
  2666. arrays */
  2667. {
  2668. os_aio_array_t* array;
  2669. os_aio_slot_t* slot;
  2670. ulint segment;
  2671. ulint n;
  2672. ulint i;
  2673. ut_ad(!os_aio_use_native_aio);
  2674. segment = os_aio_get_array_and_local_segment(&array, global_segment);
  2675. n = array->n_slots / array->n_segments;
  2676. /* Look through n slots after the segment * n'th slot */
  2677. os_mutex_enter(array->mutex);
  2678. for (i = 0; i < n; i++) {
  2679. slot = os_aio_array_get_nth_slot(array, i + segment * n);
  2680. if (slot->reserved) {
  2681. /* Found an i/o request */
  2682. break;
  2683. }
  2684. }
  2685. os_mutex_exit(array->mutex);
  2686. if (i < n) {
  2687. os_event_set(os_aio_segment_wait_events[global_segment]);
  2688. }
  2689. }
  2690. /**************************************************************************
  2691. Wakes up simulated aio i/o-handler threads if they have something to do. */
  2692. void
  2693. os_aio_simulated_wake_handler_threads(void)
  2694. /*=======================================*/
  2695. {
  2696. ulint i;
  2697. if (os_aio_use_native_aio) {
  2698. /* We do not use simulated aio: do nothing */
  2699. return;
  2700. }
  2701. os_aio_recommend_sleep_for_read_threads = FALSE;
  2702. for (i = 0; i < os_aio_n_segments; i++) {
  2703. os_aio_simulated_wake_handler_thread(i);
  2704. }
  2705. }
  2706. /**************************************************************************
  2707. This function can be called if one wants to post a batch of reads and
  2708. prefers an i/o-handler thread to handle them all at once later. You must
  2709. call os_aio_simulated_wake_handler_threads later to ensure the threads
  2710. are not left sleeping! */
  2711. void
  2712. os_aio_simulated_put_read_threads_to_sleep(void)
  2713. /*============================================*/
  2714. {
  2715. os_aio_array_t* array;
  2716. ulint g;
  2717. os_aio_recommend_sleep_for_read_threads = TRUE;
  2718. for (g = 0; g < os_aio_n_segments; g++) {
  2719. os_aio_get_array_and_local_segment(&array, g);
  2720. if (array == os_aio_read_array) {
  2721. os_event_reset(os_aio_segment_wait_events[g]);
  2722. }
  2723. }
  2724. }
  2725. /***********************************************************************
  2726. Requests an asynchronous i/o operation. */
  2727. ibool
  2728. os_aio(
  2729. /*===*/
  2730. /* out: TRUE if request was queued
  2731. successfully, FALSE if fail */
  2732. ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
  2733. ulint mode, /* in: OS_AIO_NORMAL, ..., possibly ORed
  2734. to OS_AIO_SIMULATED_WAKE_LATER: the
  2735. last flag advises this function not to wake
  2736. i/o-handler threads, but the caller will
  2737. do the waking explicitly later, in this
  2738. way the caller can post several requests in
  2739. a batch; NOTE that the batch must not be
  2740. so big that it exhausts the slots in aio
  2741. arrays! NOTE that a simulated batch
  2742. may introduce hidden chances of deadlocks,
  2743. because i/os are not actually handled until
  2744. all have been posted: use with great
  2745. caution! */
  2746. const char* name, /* in: name of the file or path as a
  2747. null-terminated string */
  2748. os_file_t file, /* in: handle to a file */
  2749. void* buf, /* in: buffer where to read or from which
  2750. to write */
  2751. ulint offset, /* in: least significant 32 bits of file
  2752. offset where to read or write */
  2753. ulint offset_high, /* in: most significant 32 bits of
  2754. offset */
  2755. ulint n, /* in: number of bytes to read or write */
  2756. fil_node_t* message1,/* in: messages for the aio handler (these
  2757. can be used to identify a completed aio
  2758. operation); if mode is OS_AIO_SYNC, these
  2759. are ignored */
  2760. void* message2)
  2761. {
  2762. os_aio_array_t* array;
  2763. os_aio_slot_t* slot;
  2764. #ifdef WIN_ASYNC_IO
  2765. ibool retval;
  2766. BOOL ret = TRUE;
  2767. DWORD len = (DWORD) n;
  2768. struct fil_node_struct * dummy_mess1;
  2769. void* dummy_mess2;
  2770. ulint dummy_type;
  2771. #endif
  2772. ulint err = 0;
  2773. ibool retry;
  2774. ulint wake_later;
  2775. ut_ad(file);
  2776. ut_ad(buf);
  2777. ut_ad(n > 0);
  2778. ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
  2779. ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
  2780. ut_ad(os_aio_validate());
  2781. wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
  2782. mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
  2783. if (mode == OS_AIO_SYNC
  2784. #ifdef WIN_ASYNC_IO
  2785. && !os_aio_use_native_aio
  2786. #endif
  2787. ) {
  2788. /* This is actually an ordinary synchronous read or write:
  2789. no need to use an i/o-handler thread. NOTE that if we use
  2790. Windows async i/o, Windows does not allow us to use
  2791. ordinary synchronous os_file_read etc. on the same file,
  2792. therefore we have built a special mechanism for synchronous
  2793. wait in the Windows case. */
  2794. if (type == OS_FILE_READ) {
  2795. return(os_file_read(file, buf, offset,
  2796. offset_high, n));
  2797. }
  2798. ut_a(type == OS_FILE_WRITE);
  2799. return(os_file_write(name, file, buf, offset, offset_high, n));
  2800. }
  2801. try_again:
  2802. if (mode == OS_AIO_NORMAL) {
  2803. if (type == OS_FILE_READ) {
  2804. array = os_aio_read_array;
  2805. } else {
  2806. array = os_aio_write_array;
  2807. }
  2808. } else if (mode == OS_AIO_IBUF) {
  2809. ut_ad(type == OS_FILE_READ);
  2810. /* Reduce probability of deadlock bugs in connection with ibuf:
  2811. do not let the ibuf i/o handler sleep */
  2812. wake_later = FALSE;
  2813. array = os_aio_ibuf_array;
  2814. } else if (mode == OS_AIO_LOG) {
  2815. array = os_aio_log_array;
  2816. } else if (mode == OS_AIO_SYNC) {
  2817. array = os_aio_sync_array;
  2818. } else {
  2819. array = NULL; /* Eliminate compiler warning */
  2820. ut_error;
  2821. }
  2822. slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
  2823. name, buf, offset, offset_high, n);
  2824. if (type == OS_FILE_READ) {
  2825. if (os_aio_use_native_aio) {
  2826. #ifdef WIN_ASYNC_IO
  2827. os_n_file_reads++;
  2828. os_bytes_read_since_printout += len;
  2829. ret = ReadFile(file, buf, (DWORD)n, &len,
  2830. &(slot->control));
  2831. #elif defined(POSIX_ASYNC_IO)
  2832. slot->control.aio_lio_opcode = LIO_READ;
  2833. err = (ulint) aio_read(&(slot->control));
  2834. fprintf(stderr, "Starting POSIX aio read %lu\n", err);
  2835. #endif
  2836. } else {
  2837. if (!wake_later) {
  2838. os_aio_simulated_wake_handler_thread(
  2839. os_aio_get_segment_no_from_slot(
  2840. array, slot));
  2841. }
  2842. }
  2843. } else if (type == OS_FILE_WRITE) {
  2844. if (os_aio_use_native_aio) {
  2845. #ifdef WIN_ASYNC_IO
  2846. os_n_file_writes++;
  2847. ret = WriteFile(file, buf, (DWORD)n, &len,
  2848. &(slot->control));
  2849. #elif defined(POSIX_ASYNC_IO)
  2850. slot->control.aio_lio_opcode = LIO_WRITE;
  2851. err = (ulint) aio_write(&(slot->control));
  2852. fprintf(stderr, "Starting POSIX aio write %lu\n", err);
  2853. #endif
  2854. } else {
  2855. if (!wake_later) {
  2856. os_aio_simulated_wake_handler_thread(
  2857. os_aio_get_segment_no_from_slot(
  2858. array, slot));
  2859. }
  2860. }
  2861. } else {
  2862. ut_error;
  2863. }
  2864. #ifdef WIN_ASYNC_IO
  2865. if (os_aio_use_native_aio) {
  2866. if ((ret && len == n)
  2867. || (!ret && GetLastError() == ERROR_IO_PENDING)) {
  2868. /* aio was queued successfully! */
  2869. if (mode == OS_AIO_SYNC) {
  2870. /* We want a synchronous i/o operation on a
  2871. file where we also use async i/o: in Windows
  2872. we must use the same wait mechanism as for
  2873. async i/o */
  2874. retval = os_aio_windows_handle(ULINT_UNDEFINED,
  2875. slot->pos,
  2876. &dummy_mess1,
  2877. &dummy_mess2,
  2878. &dummy_type);
  2879. return(retval);
  2880. }
  2881. return(TRUE);
  2882. }
  2883. err = 1; /* Fall through the next if */
  2884. }
  2885. #endif
  2886. if (err == 0) {
  2887. /* aio was queued successfully! */
  2888. return(TRUE);
  2889. }
  2890. os_aio_array_free_slot(array, slot);
  2891. retry = os_file_handle_error(name,
  2892. type == OS_FILE_READ
  2893. ? "aio read" : "aio write");
  2894. if (retry) {
  2895. goto try_again;
  2896. }
  2897. return(FALSE);
  2898. }
  #ifdef WIN_ASYNC_IO
  /**************************************************************************
  This function is only used in Windows asynchronous i/o.
  Waits for an aio operation to complete. The aio array of pending requests
  is divided into segments, and the calling thread specifies which segment
  or slot it wants to wait for. NOTE: this function will also take care of
  freeing the aio slot, therefore no other thread is allowed to do the
  freeing! */
  ibool
  os_aio_windows_handle(
  /*==================*/
                              /* out: TRUE if the aio operation
                              succeeded */
      ulint       segment,    /* in: the number of the segment in the aio
                              arrays to wait for; segment 0 is the ibuf
                              i/o thread, segment 1 the log i/o thread,
                              then follow the non-ibuf read threads, and as
                              the last are the non-ibuf write threads; if
                              this is ULINT_UNDEFINED, then it means that
                              sync aio is used, and this parameter is
                              ignored */
      ulint       pos,        /* this parameter is used only in sync aio:
                              wait for the aio slot at this position */
      fil_node_t**message1,   /* out: the messages passed with the aio
                              request; note that also in the case where
                              the aio operation failed, these output
                              parameters are valid and can be used to
                              restart the operation, for example */
      void**      message2,
      ulint*      type)       /* out: OS_FILE_WRITE or ..._READ */
  {
      ulint           orig_seg = segment;
      os_aio_array_t* array;
      os_aio_slot_t*  slot;
      ulint           n;
      ulint           i;
      ibool           ret_val;
      BOOL            ret;
      DWORD           len;

      if (segment != ULINT_UNDEFINED) {
          segment = os_aio_get_array_and_local_segment(&array, segment);
      } else {
          /* Sync aio: the slot is addressed directly by pos */
          array = os_aio_sync_array;
          segment = 0;
      }

      /* NOTE! We only access constant fields in os_aio_array. Therefore
      we do not have to acquire the protecting mutex yet */

      ut_ad(os_aio_validate());
      ut_ad(segment < array->n_segments);

      n = array->n_slots / array->n_segments;

      if (array != os_aio_sync_array) {
          srv_set_io_thread_op_info(orig_seg, "wait Windows aio");

          /* Wait on the events of the n slots of this segment */
          i = os_event_wait_multiple(n,
                                     (array->native_events)
                                     + segment * n);
      } else {
          os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
          i = pos;
      }

      os_mutex_enter(array->mutex);

      slot = os_aio_array_get_nth_slot(array, i + segment * n);

      ut_a(slot->reserved);

      if (orig_seg != ULINT_UNDEFINED) {
          srv_set_io_thread_op_info(orig_seg,
                                    "get windows aio return value");
      }

      ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);

      /* The messages and type are returned also when the i/o failed */
      *message1 = slot->message1;
      *message2 = slot->message2;
      *type = slot->type;

      if (!ret || len != slot->len) {
          os_file_handle_error(slot->name, "Windows aio");

          ret_val = FALSE;
      } else {
          ret_val = TRUE;

  # ifdef UNIV_DO_FLUSH
          if (slot->type == OS_FILE_WRITE
              && !os_do_not_call_flush_at_each_write) {
              ut_a(TRUE == os_file_flush(slot->file));
          }
  # endif /* UNIV_DO_FLUSH */
      }

      os_mutex_exit(array->mutex);

      os_aio_array_free_slot(array, slot);

      return(ret_val);
  }
  #endif
  #ifdef POSIX_ASYNC_IO
  /**************************************************************************
  This function is only used in Posix asynchronous i/o. Waits for the
  completion signal of an aio operation in the given array, returns the
  messages stored in the completed slot, and frees the slot. */
  ibool
  os_aio_posix_handle(
  /*================*/
                              /* out: TRUE if the aio operation
                              succeeded */
      ulint       array_no,   /* in: array number 0 - 3 */
      fil_node_t**message1,   /* out: the messages passed with the aio
                              request; note that also in the case where
                              the aio operation failed, these output
                              parameters are valid and can be used to
                              restart the operation, for example */
      void**      message2)
  {
      os_aio_array_t* array;
      os_aio_slot_t*  slot;
      siginfo_t       info;
      sigset_t        sigset;
      int             ret;

      /* The signal number of an array is SIGRTMIN + 1 + array number */
      sigemptyset(&sigset);
      sigaddset(&sigset, SIGRTMIN + 1 + array_no);

      /* NOTE(review): POSIX expects the signals passed to sigwaitinfo()
      to be blocked in the caller; unblocking here looks suspicious -
      confirm against the signal mask set up for the i/o threads */
      pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);

      /* sigwaitinfo() returns the number of the accepted signal, or -1
      on error. Check that return value: previously an uninitialized
      local variable was tested here. */
      ret = sigwaitinfo(&sigset, &info);

      if (ret < 0 || (ulint) ret != SIGRTMIN + 1 + array_no) {

          ut_error;

          return(FALSE);
      }

      fputs("Handling POSIX aio\n", stderr);

      array = os_aio_get_array_from_no(array_no);

      os_mutex_enter(array->mutex);

      /* The slot pointer was attached to the signal when the request
      was posted */
      slot = info.si_value.sival_ptr;

      ut_a(slot->reserved);

      *message1 = slot->message1;
      *message2 = slot->message2;

  # ifdef UNIV_DO_FLUSH
      if (slot->type == OS_FILE_WRITE
          && !os_do_not_call_flush_at_each_write) {
          ut_a(TRUE == os_file_flush(slot->file));
      }
  # endif /* UNIV_DO_FLUSH */

      os_mutex_exit(array->mutex);

      os_aio_array_free_slot(array, slot);

      return(TRUE);
  }
  #endif
  3045. /**************************************************************************
  3046. Does simulated aio. This function should be called by an i/o-handler
  3047. thread. */
  3048. ibool
  3049. os_aio_simulated_handle(
  3050. /*====================*/
  3051. /* out: TRUE if the aio operation succeeded */
  3052. ulint global_segment, /* in: the number of the segment in the aio
  3053. arrays to wait for; segment 0 is the ibuf
  3054. i/o thread, segment 1 the log i/o thread,
  3055. then follow the non-ibuf read threads, and as
  3056. the last are the non-ibuf write threads */
  3057. fil_node_t**message1, /* out: the messages passed with the aio
  3058. request; note that also in the case where
  3059. the aio operation failed, these output
  3060. parameters are valid and can be used to
  3061. restart the operation, for example */
  3062. void** message2,
  3063. ulint* type) /* out: OS_FILE_WRITE or ..._READ */
  3064. {
  3065. os_aio_array_t* array;
  3066. ulint segment;
  3067. os_aio_slot_t* slot;
  3068. os_aio_slot_t* slot2;
  3069. os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
  3070. ulint n_consecutive;
  3071. ulint total_len;
  3072. ulint offs;
  3073. ulint lowest_offset;
  3074. ulint biggest_age;
  3075. ulint age;
  3076. byte* combined_buf;
  3077. byte* combined_buf2;
  3078. ibool ret;
  3079. ulint n;
  3080. ulint i;
  3081. segment = os_aio_get_array_and_local_segment(&array, global_segment);
  3082. restart:
  3083. /* NOTE! We only access constant fields in os_aio_array. Therefore
  3084. we do not have to acquire the protecting mutex yet */
  3085. srv_set_io_thread_op_info(global_segment,
  3086. "looking for i/o requests (a)");
  3087. ut_ad(os_aio_validate());
  3088. ut_ad(segment < array->n_segments);
  3089. n = array->n_slots / array->n_segments;
  3090. /* Look through n slots after the segment * n'th slot */
  3091. if (array == os_aio_read_array
  3092. && os_aio_recommend_sleep_for_read_threads) {
  3093. /* Give other threads chance to add several i/os to the array
  3094. at once. */
  3095. goto recommended_sleep;
  3096. }
  3097. os_mutex_enter(array->mutex);
  3098. srv_set_io_thread_op_info(global_segment,
  3099. "looking for i/o requests (b)");
  3100. /* Check if there is a slot for which the i/o has already been
  3101. done */
  3102. for (i = 0; i < n; i++) {
  3103. slot = os_aio_array_get_nth_slot(array, i + segment * n);
  3104. if (slot->reserved && slot->io_already_done) {
  3105. if (os_aio_print_debug) {
  3106. fprintf(stderr,
  3107. "InnoDB: i/o for slot %lu"
  3108. " already done, returning\n",
  3109. (ulong) i);
  3110. }
  3111. ret = TRUE;
  3112. goto slot_io_done;
  3113. }
  3114. }
  3115. n_consecutive = 0;
  3116. /* If there are at least 2 seconds old requests, then pick the oldest
  3117. one to prevent starvation. If several requests have the same age,
  3118. then pick the one at the lowest offset. */
  3119. biggest_age = 0;
  3120. lowest_offset = ULINT_MAX;
  3121. for (i = 0; i < n; i++) {
  3122. slot = os_aio_array_get_nth_slot(array, i + segment * n);
  3123. if (slot->reserved) {
  3124. age = (ulint)difftime(time(NULL),
  3125. slot->reservation_time);
  3126. if ((age >= 2 && age > biggest_age)
  3127. || (age >= 2 && age == biggest_age
  3128. && slot->offset < lowest_offset)) {
  3129. /* Found an i/o request */
  3130. consecutive_ios[0] = slot;
  3131. n_consecutive = 1;
  3132. biggest_age = age;
  3133. lowest_offset = slot->offset;
  3134. }
  3135. }
  3136. }
  3137. if (n_consecutive == 0) {
  3138. /* There were no old requests. Look for an i/o request at the
  3139. lowest offset in the array (we ignore the high 32 bits of the
  3140. offset in these heuristics) */
  3141. lowest_offset = ULINT_MAX;
  3142. for (i = 0; i < n; i++) {
  3143. slot = os_aio_array_get_nth_slot(array,
  3144. i + segment * n);
  3145. if (slot->reserved && slot->offset < lowest_offset) {
  3146. /* Found an i/o request */
  3147. consecutive_ios[0] = slot;
  3148. n_consecutive = 1;
  3149. lowest_offset = slot->offset;
  3150. }
  3151. }
  3152. }
  3153. if (n_consecutive == 0) {
  3154. /* No i/o requested at the moment */
  3155. goto wait_for_io;
  3156. }
  3157. slot = consecutive_ios[0];
  3158. /* Check if there are several consecutive blocks to read or write */
  3159. consecutive_loop:
  3160. for (i = 0; i < n; i++) {
  3161. slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
  3162. if (slot2->reserved && slot2 != slot
  3163. && slot2->offset == slot->offset + slot->len
  3164. /* check that sum does not wrap over */
  3165. && slot->offset + slot->len > slot->offset
  3166. && slot2->offset_high == slot->offset_high
  3167. && slot2->type == slot->type
  3168. && slot2->file == slot->file) {
  3169. /* Found a consecutive i/o request */
  3170. consecutive_ios[n_consecutive] = slot2;
  3171. n_consecutive++;
  3172. slot = slot2;
  3173. if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
  3174. goto consecutive_loop;
  3175. } else {
  3176. break;
  3177. }
  3178. }
  3179. }
  3180. srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
  3181. /* We have now collected n_consecutive i/o requests in the array;
  3182. allocate a single buffer which can hold all data, and perform the
  3183. i/o */
  3184. total_len = 0;
  3185. slot = consecutive_ios[0];
  3186. for (i = 0; i < n_consecutive; i++) {
  3187. total_len += consecutive_ios[i]->len;
  3188. }
  3189. if (n_consecutive == 1) {
  3190. /* We can use the buffer of the i/o request */
  3191. combined_buf = slot->buf;
  3192. combined_buf2 = NULL;
  3193. } else {
  3194. combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
  3195. ut_a(combined_buf2);
  3196. combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
  3197. }
  3198. /* We release the array mutex for the time of the i/o: NOTE that
  3199. this assumes that there is just one i/o-handler thread serving
  3200. a single segment of slots! */
  3201. os_mutex_exit(array->mutex);
  3202. if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
  3203. /* Copy the buffers to the combined buffer */
  3204. offs = 0;
  3205. for (i = 0; i < n_consecutive; i++) {
  3206. ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
  3207. consecutive_ios[i]->len);
  3208. offs += consecutive_ios[i]->len;
  3209. }
  3210. }
  3211. srv_set_io_thread_op_info(global_segment, "doing file i/o");
  3212. if (os_aio_print_debug) {
  3213. fprintf(stderr,
  3214. "InnoDB: doing i/o of type %lu at offset %lu %lu,"
  3215. " length %lu\n",
  3216. (ulong) slot->type, (ulong) slot->offset_high,
  3217. (ulong) slot->offset, (ulong) total_len);
  3218. }
  3219. /* Do the i/o with ordinary, synchronous i/o functions: */
  3220. if (slot->type == OS_FILE_WRITE) {
  3221. ret = os_file_write(slot->name, slot->file, combined_buf,
  3222. slot->offset, slot->offset_high,
  3223. total_len);
  3224. } else {
  3225. ret = os_file_read(slot->file, combined_buf,
  3226. slot->offset, slot->offset_high, total_len);
  3227. }
  3228. ut_a(ret);
  3229. srv_set_io_thread_op_info(global_segment, "file i/o done");
  3230. #if 0
  3231. fprintf(stderr,
  3232. "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
  3233. n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
  3234. #endif
  3235. if (slot->type == OS_FILE_READ && n_consecutive > 1) {
  3236. /* Copy the combined buffer to individual buffers */
  3237. offs = 0;
  3238. for (i = 0; i < n_consecutive; i++) {
  3239. ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
  3240. consecutive_ios[i]->len);
  3241. offs += consecutive_ios[i]->len;
  3242. }
  3243. }
  3244. if (combined_buf2) {
  3245. ut_free(combined_buf2);
  3246. }
  3247. os_mutex_enter(array->mutex);
  3248. /* Mark the i/os done in slots */
  3249. for (i = 0; i < n_consecutive; i++) {
  3250. consecutive_ios[i]->io_already_done = TRUE;
  3251. }
  3252. /* We return the messages for the first slot now, and if there were
  3253. several slots, the messages will be returned with subsequent calls
  3254. of this function */
  3255. slot_io_done:
  3256. ut_a(slot->reserved);
  3257. *message1 = slot->message1;
  3258. *message2 = slot->message2;
  3259. *type = slot->type;
  3260. os_mutex_exit(array->mutex);
  3261. os_aio_array_free_slot(array, slot);
  3262. return(ret);
  3263. wait_for_io:
  3264. srv_set_io_thread_op_info(global_segment, "resetting wait event");
  3265. /* We wait here until there again can be i/os in the segment
  3266. of this thread */
  3267. os_event_reset(os_aio_segment_wait_events[global_segment]);
  3268. os_mutex_exit(array->mutex);
  3269. recommended_sleep:
  3270. srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
  3271. os_event_wait(os_aio_segment_wait_events[global_segment]);
  3272. if (os_aio_print_debug) {
  3273. fprintf(stderr,
  3274. "InnoDB: i/o handler thread for i/o"
  3275. " segment %lu wakes up\n",
  3276. (ulong) global_segment);
  3277. }
  3278. goto restart;
  3279. }
  3280. /**************************************************************************
  3281. Validates the consistency of an aio array. */
  3282. static
  3283. ibool
  3284. os_aio_array_validate(
  3285. /*==================*/
  3286. /* out: TRUE if ok */
  3287. os_aio_array_t* array) /* in: aio wait array */
  3288. {
  3289. os_aio_slot_t* slot;
  3290. ulint n_reserved = 0;
  3291. ulint i;
  3292. ut_a(array);
  3293. os_mutex_enter(array->mutex);
  3294. ut_a(array->n_slots > 0);
  3295. ut_a(array->n_segments > 0);
  3296. for (i = 0; i < array->n_slots; i++) {
  3297. slot = os_aio_array_get_nth_slot(array, i);
  3298. if (slot->reserved) {
  3299. n_reserved++;
  3300. ut_a(slot->len > 0);
  3301. }
  3302. }
  3303. ut_a(array->n_reserved == n_reserved);
  3304. os_mutex_exit(array->mutex);
  3305. return(TRUE);
  3306. }
  3307. /**************************************************************************
  3308. Validates the consistency the aio system. */
  3309. ibool
  3310. os_aio_validate(void)
  3311. /*=================*/
  3312. /* out: TRUE if ok */
  3313. {
  3314. os_aio_array_validate(os_aio_read_array);
  3315. os_aio_array_validate(os_aio_write_array);
  3316. os_aio_array_validate(os_aio_ibuf_array);
  3317. os_aio_array_validate(os_aio_log_array);
  3318. os_aio_array_validate(os_aio_sync_array);
  3319. return(TRUE);
  3320. }
  3321. /**************************************************************************
  3322. Prints info of the aio arrays. */
  3323. void
  3324. os_aio_print(
  3325. /*=========*/
  3326. FILE* file) /* in: file where to print */
  3327. {
  3328. os_aio_array_t* array;
  3329. os_aio_slot_t* slot;
  3330. ulint n_reserved;
  3331. time_t current_time;
  3332. double time_elapsed;
  3333. double avg_bytes_read;
  3334. ulint i;
  3335. for (i = 0; i < srv_n_file_io_threads; i++) {
  3336. fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
  3337. srv_io_thread_op_info[i],
  3338. srv_io_thread_function[i]);
  3339. #ifndef __WIN__
  3340. if (os_aio_segment_wait_events[i]->is_set) {
  3341. fprintf(file, " ev set");
  3342. }
  3343. #endif
  3344. fprintf(file, "\n");
  3345. }
  3346. fputs("Pending normal aio reads:", file);
  3347. array = os_aio_read_array;
  3348. loop:
  3349. ut_a(array);
  3350. os_mutex_enter(array->mutex);
  3351. ut_a(array->n_slots > 0);
  3352. ut_a(array->n_segments > 0);
  3353. n_reserved = 0;
  3354. for (i = 0; i < array->n_slots; i++) {
  3355. slot = os_aio_array_get_nth_slot(array, i);
  3356. if (slot->reserved) {
  3357. n_reserved++;
  3358. #if 0
  3359. fprintf(stderr, "Reserved slot, messages %p %p\n",
  3360. (void*) slot->message1,
  3361. (void*) slot->message2);
  3362. #endif
  3363. ut_a(slot->len > 0);
  3364. }
  3365. }
  3366. ut_a(array->n_reserved == n_reserved);
  3367. fprintf(file, " %lu", (ulong) n_reserved);
  3368. os_mutex_exit(array->mutex);
  3369. if (array == os_aio_read_array) {
  3370. fputs(", aio writes:", file);
  3371. array = os_aio_write_array;
  3372. goto loop;
  3373. }
  3374. if (array == os_aio_write_array) {
  3375. fputs(",\n ibuf aio reads:", file);
  3376. array = os_aio_ibuf_array;
  3377. goto loop;
  3378. }
  3379. if (array == os_aio_ibuf_array) {
  3380. fputs(", log i/o's:", file);
  3381. array = os_aio_log_array;
  3382. goto loop;
  3383. }
  3384. if (array == os_aio_log_array) {
  3385. fputs(", sync i/o's:", file);
  3386. array = os_aio_sync_array;
  3387. goto loop;
  3388. }
  3389. putc('\n', file);
  3390. current_time = time(NULL);
  3391. time_elapsed = 0.001 + difftime(current_time, os_last_printout);
  3392. fprintf(file,
  3393. "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
  3394. "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
  3395. (ulong) fil_n_pending_log_flushes,
  3396. (ulong) fil_n_pending_tablespace_flushes,
  3397. (ulong) os_n_file_reads, (ulong) os_n_file_writes,
  3398. (ulong) os_n_fsyncs);
  3399. if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
  3400. fprintf(file,
  3401. "%lu pending preads, %lu pending pwrites\n",
  3402. (ulong) os_file_n_pending_preads,
  3403. (ulong) os_file_n_pending_pwrites);
  3404. }
  3405. if (os_n_file_reads == os_n_file_reads_old) {
  3406. avg_bytes_read = 0.0;
  3407. } else {
  3408. avg_bytes_read = (double) os_bytes_read_since_printout
  3409. / (os_n_file_reads - os_n_file_reads_old);
  3410. }
  3411. fprintf(file,
  3412. "%.2f reads/s, %lu avg bytes/read,"
  3413. " %.2f writes/s, %.2f fsyncs/s\n",
  3414. (os_n_file_reads - os_n_file_reads_old)
  3415. / time_elapsed,
  3416. (ulong)avg_bytes_read,
  3417. (os_n_file_writes - os_n_file_writes_old)
  3418. / time_elapsed,
  3419. (os_n_fsyncs - os_n_fsyncs_old)
  3420. / time_elapsed);
  3421. os_n_file_reads_old = os_n_file_reads;
  3422. os_n_file_writes_old = os_n_file_writes;
  3423. os_n_fsyncs_old = os_n_fsyncs;
  3424. os_bytes_read_since_printout = 0;
  3425. os_last_printout = current_time;
  3426. }
  3427. /**************************************************************************
  3428. Refreshes the statistics used to print per-second averages. */
  3429. void
  3430. os_aio_refresh_stats(void)
  3431. /*======================*/
  3432. {
  3433. os_n_file_reads_old = os_n_file_reads;
  3434. os_n_file_writes_old = os_n_file_writes;
  3435. os_n_fsyncs_old = os_n_fsyncs;
  3436. os_bytes_read_since_printout = 0;
  3437. os_last_printout = time(NULL);
  3438. }
  #ifdef UNIV_DEBUG
  /**************************************************************************
  Checks that no slot is reserved in any of the aio arrays (read, write,
  ibuf, log and sync), that is, there are no pending io operations. The
  reserved count of each array is read while holding that array's mutex. */
  ibool
  os_aio_all_slots_free(void)
  /*=======================*/
                  /* out: TRUE if all free */
  {
      os_aio_array_t* arrays[5];
      ulint           n_arrays = sizeof(arrays) / sizeof(arrays[0]);
      ulint           total_reserved = 0;
      ulint           i;

      /* Visit the arrays in the same order as the aio printout does. */
      arrays[0] = os_aio_read_array;
      arrays[1] = os_aio_write_array;
      arrays[2] = os_aio_ibuf_array;
      arrays[3] = os_aio_log_array;
      arrays[4] = os_aio_sync_array;

      for (i = 0; i < n_arrays; i++) {
          /* Only one array mutex is held at a time. */
          os_mutex_enter(arrays[i]->mutex);

          total_reserved += arrays[i]->n_reserved;

          os_mutex_exit(arrays[i]->mutex);
      }

      if (total_reserved != 0) {

          return(FALSE);
      }

      return(TRUE);
  }
  #endif /* UNIV_DEBUG */