/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef FT_INTERNAL_H
#define FT_INTERNAL_H

#ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

#include <config.h>

// Symbol TOKUDB_REVISION is not defined by fractal-tree makefiles, so
// BUILD_ID of 1000 indicates development build of main, not a release build.
#if defined(TOKUDB_REVISION)
#define BUILD_ID TOKUDB_REVISION
#else
#error
#endif

#include "ft_layout_version.h"
#include "toku_assert.h"
#include "block_allocator.h"
#include "cachetable.h"
#include "fifo.h"
#include "ft-ops.h"
#include "toku_list.h"
#include "omt.h"
#include "leafentry.h"
#include "block_table.h"
#include "mempool.h"
#include "compress.h"
#include "omt-tmpl.h"

// Uncomment the following to use quicklz

#ifndef FT_FANOUT
#define FT_FANOUT 16
#endif
enum { TREE_FANOUT = FT_FANOUT };
enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */
enum { FT_CMD_OVERHEAD = (2 + sizeof(MSN))  // the type plus freshness plus MSN
};
enum { FT_DEFAULT_NODE_SIZE = 1 << 22 };
enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 };

struct nodeheader_in_file {
    int n_in_buffer;
};
enum { BUFFER_HEADER_SIZE = (4                 // height
                             + 4               // n_children
                             + TREE_FANOUT * 8 // children
                             ) };

//
// Field in ftnode_fetch_extra that tells the
// partial fetch callback what piece of the node
// is needed by the ydb
//
enum ftnode_fetch_type {
    ftnode_fetch_none = 1, // no partitions needed.
    ftnode_fetch_subset,   // some subset of partitions needed
    ftnode_fetch_prefetch, // this is part of a prefetch call
    ftnode_fetch_all       // every partition is needed
};

//
// An extra parameter passed to cachetable functions
// that is used in all types of fetch callbacks.
// The contents help the partial fetch and fetch
// callbacks retrieve the pieces of a node necessary
// for the ensuing operation (flush, query, ...)
//
struct ftnode_fetch_extra {
    enum ftnode_fetch_type type;
    // needed for reading a node off disk
    FT h;
    // used in the case where type == ftnode_fetch_subset
    // parameters needed to find out which child needs to be decompressed (so it can be read)
    ft_search_t* search;
    DBT *range_lock_left_key, *range_lock_right_key;
    bool left_is_neg_infty, right_is_pos_infty;
    // states whether we should try to aggressively fetch basement nodes
    // that are not specifically needed for the current query,
    // but may be needed for other cursor operations the user is doing.
    // For example, if we have not disabled prefetching,
    // and the user is doing a dictionary-wide scan, then
    // even though a query may only want one basement node,
    // we fetch all basement nodes in a leaf node.
    bool disable_prefetching;
    // this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback
    // these callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it
    int child_to_read;
};

struct toku_fifo_entry_key_msn_heaviside_extra {
    DESCRIPTOR desc;
    ft_compare_func cmp;
    FIFO fifo;
    const DBT *key;
    MSN msn;
};

// comparison function for inserting messages into a
// ftnode_nonleaf_childinfo's message_tree
int
toku_fifo_entry_key_msn_heaviside(const int32_t &v, const struct toku_fifo_entry_key_msn_heaviside_extra &extra);

struct toku_fifo_entry_key_msn_cmp_extra {
    DESCRIPTOR desc;
    ft_compare_func cmp;
    FIFO fifo;
};

// same thing for qsort_r
int
toku_fifo_entry_key_msn_cmp(const struct toku_fifo_entry_key_msn_cmp_extra &extrap, const int &a, const int &b);

typedef toku::omt<int32_t> off_omt_t;
typedef toku::omt<int32_t, int32_t, true> marked_off_omt_t;

// data of an available partition of a nonleaf ftnode
struct ftnode_nonleaf_childinfo {
    FIFO buffer;
    off_omt_t broadcast_list;
    marked_off_omt_t fresh_message_tree;
    off_omt_t stale_message_tree;
};

unsigned int toku_bnc_nbytesinbuf(NONLEAF_CHILDINFO bnc);
int toku_bnc_n_entries(NONLEAF_CHILDINFO bnc);
long toku_bnc_memory_size(NONLEAF_CHILDINFO bnc);
long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc);
int toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, DESCRIPTOR desc, ft_compare_func cmp);
void toku_bnc_empty(NONLEAF_CHILDINFO bnc);
int toku_bnc_flush_to_child(
    FT h,
    NONLEAF_CHILDINFO bnc,
    FTNODE child
    );
bool
toku_ft_nonleaf_is_gorged(FTNODE node);

enum reactivity get_nonleaf_reactivity (FTNODE node);
enum reactivity get_node_reactivity (FTNODE node);

// data of an available partition of a leaf ftnode
struct ftnode_leaf_basement_node {
    OMT buffer;                     // pointers to individual leaf entries
    struct mempool buffer_mempool;  // storage for all leaf entries
    unsigned int n_bytes_in_buffer; // how many bytes it takes to represent the OMT (including the per-key overheads,
                                    // but not including the overheads for the node)
    unsigned int seqinsert;         // number of sequential inserts to this leaf
    MSN max_msn_applied;            // max message sequence number applied
    bool stale_ancestor_messages_applied;
    STAT64INFO_S stat64_delta;      // change in stat64 counters since basement was last written to disk
};

enum __attribute__((__packed__)) pt_state {  // declare this to be packed so that when used below it will only take 1 byte.
    PT_INVALID = 0,
    PT_ON_DISK = 1,
    PT_COMPRESSED = 2,
    PT_AVAIL = 3
};

enum __attribute__((__packed__)) ftnode_child_tag {
    BCT_INVALID = 0,
    BCT_NULL,
    BCT_SUBBLOCK,
    BCT_LEAF,
    BCT_NONLEAF
};

typedef struct __attribute__((__packed__)) ftnode_child_pointer {
    enum ftnode_child_tag tag;
    union {
        struct sub_block *subblock;
        struct ftnode_nonleaf_childinfo *nonleaf;
        struct ftnode_leaf_basement_node *leaf;
    } u;
} FTNODE_CHILD_POINTER;

struct ftnode_disk_data {
    //
    // stores the offset to the beginning of the partition on disk from the ftnode, and the length, needed to read a partition off of disk
    // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
    // The START is the distance from the end of the compressed node_info data to the beginning of the compressed partition.
    // The SIZE is the size of the compressed partition.
    // Rationale: We cannot store the size from the beginning of the node since we don't know how big the header will be.
    // However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align.
    uint32_t start;
    uint32_t size;
};
#define BP_START(node_dd,i) ((node_dd)[i].start)
#define BP_SIZE(node_dd,i) ((node_dd)[i].size)

// a ftnode partition, associated with a child of a node
struct __attribute__((__packed__)) ftnode_partition {
    // the following three variables are used for nonleaf nodes
    // for leaf nodes, they are meaningless
    BLOCKNUM blocknum; // blocknum of child
    //
    // at any time, the partition may be in one of the following states (stored in pt_state):
    //   PT_INVALID - means that the partition was just initialized
    //   PT_ON_DISK - means that the partition is not in memory and needs to be read from disk. To use, must read off disk and decompress
    //   PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress
    //   PT_AVAIL - means the partition is decompressed and in memory
    //
    enum pt_state state; // make this an enum to make debugging easier.
    //
    // pointer to the partition. Depending on the state, it may point to different things:
    //   if state == PT_INVALID, then the node was just initialized and ptr == NULL
    //   if state == PT_ON_DISK, then ptr == NULL
    //   if state == PT_COMPRESSED, then ptr points to a struct sub_block
    //   if state == PT_AVAIL, then ptr is:
    //     a struct ftnode_nonleaf_childinfo for internal nodes,
    //     a struct ftnode_leaf_basement_node for leaf nodes
    //
    struct ftnode_child_pointer ptr;
    // clock count used by pe_callback to determine if a node should be evicted or not
    // for now, saturating the count at 1
    uint8_t clock_count;
    // How many bytes worth of work was performed by messages in each buffer.
    uint64_t workdone;
};
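
// Editor's note: an illustrative sketch (not part of the original header) of how a
// caller can react to a partition's pt_state before using partition i.  The control
// flow here is hypothetical; the real fetch/decompress paths live in the
// serialization code and cachetable callbacks declared later in this header.
//
//     switch (BP_STATE(node, i)) {
//     case PT_AVAIL:      break;  // decompressed and in memory; BNC()/BLB() may be used
//     case PT_COMPRESSED: /* decompress, e.g. via toku_deserialize_bp_from_compressed() */ break;
//     case PT_ON_DISK:    /* read and decompress, e.g. via toku_deserialize_bp_from_disk() */ break;
//     case PT_INVALID:    /* freshly initialized; nothing to use yet */ break;
//     }
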
struct ftnode {
    MSN max_msn_applied_to_node_on_disk; // max_msn_applied that will be written to disk
    unsigned int nodesize;
    unsigned int flags;
    BLOCKNUM thisnodename;   // Which block number is this node?
    int layout_version;      // What version of the data structure?
    int layout_version_original;  // different (<) from layout_version if upgraded from a previous version (useful for debugging)
    int layout_version_read_from_disk;  // transient, not serialized to disk (useful for debugging)
    uint32_t build_id;       // build_id (svn rev number) of software that wrote this node to disk
    int height;  /* height is always >= 0.  0 for leaf, >0 for nonleaf. */
    int dirty;
    uint32_t fullhash;
    int n_children;  // for internal nodes, if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced.
                     // for leaf nodes, represents number of basement nodes
    unsigned int totalchildkeylens;
    DBT *childkeys;  /* Pivot keys.  Child 0's keys are <= childkeys[0].  Child 1's keys are <= childkeys[1].
                        Child 1's keys are > childkeys[0]. */
    // array of size n_children, consisting of ftnode partitions
    // each one is associated with a child
    // for internal nodes, the ith partition corresponds to the ith message buffer
    // for leaf nodes, the ith partition corresponds to the ith basement node
    struct ftnode_partition *bp;
};
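
// Editor's worked example (illustrative, not from the original source): a node with
// n_children == 3 has two pivots.  If childkeys[0] == "m" and childkeys[1] == "t",
// then child 0 covers keys <= "m", child 1 covers keys in ("m", "t"], and child 2
// covers keys > "t".  bp[] has one entry per child: message buffers when height > 0,
// basement nodes when height == 0.
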
// ftnode partition macros
// BP stands for ftnode_partition
#define BP_BLOCKNUM(node,i) ((node)->bp[i].blocknum)
#define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_WORKDONE(node, i) ((node)->bp[i].workdone)

//
// macros for managing a node's clock
// Should be managed by ft-ops.c, NOT by serialize/deserialize
//
#define BP_TOUCH_CLOCK(node, i) ((node)->bp[i].clock_count = 1)
#define BP_SWEEP_CLOCK(node, i) ((node)->bp[i].clock_count = 0)
#define BP_SHOULD_EVICT(node, i) ((node)->bp[i].clock_count == 0)
// not crazy about having these two here, one is for the case where we create new
// nodes, such as in splits and creating new roots, and the other is for when
// we are deserializing a node and not all bp's are touched
#define BP_INIT_TOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 1)
#define BP_INIT_UNTOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 0)
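
// Editor's note: an illustrative sketch (not part of the original header) of the
// one-bit CLOCK scheme these macros support.  Readers touch a partition's clock when
// they use it; a partial-eviction pass can then treat an untouched partition as an
// eviction candidate on its next visit, roughly:
//
//     if (BP_STATE(node, i) == PT_AVAIL) {
//         if (BP_SHOULD_EVICT(node, i)) {
//             // partition i was not touched since the last sweep: evict or compress it
//         } else {
//             BP_SWEEP_CLOCK(node, i);  // give it one more round before eviction
//         }
//     }
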
// internal node macros
static inline void set_BNULL(FTNODE node, int i) {
    assert(0 <= i && i < node->n_children);
    node->bp[i].ptr.tag = BCT_NULL;
}
static inline bool is_BNULL (FTNODE node, int i) {
    assert(0 <= i && i < node->n_children);
    return node->bp[i].ptr.tag == BCT_NULL;
}
static inline NONLEAF_CHILDINFO BNC(FTNODE node, int i) {
    assert(0 <= i && i < node->n_children);
    FTNODE_CHILD_POINTER p = node->bp[i].ptr;
    assert(p.tag == BCT_NONLEAF);
    return p.u.nonleaf;
}
static inline void set_BNC(FTNODE node, int i, NONLEAF_CHILDINFO nl) {
    assert(0 <= i && i < node->n_children);
    FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
    p->tag = BCT_NONLEAF;
    p->u.nonleaf = nl;
}
static inline BASEMENTNODE BLB(FTNODE node, int i) {
    assert(0 <= i && i < node->n_children);
    FTNODE_CHILD_POINTER p = node->bp[i].ptr;
    assert(p.tag == BCT_LEAF);
    return p.u.leaf;
}
static inline void set_BLB(FTNODE node, int i, BASEMENTNODE bn) {
    assert(0 <= i && i < node->n_children);
    FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
    p->tag = BCT_LEAF;
    p->u.leaf = bn;
}
static inline SUB_BLOCK BSB(FTNODE node, int i) {
    assert(0 <= i && i < node->n_children);
    FTNODE_CHILD_POINTER p = node->bp[i].ptr;
    assert(p.tag == BCT_SUBBLOCK);
    return p.u.subblock;
}
static inline void set_BSB(FTNODE node, int i, SUB_BLOCK sb) {
    assert(0 <= i && i < node->n_children);
    FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
    p->tag = BCT_SUBBLOCK;
    p->u.subblock = sb;
}
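
// Editor's example (illustrative, not part of the original header): the accessors
// above assert the tag, so callers pick the accessor from the node's height once the
// partition is PT_AVAIL:
//
//     if (node->height > 0) {
//         NONLEAF_CHILDINFO bnc = BNC(node, i);  // message buffer for child i
//         long sz = toku_bnc_memory_size(bnc);
//     } else {
//         BASEMENTNODE bn = BLB(node, i);        // ith basement node
//         unsigned int nbytes = bn->n_bytes_in_buffer;
//     }
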
// ftnode leaf basementnode macros
#define BLB_MAX_MSN_APPLIED(node,i) (BLB(node,i)->max_msn_applied)
#define BLB_MAX_DSN_APPLIED(node,i) (BLB(node,i)->max_dsn_applied)
#define BLB_BUFFER(node,i) (BLB(node,i)->buffer)
#define BLB_BUFFER_MEMPOOL(node,i) (BLB(node,i)->buffer_mempool)
#define BLB_NBYTESINBUF(node,i) (BLB(node,i)->n_bytes_in_buffer)
#define BLB_SEQINSERT(node,i) (BLB(node,i)->seqinsert)

/* pivot flags (must fit in 8 bits) */
enum {
    FT_PIVOT_TRUNC = 4,
    FT_PIVOT_FRONT_COMPRESS = 8,
};

uint32_t compute_child_fullhash (CACHEFILE cf, FTNODE node, int childnum);

// The brt_header is not managed by the cachetable. Instead, it hangs off the cachefile as userdata.
enum ft_type { FT_CURRENT = 1, FT_CHECKPOINT_INPROGRESS };

struct ft_header {
    enum ft_type type;
    int dirty;
    // Free-running counter incremented once per checkpoint (toggling LSB).
    // The LSB indicates which header location is used on disk, so this
    // counter is effectively a boolean which alternates with each checkpoint.
    uint64_t checkpoint_count;
    // LSN of creation of "checkpoint-begin" record in log.
    LSN checkpoint_lsn;
    // see ft_layout_version.h.  maybe don't need this if we assume
    // it's always the current version after deserializing
    const int layout_version;
    // different (<) from layout_version if upgraded from a previous
    // version (useful for debugging)
    const int layout_version_original;
    // build_id (svn rev number) of software that wrote this node to
    // disk. (read from disk, overwritten when written to disk, I
    // think).
    const uint32_t build_id;
    // build_id of software that created this tree
    const uint32_t build_id_original;
    // time this tree was created
    const uint64_t time_of_creation;
    // and the root transaction id that created it
    TXNID root_xid_that_created;
    // last time this header was serialized to disk (read from disk,
    // overwritten when written to disk)
    uint64_t time_of_last_modification;
    // last time that this tree was verified
    uint64_t time_of_last_verification;
    // this field is protected by tree_lock, see comment for tree_lock
    BLOCKNUM root_blocknum;
    const unsigned int flags;
    // protected by toku_ft_lock
    unsigned int nodesize;
    unsigned int basementnodesize;
    enum toku_compression_method compression_method;
    // Current Minimum MSN to be used when upgrading pre-MSN BRTs.
    // This is decremented from our current MIN_MSN so as not to clash
    // with any existing 'normal' MSNs.
    MSN highest_unused_msn_for_upgrade;
    // last time that a hot optimize operation was begun
    uint64_t time_of_last_optimize_begin;
    // last time that a hot optimize operation was successfully completed
    uint64_t time_of_last_optimize_end;
    // the number of hot optimize operations currently in progress on this tree
    uint32_t count_of_optimize_in_progress;
    // the number of hot optimize operations in progress on this tree at the time of the last crash (this field is in-memory only)
    uint32_t count_of_optimize_in_progress_read_from_disk;
    // all messages before this msn have been applied to leaf nodes
    MSN msn_at_start_of_last_completed_optimize;
    STAT64INFO_S on_disk_stats;
};

// brt_header is always the current version.
struct ft {
    FT_HEADER h;
    FT_HEADER checkpoint_header;

    // These are (mostly) read-only.
    CACHEFILE cf;
    // unique id for dictionary
    DICTIONARY_ID dict_id;
    ft_compare_func compare_fun;
    ft_update_func update_fun;

    // protected by locktree
    DESCRIPTOR_S descriptor;
    // protected by locktree and user. User
    // makes sure this is only changed
    // when no activity on tree
    DESCRIPTOR_S cmp_descriptor;

    // These are not read-only:

    // lock used by a thread to pin the root node to start a descent into
    // the tree. This lock protects the blocknum of the root node (root_blocknum). Any
    // thread that wants to descend down the tree starting at the root
    // must grab this lock before pinning the root.
    toku_mutex_t tree_lock;

    // protected by blocktable lock
    BLOCK_TABLE blocktable;

    // protected by atomic builtins
    STAT64INFO_S in_memory_stats;

    // transient, not serialized to disk.  updated when we do write to
    // disk.  tells us whether we can do partial eviction (we can't if
    // the on-disk layout version is from before basement nodes)
    int layout_version_read_from_disk;

    // If a transaction created this BRT, which one?
    // If a transaction locked the BRT when it was empty, which transaction? (Only the latest one matters.)
    // 0 if no such transaction.
    // Only one thread can write to these at once; this is enforced by
    // the lock tree.
    TXNID txnid_that_created_or_locked_when_empty;
    TXNID txnid_that_suppressed_recovery_logs;

    // Logically the reference count is zero if live_ft_handles is empty, num_txns is 0, and pinned_by_checkpoint is false.
    // ft_ref_lock protects modifying live_ft_handles, num_txns, and pinned_by_checkpoint.
    toku_mutex_t ft_ref_lock;
    struct toku_list live_ft_handles;
    // Number of transactions that are using this FT. You should only be able
    // to modify this if you have a valid handle in live_ft_handles.
    uint32_t num_txns;
    // A checkpoint is running. If true, then keep this header around for the checkpoint, like a transaction.
    bool pinned_by_checkpoint;

    // If nonzero there was a write error. Don't write any more, because it probably only gets worse. This is the error code.
    int panic;
    // A malloced string that can indicate what went wrong.
    char *panic_string;
};

// Copy the descriptor into a temporary variable, and tell DRD that subsequent code happens after reading that pointer.
// In combination with the annotation in toku_ft_update_descriptor, this seems to be enough to convince test_4015 that all is well.
// Otherwise, DRD complains that the newly malloc'd descriptor string is touched later by some comparison operation.
static inline void setup_fake_db (DB *fake_db, DESCRIPTOR orig_desc) {
    memset(fake_db, 0, sizeof *fake_db);
    fake_db->cmp_descriptor = orig_desc;
}
#define FAKE_DB(db, desc) struct __toku_db db; setup_fake_db(&db, (desc))
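
// Editor's example (illustrative, not part of the original header): FAKE_DB builds a
// zeroed, stack-allocated DB whose only meaningful field is cmp_descriptor, so a user
// comparator that expects a DB* can be invoked below the ydb layer.  This assumes the
// usual ft_compare_func signature of (DB *, const DBT *, const DBT *), which is
// declared elsewhere:
//
//     FAKE_DB(db, &ft->cmp_descriptor);
//     int c = ft->compare_fun(&db, key_a, key_b);
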
struct ft_options {
    unsigned int nodesize;
    unsigned int basementnodesize;
    enum toku_compression_method compression_method;
    unsigned int flags;
    ft_compare_func compare_fun;
    ft_update_func update_fun;
};

struct ft_handle {
    // The fractal tree.
    FT ft;

    on_redirect_callback redirect_callback;
    void *redirect_callback_extra;
    struct toku_list live_ft_handle_link;
    bool did_set_flags;

    struct ft_options options;
};

// FIXME needs toku prefix
long ftnode_memory_size (FTNODE node);
PAIR_ATTR make_ftnode_pair_attr(FTNODE node);
PAIR_ATTR make_invalid_pair_attr(void);

/* serialization code */
void
toku_create_compressed_partition_from_available(
    FTNODE node,
    int childnum,
    enum toku_compression_method compression_method,
    SUB_BLOCK sb
    );
void rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize);
int toku_serialize_ftnode_to_memory (FTNODE node,
                                     FTNODE_DISK_DATA* ndd,
                                     unsigned int basementnodesize,
                                     enum toku_compression_method compression_method,
                                     bool do_rebalancing,
                                     bool in_parallel,
                             /*out*/ size_t *n_bytes_to_write,
                             /*out*/ char **bytes_to_write);
int toku_serialize_ftnode_to(int fd, BLOCKNUM, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT h, bool for_checkpoint);
int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log,
                                    FT h, bool for_checkpoint);
int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, uint32_t fullhash, ROLLBACK_LOG_NODE *logp, FT h);
int toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, int fd, struct ftnode_fetch_extra* bfe);
int toku_deserialize_bp_from_compressed(FTNODE node, int childnum, DESCRIPTOR desc, ft_compare_func cmp);
int toku_deserialize_ftnode_from (int fd, BLOCKNUM off, uint32_t /*fullhash*/, FTNODE *ftnode, FTNODE_DISK_DATA* ndd, struct ftnode_fetch_extra* bfe);

// <CER> For verifying old, non-upgraded nodes (versions 13 and 14).
int
decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum);
//

//////////////// <CER> TODO: Move these function declarations
int
deserialize_ft_from_fd_into_rbuf(int fd,
                                 toku_off_t offset_of_header,
                                 struct rbuf *rb,
                                 uint64_t *checkpoint_count,
                                 LSN *checkpoint_lsn,
                                 uint32_t *version_p);
int
deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version);
int
read_block_from_fd_into_rbuf(
    int fd,
    BLOCKNUM blocknum,
    FT h,
    struct rbuf *rb
    );
int
read_compressed_sub_block(struct rbuf *rb, struct sub_block *sb);
int
verify_ftnode_sub_block (struct sub_block *sb);
void
just_decompress_sub_block(struct sub_block *sb);

/* Beginning of ft-node-deserialize.c helper functions. */
void initialize_ftnode(FTNODE node, BLOCKNUM blocknum);
int read_and_check_magic(struct rbuf *rb);
int read_and_check_version(FTNODE node, struct rbuf *rb);
void read_node_info(FTNODE node, struct rbuf *rb, int version);
void allocate_and_read_partition_offsets(FTNODE node, struct rbuf *rb, FTNODE_DISK_DATA *ndd);
int check_node_info_checksum(struct rbuf *rb);
void read_legacy_node_info(FTNODE node, struct rbuf *rb, int version);
int check_legacy_end_checksum(struct rbuf *rb);
/* End of ft-node-deserialize.c helper functions. */

unsigned int toku_serialize_ftnode_size(FTNODE node); /* How much space will it take? */
void toku_verify_or_set_counts(FTNODE);

int toku_serialize_ft_size (FT_HEADER h);
int toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFILE cf);
int toku_serialize_ft_to_wbuf (
    struct wbuf *wbuf,
    FT_HEADER h,
    DISKOFF translation_location_on_disk,
    DISKOFF translation_size_on_disk
    );
int toku_deserialize_ft_from (int fd, LSN max_acceptable_lsn, FT *ft);
void toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset);
void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc);

BASEMENTNODE toku_create_empty_bn(void);
BASEMENTNODE toku_create_empty_bn_no_buffer(void); // create a basement node with a null buffer.
NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo);
BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn);
NONLEAF_CHILDINFO toku_create_empty_nl(void);
// FIXME needs toku prefix
void destroy_basement_node (BASEMENTNODE bn);
// FIXME needs toku prefix
void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl);
void toku_destroy_ftnode_internals(FTNODE node);
void toku_ftnode_free (FTNODE *node);
bool is_entire_node_in_memory(FTNODE node);
void toku_assert_entire_node_in_memory(FTNODE node);
// FIXME needs toku prefix
void bring_node_fully_into_memory(FTNODE node, FT h);

// append a child node to a parent node
void toku_ft_nonleaf_append_child(FTNODE node, FTNODE child, const DBT *pivotkey);
// append a cmd to a nonleaf node child buffer
void toku_ft_append_to_child_buffer(ft_compare_func compare_fun, DESCRIPTOR desc, FTNODE node, int childnum, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, const DBT *key, const DBT *val);

STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);

#if 1
#define DEADBEEF ((void*)0xDEADBEEF)
#else
#define DEADBEEF ((void*)0xDEADBEEFDEADBEEF)
#endif

//#define SLOW
#ifdef SLOW
#define VERIFY_NODE(t,n) (toku_verify_or_set_counts(n), toku_verify_estimates(t,n))
#else
#define VERIFY_NODE(t,n) ((void)0)
#endif

//#define FT_TRACE
#ifdef FT_TRACE
#define WHEN_FTTRACE(x) x
#else
#define WHEN_FTTRACE(x) ((void)0)
#endif

void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h);
void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
extern void toku_ftnode_clone_callback(void* value_data, void** cloned_value_data, PAIR_ATTR* new_attr, bool for_checkpoint, void* write_extraargs);
extern void toku_ftnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *ftnode_v, void** UU(disk_data), void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, bool write_me, bool keep_me, bool for_checkpoint, bool is_clone);
extern int toku_ftnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, uint32_t fullhash, void **ftnode_pv, void** UU(disk_data), PAIR_ATTR *sizep, int *dirty, void *extraargs);
extern void toku_ftnode_pe_est_callback(void* ftnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs);
extern int toku_ftnode_pe_callback (void *ftnode_pv, PAIR_ATTR old_attr, PAIR_ATTR* new_attr, void *extraargs);
extern bool toku_ftnode_pf_req_callback(void* ftnode_pv, void* read_extraargs);
int toku_ftnode_pf_callback(void* ftnode_pv, void* UU(disk_data), void* read_extraargs, int fd, PAIR_ATTR* sizep);
extern int toku_ftnode_cleaner_callback (void *ftnode_pv, BLOCKNUM blocknum, uint32_t fullhash, void *extraargs);

static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) {
    CACHETABLE_WRITE_CALLBACK wc;
    wc.flush_callback = toku_ftnode_flush_callback;
    wc.pe_est_callback = toku_ftnode_pe_est_callback;
    wc.pe_callback = toku_ftnode_pe_callback;
    wc.cleaner_callback = toku_ftnode_cleaner_callback;
    wc.clone_callback = toku_ftnode_clone_callback;
    wc.write_extraargs = h;
    return wc;
}
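
// Editor's note (illustrative, not part of the original header): the write-callback
// bundle is built per pin/put and handed to the cachetable along with the node; the
// actual cachetable entry points live in cachetable.h and are not repeated here.
//
//     CACHETABLE_WRITE_CALLBACK wc = get_write_callbacks_for_node(h);
//     // ... pass wc to the cachetable get_and_pin / put call for this node ...
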
static const FTNODE null_ftnode = 0;

// Values to be used to update ftcursor if a search is successful.
struct ft_cursor_leaf_info_to_be {
    uint32_t index;
    OMT omt;
};

// Values to be used to pin a leaf for shortcut searches
struct ft_cursor_leaf_info {
    struct ft_cursor_leaf_info_to_be to_be;
};

/* a brt cursor is represented as a kv pair in a tree */
struct ft_cursor {
    struct toku_list cursors_link;
    FT_HANDLE ft_handle;
    bool prefetching;
    DBT key, val;           // The key-value pair that the cursor currently points to
    DBT range_lock_left_key, range_lock_right_key;
    bool left_is_neg_infty, right_is_pos_infty;
    bool is_snapshot_read;  // true if query is read_committed, false otherwise
    bool is_leaf_mode;
    bool disable_prefetching;
    bool is_temporary;
    TOKUTXN ttxn;
    struct ft_cursor_leaf_info leaf_info;
};

//
// Helper function to fill a ftnode_fetch_extra with data
// that will tell the fetch callback that the entire node is
// necessary. Used in cases where the entire node
// is required, such as for flushes.
//
static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) {
    bfe->type = ftnode_fetch_all;
    bfe->h = h;
    bfe->search = NULL;
    bfe->range_lock_left_key = NULL;
    bfe->range_lock_right_key = NULL;
    bfe->left_is_neg_infty = false;
    bfe->right_is_pos_infty = false;
    bfe->child_to_read = -1;
    bfe->disable_prefetching = false;
}
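
// Editor's example (illustrative sketch, not part of the original header): a flush or
// full traversal needs every partition, so the fetch extra is filled for a full read
// before the node is pinned through the cachetable:
//
//     struct ftnode_fetch_extra bfe;
//     fill_bfe_for_full_read(&bfe, h);
//     // pin the node, passing &bfe as the read extra; toku_ftnode_pf_req_callback /
//     // toku_ftnode_pf_callback will then bring every partition to PT_AVAIL.
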
//
// Helper function to fill a ftnode_fetch_extra with data
// that will tell the fetch callback that some subset of the node
// is necessary. Used in cases where only part of the node is required,
// such as for a point query.
//
static inline void fill_bfe_for_subset_read(
    struct ftnode_fetch_extra *bfe,
    FT h,
    ft_search_t* search,
    DBT *left,
    DBT *right,
    bool left_is_neg_infty,
    bool right_is_pos_infty,
    bool disable_prefetching
    )
{
    invariant(h->h->type == FT_CURRENT);
    bfe->type = ftnode_fetch_subset;
    bfe->h = h;
    bfe->search = search;
    bfe->range_lock_left_key = (left->data ? left : NULL);
    bfe->range_lock_right_key = (right->data ? right : NULL);
    bfe->left_is_neg_infty = left_is_neg_infty;
    bfe->right_is_pos_infty = right_is_pos_infty;
    bfe->child_to_read = -1;
    bfe->disable_prefetching = disable_prefetching;
}

//
// Helper function to fill a ftnode_fetch_extra with data
// that will tell the fetch callback that no partitions are
// necessary, only the pivots and/or subtree estimates.
// Currently used for stat64.
//
static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
    invariant(h->h->type == FT_CURRENT);
    bfe->type = ftnode_fetch_none;
    bfe->h = h;
    bfe->search = NULL;
    bfe->range_lock_left_key = NULL;
    bfe->range_lock_right_key = NULL;
    bfe->left_is_neg_infty = false;
    bfe->right_is_pos_infty = false;
    bfe->child_to_read = -1;
    bfe->disable_prefetching = false;
}

static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) {
    assert(bfe->type == ftnode_fetch_prefetch);
    if (bfe->range_lock_left_key != NULL) {
        toku_free(bfe->range_lock_left_key->data);
        toku_destroy_dbt(bfe->range_lock_left_key);
        toku_free(bfe->range_lock_left_key);
        bfe->range_lock_left_key = NULL;
    }
    if (bfe->range_lock_right_key != NULL) {
        toku_free(bfe->range_lock_right_key->data);
        toku_destroy_dbt(bfe->range_lock_right_key);
        toku_free(bfe->range_lock_right_key);
        bfe->range_lock_right_key = NULL;
    }
}

// this is in a strange place because it needs the cursor struct to be defined
static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
                                         FT h,
                                         FT_CURSOR c) {
    invariant(h->h->type == FT_CURRENT);
    bfe->type = ftnode_fetch_prefetch;
    bfe->h = h;
    bfe->search = NULL;
    {
        const DBT *left = &c->range_lock_left_key;
        const DBT *right = &c->range_lock_right_key;
        if (left->data) {
            MALLOC(bfe->range_lock_left_key); resource_assert(bfe->range_lock_left_key);
            toku_fill_dbt(bfe->range_lock_left_key, toku_xmemdup(left->data, left->size), left->size);
        } else {
            bfe->range_lock_left_key = NULL;
        }
        if (right->data) {
            MALLOC(bfe->range_lock_right_key); resource_assert(bfe->range_lock_right_key);
            toku_fill_dbt(bfe->range_lock_right_key, toku_xmemdup(right->data, right->size), right->size);
        } else {
            bfe->range_lock_right_key = NULL;
        }
    }
    bfe->left_is_neg_infty = c->left_is_neg_infty;
    bfe->right_is_pos_infty = c->right_is_pos_infty;
    bfe->child_to_read = -1;
    bfe->disable_prefetching = c->disable_prefetching;
}
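
// Editor's example (illustrative sketch, not part of the original header):
// fill_bfe_for_prefetch copies the cursor's range-lock keys into heap-allocated DBTs,
// so a prefetch bfe owns memory and should be torn down with destroy_bfe_for_prefetch
// once the prefetch path no longer needs it:
//
//     struct ftnode_fetch_extra bfe;
//     fill_bfe_for_prefetch(&bfe, h, cursor);
//     // ... hand &bfe to the cachetable prefetch path ...
//     destroy_bfe_for_prefetch(&bfe);
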
struct ancestors {
    FTNODE node;    // This is the root node if next is NULL.
    int childnum;   // which buffer holds messages destined to the node whose ancestors this list represents.
    ANCESTORS next; // Parent of this node (so next->node.(next->childnum) refers to this node).
};
struct pivot_bounds {
    const DBT * const lower_bound_exclusive;
    const DBT * const upper_bound_inclusive; // NULL to indicate negative or positive infinity (which are in practice exclusive since there are no transfinite keys in messages).
};
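
// Editor's note (illustrative sketch, not from the original source): during a
// root-to-leaf descent the caller typically builds the ancestors chain on the stack,
// one link per level, so toku_apply_ancestors_messages_to_node can walk back up
// through the buffers that still hold messages for the basement nodes being read:
//
//     struct ancestors next_ancestors = { parent_node, childnum_followed, ancestors };
//     // descend into the chosen child with &next_ancestors as its ancestor list
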
__attribute__((nonnull))
void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied);

int
toku_ft_search_which_child(
    DESCRIPTOR desc,
    ft_compare_func cmp,
    FTNODE node,
    ft_search_t *search
    );

bool
toku_bfe_wants_child_available (struct ftnode_fetch_extra* bfe, int childnum);

int
toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node);
int
toku_bfe_rightmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node);

// allocate a block number
// allocate and initialize a ftnode
// put the ftnode into the cache table
void toku_create_new_ftnode (FT_HANDLE t, FTNODE *result, int height, int n_children);

// Effect: Fill in N as an empty ftnode.
void toku_initialize_empty_ftnode (FTNODE n, BLOCKNUM nodename, int height, int num_children,
                                   int layout_version, unsigned int nodesize, unsigned int flags);

unsigned int toku_ftnode_which_child(FTNODE node, const DBT *k,
                                     DESCRIPTOR desc, ft_compare_func cmp)
    __attribute__((__warn_unused_result__));

/**
 * Finds the next child for HOT to flush to, given that everything up to
 * and including k has been flattened.
 *
 * If k falls between pivots in node, then we return the childnum where k
 * lies.
 *
 * If k is equal to some pivot, then we return the next (to the right)
 * childnum.
 */
unsigned int toku_ftnode_hot_next_child(FTNODE node,
                                        const DBT *k,
                                        DESCRIPTOR desc,
                                        ft_compare_func cmp);
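
// Editor's worked example (illustrative): with pivots {10, 20} and children
// {c0, c1, c2}, toku_ftnode_hot_next_child returns
//     k = 5  -> childnum 0   (k lies strictly below the first pivot)
//     k = 10 -> childnum 1   (k equals a pivot, so the next childnum to the right)
//     k = 15 -> childnum 1
//     k = 20 -> childnum 2
//     k = 25 -> childnum 2
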
/* Stuff for testing */
// toku_testsetup_initialize() must be called before any other test_setup_xxx() functions are called.
void toku_testsetup_initialize(void);
int toku_testsetup_leaf(FT_HANDLE brt, BLOCKNUM *blocknum, int n_children, char **keys, int *keylens);
int toku_testsetup_nonleaf (FT_HANDLE brt, int height, BLOCKNUM *diskoff, int n_children, BLOCKNUM *children, char **keys, int *keylens);
int toku_testsetup_root(FT_HANDLE brt, BLOCKNUM);
int toku_testsetup_get_sersize(FT_HANDLE brt, BLOCKNUM); // Return the size on disk.
int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM, const char *key, int keylen, const char *val, int vallen);
int toku_testsetup_insert_to_nonleaf (FT_HANDLE brt, BLOCKNUM, enum ft_msg_type, const char *key, int keylen, const char *val, int vallen);
void toku_pin_node_with_min_bfe(FTNODE* node, BLOCKNUM b, FT_HANDLE t);

// These two go together to do lookups in a ftnode using the keys in a command.
struct cmd_leafval_heaviside_extra {
    ft_compare_func compare_fun;
    DESCRIPTOR desc;
    DBT const * const key;
};
int toku_cmd_leafval_heaviside (OMTVALUE leafentry, void *extra)
    __attribute__((__warn_unused_result__));

// toku_ft_root_put_cmd() accepts a non-constant cmd because this is where we set the msn
int toku_ft_root_put_cmd(FT h, FT_MSG_S * cmd)
    __attribute__((__warn_unused_result__));

void *mempool_malloc_from_omt(OMT omt, struct mempool *mp, size_t size, void **maybe_free);
// Effect: Allocate a new object of size SIZE in MP.  If MP runs out of space, allocate new mempool space and copy all the items
//  from the OMT (which items refer to items in the old mempool) into the new mempool.
//  If MAYBE_FREE is NULL then free the old mempool's space.
//  Otherwise, store the old mempool's space in maybe_free.
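
// Editor's example (illustrative sketch, not part of the original header): typical
// call pattern when making room for a new leaf entry in a basement node's mempool.
// The names bn and new_le_size are placeholders; maybe_free lets the caller defer
// freeing the old mempool space until nothing references it anymore.
//
//     void *maybe_free = NULL;
//     LEAFENTRY new_le = (LEAFENTRY) mempool_malloc_from_omt(bn->buffer,
//                                                            &bn->buffer_mempool,
//                                                            new_le_size,
//                                                            &maybe_free);
//     // ... initialize *new_le and insert it into bn->buffer ...
//     if (maybe_free) {
//         toku_free(maybe_free);
//     }
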
void
toku_get_node_for_verify(
    BLOCKNUM blocknum,
    FT_HANDLE brt,
    FTNODE* nodep
    );

int
toku_verify_ftnode (FT_HANDLE brt,
                    MSN rootmsn, MSN parentmsn,
                    FTNODE node, int height,
                    const DBT *lesser_pivot,    // Everything in the subtree should be > lesser_pivot.  (lesser_pivot==NULL if there is no lesser pivot.)
                    const DBT *greatereq_pivot, // Everything in the subtree should be <= greatereq_pivot.  (greatereq_pivot==NULL if there is no greater-or-equal pivot.)
                    int (*progress_callback)(void *extra, float progress), void *progress_extra,
                    int recurse, int verbose, int keep_going_on_failure)
    __attribute__ ((warn_unused_result));

int toku_db_badformat(void) __attribute__((__warn_unused_result__));

typedef enum {
    FT_UPGRADE_FOOTPRINT = 0,
    FT_UPGRADE_STATUS_NUM_ROWS
} ft_upgrade_status_entry;

typedef struct {
    bool initialized;
    TOKU_ENGINE_STATUS_ROW_S status[FT_UPGRADE_STATUS_NUM_ROWS];
} FT_UPGRADE_STATUS_S, *FT_UPGRADE_STATUS;

void toku_ft_upgrade_get_status(FT_UPGRADE_STATUS);

typedef enum {
    LE_MAX_COMMITTED_XR = 0,
    LE_MAX_PROVISIONAL_XR,
    LE_EXPANDED,
    LE_MAX_MEMSIZE,
    LE_STATUS_NUM_ROWS
} le_status_entry;

typedef struct {
    bool initialized;
    TOKU_ENGINE_STATUS_ROW_S status[LE_STATUS_NUM_ROWS];
} LE_STATUS_S, *LE_STATUS;

void toku_le_get_status(LE_STATUS);

typedef enum {
    FT_UPDATES = 0,
    FT_UPDATES_BROADCAST,
    FT_DESCRIPTOR_SET,
    FT_PARTIAL_EVICTIONS_NONLEAF, // number of nonleaf node partial evictions
    FT_PARTIAL_EVICTIONS_LEAF, // number of leaf node partial evictions
    FT_MSN_DISCARDS, // how many messages were ignored by leaf because of msn
    FT_MAX_WORKDONE, // max workdone value of any buffer
    FT_TOTAL_RETRIES, // total number of search retries due to TRY_AGAIN
    FT_MAX_SEARCH_EXCESS_RETRIES, // max number of excess search retries (retries - treeheight) due to TRY_AGAIN
    FT_SEARCH_TRIES_GT_HEIGHT, // number of searches that required more tries than the height of the tree
    FT_SEARCH_TRIES_GT_HEIGHTPLUS3, // number of searches that required more tries than the height of the tree plus three
    FT_DISK_FLUSH_LEAF, // number of leaf nodes flushed to disk, not for checkpoint
    FT_DISK_FLUSH_NONLEAF, // number of nonleaf nodes flushed to disk, not for checkpoint
    FT_DISK_FLUSH_LEAF_FOR_CHECKPOINT, // number of leaf nodes flushed to disk for checkpoint
    FT_DISK_FLUSH_NONLEAF_FOR_CHECKPOINT, // number of nonleaf nodes flushed to disk for checkpoint
    FT_CREATE_LEAF, // number of leaf nodes created
    FT_CREATE_NONLEAF, // number of nonleaf nodes created
    FT_DESTROY_LEAF, // number of leaf nodes destroyed
    FT_DESTROY_NONLEAF, // number of nonleaf nodes destroyed
    FT_MSG_BYTES_IN, // how many bytes of messages injected at root (for all trees)
    FT_MSG_BYTES_OUT, // how many bytes of messages flushed from h1 nodes to leaves
    FT_MSG_BYTES_CURR, // how many bytes of messages currently in trees (estimate)
    FT_MSG_BYTES_MAX, // high-water mark of bytes of messages in trees (estimate)
    FT_MSG_NUM, // how many messages injected at root
    FT_MSG_NUM_BROADCAST, // how many broadcast messages injected at root
    FT_NUM_BASEMENTS_DECOMPRESSED_NORMAL, // how many basement nodes were decompressed because they were the target of a query
    FT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE, // ... because they were between lc and rc
    FT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH,
    FT_NUM_BASEMENTS_DECOMPRESSED_WRITE,
    FT_NUM_MSG_BUFFER_DECOMPRESSED_NORMAL, // how many msg buffers were decompressed because they were the target of a query
    FT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE, // ... because they were between lc and rc
    FT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH,
    FT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE,
    FT_NUM_PIVOTS_FETCHED_QUERY, // how many pivots were fetched for a query
    FT_NUM_PIVOTS_FETCHED_PREFETCH, // ... for a prefetch
    FT_NUM_PIVOTS_FETCHED_WRITE, // ... for a write
    FT_NUM_BASEMENTS_FETCHED_NORMAL, // how many basement nodes were fetched because they were the target of a query
    FT_NUM_BASEMENTS_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
    FT_NUM_BASEMENTS_FETCHED_PREFETCH,
    FT_NUM_BASEMENTS_FETCHED_WRITE,
    FT_NUM_MSG_BUFFER_FETCHED_NORMAL, // how many msg buffers were fetched because they were the target of a query
    FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
    FT_NUM_MSG_BUFFER_FETCHED_PREFETCH,
    FT_NUM_MSG_BUFFER_FETCHED_WRITE,
    FT_STATUS_NUM_ROWS
} ft_status_entry;

typedef struct {
    bool initialized;
    TOKU_ENGINE_STATUS_ROW_S status[FT_STATUS_NUM_ROWS];
} FT_STATUS_S, *FT_STATUS;

void toku_ft_get_status(FT_STATUS);

void
toku_ft_bn_apply_cmd_once (
    BASEMENTNODE bn,
    const FT_MSG cmd,
    uint32_t idx,
    LEAFENTRY le,
    uint64_t *workdonep,
    STAT64INFO stats_to_update
    );

void
toku_ft_bn_apply_cmd (
    ft_compare_func compare_fun,
    ft_update_func update_fun,
    DESCRIPTOR desc,
    BASEMENTNODE bn,
    FT_MSG cmd,
    uint64_t *workdone,
    STAT64INFO stats_to_update
    );

void
toku_ft_leaf_apply_cmd (
    ft_compare_func compare_fun,
    ft_update_func update_fun,
    DESCRIPTOR desc,
    FTNODE node,
    FT_MSG cmd,
    uint64_t *workdone,
    STAT64INFO stats_to_update
    );

void
toku_ft_node_put_cmd (
    ft_compare_func compare_fun,
    ft_update_func update_fun,
    DESCRIPTOR desc,
    FTNODE node,
    FT_MSG cmd,
    bool is_fresh,
    STAT64INFO stats_to_update
    );

void toku_flusher_thread_set_callback(void (*callback_f)(int, void*), void* extra);

int toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h);

#endif