You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

249 lines
9.1 KiB

  1. #ifndef BRT_INTERNAL_H
  2. #define BRT_INTERNAL_H
  3. #ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
  4. #include "cachetable.h"
  5. #include "hashtable.h"
  6. #include "pma.h"
  7. #include "brt.h"
  8. #include "crc.h"
  /* Fanout of the tree.  BRT_FANOUT may be defined before this point (for
   * example on the compiler command line) to override the default of 16. */
  #ifndef BRT_FANOUT
  #define BRT_FANOUT 16
  #endif

  /* Integer constants are declared as anonymous enums so that they can be used
   * as array bounds in the structures below. */
  enum { TREE_FANOUT = BRT_FANOUT };
  enum { KEY_VALUE_OVERHEAD = 8 };          /* Must store the two lengths. */
  enum { PMA_ITEM_OVERHEAD = 4 };
  enum { BRT_CMD_OVERHEAD = 1 };            /* NOTE(review): presumably the one-byte command type -- confirm. */
  enum { BRT_DEFAULT_NODE_SIZE = 1 << 20 }; /* 2^20 = 1048576 */
  /* NOTE(review): judging by the name this describes a node header as laid out
   * in the file; its only member is a count -- confirm against the serializer. */
  struct nodeheader_in_file {
      int n_in_buffer;
  };
  20. enum { BUFFER_HEADER_SIZE = (4 // height//
  21. + 4 // n_children
  22. + TREE_FANOUT * 8 // children
  23. ) };
  /* One pivot of a nonleaf node: the key that separates two adjacent children,
   * together with its flag byte. */
  struct brtnode_nonleaf_pivotinfo {
      /* For DUPSORT keys, the keys are whole key-value pairs.
       * For nonduplicate and DUPSORT keys we have
       *   Child 0's keys <= pivotkey[0] < Child 1's keys <= pivotkey[1] < ...
       *     pivotkey[N-1] < child N's keys <= pivotkey[N] ...
       */
      struct kv_pair *pivotkey;
      unsigned char pivotflags;
  };
  /* Per-child bookkeeping of a nonleaf node.  Only the subtree fingerprint is
   * compiled in; the remaining members are disabled with #if 0. */
  struct brtnode_nonleaf_childinfo {
      u_int32_t subtree_fingerprint;
  #if 0
      /* Disabled members, kept for reference. */
      DISKOFF diskoff;
      HASHTABLE htable;
      unsigned int n_bytes_in_hashtable; /* How many bytes are in each hashtable (including overheads for the disk-representation) */
      unsigned int n_cursors;
  #endif
  };
  40. typedef struct brtnode *BRTNODE;
  41. /* Internal nodes. */
  42. struct brtnode {
  43. enum typ_tag tag;
  44. unsigned int nodesize;
  45. unsigned int flags;
  46. DISKOFF thisnodename; // The size of the node allocated on disk. Not all is necessarily in use.
  47. LSN disk_lsn; // The LSN as of the most recent version on disk.
  48. LSN log_lsn; // The LSN as of the most recent log write.
  49. int layout_version; // What version of the data structure?
  50. BRTNODE parent_brtnode; /* Invariant: The parent of an in-memory node must be in main memory. This is so we can find and update the down pointer when we change the diskoff of a node. */
  51. int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */
  52. u_int32_t rand4fingerprint;
  53. u_int32_t local_fingerprint; /* For leaves this is everything in the buffer. For nonleaves, this is everything in the hash tables, but does not include child subtree fingerprints. */
  54. int dirty;
  55. union node {
  56. struct nonleaf {
  57. // Don't actually store the subree fingerprint in the in-memory data structure.
  58. int n_children; /* if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced. */
  59. unsigned int totalchildkeylens;
  60. unsigned int n_bytes_in_hashtables;
  61. struct brtnode_nonleaf_childinfo childinfos[TREE_FANOUT+1]; /* One extra so we can grow */
  62. #if 0
  63. u_int32_t child_subtree_fingerprints[TREE_FANOUT+1];
  64. #define BRTNODE_CHILD_SUBTREE_FINGERPRINTS(node,i) ((node)->u.n.child_subtree_fingerprints[i])
  65. #else
  66. #define BRTNODE_CHILD_SUBTREE_FINGERPRINTS(node,i) ((node)->u.n.childinfos[i].subtree_fingerprint)
  67. #endif
  68. //#define CHSTRUCT
  69. #ifdef CHSTRUCT
  70. struct brtnode_nonleaf_pivotinfo pivots[TREE_FANOUT]; /* One extra one so we can grow. */
  71. #else
  72. struct kv_pair *childkeys[TREE_FANOUT]; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
  73. Note: It is possible that Child 1's keys are == to child 0's key's, so it is
  74. not necessarily true that child 1's keys are > childkeys[0].
  75. However, in the absense of duplicate keys, child 1's keys *are* > childkeys[0]. */
  76. unsigned char pivotflags[TREE_FANOUT];
  77. DISKOFF children[TREE_FANOUT+1]; /* unused if height==0 */ /* Note: The last element of these arrays is used only temporarily while splitting a node. */
  78. #define BRTNODE_CHILD_DISKOFF(node,i) ((node)->u.n.children[i])
  79. HASHTABLE htables[TREE_FANOUT+1];
  80. unsigned int n_bytes_in_hashtable[TREE_FANOUT+1]; /* how many bytes are in each hashtable (including overheads) */
  81. unsigned int n_cursors[TREE_FANOUT+1];
  82. #endif
  83. } n;
  84. struct leaf {
  85. PMA buffer;
  86. unsigned int n_bytes_in_buffer; /* How many bytes to represent the PMA (including the per-key overheads, but not including the overheads for the node. */
  87. } l;
  88. } u;
  89. };
  /* pivot flags (must fit in 8 bits) */
  /* Each constant is a distinct bit, so values can be OR-ed together into the
   * `unsigned char` pivot flag fields. */
  enum {
      BRT_PIVOT_PRESENT_L      = 1,
      BRT_PIVOT_PRESENT_R      = 2,
      BRT_PIVOT_TRUNC          = 4,
      BRT_PIVOT_FRONT_COMPRESS = 8,
  };
  97. struct brt_header {
  98. int dirty;
  99. unsigned int nodesize;
  100. DISKOFF freelist;
  101. DISKOFF unused_memory;
  102. DISKOFF unnamed_root;
  103. int n_named_roots; /* -1 if the only one is unnamed */
  104. char **names;
  105. DISKOFF *roots;
  106. unsigned int flags;
  107. };
  /* Flag bits; each constant occupies its own bit. */
  enum brt_header_flags {
      TOKU_DB_DUP     = 1,
      TOKU_DB_DUPSORT = 2,
  };
  112. struct brt {
  113. CACHEFILE cf;
  114. char *database_name;
  115. // The header is shared. It is also ephemeral.
  116. struct brt_header *h;
  117. BRT_CURSOR cursors_head, cursors_tail;
  118. unsigned int nodesize;
  119. unsigned int flags;
  120. int (*compare_fun)(DB*,const DBT*,const DBT*);
  121. int (*dup_compare)(DB*,const DBT*,const DBT*);
  122. DB *db; // To pass to the compare fun
  123. void *skey,*sval; /* Used for DBT return values. */
  124. };
  125. /* serialization code */
  126. void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node);
  127. int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int flags, int nodesize, int (*bt_compare)(DB *, const DBT*, const DBT*), int (*dup_compare)(DB *, const DBT *, const DBT *), DB *db, FILENUM filenum);
  128. unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
  129. int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
  130. void toku_verify_counts(BRTNODE);
  131. int toku_serialize_brt_header_size (struct brt_header *h);
  132. int toku_serialize_brt_header_to (int fd, struct brt_header *h);
  133. int toku_serialize_brt_header_to_wbuf (struct wbuf *, struct brt_header *h);
  134. int toku_deserialize_brtheader_from (int fd, DISKOFF off, struct brt_header **brth);
  135. void toku_brtnode_free (BRTNODE *node);
  136. //static inline int brtnode_n_hashtables(BRTNODE node) { if (node->height==0) return 1; else return node->u.n.n_children; }
  137. //int write_brt_header (int fd, struct brt_header *header);
  /* Recognisable poison pointer value.  The 32-bit pattern is the one compiled
   * in; the 64-bit pattern is kept as a disabled alternative. */
  #if 1
  #define DEADBEEF ((void*)0xDEADBEEF)
  #else
  #define DEADBEEF ((void*)0xDEADBEEFDEADBEEF)
  #endif

  /* Number of slots in a cursor's path arrays. */
  #define CURSOR_PATHLEN_LIMIT 256
  144. struct brt_cursor {
  145. BRT brt;
  146. int path_len; /* -1 if the cursor points nowhere. */
  147. BRTNODE path[CURSOR_PATHLEN_LIMIT]; /* Include the leaf (last). These are all pinned. */
  148. int pathcnum[CURSOR_PATHLEN_LIMIT]; /* which child did we descend to from here? */
  149. PMA_CURSOR pmacurs; /* The cursor into the leaf. NULL if the cursor doesn't exist. */
  150. BRT_CURSOR prev,next;
  151. int op;
  152. };
  153. /* print the cursor path */
  154. void toku_brt_cursor_print(BRT_CURSOR cursor);
  155. /* is the cursor path empty? */
  156. static inline int toku_brt_cursor_path_empty(BRT_CURSOR cursor) {
  157. return cursor->path_len == 0;
  158. }
  159. /*is the cursor path full? */
  160. static inline int toku_brt_cursor_path_full(BRT_CURSOR cursor) {
  161. return cursor->path_len == CURSOR_PATHLEN_LIMIT;
  162. }
  163. static inline int toku_brt_cursor_active(BRT_CURSOR cursor) {
  164. return cursor->path_len > 0;
  165. }
  166. /* brt has a new root. add the root to this cursor. */
  167. void toku_brt_cursor_new_root(BRT_CURSOR cursor, BRT t, BRTNODE newroot, BRTNODE left, BRTNODE right);
  168. /* a brt leaf has split. modify this cursor if it includes the old node in its path. */
  169. void toku_brt_cursor_leaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE left, BRTNODE right);
  170. /* a brt internal node has expanded. modify this cursor if it includes the old node in its path. */
  171. void toku_brt_cursor_nonleaf_expand(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, int childnum, BRTNODE left, BRTNODE right);
  172. /* a brt internal node has split. modify this cursor if it includes the old node in its path. */
  173. void toku_brt_cursor_nonleaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE left, BRTNODE right);
  /* Kinds of command carried by struct brt_cmd. */
  enum brt_cmd_type {
      BRT_NONE   = 0,
      BRT_INSERT = 1,
      BRT_DELETE = 2,
  };
  179. struct brt_cmd {
  180. enum brt_cmd_type type;
  181. union {
  182. /* insert or delete */
  183. struct brt_cmd_insert_delete {
  184. DBT *key;
  185. DBT *val;
  186. } id;
  187. } u;
  188. };
  189. typedef struct brt_cmd BRT_CMD;
  190. struct brtenv {
  191. CACHETABLE ct;
  192. TOKULOGGER logger;
  193. long long checksum_number;
  194. // SPINLOCK checkpointing;
  195. };
  196. extern cachetable_flush_func_t toku_brtnode_flush_callback, toku_brtheader_flush_callback;
  197. extern cachetable_fetch_func_t toku_brtnode_fetch_callback, toku_brtheader_fetch_callback;
  198. extern int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header);
  199. extern int toku_unpin_brt_header (BRT brt);
  200. extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt);
  201. static const BRTNODE null_brtnode=0;
  202. extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen);
  203. extern u_int32_t toku_calccrc32_cmd (int type, const void *key, int keylen, const void *val, int vallen);
  204. extern u_int32_t toku_calccrc32_cmdstruct (BRT_CMD *cmd);
  205. // How long is the pivot key?
  206. unsigned int toku_brt_pivot_key_len (BRT, struct kv_pair *); // Given the tree
  207. unsigned int toku_brtnode_pivot_key_len (BRTNODE, struct kv_pair *); // Given the node
  208. #endif