1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 /* 12 * @(#)btree.h 5.1 (Berkeley) 01/23/91 13 */ 14 15 typedef char *BTREE; /* should really be (void *) */ 16 17 /* #define DEBUG */ 18 19 #define RET_ERROR -1 20 #define RET_SUCCESS 0 21 #define RET_SPECIAL 1 22 23 #ifndef TRUE 24 #define TRUE 1 25 #define FALSE 0 26 #endif /* ndef TRUE */ 27 28 #ifndef NULL 29 #define NULL 0 30 #endif /* ndef NULL */ 31 32 /* libc */ 33 extern char *malloc(); 34 35 /* these are defined in lrucache.c */ 36 extern char *lruinit(); 37 extern char *lruget(); 38 extern char *lrugetnew(); 39 extern int lrusync(); 40 extern int lruwrite(); 41 extern int lrurelease(); 42 extern void lrufree(); 43 44 /* these are defined here */ 45 extern BTREE bt_open(); 46 extern int bt_close(); 47 extern int bt_delete(); 48 extern int bt_get(); 49 extern int bt_put(); 50 extern int bt_seq(); 51 extern int bt_sync(); 52 53 /* 54 * Private types. What you choose for these depends on how big you 55 * want to let files get, and how big you want to let pages get. 56 */ 57 58 typedef u_long index_t; /* so # bytes on a page fits in a long */ 59 typedef u_long pgno_t; /* so # of pages in a btree fits in a long */ 60 61 /* 62 * When we do searches, we push the parent page numbers onto a stack 63 * as we descend the tree. This is so that for insertions, we can 64 * find our way back up to do internal page insertions and splits. 65 */ 66 67 typedef struct BTSTACK { 68 pgno_t bts_pgno; 69 struct BTSTACK *bts_next; 70 } BTSTACK; 71 72 /* 73 * Every btree page has a header that looks like this. Flags are given 74 * in the #define's for the F_ flags (see below). 75 */ 76 77 typedef struct BTHEADER { 78 pgno_t h_pgno; /* page number of this page */ 79 pgno_t h_prevpg; /* left sibling */ 80 pgno_t h_nextpg; /* right sibling */ 81 82 #define F_LEAF 0x01 /* leaf page, contains user data */ 83 #define F_CONT 0x02 /* continuation page (large items) */ 84 #define F_DIRTY 0x04 /* need to write to disk */ 85 #define F_PRESERVE 0x08 /* never delete this chain of pages */ 86 87 u_long h_flags; /* page state */ 88 index_t h_lower; /* lower bound of free space on page */ 89 index_t h_upper; /* upper bound of free space on page */ 90 index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 91 } BTHEADER; 92 93 /* 94 * HTBUCKETs are hash table buckets for looking up pages of in-memory 95 * btrees by page number. We use this indirection, rather than direct 96 * pointers, so that the code for manipulating in-memory trees is the 97 * same as that for manipulating on-disk trees. 98 */ 99 100 typedef struct HTBUCKET { 101 pgno_t ht_pgno; 102 BTHEADER *ht_page; 103 struct HTBUCKET *ht_next; 104 } HTBUCKET; 105 106 typedef HTBUCKET **HTABLE; 107 108 /* minimum size we'll let a page be */ 109 #define MINPSIZE 512 110 111 /* default cache size, in bytes */ 112 #define DEFCACHE (20 * 1024) 113 114 /* hash table size for in-memory trees */ 115 #define HTSIZE 128 116 117 /* generate a hash key from a page number */ 118 #define HASHKEY(pgno) ((pgno - 1) % HTSIZE) 119 120 /* 121 * Disk btrees have a file descriptor, and may also have an lru buffer 122 * cache, if the user asked for one. 123 */ 124 125 typedef struct BTDISK { 126 int d_fd; 127 char *d_cache; 128 } BTDISK; 129 130 /* 131 * Cursors keep track of the current location in a sequential scan of 132 * the database. Since btrees impose a total ordering on keys, we can 133 * walk forward or backward through the database from any point. Cursors 134 * survive updates to the tree, and can be used to delete a particular 135 * record. 136 */ 137 138 typedef struct CURSOR { 139 pgno_t c_pgno; /* pgno of current item in scan */ 140 index_t c_index; /* index of current item in scan */ 141 char *c_key; /* current key, used for updates */ 142 143 #define CRSR_BEFORE 0x01 144 145 u_char c_flags; /* to handle updates properly */ 146 } CURSOR; 147 148 /* 149 * The private btree data structure. The user passes a pointer to one of 150 * these when we are to manipulate a tree, but the BTREE type is opaque 151 * to him. 152 */ 153 154 typedef struct BTREEDATA_P { 155 char *bt_fname; /* NULL for in-memory trees */ 156 union { 157 BTDISK bt_d; /* for on-disk btrees */ 158 HTABLE bt_ht; /* hash table for mem trees */ 159 } bt_s; 160 size_t bt_psize; /* page size for btree pages */ 161 int (*bt_compare)(); /* key comparison function */ 162 pgno_t bt_npages; /* number of pages in tree */ 163 BTHEADER *bt_curpage; /* current page contents */ 164 pgno_t bt_free; /* free pg list for big data */ 165 CURSOR bt_cursor; /* cursor for scans */ 166 BTSTACK *bt_stack; /* parent stack for inserts */ 167 u_long bt_lorder; /* byte order (endian.h) */ 168 169 #define BTF_METAOK 0x01 /* meta-data written to start of file */ 170 #define BTF_SEQINIT 0x02 /* we have called bt_seq */ 171 #define BTF_ISWRITE 0x04 /* tree was opened for write */ 172 #define BTF_NODUPS 0x08 /* tree created for unique keys */ 173 174 u_long bt_flags; /* btree state */ 175 } BTREEDATA_P; 176 177 typedef BTREEDATA_P *BTREE_P; 178 179 /* 180 * The first thing in a btree file is a BTMETA structure. The rest of 181 * the first page is empty, so that all disk operations are page-aligned. 182 */ 183 184 typedef struct BTMETA { 185 u_long m_magic; 186 u_long m_version; 187 size_t m_psize; 188 pgno_t m_free; 189 u_long m_flags; 190 u_long m_lorder; 191 } BTMETA; 192 193 #define P_NONE 0 /* invalid page number in tree */ 194 #define P_ROOT 1 /* page number of root pg in btree */ 195 196 #define NORELEASE 0 /* don't release a page during write */ 197 #define RELEASE 1 /* release a page during write */ 198 199 #define INSERT 0 /* doing an insert operation */ 200 #define DELETE 1 /* doing a delete operation */ 201 202 /* get the next free index on a btree page */ 203 #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t))) 204 205 /* is a BTITEM actually on the btree page? */ 206 #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage)) 207 208 /* guarantee longword alignment so structure refs work */ 209 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03) 210 211 /* get a particular datum (or idatum) off a page */ 212 #define GETDATUM(h,i) (((char *) h) + h->h_linp[i]) 213 214 /* is a {key,datum} too big to put on a single page? */ 215 #define TOOBIG(t, sz) (sz >= t->bt_psize / 5) 216 217 /* is this a disk tree or a memory tree? */ 218 #define ISDISK(t) (t->bt_fname != (char *) NULL) 219 220 /* does the disk tree use a cache? */ 221 #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL) 222 223 /* 224 * DATUMs are for user data -- one appears on leaf pages for every 225 * tree entry. The d_bytes[] array contains the key first, then the data. 226 * 227 * If either the key or the datum is too big to store on a single page, 228 * a bit is set in the flags entry, and the d_bytes[] array contains a 229 * pgno pointing to the page at which the data is actually stored. 230 * 231 * Note on alignment: every DATUM is guaranteed to be longword aligned 232 * on the disk page. In order to force longword alignment of user key 233 * and data values, we must guarantee that the d_bytes[] array starts 234 * on a longword boundary. This is the reason that d_flags is a u_long, 235 * rather than a u_char (it really only needs to be two bits big). This 236 * is necessary because we call the user's comparison function with a 237 * pointer to the start of the d_bytes array. We don't need to force 238 * longword alignment of the data following the key, since that is copied 239 * to a longword-aligned buffer before being returned to the user. 240 */ 241 242 typedef struct DATUM { 243 size_t d_ksize; /* size of key */ 244 size_t d_dsize; /* size of data */ 245 246 #define D_BIGDATA 0x01 /* indirect datum ptr flag */ 247 #define D_BIGKEY 0x02 /* indirect key ptr flag */ 248 249 u_long d_flags; /* flags (indirect bit) */ 250 char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 251 } DATUM; 252 253 /* BTITEMs are used to return (page, index, datum) tuples from searches */ 254 typedef struct BTITEM { 255 pgno_t bti_pgno; 256 index_t bti_index; 257 DATUM *bti_datum; 258 } BTITEM; 259 260 /* 261 * IDATUMs are for data stored on internal pages. This is the (key, pgno) 262 * pair, such that key 'key' is the first entry on page 'pgno'. If our 263 * internal page contains keys (a) and (b) next to each other, then all 264 * items >= to (a) and < (b) go on the same page as (a). There are some 265 * gotchas with duplicate keys, however. See the split code for details. 266 * 267 * If a key is too big to fit on a single page, then the i_bytes[] array 268 * contains a pgno pointing to the start of a chain that actually stores 269 * the bytes. Since items on internal pages are never deleted from the 270 * tree, these indirect chains are marked as special, so that they won't 271 * be deleted if the corresponding leaf item is deleted. 272 * 273 * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char) 274 * in order to guarantee that user keys are longword aligned on the disk 275 * page. 276 */ 277 278 typedef struct IDATUM { 279 size_t i_size; 280 pgno_t i_pgno; 281 u_long i_flags; /* see DATUM.d_flags, above */ 282 char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 283 } IDATUM; 284 285 /* all private interfaces have a leading _ in their names */ 286 extern BTITEM *_bt_search(); 287 extern BTITEM *_bt_searchr(); 288 extern BTHEADER *_bt_allocpg(); 289 extern index_t _bt_binsrch(); 290 extern int _bt_isonpage(); 291 extern BTITEM *_bt_first(); 292 extern int _bt_release(); 293 extern int _bt_wrtmeta(); 294 extern int _bt_delindir(); 295 extern int _bt_pgout(); 296 extern int _bt_pgin(); 297 extern int _bt_fixscan(); 298 extern int _bt_indirect(); 299 extern int _bt_crsrdel(); 300 extern int _bt_push(); 301 extern pgno_t _bt_pop(); 302 extern int strcmp(); 303 304