1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 /* 38 * @(#)btree.h 5.2 (Berkeley) 2/22/91 39 */ 40 41 typedef char *BTREE; /* should really be (void *) */ 42 43 /* #define DEBUG */ 44 45 #define RET_ERROR -1 46 #define RET_SUCCESS 0 47 #define RET_SPECIAL 1 48 49 #ifndef TRUE 50 #define TRUE 1 51 #define FALSE 0 52 #endif /* ndef TRUE */ 53 54 #ifndef NULL 55 #define NULL 0 56 #endif /* ndef NULL */ 57 58 /* these are defined in lrucache.c */ 59 extern char *lruinit(); 60 extern char *lruget(); 61 extern char *lrugetnew(); 62 extern int lrusync(); 63 extern int lruwrite(); 64 extern int lrurelease(); 65 extern void lrufree(); 66 67 /* these are defined here */ 68 extern BTREE bt_open(); 69 extern int bt_close(); 70 extern int bt_delete(); 71 extern int bt_get(); 72 extern int bt_put(); 73 extern int bt_seq(); 74 extern int bt_sync(); 75 76 /* 77 * Private types. What you choose for these depends on how big you 78 * want to let files get, and how big you want to let pages get. 79 */ 80 81 typedef u_long index_t; /* so # bytes on a page fits in a long */ 82 typedef u_long pgno_t; /* so # of pages in a btree fits in a long */ 83 84 /* 85 * When we do searches, we push the parent page numbers onto a stack 86 * as we descend the tree. This is so that for insertions, we can 87 * find our way back up to do internal page insertions and splits. 88 */ 89 90 typedef struct BTSTACK { 91 pgno_t bts_pgno; 92 struct BTSTACK *bts_next; 93 } BTSTACK; 94 95 /* 96 * Every btree page has a header that looks like this. Flags are given 97 * in the #define's for the F_ flags (see below). 98 */ 99 100 typedef struct BTHEADER { 101 pgno_t h_pgno; /* page number of this page */ 102 pgno_t h_prevpg; /* left sibling */ 103 pgno_t h_nextpg; /* right sibling */ 104 105 #define F_LEAF 0x01 /* leaf page, contains user data */ 106 #define F_CONT 0x02 /* continuation page (large items) */ 107 #define F_DIRTY 0x04 /* need to write to disk */ 108 #define F_PRESERVE 0x08 /* never delete this chain of pages */ 109 110 u_long h_flags; /* page state */ 111 index_t h_lower; /* lower bound of free space on page */ 112 index_t h_upper; /* upper bound of free space on page */ 113 index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 114 } BTHEADER; 115 116 /* 117 * HTBUCKETs are hash table buckets for looking up pages of in-memory 118 * btrees by page number. We use this indirection, rather than direct 119 * pointers, so that the code for manipulating in-memory trees is the 120 * same as that for manipulating on-disk trees. 121 */ 122 123 typedef struct HTBUCKET { 124 pgno_t ht_pgno; 125 BTHEADER *ht_page; 126 struct HTBUCKET *ht_next; 127 } HTBUCKET; 128 129 typedef HTBUCKET **HTABLE; 130 131 /* minimum size we'll let a page be */ 132 #define MINPSIZE 512 133 134 /* default cache size, in bytes */ 135 #define DEFCACHE (20 * 1024) 136 137 /* hash table size for in-memory trees */ 138 #define HTSIZE 128 139 140 /* generate a hash key from a page number */ 141 #define HASHKEY(pgno) ((pgno - 1) % HTSIZE) 142 143 /* 144 * Disk btrees have a file descriptor, and may also have an lru buffer 145 * cache, if the user asked for one. 146 */ 147 148 typedef struct BTDISK { 149 int d_fd; 150 char *d_cache; 151 } BTDISK; 152 153 /* 154 * Cursors keep track of the current location in a sequential scan of 155 * the database. Since btrees impose a total ordering on keys, we can 156 * walk forward or backward through the database from any point. Cursors 157 * survive updates to the tree, and can be used to delete a particular 158 * record. 159 */ 160 161 typedef struct CURSOR { 162 pgno_t c_pgno; /* pgno of current item in scan */ 163 index_t c_index; /* index of current item in scan */ 164 char *c_key; /* current key, used for updates */ 165 166 #define CRSR_BEFORE 0x01 167 168 u_char c_flags; /* to handle updates properly */ 169 } CURSOR; 170 171 /* 172 * The private btree data structure. The user passes a pointer to one of 173 * these when we are to manipulate a tree, but the BTREE type is opaque 174 * to him. 175 */ 176 177 typedef struct BTREEDATA_P { 178 char *bt_fname; /* NULL for in-memory trees */ 179 union { 180 BTDISK bt_d; /* for on-disk btrees */ 181 HTABLE bt_ht; /* hash table for mem trees */ 182 } bt_s; 183 size_t bt_psize; /* page size for btree pages */ 184 int (*bt_compare)(); /* key comparison function */ 185 pgno_t bt_npages; /* number of pages in tree */ 186 BTHEADER *bt_curpage; /* current page contents */ 187 pgno_t bt_free; /* free pg list for big data */ 188 CURSOR bt_cursor; /* cursor for scans */ 189 BTSTACK *bt_stack; /* parent stack for inserts */ 190 u_long bt_lorder; /* byte order (endian.h) */ 191 192 #define BTF_METAOK 0x01 /* meta-data written to start of file */ 193 #define BTF_SEQINIT 0x02 /* we have called bt_seq */ 194 #define BTF_ISWRITE 0x04 /* tree was opened for write */ 195 #define BTF_NODUPS 0x08 /* tree created for unique keys */ 196 197 u_long bt_flags; /* btree state */ 198 } BTREEDATA_P; 199 200 typedef BTREEDATA_P *BTREE_P; 201 202 /* 203 * The first thing in a btree file is a BTMETA structure. The rest of 204 * the first page is empty, so that all disk operations are page-aligned. 205 */ 206 207 typedef struct BTMETA { 208 u_long m_magic; 209 u_long m_version; 210 size_t m_psize; 211 pgno_t m_free; 212 u_long m_flags; 213 u_long m_lorder; 214 } BTMETA; 215 216 #define P_NONE 0 /* invalid page number in tree */ 217 #define P_ROOT 1 /* page number of root pg in btree */ 218 219 #define NORELEASE 0 /* don't release a page during write */ 220 #define RELEASE 1 /* release a page during write */ 221 222 #define INSERT 0 /* doing an insert operation */ 223 #define DELETE 1 /* doing a delete operation */ 224 225 /* get the next free index on a btree page */ 226 #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t))) 227 228 /* is a BTITEM actually on the btree page? */ 229 #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage)) 230 231 /* guarantee longword alignment so structure refs work */ 232 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03) 233 234 /* get a particular datum (or idatum) off a page */ 235 #define GETDATUM(h,i) (((char *) h) + h->h_linp[i]) 236 237 /* is a {key,datum} too big to put on a single page? */ 238 #define TOOBIG(t, sz) (sz >= t->bt_psize / 5) 239 240 /* is this a disk tree or a memory tree? */ 241 #define ISDISK(t) (t->bt_fname != (char *) NULL) 242 243 /* does the disk tree use a cache? */ 244 #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL) 245 246 /* 247 * DATUMs are for user data -- one appears on leaf pages for every 248 * tree entry. The d_bytes[] array contains the key first, then the data. 249 * 250 * If either the key or the datum is too big to store on a single page, 251 * a bit is set in the flags entry, and the d_bytes[] array contains a 252 * pgno pointing to the page at which the data is actually stored. 253 * 254 * Note on alignment: every DATUM is guaranteed to be longword aligned 255 * on the disk page. In order to force longword alignment of user key 256 * and data values, we must guarantee that the d_bytes[] array starts 257 * on a longword boundary. This is the reason that d_flags is a u_long, 258 * rather than a u_char (it really only needs to be two bits big). This 259 * is necessary because we call the user's comparison function with a 260 * pointer to the start of the d_bytes array. We don't need to force 261 * longword alignment of the data following the key, since that is copied 262 * to a longword-aligned buffer before being returned to the user. 263 */ 264 265 typedef struct DATUM { 266 size_t d_ksize; /* size of key */ 267 size_t d_dsize; /* size of data */ 268 269 #define D_BIGDATA 0x01 /* indirect datum ptr flag */ 270 #define D_BIGKEY 0x02 /* indirect key ptr flag */ 271 272 u_long d_flags; /* flags (indirect bit) */ 273 char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 274 } DATUM; 275 276 /* BTITEMs are used to return (page, index, datum) tuples from searches */ 277 typedef struct BTITEM { 278 pgno_t bti_pgno; 279 index_t bti_index; 280 DATUM *bti_datum; 281 } BTITEM; 282 283 /* 284 * IDATUMs are for data stored on internal pages. This is the (key, pgno) 285 * pair, such that key 'key' is the first entry on page 'pgno'. If our 286 * internal page contains keys (a) and (b) next to each other, then all 287 * items >= to (a) and < (b) go on the same page as (a). There are some 288 * gotchas with duplicate keys, however. See the split code for details. 289 * 290 * If a key is too big to fit on a single page, then the i_bytes[] array 291 * contains a pgno pointing to the start of a chain that actually stores 292 * the bytes. Since items on internal pages are never deleted from the 293 * tree, these indirect chains are marked as special, so that they won't 294 * be deleted if the corresponding leaf item is deleted. 295 * 296 * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char) 297 * in order to guarantee that user keys are longword aligned on the disk 298 * page. 299 */ 300 301 typedef struct IDATUM { 302 size_t i_size; 303 pgno_t i_pgno; 304 u_long i_flags; /* see DATUM.d_flags, above */ 305 char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 306 } IDATUM; 307 308 /* all private interfaces have a leading _ in their names */ 309 extern BTITEM *_bt_search(); 310 extern BTITEM *_bt_searchr(); 311 extern BTHEADER *_bt_allocpg(); 312 extern index_t _bt_binsrch(); 313 extern int _bt_isonpage(); 314 extern BTITEM *_bt_first(); 315 extern int _bt_release(); 316 extern int _bt_wrtmeta(); 317 extern int _bt_delindir(); 318 extern int _bt_pgout(); 319 extern int _bt_pgin(); 320 extern int _bt_fixscan(); 321 extern int _bt_indirect(); 322 extern int _bt_crsrdel(); 323 extern int _bt_push(); 324 extern pgno_t _bt_pop(); 325 extern int strcmp(); 326 327