1*46139Smao /*- 2*46139Smao * Copyright (c) 1990 The Regents of the University of California. 3*46139Smao * All rights reserved. 4*46139Smao * 5*46139Smao * This code is derived from software contributed to Berkeley by 6*46139Smao * Mike Olson. 7*46139Smao * 8*46139Smao * %sccs.include.redist.c% 9*46139Smao */ 10*46139Smao 11*46139Smao /* 12*46139Smao * @(#)btree.h 5.1 (Berkeley) 01/23/91 13*46139Smao */ 14*46139Smao 15*46139Smao typedef char *BTREE; /* should really be (void *) */ 16*46139Smao 17*46139Smao /* #define DEBUG */ 18*46139Smao 19*46139Smao #define RET_ERROR -1 20*46139Smao #define RET_SUCCESS 0 21*46139Smao #define RET_SPECIAL 1 22*46139Smao 23*46139Smao #ifndef TRUE 24*46139Smao #define TRUE 1 25*46139Smao #define FALSE 0 26*46139Smao #endif /* ndef TRUE */ 27*46139Smao 28*46139Smao #ifndef NULL 29*46139Smao #define NULL 0 30*46139Smao #endif /* ndef NULL */ 31*46139Smao 32*46139Smao /* libc */ 33*46139Smao extern char *malloc(); 34*46139Smao 35*46139Smao /* these are defined in lrucache.c */ 36*46139Smao extern char *lruinit(); 37*46139Smao extern char *lruget(); 38*46139Smao extern char *lrugetnew(); 39*46139Smao extern int lrusync(); 40*46139Smao extern int lruwrite(); 41*46139Smao extern int lrurelease(); 42*46139Smao extern void lrufree(); 43*46139Smao 44*46139Smao /* these are defined here */ 45*46139Smao extern BTREE bt_open(); 46*46139Smao extern int bt_close(); 47*46139Smao extern int bt_delete(); 48*46139Smao extern int bt_get(); 49*46139Smao extern int bt_put(); 50*46139Smao extern int bt_seq(); 51*46139Smao extern int bt_sync(); 52*46139Smao 53*46139Smao /* 54*46139Smao * Private types. What you choose for these depends on how big you 55*46139Smao * want to let files get, and how big you want to let pages get. 56*46139Smao */ 57*46139Smao 58*46139Smao typedef u_long index_t; /* so # bytes on a page fits in a long */ 59*46139Smao typedef u_long pgno_t; /* so # of pages in a btree fits in a long */ 60*46139Smao 61*46139Smao /* 62*46139Smao * When we do searches, we push the parent page numbers onto a stack 63*46139Smao * as we descend the tree. This is so that for insertions, we can 64*46139Smao * find our way back up to do internal page insertions and splits. 65*46139Smao */ 66*46139Smao 67*46139Smao typedef struct BTSTACK { 68*46139Smao pgno_t bts_pgno; 69*46139Smao struct BTSTACK *bts_next; 70*46139Smao } BTSTACK; 71*46139Smao 72*46139Smao /* 73*46139Smao * Every btree page has a header that looks like this. Flags are given 74*46139Smao * in the #define's for the F_ flags (see below). 75*46139Smao */ 76*46139Smao 77*46139Smao typedef struct BTHEADER { 78*46139Smao pgno_t h_pgno; /* page number of this page */ 79*46139Smao pgno_t h_prevpg; /* left sibling */ 80*46139Smao pgno_t h_nextpg; /* right sibling */ 81*46139Smao 82*46139Smao #define F_LEAF 0x01 /* leaf page, contains user data */ 83*46139Smao #define F_CONT 0x02 /* continuation page (large items) */ 84*46139Smao #define F_DIRTY 0x04 /* need to write to disk */ 85*46139Smao #define F_PRESERVE 0x08 /* never delete this chain of pages */ 86*46139Smao 87*46139Smao u_long h_flags; /* page state */ 88*46139Smao index_t h_lower; /* lower bound of free space on page */ 89*46139Smao index_t h_upper; /* upper bound of free space on page */ 90*46139Smao index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 91*46139Smao } BTHEADER; 92*46139Smao 93*46139Smao /* 94*46139Smao * HTBUCKETs are hash table buckets for looking up pages of in-memory 95*46139Smao * btrees by page number. We use this indirection, rather than direct 96*46139Smao * pointers, so that the code for manipulating in-memory trees is the 97*46139Smao * same as that for manipulating on-disk trees. 98*46139Smao */ 99*46139Smao 100*46139Smao typedef struct HTBUCKET { 101*46139Smao pgno_t ht_pgno; 102*46139Smao BTHEADER *ht_page; 103*46139Smao struct HTBUCKET *ht_next; 104*46139Smao } HTBUCKET; 105*46139Smao 106*46139Smao typedef HTBUCKET **HTABLE; 107*46139Smao 108*46139Smao /* minimum size we'll let a page be */ 109*46139Smao #define MINPSIZE 512 110*46139Smao 111*46139Smao /* default cache size, in bytes */ 112*46139Smao #define DEFCACHE (20 * 1024) 113*46139Smao 114*46139Smao /* hash table size for in-memory trees */ 115*46139Smao #define HTSIZE 128 116*46139Smao 117*46139Smao /* generate a hash key from a page number */ 118*46139Smao #define HASHKEY(pgno) ((pgno - 1) % HTSIZE) 119*46139Smao 120*46139Smao /* 121*46139Smao * Disk btrees have a file descriptor, and may also have an lru buffer 122*46139Smao * cache, if the user asked for one. 123*46139Smao */ 124*46139Smao 125*46139Smao typedef struct BTDISK { 126*46139Smao int d_fd; 127*46139Smao char *d_cache; 128*46139Smao } BTDISK; 129*46139Smao 130*46139Smao /* 131*46139Smao * Cursors keep track of the current location in a sequential scan of 132*46139Smao * the database. Since btrees impose a total ordering on keys, we can 133*46139Smao * walk forward or backward through the database from any point. Cursors 134*46139Smao * survive updates to the tree, and can be used to delete a particular 135*46139Smao * record. 136*46139Smao */ 137*46139Smao 138*46139Smao typedef struct CURSOR { 139*46139Smao pgno_t c_pgno; /* pgno of current item in scan */ 140*46139Smao index_t c_index; /* index of current item in scan */ 141*46139Smao char *c_key; /* current key, used for updates */ 142*46139Smao 143*46139Smao #define CRSR_BEFORE 0x01 144*46139Smao 145*46139Smao u_char c_flags; /* to handle updates properly */ 146*46139Smao } CURSOR; 147*46139Smao 148*46139Smao /* 149*46139Smao * The private btree data structure. The user passes a pointer to one of 150*46139Smao * these when we are to manipulate a tree, but the BTREE type is opaque 151*46139Smao * to him. 152*46139Smao */ 153*46139Smao 154*46139Smao typedef struct BTREEDATA_P { 155*46139Smao char *bt_fname; /* NULL for in-memory trees */ 156*46139Smao union { 157*46139Smao BTDISK bt_d; /* for on-disk btrees */ 158*46139Smao HTABLE bt_ht; /* hash table for mem trees */ 159*46139Smao } bt_s; 160*46139Smao size_t bt_psize; /* page size for btree pages */ 161*46139Smao int (*bt_compare)(); /* key comparison function */ 162*46139Smao pgno_t bt_npages; /* number of pages in tree */ 163*46139Smao BTHEADER *bt_curpage; /* current page contents */ 164*46139Smao pgno_t bt_free; /* free pg list for big data */ 165*46139Smao CURSOR bt_cursor; /* cursor for scans */ 166*46139Smao BTSTACK *bt_stack; /* parent stack for inserts */ 167*46139Smao u_long bt_lorder; /* byte order (endian.h) */ 168*46139Smao 169*46139Smao #define BTF_METAOK 0x01 /* meta-data written to start of file */ 170*46139Smao #define BTF_SEQINIT 0x02 /* we have called bt_seq */ 171*46139Smao #define BTF_ISWRITE 0x04 /* tree was opened for write */ 172*46139Smao #define BTF_NODUPS 0x08 /* tree created for unique keys */ 173*46139Smao 174*46139Smao u_long bt_flags; /* btree state */ 175*46139Smao } BTREEDATA_P; 176*46139Smao 177*46139Smao typedef BTREEDATA_P *BTREE_P; 178*46139Smao 179*46139Smao /* 180*46139Smao * The first thing in a btree file is a BTMETA structure. The rest of 181*46139Smao * the first page is empty, so that all disk operations are page-aligned. 182*46139Smao */ 183*46139Smao 184*46139Smao typedef struct BTMETA { 185*46139Smao u_long m_magic; 186*46139Smao u_long m_version; 187*46139Smao size_t m_psize; 188*46139Smao pgno_t m_free; 189*46139Smao u_long m_flags; 190*46139Smao u_long m_lorder; 191*46139Smao } BTMETA; 192*46139Smao 193*46139Smao #define P_NONE 0 /* invalid page number in tree */ 194*46139Smao #define P_ROOT 1 /* page number of root pg in btree */ 195*46139Smao 196*46139Smao #define NORELEASE 0 /* don't release a page during write */ 197*46139Smao #define RELEASE 1 /* release a page during write */ 198*46139Smao 199*46139Smao #define INSERT 0 /* doing an insert operation */ 200*46139Smao #define DELETE 1 /* doing a delete operation */ 201*46139Smao 202*46139Smao /* get the next free index on a btree page */ 203*46139Smao #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t))) 204*46139Smao 205*46139Smao /* is a BTITEM actually on the btree page? */ 206*46139Smao #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage)) 207*46139Smao 208*46139Smao /* guarantee longword alignment so structure refs work */ 209*46139Smao #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03) 210*46139Smao 211*46139Smao /* get a particular datum (or idatum) off a page */ 212*46139Smao #define GETDATUM(h,i) (((char *) h) + h->h_linp[i]) 213*46139Smao 214*46139Smao /* is a {key,datum} too big to put on a single page? */ 215*46139Smao #define TOOBIG(t, sz) (sz >= t->bt_psize / 5) 216*46139Smao 217*46139Smao /* is this a disk tree or a memory tree? */ 218*46139Smao #define ISDISK(t) (t->bt_fname != (char *) NULL) 219*46139Smao 220*46139Smao /* does the disk tree use a cache? */ 221*46139Smao #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL) 222*46139Smao 223*46139Smao /* 224*46139Smao * DATUMs are for user data -- one appears on leaf pages for every 225*46139Smao * tree entry. The d_bytes[] array contains the key first, then the data. 226*46139Smao * 227*46139Smao * If either the key or the datum is too big to store on a single page, 228*46139Smao * a bit is set in the flags entry, and the d_bytes[] array contains a 229*46139Smao * pgno pointing to the page at which the data is actually stored. 230*46139Smao * 231*46139Smao * Note on alignment: every DATUM is guaranteed to be longword aligned 232*46139Smao * on the disk page. In order to force longword alignment of user key 233*46139Smao * and data values, we must guarantee that the d_bytes[] array starts 234*46139Smao * on a longword boundary. This is the reason that d_flags is a u_long, 235*46139Smao * rather than a u_char (it really only needs to be two bits big). This 236*46139Smao * is necessary because we call the user's comparison function with a 237*46139Smao * pointer to the start of the d_bytes array. We don't need to force 238*46139Smao * longword alignment of the data following the key, since that is copied 239*46139Smao * to a longword-aligned buffer before being returned to the user. 240*46139Smao */ 241*46139Smao 242*46139Smao typedef struct DATUM { 243*46139Smao size_t d_ksize; /* size of key */ 244*46139Smao size_t d_dsize; /* size of data */ 245*46139Smao 246*46139Smao #define D_BIGDATA 0x01 /* indirect datum ptr flag */ 247*46139Smao #define D_BIGKEY 0x02 /* indirect key ptr flag */ 248*46139Smao 249*46139Smao u_long d_flags; /* flags (indirect bit) */ 250*46139Smao char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 251*46139Smao } DATUM; 252*46139Smao 253*46139Smao /* BTITEMs are used to return (page, index, datum) tuples from searches */ 254*46139Smao typedef struct BTITEM { 255*46139Smao pgno_t bti_pgno; 256*46139Smao index_t bti_index; 257*46139Smao DATUM *bti_datum; 258*46139Smao } BTITEM; 259*46139Smao 260*46139Smao /* 261*46139Smao * IDATUMs are for data stored on internal pages. This is the (key, pgno) 262*46139Smao * pair, such that key 'key' is the first entry on page 'pgno'. If our 263*46139Smao * internal page contains keys (a) and (b) next to each other, then all 264*46139Smao * items >= to (a) and < (b) go on the same page as (a). There are some 265*46139Smao * gotchas with duplicate keys, however. See the split code for details. 266*46139Smao * 267*46139Smao * If a key is too big to fit on a single page, then the i_bytes[] array 268*46139Smao * contains a pgno pointing to the start of a chain that actually stores 269*46139Smao * the bytes. Since items on internal pages are never deleted from the 270*46139Smao * tree, these indirect chains are marked as special, so that they won't 271*46139Smao * be deleted if the corresponding leaf item is deleted. 272*46139Smao * 273*46139Smao * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char) 274*46139Smao * in order to guarantee that user keys are longword aligned on the disk 275*46139Smao * page. 276*46139Smao */ 277*46139Smao 278*46139Smao typedef struct IDATUM { 279*46139Smao size_t i_size; 280*46139Smao pgno_t i_pgno; 281*46139Smao u_long i_flags; /* see DATUM.d_flags, above */ 282*46139Smao char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 283*46139Smao } IDATUM; 284*46139Smao 285*46139Smao /* all private interfaces have a leading _ in their names */ 286*46139Smao extern BTITEM *_bt_search(); 287*46139Smao extern BTITEM *_bt_searchr(); 288*46139Smao extern BTHEADER *_bt_allocpg(); 289*46139Smao extern index_t _bt_binsrch(); 290*46139Smao extern int _bt_isonpage(); 291*46139Smao extern BTITEM *_bt_first(); 292*46139Smao extern int _bt_release(); 293*46139Smao extern int _bt_wrtmeta(); 294*46139Smao extern int _bt_delindir(); 295*46139Smao extern int _bt_pgout(); 296*46139Smao extern int _bt_pgin(); 297*46139Smao extern int _bt_fixscan(); 298*46139Smao extern int _bt_indirect(); 299*46139Smao extern int _bt_crsrdel(); 300*46139Smao extern int _bt_push(); 301*46139Smao extern pgno_t _bt_pop(); 302*46139Smao extern int strcmp(); 303*46139Smao 304