1*45876Sbostic /*- 2*45876Sbostic * Copyright (c) 1990 The Regents of the University of California. 3*45876Sbostic * All rights reserved. 4*45876Sbostic * 5*45876Sbostic * This code is derived from software contributed to Berkeley by 6*45876Sbostic * Mike Olson. 7*45876Sbostic * 8*45876Sbostic * %sccs.include.redist.c% 9*45876Sbostic */ 10*45876Sbostic 11*45876Sbostic #if defined(LIBC_SCCS) && !defined(lint) 12*45876Sbostic static char sccsid[] = "@(#)bt_open.c 5.1 (Berkeley) 01/07/91"; 13*45876Sbostic #endif /* LIBC_SCCS and not lint */ 14*45876Sbostic 15*45876Sbostic /* 16*45876Sbostic * btree.c -- implementation of btree access method for 4.4BSD. 17*45876Sbostic * 18*45876Sbostic * The design here is based on that of the btree access method used 19*45876Sbostic * in the Postgres database system at UC Berkeley. The implementation 20*45876Sbostic * is wholly independent of the Postgres code. 21*45876Sbostic * 22*45876Sbostic * This implementation supports btrees on disk (supply a filename) or 23*45876Sbostic * in memory (don't). Public interfaces defined here are: 24*45876Sbostic * 25*45876Sbostic * btree_open() -- wrapper; returns a filled DB struct for 26*45876Sbostic * a btree. 27*45876Sbostic * 28*45876Sbostic * bt_open() -- open a new or existing btree. 29*45876Sbostic * bt_get() -- fetch data from a tree by key. 30*45876Sbostic * bt_seq() -- do a sequential scan on a tree. 31*45876Sbostic * bt_put() -- add data to a tree by key. 32*45876Sbostic * bt_delete() -- remove data from a tree by key. 33*45876Sbostic * bt_close() -- close a btree. 34*45876Sbostic * bt_sync() -- force btree pages to disk (disk trees only). 35*45876Sbostic */ 36*45876Sbostic 37*45876Sbostic #include <sys/param.h> 38*45876Sbostic #include <sys/file.h> 39*45876Sbostic #include <sys/stat.h> 40*45876Sbostic #include <sys/signal.h> 41*45876Sbostic #include <sys/errno.h> 42*45876Sbostic #include <db.h> 43*45876Sbostic 44*45876Sbostic #ifndef BLSWAP 45*45876Sbostic #define BLSWAP(a) { \ 46*45876Sbostic register unsigned long _t; \ 47*45876Sbostic _t = (a); \ 48*45876Sbostic (a) >>= 24; \ 49*45876Sbostic (a) |= (_t & 0x00ff0000) >> 8; \ 50*45876Sbostic (a) |= (_t & 0x0000ff00) << 8; \ 51*45876Sbostic (a) |= (_t & 0x000000ff) << 24; \ 52*45876Sbostic } 53*45876Sbostic #endif /* ndef BLSWAP */ 54*45876Sbostic 55*45876Sbostic typedef char *BTREE; /* should really be (void *) */ 56*45876Sbostic 57*45876Sbostic /* #define DEBUG */ 58*45876Sbostic 59*45876Sbostic #define RET_ERROR -1 60*45876Sbostic #define RET_SUCCESS 0 61*45876Sbostic #define RET_SPECIAL 1 62*45876Sbostic 63*45876Sbostic #ifndef TRUE 64*45876Sbostic #define TRUE 1 65*45876Sbostic #define FALSE 0 66*45876Sbostic #endif /* ndef TRUE */ 67*45876Sbostic 68*45876Sbostic extern int errno; 69*45876Sbostic 70*45876Sbostic /* libc */ 71*45876Sbostic extern char *malloc(); 72*45876Sbostic 73*45876Sbostic /* these are defined in lrucache.c */ 74*45876Sbostic extern char *lruinit(); 75*45876Sbostic extern char *lruget(); 76*45876Sbostic extern char *lrugetnew(); 77*45876Sbostic extern int lrusync(); 78*45876Sbostic extern int lruwrite(); 79*45876Sbostic extern int lrurelease(); 80*45876Sbostic extern void lrufree(); 81*45876Sbostic 82*45876Sbostic /* these are defined here */ 83*45876Sbostic BTREE bt_open(); 84*45876Sbostic int bt_close(); 85*45876Sbostic int bt_delete(); 86*45876Sbostic int bt_get(); 87*45876Sbostic int bt_put(); 88*45876Sbostic int bt_seq(); 89*45876Sbostic int bt_sync(); 90*45876Sbostic 91*45876Sbostic /* 92*45876Sbostic * Private types. What you choose for these depends on how big you 93*45876Sbostic * want to let files get, and how big you want to let pages get. 94*45876Sbostic */ 95*45876Sbostic 96*45876Sbostic typedef u_long index_t; /* so # bytes on a page fits in a long */ 97*45876Sbostic typedef u_long pgno_t; /* so # of pages in a btree fits in a long */ 98*45876Sbostic 99*45876Sbostic /* 100*45876Sbostic * When we do searches, we push the parent page numbers onto a stack 101*45876Sbostic * as we descend the tree. This is so that for insertions, we can 102*45876Sbostic * find our way back up to do internal page insertions and splits. 103*45876Sbostic */ 104*45876Sbostic 105*45876Sbostic typedef struct BTSTACK { 106*45876Sbostic pgno_t bts_pgno; 107*45876Sbostic struct BTSTACK *bts_next; 108*45876Sbostic } BTSTACK; 109*45876Sbostic 110*45876Sbostic /* 111*45876Sbostic * Every btree page has a header that looks like this. Flags are given 112*45876Sbostic * in the #define's for the F_ flags (see below). 113*45876Sbostic */ 114*45876Sbostic 115*45876Sbostic typedef struct BTHEADER { 116*45876Sbostic pgno_t h_pgno; /* page number of this page */ 117*45876Sbostic pgno_t h_prevpg; /* left sibling */ 118*45876Sbostic pgno_t h_nextpg; /* right sibling */ 119*45876Sbostic 120*45876Sbostic #define F_LEAF 0x01 /* leaf page, contains user data */ 121*45876Sbostic #define F_CONT 0x02 /* continuation page (large items) */ 122*45876Sbostic #define F_DIRTY 0x04 /* need to write to disk */ 123*45876Sbostic #define F_PRESERVE 0x08 /* never delete this chain of pages */ 124*45876Sbostic 125*45876Sbostic u_long h_flags; /* page state */ 126*45876Sbostic index_t h_lower; /* lower bound of free space on page */ 127*45876Sbostic index_t h_upper; /* upper bound of free space on page */ 128*45876Sbostic index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 129*45876Sbostic } BTHEADER; 130*45876Sbostic 131*45876Sbostic /* 132*45876Sbostic * HTBUCKETs are hash table buckets for looking up pages of in-memory 133*45876Sbostic * btrees by page number. We use this indirection, rather than direct 134*45876Sbostic * pointers, so that the code for manipulating in-memory trees is the 135*45876Sbostic * same as that for manipulating on-disk trees. 136*45876Sbostic */ 137*45876Sbostic 138*45876Sbostic typedef struct HTBUCKET { 139*45876Sbostic pgno_t ht_pgno; 140*45876Sbostic BTHEADER *ht_page; 141*45876Sbostic struct HTBUCKET *ht_next; 142*45876Sbostic } HTBUCKET; 143*45876Sbostic 144*45876Sbostic typedef HTBUCKET **HTABLE; 145*45876Sbostic 146*45876Sbostic /* minimum size we'll let a page be */ 147*45876Sbostic #define MINPSIZE 512 148*45876Sbostic 149*45876Sbostic /* default cache size, in bytes */ 150*45876Sbostic #define DEFCACHE (20 * 1024) 151*45876Sbostic 152*45876Sbostic /* hash table size for in-memory trees */ 153*45876Sbostic #define HTSIZE 128 154*45876Sbostic 155*45876Sbostic /* generate a hash key from a page number */ 156*45876Sbostic #define HASHKEY(pgno) ((pgno - 1) % HTSIZE) 157*45876Sbostic 158*45876Sbostic /* 159*45876Sbostic * Disk btrees have a file descriptor, and may also have an lru buffer 160*45876Sbostic * cache, if the user asked for one. 161*45876Sbostic */ 162*45876Sbostic 163*45876Sbostic typedef struct BTDISK { 164*45876Sbostic int d_fd; 165*45876Sbostic char *d_cache; 166*45876Sbostic } BTDISK; 167*45876Sbostic 168*45876Sbostic /* 169*45876Sbostic * Cursors keep track of the current location in a sequential scan of 170*45876Sbostic * the database. Since btrees impose a total ordering on keys, we can 171*45876Sbostic * walk forward or backward through the database from any point. Cursors 172*45876Sbostic * survive updates to the tree, and can be used to delete a particular 173*45876Sbostic * record. 174*45876Sbostic */ 175*45876Sbostic 176*45876Sbostic typedef struct CURSOR { 177*45876Sbostic pgno_t c_pgno; /* pgno of current item in scan */ 178*45876Sbostic index_t c_index; /* index of current item in scan */ 179*45876Sbostic char *c_key; /* current key, used for updates */ 180*45876Sbostic 181*45876Sbostic #define CRSR_BEFORE 0x01 182*45876Sbostic 183*45876Sbostic u_char c_flags; /* to handle updates properly */ 184*45876Sbostic } CURSOR; 185*45876Sbostic 186*45876Sbostic /* 187*45876Sbostic * The private btree data structure. The user passes a pointer to one of 188*45876Sbostic * these when we are to manipulate a tree, but the BTREE type is opaque 189*45876Sbostic * to him. 190*45876Sbostic */ 191*45876Sbostic 192*45876Sbostic typedef struct BTREEDATA_P { 193*45876Sbostic char *bt_fname; /* NULL for in-memory trees */ 194*45876Sbostic union { 195*45876Sbostic BTDISK bt_d; /* for on-disk btrees */ 196*45876Sbostic HTABLE bt_ht; /* hash table for mem trees */ 197*45876Sbostic } bt_s; 198*45876Sbostic size_t bt_psize; /* page size for btree pages */ 199*45876Sbostic int (*bt_compare)(); /* key comparison function */ 200*45876Sbostic pgno_t bt_npages; /* number of pages in tree */ 201*45876Sbostic BTHEADER *bt_curpage; /* current page contents */ 202*45876Sbostic pgno_t bt_free; /* free pg list for big data */ 203*45876Sbostic CURSOR bt_cursor; /* cursor for scans */ 204*45876Sbostic BTSTACK *bt_stack; /* parent stack for inserts */ 205*45876Sbostic u_long bt_lorder; /* byte order (endian.h) */ 206*45876Sbostic 207*45876Sbostic #define BTF_METAOK 0x01 /* meta-data written to start of file */ 208*45876Sbostic #define BTF_SEQINIT 0x02 /* we have called bt_seq */ 209*45876Sbostic #define BTF_ISWRITE 0x04 /* tree was opened for write */ 210*45876Sbostic #define BTF_NODUPS 0x08 /* tree created for unique keys */ 211*45876Sbostic 212*45876Sbostic u_long bt_flags; /* btree state */ 213*45876Sbostic } BTREEDATA_P; 214*45876Sbostic 215*45876Sbostic typedef BTREEDATA_P *BTREE_P; 216*45876Sbostic 217*45876Sbostic /* 218*45876Sbostic * The first thing in a btree file is a BTMETA structure. The rest of 219*45876Sbostic * the first page is empty, so that all disk operations are page-aligned. 220*45876Sbostic */ 221*45876Sbostic 222*45876Sbostic typedef struct BTMETA { 223*45876Sbostic u_long m_magic; 224*45876Sbostic u_long m_version; 225*45876Sbostic size_t m_psize; 226*45876Sbostic pgno_t m_free; 227*45876Sbostic u_long m_flags; 228*45876Sbostic u_long m_lorder; 229*45876Sbostic } BTMETA; 230*45876Sbostic 231*45876Sbostic #define P_NONE 0 /* invalid page number in tree */ 232*45876Sbostic #define P_ROOT 1 /* page number of root pg in btree */ 233*45876Sbostic 234*45876Sbostic #define NORELEASE 0 /* don't release a page during write */ 235*45876Sbostic #define RELEASE 1 /* release a page during write */ 236*45876Sbostic 237*45876Sbostic #define INSERT 0 /* doing an insert operation */ 238*45876Sbostic #define DELETE 1 /* doing a delete operation */ 239*45876Sbostic 240*45876Sbostic /* magic number for identifying btree files */ 241*45876Sbostic #define BTREEMAGIC 053162L /* magic */ 242*45876Sbostic #define BTREEVERSION 2 /* last changed 6 jan 1991 */ 243*45876Sbostic 244*45876Sbostic /* get the next free index on a btree page */ 245*45876Sbostic #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t))) 246*45876Sbostic 247*45876Sbostic /* is a BTITEM actually on the btree page? */ 248*45876Sbostic #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage)) 249*45876Sbostic 250*45876Sbostic /* guarantee longword alignment so structure refs work */ 251*45876Sbostic #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03) 252*45876Sbostic 253*45876Sbostic /* get a particular datum (or idatum) off a page */ 254*45876Sbostic #define GETDATUM(h,i) (((char *) h) + h->h_linp[i]) 255*45876Sbostic 256*45876Sbostic /* is a {key,datum} too big to put on a single page? */ 257*45876Sbostic #define TOOBIG(t, sz) (sz >= t->bt_psize / 5) 258*45876Sbostic 259*45876Sbostic /* is this a disk tree or a memory tree? */ 260*45876Sbostic #define ISDISK(t) (t->bt_fname != (char *) NULL) 261*45876Sbostic 262*45876Sbostic /* does the disk tree use a cache? */ 263*45876Sbostic #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL) 264*45876Sbostic 265*45876Sbostic /* 266*45876Sbostic * DATUMs are for user data -- one appears on leaf pages for every 267*45876Sbostic * tree entry. The d_bytes[] array contains the key first, then the data. 268*45876Sbostic * 269*45876Sbostic * If either the key or the datum is too big to store on a single page, 270*45876Sbostic * a bit is set in the flags entry, and the d_bytes[] array contains a 271*45876Sbostic * pgno pointing to the page at which the data is actually stored. 272*45876Sbostic * 273*45876Sbostic * Note on alignment: every DATUM is guaranteed to be longword aligned 274*45876Sbostic * on the disk page. In order to force longword alignment of user key 275*45876Sbostic * and data values, we must guarantee that the d_bytes[] array starts 276*45876Sbostic * on a longword boundary. This is the reason that d_flags is a u_long, 277*45876Sbostic * rather than a u_char (it really only needs to be two bits big). This 278*45876Sbostic * is necessary because we call the user's comparison function with a 279*45876Sbostic * pointer to the start of the d_bytes array. We don't need to force 280*45876Sbostic * longword alignment of the data following the key, since that is copied 281*45876Sbostic * to a longword-aligned buffer before being returned to the user. 282*45876Sbostic */ 283*45876Sbostic 284*45876Sbostic typedef struct DATUM { 285*45876Sbostic size_t d_ksize; /* size of key */ 286*45876Sbostic size_t d_dsize; /* size of data */ 287*45876Sbostic 288*45876Sbostic #define D_BIGDATA 0x01 /* indirect datum ptr flag */ 289*45876Sbostic #define D_BIGKEY 0x02 /* indirect key ptr flag */ 290*45876Sbostic 291*45876Sbostic u_long d_flags; /* flags (indirect bit) */ 292*45876Sbostic char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 293*45876Sbostic } DATUM; 294*45876Sbostic 295*45876Sbostic /* BTITEMs are used to return (page, index, datum) tuples from searches */ 296*45876Sbostic typedef struct BTITEM { 297*45876Sbostic pgno_t bti_pgno; 298*45876Sbostic index_t bti_index; 299*45876Sbostic DATUM *bti_datum; 300*45876Sbostic } BTITEM; 301*45876Sbostic 302*45876Sbostic /* 303*45876Sbostic * IDATUMs are for data stored on internal pages. This is the (key, pgno) 304*45876Sbostic * pair, such that key 'key' is the first entry on page 'pgno'. If our 305*45876Sbostic * internal page contains keys (a) and (b) next to each other, then all 306*45876Sbostic * items >= to (a) and < (b) go on the same page as (a). There are some 307*45876Sbostic * gotchas with duplicate keys, however. See the split code for details. 308*45876Sbostic * 309*45876Sbostic * If a key is too big to fit on a single page, then the i_bytes[] array 310*45876Sbostic * contains a pgno pointing to the start of a chain that actually stores 311*45876Sbostic * the bytes. Since items on internal pages are never deleted from the 312*45876Sbostic * tree, these indirect chains are marked as special, so that they won't 313*45876Sbostic * be deleted if the corresponding leaf item is deleted. 314*45876Sbostic * 315*45876Sbostic * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char) 316*45876Sbostic * in order to guarantee that user keys are longword aligned on the disk 317*45876Sbostic * page. 318*45876Sbostic */ 319*45876Sbostic 320*45876Sbostic typedef struct IDATUM { 321*45876Sbostic size_t i_size; 322*45876Sbostic pgno_t i_pgno; 323*45876Sbostic u_long i_flags; /* see DATUM.d_flags, above */ 324*45876Sbostic char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 325*45876Sbostic } IDATUM; 326*45876Sbostic 327*45876Sbostic /* all private interfaces have a leading _ in their names */ 328*45876Sbostic static BTITEM *_bt_search(); 329*45876Sbostic static BTITEM *_bt_searchr(); 330*45876Sbostic static BTHEADER *_bt_allocpg(); 331*45876Sbostic static index_t _bt_binsrch(); 332*45876Sbostic static int _bt_isonpage(); 333*45876Sbostic static BTITEM *_bt_first(); 334*45876Sbostic static int _bt_release(); 335*45876Sbostic static int _bt_wrtmeta(); 336*45876Sbostic static int _bt_delindir(); 337*45876Sbostic static int _bt_pgout(); 338*45876Sbostic static int _bt_pgin(); 339*45876Sbostic static int _bt_fixscan(); 340*45876Sbostic static int _bt_indirect(); 341*45876Sbostic static int _bt_crsrdel(); 342*45876Sbostic 343*45876Sbostic extern int strcmp(); 344*45876Sbostic 345*45876Sbostic static BTREEINFO _DefaultBTInfo = { 346*45876Sbostic 0, /* flags */ 347*45876Sbostic 0, /* cachesize */ 348*45876Sbostic 0, /* psize */ 349*45876Sbostic strcmp, /* compare */ 350*45876Sbostic 0 351*45876Sbostic }; 352*45876Sbostic 353*45876Sbostic /* 354*45876Sbostic * BTREE_OPEN -- Wrapper routine to open a btree. 355*45876Sbostic * 356*45876Sbostic * Creates and fills a DB struct, and calls the routine that actually 357*45876Sbostic * opens the btree. 358*45876Sbostic * 359*45876Sbostic * Parameters: 360*45876Sbostic * f: filename to open 361*45876Sbostic * flags: flag bits passed to open 362*45876Sbostic * mode: permission bits, used if O_CREAT specified 363*45876Sbostic * b: BTREEINFO pointer 364*45876Sbostic * 365*45876Sbostic * Returns: 366*45876Sbostic * Filled-in DBT on success; NULL on failure, with errno 367*45876Sbostic * set as appropriate. 368*45876Sbostic * 369*45876Sbostic * Side Effects: 370*45876Sbostic * Allocates memory for the DB struct. 371*45876Sbostic */ 372*45876Sbostic 373*45876Sbostic DB * 374*45876Sbostic btree_open(f, flags, mode, b) 375*45876Sbostic char *f; 376*45876Sbostic int flags; 377*45876Sbostic int mode; 378*45876Sbostic BTREEINFO *b; 379*45876Sbostic { 380*45876Sbostic DB *db; 381*45876Sbostic BTREE t; 382*45876Sbostic 383*45876Sbostic if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 384*45876Sbostic return ((DB *) NULL); 385*45876Sbostic 386*45876Sbostic if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 387*45876Sbostic (void) free ((char *) db); 388*45876Sbostic return ((DB *) NULL); 389*45876Sbostic } 390*45876Sbostic 391*45876Sbostic db->internal = (char *) t; 392*45876Sbostic db->close = bt_close; 393*45876Sbostic db->delete = bt_delete; 394*45876Sbostic db->get = bt_get; 395*45876Sbostic db->put = bt_put; 396*45876Sbostic db->seq = bt_seq; 397*45876Sbostic db->sync = bt_sync; 398*45876Sbostic 399*45876Sbostic return (db); 400*45876Sbostic } 401*45876Sbostic 402*45876Sbostic /* 403*45876Sbostic * BT_OPEN -- Open a btree. 404*45876Sbostic * 405*45876Sbostic * This routine creates the correct kind (disk or in-memory) of 406*45876Sbostic * btree and initializes it as required. 407*45876Sbostic * 408*45876Sbostic * Parameters: 409*45876Sbostic * f -- name of btree (NULL for in-memory btrees) 410*45876Sbostic * flags -- flags passed to open() 411*45876Sbostic * mode -- mode passed to open () 412*45876Sbostic * b -- BTREEINFO structure, describing btree 413*45876Sbostic * 414*45876Sbostic * Returns: 415*45876Sbostic * (Opaque) pointer to the btree. On failure, returns NULL 416*45876Sbostic * with errno set as appropriate. 417*45876Sbostic * 418*45876Sbostic * Side Effects: 419*45876Sbostic * Allocates memory, opens files. 420*45876Sbostic */ 421*45876Sbostic 422*45876Sbostic BTREE 423*45876Sbostic bt_open(f, flags, mode, b) 424*45876Sbostic char *f; 425*45876Sbostic int flags; 426*45876Sbostic int mode; 427*45876Sbostic BTREEINFO *b; 428*45876Sbostic { 429*45876Sbostic BTREE_P t; 430*45876Sbostic HTABLE ht; 431*45876Sbostic int nbytes; 432*45876Sbostic int fd; 433*45876Sbostic CURSOR *c; 434*45876Sbostic BTMETA m; 435*45876Sbostic struct stat buf; 436*45876Sbostic 437*45876Sbostic /* use the default info if none was provided */ 438*45876Sbostic if (b == (BTREEINFO *) NULL) 439*45876Sbostic b = &_DefaultBTInfo; 440*45876Sbostic 441*45876Sbostic if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 442*45876Sbostic return ((BTREE) NULL); 443*45876Sbostic 444*45876Sbostic if (b->compare) 445*45876Sbostic t->bt_compare = b->compare; 446*45876Sbostic else 447*45876Sbostic t->bt_compare = strcmp; 448*45876Sbostic 449*45876Sbostic t->bt_fname = f; 450*45876Sbostic t->bt_curpage = (BTHEADER *) NULL; 451*45876Sbostic t->bt_free = P_NONE; 452*45876Sbostic c = &(t->bt_cursor); 453*45876Sbostic c->c_pgno = P_NONE; 454*45876Sbostic c->c_index = 0; 455*45876Sbostic c->c_flags = (u_char) NULL; 456*45876Sbostic c->c_key = (char *) NULL; 457*45876Sbostic t->bt_stack = (BTSTACK *) NULL; 458*45876Sbostic t->bt_flags = 0; 459*45876Sbostic 460*45876Sbostic /* 461*45876Sbostic * If no file name was supplied, this is an in-memory btree. 462*45876Sbostic * Otherwise, it's a disk-based btree. 463*45876Sbostic */ 464*45876Sbostic if (f == (char *) NULL) { 465*45876Sbostic /* in memory */ 466*45876Sbostic if ((t->bt_psize = b->psize) < MINPSIZE) { 467*45876Sbostic if (t->bt_psize != 0) { 468*45876Sbostic (void) free ((char *) t); 469*45876Sbostic errno = EINVAL; 470*45876Sbostic return ((BTREE) NULL); 471*45876Sbostic } 472*45876Sbostic t->bt_psize = getpagesize(); 473*45876Sbostic } 474*45876Sbostic 475*45876Sbostic nbytes = HTSIZE * sizeof(HTBUCKET *); 476*45876Sbostic if ((ht = (HTABLE) malloc((unsigned) nbytes)) 477*45876Sbostic == (HTABLE) NULL) { 478*45876Sbostic (void) free((char *) t); 479*45876Sbostic return ((BTREE) NULL); 480*45876Sbostic } 481*45876Sbostic (void) bzero((char *) ht, nbytes); 482*45876Sbostic t->bt_s.bt_ht = ht; 483*45876Sbostic t->bt_npages = 0; 484*45876Sbostic t->bt_lorder = BYTE_ORDER; 485*45876Sbostic if (!(b->flags & R_DUP)) 486*45876Sbostic t->bt_flags |= BTF_NODUPS; 487*45876Sbostic } else { 488*45876Sbostic /* on disk */ 489*45876Sbostic if ((fd = open(f, O_RDONLY, 0)) < 0) { 490*45876Sbostic /* doesn't exist yet, be sure page is big enough */ 491*45876Sbostic if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 492*45876Sbostic && b->psize != 0) { 493*45876Sbostic (void) free((char *) t); 494*45876Sbostic errno = EINVAL; 495*45876Sbostic return ((BTREE) NULL); 496*45876Sbostic } 497*45876Sbostic if (b->lorder == 0) 498*45876Sbostic b->lorder = BYTE_ORDER; 499*45876Sbostic 500*45876Sbostic if (b->lorder != BIG_ENDIAN 501*45876Sbostic && b->lorder != LITTLE_ENDIAN) { 502*45876Sbostic (void) free((char *) t); 503*45876Sbostic errno = EINVAL; 504*45876Sbostic return ((BTREE) NULL); 505*45876Sbostic } 506*45876Sbostic t->bt_lorder = b->lorder; 507*45876Sbostic if (!(b->flags & R_DUP)) 508*45876Sbostic t->bt_flags |= BTF_NODUPS; 509*45876Sbostic } else { 510*45876Sbostic /* exists, get page size from file */ 511*45876Sbostic if (read(fd, &m, sizeof(m)) < sizeof(m)) { 512*45876Sbostic (void) close(fd); 513*45876Sbostic (void) free((char *) t); 514*45876Sbostic errno = EINVAL; 515*45876Sbostic return ((BTREE) NULL); 516*45876Sbostic } 517*45876Sbostic 518*45876Sbostic /* lorder always stored in host-independent format */ 519*45876Sbostic NTOHL(m.m_lorder); 520*45876Sbostic if (m.m_lorder != BIG_ENDIAN 521*45876Sbostic && m.m_lorder != LITTLE_ENDIAN) { 522*45876Sbostic (void) free((char *) t); 523*45876Sbostic errno = EINVAL; 524*45876Sbostic return ((BTREE) NULL); 525*45876Sbostic } 526*45876Sbostic t->bt_lorder = m.m_lorder; 527*45876Sbostic 528*45876Sbostic if (t->bt_lorder != BYTE_ORDER) { 529*45876Sbostic BLSWAP(m.m_magic); 530*45876Sbostic BLSWAP(m.m_version); 531*45876Sbostic BLSWAP(m.m_psize); 532*45876Sbostic BLSWAP(m.m_free); 533*45876Sbostic BLSWAP(m.m_flags); 534*45876Sbostic } 535*45876Sbostic 536*45876Sbostic if (m.m_magic != BTREEMAGIC 537*45876Sbostic || m.m_version != BTREEVERSION 538*45876Sbostic || m.m_psize < MINPSIZE) { 539*45876Sbostic (void) close(fd); 540*45876Sbostic (void) free((char *) t); 541*45876Sbostic #ifndef EFTYPE 542*45876Sbostic #define EFTYPE -100 543*45876Sbostic #endif 544*45876Sbostic errno = EFTYPE; 545*45876Sbostic return ((BTREE) NULL); 546*45876Sbostic } 547*45876Sbostic t->bt_psize = m.m_psize; 548*45876Sbostic t->bt_free = m.m_free; 549*45876Sbostic t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 550*45876Sbostic (void) close(fd); 551*45876Sbostic } 552*45876Sbostic 553*45876Sbostic /* now open the file the way the user wanted it */ 554*45876Sbostic if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 555*45876Sbostic (void) free ((char *) t); 556*45876Sbostic return ((BTREE) NULL); 557*45876Sbostic } 558*45876Sbostic 559*45876Sbostic /* get number of pages, page size if necessary */ 560*45876Sbostic (void) fstat(t->bt_s.bt_d.d_fd, &buf); 561*45876Sbostic if (t->bt_psize == 0) 562*45876Sbostic t->bt_psize = buf.st_blksize; 563*45876Sbostic t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 564*45876Sbostic 565*45876Sbostic /* page zero is metadata, doesn't count */ 566*45876Sbostic if (t->bt_npages > 0) 567*45876Sbostic --(t->bt_npages); 568*45876Sbostic 569*45876Sbostic if (b->cachesize == 0) 570*45876Sbostic b->cachesize = DEFCACHE; 571*45876Sbostic 572*45876Sbostic /* get an lru buffer cache, if the user asked for one */ 573*45876Sbostic if ((b->cachesize / t->bt_psize) > 0) { 574*45876Sbostic BTDISK *d = &(t->bt_s.bt_d); 575*45876Sbostic 576*45876Sbostic d->d_cache = lruinit(d->d_fd, 577*45876Sbostic b->cachesize / t->bt_psize, 578*45876Sbostic t->bt_psize, 579*45876Sbostic (char *) t->bt_lorder, 580*45876Sbostic _bt_pgin, _bt_pgout); 581*45876Sbostic 582*45876Sbostic if (d->d_cache == (char *) NULL) { 583*45876Sbostic (void) free((char *) t); 584*45876Sbostic return ((BTREE) NULL); 585*45876Sbostic } 586*45876Sbostic } 587*45876Sbostic } 588*45876Sbostic 589*45876Sbostic /* remember if tree was opened for write */ 590*45876Sbostic if (((flags & O_WRONLY) == O_WRONLY) 591*45876Sbostic || ((flags & O_RDWR) == O_RDWR)) 592*45876Sbostic t->bt_flags |= BTF_ISWRITE; 593*45876Sbostic 594*45876Sbostic return ((BTREE) t); 595*45876Sbostic } 596*45876Sbostic 597*45876Sbostic /* 598*45876Sbostic * BT_GET -- Get an entry from a btree. 599*45876Sbostic * 600*45876Sbostic * Does a key lookup in the tree to find the specified key, and returns 601*45876Sbostic * the key/data pair found. 602*45876Sbostic * 603*45876Sbostic * Parameters: 604*45876Sbostic * tree -- btree in which to do lookup 605*45876Sbostic * key -- key to find 606*45876Sbostic * data -- pointer to DBT in which to return data 607*45876Sbostic * flag -- ignored 608*45876Sbostic * 609*45876Sbostic * Returns: 610*45876Sbostic * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 611*45876Sbostic * found. If key is not found, nothing is stored in the 612*45876Sbostic * return DBT 'data'. 613*45876Sbostic * 614*45876Sbostic * Side Effects: 615*45876Sbostic * None. 616*45876Sbostic * 617*45876Sbostic * Warnings: 618*45876Sbostic * Return data is statically allocated, and will be overwritten 619*45876Sbostic * at the next call. 620*45876Sbostic */ 621*45876Sbostic 622*45876Sbostic int 623*45876Sbostic bt_get(tree, key, data, flag) 624*45876Sbostic BTREE tree; 625*45876Sbostic DBT *key; 626*45876Sbostic DBT *data; 627*45876Sbostic int flag; 628*45876Sbostic { 629*45876Sbostic BTREE_P t = (BTREE_P) tree; 630*45876Sbostic BTHEADER *h; 631*45876Sbostic DATUM *d; 632*45876Sbostic BTITEM *item; 633*45876Sbostic 634*45876Sbostic /* lookup */ 635*45876Sbostic item = _bt_search(t, key); 636*45876Sbostic if (item == (BTITEM *) NULL) 637*45876Sbostic return (RET_ERROR); 638*45876Sbostic 639*45876Sbostic /* clear parent pointer stack */ 640*45876Sbostic while (_bt_pop(t) != P_NONE) 641*45876Sbostic continue; 642*45876Sbostic 643*45876Sbostic h = (BTHEADER *) t->bt_curpage; 644*45876Sbostic data->size = 0; 645*45876Sbostic data->data = (char *) NULL; 646*45876Sbostic 647*45876Sbostic /* match? */ 648*45876Sbostic if (VALIDITEM(t, item) 649*45876Sbostic && _bt_cmp(t, key->data, item->bti_index) == 0) { 650*45876Sbostic d = (DATUM *) GETDATUM(h, item->bti_index); 651*45876Sbostic return (_bt_buildret(t, d, data, key)); 652*45876Sbostic } 653*45876Sbostic 654*45876Sbostic /* nope */ 655*45876Sbostic return (RET_SPECIAL); 656*45876Sbostic } 657*45876Sbostic 658*45876Sbostic /* 659*45876Sbostic * BT_PUT -- Add an entry to a btree. 660*45876Sbostic * 661*45876Sbostic * The specified (key, data) pair is added to the tree. If the tree 662*45876Sbostic * was created for unique keys only, then duplicates will not be 663*45876Sbostic * entered. If the requested key exists in the tree, it will be over- 664*45876Sbostic * written unless the flags parameter is R_NOOVERWRITE, in which case 665*45876Sbostic * the update will not be done. If duplicate keys are permitted in the 666*45876Sbostic * tree, duplicates will be inserted and will not overwrite existing 667*45876Sbostic * keys. Nodes are split as required. 668*45876Sbostic * 669*45876Sbostic * Parameters: 670*45876Sbostic * tree -- btree in which to put the new entry 671*45876Sbostic * key -- key component to add 672*45876Sbostic * data -- data corresponding to key 673*45876Sbostic * flag -- R_NOOVERWRITE or zero. 674*45876Sbostic * 675*45876Sbostic * Returns: 676*45876Sbostic * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 677*45876Sbostic * NOOVERWRITE flag was set and the specified key exists 678*45876Sbostic * in the database. 679*45876Sbostic * 680*45876Sbostic * Side Effects: 681*45876Sbostic * None. 682*45876Sbostic */ 683*45876Sbostic 684*45876Sbostic int 685*45876Sbostic bt_put(tree, key, data, flag) 686*45876Sbostic BTREE tree; 687*45876Sbostic DBT *key; 688*45876Sbostic DBT *data; 689*45876Sbostic int flag; 690*45876Sbostic { 691*45876Sbostic BTREE_P t = (BTREE_P) tree; 692*45876Sbostic BTITEM *item; 693*45876Sbostic 694*45876Sbostic /* look for this key in the tree */ 695*45876Sbostic item = _bt_search(t, key); 696*45876Sbostic 697*45876Sbostic /* 698*45876Sbostic * If this tree was originally created without R_DUP, then duplicate 699*45876Sbostic * keys are not allowed. We need to check this at insertion time. 700*45876Sbostic */ 701*45876Sbostic 702*45876Sbostic if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 703*45876Sbostic if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 704*45876Sbostic if (_bt_delone(t, item->bti_index) == RET_ERROR) 705*45876Sbostic return (RET_ERROR); 706*45876Sbostic } 707*45876Sbostic } 708*45876Sbostic 709*45876Sbostic return (_bt_insert(t, item, key, data, flag)); 710*45876Sbostic } 711*45876Sbostic 712*45876Sbostic /* 713*45876Sbostic * BT_DELETE -- delete a key from the tree. 714*45876Sbostic * 715*45876Sbostic * Deletes all keys (and their associated data items) matching the 716*45876Sbostic * supplied key from the tree. If the flags entry is R_CURSOR, then 717*45876Sbostic * the current item in the active scan is deleted. 718*45876Sbostic * 719*45876Sbostic * Parameters: 720*45876Sbostic * tree -- btree from which to delete key 721*45876Sbostic * key -- key to delete 722*45876Sbostic * flags -- R_CURSOR or zero 723*45876Sbostic * 724*45876Sbostic * Returns: 725*45876Sbostic * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 726*45876Sbostic * key was not in the tree. 727*45876Sbostic * 728*45876Sbostic * Side Effects: 729*45876Sbostic * None. 730*45876Sbostic */ 731*45876Sbostic 732*45876Sbostic int 733*45876Sbostic bt_delete(tree, key, flags) 734*45876Sbostic BTREE tree; 735*45876Sbostic DBT *key; 736*45876Sbostic int flags; 737*45876Sbostic { 738*45876Sbostic BTREE_P t = (BTREE_P) tree; 739*45876Sbostic BTHEADER *h; 740*45876Sbostic BTITEM *item; 741*45876Sbostic int ndel = 0; 742*45876Sbostic 743*45876Sbostic if (flags == R_CURSOR) 744*45876Sbostic return (_bt_crsrdel(t)); 745*45876Sbostic 746*45876Sbostic /* find the first matching key in the tree */ 747*45876Sbostic item = _bt_first(t, key); 748*45876Sbostic h = t->bt_curpage; 749*45876Sbostic 750*45876Sbostic /* delete all matching keys */ 751*45876Sbostic for (;;) { 752*45876Sbostic while (VALIDITEM(t, item) 753*45876Sbostic && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 754*45876Sbostic if (_bt_delone(t, item->bti_index) == RET_ERROR) 755*45876Sbostic return (RET_ERROR); 756*45876Sbostic ndel++; 757*45876Sbostic } 758*45876Sbostic 759*45876Sbostic if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 760*45876Sbostic break; 761*45876Sbostic 762*45876Sbostic /* next page, if necessary */ 763*45876Sbostic do { 764*45876Sbostic if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 765*45876Sbostic return (RET_ERROR); 766*45876Sbostic h = t->bt_curpage; 767*45876Sbostic } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 768*45876Sbostic 769*45876Sbostic item->bti_pgno = h->h_pgno; 770*45876Sbostic item->bti_index = 0; 771*45876Sbostic 772*45876Sbostic if (!VALIDITEM(t, item) 773*45876Sbostic || _bt_cmp(t, key->data, item->bti_index) != 0) 774*45876Sbostic break; 775*45876Sbostic } 776*45876Sbostic 777*45876Sbostic /* clean up the parent stack */ 778*45876Sbostic while (_bt_pop(t) != P_NONE) 779*45876Sbostic continue; 780*45876Sbostic 781*45876Sbostic /* flush changes to disk */ 782*45876Sbostic if (ISDISK(t)) { 783*45876Sbostic if (h->h_flags & F_DIRTY) { 784*45876Sbostic if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 785*45876Sbostic return (RET_ERROR); 786*45876Sbostic } 787*45876Sbostic } 788*45876Sbostic 789*45876Sbostic if (ndel == 0) 790*45876Sbostic return (RET_SPECIAL); 791*45876Sbostic 792*45876Sbostic return (RET_SUCCESS); 793*45876Sbostic } 794*45876Sbostic 795*45876Sbostic /* 796*45876Sbostic * _BT_CRSRDEL -- Delete the item pointed to by the cursor. 797*45876Sbostic * 798*45876Sbostic * This routine deletes the item most recently returned by a scan 799*45876Sbostic * through the tree. Since it only makes sense to delete the current 800*45876Sbostic * record once, we make sure that we don't try to delete twice without 801*45876Sbostic * advancing the scan. 802*45876Sbostic * 803*45876Sbostic * Parameters: 804*45876Sbostic * t -- tree in which to do deletion 805*45876Sbostic * 806*45876Sbostic * Returns: 807*45876Sbostic * RET_SUCCESS, RET_ERROR. 808*45876Sbostic * 809*45876Sbostic * Side Effects: 810*45876Sbostic * The call to _bt_delone marks the cursor, so we can tell that 811*45876Sbostic * the current record has been deleted. 812*45876Sbostic */ 813*45876Sbostic 814*45876Sbostic static int 815*45876Sbostic _bt_crsrdel(t) 816*45876Sbostic BTREE_P t; 817*45876Sbostic { 818*45876Sbostic CURSOR *c; 819*45876Sbostic 820*45876Sbostic c = &(t->bt_cursor); 821*45876Sbostic 822*45876Sbostic /* a cursor must exist, and can't have deleted the current key yet */ 823*45876Sbostic if (!(t->bt_flags & BTF_SEQINIT) || (c->c_flags & CRSR_BEFORE)) { 824*45876Sbostic errno = EINVAL; 825*45876Sbostic return (RET_ERROR); 826*45876Sbostic } 827*45876Sbostic 828*45876Sbostic if (_bt_getpage(t, c->c_pgno) == RET_ERROR) 829*45876Sbostic return (RET_ERROR); 830*45876Sbostic 831*45876Sbostic if (c->c_index >= NEXTINDEX(t->bt_curpage)) { 832*45876Sbostic errno = EINVAL; 833*45876Sbostic return (RET_ERROR); 834*45876Sbostic } 835*45876Sbostic 836*45876Sbostic return (_bt_delone(t, c->c_index)); 837*45876Sbostic } 838*45876Sbostic 839*45876Sbostic /* 840*45876Sbostic * _BT_DELONE -- Delete a single entry from a btree. 841*45876Sbostic * 842*45876Sbostic * This routine physically removes a btree entry from a leaf page. 843*45876Sbostic * IDATUM items are *never* removed from internal nodes, regardless 844*45876Sbostic * of whether the entries that originally caused them to be added 845*45876Sbostic * are removed from the tree or not. In addition, pages made empty 846*45876Sbostic * by element deletion are not actually reclaimed. They are, 847*45876Sbostic * however, made available for reuse. 848*45876Sbostic * 849*45876Sbostic * To delete an item from a page, we pack the remaining items at 850*45876Sbostic * the end of the page, overwriting the deleted item's entry. We 851*45876Sbostic * move the line pointers backward on the page, overwriting the 852*45876Sbostic * original item's line pointer. This guarantees that the space in 853*45876Sbostic * the middle of the page is free -- a property that our insertion 854*45876Sbostic * strategy relies on. 855*45876Sbostic * 856*45876Sbostic * This routine doesn't reclaim pages all of whose entries have 857*45876Sbostic * been deleted. These pages are available for reuse, however. 858*45876Sbostic * If an item is deleted that was too big to fit on a page, then 859*45876Sbostic * the blocks that it occupies are put on a free list for reuse. 860*45876Sbostic * 861*45876Sbostic * Parameters: 862*45876Sbostic * t -- btree from which to delete item 863*45876Sbostic * index -- index of entry on current page to delete 864*45876Sbostic * 865*45876Sbostic * Returns: 866*45876Sbostic * RET_SUCCESS, RET_ERROR. 867*45876Sbostic * 868*45876Sbostic * Side Effects: 869*45876Sbostic * Physically changes page layout, adjusts internal page 870*45876Sbostic * state to reflect the deletion of the item, and updates 871*45876Sbostic * the list of free pages for this tree. 872*45876Sbostic */ 873*45876Sbostic 874*45876Sbostic static int 875*45876Sbostic _bt_delone(t, index) 876*45876Sbostic BTREE_P t; 877*45876Sbostic index_t index; 878*45876Sbostic { 879*45876Sbostic char *src, *dest; 880*45876Sbostic int nbytes, nmoved; 881*45876Sbostic index_t off; 882*45876Sbostic index_t top; 883*45876Sbostic index_t i; 884*45876Sbostic pgno_t chain; 885*45876Sbostic BTHEADER *h; 886*45876Sbostic CURSOR *c; 887*45876Sbostic DATUM *d; 888*45876Sbostic 889*45876Sbostic /* deletion may confuse an active scan. fix it. */ 890*45876Sbostic c = &(t->bt_cursor); 891*45876Sbostic if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno) 892*45876Sbostic if (_bt_fixscan(t, index, (DATUM *) NULL, DELETE) == RET_ERROR) 893*45876Sbostic return (RET_ERROR); 894*45876Sbostic 895*45876Sbostic h = t->bt_curpage; 896*45876Sbostic off = h->h_linp[index]; 897*45876Sbostic d = (DATUM *) GETDATUM(h, index); 898*45876Sbostic 899*45876Sbostic /* if this is a big item, reclaim the space it occupies */ 900*45876Sbostic if (d->d_flags & D_BIGKEY) { 901*45876Sbostic bcopy(&(d->d_bytes[0]), 902*45876Sbostic (char *) &chain, 903*45876Sbostic sizeof(chain)); 904*45876Sbostic if (_bt_delindir(t, chain) == RET_ERROR) 905*45876Sbostic return (RET_ERROR); 906*45876Sbostic h = t->bt_curpage; 907*45876Sbostic d = (DATUM *) GETDATUM(h, index); 908*45876Sbostic } 909*45876Sbostic if (d->d_flags & D_BIGDATA) { 910*45876Sbostic bcopy(&(d->d_bytes[d->d_ksize]), 911*45876Sbostic (char *) &chain, 912*45876Sbostic sizeof(chain)); 913*45876Sbostic if (_bt_delindir(t, chain) == RET_ERROR) 914*45876Sbostic return (RET_ERROR); 915*45876Sbostic h = t->bt_curpage; 916*45876Sbostic d = (DATUM *) GETDATUM(h, index); 917*45876Sbostic } 918*45876Sbostic 919*45876Sbostic /* move the data down on the page */ 920*45876Sbostic nbytes = d->d_ksize + d->d_dsize 921*45876Sbostic + (sizeof(DATUM) - sizeof(char)); 922*45876Sbostic nbytes = LONGALIGN(nbytes); 923*45876Sbostic src = ((char *) h) + h->h_upper; 924*45876Sbostic dest = src + nbytes; 925*45876Sbostic nmoved = (int) (((char *) d) - src); 926*45876Sbostic (void) bcopy(src, dest, nmoved); 927*45876Sbostic 928*45876Sbostic /* next move the line pointers up */ 929*45876Sbostic src = (char *) &(h->h_linp[index + 1]); 930*45876Sbostic dest = (char *) &(h->h_linp[index]); 931*45876Sbostic nmoved = (int) (((char *) &(h->h_linp[NEXTINDEX(h)])) - src); 932*45876Sbostic (void) bcopy(src, dest, nmoved); 933*45876Sbostic 934*45876Sbostic /* remember that we freed up some space */ 935*45876Sbostic h->h_upper += nbytes; 936*45876Sbostic h->h_lower -= sizeof(index_t); 937*45876Sbostic 938*45876Sbostic /* adjust offsets in line pointers affected by moving the data */ 939*45876Sbostic top = NEXTINDEX(h); 940*45876Sbostic for (i = 0; i < top; i++) { 941*45876Sbostic if (h->h_linp[i] < off) 942*45876Sbostic h->h_linp[i] += nbytes; 943*45876Sbostic } 944*45876Sbostic 945*45876Sbostic /* it's gone */ 946*45876Sbostic h->h_flags |= F_DIRTY; 947*45876Sbostic 948*45876Sbostic return (RET_SUCCESS); 949*45876Sbostic } 950*45876Sbostic 951*45876Sbostic /* 952*45876Sbostic * _BT_FIXSCAN -- Adjust a scan to cope with a change in tree structure. 953*45876Sbostic * 954*45876Sbostic * If the user has an active scan on the database, and we delete an 955*45876Sbostic * item from the page the cursor is pointing at, we need to figure 956*45876Sbostic * out what to do about it. Basically, the solution is to point 957*45876Sbostic * "between" keys in the tree, using the CRSR_BEFORE flag. The 958*45876Sbostic * requirement is that the user should not miss any data in the 959*45876Sbostic * tree during a scan, just because he happened to do some deletions 960*45876Sbostic * or insertions while it was active. 961*45876Sbostic * 962*45876Sbostic * In order to guarantee that the scan progresses properly, we need 963*45876Sbostic * to save the key of any deleted item we were pointing at, so that 964*45876Sbostic * we can check later insertions against it. 965*45876Sbostic * 966*45876Sbostic * Parameters: 967*45876Sbostic * t -- tree to adjust 968*45876Sbostic * index -- index of item at which change was made 969*45876Sbostic * newd -- new datum (for insertions only) 970*45876Sbostic * op -- operation (DELETE or INSERT) causing change 971*45876Sbostic * 972*45876Sbostic * Returns: 973*45876Sbostic * RET_SUCCESS, RET_ERROR (errno is set). 974*45876Sbostic * 975*45876Sbostic * Side Effects: 976*45876Sbostic * None. 977*45876Sbostic */ 978*45876Sbostic 979*45876Sbostic static int 980*45876Sbostic _bt_fixscan(t, index, newd, op) 981*45876Sbostic BTREE_P t; 982*45876Sbostic index_t index; 983*45876Sbostic DATUM *newd; 984*45876Sbostic int op; 985*45876Sbostic { 986*45876Sbostic BTHEADER *h; 987*45876Sbostic CURSOR *c; 988*45876Sbostic DATUM *d; 989*45876Sbostic 990*45876Sbostic h = t->bt_curpage; 991*45876Sbostic c = &(t->bt_cursor); 992*45876Sbostic 993*45876Sbostic if (op == DELETE) { 994*45876Sbostic if (index < c->c_index) 995*45876Sbostic c->c_index--; 996*45876Sbostic else if (index == c->c_index) { 997*45876Sbostic if (!(c->c_flags & CRSR_BEFORE)) { 998*45876Sbostic if (_bt_crsrkey(t) == RET_ERROR) 999*45876Sbostic return (RET_ERROR); 1000*45876Sbostic c->c_flags |= CRSR_BEFORE; 1001*45876Sbostic } 1002*45876Sbostic } 1003*45876Sbostic } else { 1004*45876Sbostic /* 1005*45876Sbostic * If we previously deleted the object at this location, 1006*45876Sbostic * and now we're inserting a new one, we need to do the 1007*45876Sbostic * right thing -- the cursor should come either before 1008*45876Sbostic * or after the new item, depending on the key that was 1009*45876Sbostic * here, and the new one. 1010*45876Sbostic */ 1011*45876Sbostic 1012*45876Sbostic if (c->c_flags & CRSR_BEFORE) { 1013*45876Sbostic if (index <= c->c_index) { 1014*45876Sbostic char *tmp; 1015*45876Sbostic int itmp; 1016*45876Sbostic pgno_t chain; 1017*45876Sbostic int r; 1018*45876Sbostic 1019*45876Sbostic d = (DATUM *) GETDATUM(t->bt_curpage, index); 1020*45876Sbostic if (d->d_flags & D_BIGKEY) { 1021*45876Sbostic bcopy(&(newd->d_bytes[0]), 1022*45876Sbostic (char *) &chain, 1023*45876Sbostic sizeof(chain)); 1024*45876Sbostic if (_bt_getbig(t, chain, &tmp, &itmp) 1025*45876Sbostic == RET_ERROR) 1026*45876Sbostic return (RET_ERROR); 1027*45876Sbostic } else 1028*45876Sbostic tmp = &(newd->d_bytes[0]); 1029*45876Sbostic 1030*45876Sbostic r = (*(t->bt_compare))(tmp, c->c_key); 1031*45876Sbostic if (r < 0) 1032*45876Sbostic c->c_index++; 1033*45876Sbostic 1034*45876Sbostic if (d->d_flags & D_BIGKEY) 1035*45876Sbostic (void) free (tmp); 1036*45876Sbostic } 1037*45876Sbostic } else if (index <= c->c_index) 1038*45876Sbostic c->c_index++; 1039*45876Sbostic } 1040*45876Sbostic return (RET_SUCCESS); 1041*45876Sbostic } 1042*45876Sbostic 1043*45876Sbostic /* 1044*45876Sbostic * _BT_CRSRKEY -- Save a copy of the key of the record that the cursor 1045*45876Sbostic * is pointing to. The record is about to be deleted. 1046*45876Sbostic * 1047*45876Sbostic * Parameters: 1048*45876Sbostic * t -- btree 1049*45876Sbostic * 1050*45876Sbostic * Returns: 1051*45876Sbostic * RET_SUCCESS, RET_ERROR. 1052*45876Sbostic * 1053*45876Sbostic * Side Effects: 1054*45876Sbostic * Allocates memory for the copy which should be released when 1055*45876Sbostic * it is no longer needed. 1056*45876Sbostic */ 1057*45876Sbostic 1058*45876Sbostic static int 1059*45876Sbostic _bt_crsrkey(t) 1060*45876Sbostic BTREE_P t; 1061*45876Sbostic { 1062*45876Sbostic CURSOR *c; 1063*45876Sbostic DATUM *d; 1064*45876Sbostic pgno_t pgno; 1065*45876Sbostic int ignore; 1066*45876Sbostic 1067*45876Sbostic c = &(t->bt_cursor); 1068*45876Sbostic d = (DATUM *) GETDATUM(t->bt_curpage, c->c_index); 1069*45876Sbostic 1070*45876Sbostic if (d->d_flags & D_BIGKEY) { 1071*45876Sbostic bcopy(&(d->d_bytes[0]), (char *) &pgno, sizeof(pgno)); 1072*45876Sbostic return (_bt_getbig(t, pgno, &(c->c_key), &ignore)); 1073*45876Sbostic } else { 1074*45876Sbostic if ((c->c_key = (char *) malloc(d->d_ksize)) == (char *) NULL) 1075*45876Sbostic return (RET_ERROR); 1076*45876Sbostic 1077*45876Sbostic bcopy(&(d->d_bytes[0]), c->c_key, d->d_ksize); 1078*45876Sbostic } 1079*45876Sbostic return (RET_SUCCESS); 1080*45876Sbostic } 1081*45876Sbostic 1082*45876Sbostic /* 1083*45876Sbostic * _BT_GETBIG -- Get big data from indirect pages. 1084*45876Sbostic * 1085*45876Sbostic * This routine chases indirect blocks for the big object at the 1086*45876Sbostic * specified page to a buffer, and return the address of the buffer. 1087*45876Sbostic * 1088*45876Sbostic * Parameters: 1089*45876Sbostic * t -- btree with the indirect blocks 1090*45876Sbostic * pgno -- page number that starts the chain 1091*45876Sbostic * p -- (char **) to get the address of the buffer containing 1092*45876Sbostic * the key or datum. 1093*45876Sbostic * sz -- pointer to an int to get the size of the instantiated 1094*45876Sbostic * object. 1095*45876Sbostic * 1096*45876Sbostic * Returns: 1097*45876Sbostic * RET_SUCCESS, RET_ERROR. 1098*45876Sbostic * 1099*45876Sbostic * Side Effects: 1100*45876Sbostic * None. 1101*45876Sbostic */ 1102*45876Sbostic 1103*45876Sbostic static int 1104*45876Sbostic _bt_getbig(t, pgno, p, sz) 1105*45876Sbostic BTREE_P t; 1106*45876Sbostic pgno_t pgno; 1107*45876Sbostic char **p; 1108*45876Sbostic int *sz; 1109*45876Sbostic { 1110*45876Sbostic pgno_t save; 1111*45876Sbostic size_t nbytes; 1112*45876Sbostic size_t nhere; 1113*45876Sbostic BTHEADER *h; 1114*45876Sbostic char *top, *from, *where; 1115*45876Sbostic 1116*45876Sbostic save = t->bt_curpage->h_pgno; 1117*45876Sbostic if (_bt_getpage(t, pgno) == RET_ERROR) 1118*45876Sbostic return (RET_ERROR); 1119*45876Sbostic 1120*45876Sbostic h = t->bt_curpage; 1121*45876Sbostic 1122*45876Sbostic bcopy((char *) &(h->h_linp[0]), (char *) &nbytes, sizeof(nbytes)); 1123*45876Sbostic 1124*45876Sbostic if ((*p = (char *) malloc(nbytes)) == (char *) NULL) 1125*45876Sbostic return (RET_ERROR); 1126*45876Sbostic 1127*45876Sbostic *sz = nbytes; 1128*45876Sbostic from = ((char *) (&h->h_linp[0])) + sizeof(nbytes); 1129*45876Sbostic top = ((char *) h) + t->bt_psize; 1130*45876Sbostic 1131*45876Sbostic /* need more space for data? */ 1132*45876Sbostic 1133*45876Sbostic where = *p; 1134*45876Sbostic 1135*45876Sbostic while (nbytes > 0) { 1136*45876Sbostic nhere = (int) (top - from); 1137*45876Sbostic if (nhere > nbytes) { 1138*45876Sbostic (void) bcopy(from, where, nbytes); 1139*45876Sbostic nbytes = 0; 1140*45876Sbostic } else { 1141*45876Sbostic (void) bcopy(from, where, nhere); 1142*45876Sbostic where += nhere; 1143*45876Sbostic nbytes -= nhere; 1144*45876Sbostic if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 1145*45876Sbostic return (RET_ERROR); 1146*45876Sbostic h = t->bt_curpage; 1147*45876Sbostic top = ((char *) h) + t->bt_psize; 1148*45876Sbostic from = (char *) &(h->h_linp[0]); 1149*45876Sbostic } 1150*45876Sbostic } 1151*45876Sbostic 1152*45876Sbostic if (_bt_getpage(t, save) == RET_ERROR) 1153*45876Sbostic return (RET_ERROR); 1154*45876Sbostic 1155*45876Sbostic return (RET_SUCCESS); 1156*45876Sbostic } 1157*45876Sbostic 1158*45876Sbostic /* 1159*45876Sbostic * _BT_DELINDIR -- Delete a chain of indirect blocks from the btree. 1160*45876Sbostic * 1161*45876Sbostic * When a large item is deleted from the tree, this routine puts the 1162*45876Sbostic * space that it occupied onto the free list for later reuse. The 1163*45876Sbostic * bt_free entry in the btree structure points at the head of this list. 1164*45876Sbostic * This value is also stored on disk in the btree's metadata. 1165*45876Sbostic * 1166*45876Sbostic * Parameters: 1167*45876Sbostic * t -- btree from which to delete pages 1168*45876Sbostic * chain -- page number that starts the chain. 1169*45876Sbostic * 1170*45876Sbostic * Returns: 1171*45876Sbostic * RET_SUCCESS, RET_ERROR. 1172*45876Sbostic * 1173*45876Sbostic * Side Effects: 1174*45876Sbostic * Invalidates the current on-disk version of the btree's 1175*45876Sbostic * metadata (if any), and updates the free list appropriately. 1176*45876Sbostic */ 1177*45876Sbostic 1178*45876Sbostic static int 1179*45876Sbostic _bt_delindir(t, chain) 1180*45876Sbostic BTREE_P t; 1181*45876Sbostic pgno_t chain; 1182*45876Sbostic { 1183*45876Sbostic BTHEADER *h; 1184*45876Sbostic pgno_t save; 1185*45876Sbostic pgno_t oldfree; 1186*45876Sbostic 1187*45876Sbostic h = t->bt_curpage; 1188*45876Sbostic save = h->h_pgno; 1189*45876Sbostic if (_bt_getpage(t, chain) == RET_ERROR) 1190*45876Sbostic return (RET_ERROR); 1191*45876Sbostic 1192*45876Sbostic /* 1193*45876Sbostic * If some internal node is pointing at this chain, don't 1194*45876Sbostic * delete it. 1195*45876Sbostic */ 1196*45876Sbostic 1197*45876Sbostic if (t->bt_curpage->h_flags & F_PRESERVE) { 1198*45876Sbostic if (_bt_getpage(t, save) == RET_ERROR) 1199*45876Sbostic return (RET_ERROR); 1200*45876Sbostic return (RET_SUCCESS); 1201*45876Sbostic } 1202*45876Sbostic 1203*45876Sbostic /* if there's nothing on the free list, this is easy... */ 1204*45876Sbostic if (t->bt_free == P_NONE) { 1205*45876Sbostic t->bt_free = chain; 1206*45876Sbostic } else { 1207*45876Sbostic oldfree = t->bt_free; 1208*45876Sbostic 1209*45876Sbostic /* find the end of the data chain for the deleted datum */ 1210*45876Sbostic t->bt_free = chain; 1211*45876Sbostic do { 1212*45876Sbostic if (_bt_getpage(t, chain) == RET_ERROR) 1213*45876Sbostic return (RET_ERROR); 1214*45876Sbostic h = t->bt_curpage; 1215*45876Sbostic if (h->h_nextpg != P_NONE) 1216*45876Sbostic chain = h->h_nextpg; 1217*45876Sbostic } while (h->h_nextpg != P_NONE); 1218*45876Sbostic 1219*45876Sbostic /* link freed pages into free list */ 1220*45876Sbostic h->h_nextpg = oldfree; 1221*45876Sbostic if (_bt_write(t, h, RELEASE) == RET_ERROR) 1222*45876Sbostic return (RET_ERROR); 1223*45876Sbostic if (_bt_getpage(t, oldfree) == RET_ERROR) 1224*45876Sbostic return (RET_ERROR); 1225*45876Sbostic h = t->bt_curpage; 1226*45876Sbostic h->h_prevpg = chain; 1227*45876Sbostic if (_bt_write(t, h, RELEASE) == RET_ERROR) 1228*45876Sbostic return (RET_ERROR); 1229*45876Sbostic } 1230*45876Sbostic 1231*45876Sbostic /* restore the tree's current page pointer */ 1232*45876Sbostic if (_bt_getpage(t, save) == RET_ERROR) 1233*45876Sbostic return (RET_ERROR); 1234*45876Sbostic 1235*45876Sbostic /* we have trashed the tree metadata; rewrite it later */ 1236*45876Sbostic t->bt_flags &= ~BTF_METAOK; 1237*45876Sbostic 1238*45876Sbostic return (RET_SUCCESS); 1239*45876Sbostic } 1240*45876Sbostic 1241*45876Sbostic /* 1242*45876Sbostic * _BT_FIRST -- Find the first item in the tree that matches the supplied 1243*45876Sbostic * key. 1244*45876Sbostic * 1245*45876Sbostic * This routine supports deletion. When the user supplies a key to 1246*45876Sbostic * be deleted, we find the first one, and iteratively delete all the 1247*45876Sbostic * matching ones that follow it. 1248*45876Sbostic * 1249*45876Sbostic * Parameters: 1250*45876Sbostic * t -- btree in which to find first occurrence 1251*45876Sbostic * key -- key to find 1252*45876Sbostic * 1253*45876Sbostic * Returns: 1254*45876Sbostic * The BTITEM for the matching item. If there's no match, 1255*45876Sbostic * this may point to the first item > than the supplied key, 1256*45876Sbostic * or off the end of the page. 1257*45876Sbostic * 1258*45876Sbostic * Warnings: 1259*45876Sbostic * The BTITEM returned is in static space and will be overwritten 1260*45876Sbostic * by the next search of any kind in any btree. 1261*45876Sbostic */ 1262*45876Sbostic 1263*45876Sbostic static BTITEM * 1264*45876Sbostic _bt_first(t, key) 1265*45876Sbostic BTREE_P t; 1266*45876Sbostic DBT *key; 1267*45876Sbostic { 1268*45876Sbostic BTHEADER *h; 1269*45876Sbostic BTITEM *item; 1270*45876Sbostic index_t next; 1271*45876Sbostic int r; 1272*45876Sbostic 1273*45876Sbostic /* find any matching item */ 1274*45876Sbostic item = _bt_search(t, key); 1275*45876Sbostic h = t->bt_curpage; 1276*45876Sbostic next = NEXTINDEX(h); 1277*45876Sbostic 1278*45876Sbostic /* if we're off the end of the page, search failed and we're done */ 1279*45876Sbostic if (item->bti_index >= next) 1280*45876Sbostic return (item); 1281*45876Sbostic 1282*45876Sbostic /* as long as we have an exact match, walk backwards */ 1283*45876Sbostic while ((r = _bt_cmp(t, key->data, item->bti_index)) == 0) { 1284*45876Sbostic 1285*45876Sbostic /* at start of page? */ 1286*45876Sbostic if (item->bti_index == 0) { 1287*45876Sbostic 1288*45876Sbostic /* if no prev page, we're done */ 1289*45876Sbostic if (h->h_prevpg == P_NONE) 1290*45876Sbostic return (item); 1291*45876Sbostic 1292*45876Sbostic /* walk backward, skipping empty pages */ 1293*45876Sbostic do { 1294*45876Sbostic if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 1295*45876Sbostic return ((BTITEM *) NULL); 1296*45876Sbostic h = t->bt_curpage; 1297*45876Sbostic } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE); 1298*45876Sbostic 1299*45876Sbostic if (NEXTINDEX(h) != 0) 1300*45876Sbostic item->bti_index = NEXTINDEX(h) - 1; 1301*45876Sbostic else 1302*45876Sbostic item->bti_index = 0; 1303*45876Sbostic 1304*45876Sbostic item->bti_pgno = h->h_pgno; 1305*45876Sbostic } else { 1306*45876Sbostic item->bti_index--; 1307*45876Sbostic } 1308*45876Sbostic } 1309*45876Sbostic 1310*45876Sbostic /* if we went too far backwards, step forward one entry */ 1311*45876Sbostic if (r > 0) { 1312*45876Sbostic if (++(item->bti_index) >= NEXTINDEX(h) 1313*45876Sbostic && h->h_nextpg != P_NONE) { 1314*45876Sbostic 1315*45876Sbostic /* walk forward, skipping empty pages */ 1316*45876Sbostic do { 1317*45876Sbostic if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 1318*45876Sbostic return ((BTITEM *) NULL); 1319*45876Sbostic h = t->bt_curpage; 1320*45876Sbostic } while (h->h_nextpg != P_NONE && NEXTINDEX(h) == 0); 1321*45876Sbostic 1322*45876Sbostic item->bti_index = 0; 1323*45876Sbostic item->bti_pgno = h->h_pgno; 1324*45876Sbostic } 1325*45876Sbostic } 1326*45876Sbostic 1327*45876Sbostic /* got it */ 1328*45876Sbostic return (item); 1329*45876Sbostic } 1330*45876Sbostic 1331*45876Sbostic /* 1332*45876Sbostic * _BT_SEARCH, _BT_SEARCHR -- Search for a particular key in the tree. 1333*45876Sbostic * 1334*45876Sbostic * Parameters: 1335*45876Sbostic * t -- btree in which to search 1336*45876Sbostic * key -- key to find 1337*45876Sbostic * 1338*45876Sbostic * Returns: 1339*45876Sbostic * BTITEM for matching item, if any, or the BTITEM for the 1340*45876Sbostic * location of the key, if it were in the tree. 1341*45876Sbostic * 1342*45876Sbostic * Warnings: 1343*45876Sbostic * The BTITEM returned is in static memory, and will be 1344*45876Sbostic * overwritten by the next search of any kind in any tree. 1345*45876Sbostic */ 1346*45876Sbostic 1347*45876Sbostic static BTITEM * 1348*45876Sbostic _bt_search(t, key) 1349*45876Sbostic BTREE_P t; 1350*45876Sbostic DBT *key; 1351*45876Sbostic { 1352*45876Sbostic /* we want to start all of our searches at the root */ 1353*45876Sbostic if (_bt_getpage(t, P_ROOT) == RET_ERROR) 1354*45876Sbostic return ((BTITEM *) NULL); 1355*45876Sbostic 1356*45876Sbostic return (_bt_searchr(t, key)); 1357*45876Sbostic } 1358*45876Sbostic 1359*45876Sbostic static BTITEM * 1360*45876Sbostic _bt_searchr(t, key) 1361*45876Sbostic BTREE_P t; 1362*45876Sbostic DBT *key; 1363*45876Sbostic { 1364*45876Sbostic BTHEADER *h = t->bt_curpage; 1365*45876Sbostic index_t index; 1366*45876Sbostic IDATUM *id; 1367*45876Sbostic DATUM *d; 1368*45876Sbostic static BTITEM item; 1369*45876Sbostic 1370*45876Sbostic /* do a binary search on the current page */ 1371*45876Sbostic index = _bt_binsrch(t, &(key->data[0])); 1372*45876Sbostic 1373*45876Sbostic /* 1374*45876Sbostic * At this point, the binary search terminated because the endpoints 1375*45876Sbostic * got too close together, or we have a match. Figure out which 1376*45876Sbostic * case applies and decide what to do based on the page type. 1377*45876Sbostic */ 1378*45876Sbostic if (h->h_flags & F_LEAF) { 1379*45876Sbostic item.bti_pgno = h->h_pgno; 1380*45876Sbostic item.bti_index = index; 1381*45876Sbostic if (index < NEXTINDEX(h)) 1382*45876Sbostic d = (DATUM *) GETDATUM(h,index); 1383*45876Sbostic else 1384*45876Sbostic d = (DATUM *) NULL; 1385*45876Sbostic 1386*45876Sbostic item.bti_datum = d; 1387*45876Sbostic return(&item); 1388*45876Sbostic } else { 1389*45876Sbostic id = (IDATUM *) GETDATUM(h, index); 1390*45876Sbostic _bt_push(t, h->h_pgno); 1391*45876Sbostic if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 1392*45876Sbostic return ((BTITEM *) NULL); 1393*45876Sbostic return (_bt_searchr(t, key)); 1394*45876Sbostic } 1395*45876Sbostic } 1396*45876Sbostic 1397*45876Sbostic /* 1398*45876Sbostic * BT_GETPAGE -- Make pgno the current page of the btree. 1399*45876Sbostic * 1400*45876Sbostic * This routine is just a wrapper that decides whether to call the 1401*45876Sbostic * memory or disk-based routine to do the work. 1402*45876Sbostic * 1403*45876Sbostic * Parameters: 1404*45876Sbostic * t -- btree in which to get page 1405*45876Sbostic * pgno -- page number to get 1406*45876Sbostic * 1407*45876Sbostic * Returns: 1408*45876Sbostic * RET_SUCCESS or RET_ERROR. 1409*45876Sbostic */ 1410*45876Sbostic 1411*45876Sbostic static int 1412*45876Sbostic _bt_getpage(t, pgno) 1413*45876Sbostic BTREE_P t; 1414*45876Sbostic pgno_t pgno; 1415*45876Sbostic { 1416*45876Sbostic #ifdef DEBUG 1417*45876Sbostic if (pgno == P_NONE) 1418*45876Sbostic _punt(); 1419*45876Sbostic #endif /* DEBUG */ 1420*45876Sbostic 1421*45876Sbostic /* see if we can get away without doing any work */ 1422*45876Sbostic if (t->bt_curpage != (BTHEADER *) NULL) { 1423*45876Sbostic if (t->bt_curpage->h_pgno == pgno) 1424*45876Sbostic return (RET_SUCCESS); 1425*45876Sbostic } 1426*45876Sbostic 1427*45876Sbostic if (t->bt_fname == (char *) NULL) 1428*45876Sbostic return (_bt_getmpage(t, pgno)); 1429*45876Sbostic else 1430*45876Sbostic return (_bt_getdpage(t, pgno)); 1431*45876Sbostic } 1432*45876Sbostic 1433*45876Sbostic /* 1434*45876Sbostic * _BT_GETMPAGE -- Make pgno the current page of the btree. 1435*45876Sbostic * 1436*45876Sbostic * This routine gets pages for in-memory btrees. 1437*45876Sbostic * 1438*45876Sbostic * Parameters: 1439*45876Sbostic * t -- btree in which to get page 1440*45876Sbostic * pgno -- page number to get 1441*45876Sbostic * 1442*45876Sbostic * Returns: 1443*45876Sbostic * RET_SUCCESS or RET_ERROR. 1444*45876Sbostic */ 1445*45876Sbostic 1446*45876Sbostic static int 1447*45876Sbostic _bt_getmpage(t, pgno) 1448*45876Sbostic register BTREE_P t; 1449*45876Sbostic pgno_t pgno; 1450*45876Sbostic { 1451*45876Sbostic int htindex; 1452*45876Sbostic int nbytes; 1453*45876Sbostic BTHEADER *h; 1454*45876Sbostic HTBUCKET *b; 1455*45876Sbostic 1456*45876Sbostic if (t->bt_curpage == (BTHEADER *) NULL) { 1457*45876Sbostic if (pgno != P_ROOT) { 1458*45876Sbostic errno = EBADF; 1459*45876Sbostic return (RET_ERROR); 1460*45876Sbostic } 1461*45876Sbostic 1462*45876Sbostic t->bt_npages++; 1463*45876Sbostic h = (BTHEADER *) malloc((unsigned) t->bt_psize); 1464*45876Sbostic if (h == (BTHEADER *) NULL) 1465*45876Sbostic return (RET_ERROR); 1466*45876Sbostic 1467*45876Sbostic h->h_pgno = P_ROOT; 1468*45876Sbostic h->h_flags = F_LEAF; 1469*45876Sbostic h->h_lower = (index_t) 1470*45876Sbostic (((char *) &(h->h_linp[0])) - ((char *) h)); 1471*45876Sbostic h->h_upper = t->bt_psize; 1472*45876Sbostic h->h_prevpg = h->h_nextpg = P_NONE; 1473*45876Sbostic 1474*45876Sbostic t->bt_curpage = h; 1475*45876Sbostic 1476*45876Sbostic /* get the root page into the hash table */ 1477*45876Sbostic if (_bt_write(t, h, RELEASE) == RET_ERROR) 1478*45876Sbostic return (RET_ERROR); 1479*45876Sbostic } 1480*45876Sbostic 1481*45876Sbostic htindex = HASHKEY(pgno); 1482*45876Sbostic 1483*45876Sbostic for (b = t->bt_s.bt_ht[htindex]; 1484*45876Sbostic b != (HTBUCKET *) NULL; 1485*45876Sbostic b = b->ht_next) { 1486*45876Sbostic if (b->ht_pgno == pgno) { 1487*45876Sbostic t->bt_curpage = b->ht_page; 1488*45876Sbostic return (RET_SUCCESS); 1489*45876Sbostic } 1490*45876Sbostic } 1491*45876Sbostic return (RET_ERROR); 1492*45876Sbostic } 1493*45876Sbostic 1494*45876Sbostic /* 1495*45876Sbostic * _BT_GETDPAGE -- Make pgno the current page of the btree. 1496*45876Sbostic * 1497*45876Sbostic * This routine gets pages for disk btrees. 1498*45876Sbostic * 1499*45876Sbostic * Because disk btree pages must be readable across machine architectures, 1500*45876Sbostic * the btree code writes integers out in network format. This routine 1501*45876Sbostic * converts them back to host format before returning the page. 1502*45876Sbostic * 1503*45876Sbostic * Parameters: 1504*45876Sbostic * t -- btree in which to get page 1505*45876Sbostic * pgno -- page number to get 1506*45876Sbostic * 1507*45876Sbostic * Returns: 1508*45876Sbostic * RET_SUCCESS, RET_ERROR. 1509*45876Sbostic */ 1510*45876Sbostic 1511*45876Sbostic static int 1512*45876Sbostic _bt_getdpage(t, pgno) 1513*45876Sbostic register BTREE_P t; 1514*45876Sbostic pgno_t pgno; 1515*45876Sbostic { 1516*45876Sbostic BTHEADER *h; 1517*45876Sbostic char *cache; 1518*45876Sbostic long pos; 1519*45876Sbostic int n, nbytes; 1520*45876Sbostic 1521*45876Sbostic /* if we have an lru cache, let the cache code do the work */ 1522*45876Sbostic if (ISCACHE(t)) { 1523*45876Sbostic cache = t->bt_s.bt_d.d_cache; 1524*45876Sbostic 1525*45876Sbostic /* release the old page */ 1526*45876Sbostic if (t->bt_curpage != (BTHEADER *) NULL) { 1527*45876Sbostic pgno_t opgno = t->bt_curpage->h_pgno; 1528*45876Sbostic t->bt_curpage->h_flags &= ~F_DIRTY; 1529*45876Sbostic 1530*45876Sbostic if (lruwrite(cache, opgno) < 0) 1531*45876Sbostic return (RET_ERROR); 1532*45876Sbostic 1533*45876Sbostic lrurelease(cache, opgno); 1534*45876Sbostic } 1535*45876Sbostic 1536*45876Sbostic if (pgno > t->bt_npages) { 1537*45876Sbostic if ((h = (BTHEADER *) lrugetnew(cache, pgno, &nbytes)) 1538*45876Sbostic == (BTHEADER *) NULL) 1539*45876Sbostic return (RET_ERROR); 1540*45876Sbostic t->bt_npages = pgno; 1541*45876Sbostic } else { 1542*45876Sbostic if ((h = (BTHEADER *) lruget(cache, pgno, &nbytes)) 1543*45876Sbostic == (BTHEADER *) NULL) 1544*45876Sbostic return (RET_ERROR); 1545*45876Sbostic } 1546*45876Sbostic 1547*45876Sbostic /* init this page, if necessary */ 1548*45876Sbostic if (nbytes == 0) { 1549*45876Sbostic h->h_pgno = pgno; 1550*45876Sbostic h->h_flags = F_LEAF; 1551*45876Sbostic h->h_lower = (index_t) 1552*45876Sbostic (((char *) &(h->h_linp[0])) - ((char *) h)); 1553*45876Sbostic h->h_upper = t->bt_psize; 1554*45876Sbostic h->h_prevpg = h->h_nextpg = P_NONE; 1555*45876Sbostic } 1556*45876Sbostic 1557*45876Sbostic t->bt_curpage = h; 1558*45876Sbostic 1559*45876Sbostic return (RET_SUCCESS); 1560*45876Sbostic } 1561*45876Sbostic 1562*45876Sbostic /* sync the current page, if necessary */ 1563*45876Sbostic if (t->bt_curpage != (BTHEADER *) NULL) { 1564*45876Sbostic if (t->bt_curpage->h_flags & F_DIRTY) 1565*45876Sbostic if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR) 1566*45876Sbostic return (RET_ERROR); 1567*45876Sbostic } else { 1568*45876Sbostic if (t->bt_npages == 0) 1569*45876Sbostic t->bt_npages = 1; 1570*45876Sbostic 1571*45876Sbostic /* if no current page, get space for one */ 1572*45876Sbostic if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize)) 1573*45876Sbostic == (BTHEADER *) NULL) { 1574*45876Sbostic return (RET_ERROR); 1575*45876Sbostic } 1576*45876Sbostic } 1577*45876Sbostic 1578*45876Sbostic n = t->bt_psize; 1579*45876Sbostic pos = (long) (pgno * n); 1580*45876Sbostic 1581*45876Sbostic /* seek to correct location in file */ 1582*45876Sbostic if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) { 1583*45876Sbostic return (RET_ERROR); 1584*45876Sbostic } 1585*45876Sbostic 1586*45876Sbostic /* read the page */ 1587*45876Sbostic if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) { 1588*45876Sbostic 1589*45876Sbostic /* 1590*45876Sbostic * If we didn't get a full page, we must have gotten no 1591*45876Sbostic * data at all -- in which case we're trying to read a 1592*45876Sbostic * root page that doesn't exist yet. This is the only 1593*45876Sbostic * case in which this is okay. If this happens, construct 1594*45876Sbostic * an empty root page by hand. 1595*45876Sbostic */ 1596*45876Sbostic if (nbytes != 0 || pgno != P_ROOT) { 1597*45876Sbostic errno = EBADF; 1598*45876Sbostic return (RET_ERROR); 1599*45876Sbostic } 1600*45876Sbostic 1601*45876Sbostic h = (BTHEADER *) t->bt_curpage; 1602*45876Sbostic h->h_pgno = pgno; 1603*45876Sbostic h->h_flags = F_LEAF; 1604*45876Sbostic h->h_lower = (index_t) 1605*45876Sbostic (((char *) &(h->h_linp[0])) - ((char *) h)); 1606*45876Sbostic h->h_upper = t->bt_psize; 1607*45876Sbostic h->h_prevpg = h->h_nextpg = P_NONE; 1608*45876Sbostic } else 1609*45876Sbostic (void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder); 1610*45876Sbostic 1611*45876Sbostic return (RET_SUCCESS); 1612*45876Sbostic } 1613*45876Sbostic 1614*45876Sbostic /* 1615*45876Sbostic * _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from 1616*45876Sbostic * the host-independent format stored on disk. 1617*45876Sbostic * 1618*45876Sbostic * Parameters: 1619*45876Sbostic * h -- page to convert 1620*45876Sbostic * _lorder -- byte order for pages (stored as a char * in the 1621*45876Sbostic * cache, and passed around as a magic cookie). 1622*45876Sbostic * 1623*45876Sbostic * Returns: 1624*45876Sbostic * RET_SUCCESS (lru code requires a return value). 1625*45876Sbostic * 1626*45876Sbostic * Side Effects: 1627*45876Sbostic * Layout of tree metadata on the page is changed in place. 1628*45876Sbostic * 1629*45876Sbostic * Warnings: 1630*45876Sbostic * Everywhere else in the code, the types pgno_t and index_t 1631*45876Sbostic * are opaque. These two routines know what they really 1632*45876Sbostic * are. 1633*45876Sbostic */ 1634*45876Sbostic 1635*45876Sbostic static int 1636*45876Sbostic _bt_pgout(h, _lorder) 1637*45876Sbostic BTHEADER *h; 1638*45876Sbostic char *_lorder; 1639*45876Sbostic { 1640*45876Sbostic int i; 1641*45876Sbostic int top; 1642*45876Sbostic int lorder; 1643*45876Sbostic DATUM *d; 1644*45876Sbostic IDATUM *id; 1645*45876Sbostic size_t chain; 1646*45876Sbostic 1647*45876Sbostic lorder = (int) _lorder; 1648*45876Sbostic if (lorder == BYTE_ORDER) 1649*45876Sbostic return (RET_SUCCESS); 1650*45876Sbostic 1651*45876Sbostic if (h->h_flags & F_LEAF) { 1652*45876Sbostic if (h->h_flags & F_CONT) { 1653*45876Sbostic if (h->h_prevpg == P_NONE) { 1654*45876Sbostic size_t longsz; 1655*45876Sbostic 1656*45876Sbostic (void) bcopy((char *) &(h->h_linp[0]), 1657*45876Sbostic (char *) &longsz, 1658*45876Sbostic sizeof(longsz)); 1659*45876Sbostic BLSWAP(longsz); 1660*45876Sbostic (void) bcopy((char *) &longsz, 1661*45876Sbostic (char *) &(h->h_linp[0]), 1662*45876Sbostic sizeof(longsz)); 1663*45876Sbostic } 1664*45876Sbostic } else { 1665*45876Sbostic top = NEXTINDEX(h); 1666*45876Sbostic for (i = 0; i < top; i++) { 1667*45876Sbostic d = (DATUM *) GETDATUM(h, i); 1668*45876Sbostic if (d->d_flags & D_BIGKEY) { 1669*45876Sbostic (void) bcopy((char *) &(d->d_bytes[0]), 1670*45876Sbostic (char *) &chain, 1671*45876Sbostic sizeof(chain)); 1672*45876Sbostic BLSWAP(chain); 1673*45876Sbostic (void) bcopy((char *) &chain, 1674*45876Sbostic (char *) &(d->d_bytes[0]), 1675*45876Sbostic sizeof(chain)); 1676*45876Sbostic } 1677*45876Sbostic if (d->d_flags & D_BIGDATA) { 1678*45876Sbostic (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 1679*45876Sbostic (char *) &chain, 1680*45876Sbostic sizeof(chain)); 1681*45876Sbostic BLSWAP(chain); 1682*45876Sbostic (void) bcopy((char *) &chain, 1683*45876Sbostic (char *) &(d->d_bytes[d->d_ksize]), 1684*45876Sbostic sizeof(chain)); 1685*45876Sbostic } 1686*45876Sbostic BLSWAP(d->d_dsize); 1687*45876Sbostic BLSWAP(d->d_ksize); 1688*45876Sbostic BLSWAP(d->d_flags); 1689*45876Sbostic BLSWAP(h->h_linp[i]); 1690*45876Sbostic } 1691*45876Sbostic } 1692*45876Sbostic } else { 1693*45876Sbostic top = NEXTINDEX(h); 1694*45876Sbostic for (i = 0; i < top; i++) { 1695*45876Sbostic id = (IDATUM *) GETDATUM(h, i); 1696*45876Sbostic BLSWAP(id->i_size); 1697*45876Sbostic BLSWAP(id->i_pgno); 1698*45876Sbostic BLSWAP(id->i_flags); 1699*45876Sbostic if (id->i_flags & D_BIGKEY) { 1700*45876Sbostic (void) bcopy((char *) &(id->i_bytes[0]), 1701*45876Sbostic (char *) &chain, 1702*45876Sbostic sizeof(chain)); 1703*45876Sbostic BLSWAP(chain); 1704*45876Sbostic (void) bcopy((char *) &chain, 1705*45876Sbostic (char *) &(id->i_bytes[0]), 1706*45876Sbostic sizeof(chain)); 1707*45876Sbostic } 1708*45876Sbostic BLSWAP(h->h_linp[i]); 1709*45876Sbostic } 1710*45876Sbostic } 1711*45876Sbostic BLSWAP(h->h_flags); 1712*45876Sbostic BLSWAP(h->h_pgno); 1713*45876Sbostic BLSWAP(h->h_prevpg); 1714*45876Sbostic BLSWAP(h->h_nextpg); 1715*45876Sbostic BLSWAP(h->h_lower); 1716*45876Sbostic BLSWAP(h->h_upper); 1717*45876Sbostic 1718*45876Sbostic return (RET_SUCCESS); 1719*45876Sbostic } 1720*45876Sbostic 1721*45876Sbostic static int 1722*45876Sbostic _bt_pgin(h, _lorder) 1723*45876Sbostic BTHEADER *h; 1724*45876Sbostic char *_lorder; 1725*45876Sbostic { 1726*45876Sbostic int i; 1727*45876Sbostic int top; 1728*45876Sbostic int lorder; 1729*45876Sbostic DATUM *d; 1730*45876Sbostic IDATUM *id; 1731*45876Sbostic size_t chain; 1732*45876Sbostic 1733*45876Sbostic /* 1734*45876Sbostic * If btree pages are to be stored in the host byte order, don't 1735*45876Sbostic * bother swapping. 1736*45876Sbostic */ 1737*45876Sbostic lorder = (int) _lorder; 1738*45876Sbostic if (lorder == BYTE_ORDER) 1739*45876Sbostic return (RET_SUCCESS); 1740*45876Sbostic 1741*45876Sbostic BLSWAP(h->h_upper); 1742*45876Sbostic BLSWAP(h->h_lower); 1743*45876Sbostic BLSWAP(h->h_nextpg); 1744*45876Sbostic BLSWAP(h->h_prevpg); 1745*45876Sbostic BLSWAP(h->h_pgno); 1746*45876Sbostic BLSWAP(h->h_flags); 1747*45876Sbostic 1748*45876Sbostic if (h->h_flags & F_LEAF) { 1749*45876Sbostic if (h->h_flags & F_CONT) { 1750*45876Sbostic if (h->h_prevpg == P_NONE) { 1751*45876Sbostic size_t longsz; 1752*45876Sbostic 1753*45876Sbostic (void) bcopy((char *) &(h->h_linp[0]), 1754*45876Sbostic (char *) &longsz, 1755*45876Sbostic sizeof(longsz)); 1756*45876Sbostic BLSWAP(longsz); 1757*45876Sbostic (void) bcopy((char *) &longsz, 1758*45876Sbostic (char *) &(h->h_linp[0]), 1759*45876Sbostic sizeof(longsz)); 1760*45876Sbostic } 1761*45876Sbostic } else { 1762*45876Sbostic top = NEXTINDEX(h); 1763*45876Sbostic for (i = 0; i < top; i++) { 1764*45876Sbostic BLSWAP(h->h_linp[i]); 1765*45876Sbostic d = (DATUM *) GETDATUM(h, i); 1766*45876Sbostic BLSWAP(d->d_dsize); 1767*45876Sbostic BLSWAP(d->d_ksize); 1768*45876Sbostic BLSWAP(d->d_flags); 1769*45876Sbostic if (d->d_flags & D_BIGKEY) { 1770*45876Sbostic (void) bcopy((char *) &(d->d_bytes[0]), 1771*45876Sbostic (char *) &chain, 1772*45876Sbostic sizeof(chain)); 1773*45876Sbostic BLSWAP(chain); 1774*45876Sbostic (void) bcopy((char *) &chain, 1775*45876Sbostic (char *) &(d->d_bytes[0]), 1776*45876Sbostic sizeof(chain)); 1777*45876Sbostic } 1778*45876Sbostic if (d->d_flags & D_BIGDATA) { 1779*45876Sbostic (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 1780*45876Sbostic (char *) &chain, 1781*45876Sbostic sizeof(chain)); 1782*45876Sbostic BLSWAP(chain); 1783*45876Sbostic (void) bcopy((char *) &chain, 1784*45876Sbostic (char *) &(d->d_bytes[d->d_ksize]), 1785*45876Sbostic sizeof(chain)); 1786*45876Sbostic } 1787*45876Sbostic } 1788*45876Sbostic } 1789*45876Sbostic } else { 1790*45876Sbostic top = NEXTINDEX(h); 1791*45876Sbostic for (i = 0; i < top; i++) { 1792*45876Sbostic BLSWAP(h->h_linp[i]); 1793*45876Sbostic id = (IDATUM *) GETDATUM(h, i); 1794*45876Sbostic BLSWAP(id->i_size); 1795*45876Sbostic BLSWAP(id->i_pgno); 1796*45876Sbostic BLSWAP(id->i_flags); 1797*45876Sbostic if (id->i_flags & D_BIGKEY) { 1798*45876Sbostic (void) bcopy((char *) &(id->i_bytes[0]), 1799*45876Sbostic (char *) &chain, 1800*45876Sbostic sizeof(chain)); 1801*45876Sbostic BLSWAP(chain); 1802*45876Sbostic (void) bcopy((char *) &chain, 1803*45876Sbostic (char *) &(id->i_bytes[0]), 1804*45876Sbostic sizeof(chain)); 1805*45876Sbostic } 1806*45876Sbostic } 1807*45876Sbostic } 1808*45876Sbostic return (RET_SUCCESS); 1809*45876Sbostic } 1810*45876Sbostic 1811*45876Sbostic /* 1812*45876Sbostic * BT_SYNC -- sync the btree to disk. 1813*45876Sbostic * 1814*45876Sbostic * Parameters: 1815*45876Sbostic * tree -- btree to sync. 1816*45876Sbostic * 1817*45876Sbostic * Returns: 1818*45876Sbostic * RET_SUCCESS, RET_ERROR. 1819*45876Sbostic */ 1820*45876Sbostic 1821*45876Sbostic bt_sync(tree) 1822*45876Sbostic BTREE tree; 1823*45876Sbostic { 1824*45876Sbostic BTREE_P t = (BTREE_P) tree; 1825*45876Sbostic BTHEADER *h; 1826*45876Sbostic pgno_t pgno; 1827*45876Sbostic 1828*45876Sbostic /* if this is an in-memory btree, syncing is a no-op */ 1829*45876Sbostic if (!ISDISK(t)) 1830*45876Sbostic return (RET_SUCCESS); 1831*45876Sbostic 1832*45876Sbostic h = (BTHEADER *) t->bt_curpage; 1833*45876Sbostic h->h_flags &= ~F_DIRTY; 1834*45876Sbostic 1835*45876Sbostic if (ISCACHE(t)) { 1836*45876Sbostic pgno = t->bt_curpage->h_pgno; 1837*45876Sbostic if (_bt_write(t, h, RELEASE) == RET_ERROR) 1838*45876Sbostic return(RET_ERROR); 1839*45876Sbostic if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 1840*45876Sbostic return (RET_ERROR); 1841*45876Sbostic if (_bt_getpage(t, pgno) == RET_ERROR) 1842*45876Sbostic return (RET_ERROR); 1843*45876Sbostic } else { 1844*45876Sbostic if (_bt_write(t, h, NORELEASE) == RET_ERROR) 1845*45876Sbostic return (RET_ERROR); 1846*45876Sbostic } 1847*45876Sbostic 1848*45876Sbostic return (fsync(t->bt_s.bt_d.d_fd)); 1849*45876Sbostic } 1850*45876Sbostic 1851*45876Sbostic /* 1852*45876Sbostic * _BT_INSERT -- Insert a new user datum in the btree. 1853*45876Sbostic * 1854*45876Sbostic * This routine is called by bt_put, the public interface, once the 1855*45876Sbostic * location for the new item is known. We do the work here, and 1856*45876Sbostic * handle splits if necessary. 1857*45876Sbostic * 1858*45876Sbostic * Parameters: 1859*45876Sbostic * t -- btree in which to do the insertion. 1860*45876Sbostic * item -- BTITEM describing location of new datum 1861*45876Sbostic * key -- key to insert 1862*45876Sbostic * data -- data associated with key 1863*45876Sbostic * flag -- magic cookie passed recursively to bt_put if we 1864*45876Sbostic * have to do a split 1865*45876Sbostic * 1866*45876Sbostic * Returns: 1867*45876Sbostic * RET_SUCCESS, RET_ERROR. 1868*45876Sbostic */ 1869*45876Sbostic 1870*45876Sbostic static int 1871*45876Sbostic _bt_insert(t, item, key, data, flag) 1872*45876Sbostic BTREE_P t; 1873*45876Sbostic BTITEM *item; 1874*45876Sbostic DBT *key; 1875*45876Sbostic DBT *data; 1876*45876Sbostic int flag; 1877*45876Sbostic { 1878*45876Sbostic index_t index; 1879*45876Sbostic BTHEADER *h; 1880*45876Sbostic DATUM *d; 1881*45876Sbostic int nbytes; 1882*45876Sbostic int status; 1883*45876Sbostic pgno_t pgno; 1884*45876Sbostic int keysize, datasize; 1885*45876Sbostic int bigkey, bigdata; 1886*45876Sbostic 1887*45876Sbostic if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1888*45876Sbostic return (RET_ERROR); 1889*45876Sbostic h = t->bt_curpage; 1890*45876Sbostic 1891*45876Sbostic if (TOOBIG(t, data->size)) { 1892*45876Sbostic bigdata = TRUE; 1893*45876Sbostic datasize = sizeof(pgno_t); 1894*45876Sbostic } else { 1895*45876Sbostic bigdata = FALSE; 1896*45876Sbostic datasize = data->size; 1897*45876Sbostic } 1898*45876Sbostic 1899*45876Sbostic if (TOOBIG(t, key->size)) { 1900*45876Sbostic bigkey = TRUE; 1901*45876Sbostic keysize = sizeof(pgno_t); 1902*45876Sbostic } else { 1903*45876Sbostic bigkey = FALSE; 1904*45876Sbostic keysize = key->size; 1905*45876Sbostic } 1906*45876Sbostic 1907*45876Sbostic nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char)); 1908*45876Sbostic nbytes = LONGALIGN(nbytes) + sizeof(index_t); 1909*45876Sbostic 1910*45876Sbostic /* if there's not enough room here, split the page */ 1911*45876Sbostic if ((h->h_upper - h->h_lower) < nbytes) { 1912*45876Sbostic if (_bt_split(t) == RET_ERROR) 1913*45876Sbostic return (RET_ERROR); 1914*45876Sbostic 1915*45876Sbostic /* okay, try again */ 1916*45876Sbostic return (bt_put(t, key, data, flag)); 1917*45876Sbostic } 1918*45876Sbostic 1919*45876Sbostic /* put together a leaf page datum from the key/data pair */ 1920*45876Sbostic index = item->bti_index; 1921*45876Sbostic nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char)); 1922*45876Sbostic 1923*45876Sbostic if ((d = (DATUM *) malloc((unsigned) nbytes)) == (DATUM *) NULL) 1924*45876Sbostic return (RET_ERROR); 1925*45876Sbostic 1926*45876Sbostic d->d_ksize = keysize; 1927*45876Sbostic d->d_dsize = datasize; 1928*45876Sbostic d->d_flags = 0; 1929*45876Sbostic 1930*45876Sbostic if (bigkey) { 1931*45876Sbostic if (_bt_indirect(t, key, &pgno) == RET_ERROR) 1932*45876Sbostic return (RET_ERROR); 1933*45876Sbostic (void) bcopy((char *) &pgno, &(d->d_bytes[0]), sizeof(pgno)); 1934*45876Sbostic d->d_flags |= D_BIGKEY; 1935*45876Sbostic if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1936*45876Sbostic return (RET_ERROR); 1937*45876Sbostic } else { 1938*45876Sbostic if (d->d_ksize > 0) { 1939*45876Sbostic (void) bcopy((char *) key->data, 1940*45876Sbostic (char *) &(d->d_bytes[0]), 1941*45876Sbostic (int) d->d_ksize); 1942*45876Sbostic } 1943*45876Sbostic } 1944*45876Sbostic 1945*45876Sbostic if (bigdata) { 1946*45876Sbostic if (_bt_indirect(t, data, &pgno) == RET_ERROR) 1947*45876Sbostic return (RET_ERROR); 1948*45876Sbostic (void) bcopy((char *) &pgno, 1949*45876Sbostic &(d->d_bytes[keysize]), 1950*45876Sbostic sizeof(pgno)); 1951*45876Sbostic d->d_flags |= D_BIGDATA; 1952*45876Sbostic if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1953*45876Sbostic return (RET_ERROR); 1954*45876Sbostic } else { 1955*45876Sbostic if (d->d_dsize > 0) { 1956*45876Sbostic (void) bcopy((char *) data->data, 1957*45876Sbostic (char *) &(d->d_bytes[keysize]), 1958*45876Sbostic (int) d->d_dsize); 1959*45876Sbostic } 1960*45876Sbostic } 1961*45876Sbostic 1962*45876Sbostic /* do the insertion */ 1963*45876Sbostic status = _bt_insertat(t, d, index); 1964*45876Sbostic 1965*45876Sbostic (void) free((char *) d); 1966*45876Sbostic 1967*45876Sbostic return (status); 1968*45876Sbostic } 1969*45876Sbostic 1970*45876Sbostic /* 1971*45876Sbostic * _BT_INDIRECT -- Write a series of indirect pages for big objects. 1972*45876Sbostic * 1973*45876Sbostic * A chain of indirect pages looks like 1974*45876Sbostic * 1975*45876Sbostic * +-------------------+ +---------------------+ 1976*45876Sbostic * |hdr|size| | |hdr| | 1977*45876Sbostic * +---+----+ | +---+ | 1978*45876Sbostic * | ... data ... | | ... data ... | ... 1979*45876Sbostic * | | | | 1980*45876Sbostic * +-------------------+ +---------------------+ 1981*45876Sbostic * 1982*45876Sbostic * where hdr is a standard btree page header (with the indirect bit 1983*45876Sbostic * set), size on the first page is the real size of the datum, and 1984*45876Sbostic * data are bytes of the datum, split across as many pages as necessary. 1985*45876Sbostic * Indirect pages are chained together with the h_prevpg and h_nextpg 1986*45876Sbostic * entries of the page header struct. 1987*45876Sbostic * 1988*45876Sbostic * A single DBT is written per chain, so space on the last page is 1989*45876Sbostic * wasted. 1990*45876Sbostic * 1991*45876Sbostic * We return the page number of the start of the chain. 1992*45876Sbostic * 1993*45876Sbostic * When a big object is deleted from a tree, the space that it occupied 1994*45876Sbostic * is placed on a free list for later reuse. This routine checks that 1995*45876Sbostic * free list before allocating new pages to the big datum being inserted. 1996*45876Sbostic * 1997*45876Sbostic * Parameters: 1998*45876Sbostic * t -- btree in which to store indirect blocks 1999*45876Sbostic * data -- DBT with the big datum in it 2000*45876Sbostic * pgno -- place to put the starting page number of the chain 2001*45876Sbostic * 2002*45876Sbostic * Returns: 2003*45876Sbostic * RET_SUCCESS, RET_ERROR. 2004*45876Sbostic * 2005*45876Sbostic * Side Effects: 2006*45876Sbostic * Current page is changed on return. 2007*45876Sbostic */ 2008*45876Sbostic 2009*45876Sbostic static int 2010*45876Sbostic _bt_indirect(t, data, pgno) 2011*45876Sbostic BTREE_P t; 2012*45876Sbostic DBT *data; 2013*45876Sbostic pgno_t *pgno; 2014*45876Sbostic { 2015*45876Sbostic pgno_t save; 2016*45876Sbostic pgno_t prev; 2017*45876Sbostic char *top; 2018*45876Sbostic char *where; 2019*45876Sbostic char *from; 2020*45876Sbostic size_t dsize; 2021*45876Sbostic pgno_t nextchn; 2022*45876Sbostic int ischain; 2023*45876Sbostic BTHEADER *cur; 2024*45876Sbostic 2025*45876Sbostic /* get set for first page in chain */ 2026*45876Sbostic prev = P_NONE; 2027*45876Sbostic dsize = data->size; 2028*45876Sbostic from = (char *) data->data; 2029*45876Sbostic 2030*45876Sbostic /* if there are blocks on the free list, use them first */ 2031*45876Sbostic if ((*pgno = t->bt_free) == P_NONE) { 2032*45876Sbostic if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL) 2033*45876Sbostic return (RET_ERROR); 2034*45876Sbostic 2035*45876Sbostic ischain = 0; 2036*45876Sbostic *pgno = cur->h_pgno = ++(t->bt_npages); 2037*45876Sbostic } else { 2038*45876Sbostic if (_bt_getpage(t, *pgno) == RET_ERROR) 2039*45876Sbostic return (RET_ERROR); 2040*45876Sbostic ischain = 1; 2041*45876Sbostic cur = t->bt_curpage; 2042*45876Sbostic } 2043*45876Sbostic 2044*45876Sbostic cur->h_flags = F_CONT|F_LEAF; 2045*45876Sbostic (void) bcopy((char *) &dsize, (char *) &cur->h_linp[0], sizeof(size_t)); 2046*45876Sbostic where = ((char *) (&cur->h_linp[0])) + sizeof(size_t); 2047*45876Sbostic 2048*45876Sbostic /* fill and write pages in the chain */ 2049*45876Sbostic for (;;) { 2050*45876Sbostic int nhere; 2051*45876Sbostic 2052*45876Sbostic top = ((char *) cur) + t->bt_psize; 2053*45876Sbostic cur->h_prevpg = prev; 2054*45876Sbostic nextchn = cur->h_nextpg; 2055*45876Sbostic nhere = (int) (top - where); 2056*45876Sbostic 2057*45876Sbostic if (nhere >= dsize) { 2058*45876Sbostic (void) bcopy(from, where, (int) dsize); 2059*45876Sbostic cur->h_nextpg = P_NONE; 2060*45876Sbostic dsize = 0; 2061*45876Sbostic } else { 2062*45876Sbostic (void) bcopy(from, where, (int) nhere); 2063*45876Sbostic dsize -= nhere; 2064*45876Sbostic from += nhere; 2065*45876Sbostic if (nextchn == P_NONE) 2066*45876Sbostic cur->h_nextpg = t->bt_npages + 1; 2067*45876Sbostic prev = cur->h_pgno; 2068*45876Sbostic } 2069*45876Sbostic 2070*45876Sbostic /* current page is ready to go; write it out */ 2071*45876Sbostic if (_bt_write(t, cur, RELEASE) == RET_ERROR) 2072*45876Sbostic return (RET_ERROR); 2073*45876Sbostic 2074*45876Sbostic /* free the current page, if appropriate */ 2075*45876Sbostic if (ISDISK(t) && !ISCACHE(t) && !ischain) { 2076*45876Sbostic (void) free ((char *) cur); 2077*45876Sbostic } 2078*45876Sbostic 2079*45876Sbostic /* done? */ 2080*45876Sbostic if (dsize == 0) 2081*45876Sbostic break; 2082*45876Sbostic 2083*45876Sbostic /* allocate another page */ 2084*45876Sbostic if (nextchn == P_NONE) { 2085*45876Sbostic if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL) 2086*45876Sbostic return (RET_ERROR); 2087*45876Sbostic ischain = 0; 2088*45876Sbostic cur->h_pgno = ++(t->bt_npages); 2089*45876Sbostic } else { 2090*45876Sbostic if (_bt_getpage(t, nextchn) == RET_ERROR) 2091*45876Sbostic return (RET_ERROR); 2092*45876Sbostic ischain = 1; 2093*45876Sbostic cur = t->bt_curpage; 2094*45876Sbostic } 2095*45876Sbostic cur->h_flags = F_LEAF | F_CONT; 2096*45876Sbostic 2097*45876Sbostic where = (char *) (&cur->h_linp[0]); 2098*45876Sbostic } 2099*45876Sbostic 2100*45876Sbostic /* if we used pages from the free list, record changes to it */ 2101*45876Sbostic if (*pgno == t->bt_free) { 2102*45876Sbostic t->bt_free = nextchn; 2103*45876Sbostic t->bt_flags &= ~BTF_METAOK; 2104*45876Sbostic } 2105*45876Sbostic 2106*45876Sbostic return (RET_SUCCESS); 2107*45876Sbostic } 2108*45876Sbostic 2109*45876Sbostic /* 2110*45876Sbostic * _BT_SPLIT -- Split a page into two pages. 2111*45876Sbostic * 2112*45876Sbostic * Splits are caused by insertions, and propogate up the tree in 2113*45876Sbostic * the usual way. The root page is always page 1 in the file on 2114*45876Sbostic * disk, so root splits are handled specially. On entry to this 2115*45876Sbostic * routine, t->bt_curpage is the page to be split. 2116*45876Sbostic * 2117*45876Sbostic * Parameters: 2118*45876Sbostic * t -- btree in which to do split. 2119*45876Sbostic * 2120*45876Sbostic * Returns: 2121*45876Sbostic * RET_SUCCESS, RET_ERROR. 2122*45876Sbostic * 2123*45876Sbostic * Side Effects: 2124*45876Sbostic * Changes the notion of the current page. 2125*45876Sbostic */ 2126*45876Sbostic 2127*45876Sbostic static 2128*45876Sbostic _bt_split(t) 2129*45876Sbostic BTREE_P t; 2130*45876Sbostic { 2131*45876Sbostic BTHEADER *h; 2132*45876Sbostic BTHEADER *left, *right; 2133*45876Sbostic BTHEADER *next; 2134*45876Sbostic pgno_t nextpgno, parent; 2135*45876Sbostic int nbytes, len; 2136*45876Sbostic IDATUM *id; 2137*45876Sbostic DATUM *d; 2138*45876Sbostic char *src; 2139*45876Sbostic IDATUM *new; 2140*45876Sbostic pgno_t oldchain; 2141*45876Sbostic u_char flags; 2142*45876Sbostic 2143*45876Sbostic h = (BTHEADER *) t->bt_curpage; 2144*45876Sbostic 2145*45876Sbostic /* split root page specially, since it must remain page 1 */ 2146*45876Sbostic if (h->h_pgno == P_ROOT) { 2147*45876Sbostic return (_bt_splitroot(t)); 2148*45876Sbostic } 2149*45876Sbostic 2150*45876Sbostic /* 2151*45876Sbostic * This is a little complicated. We go to some trouble to 2152*45876Sbostic * figure out which of the three possible cases -- in-memory tree, 2153*45876Sbostic * disk tree (no cache), and disk tree (cache) -- we have, in order 2154*45876Sbostic * to avoid unnecessary copying. If we have a disk cache, then we 2155*45876Sbostic * have to do some extra copying, though, since the cache code 2156*45876Sbostic * manages buffers externally to this code. 2157*45876Sbostic */ 2158*45876Sbostic 2159*45876Sbostic if (ISDISK(t) && ISCACHE(t)) { 2160*45876Sbostic if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize)) 2161*45876Sbostic == (BTHEADER *) NULL) 2162*45876Sbostic return (RET_ERROR); 2163*45876Sbostic left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE; 2164*45876Sbostic left->h_flags = t->bt_curpage->h_flags; 2165*45876Sbostic left->h_lower = (index_t) 2166*45876Sbostic (((char *) &(left->h_linp[0])) - ((char *) left)); 2167*45876Sbostic left->h_upper = t->bt_psize; 2168*45876Sbostic 2169*45876Sbostic } else { 2170*45876Sbostic if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 2171*45876Sbostic return (RET_ERROR); 2172*45876Sbostic } 2173*45876Sbostic left->h_pgno = h->h_pgno; 2174*45876Sbostic 2175*45876Sbostic if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 2176*45876Sbostic return (RET_ERROR); 2177*45876Sbostic right->h_pgno = ++(t->bt_npages); 2178*45876Sbostic 2179*45876Sbostic /* now do the split */ 2180*45876Sbostic if (_bt_dopsplit(t, left, right) == RET_ERROR) 2181*45876Sbostic return (RET_ERROR); 2182*45876Sbostic 2183*45876Sbostic right->h_prevpg = left->h_pgno; 2184*45876Sbostic nextpgno = right->h_nextpg = h->h_nextpg; 2185*45876Sbostic left->h_nextpg = right->h_pgno; 2186*45876Sbostic left->h_prevpg = h->h_prevpg; 2187*45876Sbostic 2188*45876Sbostic /* okay, now use the left half of the page as the new page */ 2189*45876Sbostic if (ISDISK(t) && ISCACHE(t)) { 2190*45876Sbostic (void) bcopy((char *) left, (char *) t->bt_curpage, 2191*45876Sbostic (int) t->bt_psize); 2192*45876Sbostic (void) free ((char *) left); 2193*45876Sbostic left = t->bt_curpage; 2194*45876Sbostic } else { 2195*45876Sbostic (void) free((char *) t->bt_curpage); 2196*45876Sbostic t->bt_curpage = left; 2197*45876Sbostic } 2198*45876Sbostic 2199*45876Sbostic /* 2200*45876Sbostic * Write the new pages out. We need them to stay where they are 2201*45876Sbostic * until we're done updating the parent pages. 2202*45876Sbostic */ 2203*45876Sbostic 2204*45876Sbostic if (_bt_write(t, left, NORELEASE) == RET_ERROR) 2205*45876Sbostic return (RET_ERROR); 2206*45876Sbostic if (_bt_write(t, right, NORELEASE) == RET_ERROR) 2207*45876Sbostic return (RET_ERROR); 2208*45876Sbostic 2209*45876Sbostic /* update 'prev' pointer of old neighbor of left */ 2210*45876Sbostic if (nextpgno != P_NONE) { 2211*45876Sbostic if (_bt_getpage(t, nextpgno) == RET_ERROR) 2212*45876Sbostic return (RET_ERROR); 2213*45876Sbostic h = t->bt_curpage; 2214*45876Sbostic h->h_prevpg = right->h_pgno; 2215*45876Sbostic h->h_flags |= F_DIRTY; 2216*45876Sbostic } 2217*45876Sbostic 2218*45876Sbostic if ((parent = _bt_pop(t)) != P_NONE) { 2219*45876Sbostic if (right->h_flags & F_LEAF) { 2220*45876Sbostic d = (DATUM *) GETDATUM(right, 0); 2221*45876Sbostic len = d->d_ksize; 2222*45876Sbostic if (d->d_flags & D_BIGKEY) { 2223*45876Sbostic bcopy(&(d->d_bytes[0]), 2224*45876Sbostic (char *) &oldchain, 2225*45876Sbostic sizeof(oldchain)); 2226*45876Sbostic if (_bt_markchain(t, oldchain) == RET_ERROR) 2227*45876Sbostic return (RET_ERROR); 2228*45876Sbostic src = (char *) &oldchain; 2229*45876Sbostic flags = D_BIGKEY; 2230*45876Sbostic } else { 2231*45876Sbostic src = (char *) &(d->d_bytes[0]); 2232*45876Sbostic flags = 0; 2233*45876Sbostic } 2234*45876Sbostic } else { 2235*45876Sbostic id = (IDATUM *) GETDATUM(right, 0); 2236*45876Sbostic len = id->i_size; 2237*45876Sbostic flags = id->i_flags; 2238*45876Sbostic src = (char *) &(id->i_bytes[0]); 2239*45876Sbostic } 2240*45876Sbostic nbytes = len + (sizeof(IDATUM) - sizeof(char)); 2241*45876Sbostic new = (IDATUM *) malloc((unsigned) nbytes); 2242*45876Sbostic if (new == (IDATUM *) NULL) 2243*45876Sbostic return (RET_ERROR); 2244*45876Sbostic new->i_size = len; 2245*45876Sbostic new->i_pgno = right->h_pgno; 2246*45876Sbostic new->i_flags = flags; 2247*45876Sbostic if (len > 0) 2248*45876Sbostic (void) bcopy(src, (char *) &(new->i_bytes[0]), len); 2249*45876Sbostic 2250*45876Sbostic nbytes = LONGALIGN(nbytes) + sizeof(index_t); 2251*45876Sbostic if (_bt_getpage(t, parent) == RET_ERROR) 2252*45876Sbostic return (RET_ERROR); 2253*45876Sbostic 2254*45876Sbostic h = t->bt_curpage; 2255*45876Sbostic 2256*45876Sbostic /* 2257*45876Sbostic * Split the parent if we need to, then reposition the 2258*45876Sbostic * tree's current page pointer for the new datum. 2259*45876Sbostic */ 2260*45876Sbostic if ((h->h_upper - h->h_lower) < nbytes) { 2261*45876Sbostic if (_bt_split(t) == RET_ERROR) 2262*45876Sbostic return (RET_ERROR); 2263*45876Sbostic if (_bt_reposition(t, new, parent, right->h_prevpg) 2264*45876Sbostic == RET_ERROR) 2265*45876Sbostic return (RET_ERROR); 2266*45876Sbostic } 2267*45876Sbostic 2268*45876Sbostic /* okay, now insert the new idatum */ 2269*45876Sbostic if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR) 2270*45876Sbostic return (RET_ERROR); 2271*45876Sbostic } 2272*45876Sbostic 2273*45876Sbostic /* 2274*45876Sbostic * Okay, split is done; don't need right page stapled down anymore. 2275*45876Sbostic * The page we call 'left' above is the new version of the old 2276*45876Sbostic * (split) page, so we can't release it. 2277*45876Sbostic */ 2278*45876Sbostic 2279*45876Sbostic if (_bt_release(t, right) == RET_ERROR) 2280*45876Sbostic return (RET_ERROR); 2281*45876Sbostic if (ISDISK(t) && !ISCACHE(t)) 2282*45876Sbostic (void) free((char *) right); 2283*45876Sbostic 2284*45876Sbostic return (RET_SUCCESS); 2285*45876Sbostic } 2286*45876Sbostic 2287*45876Sbostic /* 2288*45876Sbostic * _BT_MARKCHAIN -- Mark a chain of pages as used by an internal node. 2289*45876Sbostic * 2290*45876Sbostic * Chains of indirect blocks pointed to by leaf nodes get reclaimed 2291*45876Sbostic * when the item that points to them gets deleted. Chains pointed 2292*45876Sbostic * to by internal nodes never get deleted. This routine marks a 2293*45876Sbostic * chain as pointed to by an internal node. 2294*45876Sbostic * 2295*45876Sbostic * Parameters: 2296*45876Sbostic * t -- tree in which to mark 2297*45876Sbostic * chain -- number of first page in chain 2298*45876Sbostic * 2299*45876Sbostic * Returns: 2300*45876Sbostic * RET_SUCCESS, RET_ERROR. 2301*45876Sbostic * 2302*45876Sbostic * Side Effects: 2303*45876Sbostic * None. 2304*45876Sbostic */ 2305*45876Sbostic 2306*45876Sbostic static int 2307*45876Sbostic _bt_markchain(t, chain) 2308*45876Sbostic BTREE_P t; 2309*45876Sbostic pgno_t chain; 2310*45876Sbostic { 2311*45876Sbostic pgno_t save; 2312*45876Sbostic 2313*45876Sbostic save = t->bt_curpage->h_pgno; 2314*45876Sbostic 2315*45876Sbostic if (_bt_getpage(t, chain) == RET_ERROR) 2316*45876Sbostic return (RET_ERROR); 2317*45876Sbostic 2318*45876Sbostic t->bt_curpage->h_flags |= (F_DIRTY|F_PRESERVE); 2319*45876Sbostic 2320*45876Sbostic if (_bt_getpage(t, save) == RET_ERROR) 2321*45876Sbostic return (RET_ERROR); 2322*45876Sbostic 2323*45876Sbostic return (RET_SUCCESS); 2324*45876Sbostic } 2325*45876Sbostic 2326*45876Sbostic /* 2327*45876Sbostic * _BT_REPOSITION -- Reposition the current page pointer of a btree. 2328*45876Sbostic * 2329*45876Sbostic * After splitting a node in the tree in order to make room for 2330*45876Sbostic * an insertion, we need to figure out which page after the split 2331*45876Sbostic * should get the item we want to insert. This routine positions 2332*45876Sbostic * the tree's current page pointer appropriately. 2333*45876Sbostic * 2334*45876Sbostic * Parameters: 2335*45876Sbostic * t -- tree to position 2336*45876Sbostic * new -- the item we want to insert 2337*45876Sbostic * parent -- parent of the node that we just split 2338*45876Sbostic * prev -- page number of item directly to the left of 2339*45876Sbostic * new's position in the tree. 2340*45876Sbostic * 2341*45876Sbostic * Returns: 2342*45876Sbostic * RET_SUCCESS, RET_ERROR. 2343*45876Sbostic * 2344*45876Sbostic * Side Effects: 2345*45876Sbostic * None. 2346*45876Sbostic */ 2347*45876Sbostic 2348*45876Sbostic static int 2349*45876Sbostic _bt_reposition(t, new, parent, prev) 2350*45876Sbostic BTREE_P t; 2351*45876Sbostic IDATUM *new; 2352*45876Sbostic pgno_t parent; 2353*45876Sbostic pgno_t prev; 2354*45876Sbostic { 2355*45876Sbostic index_t i, next; 2356*45876Sbostic IDATUM *idx; 2357*45876Sbostic 2358*45876Sbostic if (parent == P_ROOT) { 2359*45876Sbostic 2360*45876Sbostic /* 2361*45876Sbostic * If we just split the root page, then there are guaranteed 2362*45876Sbostic * to be exactly two IDATUMs on it. Look at both of them 2363*45876Sbostic * to see if they point to the page that we want. 2364*45876Sbostic */ 2365*45876Sbostic 2366*45876Sbostic if (_bt_getpage(t, parent) == RET_ERROR) 2367*45876Sbostic return (RET_ERROR); 2368*45876Sbostic 2369*45876Sbostic next = NEXTINDEX(t->bt_curpage); 2370*45876Sbostic for (i = 0; i < next; i++) { 2371*45876Sbostic idx = (IDATUM *) GETDATUM(t->bt_curpage, i); 2372*45876Sbostic if (_bt_getpage(t, idx->i_pgno) == RET_ERROR) 2373*45876Sbostic return (RET_ERROR); 2374*45876Sbostic if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2375*45876Sbostic return (RET_SUCCESS); 2376*45876Sbostic if (_bt_getpage(t, parent) == RET_ERROR) 2377*45876Sbostic return (RET_ERROR); 2378*45876Sbostic } 2379*45876Sbostic } else { 2380*45876Sbostic 2381*45876Sbostic /* 2382*45876Sbostic * Get the parent page -- which is where the new item would 2383*45876Sbostic * have gone -- and figure out whether the new item now goes 2384*45876Sbostic * on the parent, or the page immediately to the right of 2385*45876Sbostic * the parent. 2386*45876Sbostic */ 2387*45876Sbostic 2388*45876Sbostic if (_bt_getpage(t, parent) == RET_ERROR) 2389*45876Sbostic return (RET_ERROR); 2390*45876Sbostic if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2391*45876Sbostic return (RET_SUCCESS); 2392*45876Sbostic if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR) 2393*45876Sbostic return (RET_ERROR); 2394*45876Sbostic if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2395*45876Sbostic return (RET_SUCCESS); 2396*45876Sbostic } 2397*45876Sbostic return (RET_ERROR); 2398*45876Sbostic } 2399*45876Sbostic 2400*45876Sbostic /* 2401*45876Sbostic * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page? 2402*45876Sbostic * 2403*45876Sbostic * This routine is used by _bt_reposition to decide whether the current 2404*45876Sbostic * page is the correct one on which to insert a new item. 2405*45876Sbostic * 2406*45876Sbostic * Parameters: 2407*45876Sbostic * t -- tree to check 2408*45876Sbostic * new -- the item that will be inserted (used for binary search) 2409*45876Sbostic * prev -- page number of page whose IDATUM is immediately to 2410*45876Sbostic * the left of new's position in the tree. 2411*45876Sbostic * 2412*45876Sbostic * Returns: 2413*45876Sbostic * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE). 2414*45876Sbostic */ 2415*45876Sbostic 2416*45876Sbostic static int 2417*45876Sbostic _bt_isonpage(t, new, prev) 2418*45876Sbostic BTREE_P t; 2419*45876Sbostic IDATUM *new; 2420*45876Sbostic pgno_t prev; 2421*45876Sbostic { 2422*45876Sbostic BTHEADER *h = (BTHEADER *) t->bt_curpage; 2423*45876Sbostic index_t i, next; 2424*45876Sbostic IDATUM *idx; 2425*45876Sbostic 2426*45876Sbostic i = _bt_binsrch(t, &(new->i_bytes[0])); 2427*45876Sbostic while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0) 2428*45876Sbostic --i; 2429*45876Sbostic next = NEXTINDEX(h); 2430*45876Sbostic idx = (IDATUM *) GETDATUM(h, i); 2431*45876Sbostic while (i < next && idx->i_pgno != prev) { 2432*45876Sbostic i++; 2433*45876Sbostic idx = (IDATUM *) GETDATUM(h, i); 2434*45876Sbostic } 2435*45876Sbostic if (idx->i_pgno == prev) 2436*45876Sbostic return (RET_SUCCESS); 2437*45876Sbostic else 2438*45876Sbostic return (RET_ERROR); 2439*45876Sbostic } 2440*45876Sbostic 2441*45876Sbostic /* 2442*45876Sbostic * _BT_SPLITROOT -- Split the root of a btree. 2443*45876Sbostic * 2444*45876Sbostic * The root page for a btree is always page one. This means that in 2445*45876Sbostic * order to split the root, we need to do extra work. 2446*45876Sbostic * 2447*45876Sbostic * Parameters: 2448*45876Sbostic * t -- tree to split 2449*45876Sbostic * 2450*45876Sbostic * Returns: 2451*45876Sbostic * RET_SUCCESS, RET_ERROR. 2452*45876Sbostic * 2453*45876Sbostic * Side Effects: 2454*45876Sbostic * Splits root upward in the usual way, adding two new pages 2455*45876Sbostic * to the tree (rather than just one, as in usual splits). 2456*45876Sbostic */ 2457*45876Sbostic 2458*45876Sbostic static 2459*45876Sbostic _bt_splitroot(t) 2460*45876Sbostic BTREE_P t; 2461*45876Sbostic { 2462*45876Sbostic BTHEADER *h = t->bt_curpage; 2463*45876Sbostic BTHEADER *left, *right; 2464*45876Sbostic DATUM *d; 2465*45876Sbostic IDATUM *id; 2466*45876Sbostic char *where; 2467*45876Sbostic BTHEADER *where_h; 2468*45876Sbostic pgno_t lastpg; 2469*45876Sbostic char *src, *dest; 2470*45876Sbostic int len, nbytes; 2471*45876Sbostic u_long was_leaf; 2472*45876Sbostic pgno_t oldchain; 2473*45876Sbostic u_char flags; 2474*45876Sbostic 2475*45876Sbostic /* get two new pages for the split */ 2476*45876Sbostic if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 2477*45876Sbostic return (RET_ERROR); 2478*45876Sbostic left->h_pgno = ++(t->bt_npages); 2479*45876Sbostic if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 2480*45876Sbostic return (RET_ERROR); 2481*45876Sbostic right->h_pgno = ++(t->bt_npages); 2482*45876Sbostic 2483*45876Sbostic /* do the split */ 2484*45876Sbostic if (_bt_dopsplit(t, left, right) == RET_ERROR) 2485*45876Sbostic return (RET_ERROR); 2486*45876Sbostic 2487*45876Sbostic /* connect the new pages correctly */ 2488*45876Sbostic right->h_prevpg = left->h_pgno; 2489*45876Sbostic left->h_nextpg = right->h_pgno; 2490*45876Sbostic 2491*45876Sbostic /* 2492*45876Sbostic * Write the child pages out now. We need them to remain 2493*45876Sbostic * where they are until we finish updating parent pages, 2494*45876Sbostic * however. 2495*45876Sbostic */ 2496*45876Sbostic 2497*45876Sbostic if (_bt_write(t, left, NORELEASE) == RET_ERROR) 2498*45876Sbostic return (RET_ERROR); 2499*45876Sbostic if (_bt_write(t, right, NORELEASE) == RET_ERROR) 2500*45876Sbostic return (RET_ERROR); 2501*45876Sbostic 2502*45876Sbostic /* now change the root page into an internal page */ 2503*45876Sbostic was_leaf = (h->h_flags & F_LEAF); 2504*45876Sbostic h->h_flags &= ~F_LEAF; 2505*45876Sbostic h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h)); 2506*45876Sbostic h->h_upper = (index_t) t->bt_psize; 2507*45876Sbostic (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower)); 2508*45876Sbostic 2509*45876Sbostic /* put two new keys on root page */ 2510*45876Sbostic where_h = left; 2511*45876Sbostic while (where_h) { 2512*45876Sbostic DATUM *data; 2513*45876Sbostic IDATUM *idata; 2514*45876Sbostic 2515*45876Sbostic if (was_leaf) { 2516*45876Sbostic data = (DATUM *) GETDATUM(where_h, 0); 2517*45876Sbostic 2518*45876Sbostic if (where_h == left) { 2519*45876Sbostic len = 0; /* first key in tree is null */ 2520*45876Sbostic } else { 2521*45876Sbostic if (data->d_flags & D_BIGKEY) { 2522*45876Sbostic bcopy(&(data->d_bytes[0]), 2523*45876Sbostic (char *) &oldchain, 2524*45876Sbostic sizeof(oldchain)); 2525*45876Sbostic if (_bt_markchain(t, oldchain) == RET_ERROR) 2526*45876Sbostic return (RET_ERROR); 2527*45876Sbostic src = (char *) &oldchain; 2528*45876Sbostic flags = D_BIGKEY; 2529*45876Sbostic } else { 2530*45876Sbostic src = (char *) &(data->d_bytes[0]); 2531*45876Sbostic flags = 0; 2532*45876Sbostic } 2533*45876Sbostic len = data->d_ksize; 2534*45876Sbostic } 2535*45876Sbostic } else { 2536*45876Sbostic idata = (IDATUM *) GETDATUM(where_h, 0); 2537*45876Sbostic len = idata->i_size; 2538*45876Sbostic flags = idata->i_flags; 2539*45876Sbostic src = &(idata->i_bytes[0]); 2540*45876Sbostic } 2541*45876Sbostic dest = ((char *) h) + h->h_upper; 2542*45876Sbostic nbytes = len + (sizeof (IDATUM) - sizeof(char)); 2543*45876Sbostic dest -= LONGALIGN(nbytes); 2544*45876Sbostic id = (IDATUM *) dest; 2545*45876Sbostic id->i_size = len; 2546*45876Sbostic id->i_pgno = where_h->h_pgno; 2547*45876Sbostic id->i_flags = flags; 2548*45876Sbostic if (len > 0) 2549*45876Sbostic (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len); 2550*45876Sbostic dest -= ((int) h); 2551*45876Sbostic h->h_linp[NEXTINDEX(h)] = (index_t) dest; 2552*45876Sbostic h->h_upper = (index_t) dest; 2553*45876Sbostic h->h_lower += sizeof(index_t); 2554*45876Sbostic 2555*45876Sbostic /* next page */ 2556*45876Sbostic if (where_h == left) 2557*45876Sbostic where_h = right; 2558*45876Sbostic else 2559*45876Sbostic where_h = (BTHEADER *) NULL; 2560*45876Sbostic } 2561*45876Sbostic 2562*45876Sbostic if (_bt_release(t, left) == RET_ERROR) 2563*45876Sbostic return (RET_ERROR); 2564*45876Sbostic if (_bt_release(t, right) == RET_ERROR) 2565*45876Sbostic return (RET_ERROR); 2566*45876Sbostic 2567*45876Sbostic /* 2568*45876Sbostic * That's it, split is done. If we're doing a non-cached disk 2569*45876Sbostic * btree, we can free up the pages we allocated, as they're on 2570*45876Sbostic * disk, now. 2571*45876Sbostic */ 2572*45876Sbostic 2573*45876Sbostic if (ISDISK(t) && !ISCACHE(t)) { 2574*45876Sbostic (void) free ((char *) left); 2575*45876Sbostic (void) free ((char *) right); 2576*45876Sbostic } 2577*45876Sbostic 2578*45876Sbostic h->h_flags |= F_DIRTY; 2579*45876Sbostic 2580*45876Sbostic return (RET_SUCCESS); 2581*45876Sbostic } 2582*45876Sbostic 2583*45876Sbostic /* 2584*45876Sbostic * _BT_DOPSPLIT -- Do the work of splitting a page 2585*45876Sbostic * 2586*45876Sbostic * This routine takes two page pointers and splits the data on the 2587*45876Sbostic * current page of the btree between them. 2588*45876Sbostic * 2589*45876Sbostic * We do a lot of work here to handle duplicate keys on a page; we 2590*45876Sbostic * have to place these keys carefully to guarantee that later searches 2591*45876Sbostic * will find them correctly. See comments in the code below for details. 2592*45876Sbostic * 2593*45876Sbostic * Parameters: 2594*45876Sbostic * t -- tree to split 2595*45876Sbostic * left -- pointer to page to get left half of the data 2596*45876Sbostic * right -- pointer to page to get right half of the data 2597*45876Sbostic * 2598*45876Sbostic * Returns: 2599*45876Sbostic * None. 2600*45876Sbostic */ 2601*45876Sbostic 2602*45876Sbostic static 2603*45876Sbostic _bt_dopsplit(t, left, right) 2604*45876Sbostic BTREE_P t; 2605*45876Sbostic BTHEADER *left; 2606*45876Sbostic BTHEADER *right; 2607*45876Sbostic { 2608*45876Sbostic BTHEADER *h = t->bt_curpage; 2609*45876Sbostic size_t psize; 2610*45876Sbostic char *where; 2611*45876Sbostic BTHEADER *where_h; 2612*45876Sbostic index_t where_i; 2613*45876Sbostic int nbytes, dsize, fixedsize, freespc; 2614*45876Sbostic index_t i; 2615*45876Sbostic index_t save_lower, save_upper, save_i; 2616*45876Sbostic index_t switch_i; 2617*45876Sbostic char *save_key; 2618*45876Sbostic DATUM *d; 2619*45876Sbostic CURSOR *c; 2620*45876Sbostic index_t top; 2621*45876Sbostic int free_save; 2622*45876Sbostic pgno_t chain; 2623*45876Sbostic int ignore; 2624*45876Sbostic 2625*45876Sbostic /* 2626*45876Sbostic * Our strategy is to put half the bytes on each page. We figure 2627*45876Sbostic * out how many bytes we have total, and then move items until 2628*45876Sbostic * the last item moved put at least 50% of the data on the left 2629*45876Sbostic * page. 2630*45876Sbostic */ 2631*45876Sbostic save_key = (char *) NULL; 2632*45876Sbostic psize = (int) t->bt_psize; 2633*45876Sbostic where = ((char *) left) + psize; 2634*45876Sbostic where_h = left; 2635*45876Sbostic where_i = 0; 2636*45876Sbostic nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h)); 2637*45876Sbostic freespc = nbytes; 2638*45876Sbostic 2639*45876Sbostic top = NEXTINDEX(h); 2640*45876Sbostic if (h->h_flags & F_LEAF) 2641*45876Sbostic fixedsize = (sizeof(DATUM) - sizeof(char)); 2642*45876Sbostic else 2643*45876Sbostic fixedsize = (sizeof(IDATUM) - sizeof(char)); 2644*45876Sbostic 2645*45876Sbostic save_key = (char *) NULL; 2646*45876Sbostic 2647*45876Sbostic /* move data */ 2648*45876Sbostic for (i = 0; i < top; i++) { 2649*45876Sbostic 2650*45876Sbostic /* 2651*45876Sbostic * Internal and leaf pages have different layouts for 2652*45876Sbostic * data items, but in both cases the first entry in the 2653*45876Sbostic * data item is a size_t. 2654*45876Sbostic */ 2655*45876Sbostic d = (DATUM *) GETDATUM(h,i); 2656*45876Sbostic if (h->h_flags & F_LEAF) { 2657*45876Sbostic dsize = d->d_ksize + d->d_dsize + fixedsize; 2658*45876Sbostic } else { 2659*45876Sbostic dsize = d->d_ksize + fixedsize; 2660*45876Sbostic } 2661*45876Sbostic 2662*45876Sbostic /* 2663*45876Sbostic * If a page contains duplicate keys, we have to be 2664*45876Sbostic * careful about splits. A sequence of duplicate keys 2665*45876Sbostic * may not begin in the middle of one page and end in 2666*45876Sbostic * the middle of another; it must begin on a page boundary, 2667*45876Sbostic * in order for searches on the internal nodes to work 2668*45876Sbostic * correctly. 2669*45876Sbostic */ 2670*45876Sbostic if (where_h == left) { 2671*45876Sbostic if (save_key == (char *) NULL) { 2672*45876Sbostic if (h->h_flags & F_LEAF) { 2673*45876Sbostic if (d->d_flags & D_BIGKEY) { 2674*45876Sbostic free_save = TRUE; 2675*45876Sbostic bcopy(&(d->d_bytes[0]), 2676*45876Sbostic (char *) &chain, 2677*45876Sbostic sizeof(chain)); 2678*45876Sbostic if (_bt_getbig(t, chain, 2679*45876Sbostic &save_key, 2680*45876Sbostic &ignore) 2681*45876Sbostic == RET_ERROR) 2682*45876Sbostic return (RET_ERROR); 2683*45876Sbostic } else { 2684*45876Sbostic free_save = FALSE; 2685*45876Sbostic save_key = (char *) &(d->d_bytes[0]); 2686*45876Sbostic } 2687*45876Sbostic } else { 2688*45876Sbostic IDATUM *id = (IDATUM *) d; 2689*45876Sbostic 2690*45876Sbostic if (id->i_flags & D_BIGKEY) { 2691*45876Sbostic free_save = TRUE; 2692*45876Sbostic bcopy(&(id->i_bytes[0]), 2693*45876Sbostic (char *) &chain, 2694*45876Sbostic sizeof(chain)); 2695*45876Sbostic if (_bt_getbig(t, chain, 2696*45876Sbostic &save_key, 2697*45876Sbostic &ignore) 2698*45876Sbostic == RET_ERROR) 2699*45876Sbostic return (RET_ERROR); 2700*45876Sbostic } else { 2701*45876Sbostic free_save = FALSE; 2702*45876Sbostic save_key = (char *) 2703*45876Sbostic &(id->i_bytes[0]); 2704*45876Sbostic } 2705*45876Sbostic } 2706*45876Sbostic save_i = 0; 2707*45876Sbostic save_lower = where_h->h_lower; 2708*45876Sbostic save_upper = where_h->h_upper; 2709*45876Sbostic } else { 2710*45876Sbostic if (_bt_cmp(t, save_key, i) != 0) { 2711*45876Sbostic save_lower = where_h->h_lower; 2712*45876Sbostic save_upper = where_h->h_upper; 2713*45876Sbostic save_i = i; 2714*45876Sbostic } 2715*45876Sbostic if (h->h_flags & F_LEAF) { 2716*45876Sbostic if (free_save) 2717*45876Sbostic (void) free(save_key); 2718*45876Sbostic if (d->d_flags & D_BIGKEY) { 2719*45876Sbostic free_save = TRUE; 2720*45876Sbostic bcopy(&(d->d_bytes[0]), 2721*45876Sbostic (char *) &chain, 2722*45876Sbostic sizeof(chain)); 2723*45876Sbostic if (_bt_getbig(t, chain, 2724*45876Sbostic &save_key, 2725*45876Sbostic &ignore) 2726*45876Sbostic == RET_ERROR) 2727*45876Sbostic return (RET_ERROR); 2728*45876Sbostic } else { 2729*45876Sbostic free_save = FALSE; 2730*45876Sbostic save_key = (char *) &(d->d_bytes[0]); 2731*45876Sbostic } 2732*45876Sbostic } else { 2733*45876Sbostic IDATUM *id = (IDATUM *) d; 2734*45876Sbostic 2735*45876Sbostic if (id->i_flags & D_BIGKEY) { 2736*45876Sbostic free_save = TRUE; 2737*45876Sbostic bcopy(&(id->i_bytes[0]), 2738*45876Sbostic (char *) &chain, 2739*45876Sbostic sizeof(chain)); 2740*45876Sbostic if (_bt_getbig(t, chain, 2741*45876Sbostic &save_key, 2742*45876Sbostic &ignore) 2743*45876Sbostic == RET_ERROR) 2744*45876Sbostic return (RET_ERROR); 2745*45876Sbostic } else { 2746*45876Sbostic free_save = FALSE; 2747*45876Sbostic save_key = (char *) 2748*45876Sbostic &(id->i_bytes[0]); 2749*45876Sbostic } 2750*45876Sbostic } 2751*45876Sbostic } 2752*45876Sbostic } 2753*45876Sbostic 2754*45876Sbostic /* copy data and update page state */ 2755*45876Sbostic where -= LONGALIGN(dsize); 2756*45876Sbostic (void) bcopy((char *) d, (char *) where, dsize); 2757*45876Sbostic where_h->h_upper = where_h->h_linp[where_i] = 2758*45876Sbostic (index_t) (where - (int) where_h); 2759*45876Sbostic where_h->h_lower += sizeof(index_t); 2760*45876Sbostic where_i++; 2761*45876Sbostic 2762*45876Sbostic /* if we've moved half, switch to the right-hand page */ 2763*45876Sbostic nbytes -= LONGALIGN(dsize) + sizeof(index_t); 2764*45876Sbostic if ((freespc - nbytes) > nbytes) { 2765*45876Sbostic nbytes = 2 * freespc; 2766*45876Sbostic 2767*45876Sbostic /* identical keys go on the same page */ 2768*45876Sbostic if (save_i > 0) { 2769*45876Sbostic /* i gets incremented at loop bottom... */ 2770*45876Sbostic i = save_i - 1; 2771*45876Sbostic where_h->h_lower = save_lower; 2772*45876Sbostic where_h->h_upper = save_upper; 2773*45876Sbostic } 2774*45876Sbostic where = ((char *) right) + psize; 2775*45876Sbostic where_h = right; 2776*45876Sbostic switch_i = where_i; 2777*45876Sbostic where_i = 0; 2778*45876Sbostic } 2779*45876Sbostic } 2780*45876Sbostic 2781*45876Sbostic /* 2782*45876Sbostic * If there was an active scan on the database, and we just 2783*45876Sbostic * split the page that the cursor was on, we may need to 2784*45876Sbostic * adjust the cursor to point to the same entry as before the 2785*45876Sbostic * split. 2786*45876Sbostic */ 2787*45876Sbostic 2788*45876Sbostic c = &(t->bt_cursor); 2789*45876Sbostic if ((t->bt_flags & BTF_SEQINIT) 2790*45876Sbostic && (c->c_pgno == h->h_pgno) 2791*45876Sbostic && (c->c_index >= switch_i)) { 2792*45876Sbostic c->c_pgno = where_h->h_pgno; 2793*45876Sbostic c->c_index -= where_i; 2794*45876Sbostic } 2795*45876Sbostic return (RET_SUCCESS); 2796*45876Sbostic } 2797*45876Sbostic 2798*45876Sbostic /* 2799*45876Sbostic * _BT_INSERTI -- Insert IDATUM on current page in the btree. 2800*45876Sbostic * 2801*45876Sbostic * This routine handles insertions to internal pages after splits 2802*45876Sbostic * lower in the tree. On entry, t->bt_curpage is the page to get 2803*45876Sbostic * the new IDATUM. We are also given pgno, the page number of the 2804*45876Sbostic * IDATUM that is immediately left of the new IDATUM's position. 2805*45876Sbostic * This guarantees that the IDATUM for the right half of the page 2806*45876Sbostic * after a split goes next to the IDATUM for its left half. 2807*45876Sbostic * 2808*45876Sbostic * Parameters: 2809*45876Sbostic * t -- tree in which to do insertion. 2810*45876Sbostic * id -- new IDATUM to insert 2811*45876Sbostic * pgno -- page number of IDATUM left of id's position 2812*45876Sbostic * 2813*45876Sbostic * Returns: 2814*45876Sbostic * RET_SUCCESS, RET_ERROR. 2815*45876Sbostic */ 2816*45876Sbostic 2817*45876Sbostic static int 2818*45876Sbostic _bt_inserti(t, id, pgno) 2819*45876Sbostic BTREE_P t; 2820*45876Sbostic IDATUM *id; 2821*45876Sbostic pgno_t pgno; 2822*45876Sbostic { 2823*45876Sbostic BTHEADER *h = t->bt_curpage; 2824*45876Sbostic index_t next, i; 2825*45876Sbostic IDATUM *idx; 2826*45876Sbostic char *key; 2827*45876Sbostic pgno_t chain; 2828*45876Sbostic int free_key; 2829*45876Sbostic int ignore; 2830*45876Sbostic 2831*45876Sbostic if (id->i_flags & D_BIGKEY) { 2832*45876Sbostic free_key = TRUE; 2833*45876Sbostic bcopy(&(id->i_bytes[0]), (char *) &chain, sizeof(chain)); 2834*45876Sbostic if (_bt_getbig(t, chain, &key, &ignore) == RET_ERROR) 2835*45876Sbostic return (RET_ERROR); 2836*45876Sbostic } else { 2837*45876Sbostic free_key = FALSE; 2838*45876Sbostic key = &(id->i_bytes[0]); 2839*45876Sbostic } 2840*45876Sbostic i = _bt_binsrch(t, key); 2841*45876Sbostic 2842*45876Sbostic next = NEXTINDEX(h); 2843*45876Sbostic while (i < next && _bt_cmp(t, key, i) >= 0) 2844*45876Sbostic i++; 2845*45876Sbostic 2846*45876Sbostic if (free_key) 2847*45876Sbostic (void) free(key); 2848*45876Sbostic 2849*45876Sbostic /* okay, now we're close; find adjacent IDATUM */ 2850*45876Sbostic for (;;) { 2851*45876Sbostic idx = (IDATUM *) GETDATUM(h,i); 2852*45876Sbostic if (idx->i_pgno == pgno) { 2853*45876Sbostic i++; 2854*45876Sbostic break; 2855*45876Sbostic } 2856*45876Sbostic --i; 2857*45876Sbostic } 2858*45876Sbostic 2859*45876Sbostic /* correctly positioned, do the insertion */ 2860*45876Sbostic return (_bt_insertat(t, id, i)); 2861*45876Sbostic } 2862*45876Sbostic 2863*45876Sbostic /* 2864*45876Sbostic * _BT_INSERTAT -- Insert a datum at a given location on the current page. 2865*45876Sbostic * 2866*45876Sbostic * This routine does insertions on both leaf and internal pages. 2867*45876Sbostic * 2868*45876Sbostic * Parameters: 2869*45876Sbostic * t -- tree in which to do insertion. 2870*45876Sbostic * p -- DATUM or IDATUM to insert. 2871*45876Sbostic * index -- index in line pointer array to put this item. 2872*45876Sbostic * 2873*45876Sbostic * Returns: 2874*45876Sbostic * RET_SUCCESS, RET_ERROR. 2875*45876Sbostic * 2876*45876Sbostic * Side Effects: 2877*45876Sbostic * Will rearrange line pointers to make space for the new 2878*45876Sbostic * entry. This means that any scans currently active are 2879*45876Sbostic * invalid after this. 2880*45876Sbostic * 2881*45876Sbostic * Warnings: 2882*45876Sbostic * There must be sufficient room for the new item on the page. 2883*45876Sbostic */ 2884*45876Sbostic 2885*45876Sbostic static int 2886*45876Sbostic _bt_insertat(t, p, index) 2887*45876Sbostic BTREE_P t; 2888*45876Sbostic char *p; 2889*45876Sbostic index_t index; 2890*45876Sbostic { 2891*45876Sbostic IDATUM *id = (IDATUM *) p; 2892*45876Sbostic DATUM *d = (DATUM *) p; 2893*45876Sbostic BTHEADER *h; 2894*45876Sbostic CURSOR *c; 2895*45876Sbostic index_t nxtindex; 2896*45876Sbostic char *src, *dest; 2897*45876Sbostic int nbytes; 2898*45876Sbostic 2899*45876Sbostic /* insertion may confuse an active scan. fix it. */ 2900*45876Sbostic c = &(t->bt_cursor); 2901*45876Sbostic if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno) 2902*45876Sbostic if (_bt_fixscan(t, index, d, INSERT) == RET_ERROR) 2903*45876Sbostic return (RET_ERROR); 2904*45876Sbostic 2905*45876Sbostic h = t->bt_curpage; 2906*45876Sbostic nxtindex = (index_t) NEXTINDEX(h); 2907*45876Sbostic 2908*45876Sbostic /* 2909*45876Sbostic * If we're inserting at the middle of the line pointer array, 2910*45876Sbostic * copy pointers that will follow the new one up on the page. 2911*45876Sbostic */ 2912*45876Sbostic 2913*45876Sbostic if (index < nxtindex) { 2914*45876Sbostic src = (char *) &(h->h_linp[index]); 2915*45876Sbostic dest = (char *) &(h->h_linp[index + 1]); 2916*45876Sbostic nbytes = (h->h_lower - (src - ((char *) h))) 2917*45876Sbostic + sizeof(h->h_linp[0]); 2918*45876Sbostic (void) bcopy(src, dest, nbytes); 2919*45876Sbostic } 2920*45876Sbostic 2921*45876Sbostic /* compute size and copy data to page */ 2922*45876Sbostic if (h->h_flags & F_LEAF) { 2923*45876Sbostic nbytes = d->d_ksize + d->d_dsize 2924*45876Sbostic + (sizeof(DATUM) - sizeof(char)); 2925*45876Sbostic } else { 2926*45876Sbostic nbytes = id->i_size + (sizeof(IDATUM) - sizeof(char)); 2927*45876Sbostic } 2928*45876Sbostic dest = (((char *) h) + h->h_upper) - LONGALIGN(nbytes); 2929*45876Sbostic (void) bcopy((char *) p, dest, nbytes); 2930*45876Sbostic 2931*45876Sbostic /* update statistics */ 2932*45876Sbostic dest -= (int) h; 2933*45876Sbostic h->h_linp[index] = (index_t) dest; 2934*45876Sbostic h->h_upper = (index_t) dest; 2935*45876Sbostic h->h_lower += sizeof(index_t); 2936*45876Sbostic 2937*45876Sbostic /* we're done */ 2938*45876Sbostic h->h_flags |= F_DIRTY; 2939*45876Sbostic 2940*45876Sbostic return (RET_SUCCESS); 2941*45876Sbostic } 2942*45876Sbostic 2943*45876Sbostic /* 2944*45876Sbostic * _BT_BINSRCH -- Do a binary search for a given key on the current page. 2945*45876Sbostic * 2946*45876Sbostic * Searches on internal pages are handled slightly differently from 2947*45876Sbostic * searches on leaf pages. This is because internal page searches 2948*45876Sbostic * find the largest item <= key in the tree, and leaf searches find 2949*45876Sbostic * the smallest item >= key. This guarantees that leaf page searches 2950*45876Sbostic * leave us pointing at the item's correct position, and internal 2951*45876Sbostic * searches descend the tree correctly. 2952*45876Sbostic * 2953*45876Sbostic * Parameters: 2954*45876Sbostic * t -- tree to search 2955*45876Sbostic * key -- key we're looking for 2956*45876Sbostic * 2957*45876Sbostic * Returns: 2958*45876Sbostic * Index of the line pointer array entry for the (closest) 2959*45876Sbostic * match to key on the current page, with "closest" as defined 2960*45876Sbostic * above. 2961*45876Sbostic */ 2962*45876Sbostic 2963*45876Sbostic static index_t 2964*45876Sbostic _bt_binsrch(t, key) 2965*45876Sbostic BTREE_P t; 2966*45876Sbostic char *key; 2967*45876Sbostic { 2968*45876Sbostic index_t lbound, ubound, cur; 2969*45876Sbostic BTHEADER *h = t->bt_curpage; 2970*45876Sbostic IDATUM *id; 2971*45876Sbostic int match = 0; 2972*45876Sbostic int r; 2973*45876Sbostic 2974*45876Sbostic lbound = 0; 2975*45876Sbostic ubound = NEXTINDEX(h); 2976*45876Sbostic if (ubound > 0) 2977*45876Sbostic --ubound; 2978*45876Sbostic 2979*45876Sbostic /* do a binary search on the current page */ 2980*45876Sbostic while ((ubound - lbound) > 1) { 2981*45876Sbostic cur = lbound + ((ubound - lbound) / 2); 2982*45876Sbostic r = _bt_cmp(t, key, cur); 2983*45876Sbostic 2984*45876Sbostic if (r > 0) 2985*45876Sbostic lbound = cur + 1; 2986*45876Sbostic else if (r < 0) 2987*45876Sbostic ubound = cur; 2988*45876Sbostic else { 2989*45876Sbostic match++; 2990*45876Sbostic break; 2991*45876Sbostic } 2992*45876Sbostic } 2993*45876Sbostic 2994*45876Sbostic /* 2995*45876Sbostic * At this point, the binary search terminated because the endpoints 2996*45876Sbostic * got too close together, or we have a match. Figure out which 2997*45876Sbostic * case applies, decide what to do based on the page type (leaf or 2998*45876Sbostic * internal), and do the right thing. 2999*45876Sbostic */ 3000*45876Sbostic if (match) { 3001*45876Sbostic return (cur); 3002*45876Sbostic } else if (ubound != lbound) { 3003*45876Sbostic if (h->h_flags & F_LEAF) { 3004*45876Sbostic r = _bt_cmp(t, key, lbound); 3005*45876Sbostic if (r <= 0) { 3006*45876Sbostic return (lbound); 3007*45876Sbostic } 3008*45876Sbostic } else { 3009*45876Sbostic r = _bt_cmp(t, key, ubound); 3010*45876Sbostic 3011*45876Sbostic /* for internal nodes, move as far left as possible */ 3012*45876Sbostic if (r < 0) { 3013*45876Sbostic r = _bt_cmp(t, key, lbound); 3014*45876Sbostic if (r < 0 && lbound > 0) 3015*45876Sbostic --lbound; 3016*45876Sbostic return (lbound); 3017*45876Sbostic } else { 3018*45876Sbostic return (ubound); 3019*45876Sbostic } 3020*45876Sbostic } 3021*45876Sbostic } 3022*45876Sbostic 3023*45876Sbostic if (h->h_flags & F_LEAF) { 3024*45876Sbostic if (ubound < NEXTINDEX(h)) { 3025*45876Sbostic r = _bt_cmp(t, key, ubound); 3026*45876Sbostic if (r > 0) 3027*45876Sbostic ubound++; 3028*45876Sbostic } 3029*45876Sbostic } else { 3030*45876Sbostic /* for internal pages, move as far left as possible */ 3031*45876Sbostic if (ubound == NEXTINDEX(h)) 3032*45876Sbostic ubound--; 3033*45876Sbostic 3034*45876Sbostic while (_bt_cmp(t, key, ubound) < 0) 3035*45876Sbostic ubound--; 3036*45876Sbostic } 3037*45876Sbostic return (ubound); 3038*45876Sbostic } 3039*45876Sbostic 3040*45876Sbostic /* 3041*45876Sbostic * BT_SEQ -- Sequential scan interface. 3042*45876Sbostic * 3043*45876Sbostic * This routine supports sequential scans on the btree. If called with 3044*45876Sbostic * flags set to R_CURSOR, or if no seq scan has been initialized in the 3045*45876Sbostic * current tree, we initialize the scan. Otherwise, we advance the scan 3046*45876Sbostic * and return the next item. 3047*45876Sbostic * 3048*45876Sbostic * Scans can be either keyed or non-keyed. Keyed scans basically have 3049*45876Sbostic * a starting point somewhere in the middle of the tree. Non-keyed 3050*45876Sbostic * scans start at an endpoint. Also, scans can be backward (descending 3051*45876Sbostic * order), forward (ascending order), or no movement (keep returning 3052*45876Sbostic * the same item). 3053*45876Sbostic * 3054*45876Sbostic * Flags is checked every time we enter the routine, so the user can 3055*45876Sbostic * change directions on an active scan if desired. The key argument 3056*45876Sbostic * is examined only when we initialize the scan, in order to position 3057*45876Sbostic * it properly. 3058*45876Sbostic * 3059*45876Sbostic * Items are returned via the key and data arguments passed in. 3060*45876Sbostic * 3061*45876Sbostic * Parameters: 3062*45876Sbostic * tree -- btree in which to do scan 3063*45876Sbostic * key -- key, used to position scan on initialization, and 3064*45876Sbostic * used to return key components to the user. 3065*45876Sbostic * data -- used to return data components to the user. 3066*45876Sbostic * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 3067*45876Sbostic * R_PREV. 3068*45876Sbostic * 3069*45876Sbostic * Returns: 3070*45876Sbostic * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 3071*45876Sbostic * exists in the tree in the specified direction. 3072*45876Sbostic * 3073*45876Sbostic * Side Effects: 3074*45876Sbostic * Changes the btree's notion of the current position in the 3075*45876Sbostic * scan. 3076*45876Sbostic * 3077*45876Sbostic * Warnings: 3078*45876Sbostic * The key and data items returned are static and will be 3079*45876Sbostic * overwritten by the next bt_get or bt_seq. 3080*45876Sbostic */ 3081*45876Sbostic 3082*45876Sbostic bt_seq(tree, key, data, flags) 3083*45876Sbostic BTREE tree; 3084*45876Sbostic DBT *key; 3085*45876Sbostic DBT *data; 3086*45876Sbostic int flags; 3087*45876Sbostic { 3088*45876Sbostic BTREE_P t = (BTREE_P) tree; 3089*45876Sbostic BTHEADER *h; 3090*45876Sbostic DATUM *d; 3091*45876Sbostic int status; 3092*45876Sbostic 3093*45876Sbostic /* do we need to initialize the scan? */ 3094*45876Sbostic if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 3095*45876Sbostic || !(t->bt_flags & BTF_SEQINIT)) { 3096*45876Sbostic 3097*45876Sbostic /* initialize it */ 3098*45876Sbostic status = _bt_seqinit(t, key, flags); 3099*45876Sbostic } else { 3100*45876Sbostic /* just advance the current scan pointer */ 3101*45876Sbostic status = _bt_seqadvance(t, flags); 3102*45876Sbostic } 3103*45876Sbostic 3104*45876Sbostic key->size = data->size = 0; 3105*45876Sbostic key->data = data->data = (char *) NULL; 3106*45876Sbostic 3107*45876Sbostic h = t->bt_curpage; 3108*45876Sbostic 3109*45876Sbostic /* is there a valid item at the current scan location? */ 3110*45876Sbostic if (status == RET_SPECIAL) { 3111*45876Sbostic if (flags == R_NEXT) { 3112*45876Sbostic if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 3113*45876Sbostic if (NEXTINDEX(h) > 0) 3114*45876Sbostic t->bt_cursor.c_index = NEXTINDEX(h) - 1; 3115*45876Sbostic else 3116*45876Sbostic t->bt_cursor.c_index = 0; 3117*45876Sbostic } 3118*45876Sbostic } else { 3119*45876Sbostic t->bt_cursor.c_index = 0; 3120*45876Sbostic } 3121*45876Sbostic return (RET_SPECIAL); 3122*45876Sbostic } else if (status == RET_ERROR) 3123*45876Sbostic return (RET_ERROR); 3124*45876Sbostic 3125*45876Sbostic /* okay, return the data */ 3126*45876Sbostic d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 3127*45876Sbostic 3128*45876Sbostic return (_bt_buildret(t, d, data, key)); 3129*45876Sbostic } 3130*45876Sbostic 3131*45876Sbostic /* 3132*45876Sbostic * _BT_BUILDRET -- Build return key/data pair as a result of search or scan. 3133*45876Sbostic * 3134*45876Sbostic * This routine manages the statically allocated buffers in which we 3135*45876Sbostic * return data to the user. 3136*45876Sbostic * 3137*45876Sbostic * Parameters: 3138*45876Sbostic * t -- btree from which to return datum 3139*45876Sbostic * d -- DATUM to be returned to the user. 3140*45876Sbostic * data -- data argument supplied by user for return 3141*45876Sbostic * key -- key argument supplied by user for return 3142*45876Sbostic * 3143*45876Sbostic * Returns: 3144*45876Sbostic * RET_SUCCESS, RET_ERROR. 3145*45876Sbostic * 3146*45876Sbostic * Side Effects: 3147*45876Sbostic * May free and reallocate static buffers, if the data 3148*45876Sbostic * we want to return is bigger than the space we have to 3149*45876Sbostic * do so. 3150*45876Sbostic */ 3151*45876Sbostic 3152*45876Sbostic static int 3153*45876Sbostic _bt_buildret(t, d, data, key) 3154*45876Sbostic BTREE_P t; 3155*45876Sbostic DATUM *d; 3156*45876Sbostic DBT *data; 3157*45876Sbostic DBT *key; 3158*45876Sbostic { 3159*45876Sbostic static int _data_s = 0; 3160*45876Sbostic static int _key_s = 0; 3161*45876Sbostic static char *_data = (char *) NULL; 3162*45876Sbostic static char *_key = (char *) NULL; 3163*45876Sbostic char *from, *where, *top; 3164*45876Sbostic pgno_t save; 3165*45876Sbostic pgno_t chain; 3166*45876Sbostic size_t nbytes; 3167*45876Sbostic size_t nhere; 3168*45876Sbostic BTHEADER *h; 3169*45876Sbostic 3170*45876Sbostic if (d->d_flags & D_BIGKEY) { 3171*45876Sbostic if (_key != (char *) NULL) 3172*45876Sbostic (void) free(_key); 3173*45876Sbostic (void) bcopy((char *) &(d->d_bytes[0]), 3174*45876Sbostic (char *) &chain, 3175*45876Sbostic sizeof(chain)); 3176*45876Sbostic if (_bt_getbig(t, chain, &_key, &_key_s) == RET_ERROR) 3177*45876Sbostic return (RET_ERROR); 3178*45876Sbostic key->data = _key; 3179*45876Sbostic key->size = _key_s; 3180*45876Sbostic } else { 3181*45876Sbostic /* need more space for key? */ 3182*45876Sbostic if (d->d_ksize > _key_s) { 3183*45876Sbostic if (_key != (char *) NULL) 3184*45876Sbostic (void) free (_key); 3185*45876Sbostic if ((_key = (char *) malloc((unsigned) d->d_ksize)) 3186*45876Sbostic == (char *) NULL) 3187*45876Sbostic return (RET_ERROR); 3188*45876Sbostic _key_s = d->d_ksize; 3189*45876Sbostic } 3190*45876Sbostic 3191*45876Sbostic key->data = _key; 3192*45876Sbostic if ((key->size = d->d_ksize) > 0) 3193*45876Sbostic (void) bcopy(&(d->d_bytes[0]), 3194*45876Sbostic _key, 3195*45876Sbostic (int) d->d_ksize); 3196*45876Sbostic } 3197*45876Sbostic 3198*45876Sbostic if (d->d_flags & D_BIGDATA) { 3199*45876Sbostic if (_data != (char *) NULL) 3200*45876Sbostic (void) free(_data); 3201*45876Sbostic (void) bcopy(&(d->d_bytes[d->d_ksize]), 3202*45876Sbostic (char *) &chain, 3203*45876Sbostic sizeof(chain)); 3204*45876Sbostic if (_bt_getbig(t, chain, &_data, &_data_s) == RET_ERROR) 3205*45876Sbostic return (RET_ERROR); 3206*45876Sbostic data->data = _data; 3207*45876Sbostic data->size = _data_s; 3208*45876Sbostic } else { 3209*45876Sbostic /* need more space for data? */ 3210*45876Sbostic if (d->d_dsize > _data_s) { 3211*45876Sbostic if (_data != (char *) NULL) 3212*45876Sbostic (void) free (_data); 3213*45876Sbostic if ((_data = (char *) malloc((unsigned) d->d_dsize)) 3214*45876Sbostic == (char *) NULL) 3215*45876Sbostic return (RET_ERROR); 3216*45876Sbostic _data_s = d->d_dsize; 3217*45876Sbostic } 3218*45876Sbostic 3219*45876Sbostic data->data = _data; 3220*45876Sbostic 3221*45876Sbostic if ((data->size = d->d_dsize) > 0) 3222*45876Sbostic (void) bcopy(&(d->d_bytes[d->d_ksize]), 3223*45876Sbostic _data, 3224*45876Sbostic d->d_dsize); 3225*45876Sbostic } 3226*45876Sbostic 3227*45876Sbostic return (RET_SUCCESS); 3228*45876Sbostic } 3229*45876Sbostic 3230*45876Sbostic /* 3231*45876Sbostic * _BT_SEQINIT -- Initialize a sequential scan on the btree. 3232*45876Sbostic * 3233*45876Sbostic * Sets the tree's notion of the current scan location correctly 3234*45876Sbostic * given a key and a direction. 3235*45876Sbostic * 3236*45876Sbostic * Parameters: 3237*45876Sbostic * t -- tree in which to initialize scan 3238*45876Sbostic * key -- key for initial scan position 3239*45876Sbostic * flags -- R_NEXT, R_PREV 3240*45876Sbostic * 3241*45876Sbostic * Returns: 3242*45876Sbostic * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there's no data 3243*45876Sbostic * in the tree to scan. 3244*45876Sbostic * 3245*45876Sbostic * Side Effects: 3246*45876Sbostic * Changes current scan position for the tree. Almost certainly 3247*45876Sbostic * changes current page, as well. Sets BTF_SEQINIT bit in tree 3248*45876Sbostic * flags, so that we know we've initialized a scan. 3249*45876Sbostic */ 3250*45876Sbostic 3251*45876Sbostic static int 3252*45876Sbostic _bt_seqinit(t, key, flags) 3253*45876Sbostic BTREE_P t; 3254*45876Sbostic DBT *key; 3255*45876Sbostic int flags; 3256*45876Sbostic { 3257*45876Sbostic BTITEM *item; 3258*45876Sbostic BTHEADER *h; 3259*45876Sbostic CURSOR *c; 3260*45876Sbostic IDATUM *id; 3261*45876Sbostic pgno_t pgno; 3262*45876Sbostic index_t index; 3263*45876Sbostic index_t last; 3264*45876Sbostic 3265*45876Sbostic /* 3266*45876Sbostic * Figure out if we really have to search for the key that the 3267*45876Sbostic * user supplied. If key is null, then this is an unkeyed scan 3268*45876Sbostic * and we can just start from an endpoint. 3269*45876Sbostic */ 3270*45876Sbostic 3271*45876Sbostic c = &(t->bt_cursor); 3272*45876Sbostic 3273*45876Sbostic if (flags == R_CURSOR) { 3274*45876Sbostic if (key->data != (char *) NULL) { 3275*45876Sbostic 3276*45876Sbostic /* key supplied, find it */ 3277*45876Sbostic item = _bt_search(t, key); 3278*45876Sbostic c->c_index = item->bti_index; 3279*45876Sbostic c->c_pgno = t->bt_curpage->h_pgno; 3280*45876Sbostic } else { 3281*45876Sbostic errno = EINVAL; 3282*45876Sbostic return (RET_ERROR); 3283*45876Sbostic } 3284*45876Sbostic 3285*45876Sbostic } else { 3286*45876Sbostic 3287*45876Sbostic /* 3288*45876Sbostic * Unkeyed scan. For backward scans, find the last item 3289*45876Sbostic * in the tree; for forward scans, find the first item. 3290*45876Sbostic */ 3291*45876Sbostic 3292*45876Sbostic if (_bt_getpage(t, P_ROOT) == RET_ERROR) 3293*45876Sbostic return (RET_ERROR); 3294*45876Sbostic h = t->bt_curpage; 3295*45876Sbostic if (flags == R_LAST || flags == R_PREV) { 3296*45876Sbostic 3297*45876Sbostic /* backward scan */ 3298*45876Sbostic while (!(h->h_flags & F_LEAF)) { 3299*45876Sbostic last = NEXTINDEX(h) - 1; 3300*45876Sbostic id = (IDATUM *) GETDATUM(h,last); 3301*45876Sbostic if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 3302*45876Sbostic return (RET_ERROR); 3303*45876Sbostic h = t->bt_curpage; 3304*45876Sbostic } 3305*45876Sbostic 3306*45876Sbostic /* skip empty pages */ 3307*45876Sbostic while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE) { 3308*45876Sbostic if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 3309*45876Sbostic return (RET_ERROR); 3310*45876Sbostic h = t->bt_curpage; 3311*45876Sbostic } 3312*45876Sbostic 3313*45876Sbostic c->c_pgno = h->h_pgno; 3314*45876Sbostic if (NEXTINDEX(h) > 0) 3315*45876Sbostic c->c_index = NEXTINDEX(h) - 1; 3316*45876Sbostic else 3317*45876Sbostic c->c_index = 0; 3318*45876Sbostic } else if (flags == R_FIRST || flags == R_NEXT) { 3319*45876Sbostic /* forward scan */ 3320*45876Sbostic while (!(h->h_flags & F_LEAF)) { 3321*45876Sbostic id = (IDATUM *) GETDATUM(h,0); 3322*45876Sbostic if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 3323*45876Sbostic return (RET_ERROR); 3324*45876Sbostic h = t->bt_curpage; 3325*45876Sbostic } 3326*45876Sbostic 3327*45876Sbostic /* skip empty pages */ 3328*45876Sbostic while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE) { 3329*45876Sbostic if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 3330*45876Sbostic return (RET_ERROR); 3331*45876Sbostic h = t->bt_curpage; 3332*45876Sbostic } 3333*45876Sbostic 3334*45876Sbostic c->c_pgno = h->h_pgno; 3335*45876Sbostic c->c_index = 0; 3336*45876Sbostic } else { 3337*45876Sbostic /* no flags passed in */ 3338*45876Sbostic errno = EINVAL; 3339*45876Sbostic return (RET_ERROR); 3340*45876Sbostic } 3341*45876Sbostic } 3342*45876Sbostic 3343*45876Sbostic /* okay, scan is initialized */ 3344*45876Sbostic t->bt_flags |= BTF_SEQINIT; 3345*45876Sbostic 3346*45876Sbostic if (c->c_index == NEXTINDEX(t->bt_curpage)) 3347*45876Sbostic return (RET_SPECIAL); 3348*45876Sbostic 3349*45876Sbostic return (RET_SUCCESS); 3350*45876Sbostic } 3351*45876Sbostic 3352*45876Sbostic /* 3353*45876Sbostic * _BT_SEQADVANCE -- Advance the sequential scan on this tree. 3354*45876Sbostic * 3355*45876Sbostic * Moves the current location pointer for the scan on this tree one 3356*45876Sbostic * spot in the requested direction. 3357*45876Sbostic * 3358*45876Sbostic * Parameters: 3359*45876Sbostic * t -- btree being scanned 3360*45876Sbostic * flags -- for R_NEXT, R_PREV 3361*45876Sbostic * 3362*45876Sbostic * Returns: 3363*45876Sbostic * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there is no 3364*45876Sbostic * more data in the specified direction. 3365*45876Sbostic * 3366*45876Sbostic * Side Effects: 3367*45876Sbostic * May change current page. 3368*45876Sbostic */ 3369*45876Sbostic 3370*45876Sbostic static int 3371*45876Sbostic _bt_seqadvance(t, flags) 3372*45876Sbostic BTREE_P t; 3373*45876Sbostic int flags; 3374*45876Sbostic { 3375*45876Sbostic BTHEADER *h; 3376*45876Sbostic CURSOR *c; 3377*45876Sbostic index_t index; 3378*45876Sbostic 3379*45876Sbostic c = &(t->bt_cursor); 3380*45876Sbostic index = c->c_index; 3381*45876Sbostic 3382*45876Sbostic if (_bt_getpage(t, c->c_pgno) == RET_ERROR) 3383*45876Sbostic return (RET_ERROR); 3384*45876Sbostic h = t->bt_curpage; 3385*45876Sbostic 3386*45876Sbostic /* by the time we get here, don't need the cursor key anymore */ 3387*45876Sbostic if (c->c_key != (char *) NULL) 3388*45876Sbostic (void) free(c->c_key); 3389*45876Sbostic 3390*45876Sbostic if (flags == R_NEXT) { 3391*45876Sbostic 3392*45876Sbostic /* 3393*45876Sbostic * This is a forward scan. If the cursor is pointing 3394*45876Sbostic * at a virtual record (that is, it was pointing at 3395*45876Sbostic * a record that got deleted), then we should return 3396*45876Sbostic * the record it's pointing at now. Otherwise, we 3397*45876Sbostic * should advance the scan. In either case, we need 3398*45876Sbostic * to be careful not to run off the end of the current 3399*45876Sbostic * page. 3400*45876Sbostic */ 3401*45876Sbostic 3402*45876Sbostic if (c->c_flags & CRSR_BEFORE) { 3403*45876Sbostic 3404*45876Sbostic if (index >= NEXTINDEX(h)) { 3405*45876Sbostic /* out of items on this page, get next page */ 3406*45876Sbostic if (h->h_nextpg == P_NONE) { 3407*45876Sbostic /* tell caller we're done... */ 3408*45876Sbostic c->c_index = NEXTINDEX(h); 3409*45876Sbostic return (RET_SPECIAL); 3410*45876Sbostic } 3411*45876Sbostic 3412*45876Sbostic /* skip empty pages */ 3413*45876Sbostic do { 3414*45876Sbostic if (_bt_getpage(t, h->h_nextpg) 3415*45876Sbostic == RET_ERROR) { 3416*45876Sbostic c->c_index = NEXTINDEX(h); 3417*45876Sbostic return (RET_ERROR); 3418*45876Sbostic } 3419*45876Sbostic h = t->bt_curpage; 3420*45876Sbostic c->c_pgno = h->h_pgno; 3421*45876Sbostic } while (NEXTINDEX(h) == 0 3422*45876Sbostic && h->h_nextpg != P_NONE); 3423*45876Sbostic 3424*45876Sbostic if (NEXTINDEX(h) == 0) { 3425*45876Sbostic /* tell caller we're done */ 3426*45876Sbostic c->c_index = NEXTINDEX(h); 3427*45876Sbostic return (RET_SPECIAL); 3428*45876Sbostic } 3429*45876Sbostic index = 0; 3430*45876Sbostic } 3431*45876Sbostic c->c_flags &= ~CRSR_BEFORE; 3432*45876Sbostic 3433*45876Sbostic } else if (++index >= NEXTINDEX(h)) { 3434*45876Sbostic 3435*45876Sbostic /* out of items on this page, get next page */ 3436*45876Sbostic if (h->h_nextpg == P_NONE) { 3437*45876Sbostic /* tell caller we're done... */ 3438*45876Sbostic c->c_index = NEXTINDEX(h); 3439*45876Sbostic return (RET_SPECIAL); 3440*45876Sbostic } 3441*45876Sbostic 3442*45876Sbostic /* skip empty pages */ 3443*45876Sbostic do { 3444*45876Sbostic if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) { 3445*45876Sbostic c->c_index = NEXTINDEX(h); 3446*45876Sbostic return (RET_ERROR); 3447*45876Sbostic } 3448*45876Sbostic h = t->bt_curpage; 3449*45876Sbostic c->c_pgno = h->h_pgno; 3450*45876Sbostic } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 3451*45876Sbostic 3452*45876Sbostic if (NEXTINDEX(h) == 0) { 3453*45876Sbostic /* tell caller we're done */ 3454*45876Sbostic c->c_index = NEXTINDEX(h); 3455*45876Sbostic return (RET_SPECIAL); 3456*45876Sbostic } 3457*45876Sbostic index = 0; 3458*45876Sbostic } 3459*45876Sbostic } else if (flags == R_PREV) { 3460*45876Sbostic 3461*45876Sbostic /* for backward scans, life is substantially easier */ 3462*45876Sbostic c->c_flags &= ~CRSR_BEFORE; 3463*45876Sbostic if (c->c_key != (char *) NULL) { 3464*45876Sbostic (void) free(c->c_key); 3465*45876Sbostic c->c_key = (char *) NULL; 3466*45876Sbostic } 3467*45876Sbostic 3468*45876Sbostic if (index == 0) { 3469*45876Sbostic 3470*45876Sbostic /* we may be done */ 3471*45876Sbostic c->c_index = 0; 3472*45876Sbostic 3473*45876Sbostic /* out of items on this page, get next page */ 3474*45876Sbostic if (h->h_prevpg == P_NONE) 3475*45876Sbostic return (RET_SPECIAL); 3476*45876Sbostic 3477*45876Sbostic /* skip empty pages */ 3478*45876Sbostic do { 3479*45876Sbostic if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 3480*45876Sbostic return (RET_ERROR); 3481*45876Sbostic h = t->bt_curpage; 3482*45876Sbostic c->c_pgno = h->h_pgno; 3483*45876Sbostic } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE); 3484*45876Sbostic 3485*45876Sbostic if (NEXTINDEX(h) == 0) 3486*45876Sbostic return (RET_SPECIAL); 3487*45876Sbostic 3488*45876Sbostic index = NEXTINDEX(h) - 1; 3489*45876Sbostic } else 3490*45876Sbostic --index; 3491*45876Sbostic } else { 3492*45876Sbostic /* must specify a direction */ 3493*45876Sbostic errno = EINVAL; 3494*45876Sbostic return (RET_ERROR); 3495*45876Sbostic } 3496*45876Sbostic 3497*45876Sbostic c->c_index = index; 3498*45876Sbostic return (RET_SUCCESS); 3499*45876Sbostic } 3500*45876Sbostic 3501*45876Sbostic /* 3502*45876Sbostic * BT_CLOSE -- Close a btree 3503*45876Sbostic * 3504*45876Sbostic * Parameters: 3505*45876Sbostic * tree -- tree to close 3506*45876Sbostic * 3507*45876Sbostic * Returns: 3508*45876Sbostic * RET_SUCCESS, RET_ERROR. 3509*45876Sbostic * 3510*45876Sbostic * Side Effects: 3511*45876Sbostic * Frees space occupied by the tree. 3512*45876Sbostic */ 3513*45876Sbostic 3514*45876Sbostic int 3515*45876Sbostic bt_close(tree) 3516*45876Sbostic BTREE tree; 3517*45876Sbostic { 3518*45876Sbostic BTREE_P t = (BTREE_P) tree; 3519*45876Sbostic int i; 3520*45876Sbostic BTHEADER *h; 3521*45876Sbostic char *cache; 3522*45876Sbostic struct HTBUCKET *b, *sb; 3523*45876Sbostic HTABLE ht; 3524*45876Sbostic int fd; 3525*45876Sbostic 3526*45876Sbostic if (t->bt_cursor.c_key != (char *) NULL) 3527*45876Sbostic (void) free(t->bt_cursor.c_key); 3528*45876Sbostic 3529*45876Sbostic if (!ISDISK(t)) { 3530*45876Sbostic /* in-memory tree, release hash table memory */ 3531*45876Sbostic ht = t->bt_s.bt_ht; 3532*45876Sbostic for (i = 0; i < HTSIZE; i++) { 3533*45876Sbostic if ((b = ht[i]) == (struct HTBUCKET *) NULL) 3534*45876Sbostic break; 3535*45876Sbostic do { 3536*45876Sbostic sb = b; 3537*45876Sbostic (void) free((char *) b->ht_page); 3538*45876Sbostic b = b->ht_next; 3539*45876Sbostic (void) free((char *) sb); 3540*45876Sbostic } while (b != (struct HTBUCKET *) NULL); 3541*45876Sbostic } 3542*45876Sbostic (void) free ((char *) ht); 3543*45876Sbostic (void) free ((char *) t); 3544*45876Sbostic return (RET_SUCCESS); 3545*45876Sbostic } 3546*45876Sbostic 3547*45876Sbostic if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 3548*45876Sbostic if (_bt_wrtmeta(t) == RET_ERROR) 3549*45876Sbostic return (RET_ERROR); 3550*45876Sbostic } 3551*45876Sbostic 3552*45876Sbostic if (t->bt_curpage != (BTHEADER *) NULL) { 3553*45876Sbostic h = t->bt_curpage; 3554*45876Sbostic if (h->h_flags & F_DIRTY) { 3555*45876Sbostic if (_bt_write(t, h, RELEASE) == RET_ERROR) 3556*45876Sbostic return (RET_ERROR); 3557*45876Sbostic } else { 3558*45876Sbostic if (_bt_release(t, h) == RET_ERROR) 3559*45876Sbostic return (RET_ERROR); 3560*45876Sbostic } 3561*45876Sbostic 3562*45876Sbostic /* flush and free the cache, if there is one */ 3563*45876Sbostic if (ISCACHE(t)) { 3564*45876Sbostic cache = t->bt_s.bt_d.d_cache; 3565*45876Sbostic lrusync(cache); 3566*45876Sbostic lrufree(cache); 3567*45876Sbostic } 3568*45876Sbostic (void) free ((char *) h); 3569*45876Sbostic } 3570*45876Sbostic 3571*45876Sbostic fd = t->bt_s.bt_d.d_fd; 3572*45876Sbostic (void) free ((char *) t); 3573*45876Sbostic return (close(fd)); 3574*45876Sbostic } 3575*45876Sbostic 3576*45876Sbostic /* 3577*45876Sbostic * _BT_ALLOCPG -- allocate a new page in the btree. 3578*45876Sbostic * 3579*45876Sbostic * This is called when we split a page, to get space to do the split. 3580*45876Sbostic * For disk btrees, these pages are released when the split is done. 3581*45876Sbostic * For memory btrees, they are not. 3582*45876Sbostic * 3583*45876Sbostic * Parameters: 3584*45876Sbostic * t -- tree in which to do split 3585*45876Sbostic * 3586*45876Sbostic * Returns: 3587*45876Sbostic * Pointer to the newly-allocated page 3588*45876Sbostic */ 3589*45876Sbostic 3590*45876Sbostic static BTHEADER * 3591*45876Sbostic _bt_allocpg(t) 3592*45876Sbostic BTREE_P t; 3593*45876Sbostic { 3594*45876Sbostic BTHEADER *h = t->bt_curpage; 3595*45876Sbostic BTHEADER *nh; 3596*45876Sbostic char *cache; 3597*45876Sbostic int nbytes; 3598*45876Sbostic 3599*45876Sbostic /* if we have a cache, let the cache code do the work */ 3600*45876Sbostic if (ISDISK(t) && ISCACHE(t)) { 3601*45876Sbostic nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache, 3602*45876Sbostic t->bt_npages + 1, 3603*45876Sbostic &nbytes); 3604*45876Sbostic } else { 3605*45876Sbostic nh = (BTHEADER *) malloc((unsigned) t->bt_psize); 3606*45876Sbostic } 3607*45876Sbostic 3608*45876Sbostic if (nh != (BTHEADER *) NULL) { 3609*45876Sbostic nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE; 3610*45876Sbostic nh->h_flags = h->h_flags; 3611*45876Sbostic nh->h_lower = (index_t) 3612*45876Sbostic (((char *) &(nh->h_linp[0])) - ((char *) nh)); 3613*45876Sbostic nh->h_upper = t->bt_psize; 3614*45876Sbostic } 3615*45876Sbostic 3616*45876Sbostic return (nh); 3617*45876Sbostic } 3618*45876Sbostic 3619*45876Sbostic /* 3620*45876Sbostic * _BT_WRITE -- Write a specific page to a btree file. 3621*45876Sbostic * 3622*45876Sbostic * Parameters: 3623*45876Sbostic * t -- btree to get the page 3624*45876Sbostic * h -- page to write 3625*45876Sbostic * relflag -- (int) this page may/may not be released 3626*45876Sbostic * 3627*45876Sbostic * Returns: 3628*45876Sbostic * RET_SUCCESS, RET_ERROR. 3629*45876Sbostic * 3630*45876Sbostic * Side Effects: 3631*45876Sbostic * Writes a metadata page if none has been written yet. 3632*45876Sbostic */ 3633*45876Sbostic 3634*45876Sbostic static int 3635*45876Sbostic _bt_write(t, h, relflag) 3636*45876Sbostic BTREE_P t; 3637*45876Sbostic BTHEADER *h; 3638*45876Sbostic int relflag; 3639*45876Sbostic { 3640*45876Sbostic long pos; 3641*45876Sbostic int htindex; 3642*45876Sbostic HTBUCKET *b; 3643*45876Sbostic char *cache; 3644*45876Sbostic BTMETA m; 3645*45876Sbostic pgno_t pgno; 3646*45876Sbostic 3647*45876Sbostic h->h_flags &= ~F_DIRTY; 3648*45876Sbostic if (ISDISK(t)) { 3649*45876Sbostic 3650*45876Sbostic /* if we haven't done so yet, write the metadata */ 3651*45876Sbostic if (!(t->bt_flags & BTF_METAOK)) { 3652*45876Sbostic if (_bt_wrtmeta(t) == RET_ERROR) 3653*45876Sbostic return (RET_ERROR); 3654*45876Sbostic } 3655*45876Sbostic 3656*45876Sbostic pgno = h->h_pgno; 3657*45876Sbostic 3658*45876Sbostic 3659*45876Sbostic /* if we have a cache, let the cache code do the work */ 3660*45876Sbostic if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) { 3661*45876Sbostic if (lruwrite(cache, pgno) == RET_ERROR) 3662*45876Sbostic return (RET_ERROR); 3663*45876Sbostic if (relflag && lrurelease(cache, pgno) == RET_ERROR) 3664*45876Sbostic return (RET_ERROR); 3665*45876Sbostic 3666*45876Sbostic } else { 3667*45876Sbostic (void) _bt_pgout(h, (char *) t->bt_lorder); 3668*45876Sbostic /* now write the current page */ 3669*45876Sbostic pos = (long) (pgno * t->bt_psize); 3670*45876Sbostic if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) 3671*45876Sbostic return (RET_ERROR); 3672*45876Sbostic if (write(t->bt_s.bt_d.d_fd, h, t->bt_psize) 3673*45876Sbostic < t->bt_psize) 3674*45876Sbostic return (RET_ERROR); 3675*45876Sbostic if (!relflag) 3676*45876Sbostic (void) _bt_pgin(h, (char *) t->bt_lorder); 3677*45876Sbostic } 3678*45876Sbostic } else { 3679*45876Sbostic /* in-memory btree */ 3680*45876Sbostic htindex = HASHKEY(h->h_pgno); 3681*45876Sbostic 3682*45876Sbostic /* see if we need to overwrite existing entry */ 3683*45876Sbostic for (b = t->bt_s.bt_ht[htindex]; 3684*45876Sbostic b != (HTBUCKET *) NULL; 3685*45876Sbostic b = b->ht_next) { 3686*45876Sbostic if (b->ht_pgno == h->h_pgno) { 3687*45876Sbostic b->ht_page = h; 3688*45876Sbostic return (RET_SUCCESS); 3689*45876Sbostic } 3690*45876Sbostic } 3691*45876Sbostic 3692*45876Sbostic /* new entry, write it */ 3693*45876Sbostic b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET)); 3694*45876Sbostic if (b == (HTBUCKET *) NULL) 3695*45876Sbostic return (RET_ERROR); 3696*45876Sbostic 3697*45876Sbostic b->ht_pgno = h->h_pgno; 3698*45876Sbostic b->ht_page = h; 3699*45876Sbostic b->ht_next = t->bt_s.bt_ht[htindex]; 3700*45876Sbostic t->bt_s.bt_ht[htindex] = b; 3701*45876Sbostic } 3702*45876Sbostic return (RET_SUCCESS); 3703*45876Sbostic } 3704*45876Sbostic 3705*45876Sbostic /* 3706*45876Sbostic * _BT_RELEASE -- Release a locked-down cache page 3707*45876Sbostic * 3708*45876Sbostic * During page splits, we want to force pages out to the cache 3709*45876Sbostic * before we actually release them, in some cases. This routine 3710*45876Sbostic * releases such a page when it is no longer needed. 3711*45876Sbostic * 3712*45876Sbostic * Parameters: 3713*45876Sbostic * t -- btree in which to release page 3714*45876Sbostic * h -- page to release 3715*45876Sbostic * 3716*45876Sbostic * Returns: 3717*45876Sbostic * RET_SUCCESS, RET_ERROR. 3718*45876Sbostic * 3719*45876Sbostic * Side Effects: 3720*45876Sbostic * None. 3721*45876Sbostic */ 3722*45876Sbostic 3723*45876Sbostic static int 3724*45876Sbostic _bt_release(t, h) 3725*45876Sbostic BTREE_P t; 3726*45876Sbostic BTHEADER *h; 3727*45876Sbostic { 3728*45876Sbostic if (ISDISK(t) && ISCACHE(t)) { 3729*45876Sbostic if (lrurelease(t->bt_s.bt_d.d_cache, h->h_pgno) < 0) 3730*45876Sbostic return (RET_ERROR); 3731*45876Sbostic } 3732*45876Sbostic return (RET_SUCCESS); 3733*45876Sbostic } 3734*45876Sbostic 3735*45876Sbostic /* 3736*45876Sbostic * _BT_WRTMETA -- Write metadata to the btree. 3737*45876Sbostic * 3738*45876Sbostic * Parameters: 3739*45876Sbostic * t -- tree to which to write 3740*45876Sbostic * 3741*45876Sbostic * Returns: 3742*45876Sbostic * RET_SUCCESS, RET_ERROR. 3743*45876Sbostic */ 3744*45876Sbostic 3745*45876Sbostic static int 3746*45876Sbostic _bt_wrtmeta(t) 3747*45876Sbostic BTREE_P t; 3748*45876Sbostic { 3749*45876Sbostic BTMETA m; 3750*45876Sbostic 3751*45876Sbostic if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l) 3752*45876Sbostic return (RET_ERROR); 3753*45876Sbostic 3754*45876Sbostic /* lorder has to be in host-independent format */ 3755*45876Sbostic m.m_lorder = (u_long) htonl((long) t->bt_lorder); 3756*45876Sbostic 3757*45876Sbostic m.m_magic = BTREEMAGIC; 3758*45876Sbostic m.m_version = BTREEVERSION; 3759*45876Sbostic m.m_psize = t->bt_psize; 3760*45876Sbostic m.m_free = t->bt_free; 3761*45876Sbostic m.m_flags = t->bt_flags & BTF_NODUPS; 3762*45876Sbostic 3763*45876Sbostic if (t->bt_lorder != BYTE_ORDER) { 3764*45876Sbostic BLSWAP(m.m_magic); 3765*45876Sbostic BLSWAP(m.m_version); 3766*45876Sbostic BLSWAP(m.m_psize); 3767*45876Sbostic BLSWAP(m.m_free); 3768*45876Sbostic BLSWAP(m.m_flags); 3769*45876Sbostic } 3770*45876Sbostic 3771*45876Sbostic if (write(t->bt_s.bt_d.d_fd, &m, sizeof(m)) 3772*45876Sbostic != sizeof(m)) { 3773*45876Sbostic return (RET_ERROR); 3774*45876Sbostic } 3775*45876Sbostic 3776*45876Sbostic t->bt_flags |= BTF_METAOK; 3777*45876Sbostic 3778*45876Sbostic return (RET_SUCCESS); 3779*45876Sbostic } 3780*45876Sbostic 3781*45876Sbostic #ifdef DEBUG 3782*45876Sbostic void 3783*45876Sbostic _btdump(tree) 3784*45876Sbostic BTREE tree; 3785*45876Sbostic { 3786*45876Sbostic BTREE_P t = (BTREE_P) tree; 3787*45876Sbostic DATUM *d; 3788*45876Sbostic IDATUM *id; 3789*45876Sbostic BTHEADER *h; 3790*45876Sbostic pgno_t npages; 3791*45876Sbostic pgno_t i; 3792*45876Sbostic index_t cur, top; 3793*45876Sbostic 3794*45876Sbostic npages = t->bt_npages; 3795*45876Sbostic (void) printf("\"%s\" fd %d pgsz %d curpg %d @ 0x%lx", 3796*45876Sbostic t->bt_fname, t->bt_s.bt_d.d_fd, 3797*45876Sbostic t->bt_psize, t->bt_curpage); 3798*45876Sbostic (void) printf("npg %d cmp 0x%lx flags=(", npages, t->bt_compare); 3799*45876Sbostic if (t->bt_flags & BTF_SEQINIT) 3800*45876Sbostic (void) printf("BTF_SEQINIT"); 3801*45876Sbostic (void) printf(")\n"); 3802*45876Sbostic 3803*45876Sbostic for (i = P_ROOT; i <= npages; i++) { 3804*45876Sbostic if (_bt_getpage(t, i) == RET_ERROR) 3805*45876Sbostic _punt(); 3806*45876Sbostic h = t->bt_curpage; 3807*45876Sbostic top = NEXTINDEX(h); 3808*45876Sbostic (void) printf(" page %d:\n", i); 3809*45876Sbostic (void) printf("\tpgno %d prev %d next %d\n", 3810*45876Sbostic h->h_pgno, h->h_prevpg, h->h_nextpg); 3811*45876Sbostic (void) printf("\tlower %d upper %d nextind %d flags (", 3812*45876Sbostic h->h_lower, h->h_upper, top); 3813*45876Sbostic if (h->h_flags & F_LEAF) 3814*45876Sbostic (void) printf("F_LEAF"); 3815*45876Sbostic else 3816*45876Sbostic (void) printf("<internal>"); 3817*45876Sbostic if (h->h_flags & F_DIRTY) 3818*45876Sbostic (void) printf("|F_DIRTY"); 3819*45876Sbostic if (h->h_flags & F_PRESERVE) 3820*45876Sbostic (void) printf("|F_PRESERVE"); 3821*45876Sbostic if (h->h_flags & F_CONT) { 3822*45876Sbostic (void) printf("|F_CONT)"); 3823*45876Sbostic if (h->h_prevpg == P_NONE) { 3824*45876Sbostic size_t longsz; 3825*45876Sbostic (void) bcopy((char *) &(h->h_linp[0]), 3826*45876Sbostic (char *) &longsz, 3827*45876Sbostic sizeof(longsz)); 3828*45876Sbostic printf("\n\t\t(chain start, data length %ld)", 3829*45876Sbostic longsz); 3830*45876Sbostic } 3831*45876Sbostic printf("\n"); 3832*45876Sbostic continue; 3833*45876Sbostic } 3834*45876Sbostic (void) printf(")\n"); 3835*45876Sbostic for (cur = 0; cur < top; cur++) { 3836*45876Sbostic (void) printf("\t [%d] off %d ", cur, h->h_linp[cur]); 3837*45876Sbostic if (h->h_flags & F_LEAF) { 3838*45876Sbostic d = (DATUM *) GETDATUM(h,cur); 3839*45876Sbostic (void) printf("ksize %d", d->d_ksize); 3840*45876Sbostic if (d->d_flags & D_BIGKEY) 3841*45876Sbostic (void) printf(" (indirect)"); 3842*45876Sbostic (void) printf("; dsize %d", d->d_dsize); 3843*45876Sbostic if (d->d_flags & D_BIGDATA) 3844*45876Sbostic (void) printf(" (indirect)"); 3845*45876Sbostic } else { 3846*45876Sbostic id = (IDATUM *) GETDATUM(h,cur); 3847*45876Sbostic (void) printf("size %d pgno %d", 3848*45876Sbostic id->i_size, id->i_pgno); 3849*45876Sbostic if (id->i_flags & D_BIGKEY) 3850*45876Sbostic (void) printf(" (indirect)"); 3851*45876Sbostic } 3852*45876Sbostic (void) printf("\n"); 3853*45876Sbostic } 3854*45876Sbostic (void) printf("\n"); 3855*45876Sbostic } 3856*45876Sbostic } 3857*45876Sbostic #endif /* DEBUG */ 3858*45876Sbostic 3859*45876Sbostic /* 3860*45876Sbostic * _BT_CMP -- Compare a key to a given item on the current page. 3861*45876Sbostic * 3862*45876Sbostic * This routine is a wrapper for the user's comparison function. 3863*45876Sbostic * 3864*45876Sbostic * Parameters: 3865*45876Sbostic * t -- tree in which to do comparison 3866*45876Sbostic * p -- pointer to one argument for the comparison function 3867*45876Sbostic * n -- index of item to supply second arg to comparison function 3868*45876Sbostic * 3869*45876Sbostic * Returns: 3870*45876Sbostic * < 0 if p is < item at n 3871*45876Sbostic * = 0 if p is = item at n 3872*45876Sbostic * > 0 if p is > item at n 3873*45876Sbostic */ 3874*45876Sbostic 3875*45876Sbostic static int 3876*45876Sbostic _bt_cmp(t, p, n) 3877*45876Sbostic BTREE_P t; 3878*45876Sbostic char *p; 3879*45876Sbostic index_t n; 3880*45876Sbostic { 3881*45876Sbostic BTHEADER *h; 3882*45876Sbostic IDATUM *id; 3883*45876Sbostic DATUM *d; 3884*45876Sbostic char *arg; 3885*45876Sbostic int ignore; 3886*45876Sbostic int free_arg; 3887*45876Sbostic pgno_t chain; 3888*45876Sbostic int r; 3889*45876Sbostic 3890*45876Sbostic h = t->bt_curpage; 3891*45876Sbostic 3892*45876Sbostic /* 3893*45876Sbostic * The left-most key at any level of the tree on internal pages 3894*45876Sbostic * is guaranteed (artificially, by the code here) to be less than 3895*45876Sbostic * any user key. This saves us from having to update the leftmost 3896*45876Sbostic * key when the user inserts a new key in the tree smaller than 3897*45876Sbostic * anything we've seen yet. 3898*45876Sbostic */ 3899*45876Sbostic 3900*45876Sbostic if (h->h_prevpg == P_NONE && !(h->h_flags & F_LEAF) && n == 0) 3901*45876Sbostic return (1); 3902*45876Sbostic 3903*45876Sbostic if (h->h_flags & F_LEAF) { 3904*45876Sbostic d = (DATUM *) GETDATUM(h,n); 3905*45876Sbostic if (d->d_flags & D_BIGKEY) { 3906*45876Sbostic free_arg = TRUE; 3907*45876Sbostic bcopy(&(d->d_bytes[0]), (char *) &chain, sizeof(chain)); 3908*45876Sbostic if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR) 3909*45876Sbostic return (RET_ERROR); 3910*45876Sbostic } else { 3911*45876Sbostic free_arg = FALSE; 3912*45876Sbostic arg = &(d->d_bytes[0]); 3913*45876Sbostic } 3914*45876Sbostic } else { 3915*45876Sbostic id = (IDATUM *) GETDATUM(h,n); 3916*45876Sbostic if (id->i_flags & D_BIGKEY) { 3917*45876Sbostic free_arg = TRUE; 3918*45876Sbostic bcopy(&(id->i_bytes[0]), 3919*45876Sbostic (char *) &chain, 3920*45876Sbostic sizeof(chain)); 3921*45876Sbostic if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR) 3922*45876Sbostic return (RET_ERROR); 3923*45876Sbostic } else { 3924*45876Sbostic free_arg = FALSE; 3925*45876Sbostic arg = &(id->i_bytes[0]); 3926*45876Sbostic } 3927*45876Sbostic } 3928*45876Sbostic r = (*(t->bt_compare))(p, arg); 3929*45876Sbostic 3930*45876Sbostic if (free_arg) 3931*45876Sbostic (void) free(arg); 3932*45876Sbostic 3933*45876Sbostic return (r); 3934*45876Sbostic } 3935*45876Sbostic 3936*45876Sbostic /* 3937*45876Sbostic * _BT_PUSH/_BT_POP -- Push/pop a parent page number on the parent stack. 3938*45876Sbostic * 3939*45876Sbostic * When we descend the tree, we keep track of parent pages in order 3940*45876Sbostic * to handle splits on insertions. 3941*45876Sbostic * 3942*45876Sbostic * Parameters: 3943*45876Sbostic * t -- tree for which to push parent 3944*45876Sbostic * pgno -- page number to push (_bt_push only) 3945*45876Sbostic * 3946*45876Sbostic * Returns: 3947*45876Sbostic * None. 3948*45876Sbostic */ 3949*45876Sbostic 3950*45876Sbostic static 3951*45876Sbostic _bt_push(t, pgno) 3952*45876Sbostic BTREE_P t; 3953*45876Sbostic pgno_t pgno; 3954*45876Sbostic { 3955*45876Sbostic BTSTACK *new; 3956*45876Sbostic 3957*45876Sbostic new = (BTSTACK *) malloc((unsigned) sizeof(BTSTACK)); 3958*45876Sbostic new->bts_pgno = pgno; 3959*45876Sbostic new->bts_next = t->bt_stack; 3960*45876Sbostic t->bt_stack = new; 3961*45876Sbostic } 3962*45876Sbostic 3963*45876Sbostic static 3964*45876Sbostic _bt_pop(t) 3965*45876Sbostic BTREE_P t; 3966*45876Sbostic { 3967*45876Sbostic BTSTACK *s; 3968*45876Sbostic pgno_t p = P_NONE; 3969*45876Sbostic 3970*45876Sbostic if ((s = t->bt_stack) != (BTSTACK *) NULL) { 3971*45876Sbostic p = s->bts_pgno; 3972*45876Sbostic t->bt_stack = s->bts_next; 3973*45876Sbostic (void) free ((char *) s); 3974*45876Sbostic } 3975*45876Sbostic return (p); 3976*45876Sbostic } 3977*45876Sbostic 3978*45876Sbostic #ifdef DEBUG 3979*45876Sbostic _punt() 3980*45876Sbostic { 3981*45876Sbostic int pid; 3982*45876Sbostic 3983*45876Sbostic pid = getpid(); 3984*45876Sbostic (void) kill(pid, SIGILL); 3985*45876Sbostic } 3986*45876Sbostic #endif /* DEBUG */ 3987