1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_open.c 5.1 (Berkeley) 01/07/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 /* 16 * btree.c -- implementation of btree access method for 4.4BSD. 17 * 18 * The design here is based on that of the btree access method used 19 * in the Postgres database system at UC Berkeley. The implementation 20 * is wholly independent of the Postgres code. 21 * 22 * This implementation supports btrees on disk (supply a filename) or 23 * in memory (don't). Public interfaces defined here are: 24 * 25 * btree_open() -- wrapper; returns a filled DB struct for 26 * a btree. 27 * 28 * bt_open() -- open a new or existing btree. 29 * bt_get() -- fetch data from a tree by key. 30 * bt_seq() -- do a sequential scan on a tree. 31 * bt_put() -- add data to a tree by key. 32 * bt_delete() -- remove data from a tree by key. 33 * bt_close() -- close a btree. 34 * bt_sync() -- force btree pages to disk (disk trees only). 35 */ 36 37 #include <sys/param.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/signal.h> 41 #include <sys/errno.h> 42 #include <db.h> 43 44 #ifndef BLSWAP 45 #define BLSWAP(a) { \ 46 register unsigned long _t; \ 47 _t = (a); \ 48 (a) >>= 24; \ 49 (a) |= (_t & 0x00ff0000) >> 8; \ 50 (a) |= (_t & 0x0000ff00) << 8; \ 51 (a) |= (_t & 0x000000ff) << 24; \ 52 } 53 #endif /* ndef BLSWAP */ 54 55 typedef char *BTREE; /* should really be (void *) */ 56 57 /* #define DEBUG */ 58 59 #define RET_ERROR -1 60 #define RET_SUCCESS 0 61 #define RET_SPECIAL 1 62 63 #ifndef TRUE 64 #define TRUE 1 65 #define FALSE 0 66 #endif /* ndef TRUE */ 67 68 extern int errno; 69 70 /* libc */ 71 extern char *malloc(); 72 73 /* these are defined in lrucache.c */ 74 extern char *lruinit(); 75 extern char *lruget(); 76 extern char *lrugetnew(); 77 extern int lrusync(); 78 extern int lruwrite(); 79 extern int lrurelease(); 80 extern void lrufree(); 81 82 /* these are defined here */ 83 BTREE bt_open(); 84 int bt_close(); 85 int bt_delete(); 86 int bt_get(); 87 int bt_put(); 88 int bt_seq(); 89 int bt_sync(); 90 91 /* 92 * Private types. What you choose for these depends on how big you 93 * want to let files get, and how big you want to let pages get. 94 */ 95 96 typedef u_long index_t; /* so # bytes on a page fits in a long */ 97 typedef u_long pgno_t; /* so # of pages in a btree fits in a long */ 98 99 /* 100 * When we do searches, we push the parent page numbers onto a stack 101 * as we descend the tree. This is so that for insertions, we can 102 * find our way back up to do internal page insertions and splits. 103 */ 104 105 typedef struct BTSTACK { 106 pgno_t bts_pgno; 107 struct BTSTACK *bts_next; 108 } BTSTACK; 109 110 /* 111 * Every btree page has a header that looks like this. Flags are given 112 * in the #define's for the F_ flags (see below). 113 */ 114 115 typedef struct BTHEADER { 116 pgno_t h_pgno; /* page number of this page */ 117 pgno_t h_prevpg; /* left sibling */ 118 pgno_t h_nextpg; /* right sibling */ 119 120 #define F_LEAF 0x01 /* leaf page, contains user data */ 121 #define F_CONT 0x02 /* continuation page (large items) */ 122 #define F_DIRTY 0x04 /* need to write to disk */ 123 #define F_PRESERVE 0x08 /* never delete this chain of pages */ 124 125 u_long h_flags; /* page state */ 126 index_t h_lower; /* lower bound of free space on page */ 127 index_t h_upper; /* upper bound of free space on page */ 128 index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 129 } BTHEADER; 130 131 /* 132 * HTBUCKETs are hash table buckets for looking up pages of in-memory 133 * btrees by page number. We use this indirection, rather than direct 134 * pointers, so that the code for manipulating in-memory trees is the 135 * same as that for manipulating on-disk trees. 136 */ 137 138 typedef struct HTBUCKET { 139 pgno_t ht_pgno; 140 BTHEADER *ht_page; 141 struct HTBUCKET *ht_next; 142 } HTBUCKET; 143 144 typedef HTBUCKET **HTABLE; 145 146 /* minimum size we'll let a page be */ 147 #define MINPSIZE 512 148 149 /* default cache size, in bytes */ 150 #define DEFCACHE (20 * 1024) 151 152 /* hash table size for in-memory trees */ 153 #define HTSIZE 128 154 155 /* generate a hash key from a page number */ 156 #define HASHKEY(pgno) ((pgno - 1) % HTSIZE) 157 158 /* 159 * Disk btrees have a file descriptor, and may also have an lru buffer 160 * cache, if the user asked for one. 161 */ 162 163 typedef struct BTDISK { 164 int d_fd; 165 char *d_cache; 166 } BTDISK; 167 168 /* 169 * Cursors keep track of the current location in a sequential scan of 170 * the database. Since btrees impose a total ordering on keys, we can 171 * walk forward or backward through the database from any point. Cursors 172 * survive updates to the tree, and can be used to delete a particular 173 * record. 174 */ 175 176 typedef struct CURSOR { 177 pgno_t c_pgno; /* pgno of current item in scan */ 178 index_t c_index; /* index of current item in scan */ 179 char *c_key; /* current key, used for updates */ 180 181 #define CRSR_BEFORE 0x01 182 183 u_char c_flags; /* to handle updates properly */ 184 } CURSOR; 185 186 /* 187 * The private btree data structure. The user passes a pointer to one of 188 * these when we are to manipulate a tree, but the BTREE type is opaque 189 * to him. 190 */ 191 192 typedef struct BTREEDATA_P { 193 char *bt_fname; /* NULL for in-memory trees */ 194 union { 195 BTDISK bt_d; /* for on-disk btrees */ 196 HTABLE bt_ht; /* hash table for mem trees */ 197 } bt_s; 198 size_t bt_psize; /* page size for btree pages */ 199 int (*bt_compare)(); /* key comparison function */ 200 pgno_t bt_npages; /* number of pages in tree */ 201 BTHEADER *bt_curpage; /* current page contents */ 202 pgno_t bt_free; /* free pg list for big data */ 203 CURSOR bt_cursor; /* cursor for scans */ 204 BTSTACK *bt_stack; /* parent stack for inserts */ 205 u_long bt_lorder; /* byte order (endian.h) */ 206 207 #define BTF_METAOK 0x01 /* meta-data written to start of file */ 208 #define BTF_SEQINIT 0x02 /* we have called bt_seq */ 209 #define BTF_ISWRITE 0x04 /* tree was opened for write */ 210 #define BTF_NODUPS 0x08 /* tree created for unique keys */ 211 212 u_long bt_flags; /* btree state */ 213 } BTREEDATA_P; 214 215 typedef BTREEDATA_P *BTREE_P; 216 217 /* 218 * The first thing in a btree file is a BTMETA structure. The rest of 219 * the first page is empty, so that all disk operations are page-aligned. 220 */ 221 222 typedef struct BTMETA { 223 u_long m_magic; 224 u_long m_version; 225 size_t m_psize; 226 pgno_t m_free; 227 u_long m_flags; 228 u_long m_lorder; 229 } BTMETA; 230 231 #define P_NONE 0 /* invalid page number in tree */ 232 #define P_ROOT 1 /* page number of root pg in btree */ 233 234 #define NORELEASE 0 /* don't release a page during write */ 235 #define RELEASE 1 /* release a page during write */ 236 237 #define INSERT 0 /* doing an insert operation */ 238 #define DELETE 1 /* doing a delete operation */ 239 240 /* magic number for identifying btree files */ 241 #define BTREEMAGIC 053162L /* magic */ 242 #define BTREEVERSION 2 /* last changed 6 jan 1991 */ 243 244 /* get the next free index on a btree page */ 245 #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t))) 246 247 /* is a BTITEM actually on the btree page? */ 248 #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage)) 249 250 /* guarantee longword alignment so structure refs work */ 251 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03) 252 253 /* get a particular datum (or idatum) off a page */ 254 #define GETDATUM(h,i) (((char *) h) + h->h_linp[i]) 255 256 /* is a {key,datum} too big to put on a single page? */ 257 #define TOOBIG(t, sz) (sz >= t->bt_psize / 5) 258 259 /* is this a disk tree or a memory tree? */ 260 #define ISDISK(t) (t->bt_fname != (char *) NULL) 261 262 /* does the disk tree use a cache? */ 263 #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL) 264 265 /* 266 * DATUMs are for user data -- one appears on leaf pages for every 267 * tree entry. The d_bytes[] array contains the key first, then the data. 268 * 269 * If either the key or the datum is too big to store on a single page, 270 * a bit is set in the flags entry, and the d_bytes[] array contains a 271 * pgno pointing to the page at which the data is actually stored. 272 * 273 * Note on alignment: every DATUM is guaranteed to be longword aligned 274 * on the disk page. In order to force longword alignment of user key 275 * and data values, we must guarantee that the d_bytes[] array starts 276 * on a longword boundary. This is the reason that d_flags is a u_long, 277 * rather than a u_char (it really only needs to be two bits big). This 278 * is necessary because we call the user's comparison function with a 279 * pointer to the start of the d_bytes array. We don't need to force 280 * longword alignment of the data following the key, since that is copied 281 * to a longword-aligned buffer before being returned to the user. 282 */ 283 284 typedef struct DATUM { 285 size_t d_ksize; /* size of key */ 286 size_t d_dsize; /* size of data */ 287 288 #define D_BIGDATA 0x01 /* indirect datum ptr flag */ 289 #define D_BIGKEY 0x02 /* indirect key ptr flag */ 290 291 u_long d_flags; /* flags (indirect bit) */ 292 char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 293 } DATUM; 294 295 /* BTITEMs are used to return (page, index, datum) tuples from searches */ 296 typedef struct BTITEM { 297 pgno_t bti_pgno; 298 index_t bti_index; 299 DATUM *bti_datum; 300 } BTITEM; 301 302 /* 303 * IDATUMs are for data stored on internal pages. This is the (key, pgno) 304 * pair, such that key 'key' is the first entry on page 'pgno'. If our 305 * internal page contains keys (a) and (b) next to each other, then all 306 * items >= to (a) and < (b) go on the same page as (a). There are some 307 * gotchas with duplicate keys, however. See the split code for details. 308 * 309 * If a key is too big to fit on a single page, then the i_bytes[] array 310 * contains a pgno pointing to the start of a chain that actually stores 311 * the bytes. Since items on internal pages are never deleted from the 312 * tree, these indirect chains are marked as special, so that they won't 313 * be deleted if the corresponding leaf item is deleted. 314 * 315 * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char) 316 * in order to guarantee that user keys are longword aligned on the disk 317 * page. 318 */ 319 320 typedef struct IDATUM { 321 size_t i_size; 322 pgno_t i_pgno; 323 u_long i_flags; /* see DATUM.d_flags, above */ 324 char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 325 } IDATUM; 326 327 /* all private interfaces have a leading _ in their names */ 328 static BTITEM *_bt_search(); 329 static BTITEM *_bt_searchr(); 330 static BTHEADER *_bt_allocpg(); 331 static index_t _bt_binsrch(); 332 static int _bt_isonpage(); 333 static BTITEM *_bt_first(); 334 static int _bt_release(); 335 static int _bt_wrtmeta(); 336 static int _bt_delindir(); 337 static int _bt_pgout(); 338 static int _bt_pgin(); 339 static int _bt_fixscan(); 340 static int _bt_indirect(); 341 static int _bt_crsrdel(); 342 343 extern int strcmp(); 344 345 static BTREEINFO _DefaultBTInfo = { 346 0, /* flags */ 347 0, /* cachesize */ 348 0, /* psize */ 349 strcmp, /* compare */ 350 0 351 }; 352 353 /* 354 * BTREE_OPEN -- Wrapper routine to open a btree. 355 * 356 * Creates and fills a DB struct, and calls the routine that actually 357 * opens the btree. 358 * 359 * Parameters: 360 * f: filename to open 361 * flags: flag bits passed to open 362 * mode: permission bits, used if O_CREAT specified 363 * b: BTREEINFO pointer 364 * 365 * Returns: 366 * Filled-in DBT on success; NULL on failure, with errno 367 * set as appropriate. 368 * 369 * Side Effects: 370 * Allocates memory for the DB struct. 371 */ 372 373 DB * 374 btree_open(f, flags, mode, b) 375 char *f; 376 int flags; 377 int mode; 378 BTREEINFO *b; 379 { 380 DB *db; 381 BTREE t; 382 383 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 384 return ((DB *) NULL); 385 386 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 387 (void) free ((char *) db); 388 return ((DB *) NULL); 389 } 390 391 db->internal = (char *) t; 392 db->close = bt_close; 393 db->delete = bt_delete; 394 db->get = bt_get; 395 db->put = bt_put; 396 db->seq = bt_seq; 397 db->sync = bt_sync; 398 399 return (db); 400 } 401 402 /* 403 * BT_OPEN -- Open a btree. 404 * 405 * This routine creates the correct kind (disk or in-memory) of 406 * btree and initializes it as required. 407 * 408 * Parameters: 409 * f -- name of btree (NULL for in-memory btrees) 410 * flags -- flags passed to open() 411 * mode -- mode passed to open () 412 * b -- BTREEINFO structure, describing btree 413 * 414 * Returns: 415 * (Opaque) pointer to the btree. On failure, returns NULL 416 * with errno set as appropriate. 417 * 418 * Side Effects: 419 * Allocates memory, opens files. 420 */ 421 422 BTREE 423 bt_open(f, flags, mode, b) 424 char *f; 425 int flags; 426 int mode; 427 BTREEINFO *b; 428 { 429 BTREE_P t; 430 HTABLE ht; 431 int nbytes; 432 int fd; 433 CURSOR *c; 434 BTMETA m; 435 struct stat buf; 436 437 /* use the default info if none was provided */ 438 if (b == (BTREEINFO *) NULL) 439 b = &_DefaultBTInfo; 440 441 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 442 return ((BTREE) NULL); 443 444 if (b->compare) 445 t->bt_compare = b->compare; 446 else 447 t->bt_compare = strcmp; 448 449 t->bt_fname = f; 450 t->bt_curpage = (BTHEADER *) NULL; 451 t->bt_free = P_NONE; 452 c = &(t->bt_cursor); 453 c->c_pgno = P_NONE; 454 c->c_index = 0; 455 c->c_flags = (u_char) NULL; 456 c->c_key = (char *) NULL; 457 t->bt_stack = (BTSTACK *) NULL; 458 t->bt_flags = 0; 459 460 /* 461 * If no file name was supplied, this is an in-memory btree. 462 * Otherwise, it's a disk-based btree. 463 */ 464 if (f == (char *) NULL) { 465 /* in memory */ 466 if ((t->bt_psize = b->psize) < MINPSIZE) { 467 if (t->bt_psize != 0) { 468 (void) free ((char *) t); 469 errno = EINVAL; 470 return ((BTREE) NULL); 471 } 472 t->bt_psize = getpagesize(); 473 } 474 475 nbytes = HTSIZE * sizeof(HTBUCKET *); 476 if ((ht = (HTABLE) malloc((unsigned) nbytes)) 477 == (HTABLE) NULL) { 478 (void) free((char *) t); 479 return ((BTREE) NULL); 480 } 481 (void) bzero((char *) ht, nbytes); 482 t->bt_s.bt_ht = ht; 483 t->bt_npages = 0; 484 t->bt_lorder = BYTE_ORDER; 485 if (!(b->flags & R_DUP)) 486 t->bt_flags |= BTF_NODUPS; 487 } else { 488 /* on disk */ 489 if ((fd = open(f, O_RDONLY, 0)) < 0) { 490 /* doesn't exist yet, be sure page is big enough */ 491 if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 492 && b->psize != 0) { 493 (void) free((char *) t); 494 errno = EINVAL; 495 return ((BTREE) NULL); 496 } 497 if (b->lorder == 0) 498 b->lorder = BYTE_ORDER; 499 500 if (b->lorder != BIG_ENDIAN 501 && b->lorder != LITTLE_ENDIAN) { 502 (void) free((char *) t); 503 errno = EINVAL; 504 return ((BTREE) NULL); 505 } 506 t->bt_lorder = b->lorder; 507 if (!(b->flags & R_DUP)) 508 t->bt_flags |= BTF_NODUPS; 509 } else { 510 /* exists, get page size from file */ 511 if (read(fd, &m, sizeof(m)) < sizeof(m)) { 512 (void) close(fd); 513 (void) free((char *) t); 514 errno = EINVAL; 515 return ((BTREE) NULL); 516 } 517 518 /* lorder always stored in host-independent format */ 519 NTOHL(m.m_lorder); 520 if (m.m_lorder != BIG_ENDIAN 521 && m.m_lorder != LITTLE_ENDIAN) { 522 (void) free((char *) t); 523 errno = EINVAL; 524 return ((BTREE) NULL); 525 } 526 t->bt_lorder = m.m_lorder; 527 528 if (t->bt_lorder != BYTE_ORDER) { 529 BLSWAP(m.m_magic); 530 BLSWAP(m.m_version); 531 BLSWAP(m.m_psize); 532 BLSWAP(m.m_free); 533 BLSWAP(m.m_flags); 534 } 535 536 if (m.m_magic != BTREEMAGIC 537 || m.m_version != BTREEVERSION 538 || m.m_psize < MINPSIZE) { 539 (void) close(fd); 540 (void) free((char *) t); 541 #ifndef EFTYPE 542 #define EFTYPE -100 543 #endif 544 errno = EFTYPE; 545 return ((BTREE) NULL); 546 } 547 t->bt_psize = m.m_psize; 548 t->bt_free = m.m_free; 549 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 550 (void) close(fd); 551 } 552 553 /* now open the file the way the user wanted it */ 554 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 555 (void) free ((char *) t); 556 return ((BTREE) NULL); 557 } 558 559 /* get number of pages, page size if necessary */ 560 (void) fstat(t->bt_s.bt_d.d_fd, &buf); 561 if (t->bt_psize == 0) 562 t->bt_psize = buf.st_blksize; 563 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 564 565 /* page zero is metadata, doesn't count */ 566 if (t->bt_npages > 0) 567 --(t->bt_npages); 568 569 if (b->cachesize == 0) 570 b->cachesize = DEFCACHE; 571 572 /* get an lru buffer cache, if the user asked for one */ 573 if ((b->cachesize / t->bt_psize) > 0) { 574 BTDISK *d = &(t->bt_s.bt_d); 575 576 d->d_cache = lruinit(d->d_fd, 577 b->cachesize / t->bt_psize, 578 t->bt_psize, 579 (char *) t->bt_lorder, 580 _bt_pgin, _bt_pgout); 581 582 if (d->d_cache == (char *) NULL) { 583 (void) free((char *) t); 584 return ((BTREE) NULL); 585 } 586 } 587 } 588 589 /* remember if tree was opened for write */ 590 if (((flags & O_WRONLY) == O_WRONLY) 591 || ((flags & O_RDWR) == O_RDWR)) 592 t->bt_flags |= BTF_ISWRITE; 593 594 return ((BTREE) t); 595 } 596 597 /* 598 * BT_GET -- Get an entry from a btree. 599 * 600 * Does a key lookup in the tree to find the specified key, and returns 601 * the key/data pair found. 602 * 603 * Parameters: 604 * tree -- btree in which to do lookup 605 * key -- key to find 606 * data -- pointer to DBT in which to return data 607 * flag -- ignored 608 * 609 * Returns: 610 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 611 * found. If key is not found, nothing is stored in the 612 * return DBT 'data'. 613 * 614 * Side Effects: 615 * None. 616 * 617 * Warnings: 618 * Return data is statically allocated, and will be overwritten 619 * at the next call. 620 */ 621 622 int 623 bt_get(tree, key, data, flag) 624 BTREE tree; 625 DBT *key; 626 DBT *data; 627 int flag; 628 { 629 BTREE_P t = (BTREE_P) tree; 630 BTHEADER *h; 631 DATUM *d; 632 BTITEM *item; 633 634 /* lookup */ 635 item = _bt_search(t, key); 636 if (item == (BTITEM *) NULL) 637 return (RET_ERROR); 638 639 /* clear parent pointer stack */ 640 while (_bt_pop(t) != P_NONE) 641 continue; 642 643 h = (BTHEADER *) t->bt_curpage; 644 data->size = 0; 645 data->data = (char *) NULL; 646 647 /* match? */ 648 if (VALIDITEM(t, item) 649 && _bt_cmp(t, key->data, item->bti_index) == 0) { 650 d = (DATUM *) GETDATUM(h, item->bti_index); 651 return (_bt_buildret(t, d, data, key)); 652 } 653 654 /* nope */ 655 return (RET_SPECIAL); 656 } 657 658 /* 659 * BT_PUT -- Add an entry to a btree. 660 * 661 * The specified (key, data) pair is added to the tree. If the tree 662 * was created for unique keys only, then duplicates will not be 663 * entered. If the requested key exists in the tree, it will be over- 664 * written unless the flags parameter is R_NOOVERWRITE, in which case 665 * the update will not be done. If duplicate keys are permitted in the 666 * tree, duplicates will be inserted and will not overwrite existing 667 * keys. Nodes are split as required. 668 * 669 * Parameters: 670 * tree -- btree in which to put the new entry 671 * key -- key component to add 672 * data -- data corresponding to key 673 * flag -- R_NOOVERWRITE or zero. 674 * 675 * Returns: 676 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 677 * NOOVERWRITE flag was set and the specified key exists 678 * in the database. 679 * 680 * Side Effects: 681 * None. 682 */ 683 684 int 685 bt_put(tree, key, data, flag) 686 BTREE tree; 687 DBT *key; 688 DBT *data; 689 int flag; 690 { 691 BTREE_P t = (BTREE_P) tree; 692 BTITEM *item; 693 694 /* look for this key in the tree */ 695 item = _bt_search(t, key); 696 697 /* 698 * If this tree was originally created without R_DUP, then duplicate 699 * keys are not allowed. We need to check this at insertion time. 700 */ 701 702 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 703 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 704 if (_bt_delone(t, item->bti_index) == RET_ERROR) 705 return (RET_ERROR); 706 } 707 } 708 709 return (_bt_insert(t, item, key, data, flag)); 710 } 711 712 /* 713 * BT_DELETE -- delete a key from the tree. 714 * 715 * Deletes all keys (and their associated data items) matching the 716 * supplied key from the tree. If the flags entry is R_CURSOR, then 717 * the current item in the active scan is deleted. 718 * 719 * Parameters: 720 * tree -- btree from which to delete key 721 * key -- key to delete 722 * flags -- R_CURSOR or zero 723 * 724 * Returns: 725 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 726 * key was not in the tree. 727 * 728 * Side Effects: 729 * None. 730 */ 731 732 int 733 bt_delete(tree, key, flags) 734 BTREE tree; 735 DBT *key; 736 int flags; 737 { 738 BTREE_P t = (BTREE_P) tree; 739 BTHEADER *h; 740 BTITEM *item; 741 int ndel = 0; 742 743 if (flags == R_CURSOR) 744 return (_bt_crsrdel(t)); 745 746 /* find the first matching key in the tree */ 747 item = _bt_first(t, key); 748 h = t->bt_curpage; 749 750 /* delete all matching keys */ 751 for (;;) { 752 while (VALIDITEM(t, item) 753 && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 754 if (_bt_delone(t, item->bti_index) == RET_ERROR) 755 return (RET_ERROR); 756 ndel++; 757 } 758 759 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 760 break; 761 762 /* next page, if necessary */ 763 do { 764 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 765 return (RET_ERROR); 766 h = t->bt_curpage; 767 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 768 769 item->bti_pgno = h->h_pgno; 770 item->bti_index = 0; 771 772 if (!VALIDITEM(t, item) 773 || _bt_cmp(t, key->data, item->bti_index) != 0) 774 break; 775 } 776 777 /* clean up the parent stack */ 778 while (_bt_pop(t) != P_NONE) 779 continue; 780 781 /* flush changes to disk */ 782 if (ISDISK(t)) { 783 if (h->h_flags & F_DIRTY) { 784 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 785 return (RET_ERROR); 786 } 787 } 788 789 if (ndel == 0) 790 return (RET_SPECIAL); 791 792 return (RET_SUCCESS); 793 } 794 795 /* 796 * _BT_CRSRDEL -- Delete the item pointed to by the cursor. 797 * 798 * This routine deletes the item most recently returned by a scan 799 * through the tree. Since it only makes sense to delete the current 800 * record once, we make sure that we don't try to delete twice without 801 * advancing the scan. 802 * 803 * Parameters: 804 * t -- tree in which to do deletion 805 * 806 * Returns: 807 * RET_SUCCESS, RET_ERROR. 808 * 809 * Side Effects: 810 * The call to _bt_delone marks the cursor, so we can tell that 811 * the current record has been deleted. 812 */ 813 814 static int 815 _bt_crsrdel(t) 816 BTREE_P t; 817 { 818 CURSOR *c; 819 820 c = &(t->bt_cursor); 821 822 /* a cursor must exist, and can't have deleted the current key yet */ 823 if (!(t->bt_flags & BTF_SEQINIT) || (c->c_flags & CRSR_BEFORE)) { 824 errno = EINVAL; 825 return (RET_ERROR); 826 } 827 828 if (_bt_getpage(t, c->c_pgno) == RET_ERROR) 829 return (RET_ERROR); 830 831 if (c->c_index >= NEXTINDEX(t->bt_curpage)) { 832 errno = EINVAL; 833 return (RET_ERROR); 834 } 835 836 return (_bt_delone(t, c->c_index)); 837 } 838 839 /* 840 * _BT_DELONE -- Delete a single entry from a btree. 841 * 842 * This routine physically removes a btree entry from a leaf page. 843 * IDATUM items are *never* removed from internal nodes, regardless 844 * of whether the entries that originally caused them to be added 845 * are removed from the tree or not. In addition, pages made empty 846 * by element deletion are not actually reclaimed. They are, 847 * however, made available for reuse. 848 * 849 * To delete an item from a page, we pack the remaining items at 850 * the end of the page, overwriting the deleted item's entry. We 851 * move the line pointers backward on the page, overwriting the 852 * original item's line pointer. This guarantees that the space in 853 * the middle of the page is free -- a property that our insertion 854 * strategy relies on. 855 * 856 * This routine doesn't reclaim pages all of whose entries have 857 * been deleted. These pages are available for reuse, however. 858 * If an item is deleted that was too big to fit on a page, then 859 * the blocks that it occupies are put on a free list for reuse. 860 * 861 * Parameters: 862 * t -- btree from which to delete item 863 * index -- index of entry on current page to delete 864 * 865 * Returns: 866 * RET_SUCCESS, RET_ERROR. 867 * 868 * Side Effects: 869 * Physically changes page layout, adjusts internal page 870 * state to reflect the deletion of the item, and updates 871 * the list of free pages for this tree. 872 */ 873 874 static int 875 _bt_delone(t, index) 876 BTREE_P t; 877 index_t index; 878 { 879 char *src, *dest; 880 int nbytes, nmoved; 881 index_t off; 882 index_t top; 883 index_t i; 884 pgno_t chain; 885 BTHEADER *h; 886 CURSOR *c; 887 DATUM *d; 888 889 /* deletion may confuse an active scan. fix it. */ 890 c = &(t->bt_cursor); 891 if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno) 892 if (_bt_fixscan(t, index, (DATUM *) NULL, DELETE) == RET_ERROR) 893 return (RET_ERROR); 894 895 h = t->bt_curpage; 896 off = h->h_linp[index]; 897 d = (DATUM *) GETDATUM(h, index); 898 899 /* if this is a big item, reclaim the space it occupies */ 900 if (d->d_flags & D_BIGKEY) { 901 bcopy(&(d->d_bytes[0]), 902 (char *) &chain, 903 sizeof(chain)); 904 if (_bt_delindir(t, chain) == RET_ERROR) 905 return (RET_ERROR); 906 h = t->bt_curpage; 907 d = (DATUM *) GETDATUM(h, index); 908 } 909 if (d->d_flags & D_BIGDATA) { 910 bcopy(&(d->d_bytes[d->d_ksize]), 911 (char *) &chain, 912 sizeof(chain)); 913 if (_bt_delindir(t, chain) == RET_ERROR) 914 return (RET_ERROR); 915 h = t->bt_curpage; 916 d = (DATUM *) GETDATUM(h, index); 917 } 918 919 /* move the data down on the page */ 920 nbytes = d->d_ksize + d->d_dsize 921 + (sizeof(DATUM) - sizeof(char)); 922 nbytes = LONGALIGN(nbytes); 923 src = ((char *) h) + h->h_upper; 924 dest = src + nbytes; 925 nmoved = (int) (((char *) d) - src); 926 (void) bcopy(src, dest, nmoved); 927 928 /* next move the line pointers up */ 929 src = (char *) &(h->h_linp[index + 1]); 930 dest = (char *) &(h->h_linp[index]); 931 nmoved = (int) (((char *) &(h->h_linp[NEXTINDEX(h)])) - src); 932 (void) bcopy(src, dest, nmoved); 933 934 /* remember that we freed up some space */ 935 h->h_upper += nbytes; 936 h->h_lower -= sizeof(index_t); 937 938 /* adjust offsets in line pointers affected by moving the data */ 939 top = NEXTINDEX(h); 940 for (i = 0; i < top; i++) { 941 if (h->h_linp[i] < off) 942 h->h_linp[i] += nbytes; 943 } 944 945 /* it's gone */ 946 h->h_flags |= F_DIRTY; 947 948 return (RET_SUCCESS); 949 } 950 951 /* 952 * _BT_FIXSCAN -- Adjust a scan to cope with a change in tree structure. 953 * 954 * If the user has an active scan on the database, and we delete an 955 * item from the page the cursor is pointing at, we need to figure 956 * out what to do about it. Basically, the solution is to point 957 * "between" keys in the tree, using the CRSR_BEFORE flag. The 958 * requirement is that the user should not miss any data in the 959 * tree during a scan, just because he happened to do some deletions 960 * or insertions while it was active. 961 * 962 * In order to guarantee that the scan progresses properly, we need 963 * to save the key of any deleted item we were pointing at, so that 964 * we can check later insertions against it. 965 * 966 * Parameters: 967 * t -- tree to adjust 968 * index -- index of item at which change was made 969 * newd -- new datum (for insertions only) 970 * op -- operation (DELETE or INSERT) causing change 971 * 972 * Returns: 973 * RET_SUCCESS, RET_ERROR (errno is set). 974 * 975 * Side Effects: 976 * None. 977 */ 978 979 static int 980 _bt_fixscan(t, index, newd, op) 981 BTREE_P t; 982 index_t index; 983 DATUM *newd; 984 int op; 985 { 986 BTHEADER *h; 987 CURSOR *c; 988 DATUM *d; 989 990 h = t->bt_curpage; 991 c = &(t->bt_cursor); 992 993 if (op == DELETE) { 994 if (index < c->c_index) 995 c->c_index--; 996 else if (index == c->c_index) { 997 if (!(c->c_flags & CRSR_BEFORE)) { 998 if (_bt_crsrkey(t) == RET_ERROR) 999 return (RET_ERROR); 1000 c->c_flags |= CRSR_BEFORE; 1001 } 1002 } 1003 } else { 1004 /* 1005 * If we previously deleted the object at this location, 1006 * and now we're inserting a new one, we need to do the 1007 * right thing -- the cursor should come either before 1008 * or after the new item, depending on the key that was 1009 * here, and the new one. 1010 */ 1011 1012 if (c->c_flags & CRSR_BEFORE) { 1013 if (index <= c->c_index) { 1014 char *tmp; 1015 int itmp; 1016 pgno_t chain; 1017 int r; 1018 1019 d = (DATUM *) GETDATUM(t->bt_curpage, index); 1020 if (d->d_flags & D_BIGKEY) { 1021 bcopy(&(newd->d_bytes[0]), 1022 (char *) &chain, 1023 sizeof(chain)); 1024 if (_bt_getbig(t, chain, &tmp, &itmp) 1025 == RET_ERROR) 1026 return (RET_ERROR); 1027 } else 1028 tmp = &(newd->d_bytes[0]); 1029 1030 r = (*(t->bt_compare))(tmp, c->c_key); 1031 if (r < 0) 1032 c->c_index++; 1033 1034 if (d->d_flags & D_BIGKEY) 1035 (void) free (tmp); 1036 } 1037 } else if (index <= c->c_index) 1038 c->c_index++; 1039 } 1040 return (RET_SUCCESS); 1041 } 1042 1043 /* 1044 * _BT_CRSRKEY -- Save a copy of the key of the record that the cursor 1045 * is pointing to. The record is about to be deleted. 1046 * 1047 * Parameters: 1048 * t -- btree 1049 * 1050 * Returns: 1051 * RET_SUCCESS, RET_ERROR. 1052 * 1053 * Side Effects: 1054 * Allocates memory for the copy which should be released when 1055 * it is no longer needed. 1056 */ 1057 1058 static int 1059 _bt_crsrkey(t) 1060 BTREE_P t; 1061 { 1062 CURSOR *c; 1063 DATUM *d; 1064 pgno_t pgno; 1065 int ignore; 1066 1067 c = &(t->bt_cursor); 1068 d = (DATUM *) GETDATUM(t->bt_curpage, c->c_index); 1069 1070 if (d->d_flags & D_BIGKEY) { 1071 bcopy(&(d->d_bytes[0]), (char *) &pgno, sizeof(pgno)); 1072 return (_bt_getbig(t, pgno, &(c->c_key), &ignore)); 1073 } else { 1074 if ((c->c_key = (char *) malloc(d->d_ksize)) == (char *) NULL) 1075 return (RET_ERROR); 1076 1077 bcopy(&(d->d_bytes[0]), c->c_key, d->d_ksize); 1078 } 1079 return (RET_SUCCESS); 1080 } 1081 1082 /* 1083 * _BT_GETBIG -- Get big data from indirect pages. 1084 * 1085 * This routine chases indirect blocks for the big object at the 1086 * specified page to a buffer, and return the address of the buffer. 1087 * 1088 * Parameters: 1089 * t -- btree with the indirect blocks 1090 * pgno -- page number that starts the chain 1091 * p -- (char **) to get the address of the buffer containing 1092 * the key or datum. 1093 * sz -- pointer to an int to get the size of the instantiated 1094 * object. 1095 * 1096 * Returns: 1097 * RET_SUCCESS, RET_ERROR. 1098 * 1099 * Side Effects: 1100 * None. 1101 */ 1102 1103 static int 1104 _bt_getbig(t, pgno, p, sz) 1105 BTREE_P t; 1106 pgno_t pgno; 1107 char **p; 1108 int *sz; 1109 { 1110 pgno_t save; 1111 size_t nbytes; 1112 size_t nhere; 1113 BTHEADER *h; 1114 char *top, *from, *where; 1115 1116 save = t->bt_curpage->h_pgno; 1117 if (_bt_getpage(t, pgno) == RET_ERROR) 1118 return (RET_ERROR); 1119 1120 h = t->bt_curpage; 1121 1122 bcopy((char *) &(h->h_linp[0]), (char *) &nbytes, sizeof(nbytes)); 1123 1124 if ((*p = (char *) malloc(nbytes)) == (char *) NULL) 1125 return (RET_ERROR); 1126 1127 *sz = nbytes; 1128 from = ((char *) (&h->h_linp[0])) + sizeof(nbytes); 1129 top = ((char *) h) + t->bt_psize; 1130 1131 /* need more space for data? */ 1132 1133 where = *p; 1134 1135 while (nbytes > 0) { 1136 nhere = (int) (top - from); 1137 if (nhere > nbytes) { 1138 (void) bcopy(from, where, nbytes); 1139 nbytes = 0; 1140 } else { 1141 (void) bcopy(from, where, nhere); 1142 where += nhere; 1143 nbytes -= nhere; 1144 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 1145 return (RET_ERROR); 1146 h = t->bt_curpage; 1147 top = ((char *) h) + t->bt_psize; 1148 from = (char *) &(h->h_linp[0]); 1149 } 1150 } 1151 1152 if (_bt_getpage(t, save) == RET_ERROR) 1153 return (RET_ERROR); 1154 1155 return (RET_SUCCESS); 1156 } 1157 1158 /* 1159 * _BT_DELINDIR -- Delete a chain of indirect blocks from the btree. 1160 * 1161 * When a large item is deleted from the tree, this routine puts the 1162 * space that it occupied onto the free list for later reuse. The 1163 * bt_free entry in the btree structure points at the head of this list. 1164 * This value is also stored on disk in the btree's metadata. 1165 * 1166 * Parameters: 1167 * t -- btree from which to delete pages 1168 * chain -- page number that starts the chain. 1169 * 1170 * Returns: 1171 * RET_SUCCESS, RET_ERROR. 1172 * 1173 * Side Effects: 1174 * Invalidates the current on-disk version of the btree's 1175 * metadata (if any), and updates the free list appropriately. 1176 */ 1177 1178 static int 1179 _bt_delindir(t, chain) 1180 BTREE_P t; 1181 pgno_t chain; 1182 { 1183 BTHEADER *h; 1184 pgno_t save; 1185 pgno_t oldfree; 1186 1187 h = t->bt_curpage; 1188 save = h->h_pgno; 1189 if (_bt_getpage(t, chain) == RET_ERROR) 1190 return (RET_ERROR); 1191 1192 /* 1193 * If some internal node is pointing at this chain, don't 1194 * delete it. 1195 */ 1196 1197 if (t->bt_curpage->h_flags & F_PRESERVE) { 1198 if (_bt_getpage(t, save) == RET_ERROR) 1199 return (RET_ERROR); 1200 return (RET_SUCCESS); 1201 } 1202 1203 /* if there's nothing on the free list, this is easy... */ 1204 if (t->bt_free == P_NONE) { 1205 t->bt_free = chain; 1206 } else { 1207 oldfree = t->bt_free; 1208 1209 /* find the end of the data chain for the deleted datum */ 1210 t->bt_free = chain; 1211 do { 1212 if (_bt_getpage(t, chain) == RET_ERROR) 1213 return (RET_ERROR); 1214 h = t->bt_curpage; 1215 if (h->h_nextpg != P_NONE) 1216 chain = h->h_nextpg; 1217 } while (h->h_nextpg != P_NONE); 1218 1219 /* link freed pages into free list */ 1220 h->h_nextpg = oldfree; 1221 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1222 return (RET_ERROR); 1223 if (_bt_getpage(t, oldfree) == RET_ERROR) 1224 return (RET_ERROR); 1225 h = t->bt_curpage; 1226 h->h_prevpg = chain; 1227 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1228 return (RET_ERROR); 1229 } 1230 1231 /* restore the tree's current page pointer */ 1232 if (_bt_getpage(t, save) == RET_ERROR) 1233 return (RET_ERROR); 1234 1235 /* we have trashed the tree metadata; rewrite it later */ 1236 t->bt_flags &= ~BTF_METAOK; 1237 1238 return (RET_SUCCESS); 1239 } 1240 1241 /* 1242 * _BT_FIRST -- Find the first item in the tree that matches the supplied 1243 * key. 1244 * 1245 * This routine supports deletion. When the user supplies a key to 1246 * be deleted, we find the first one, and iteratively delete all the 1247 * matching ones that follow it. 1248 * 1249 * Parameters: 1250 * t -- btree in which to find first occurrence 1251 * key -- key to find 1252 * 1253 * Returns: 1254 * The BTITEM for the matching item. If there's no match, 1255 * this may point to the first item > than the supplied key, 1256 * or off the end of the page. 1257 * 1258 * Warnings: 1259 * The BTITEM returned is in static space and will be overwritten 1260 * by the next search of any kind in any btree. 1261 */ 1262 1263 static BTITEM * 1264 _bt_first(t, key) 1265 BTREE_P t; 1266 DBT *key; 1267 { 1268 BTHEADER *h; 1269 BTITEM *item; 1270 index_t next; 1271 int r; 1272 1273 /* find any matching item */ 1274 item = _bt_search(t, key); 1275 h = t->bt_curpage; 1276 next = NEXTINDEX(h); 1277 1278 /* if we're off the end of the page, search failed and we're done */ 1279 if (item->bti_index >= next) 1280 return (item); 1281 1282 /* as long as we have an exact match, walk backwards */ 1283 while ((r = _bt_cmp(t, key->data, item->bti_index)) == 0) { 1284 1285 /* at start of page? */ 1286 if (item->bti_index == 0) { 1287 1288 /* if no prev page, we're done */ 1289 if (h->h_prevpg == P_NONE) 1290 return (item); 1291 1292 /* walk backward, skipping empty pages */ 1293 do { 1294 if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 1295 return ((BTITEM *) NULL); 1296 h = t->bt_curpage; 1297 } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE); 1298 1299 if (NEXTINDEX(h) != 0) 1300 item->bti_index = NEXTINDEX(h) - 1; 1301 else 1302 item->bti_index = 0; 1303 1304 item->bti_pgno = h->h_pgno; 1305 } else { 1306 item->bti_index--; 1307 } 1308 } 1309 1310 /* if we went too far backwards, step forward one entry */ 1311 if (r > 0) { 1312 if (++(item->bti_index) >= NEXTINDEX(h) 1313 && h->h_nextpg != P_NONE) { 1314 1315 /* walk forward, skipping empty pages */ 1316 do { 1317 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 1318 return ((BTITEM *) NULL); 1319 h = t->bt_curpage; 1320 } while (h->h_nextpg != P_NONE && NEXTINDEX(h) == 0); 1321 1322 item->bti_index = 0; 1323 item->bti_pgno = h->h_pgno; 1324 } 1325 } 1326 1327 /* got it */ 1328 return (item); 1329 } 1330 1331 /* 1332 * _BT_SEARCH, _BT_SEARCHR -- Search for a particular key in the tree. 1333 * 1334 * Parameters: 1335 * t -- btree in which to search 1336 * key -- key to find 1337 * 1338 * Returns: 1339 * BTITEM for matching item, if any, or the BTITEM for the 1340 * location of the key, if it were in the tree. 1341 * 1342 * Warnings: 1343 * The BTITEM returned is in static memory, and will be 1344 * overwritten by the next search of any kind in any tree. 1345 */ 1346 1347 static BTITEM * 1348 _bt_search(t, key) 1349 BTREE_P t; 1350 DBT *key; 1351 { 1352 /* we want to start all of our searches at the root */ 1353 if (_bt_getpage(t, P_ROOT) == RET_ERROR) 1354 return ((BTITEM *) NULL); 1355 1356 return (_bt_searchr(t, key)); 1357 } 1358 1359 static BTITEM * 1360 _bt_searchr(t, key) 1361 BTREE_P t; 1362 DBT *key; 1363 { 1364 BTHEADER *h = t->bt_curpage; 1365 index_t index; 1366 IDATUM *id; 1367 DATUM *d; 1368 static BTITEM item; 1369 1370 /* do a binary search on the current page */ 1371 index = _bt_binsrch(t, &(key->data[0])); 1372 1373 /* 1374 * At this point, the binary search terminated because the endpoints 1375 * got too close together, or we have a match. Figure out which 1376 * case applies and decide what to do based on the page type. 1377 */ 1378 if (h->h_flags & F_LEAF) { 1379 item.bti_pgno = h->h_pgno; 1380 item.bti_index = index; 1381 if (index < NEXTINDEX(h)) 1382 d = (DATUM *) GETDATUM(h,index); 1383 else 1384 d = (DATUM *) NULL; 1385 1386 item.bti_datum = d; 1387 return(&item); 1388 } else { 1389 id = (IDATUM *) GETDATUM(h, index); 1390 _bt_push(t, h->h_pgno); 1391 if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 1392 return ((BTITEM *) NULL); 1393 return (_bt_searchr(t, key)); 1394 } 1395 } 1396 1397 /* 1398 * BT_GETPAGE -- Make pgno the current page of the btree. 1399 * 1400 * This routine is just a wrapper that decides whether to call the 1401 * memory or disk-based routine to do the work. 1402 * 1403 * Parameters: 1404 * t -- btree in which to get page 1405 * pgno -- page number to get 1406 * 1407 * Returns: 1408 * RET_SUCCESS or RET_ERROR. 1409 */ 1410 1411 static int 1412 _bt_getpage(t, pgno) 1413 BTREE_P t; 1414 pgno_t pgno; 1415 { 1416 #ifdef DEBUG 1417 if (pgno == P_NONE) 1418 _punt(); 1419 #endif /* DEBUG */ 1420 1421 /* see if we can get away without doing any work */ 1422 if (t->bt_curpage != (BTHEADER *) NULL) { 1423 if (t->bt_curpage->h_pgno == pgno) 1424 return (RET_SUCCESS); 1425 } 1426 1427 if (t->bt_fname == (char *) NULL) 1428 return (_bt_getmpage(t, pgno)); 1429 else 1430 return (_bt_getdpage(t, pgno)); 1431 } 1432 1433 /* 1434 * _BT_GETMPAGE -- Make pgno the current page of the btree. 1435 * 1436 * This routine gets pages for in-memory btrees. 1437 * 1438 * Parameters: 1439 * t -- btree in which to get page 1440 * pgno -- page number to get 1441 * 1442 * Returns: 1443 * RET_SUCCESS or RET_ERROR. 1444 */ 1445 1446 static int 1447 _bt_getmpage(t, pgno) 1448 register BTREE_P t; 1449 pgno_t pgno; 1450 { 1451 int htindex; 1452 int nbytes; 1453 BTHEADER *h; 1454 HTBUCKET *b; 1455 1456 if (t->bt_curpage == (BTHEADER *) NULL) { 1457 if (pgno != P_ROOT) { 1458 errno = EBADF; 1459 return (RET_ERROR); 1460 } 1461 1462 t->bt_npages++; 1463 h = (BTHEADER *) malloc((unsigned) t->bt_psize); 1464 if (h == (BTHEADER *) NULL) 1465 return (RET_ERROR); 1466 1467 h->h_pgno = P_ROOT; 1468 h->h_flags = F_LEAF; 1469 h->h_lower = (index_t) 1470 (((char *) &(h->h_linp[0])) - ((char *) h)); 1471 h->h_upper = t->bt_psize; 1472 h->h_prevpg = h->h_nextpg = P_NONE; 1473 1474 t->bt_curpage = h; 1475 1476 /* get the root page into the hash table */ 1477 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1478 return (RET_ERROR); 1479 } 1480 1481 htindex = HASHKEY(pgno); 1482 1483 for (b = t->bt_s.bt_ht[htindex]; 1484 b != (HTBUCKET *) NULL; 1485 b = b->ht_next) { 1486 if (b->ht_pgno == pgno) { 1487 t->bt_curpage = b->ht_page; 1488 return (RET_SUCCESS); 1489 } 1490 } 1491 return (RET_ERROR); 1492 } 1493 1494 /* 1495 * _BT_GETDPAGE -- Make pgno the current page of the btree. 1496 * 1497 * This routine gets pages for disk btrees. 1498 * 1499 * Because disk btree pages must be readable across machine architectures, 1500 * the btree code writes integers out in network format. This routine 1501 * converts them back to host format before returning the page. 1502 * 1503 * Parameters: 1504 * t -- btree in which to get page 1505 * pgno -- page number to get 1506 * 1507 * Returns: 1508 * RET_SUCCESS, RET_ERROR. 1509 */ 1510 1511 static int 1512 _bt_getdpage(t, pgno) 1513 register BTREE_P t; 1514 pgno_t pgno; 1515 { 1516 BTHEADER *h; 1517 char *cache; 1518 long pos; 1519 int n, nbytes; 1520 1521 /* if we have an lru cache, let the cache code do the work */ 1522 if (ISCACHE(t)) { 1523 cache = t->bt_s.bt_d.d_cache; 1524 1525 /* release the old page */ 1526 if (t->bt_curpage != (BTHEADER *) NULL) { 1527 pgno_t opgno = t->bt_curpage->h_pgno; 1528 t->bt_curpage->h_flags &= ~F_DIRTY; 1529 1530 if (lruwrite(cache, opgno) < 0) 1531 return (RET_ERROR); 1532 1533 lrurelease(cache, opgno); 1534 } 1535 1536 if (pgno > t->bt_npages) { 1537 if ((h = (BTHEADER *) lrugetnew(cache, pgno, &nbytes)) 1538 == (BTHEADER *) NULL) 1539 return (RET_ERROR); 1540 t->bt_npages = pgno; 1541 } else { 1542 if ((h = (BTHEADER *) lruget(cache, pgno, &nbytes)) 1543 == (BTHEADER *) NULL) 1544 return (RET_ERROR); 1545 } 1546 1547 /* init this page, if necessary */ 1548 if (nbytes == 0) { 1549 h->h_pgno = pgno; 1550 h->h_flags = F_LEAF; 1551 h->h_lower = (index_t) 1552 (((char *) &(h->h_linp[0])) - ((char *) h)); 1553 h->h_upper = t->bt_psize; 1554 h->h_prevpg = h->h_nextpg = P_NONE; 1555 } 1556 1557 t->bt_curpage = h; 1558 1559 return (RET_SUCCESS); 1560 } 1561 1562 /* sync the current page, if necessary */ 1563 if (t->bt_curpage != (BTHEADER *) NULL) { 1564 if (t->bt_curpage->h_flags & F_DIRTY) 1565 if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR) 1566 return (RET_ERROR); 1567 } else { 1568 if (t->bt_npages == 0) 1569 t->bt_npages = 1; 1570 1571 /* if no current page, get space for one */ 1572 if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize)) 1573 == (BTHEADER *) NULL) { 1574 return (RET_ERROR); 1575 } 1576 } 1577 1578 n = t->bt_psize; 1579 pos = (long) (pgno * n); 1580 1581 /* seek to correct location in file */ 1582 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) { 1583 return (RET_ERROR); 1584 } 1585 1586 /* read the page */ 1587 if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) { 1588 1589 /* 1590 * If we didn't get a full page, we must have gotten no 1591 * data at all -- in which case we're trying to read a 1592 * root page that doesn't exist yet. This is the only 1593 * case in which this is okay. If this happens, construct 1594 * an empty root page by hand. 1595 */ 1596 if (nbytes != 0 || pgno != P_ROOT) { 1597 errno = EBADF; 1598 return (RET_ERROR); 1599 } 1600 1601 h = (BTHEADER *) t->bt_curpage; 1602 h->h_pgno = pgno; 1603 h->h_flags = F_LEAF; 1604 h->h_lower = (index_t) 1605 (((char *) &(h->h_linp[0])) - ((char *) h)); 1606 h->h_upper = t->bt_psize; 1607 h->h_prevpg = h->h_nextpg = P_NONE; 1608 } else 1609 (void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder); 1610 1611 return (RET_SUCCESS); 1612 } 1613 1614 /* 1615 * _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from 1616 * the host-independent format stored on disk. 1617 * 1618 * Parameters: 1619 * h -- page to convert 1620 * _lorder -- byte order for pages (stored as a char * in the 1621 * cache, and passed around as a magic cookie). 1622 * 1623 * Returns: 1624 * RET_SUCCESS (lru code requires a return value). 1625 * 1626 * Side Effects: 1627 * Layout of tree metadata on the page is changed in place. 1628 * 1629 * Warnings: 1630 * Everywhere else in the code, the types pgno_t and index_t 1631 * are opaque. These two routines know what they really 1632 * are. 1633 */ 1634 1635 static int 1636 _bt_pgout(h, _lorder) 1637 BTHEADER *h; 1638 char *_lorder; 1639 { 1640 int i; 1641 int top; 1642 int lorder; 1643 DATUM *d; 1644 IDATUM *id; 1645 size_t chain; 1646 1647 lorder = (int) _lorder; 1648 if (lorder == BYTE_ORDER) 1649 return (RET_SUCCESS); 1650 1651 if (h->h_flags & F_LEAF) { 1652 if (h->h_flags & F_CONT) { 1653 if (h->h_prevpg == P_NONE) { 1654 size_t longsz; 1655 1656 (void) bcopy((char *) &(h->h_linp[0]), 1657 (char *) &longsz, 1658 sizeof(longsz)); 1659 BLSWAP(longsz); 1660 (void) bcopy((char *) &longsz, 1661 (char *) &(h->h_linp[0]), 1662 sizeof(longsz)); 1663 } 1664 } else { 1665 top = NEXTINDEX(h); 1666 for (i = 0; i < top; i++) { 1667 d = (DATUM *) GETDATUM(h, i); 1668 if (d->d_flags & D_BIGKEY) { 1669 (void) bcopy((char *) &(d->d_bytes[0]), 1670 (char *) &chain, 1671 sizeof(chain)); 1672 BLSWAP(chain); 1673 (void) bcopy((char *) &chain, 1674 (char *) &(d->d_bytes[0]), 1675 sizeof(chain)); 1676 } 1677 if (d->d_flags & D_BIGDATA) { 1678 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 1679 (char *) &chain, 1680 sizeof(chain)); 1681 BLSWAP(chain); 1682 (void) bcopy((char *) &chain, 1683 (char *) &(d->d_bytes[d->d_ksize]), 1684 sizeof(chain)); 1685 } 1686 BLSWAP(d->d_dsize); 1687 BLSWAP(d->d_ksize); 1688 BLSWAP(d->d_flags); 1689 BLSWAP(h->h_linp[i]); 1690 } 1691 } 1692 } else { 1693 top = NEXTINDEX(h); 1694 for (i = 0; i < top; i++) { 1695 id = (IDATUM *) GETDATUM(h, i); 1696 BLSWAP(id->i_size); 1697 BLSWAP(id->i_pgno); 1698 BLSWAP(id->i_flags); 1699 if (id->i_flags & D_BIGKEY) { 1700 (void) bcopy((char *) &(id->i_bytes[0]), 1701 (char *) &chain, 1702 sizeof(chain)); 1703 BLSWAP(chain); 1704 (void) bcopy((char *) &chain, 1705 (char *) &(id->i_bytes[0]), 1706 sizeof(chain)); 1707 } 1708 BLSWAP(h->h_linp[i]); 1709 } 1710 } 1711 BLSWAP(h->h_flags); 1712 BLSWAP(h->h_pgno); 1713 BLSWAP(h->h_prevpg); 1714 BLSWAP(h->h_nextpg); 1715 BLSWAP(h->h_lower); 1716 BLSWAP(h->h_upper); 1717 1718 return (RET_SUCCESS); 1719 } 1720 1721 static int 1722 _bt_pgin(h, _lorder) 1723 BTHEADER *h; 1724 char *_lorder; 1725 { 1726 int i; 1727 int top; 1728 int lorder; 1729 DATUM *d; 1730 IDATUM *id; 1731 size_t chain; 1732 1733 /* 1734 * If btree pages are to be stored in the host byte order, don't 1735 * bother swapping. 1736 */ 1737 lorder = (int) _lorder; 1738 if (lorder == BYTE_ORDER) 1739 return (RET_SUCCESS); 1740 1741 BLSWAP(h->h_upper); 1742 BLSWAP(h->h_lower); 1743 BLSWAP(h->h_nextpg); 1744 BLSWAP(h->h_prevpg); 1745 BLSWAP(h->h_pgno); 1746 BLSWAP(h->h_flags); 1747 1748 if (h->h_flags & F_LEAF) { 1749 if (h->h_flags & F_CONT) { 1750 if (h->h_prevpg == P_NONE) { 1751 size_t longsz; 1752 1753 (void) bcopy((char *) &(h->h_linp[0]), 1754 (char *) &longsz, 1755 sizeof(longsz)); 1756 BLSWAP(longsz); 1757 (void) bcopy((char *) &longsz, 1758 (char *) &(h->h_linp[0]), 1759 sizeof(longsz)); 1760 } 1761 } else { 1762 top = NEXTINDEX(h); 1763 for (i = 0; i < top; i++) { 1764 BLSWAP(h->h_linp[i]); 1765 d = (DATUM *) GETDATUM(h, i); 1766 BLSWAP(d->d_dsize); 1767 BLSWAP(d->d_ksize); 1768 BLSWAP(d->d_flags); 1769 if (d->d_flags & D_BIGKEY) { 1770 (void) bcopy((char *) &(d->d_bytes[0]), 1771 (char *) &chain, 1772 sizeof(chain)); 1773 BLSWAP(chain); 1774 (void) bcopy((char *) &chain, 1775 (char *) &(d->d_bytes[0]), 1776 sizeof(chain)); 1777 } 1778 if (d->d_flags & D_BIGDATA) { 1779 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 1780 (char *) &chain, 1781 sizeof(chain)); 1782 BLSWAP(chain); 1783 (void) bcopy((char *) &chain, 1784 (char *) &(d->d_bytes[d->d_ksize]), 1785 sizeof(chain)); 1786 } 1787 } 1788 } 1789 } else { 1790 top = NEXTINDEX(h); 1791 for (i = 0; i < top; i++) { 1792 BLSWAP(h->h_linp[i]); 1793 id = (IDATUM *) GETDATUM(h, i); 1794 BLSWAP(id->i_size); 1795 BLSWAP(id->i_pgno); 1796 BLSWAP(id->i_flags); 1797 if (id->i_flags & D_BIGKEY) { 1798 (void) bcopy((char *) &(id->i_bytes[0]), 1799 (char *) &chain, 1800 sizeof(chain)); 1801 BLSWAP(chain); 1802 (void) bcopy((char *) &chain, 1803 (char *) &(id->i_bytes[0]), 1804 sizeof(chain)); 1805 } 1806 } 1807 } 1808 return (RET_SUCCESS); 1809 } 1810 1811 /* 1812 * BT_SYNC -- sync the btree to disk. 1813 * 1814 * Parameters: 1815 * tree -- btree to sync. 1816 * 1817 * Returns: 1818 * RET_SUCCESS, RET_ERROR. 1819 */ 1820 1821 bt_sync(tree) 1822 BTREE tree; 1823 { 1824 BTREE_P t = (BTREE_P) tree; 1825 BTHEADER *h; 1826 pgno_t pgno; 1827 1828 /* if this is an in-memory btree, syncing is a no-op */ 1829 if (!ISDISK(t)) 1830 return (RET_SUCCESS); 1831 1832 h = (BTHEADER *) t->bt_curpage; 1833 h->h_flags &= ~F_DIRTY; 1834 1835 if (ISCACHE(t)) { 1836 pgno = t->bt_curpage->h_pgno; 1837 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1838 return(RET_ERROR); 1839 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 1840 return (RET_ERROR); 1841 if (_bt_getpage(t, pgno) == RET_ERROR) 1842 return (RET_ERROR); 1843 } else { 1844 if (_bt_write(t, h, NORELEASE) == RET_ERROR) 1845 return (RET_ERROR); 1846 } 1847 1848 return (fsync(t->bt_s.bt_d.d_fd)); 1849 } 1850 1851 /* 1852 * _BT_INSERT -- Insert a new user datum in the btree. 1853 * 1854 * This routine is called by bt_put, the public interface, once the 1855 * location for the new item is known. We do the work here, and 1856 * handle splits if necessary. 1857 * 1858 * Parameters: 1859 * t -- btree in which to do the insertion. 1860 * item -- BTITEM describing location of new datum 1861 * key -- key to insert 1862 * data -- data associated with key 1863 * flag -- magic cookie passed recursively to bt_put if we 1864 * have to do a split 1865 * 1866 * Returns: 1867 * RET_SUCCESS, RET_ERROR. 1868 */ 1869 1870 static int 1871 _bt_insert(t, item, key, data, flag) 1872 BTREE_P t; 1873 BTITEM *item; 1874 DBT *key; 1875 DBT *data; 1876 int flag; 1877 { 1878 index_t index; 1879 BTHEADER *h; 1880 DATUM *d; 1881 int nbytes; 1882 int status; 1883 pgno_t pgno; 1884 int keysize, datasize; 1885 int bigkey, bigdata; 1886 1887 if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1888 return (RET_ERROR); 1889 h = t->bt_curpage; 1890 1891 if (TOOBIG(t, data->size)) { 1892 bigdata = TRUE; 1893 datasize = sizeof(pgno_t); 1894 } else { 1895 bigdata = FALSE; 1896 datasize = data->size; 1897 } 1898 1899 if (TOOBIG(t, key->size)) { 1900 bigkey = TRUE; 1901 keysize = sizeof(pgno_t); 1902 } else { 1903 bigkey = FALSE; 1904 keysize = key->size; 1905 } 1906 1907 nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char)); 1908 nbytes = LONGALIGN(nbytes) + sizeof(index_t); 1909 1910 /* if there's not enough room here, split the page */ 1911 if ((h->h_upper - h->h_lower) < nbytes) { 1912 if (_bt_split(t) == RET_ERROR) 1913 return (RET_ERROR); 1914 1915 /* okay, try again */ 1916 return (bt_put(t, key, data, flag)); 1917 } 1918 1919 /* put together a leaf page datum from the key/data pair */ 1920 index = item->bti_index; 1921 nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char)); 1922 1923 if ((d = (DATUM *) malloc((unsigned) nbytes)) == (DATUM *) NULL) 1924 return (RET_ERROR); 1925 1926 d->d_ksize = keysize; 1927 d->d_dsize = datasize; 1928 d->d_flags = 0; 1929 1930 if (bigkey) { 1931 if (_bt_indirect(t, key, &pgno) == RET_ERROR) 1932 return (RET_ERROR); 1933 (void) bcopy((char *) &pgno, &(d->d_bytes[0]), sizeof(pgno)); 1934 d->d_flags |= D_BIGKEY; 1935 if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1936 return (RET_ERROR); 1937 } else { 1938 if (d->d_ksize > 0) { 1939 (void) bcopy((char *) key->data, 1940 (char *) &(d->d_bytes[0]), 1941 (int) d->d_ksize); 1942 } 1943 } 1944 1945 if (bigdata) { 1946 if (_bt_indirect(t, data, &pgno) == RET_ERROR) 1947 return (RET_ERROR); 1948 (void) bcopy((char *) &pgno, 1949 &(d->d_bytes[keysize]), 1950 sizeof(pgno)); 1951 d->d_flags |= D_BIGDATA; 1952 if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1953 return (RET_ERROR); 1954 } else { 1955 if (d->d_dsize > 0) { 1956 (void) bcopy((char *) data->data, 1957 (char *) &(d->d_bytes[keysize]), 1958 (int) d->d_dsize); 1959 } 1960 } 1961 1962 /* do the insertion */ 1963 status = _bt_insertat(t, d, index); 1964 1965 (void) free((char *) d); 1966 1967 return (status); 1968 } 1969 1970 /* 1971 * _BT_INDIRECT -- Write a series of indirect pages for big objects. 1972 * 1973 * A chain of indirect pages looks like 1974 * 1975 * +-------------------+ +---------------------+ 1976 * |hdr|size| | |hdr| | 1977 * +---+----+ | +---+ | 1978 * | ... data ... | | ... data ... | ... 1979 * | | | | 1980 * +-------------------+ +---------------------+ 1981 * 1982 * where hdr is a standard btree page header (with the indirect bit 1983 * set), size on the first page is the real size of the datum, and 1984 * data are bytes of the datum, split across as many pages as necessary. 1985 * Indirect pages are chained together with the h_prevpg and h_nextpg 1986 * entries of the page header struct. 1987 * 1988 * A single DBT is written per chain, so space on the last page is 1989 * wasted. 1990 * 1991 * We return the page number of the start of the chain. 1992 * 1993 * When a big object is deleted from a tree, the space that it occupied 1994 * is placed on a free list for later reuse. This routine checks that 1995 * free list before allocating new pages to the big datum being inserted. 1996 * 1997 * Parameters: 1998 * t -- btree in which to store indirect blocks 1999 * data -- DBT with the big datum in it 2000 * pgno -- place to put the starting page number of the chain 2001 * 2002 * Returns: 2003 * RET_SUCCESS, RET_ERROR. 2004 * 2005 * Side Effects: 2006 * Current page is changed on return. 2007 */ 2008 2009 static int 2010 _bt_indirect(t, data, pgno) 2011 BTREE_P t; 2012 DBT *data; 2013 pgno_t *pgno; 2014 { 2015 pgno_t save; 2016 pgno_t prev; 2017 char *top; 2018 char *where; 2019 char *from; 2020 size_t dsize; 2021 pgno_t nextchn; 2022 int ischain; 2023 BTHEADER *cur; 2024 2025 /* get set for first page in chain */ 2026 prev = P_NONE; 2027 dsize = data->size; 2028 from = (char *) data->data; 2029 2030 /* if there are blocks on the free list, use them first */ 2031 if ((*pgno = t->bt_free) == P_NONE) { 2032 if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL) 2033 return (RET_ERROR); 2034 2035 ischain = 0; 2036 *pgno = cur->h_pgno = ++(t->bt_npages); 2037 } else { 2038 if (_bt_getpage(t, *pgno) == RET_ERROR) 2039 return (RET_ERROR); 2040 ischain = 1; 2041 cur = t->bt_curpage; 2042 } 2043 2044 cur->h_flags = F_CONT|F_LEAF; 2045 (void) bcopy((char *) &dsize, (char *) &cur->h_linp[0], sizeof(size_t)); 2046 where = ((char *) (&cur->h_linp[0])) + sizeof(size_t); 2047 2048 /* fill and write pages in the chain */ 2049 for (;;) { 2050 int nhere; 2051 2052 top = ((char *) cur) + t->bt_psize; 2053 cur->h_prevpg = prev; 2054 nextchn = cur->h_nextpg; 2055 nhere = (int) (top - where); 2056 2057 if (nhere >= dsize) { 2058 (void) bcopy(from, where, (int) dsize); 2059 cur->h_nextpg = P_NONE; 2060 dsize = 0; 2061 } else { 2062 (void) bcopy(from, where, (int) nhere); 2063 dsize -= nhere; 2064 from += nhere; 2065 if (nextchn == P_NONE) 2066 cur->h_nextpg = t->bt_npages + 1; 2067 prev = cur->h_pgno; 2068 } 2069 2070 /* current page is ready to go; write it out */ 2071 if (_bt_write(t, cur, RELEASE) == RET_ERROR) 2072 return (RET_ERROR); 2073 2074 /* free the current page, if appropriate */ 2075 if (ISDISK(t) && !ISCACHE(t) && !ischain) { 2076 (void) free ((char *) cur); 2077 } 2078 2079 /* done? */ 2080 if (dsize == 0) 2081 break; 2082 2083 /* allocate another page */ 2084 if (nextchn == P_NONE) { 2085 if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL) 2086 return (RET_ERROR); 2087 ischain = 0; 2088 cur->h_pgno = ++(t->bt_npages); 2089 } else { 2090 if (_bt_getpage(t, nextchn) == RET_ERROR) 2091 return (RET_ERROR); 2092 ischain = 1; 2093 cur = t->bt_curpage; 2094 } 2095 cur->h_flags = F_LEAF | F_CONT; 2096 2097 where = (char *) (&cur->h_linp[0]); 2098 } 2099 2100 /* if we used pages from the free list, record changes to it */ 2101 if (*pgno == t->bt_free) { 2102 t->bt_free = nextchn; 2103 t->bt_flags &= ~BTF_METAOK; 2104 } 2105 2106 return (RET_SUCCESS); 2107 } 2108 2109 /* 2110 * _BT_SPLIT -- Split a page into two pages. 2111 * 2112 * Splits are caused by insertions, and propogate up the tree in 2113 * the usual way. The root page is always page 1 in the file on 2114 * disk, so root splits are handled specially. On entry to this 2115 * routine, t->bt_curpage is the page to be split. 2116 * 2117 * Parameters: 2118 * t -- btree in which to do split. 2119 * 2120 * Returns: 2121 * RET_SUCCESS, RET_ERROR. 2122 * 2123 * Side Effects: 2124 * Changes the notion of the current page. 2125 */ 2126 2127 static 2128 _bt_split(t) 2129 BTREE_P t; 2130 { 2131 BTHEADER *h; 2132 BTHEADER *left, *right; 2133 BTHEADER *next; 2134 pgno_t nextpgno, parent; 2135 int nbytes, len; 2136 IDATUM *id; 2137 DATUM *d; 2138 char *src; 2139 IDATUM *new; 2140 pgno_t oldchain; 2141 u_char flags; 2142 2143 h = (BTHEADER *) t->bt_curpage; 2144 2145 /* split root page specially, since it must remain page 1 */ 2146 if (h->h_pgno == P_ROOT) { 2147 return (_bt_splitroot(t)); 2148 } 2149 2150 /* 2151 * This is a little complicated. We go to some trouble to 2152 * figure out which of the three possible cases -- in-memory tree, 2153 * disk tree (no cache), and disk tree (cache) -- we have, in order 2154 * to avoid unnecessary copying. If we have a disk cache, then we 2155 * have to do some extra copying, though, since the cache code 2156 * manages buffers externally to this code. 2157 */ 2158 2159 if (ISDISK(t) && ISCACHE(t)) { 2160 if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize)) 2161 == (BTHEADER *) NULL) 2162 return (RET_ERROR); 2163 left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE; 2164 left->h_flags = t->bt_curpage->h_flags; 2165 left->h_lower = (index_t) 2166 (((char *) &(left->h_linp[0])) - ((char *) left)); 2167 left->h_upper = t->bt_psize; 2168 2169 } else { 2170 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 2171 return (RET_ERROR); 2172 } 2173 left->h_pgno = h->h_pgno; 2174 2175 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 2176 return (RET_ERROR); 2177 right->h_pgno = ++(t->bt_npages); 2178 2179 /* now do the split */ 2180 if (_bt_dopsplit(t, left, right) == RET_ERROR) 2181 return (RET_ERROR); 2182 2183 right->h_prevpg = left->h_pgno; 2184 nextpgno = right->h_nextpg = h->h_nextpg; 2185 left->h_nextpg = right->h_pgno; 2186 left->h_prevpg = h->h_prevpg; 2187 2188 /* okay, now use the left half of the page as the new page */ 2189 if (ISDISK(t) && ISCACHE(t)) { 2190 (void) bcopy((char *) left, (char *) t->bt_curpage, 2191 (int) t->bt_psize); 2192 (void) free ((char *) left); 2193 left = t->bt_curpage; 2194 } else { 2195 (void) free((char *) t->bt_curpage); 2196 t->bt_curpage = left; 2197 } 2198 2199 /* 2200 * Write the new pages out. We need them to stay where they are 2201 * until we're done updating the parent pages. 2202 */ 2203 2204 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 2205 return (RET_ERROR); 2206 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 2207 return (RET_ERROR); 2208 2209 /* update 'prev' pointer of old neighbor of left */ 2210 if (nextpgno != P_NONE) { 2211 if (_bt_getpage(t, nextpgno) == RET_ERROR) 2212 return (RET_ERROR); 2213 h = t->bt_curpage; 2214 h->h_prevpg = right->h_pgno; 2215 h->h_flags |= F_DIRTY; 2216 } 2217 2218 if ((parent = _bt_pop(t)) != P_NONE) { 2219 if (right->h_flags & F_LEAF) { 2220 d = (DATUM *) GETDATUM(right, 0); 2221 len = d->d_ksize; 2222 if (d->d_flags & D_BIGKEY) { 2223 bcopy(&(d->d_bytes[0]), 2224 (char *) &oldchain, 2225 sizeof(oldchain)); 2226 if (_bt_markchain(t, oldchain) == RET_ERROR) 2227 return (RET_ERROR); 2228 src = (char *) &oldchain; 2229 flags = D_BIGKEY; 2230 } else { 2231 src = (char *) &(d->d_bytes[0]); 2232 flags = 0; 2233 } 2234 } else { 2235 id = (IDATUM *) GETDATUM(right, 0); 2236 len = id->i_size; 2237 flags = id->i_flags; 2238 src = (char *) &(id->i_bytes[0]); 2239 } 2240 nbytes = len + (sizeof(IDATUM) - sizeof(char)); 2241 new = (IDATUM *) malloc((unsigned) nbytes); 2242 if (new == (IDATUM *) NULL) 2243 return (RET_ERROR); 2244 new->i_size = len; 2245 new->i_pgno = right->h_pgno; 2246 new->i_flags = flags; 2247 if (len > 0) 2248 (void) bcopy(src, (char *) &(new->i_bytes[0]), len); 2249 2250 nbytes = LONGALIGN(nbytes) + sizeof(index_t); 2251 if (_bt_getpage(t, parent) == RET_ERROR) 2252 return (RET_ERROR); 2253 2254 h = t->bt_curpage; 2255 2256 /* 2257 * Split the parent if we need to, then reposition the 2258 * tree's current page pointer for the new datum. 2259 */ 2260 if ((h->h_upper - h->h_lower) < nbytes) { 2261 if (_bt_split(t) == RET_ERROR) 2262 return (RET_ERROR); 2263 if (_bt_reposition(t, new, parent, right->h_prevpg) 2264 == RET_ERROR) 2265 return (RET_ERROR); 2266 } 2267 2268 /* okay, now insert the new idatum */ 2269 if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR) 2270 return (RET_ERROR); 2271 } 2272 2273 /* 2274 * Okay, split is done; don't need right page stapled down anymore. 2275 * The page we call 'left' above is the new version of the old 2276 * (split) page, so we can't release it. 2277 */ 2278 2279 if (_bt_release(t, right) == RET_ERROR) 2280 return (RET_ERROR); 2281 if (ISDISK(t) && !ISCACHE(t)) 2282 (void) free((char *) right); 2283 2284 return (RET_SUCCESS); 2285 } 2286 2287 /* 2288 * _BT_MARKCHAIN -- Mark a chain of pages as used by an internal node. 2289 * 2290 * Chains of indirect blocks pointed to by leaf nodes get reclaimed 2291 * when the item that points to them gets deleted. Chains pointed 2292 * to by internal nodes never get deleted. This routine marks a 2293 * chain as pointed to by an internal node. 2294 * 2295 * Parameters: 2296 * t -- tree in which to mark 2297 * chain -- number of first page in chain 2298 * 2299 * Returns: 2300 * RET_SUCCESS, RET_ERROR. 2301 * 2302 * Side Effects: 2303 * None. 2304 */ 2305 2306 static int 2307 _bt_markchain(t, chain) 2308 BTREE_P t; 2309 pgno_t chain; 2310 { 2311 pgno_t save; 2312 2313 save = t->bt_curpage->h_pgno; 2314 2315 if (_bt_getpage(t, chain) == RET_ERROR) 2316 return (RET_ERROR); 2317 2318 t->bt_curpage->h_flags |= (F_DIRTY|F_PRESERVE); 2319 2320 if (_bt_getpage(t, save) == RET_ERROR) 2321 return (RET_ERROR); 2322 2323 return (RET_SUCCESS); 2324 } 2325 2326 /* 2327 * _BT_REPOSITION -- Reposition the current page pointer of a btree. 2328 * 2329 * After splitting a node in the tree in order to make room for 2330 * an insertion, we need to figure out which page after the split 2331 * should get the item we want to insert. This routine positions 2332 * the tree's current page pointer appropriately. 2333 * 2334 * Parameters: 2335 * t -- tree to position 2336 * new -- the item we want to insert 2337 * parent -- parent of the node that we just split 2338 * prev -- page number of item directly to the left of 2339 * new's position in the tree. 2340 * 2341 * Returns: 2342 * RET_SUCCESS, RET_ERROR. 2343 * 2344 * Side Effects: 2345 * None. 2346 */ 2347 2348 static int 2349 _bt_reposition(t, new, parent, prev) 2350 BTREE_P t; 2351 IDATUM *new; 2352 pgno_t parent; 2353 pgno_t prev; 2354 { 2355 index_t i, next; 2356 IDATUM *idx; 2357 2358 if (parent == P_ROOT) { 2359 2360 /* 2361 * If we just split the root page, then there are guaranteed 2362 * to be exactly two IDATUMs on it. Look at both of them 2363 * to see if they point to the page that we want. 2364 */ 2365 2366 if (_bt_getpage(t, parent) == RET_ERROR) 2367 return (RET_ERROR); 2368 2369 next = NEXTINDEX(t->bt_curpage); 2370 for (i = 0; i < next; i++) { 2371 idx = (IDATUM *) GETDATUM(t->bt_curpage, i); 2372 if (_bt_getpage(t, idx->i_pgno) == RET_ERROR) 2373 return (RET_ERROR); 2374 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2375 return (RET_SUCCESS); 2376 if (_bt_getpage(t, parent) == RET_ERROR) 2377 return (RET_ERROR); 2378 } 2379 } else { 2380 2381 /* 2382 * Get the parent page -- which is where the new item would 2383 * have gone -- and figure out whether the new item now goes 2384 * on the parent, or the page immediately to the right of 2385 * the parent. 2386 */ 2387 2388 if (_bt_getpage(t, parent) == RET_ERROR) 2389 return (RET_ERROR); 2390 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2391 return (RET_SUCCESS); 2392 if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR) 2393 return (RET_ERROR); 2394 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2395 return (RET_SUCCESS); 2396 } 2397 return (RET_ERROR); 2398 } 2399 2400 /* 2401 * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page? 2402 * 2403 * This routine is used by _bt_reposition to decide whether the current 2404 * page is the correct one on which to insert a new item. 2405 * 2406 * Parameters: 2407 * t -- tree to check 2408 * new -- the item that will be inserted (used for binary search) 2409 * prev -- page number of page whose IDATUM is immediately to 2410 * the left of new's position in the tree. 2411 * 2412 * Returns: 2413 * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE). 2414 */ 2415 2416 static int 2417 _bt_isonpage(t, new, prev) 2418 BTREE_P t; 2419 IDATUM *new; 2420 pgno_t prev; 2421 { 2422 BTHEADER *h = (BTHEADER *) t->bt_curpage; 2423 index_t i, next; 2424 IDATUM *idx; 2425 2426 i = _bt_binsrch(t, &(new->i_bytes[0])); 2427 while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0) 2428 --i; 2429 next = NEXTINDEX(h); 2430 idx = (IDATUM *) GETDATUM(h, i); 2431 while (i < next && idx->i_pgno != prev) { 2432 i++; 2433 idx = (IDATUM *) GETDATUM(h, i); 2434 } 2435 if (idx->i_pgno == prev) 2436 return (RET_SUCCESS); 2437 else 2438 return (RET_ERROR); 2439 } 2440 2441 /* 2442 * _BT_SPLITROOT -- Split the root of a btree. 2443 * 2444 * The root page for a btree is always page one. This means that in 2445 * order to split the root, we need to do extra work. 2446 * 2447 * Parameters: 2448 * t -- tree to split 2449 * 2450 * Returns: 2451 * RET_SUCCESS, RET_ERROR. 2452 * 2453 * Side Effects: 2454 * Splits root upward in the usual way, adding two new pages 2455 * to the tree (rather than just one, as in usual splits). 2456 */ 2457 2458 static 2459 _bt_splitroot(t) 2460 BTREE_P t; 2461 { 2462 BTHEADER *h = t->bt_curpage; 2463 BTHEADER *left, *right; 2464 DATUM *d; 2465 IDATUM *id; 2466 char *where; 2467 BTHEADER *where_h; 2468 pgno_t lastpg; 2469 char *src, *dest; 2470 int len, nbytes; 2471 u_long was_leaf; 2472 pgno_t oldchain; 2473 u_char flags; 2474 2475 /* get two new pages for the split */ 2476 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 2477 return (RET_ERROR); 2478 left->h_pgno = ++(t->bt_npages); 2479 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 2480 return (RET_ERROR); 2481 right->h_pgno = ++(t->bt_npages); 2482 2483 /* do the split */ 2484 if (_bt_dopsplit(t, left, right) == RET_ERROR) 2485 return (RET_ERROR); 2486 2487 /* connect the new pages correctly */ 2488 right->h_prevpg = left->h_pgno; 2489 left->h_nextpg = right->h_pgno; 2490 2491 /* 2492 * Write the child pages out now. We need them to remain 2493 * where they are until we finish updating parent pages, 2494 * however. 2495 */ 2496 2497 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 2498 return (RET_ERROR); 2499 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 2500 return (RET_ERROR); 2501 2502 /* now change the root page into an internal page */ 2503 was_leaf = (h->h_flags & F_LEAF); 2504 h->h_flags &= ~F_LEAF; 2505 h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h)); 2506 h->h_upper = (index_t) t->bt_psize; 2507 (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower)); 2508 2509 /* put two new keys on root page */ 2510 where_h = left; 2511 while (where_h) { 2512 DATUM *data; 2513 IDATUM *idata; 2514 2515 if (was_leaf) { 2516 data = (DATUM *) GETDATUM(where_h, 0); 2517 2518 if (where_h == left) { 2519 len = 0; /* first key in tree is null */ 2520 } else { 2521 if (data->d_flags & D_BIGKEY) { 2522 bcopy(&(data->d_bytes[0]), 2523 (char *) &oldchain, 2524 sizeof(oldchain)); 2525 if (_bt_markchain(t, oldchain) == RET_ERROR) 2526 return (RET_ERROR); 2527 src = (char *) &oldchain; 2528 flags = D_BIGKEY; 2529 } else { 2530 src = (char *) &(data->d_bytes[0]); 2531 flags = 0; 2532 } 2533 len = data->d_ksize; 2534 } 2535 } else { 2536 idata = (IDATUM *) GETDATUM(where_h, 0); 2537 len = idata->i_size; 2538 flags = idata->i_flags; 2539 src = &(idata->i_bytes[0]); 2540 } 2541 dest = ((char *) h) + h->h_upper; 2542 nbytes = len + (sizeof (IDATUM) - sizeof(char)); 2543 dest -= LONGALIGN(nbytes); 2544 id = (IDATUM *) dest; 2545 id->i_size = len; 2546 id->i_pgno = where_h->h_pgno; 2547 id->i_flags = flags; 2548 if (len > 0) 2549 (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len); 2550 dest -= ((int) h); 2551 h->h_linp[NEXTINDEX(h)] = (index_t) dest; 2552 h->h_upper = (index_t) dest; 2553 h->h_lower += sizeof(index_t); 2554 2555 /* next page */ 2556 if (where_h == left) 2557 where_h = right; 2558 else 2559 where_h = (BTHEADER *) NULL; 2560 } 2561 2562 if (_bt_release(t, left) == RET_ERROR) 2563 return (RET_ERROR); 2564 if (_bt_release(t, right) == RET_ERROR) 2565 return (RET_ERROR); 2566 2567 /* 2568 * That's it, split is done. If we're doing a non-cached disk 2569 * btree, we can free up the pages we allocated, as they're on 2570 * disk, now. 2571 */ 2572 2573 if (ISDISK(t) && !ISCACHE(t)) { 2574 (void) free ((char *) left); 2575 (void) free ((char *) right); 2576 } 2577 2578 h->h_flags |= F_DIRTY; 2579 2580 return (RET_SUCCESS); 2581 } 2582 2583 /* 2584 * _BT_DOPSPLIT -- Do the work of splitting a page 2585 * 2586 * This routine takes two page pointers and splits the data on the 2587 * current page of the btree between them. 2588 * 2589 * We do a lot of work here to handle duplicate keys on a page; we 2590 * have to place these keys carefully to guarantee that later searches 2591 * will find them correctly. See comments in the code below for details. 2592 * 2593 * Parameters: 2594 * t -- tree to split 2595 * left -- pointer to page to get left half of the data 2596 * right -- pointer to page to get right half of the data 2597 * 2598 * Returns: 2599 * None. 2600 */ 2601 2602 static 2603 _bt_dopsplit(t, left, right) 2604 BTREE_P t; 2605 BTHEADER *left; 2606 BTHEADER *right; 2607 { 2608 BTHEADER *h = t->bt_curpage; 2609 size_t psize; 2610 char *where; 2611 BTHEADER *where_h; 2612 index_t where_i; 2613 int nbytes, dsize, fixedsize, freespc; 2614 index_t i; 2615 index_t save_lower, save_upper, save_i; 2616 index_t switch_i; 2617 char *save_key; 2618 DATUM *d; 2619 CURSOR *c; 2620 index_t top; 2621 int free_save; 2622 pgno_t chain; 2623 int ignore; 2624 2625 /* 2626 * Our strategy is to put half the bytes on each page. We figure 2627 * out how many bytes we have total, and then move items until 2628 * the last item moved put at least 50% of the data on the left 2629 * page. 2630 */ 2631 save_key = (char *) NULL; 2632 psize = (int) t->bt_psize; 2633 where = ((char *) left) + psize; 2634 where_h = left; 2635 where_i = 0; 2636 nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h)); 2637 freespc = nbytes; 2638 2639 top = NEXTINDEX(h); 2640 if (h->h_flags & F_LEAF) 2641 fixedsize = (sizeof(DATUM) - sizeof(char)); 2642 else 2643 fixedsize = (sizeof(IDATUM) - sizeof(char)); 2644 2645 save_key = (char *) NULL; 2646 2647 /* move data */ 2648 for (i = 0; i < top; i++) { 2649 2650 /* 2651 * Internal and leaf pages have different layouts for 2652 * data items, but in both cases the first entry in the 2653 * data item is a size_t. 2654 */ 2655 d = (DATUM *) GETDATUM(h,i); 2656 if (h->h_flags & F_LEAF) { 2657 dsize = d->d_ksize + d->d_dsize + fixedsize; 2658 } else { 2659 dsize = d->d_ksize + fixedsize; 2660 } 2661 2662 /* 2663 * If a page contains duplicate keys, we have to be 2664 * careful about splits. A sequence of duplicate keys 2665 * may not begin in the middle of one page and end in 2666 * the middle of another; it must begin on a page boundary, 2667 * in order for searches on the internal nodes to work 2668 * correctly. 2669 */ 2670 if (where_h == left) { 2671 if (save_key == (char *) NULL) { 2672 if (h->h_flags & F_LEAF) { 2673 if (d->d_flags & D_BIGKEY) { 2674 free_save = TRUE; 2675 bcopy(&(d->d_bytes[0]), 2676 (char *) &chain, 2677 sizeof(chain)); 2678 if (_bt_getbig(t, chain, 2679 &save_key, 2680 &ignore) 2681 == RET_ERROR) 2682 return (RET_ERROR); 2683 } else { 2684 free_save = FALSE; 2685 save_key = (char *) &(d->d_bytes[0]); 2686 } 2687 } else { 2688 IDATUM *id = (IDATUM *) d; 2689 2690 if (id->i_flags & D_BIGKEY) { 2691 free_save = TRUE; 2692 bcopy(&(id->i_bytes[0]), 2693 (char *) &chain, 2694 sizeof(chain)); 2695 if (_bt_getbig(t, chain, 2696 &save_key, 2697 &ignore) 2698 == RET_ERROR) 2699 return (RET_ERROR); 2700 } else { 2701 free_save = FALSE; 2702 save_key = (char *) 2703 &(id->i_bytes[0]); 2704 } 2705 } 2706 save_i = 0; 2707 save_lower = where_h->h_lower; 2708 save_upper = where_h->h_upper; 2709 } else { 2710 if (_bt_cmp(t, save_key, i) != 0) { 2711 save_lower = where_h->h_lower; 2712 save_upper = where_h->h_upper; 2713 save_i = i; 2714 } 2715 if (h->h_flags & F_LEAF) { 2716 if (free_save) 2717 (void) free(save_key); 2718 if (d->d_flags & D_BIGKEY) { 2719 free_save = TRUE; 2720 bcopy(&(d->d_bytes[0]), 2721 (char *) &chain, 2722 sizeof(chain)); 2723 if (_bt_getbig(t, chain, 2724 &save_key, 2725 &ignore) 2726 == RET_ERROR) 2727 return (RET_ERROR); 2728 } else { 2729 free_save = FALSE; 2730 save_key = (char *) &(d->d_bytes[0]); 2731 } 2732 } else { 2733 IDATUM *id = (IDATUM *) d; 2734 2735 if (id->i_flags & D_BIGKEY) { 2736 free_save = TRUE; 2737 bcopy(&(id->i_bytes[0]), 2738 (char *) &chain, 2739 sizeof(chain)); 2740 if (_bt_getbig(t, chain, 2741 &save_key, 2742 &ignore) 2743 == RET_ERROR) 2744 return (RET_ERROR); 2745 } else { 2746 free_save = FALSE; 2747 save_key = (char *) 2748 &(id->i_bytes[0]); 2749 } 2750 } 2751 } 2752 } 2753 2754 /* copy data and update page state */ 2755 where -= LONGALIGN(dsize); 2756 (void) bcopy((char *) d, (char *) where, dsize); 2757 where_h->h_upper = where_h->h_linp[where_i] = 2758 (index_t) (where - (int) where_h); 2759 where_h->h_lower += sizeof(index_t); 2760 where_i++; 2761 2762 /* if we've moved half, switch to the right-hand page */ 2763 nbytes -= LONGALIGN(dsize) + sizeof(index_t); 2764 if ((freespc - nbytes) > nbytes) { 2765 nbytes = 2 * freespc; 2766 2767 /* identical keys go on the same page */ 2768 if (save_i > 0) { 2769 /* i gets incremented at loop bottom... */ 2770 i = save_i - 1; 2771 where_h->h_lower = save_lower; 2772 where_h->h_upper = save_upper; 2773 } 2774 where = ((char *) right) + psize; 2775 where_h = right; 2776 switch_i = where_i; 2777 where_i = 0; 2778 } 2779 } 2780 2781 /* 2782 * If there was an active scan on the database, and we just 2783 * split the page that the cursor was on, we may need to 2784 * adjust the cursor to point to the same entry as before the 2785 * split. 2786 */ 2787 2788 c = &(t->bt_cursor); 2789 if ((t->bt_flags & BTF_SEQINIT) 2790 && (c->c_pgno == h->h_pgno) 2791 && (c->c_index >= switch_i)) { 2792 c->c_pgno = where_h->h_pgno; 2793 c->c_index -= where_i; 2794 } 2795 return (RET_SUCCESS); 2796 } 2797 2798 /* 2799 * _BT_INSERTI -- Insert IDATUM on current page in the btree. 2800 * 2801 * This routine handles insertions to internal pages after splits 2802 * lower in the tree. On entry, t->bt_curpage is the page to get 2803 * the new IDATUM. We are also given pgno, the page number of the 2804 * IDATUM that is immediately left of the new IDATUM's position. 2805 * This guarantees that the IDATUM for the right half of the page 2806 * after a split goes next to the IDATUM for its left half. 2807 * 2808 * Parameters: 2809 * t -- tree in which to do insertion. 2810 * id -- new IDATUM to insert 2811 * pgno -- page number of IDATUM left of id's position 2812 * 2813 * Returns: 2814 * RET_SUCCESS, RET_ERROR. 2815 */ 2816 2817 static int 2818 _bt_inserti(t, id, pgno) 2819 BTREE_P t; 2820 IDATUM *id; 2821 pgno_t pgno; 2822 { 2823 BTHEADER *h = t->bt_curpage; 2824 index_t next, i; 2825 IDATUM *idx; 2826 char *key; 2827 pgno_t chain; 2828 int free_key; 2829 int ignore; 2830 2831 if (id->i_flags & D_BIGKEY) { 2832 free_key = TRUE; 2833 bcopy(&(id->i_bytes[0]), (char *) &chain, sizeof(chain)); 2834 if (_bt_getbig(t, chain, &key, &ignore) == RET_ERROR) 2835 return (RET_ERROR); 2836 } else { 2837 free_key = FALSE; 2838 key = &(id->i_bytes[0]); 2839 } 2840 i = _bt_binsrch(t, key); 2841 2842 next = NEXTINDEX(h); 2843 while (i < next && _bt_cmp(t, key, i) >= 0) 2844 i++; 2845 2846 if (free_key) 2847 (void) free(key); 2848 2849 /* okay, now we're close; find adjacent IDATUM */ 2850 for (;;) { 2851 idx = (IDATUM *) GETDATUM(h,i); 2852 if (idx->i_pgno == pgno) { 2853 i++; 2854 break; 2855 } 2856 --i; 2857 } 2858 2859 /* correctly positioned, do the insertion */ 2860 return (_bt_insertat(t, id, i)); 2861 } 2862 2863 /* 2864 * _BT_INSERTAT -- Insert a datum at a given location on the current page. 2865 * 2866 * This routine does insertions on both leaf and internal pages. 2867 * 2868 * Parameters: 2869 * t -- tree in which to do insertion. 2870 * p -- DATUM or IDATUM to insert. 2871 * index -- index in line pointer array to put this item. 2872 * 2873 * Returns: 2874 * RET_SUCCESS, RET_ERROR. 2875 * 2876 * Side Effects: 2877 * Will rearrange line pointers to make space for the new 2878 * entry. This means that any scans currently active are 2879 * invalid after this. 2880 * 2881 * Warnings: 2882 * There must be sufficient room for the new item on the page. 2883 */ 2884 2885 static int 2886 _bt_insertat(t, p, index) 2887 BTREE_P t; 2888 char *p; 2889 index_t index; 2890 { 2891 IDATUM *id = (IDATUM *) p; 2892 DATUM *d = (DATUM *) p; 2893 BTHEADER *h; 2894 CURSOR *c; 2895 index_t nxtindex; 2896 char *src, *dest; 2897 int nbytes; 2898 2899 /* insertion may confuse an active scan. fix it. */ 2900 c = &(t->bt_cursor); 2901 if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno) 2902 if (_bt_fixscan(t, index, d, INSERT) == RET_ERROR) 2903 return (RET_ERROR); 2904 2905 h = t->bt_curpage; 2906 nxtindex = (index_t) NEXTINDEX(h); 2907 2908 /* 2909 * If we're inserting at the middle of the line pointer array, 2910 * copy pointers that will follow the new one up on the page. 2911 */ 2912 2913 if (index < nxtindex) { 2914 src = (char *) &(h->h_linp[index]); 2915 dest = (char *) &(h->h_linp[index + 1]); 2916 nbytes = (h->h_lower - (src - ((char *) h))) 2917 + sizeof(h->h_linp[0]); 2918 (void) bcopy(src, dest, nbytes); 2919 } 2920 2921 /* compute size and copy data to page */ 2922 if (h->h_flags & F_LEAF) { 2923 nbytes = d->d_ksize + d->d_dsize 2924 + (sizeof(DATUM) - sizeof(char)); 2925 } else { 2926 nbytes = id->i_size + (sizeof(IDATUM) - sizeof(char)); 2927 } 2928 dest = (((char *) h) + h->h_upper) - LONGALIGN(nbytes); 2929 (void) bcopy((char *) p, dest, nbytes); 2930 2931 /* update statistics */ 2932 dest -= (int) h; 2933 h->h_linp[index] = (index_t) dest; 2934 h->h_upper = (index_t) dest; 2935 h->h_lower += sizeof(index_t); 2936 2937 /* we're done */ 2938 h->h_flags |= F_DIRTY; 2939 2940 return (RET_SUCCESS); 2941 } 2942 2943 /* 2944 * _BT_BINSRCH -- Do a binary search for a given key on the current page. 2945 * 2946 * Searches on internal pages are handled slightly differently from 2947 * searches on leaf pages. This is because internal page searches 2948 * find the largest item <= key in the tree, and leaf searches find 2949 * the smallest item >= key. This guarantees that leaf page searches 2950 * leave us pointing at the item's correct position, and internal 2951 * searches descend the tree correctly. 2952 * 2953 * Parameters: 2954 * t -- tree to search 2955 * key -- key we're looking for 2956 * 2957 * Returns: 2958 * Index of the line pointer array entry for the (closest) 2959 * match to key on the current page, with "closest" as defined 2960 * above. 2961 */ 2962 2963 static index_t 2964 _bt_binsrch(t, key) 2965 BTREE_P t; 2966 char *key; 2967 { 2968 index_t lbound, ubound, cur; 2969 BTHEADER *h = t->bt_curpage; 2970 IDATUM *id; 2971 int match = 0; 2972 int r; 2973 2974 lbound = 0; 2975 ubound = NEXTINDEX(h); 2976 if (ubound > 0) 2977 --ubound; 2978 2979 /* do a binary search on the current page */ 2980 while ((ubound - lbound) > 1) { 2981 cur = lbound + ((ubound - lbound) / 2); 2982 r = _bt_cmp(t, key, cur); 2983 2984 if (r > 0) 2985 lbound = cur + 1; 2986 else if (r < 0) 2987 ubound = cur; 2988 else { 2989 match++; 2990 break; 2991 } 2992 } 2993 2994 /* 2995 * At this point, the binary search terminated because the endpoints 2996 * got too close together, or we have a match. Figure out which 2997 * case applies, decide what to do based on the page type (leaf or 2998 * internal), and do the right thing. 2999 */ 3000 if (match) { 3001 return (cur); 3002 } else if (ubound != lbound) { 3003 if (h->h_flags & F_LEAF) { 3004 r = _bt_cmp(t, key, lbound); 3005 if (r <= 0) { 3006 return (lbound); 3007 } 3008 } else { 3009 r = _bt_cmp(t, key, ubound); 3010 3011 /* for internal nodes, move as far left as possible */ 3012 if (r < 0) { 3013 r = _bt_cmp(t, key, lbound); 3014 if (r < 0 && lbound > 0) 3015 --lbound; 3016 return (lbound); 3017 } else { 3018 return (ubound); 3019 } 3020 } 3021 } 3022 3023 if (h->h_flags & F_LEAF) { 3024 if (ubound < NEXTINDEX(h)) { 3025 r = _bt_cmp(t, key, ubound); 3026 if (r > 0) 3027 ubound++; 3028 } 3029 } else { 3030 /* for internal pages, move as far left as possible */ 3031 if (ubound == NEXTINDEX(h)) 3032 ubound--; 3033 3034 while (_bt_cmp(t, key, ubound) < 0) 3035 ubound--; 3036 } 3037 return (ubound); 3038 } 3039 3040 /* 3041 * BT_SEQ -- Sequential scan interface. 3042 * 3043 * This routine supports sequential scans on the btree. If called with 3044 * flags set to R_CURSOR, or if no seq scan has been initialized in the 3045 * current tree, we initialize the scan. Otherwise, we advance the scan 3046 * and return the next item. 3047 * 3048 * Scans can be either keyed or non-keyed. Keyed scans basically have 3049 * a starting point somewhere in the middle of the tree. Non-keyed 3050 * scans start at an endpoint. Also, scans can be backward (descending 3051 * order), forward (ascending order), or no movement (keep returning 3052 * the same item). 3053 * 3054 * Flags is checked every time we enter the routine, so the user can 3055 * change directions on an active scan if desired. The key argument 3056 * is examined only when we initialize the scan, in order to position 3057 * it properly. 3058 * 3059 * Items are returned via the key and data arguments passed in. 3060 * 3061 * Parameters: 3062 * tree -- btree in which to do scan 3063 * key -- key, used to position scan on initialization, and 3064 * used to return key components to the user. 3065 * data -- used to return data components to the user. 3066 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 3067 * R_PREV. 3068 * 3069 * Returns: 3070 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 3071 * exists in the tree in the specified direction. 3072 * 3073 * Side Effects: 3074 * Changes the btree's notion of the current position in the 3075 * scan. 3076 * 3077 * Warnings: 3078 * The key and data items returned are static and will be 3079 * overwritten by the next bt_get or bt_seq. 3080 */ 3081 3082 bt_seq(tree, key, data, flags) 3083 BTREE tree; 3084 DBT *key; 3085 DBT *data; 3086 int flags; 3087 { 3088 BTREE_P t = (BTREE_P) tree; 3089 BTHEADER *h; 3090 DATUM *d; 3091 int status; 3092 3093 /* do we need to initialize the scan? */ 3094 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 3095 || !(t->bt_flags & BTF_SEQINIT)) { 3096 3097 /* initialize it */ 3098 status = _bt_seqinit(t, key, flags); 3099 } else { 3100 /* just advance the current scan pointer */ 3101 status = _bt_seqadvance(t, flags); 3102 } 3103 3104 key->size = data->size = 0; 3105 key->data = data->data = (char *) NULL; 3106 3107 h = t->bt_curpage; 3108 3109 /* is there a valid item at the current scan location? */ 3110 if (status == RET_SPECIAL) { 3111 if (flags == R_NEXT) { 3112 if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 3113 if (NEXTINDEX(h) > 0) 3114 t->bt_cursor.c_index = NEXTINDEX(h) - 1; 3115 else 3116 t->bt_cursor.c_index = 0; 3117 } 3118 } else { 3119 t->bt_cursor.c_index = 0; 3120 } 3121 return (RET_SPECIAL); 3122 } else if (status == RET_ERROR) 3123 return (RET_ERROR); 3124 3125 /* okay, return the data */ 3126 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 3127 3128 return (_bt_buildret(t, d, data, key)); 3129 } 3130 3131 /* 3132 * _BT_BUILDRET -- Build return key/data pair as a result of search or scan. 3133 * 3134 * This routine manages the statically allocated buffers in which we 3135 * return data to the user. 3136 * 3137 * Parameters: 3138 * t -- btree from which to return datum 3139 * d -- DATUM to be returned to the user. 3140 * data -- data argument supplied by user for return 3141 * key -- key argument supplied by user for return 3142 * 3143 * Returns: 3144 * RET_SUCCESS, RET_ERROR. 3145 * 3146 * Side Effects: 3147 * May free and reallocate static buffers, if the data 3148 * we want to return is bigger than the space we have to 3149 * do so. 3150 */ 3151 3152 static int 3153 _bt_buildret(t, d, data, key) 3154 BTREE_P t; 3155 DATUM *d; 3156 DBT *data; 3157 DBT *key; 3158 { 3159 static int _data_s = 0; 3160 static int _key_s = 0; 3161 static char *_data = (char *) NULL; 3162 static char *_key = (char *) NULL; 3163 char *from, *where, *top; 3164 pgno_t save; 3165 pgno_t chain; 3166 size_t nbytes; 3167 size_t nhere; 3168 BTHEADER *h; 3169 3170 if (d->d_flags & D_BIGKEY) { 3171 if (_key != (char *) NULL) 3172 (void) free(_key); 3173 (void) bcopy((char *) &(d->d_bytes[0]), 3174 (char *) &chain, 3175 sizeof(chain)); 3176 if (_bt_getbig(t, chain, &_key, &_key_s) == RET_ERROR) 3177 return (RET_ERROR); 3178 key->data = _key; 3179 key->size = _key_s; 3180 } else { 3181 /* need more space for key? */ 3182 if (d->d_ksize > _key_s) { 3183 if (_key != (char *) NULL) 3184 (void) free (_key); 3185 if ((_key = (char *) malloc((unsigned) d->d_ksize)) 3186 == (char *) NULL) 3187 return (RET_ERROR); 3188 _key_s = d->d_ksize; 3189 } 3190 3191 key->data = _key; 3192 if ((key->size = d->d_ksize) > 0) 3193 (void) bcopy(&(d->d_bytes[0]), 3194 _key, 3195 (int) d->d_ksize); 3196 } 3197 3198 if (d->d_flags & D_BIGDATA) { 3199 if (_data != (char *) NULL) 3200 (void) free(_data); 3201 (void) bcopy(&(d->d_bytes[d->d_ksize]), 3202 (char *) &chain, 3203 sizeof(chain)); 3204 if (_bt_getbig(t, chain, &_data, &_data_s) == RET_ERROR) 3205 return (RET_ERROR); 3206 data->data = _data; 3207 data->size = _data_s; 3208 } else { 3209 /* need more space for data? */ 3210 if (d->d_dsize > _data_s) { 3211 if (_data != (char *) NULL) 3212 (void) free (_data); 3213 if ((_data = (char *) malloc((unsigned) d->d_dsize)) 3214 == (char *) NULL) 3215 return (RET_ERROR); 3216 _data_s = d->d_dsize; 3217 } 3218 3219 data->data = _data; 3220 3221 if ((data->size = d->d_dsize) > 0) 3222 (void) bcopy(&(d->d_bytes[d->d_ksize]), 3223 _data, 3224 d->d_dsize); 3225 } 3226 3227 return (RET_SUCCESS); 3228 } 3229 3230 /* 3231 * _BT_SEQINIT -- Initialize a sequential scan on the btree. 3232 * 3233 * Sets the tree's notion of the current scan location correctly 3234 * given a key and a direction. 3235 * 3236 * Parameters: 3237 * t -- tree in which to initialize scan 3238 * key -- key for initial scan position 3239 * flags -- R_NEXT, R_PREV 3240 * 3241 * Returns: 3242 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there's no data 3243 * in the tree to scan. 3244 * 3245 * Side Effects: 3246 * Changes current scan position for the tree. Almost certainly 3247 * changes current page, as well. Sets BTF_SEQINIT bit in tree 3248 * flags, so that we know we've initialized a scan. 3249 */ 3250 3251 static int 3252 _bt_seqinit(t, key, flags) 3253 BTREE_P t; 3254 DBT *key; 3255 int flags; 3256 { 3257 BTITEM *item; 3258 BTHEADER *h; 3259 CURSOR *c; 3260 IDATUM *id; 3261 pgno_t pgno; 3262 index_t index; 3263 index_t last; 3264 3265 /* 3266 * Figure out if we really have to search for the key that the 3267 * user supplied. If key is null, then this is an unkeyed scan 3268 * and we can just start from an endpoint. 3269 */ 3270 3271 c = &(t->bt_cursor); 3272 3273 if (flags == R_CURSOR) { 3274 if (key->data != (char *) NULL) { 3275 3276 /* key supplied, find it */ 3277 item = _bt_search(t, key); 3278 c->c_index = item->bti_index; 3279 c->c_pgno = t->bt_curpage->h_pgno; 3280 } else { 3281 errno = EINVAL; 3282 return (RET_ERROR); 3283 } 3284 3285 } else { 3286 3287 /* 3288 * Unkeyed scan. For backward scans, find the last item 3289 * in the tree; for forward scans, find the first item. 3290 */ 3291 3292 if (_bt_getpage(t, P_ROOT) == RET_ERROR) 3293 return (RET_ERROR); 3294 h = t->bt_curpage; 3295 if (flags == R_LAST || flags == R_PREV) { 3296 3297 /* backward scan */ 3298 while (!(h->h_flags & F_LEAF)) { 3299 last = NEXTINDEX(h) - 1; 3300 id = (IDATUM *) GETDATUM(h,last); 3301 if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 3302 return (RET_ERROR); 3303 h = t->bt_curpage; 3304 } 3305 3306 /* skip empty pages */ 3307 while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE) { 3308 if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 3309 return (RET_ERROR); 3310 h = t->bt_curpage; 3311 } 3312 3313 c->c_pgno = h->h_pgno; 3314 if (NEXTINDEX(h) > 0) 3315 c->c_index = NEXTINDEX(h) - 1; 3316 else 3317 c->c_index = 0; 3318 } else if (flags == R_FIRST || flags == R_NEXT) { 3319 /* forward scan */ 3320 while (!(h->h_flags & F_LEAF)) { 3321 id = (IDATUM *) GETDATUM(h,0); 3322 if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 3323 return (RET_ERROR); 3324 h = t->bt_curpage; 3325 } 3326 3327 /* skip empty pages */ 3328 while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE) { 3329 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 3330 return (RET_ERROR); 3331 h = t->bt_curpage; 3332 } 3333 3334 c->c_pgno = h->h_pgno; 3335 c->c_index = 0; 3336 } else { 3337 /* no flags passed in */ 3338 errno = EINVAL; 3339 return (RET_ERROR); 3340 } 3341 } 3342 3343 /* okay, scan is initialized */ 3344 t->bt_flags |= BTF_SEQINIT; 3345 3346 if (c->c_index == NEXTINDEX(t->bt_curpage)) 3347 return (RET_SPECIAL); 3348 3349 return (RET_SUCCESS); 3350 } 3351 3352 /* 3353 * _BT_SEQADVANCE -- Advance the sequential scan on this tree. 3354 * 3355 * Moves the current location pointer for the scan on this tree one 3356 * spot in the requested direction. 3357 * 3358 * Parameters: 3359 * t -- btree being scanned 3360 * flags -- for R_NEXT, R_PREV 3361 * 3362 * Returns: 3363 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there is no 3364 * more data in the specified direction. 3365 * 3366 * Side Effects: 3367 * May change current page. 3368 */ 3369 3370 static int 3371 _bt_seqadvance(t, flags) 3372 BTREE_P t; 3373 int flags; 3374 { 3375 BTHEADER *h; 3376 CURSOR *c; 3377 index_t index; 3378 3379 c = &(t->bt_cursor); 3380 index = c->c_index; 3381 3382 if (_bt_getpage(t, c->c_pgno) == RET_ERROR) 3383 return (RET_ERROR); 3384 h = t->bt_curpage; 3385 3386 /* by the time we get here, don't need the cursor key anymore */ 3387 if (c->c_key != (char *) NULL) 3388 (void) free(c->c_key); 3389 3390 if (flags == R_NEXT) { 3391 3392 /* 3393 * This is a forward scan. If the cursor is pointing 3394 * at a virtual record (that is, it was pointing at 3395 * a record that got deleted), then we should return 3396 * the record it's pointing at now. Otherwise, we 3397 * should advance the scan. In either case, we need 3398 * to be careful not to run off the end of the current 3399 * page. 3400 */ 3401 3402 if (c->c_flags & CRSR_BEFORE) { 3403 3404 if (index >= NEXTINDEX(h)) { 3405 /* out of items on this page, get next page */ 3406 if (h->h_nextpg == P_NONE) { 3407 /* tell caller we're done... */ 3408 c->c_index = NEXTINDEX(h); 3409 return (RET_SPECIAL); 3410 } 3411 3412 /* skip empty pages */ 3413 do { 3414 if (_bt_getpage(t, h->h_nextpg) 3415 == RET_ERROR) { 3416 c->c_index = NEXTINDEX(h); 3417 return (RET_ERROR); 3418 } 3419 h = t->bt_curpage; 3420 c->c_pgno = h->h_pgno; 3421 } while (NEXTINDEX(h) == 0 3422 && h->h_nextpg != P_NONE); 3423 3424 if (NEXTINDEX(h) == 0) { 3425 /* tell caller we're done */ 3426 c->c_index = NEXTINDEX(h); 3427 return (RET_SPECIAL); 3428 } 3429 index = 0; 3430 } 3431 c->c_flags &= ~CRSR_BEFORE; 3432 3433 } else if (++index >= NEXTINDEX(h)) { 3434 3435 /* out of items on this page, get next page */ 3436 if (h->h_nextpg == P_NONE) { 3437 /* tell caller we're done... */ 3438 c->c_index = NEXTINDEX(h); 3439 return (RET_SPECIAL); 3440 } 3441 3442 /* skip empty pages */ 3443 do { 3444 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) { 3445 c->c_index = NEXTINDEX(h); 3446 return (RET_ERROR); 3447 } 3448 h = t->bt_curpage; 3449 c->c_pgno = h->h_pgno; 3450 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 3451 3452 if (NEXTINDEX(h) == 0) { 3453 /* tell caller we're done */ 3454 c->c_index = NEXTINDEX(h); 3455 return (RET_SPECIAL); 3456 } 3457 index = 0; 3458 } 3459 } else if (flags == R_PREV) { 3460 3461 /* for backward scans, life is substantially easier */ 3462 c->c_flags &= ~CRSR_BEFORE; 3463 if (c->c_key != (char *) NULL) { 3464 (void) free(c->c_key); 3465 c->c_key = (char *) NULL; 3466 } 3467 3468 if (index == 0) { 3469 3470 /* we may be done */ 3471 c->c_index = 0; 3472 3473 /* out of items on this page, get next page */ 3474 if (h->h_prevpg == P_NONE) 3475 return (RET_SPECIAL); 3476 3477 /* skip empty pages */ 3478 do { 3479 if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 3480 return (RET_ERROR); 3481 h = t->bt_curpage; 3482 c->c_pgno = h->h_pgno; 3483 } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE); 3484 3485 if (NEXTINDEX(h) == 0) 3486 return (RET_SPECIAL); 3487 3488 index = NEXTINDEX(h) - 1; 3489 } else 3490 --index; 3491 } else { 3492 /* must specify a direction */ 3493 errno = EINVAL; 3494 return (RET_ERROR); 3495 } 3496 3497 c->c_index = index; 3498 return (RET_SUCCESS); 3499 } 3500 3501 /* 3502 * BT_CLOSE -- Close a btree 3503 * 3504 * Parameters: 3505 * tree -- tree to close 3506 * 3507 * Returns: 3508 * RET_SUCCESS, RET_ERROR. 3509 * 3510 * Side Effects: 3511 * Frees space occupied by the tree. 3512 */ 3513 3514 int 3515 bt_close(tree) 3516 BTREE tree; 3517 { 3518 BTREE_P t = (BTREE_P) tree; 3519 int i; 3520 BTHEADER *h; 3521 char *cache; 3522 struct HTBUCKET *b, *sb; 3523 HTABLE ht; 3524 int fd; 3525 3526 if (t->bt_cursor.c_key != (char *) NULL) 3527 (void) free(t->bt_cursor.c_key); 3528 3529 if (!ISDISK(t)) { 3530 /* in-memory tree, release hash table memory */ 3531 ht = t->bt_s.bt_ht; 3532 for (i = 0; i < HTSIZE; i++) { 3533 if ((b = ht[i]) == (struct HTBUCKET *) NULL) 3534 break; 3535 do { 3536 sb = b; 3537 (void) free((char *) b->ht_page); 3538 b = b->ht_next; 3539 (void) free((char *) sb); 3540 } while (b != (struct HTBUCKET *) NULL); 3541 } 3542 (void) free ((char *) ht); 3543 (void) free ((char *) t); 3544 return (RET_SUCCESS); 3545 } 3546 3547 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 3548 if (_bt_wrtmeta(t) == RET_ERROR) 3549 return (RET_ERROR); 3550 } 3551 3552 if (t->bt_curpage != (BTHEADER *) NULL) { 3553 h = t->bt_curpage; 3554 if (h->h_flags & F_DIRTY) { 3555 if (_bt_write(t, h, RELEASE) == RET_ERROR) 3556 return (RET_ERROR); 3557 } else { 3558 if (_bt_release(t, h) == RET_ERROR) 3559 return (RET_ERROR); 3560 } 3561 3562 /* flush and free the cache, if there is one */ 3563 if (ISCACHE(t)) { 3564 cache = t->bt_s.bt_d.d_cache; 3565 lrusync(cache); 3566 lrufree(cache); 3567 } 3568 (void) free ((char *) h); 3569 } 3570 3571 fd = t->bt_s.bt_d.d_fd; 3572 (void) free ((char *) t); 3573 return (close(fd)); 3574 } 3575 3576 /* 3577 * _BT_ALLOCPG -- allocate a new page in the btree. 3578 * 3579 * This is called when we split a page, to get space to do the split. 3580 * For disk btrees, these pages are released when the split is done. 3581 * For memory btrees, they are not. 3582 * 3583 * Parameters: 3584 * t -- tree in which to do split 3585 * 3586 * Returns: 3587 * Pointer to the newly-allocated page 3588 */ 3589 3590 static BTHEADER * 3591 _bt_allocpg(t) 3592 BTREE_P t; 3593 { 3594 BTHEADER *h = t->bt_curpage; 3595 BTHEADER *nh; 3596 char *cache; 3597 int nbytes; 3598 3599 /* if we have a cache, let the cache code do the work */ 3600 if (ISDISK(t) && ISCACHE(t)) { 3601 nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache, 3602 t->bt_npages + 1, 3603 &nbytes); 3604 } else { 3605 nh = (BTHEADER *) malloc((unsigned) t->bt_psize); 3606 } 3607 3608 if (nh != (BTHEADER *) NULL) { 3609 nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE; 3610 nh->h_flags = h->h_flags; 3611 nh->h_lower = (index_t) 3612 (((char *) &(nh->h_linp[0])) - ((char *) nh)); 3613 nh->h_upper = t->bt_psize; 3614 } 3615 3616 return (nh); 3617 } 3618 3619 /* 3620 * _BT_WRITE -- Write a specific page to a btree file. 3621 * 3622 * Parameters: 3623 * t -- btree to get the page 3624 * h -- page to write 3625 * relflag -- (int) this page may/may not be released 3626 * 3627 * Returns: 3628 * RET_SUCCESS, RET_ERROR. 3629 * 3630 * Side Effects: 3631 * Writes a metadata page if none has been written yet. 3632 */ 3633 3634 static int 3635 _bt_write(t, h, relflag) 3636 BTREE_P t; 3637 BTHEADER *h; 3638 int relflag; 3639 { 3640 long pos; 3641 int htindex; 3642 HTBUCKET *b; 3643 char *cache; 3644 BTMETA m; 3645 pgno_t pgno; 3646 3647 h->h_flags &= ~F_DIRTY; 3648 if (ISDISK(t)) { 3649 3650 /* if we haven't done so yet, write the metadata */ 3651 if (!(t->bt_flags & BTF_METAOK)) { 3652 if (_bt_wrtmeta(t) == RET_ERROR) 3653 return (RET_ERROR); 3654 } 3655 3656 pgno = h->h_pgno; 3657 3658 3659 /* if we have a cache, let the cache code do the work */ 3660 if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) { 3661 if (lruwrite(cache, pgno) == RET_ERROR) 3662 return (RET_ERROR); 3663 if (relflag && lrurelease(cache, pgno) == RET_ERROR) 3664 return (RET_ERROR); 3665 3666 } else { 3667 (void) _bt_pgout(h, (char *) t->bt_lorder); 3668 /* now write the current page */ 3669 pos = (long) (pgno * t->bt_psize); 3670 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) 3671 return (RET_ERROR); 3672 if (write(t->bt_s.bt_d.d_fd, h, t->bt_psize) 3673 < t->bt_psize) 3674 return (RET_ERROR); 3675 if (!relflag) 3676 (void) _bt_pgin(h, (char *) t->bt_lorder); 3677 } 3678 } else { 3679 /* in-memory btree */ 3680 htindex = HASHKEY(h->h_pgno); 3681 3682 /* see if we need to overwrite existing entry */ 3683 for (b = t->bt_s.bt_ht[htindex]; 3684 b != (HTBUCKET *) NULL; 3685 b = b->ht_next) { 3686 if (b->ht_pgno == h->h_pgno) { 3687 b->ht_page = h; 3688 return (RET_SUCCESS); 3689 } 3690 } 3691 3692 /* new entry, write it */ 3693 b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET)); 3694 if (b == (HTBUCKET *) NULL) 3695 return (RET_ERROR); 3696 3697 b->ht_pgno = h->h_pgno; 3698 b->ht_page = h; 3699 b->ht_next = t->bt_s.bt_ht[htindex]; 3700 t->bt_s.bt_ht[htindex] = b; 3701 } 3702 return (RET_SUCCESS); 3703 } 3704 3705 /* 3706 * _BT_RELEASE -- Release a locked-down cache page 3707 * 3708 * During page splits, we want to force pages out to the cache 3709 * before we actually release them, in some cases. This routine 3710 * releases such a page when it is no longer needed. 3711 * 3712 * Parameters: 3713 * t -- btree in which to release page 3714 * h -- page to release 3715 * 3716 * Returns: 3717 * RET_SUCCESS, RET_ERROR. 3718 * 3719 * Side Effects: 3720 * None. 3721 */ 3722 3723 static int 3724 _bt_release(t, h) 3725 BTREE_P t; 3726 BTHEADER *h; 3727 { 3728 if (ISDISK(t) && ISCACHE(t)) { 3729 if (lrurelease(t->bt_s.bt_d.d_cache, h->h_pgno) < 0) 3730 return (RET_ERROR); 3731 } 3732 return (RET_SUCCESS); 3733 } 3734 3735 /* 3736 * _BT_WRTMETA -- Write metadata to the btree. 3737 * 3738 * Parameters: 3739 * t -- tree to which to write 3740 * 3741 * Returns: 3742 * RET_SUCCESS, RET_ERROR. 3743 */ 3744 3745 static int 3746 _bt_wrtmeta(t) 3747 BTREE_P t; 3748 { 3749 BTMETA m; 3750 3751 if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l) 3752 return (RET_ERROR); 3753 3754 /* lorder has to be in host-independent format */ 3755 m.m_lorder = (u_long) htonl((long) t->bt_lorder); 3756 3757 m.m_magic = BTREEMAGIC; 3758 m.m_version = BTREEVERSION; 3759 m.m_psize = t->bt_psize; 3760 m.m_free = t->bt_free; 3761 m.m_flags = t->bt_flags & BTF_NODUPS; 3762 3763 if (t->bt_lorder != BYTE_ORDER) { 3764 BLSWAP(m.m_magic); 3765 BLSWAP(m.m_version); 3766 BLSWAP(m.m_psize); 3767 BLSWAP(m.m_free); 3768 BLSWAP(m.m_flags); 3769 } 3770 3771 if (write(t->bt_s.bt_d.d_fd, &m, sizeof(m)) 3772 != sizeof(m)) { 3773 return (RET_ERROR); 3774 } 3775 3776 t->bt_flags |= BTF_METAOK; 3777 3778 return (RET_SUCCESS); 3779 } 3780 3781 #ifdef DEBUG 3782 void 3783 _btdump(tree) 3784 BTREE tree; 3785 { 3786 BTREE_P t = (BTREE_P) tree; 3787 DATUM *d; 3788 IDATUM *id; 3789 BTHEADER *h; 3790 pgno_t npages; 3791 pgno_t i; 3792 index_t cur, top; 3793 3794 npages = t->bt_npages; 3795 (void) printf("\"%s\" fd %d pgsz %d curpg %d @ 0x%lx", 3796 t->bt_fname, t->bt_s.bt_d.d_fd, 3797 t->bt_psize, t->bt_curpage); 3798 (void) printf("npg %d cmp 0x%lx flags=(", npages, t->bt_compare); 3799 if (t->bt_flags & BTF_SEQINIT) 3800 (void) printf("BTF_SEQINIT"); 3801 (void) printf(")\n"); 3802 3803 for (i = P_ROOT; i <= npages; i++) { 3804 if (_bt_getpage(t, i) == RET_ERROR) 3805 _punt(); 3806 h = t->bt_curpage; 3807 top = NEXTINDEX(h); 3808 (void) printf(" page %d:\n", i); 3809 (void) printf("\tpgno %d prev %d next %d\n", 3810 h->h_pgno, h->h_prevpg, h->h_nextpg); 3811 (void) printf("\tlower %d upper %d nextind %d flags (", 3812 h->h_lower, h->h_upper, top); 3813 if (h->h_flags & F_LEAF) 3814 (void) printf("F_LEAF"); 3815 else 3816 (void) printf("<internal>"); 3817 if (h->h_flags & F_DIRTY) 3818 (void) printf("|F_DIRTY"); 3819 if (h->h_flags & F_PRESERVE) 3820 (void) printf("|F_PRESERVE"); 3821 if (h->h_flags & F_CONT) { 3822 (void) printf("|F_CONT)"); 3823 if (h->h_prevpg == P_NONE) { 3824 size_t longsz; 3825 (void) bcopy((char *) &(h->h_linp[0]), 3826 (char *) &longsz, 3827 sizeof(longsz)); 3828 printf("\n\t\t(chain start, data length %ld)", 3829 longsz); 3830 } 3831 printf("\n"); 3832 continue; 3833 } 3834 (void) printf(")\n"); 3835 for (cur = 0; cur < top; cur++) { 3836 (void) printf("\t [%d] off %d ", cur, h->h_linp[cur]); 3837 if (h->h_flags & F_LEAF) { 3838 d = (DATUM *) GETDATUM(h,cur); 3839 (void) printf("ksize %d", d->d_ksize); 3840 if (d->d_flags & D_BIGKEY) 3841 (void) printf(" (indirect)"); 3842 (void) printf("; dsize %d", d->d_dsize); 3843 if (d->d_flags & D_BIGDATA) 3844 (void) printf(" (indirect)"); 3845 } else { 3846 id = (IDATUM *) GETDATUM(h,cur); 3847 (void) printf("size %d pgno %d", 3848 id->i_size, id->i_pgno); 3849 if (id->i_flags & D_BIGKEY) 3850 (void) printf(" (indirect)"); 3851 } 3852 (void) printf("\n"); 3853 } 3854 (void) printf("\n"); 3855 } 3856 } 3857 #endif /* DEBUG */ 3858 3859 /* 3860 * _BT_CMP -- Compare a key to a given item on the current page. 3861 * 3862 * This routine is a wrapper for the user's comparison function. 3863 * 3864 * Parameters: 3865 * t -- tree in which to do comparison 3866 * p -- pointer to one argument for the comparison function 3867 * n -- index of item to supply second arg to comparison function 3868 * 3869 * Returns: 3870 * < 0 if p is < item at n 3871 * = 0 if p is = item at n 3872 * > 0 if p is > item at n 3873 */ 3874 3875 static int 3876 _bt_cmp(t, p, n) 3877 BTREE_P t; 3878 char *p; 3879 index_t n; 3880 { 3881 BTHEADER *h; 3882 IDATUM *id; 3883 DATUM *d; 3884 char *arg; 3885 int ignore; 3886 int free_arg; 3887 pgno_t chain; 3888 int r; 3889 3890 h = t->bt_curpage; 3891 3892 /* 3893 * The left-most key at any level of the tree on internal pages 3894 * is guaranteed (artificially, by the code here) to be less than 3895 * any user key. This saves us from having to update the leftmost 3896 * key when the user inserts a new key in the tree smaller than 3897 * anything we've seen yet. 3898 */ 3899 3900 if (h->h_prevpg == P_NONE && !(h->h_flags & F_LEAF) && n == 0) 3901 return (1); 3902 3903 if (h->h_flags & F_LEAF) { 3904 d = (DATUM *) GETDATUM(h,n); 3905 if (d->d_flags & D_BIGKEY) { 3906 free_arg = TRUE; 3907 bcopy(&(d->d_bytes[0]), (char *) &chain, sizeof(chain)); 3908 if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR) 3909 return (RET_ERROR); 3910 } else { 3911 free_arg = FALSE; 3912 arg = &(d->d_bytes[0]); 3913 } 3914 } else { 3915 id = (IDATUM *) GETDATUM(h,n); 3916 if (id->i_flags & D_BIGKEY) { 3917 free_arg = TRUE; 3918 bcopy(&(id->i_bytes[0]), 3919 (char *) &chain, 3920 sizeof(chain)); 3921 if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR) 3922 return (RET_ERROR); 3923 } else { 3924 free_arg = FALSE; 3925 arg = &(id->i_bytes[0]); 3926 } 3927 } 3928 r = (*(t->bt_compare))(p, arg); 3929 3930 if (free_arg) 3931 (void) free(arg); 3932 3933 return (r); 3934 } 3935 3936 /* 3937 * _BT_PUSH/_BT_POP -- Push/pop a parent page number on the parent stack. 3938 * 3939 * When we descend the tree, we keep track of parent pages in order 3940 * to handle splits on insertions. 3941 * 3942 * Parameters: 3943 * t -- tree for which to push parent 3944 * pgno -- page number to push (_bt_push only) 3945 * 3946 * Returns: 3947 * None. 3948 */ 3949 3950 static 3951 _bt_push(t, pgno) 3952 BTREE_P t; 3953 pgno_t pgno; 3954 { 3955 BTSTACK *new; 3956 3957 new = (BTSTACK *) malloc((unsigned) sizeof(BTSTACK)); 3958 new->bts_pgno = pgno; 3959 new->bts_next = t->bt_stack; 3960 t->bt_stack = new; 3961 } 3962 3963 static 3964 _bt_pop(t) 3965 BTREE_P t; 3966 { 3967 BTSTACK *s; 3968 pgno_t p = P_NONE; 3969 3970 if ((s = t->bt_stack) != (BTSTACK *) NULL) { 3971 p = s->bts_pgno; 3972 t->bt_stack = s->bts_next; 3973 (void) free ((char *) s); 3974 } 3975 return (p); 3976 } 3977 3978 #ifdef DEBUG 3979 _punt() 3980 { 3981 int pid; 3982 3983 pid = getpid(); 3984 (void) kill(pid, SIGILL); 3985 } 3986 #endif /* DEBUG */ 3987