1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_open.c 5.2 (Berkeley) 01/08/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 /* 16 * btree.c -- implementation of btree access method for 4.4BSD. 17 * 18 * The design here is based on that of the btree access method used 19 * in the Postgres database system at UC Berkeley. The implementation 20 * is wholly independent of the Postgres code. 21 * 22 * This implementation supports btrees on disk (supply a filename) or 23 * in memory (don't). Public interfaces defined here are: 24 * 25 * btree_open() -- wrapper; returns a filled DB struct for 26 * a btree. 27 * 28 * bt_open() -- open a new or existing btree. 29 * bt_get() -- fetch data from a tree by key. 30 * bt_seq() -- do a sequential scan on a tree. 31 * bt_put() -- add data to a tree by key. 32 * bt_delete() -- remove data from a tree by key. 33 * bt_close() -- close a btree. 34 * bt_sync() -- force btree pages to disk (disk trees only). 35 */ 36 37 #include <sys/param.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/signal.h> 41 #include <sys/errno.h> 42 #include <db.h> 43 44 #ifndef BLSWAP 45 #define BLSWAP(a) { \ 46 register unsigned long _t; \ 47 _t = (a); \ 48 (a) >>= 24; \ 49 (a) |= (_t & 0x00ff0000) >> 8; \ 50 (a) |= (_t & 0x0000ff00) << 8; \ 51 (a) |= (_t & 0x000000ff) << 24; \ 52 } 53 #endif /* ndef BLSWAP */ 54 55 typedef char *BTREE; /* should really be (void *) */ 56 57 /* #define DEBUG */ 58 59 #define RET_ERROR -1 60 #define RET_SUCCESS 0 61 #define RET_SPECIAL 1 62 63 #ifndef TRUE 64 #define TRUE 1 65 #define FALSE 0 66 #endif /* ndef TRUE */ 67 68 /* libc */ 69 extern char *malloc(); 70 71 /* these are defined in lrucache.c */ 72 extern char *lruinit(); 73 extern char *lruget(); 74 extern char *lrugetnew(); 75 extern int lrusync(); 76 extern int lruwrite(); 77 extern int lrurelease(); 78 extern void lrufree(); 79 80 /* these are defined here */ 81 BTREE bt_open(); 82 int bt_close(); 83 int bt_delete(); 84 int bt_get(); 85 int bt_put(); 86 int bt_seq(); 87 int bt_sync(); 88 89 /* 90 * Private types. What you choose for these depends on how big you 91 * want to let files get, and how big you want to let pages get. 92 */ 93 94 typedef u_long index_t; /* so # bytes on a page fits in a long */ 95 typedef u_long pgno_t; /* so # of pages in a btree fits in a long */ 96 97 /* 98 * When we do searches, we push the parent page numbers onto a stack 99 * as we descend the tree. This is so that for insertions, we can 100 * find our way back up to do internal page insertions and splits. 101 */ 102 103 typedef struct BTSTACK { 104 pgno_t bts_pgno; 105 struct BTSTACK *bts_next; 106 } BTSTACK; 107 108 /* 109 * Every btree page has a header that looks like this. Flags are given 110 * in the #define's for the F_ flags (see below). 111 */ 112 113 typedef struct BTHEADER { 114 pgno_t h_pgno; /* page number of this page */ 115 pgno_t h_prevpg; /* left sibling */ 116 pgno_t h_nextpg; /* right sibling */ 117 118 #define F_LEAF 0x01 /* leaf page, contains user data */ 119 #define F_CONT 0x02 /* continuation page (large items) */ 120 #define F_DIRTY 0x04 /* need to write to disk */ 121 #define F_PRESERVE 0x08 /* never delete this chain of pages */ 122 123 u_long h_flags; /* page state */ 124 index_t h_lower; /* lower bound of free space on page */ 125 index_t h_upper; /* upper bound of free space on page */ 126 index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 127 } BTHEADER; 128 129 /* 130 * HTBUCKETs are hash table buckets for looking up pages of in-memory 131 * btrees by page number. We use this indirection, rather than direct 132 * pointers, so that the code for manipulating in-memory trees is the 133 * same as that for manipulating on-disk trees. 134 */ 135 136 typedef struct HTBUCKET { 137 pgno_t ht_pgno; 138 BTHEADER *ht_page; 139 struct HTBUCKET *ht_next; 140 } HTBUCKET; 141 142 typedef HTBUCKET **HTABLE; 143 144 /* minimum size we'll let a page be */ 145 #define MINPSIZE 512 146 147 /* default cache size, in bytes */ 148 #define DEFCACHE (20 * 1024) 149 150 /* hash table size for in-memory trees */ 151 #define HTSIZE 128 152 153 /* generate a hash key from a page number */ 154 #define HASHKEY(pgno) ((pgno - 1) % HTSIZE) 155 156 /* 157 * Disk btrees have a file descriptor, and may also have an lru buffer 158 * cache, if the user asked for one. 159 */ 160 161 typedef struct BTDISK { 162 int d_fd; 163 char *d_cache; 164 } BTDISK; 165 166 /* 167 * Cursors keep track of the current location in a sequential scan of 168 * the database. Since btrees impose a total ordering on keys, we can 169 * walk forward or backward through the database from any point. Cursors 170 * survive updates to the tree, and can be used to delete a particular 171 * record. 172 */ 173 174 typedef struct CURSOR { 175 pgno_t c_pgno; /* pgno of current item in scan */ 176 index_t c_index; /* index of current item in scan */ 177 char *c_key; /* current key, used for updates */ 178 179 #define CRSR_BEFORE 0x01 180 181 u_char c_flags; /* to handle updates properly */ 182 } CURSOR; 183 184 /* 185 * The private btree data structure. The user passes a pointer to one of 186 * these when we are to manipulate a tree, but the BTREE type is opaque 187 * to him. 188 */ 189 190 typedef struct BTREEDATA_P { 191 char *bt_fname; /* NULL for in-memory trees */ 192 union { 193 BTDISK bt_d; /* for on-disk btrees */ 194 HTABLE bt_ht; /* hash table for mem trees */ 195 } bt_s; 196 size_t bt_psize; /* page size for btree pages */ 197 int (*bt_compare)(); /* key comparison function */ 198 pgno_t bt_npages; /* number of pages in tree */ 199 BTHEADER *bt_curpage; /* current page contents */ 200 pgno_t bt_free; /* free pg list for big data */ 201 CURSOR bt_cursor; /* cursor for scans */ 202 BTSTACK *bt_stack; /* parent stack for inserts */ 203 u_long bt_lorder; /* byte order (endian.h) */ 204 205 #define BTF_METAOK 0x01 /* meta-data written to start of file */ 206 #define BTF_SEQINIT 0x02 /* we have called bt_seq */ 207 #define BTF_ISWRITE 0x04 /* tree was opened for write */ 208 #define BTF_NODUPS 0x08 /* tree created for unique keys */ 209 210 u_long bt_flags; /* btree state */ 211 } BTREEDATA_P; 212 213 typedef BTREEDATA_P *BTREE_P; 214 215 /* 216 * The first thing in a btree file is a BTMETA structure. The rest of 217 * the first page is empty, so that all disk operations are page-aligned. 218 */ 219 220 typedef struct BTMETA { 221 u_long m_magic; 222 u_long m_version; 223 size_t m_psize; 224 pgno_t m_free; 225 u_long m_flags; 226 u_long m_lorder; 227 } BTMETA; 228 229 #define P_NONE 0 /* invalid page number in tree */ 230 #define P_ROOT 1 /* page number of root pg in btree */ 231 232 #define NORELEASE 0 /* don't release a page during write */ 233 #define RELEASE 1 /* release a page during write */ 234 235 #define INSERT 0 /* doing an insert operation */ 236 #define DELETE 1 /* doing a delete operation */ 237 238 /* get the next free index on a btree page */ 239 #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t))) 240 241 /* is a BTITEM actually on the btree page? */ 242 #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage)) 243 244 /* guarantee longword alignment so structure refs work */ 245 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03) 246 247 /* get a particular datum (or idatum) off a page */ 248 #define GETDATUM(h,i) (((char *) h) + h->h_linp[i]) 249 250 /* is a {key,datum} too big to put on a single page? */ 251 #define TOOBIG(t, sz) (sz >= t->bt_psize / 5) 252 253 /* is this a disk tree or a memory tree? */ 254 #define ISDISK(t) (t->bt_fname != (char *) NULL) 255 256 /* does the disk tree use a cache? */ 257 #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL) 258 259 /* 260 * DATUMs are for user data -- one appears on leaf pages for every 261 * tree entry. The d_bytes[] array contains the key first, then the data. 262 * 263 * If either the key or the datum is too big to store on a single page, 264 * a bit is set in the flags entry, and the d_bytes[] array contains a 265 * pgno pointing to the page at which the data is actually stored. 266 * 267 * Note on alignment: every DATUM is guaranteed to be longword aligned 268 * on the disk page. In order to force longword alignment of user key 269 * and data values, we must guarantee that the d_bytes[] array starts 270 * on a longword boundary. This is the reason that d_flags is a u_long, 271 * rather than a u_char (it really only needs to be two bits big). This 272 * is necessary because we call the user's comparison function with a 273 * pointer to the start of the d_bytes array. We don't need to force 274 * longword alignment of the data following the key, since that is copied 275 * to a longword-aligned buffer before being returned to the user. 276 */ 277 278 typedef struct DATUM { 279 size_t d_ksize; /* size of key */ 280 size_t d_dsize; /* size of data */ 281 282 #define D_BIGDATA 0x01 /* indirect datum ptr flag */ 283 #define D_BIGKEY 0x02 /* indirect key ptr flag */ 284 285 u_long d_flags; /* flags (indirect bit) */ 286 char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 287 } DATUM; 288 289 /* BTITEMs are used to return (page, index, datum) tuples from searches */ 290 typedef struct BTITEM { 291 pgno_t bti_pgno; 292 index_t bti_index; 293 DATUM *bti_datum; 294 } BTITEM; 295 296 /* 297 * IDATUMs are for data stored on internal pages. This is the (key, pgno) 298 * pair, such that key 'key' is the first entry on page 'pgno'. If our 299 * internal page contains keys (a) and (b) next to each other, then all 300 * items >= to (a) and < (b) go on the same page as (a). There are some 301 * gotchas with duplicate keys, however. See the split code for details. 302 * 303 * If a key is too big to fit on a single page, then the i_bytes[] array 304 * contains a pgno pointing to the start of a chain that actually stores 305 * the bytes. Since items on internal pages are never deleted from the 306 * tree, these indirect chains are marked as special, so that they won't 307 * be deleted if the corresponding leaf item is deleted. 308 * 309 * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char) 310 * in order to guarantee that user keys are longword aligned on the disk 311 * page. 312 */ 313 314 typedef struct IDATUM { 315 size_t i_size; 316 pgno_t i_pgno; 317 u_long i_flags; /* see DATUM.d_flags, above */ 318 char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */ 319 } IDATUM; 320 321 /* all private interfaces have a leading _ in their names */ 322 static BTITEM *_bt_search(); 323 static BTITEM *_bt_searchr(); 324 static BTHEADER *_bt_allocpg(); 325 static index_t _bt_binsrch(); 326 static int _bt_isonpage(); 327 static BTITEM *_bt_first(); 328 static int _bt_release(); 329 static int _bt_wrtmeta(); 330 static int _bt_delindir(); 331 static int _bt_pgout(); 332 static int _bt_pgin(); 333 static int _bt_fixscan(); 334 static int _bt_indirect(); 335 static int _bt_crsrdel(); 336 337 extern int strcmp(); 338 339 static BTREEINFO _DefaultBTInfo = { 340 0, /* flags */ 341 0, /* cachesize */ 342 0, /* psize */ 343 strcmp, /* compare */ 344 0 345 }; 346 347 /* 348 * BTREE_OPEN -- Wrapper routine to open a btree. 349 * 350 * Creates and fills a DB struct, and calls the routine that actually 351 * opens the btree. 352 * 353 * Parameters: 354 * f: filename to open 355 * flags: flag bits passed to open 356 * mode: permission bits, used if O_CREAT specified 357 * b: BTREEINFO pointer 358 * 359 * Returns: 360 * Filled-in DBT on success; NULL on failure, with errno 361 * set as appropriate. 362 * 363 * Side Effects: 364 * Allocates memory for the DB struct. 365 */ 366 367 DB * 368 btree_open(f, flags, mode, b) 369 char *f; 370 int flags; 371 int mode; 372 BTREEINFO *b; 373 { 374 DB *db; 375 BTREE t; 376 377 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 378 return ((DB *) NULL); 379 380 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 381 (void) free ((char *) db); 382 return ((DB *) NULL); 383 } 384 385 db->internal = (char *) t; 386 db->close = bt_close; 387 db->delete = bt_delete; 388 db->get = bt_get; 389 db->put = bt_put; 390 db->seq = bt_seq; 391 db->sync = bt_sync; 392 393 return (db); 394 } 395 396 /* 397 * BT_OPEN -- Open a btree. 398 * 399 * This routine creates the correct kind (disk or in-memory) of 400 * btree and initializes it as required. 401 * 402 * Parameters: 403 * f -- name of btree (NULL for in-memory btrees) 404 * flags -- flags passed to open() 405 * mode -- mode passed to open () 406 * b -- BTREEINFO structure, describing btree 407 * 408 * Returns: 409 * (Opaque) pointer to the btree. On failure, returns NULL 410 * with errno set as appropriate. 411 * 412 * Side Effects: 413 * Allocates memory, opens files. 414 */ 415 416 BTREE 417 bt_open(f, flags, mode, b) 418 char *f; 419 int flags; 420 int mode; 421 BTREEINFO *b; 422 { 423 BTREE_P t; 424 HTABLE ht; 425 int nbytes; 426 int fd; 427 CURSOR *c; 428 BTMETA m; 429 struct stat buf; 430 431 /* use the default info if none was provided */ 432 if (b == (BTREEINFO *) NULL) 433 b = &_DefaultBTInfo; 434 435 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 436 return ((BTREE) NULL); 437 438 if (b->compare) 439 t->bt_compare = b->compare; 440 else 441 t->bt_compare = strcmp; 442 443 t->bt_fname = f; 444 t->bt_curpage = (BTHEADER *) NULL; 445 t->bt_free = P_NONE; 446 c = &(t->bt_cursor); 447 c->c_pgno = P_NONE; 448 c->c_index = 0; 449 c->c_flags = (u_char) NULL; 450 c->c_key = (char *) NULL; 451 t->bt_stack = (BTSTACK *) NULL; 452 t->bt_flags = 0; 453 454 /* 455 * If no file name was supplied, this is an in-memory btree. 456 * Otherwise, it's a disk-based btree. 457 */ 458 if (f == (char *) NULL) { 459 /* in memory */ 460 if ((t->bt_psize = b->psize) < MINPSIZE) { 461 if (t->bt_psize != 0) { 462 (void) free ((char *) t); 463 errno = EINVAL; 464 return ((BTREE) NULL); 465 } 466 t->bt_psize = getpagesize(); 467 } 468 469 nbytes = HTSIZE * sizeof(HTBUCKET *); 470 if ((ht = (HTABLE) malloc((unsigned) nbytes)) 471 == (HTABLE) NULL) { 472 (void) free((char *) t); 473 return ((BTREE) NULL); 474 } 475 (void) bzero((char *) ht, nbytes); 476 t->bt_s.bt_ht = ht; 477 t->bt_npages = 0; 478 t->bt_lorder = BYTE_ORDER; 479 if (!(b->flags & R_DUP)) 480 t->bt_flags |= BTF_NODUPS; 481 } else { 482 /* on disk */ 483 if ((fd = open(f, O_RDONLY, 0)) < 0) { 484 /* doesn't exist yet, be sure page is big enough */ 485 if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 486 && b->psize != 0) { 487 (void) free((char *) t); 488 errno = EINVAL; 489 return ((BTREE) NULL); 490 } 491 if (b->lorder == 0) 492 b->lorder = BYTE_ORDER; 493 494 if (b->lorder != BIG_ENDIAN 495 && b->lorder != LITTLE_ENDIAN) { 496 (void) free((char *) t); 497 errno = EINVAL; 498 return ((BTREE) NULL); 499 } 500 t->bt_lorder = b->lorder; 501 if (!(b->flags & R_DUP)) 502 t->bt_flags |= BTF_NODUPS; 503 } else { 504 /* exists, get page size from file */ 505 if (read(fd, &m, sizeof(m)) < sizeof(m)) { 506 (void) close(fd); 507 (void) free((char *) t); 508 errno = EINVAL; 509 return ((BTREE) NULL); 510 } 511 512 /* lorder always stored in host-independent format */ 513 NTOHL(m.m_lorder); 514 if (m.m_lorder != BIG_ENDIAN 515 && m.m_lorder != LITTLE_ENDIAN) { 516 (void) free((char *) t); 517 errno = EINVAL; 518 return ((BTREE) NULL); 519 } 520 t->bt_lorder = m.m_lorder; 521 522 if (t->bt_lorder != BYTE_ORDER) { 523 BLSWAP(m.m_magic); 524 BLSWAP(m.m_version); 525 BLSWAP(m.m_psize); 526 BLSWAP(m.m_free); 527 BLSWAP(m.m_flags); 528 } 529 530 if (m.m_magic != BTREEMAGIC 531 || m.m_version != BTREEVERSION 532 || m.m_psize < MINPSIZE) { 533 (void) close(fd); 534 (void) free((char *) t); 535 #ifndef EFTYPE 536 #define EFTYPE -100 537 #endif 538 errno = EFTYPE; 539 return ((BTREE) NULL); 540 } 541 t->bt_psize = m.m_psize; 542 t->bt_free = m.m_free; 543 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 544 (void) close(fd); 545 } 546 547 /* now open the file the way the user wanted it */ 548 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 549 (void) free ((char *) t); 550 return ((BTREE) NULL); 551 } 552 553 /* get number of pages, page size if necessary */ 554 (void) fstat(t->bt_s.bt_d.d_fd, &buf); 555 if (t->bt_psize == 0) 556 t->bt_psize = buf.st_blksize; 557 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 558 559 /* page zero is metadata, doesn't count */ 560 if (t->bt_npages > 0) 561 --(t->bt_npages); 562 563 if (b->cachesize == 0) 564 b->cachesize = DEFCACHE; 565 566 /* get an lru buffer cache, if the user asked for one */ 567 if ((b->cachesize / t->bt_psize) > 0) { 568 BTDISK *d = &(t->bt_s.bt_d); 569 570 d->d_cache = lruinit(d->d_fd, 571 b->cachesize / t->bt_psize, 572 t->bt_psize, 573 (char *) t->bt_lorder, 574 _bt_pgin, _bt_pgout); 575 576 if (d->d_cache == (char *) NULL) { 577 (void) free((char *) t); 578 return ((BTREE) NULL); 579 } 580 } 581 } 582 583 /* remember if tree was opened for write */ 584 if (((flags & O_WRONLY) == O_WRONLY) 585 || ((flags & O_RDWR) == O_RDWR)) 586 t->bt_flags |= BTF_ISWRITE; 587 588 return ((BTREE) t); 589 } 590 591 /* 592 * BT_GET -- Get an entry from a btree. 593 * 594 * Does a key lookup in the tree to find the specified key, and returns 595 * the key/data pair found. 596 * 597 * Parameters: 598 * tree -- btree in which to do lookup 599 * key -- key to find 600 * data -- pointer to DBT in which to return data 601 * flag -- ignored 602 * 603 * Returns: 604 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 605 * found. If key is not found, nothing is stored in the 606 * return DBT 'data'. 607 * 608 * Side Effects: 609 * None. 610 * 611 * Warnings: 612 * Return data is statically allocated, and will be overwritten 613 * at the next call. 614 */ 615 616 int 617 bt_get(tree, key, data, flag) 618 BTREE tree; 619 DBT *key; 620 DBT *data; 621 int flag; 622 { 623 BTREE_P t = (BTREE_P) tree; 624 BTHEADER *h; 625 DATUM *d; 626 BTITEM *item; 627 628 /* lookup */ 629 item = _bt_search(t, key); 630 if (item == (BTITEM *) NULL) 631 return (RET_ERROR); 632 633 /* clear parent pointer stack */ 634 while (_bt_pop(t) != P_NONE) 635 continue; 636 637 h = (BTHEADER *) t->bt_curpage; 638 data->size = 0; 639 data->data = (char *) NULL; 640 641 /* match? */ 642 if (VALIDITEM(t, item) 643 && _bt_cmp(t, key->data, item->bti_index) == 0) { 644 d = (DATUM *) GETDATUM(h, item->bti_index); 645 return (_bt_buildret(t, d, data, key)); 646 } 647 648 /* nope */ 649 return (RET_SPECIAL); 650 } 651 652 /* 653 * BT_PUT -- Add an entry to a btree. 654 * 655 * The specified (key, data) pair is added to the tree. If the tree 656 * was created for unique keys only, then duplicates will not be 657 * entered. If the requested key exists in the tree, it will be over- 658 * written unless the flags parameter is R_NOOVERWRITE, in which case 659 * the update will not be done. If duplicate keys are permitted in the 660 * tree, duplicates will be inserted and will not overwrite existing 661 * keys. Nodes are split as required. 662 * 663 * Parameters: 664 * tree -- btree in which to put the new entry 665 * key -- key component to add 666 * data -- data corresponding to key 667 * flag -- R_NOOVERWRITE or zero. 668 * 669 * Returns: 670 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 671 * NOOVERWRITE flag was set and the specified key exists 672 * in the database. 673 * 674 * Side Effects: 675 * None. 676 */ 677 678 int 679 bt_put(tree, key, data, flag) 680 BTREE tree; 681 DBT *key; 682 DBT *data; 683 int flag; 684 { 685 BTREE_P t = (BTREE_P) tree; 686 BTITEM *item; 687 688 /* look for this key in the tree */ 689 item = _bt_search(t, key); 690 691 /* 692 * If this tree was originally created without R_DUP, then duplicate 693 * keys are not allowed. We need to check this at insertion time. 694 */ 695 696 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 697 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 698 if (_bt_delone(t, item->bti_index) == RET_ERROR) 699 return (RET_ERROR); 700 } 701 } 702 703 return (_bt_insert(t, item, key, data, flag)); 704 } 705 706 /* 707 * BT_DELETE -- delete a key from the tree. 708 * 709 * Deletes all keys (and their associated data items) matching the 710 * supplied key from the tree. If the flags entry is R_CURSOR, then 711 * the current item in the active scan is deleted. 712 * 713 * Parameters: 714 * tree -- btree from which to delete key 715 * key -- key to delete 716 * flags -- R_CURSOR or zero 717 * 718 * Returns: 719 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 720 * key was not in the tree. 721 * 722 * Side Effects: 723 * None. 724 */ 725 726 int 727 bt_delete(tree, key, flags) 728 BTREE tree; 729 DBT *key; 730 int flags; 731 { 732 BTREE_P t = (BTREE_P) tree; 733 BTHEADER *h; 734 BTITEM *item; 735 int ndel = 0; 736 737 if (flags == R_CURSOR) 738 return (_bt_crsrdel(t)); 739 740 /* find the first matching key in the tree */ 741 item = _bt_first(t, key); 742 h = t->bt_curpage; 743 744 /* delete all matching keys */ 745 for (;;) { 746 while (VALIDITEM(t, item) 747 && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 748 if (_bt_delone(t, item->bti_index) == RET_ERROR) 749 return (RET_ERROR); 750 ndel++; 751 } 752 753 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 754 break; 755 756 /* next page, if necessary */ 757 do { 758 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 759 return (RET_ERROR); 760 h = t->bt_curpage; 761 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 762 763 item->bti_pgno = h->h_pgno; 764 item->bti_index = 0; 765 766 if (!VALIDITEM(t, item) 767 || _bt_cmp(t, key->data, item->bti_index) != 0) 768 break; 769 } 770 771 /* clean up the parent stack */ 772 while (_bt_pop(t) != P_NONE) 773 continue; 774 775 /* flush changes to disk */ 776 if (ISDISK(t)) { 777 if (h->h_flags & F_DIRTY) { 778 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 779 return (RET_ERROR); 780 } 781 } 782 783 if (ndel == 0) 784 return (RET_SPECIAL); 785 786 return (RET_SUCCESS); 787 } 788 789 /* 790 * _BT_CRSRDEL -- Delete the item pointed to by the cursor. 791 * 792 * This routine deletes the item most recently returned by a scan 793 * through the tree. Since it only makes sense to delete the current 794 * record once, we make sure that we don't try to delete twice without 795 * advancing the scan. 796 * 797 * Parameters: 798 * t -- tree in which to do deletion 799 * 800 * Returns: 801 * RET_SUCCESS, RET_ERROR. 802 * 803 * Side Effects: 804 * The call to _bt_delone marks the cursor, so we can tell that 805 * the current record has been deleted. 806 */ 807 808 static int 809 _bt_crsrdel(t) 810 BTREE_P t; 811 { 812 CURSOR *c; 813 814 c = &(t->bt_cursor); 815 816 /* a cursor must exist, and can't have deleted the current key yet */ 817 if (!(t->bt_flags & BTF_SEQINIT) || (c->c_flags & CRSR_BEFORE)) { 818 errno = EINVAL; 819 return (RET_ERROR); 820 } 821 822 if (_bt_getpage(t, c->c_pgno) == RET_ERROR) 823 return (RET_ERROR); 824 825 if (c->c_index >= NEXTINDEX(t->bt_curpage)) { 826 errno = EINVAL; 827 return (RET_ERROR); 828 } 829 830 return (_bt_delone(t, c->c_index)); 831 } 832 833 /* 834 * _BT_DELONE -- Delete a single entry from a btree. 835 * 836 * This routine physically removes a btree entry from a leaf page. 837 * IDATUM items are *never* removed from internal nodes, regardless 838 * of whether the entries that originally caused them to be added 839 * are removed from the tree or not. In addition, pages made empty 840 * by element deletion are not actually reclaimed. They are, 841 * however, made available for reuse. 842 * 843 * To delete an item from a page, we pack the remaining items at 844 * the end of the page, overwriting the deleted item's entry. We 845 * move the line pointers backward on the page, overwriting the 846 * original item's line pointer. This guarantees that the space in 847 * the middle of the page is free -- a property that our insertion 848 * strategy relies on. 849 * 850 * This routine doesn't reclaim pages all of whose entries have 851 * been deleted. These pages are available for reuse, however. 852 * If an item is deleted that was too big to fit on a page, then 853 * the blocks that it occupies are put on a free list for reuse. 854 * 855 * Parameters: 856 * t -- btree from which to delete item 857 * index -- index of entry on current page to delete 858 * 859 * Returns: 860 * RET_SUCCESS, RET_ERROR. 861 * 862 * Side Effects: 863 * Physically changes page layout, adjusts internal page 864 * state to reflect the deletion of the item, and updates 865 * the list of free pages for this tree. 866 */ 867 868 static int 869 _bt_delone(t, index) 870 BTREE_P t; 871 index_t index; 872 { 873 char *src, *dest; 874 int nbytes, nmoved; 875 index_t off; 876 index_t top; 877 index_t i; 878 pgno_t chain; 879 BTHEADER *h; 880 CURSOR *c; 881 DATUM *d; 882 883 /* deletion may confuse an active scan. fix it. */ 884 c = &(t->bt_cursor); 885 if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno) 886 if (_bt_fixscan(t, index, (DATUM *) NULL, DELETE) == RET_ERROR) 887 return (RET_ERROR); 888 889 h = t->bt_curpage; 890 off = h->h_linp[index]; 891 d = (DATUM *) GETDATUM(h, index); 892 893 /* if this is a big item, reclaim the space it occupies */ 894 if (d->d_flags & D_BIGKEY) { 895 bcopy(&(d->d_bytes[0]), 896 (char *) &chain, 897 sizeof(chain)); 898 if (_bt_delindir(t, chain) == RET_ERROR) 899 return (RET_ERROR); 900 h = t->bt_curpage; 901 d = (DATUM *) GETDATUM(h, index); 902 } 903 if (d->d_flags & D_BIGDATA) { 904 bcopy(&(d->d_bytes[d->d_ksize]), 905 (char *) &chain, 906 sizeof(chain)); 907 if (_bt_delindir(t, chain) == RET_ERROR) 908 return (RET_ERROR); 909 h = t->bt_curpage; 910 d = (DATUM *) GETDATUM(h, index); 911 } 912 913 /* move the data down on the page */ 914 nbytes = d->d_ksize + d->d_dsize 915 + (sizeof(DATUM) - sizeof(char)); 916 nbytes = LONGALIGN(nbytes); 917 src = ((char *) h) + h->h_upper; 918 dest = src + nbytes; 919 nmoved = (int) (((char *) d) - src); 920 (void) bcopy(src, dest, nmoved); 921 922 /* next move the line pointers up */ 923 src = (char *) &(h->h_linp[index + 1]); 924 dest = (char *) &(h->h_linp[index]); 925 nmoved = (int) (((char *) &(h->h_linp[NEXTINDEX(h)])) - src); 926 (void) bcopy(src, dest, nmoved); 927 928 /* remember that we freed up some space */ 929 h->h_upper += nbytes; 930 h->h_lower -= sizeof(index_t); 931 932 /* adjust offsets in line pointers affected by moving the data */ 933 top = NEXTINDEX(h); 934 for (i = 0; i < top; i++) { 935 if (h->h_linp[i] < off) 936 h->h_linp[i] += nbytes; 937 } 938 939 /* it's gone */ 940 h->h_flags |= F_DIRTY; 941 942 return (RET_SUCCESS); 943 } 944 945 /* 946 * _BT_FIXSCAN -- Adjust a scan to cope with a change in tree structure. 947 * 948 * If the user has an active scan on the database, and we delete an 949 * item from the page the cursor is pointing at, we need to figure 950 * out what to do about it. Basically, the solution is to point 951 * "between" keys in the tree, using the CRSR_BEFORE flag. The 952 * requirement is that the user should not miss any data in the 953 * tree during a scan, just because he happened to do some deletions 954 * or insertions while it was active. 955 * 956 * In order to guarantee that the scan progresses properly, we need 957 * to save the key of any deleted item we were pointing at, so that 958 * we can check later insertions against it. 959 * 960 * Parameters: 961 * t -- tree to adjust 962 * index -- index of item at which change was made 963 * newd -- new datum (for insertions only) 964 * op -- operation (DELETE or INSERT) causing change 965 * 966 * Returns: 967 * RET_SUCCESS, RET_ERROR (errno is set). 968 * 969 * Side Effects: 970 * None. 971 */ 972 973 static int 974 _bt_fixscan(t, index, newd, op) 975 BTREE_P t; 976 index_t index; 977 DATUM *newd; 978 int op; 979 { 980 BTHEADER *h; 981 CURSOR *c; 982 DATUM *d; 983 984 h = t->bt_curpage; 985 c = &(t->bt_cursor); 986 987 if (op == DELETE) { 988 if (index < c->c_index) 989 c->c_index--; 990 else if (index == c->c_index) { 991 if (!(c->c_flags & CRSR_BEFORE)) { 992 if (_bt_crsrkey(t) == RET_ERROR) 993 return (RET_ERROR); 994 c->c_flags |= CRSR_BEFORE; 995 } 996 } 997 } else { 998 /* 999 * If we previously deleted the object at this location, 1000 * and now we're inserting a new one, we need to do the 1001 * right thing -- the cursor should come either before 1002 * or after the new item, depending on the key that was 1003 * here, and the new one. 1004 */ 1005 1006 if (c->c_flags & CRSR_BEFORE) { 1007 if (index <= c->c_index) { 1008 char *tmp; 1009 int itmp; 1010 pgno_t chain; 1011 int r; 1012 1013 d = (DATUM *) GETDATUM(t->bt_curpage, index); 1014 if (d->d_flags & D_BIGKEY) { 1015 bcopy(&(newd->d_bytes[0]), 1016 (char *) &chain, 1017 sizeof(chain)); 1018 if (_bt_getbig(t, chain, &tmp, &itmp) 1019 == RET_ERROR) 1020 return (RET_ERROR); 1021 } else 1022 tmp = &(newd->d_bytes[0]); 1023 1024 r = (*(t->bt_compare))(tmp, c->c_key); 1025 if (r < 0) 1026 c->c_index++; 1027 1028 if (d->d_flags & D_BIGKEY) 1029 (void) free (tmp); 1030 } 1031 } else if (index <= c->c_index) 1032 c->c_index++; 1033 } 1034 return (RET_SUCCESS); 1035 } 1036 1037 /* 1038 * _BT_CRSRKEY -- Save a copy of the key of the record that the cursor 1039 * is pointing to. The record is about to be deleted. 1040 * 1041 * Parameters: 1042 * t -- btree 1043 * 1044 * Returns: 1045 * RET_SUCCESS, RET_ERROR. 1046 * 1047 * Side Effects: 1048 * Allocates memory for the copy which should be released when 1049 * it is no longer needed. 1050 */ 1051 1052 static int 1053 _bt_crsrkey(t) 1054 BTREE_P t; 1055 { 1056 CURSOR *c; 1057 DATUM *d; 1058 pgno_t pgno; 1059 int ignore; 1060 1061 c = &(t->bt_cursor); 1062 d = (DATUM *) GETDATUM(t->bt_curpage, c->c_index); 1063 1064 if (d->d_flags & D_BIGKEY) { 1065 bcopy(&(d->d_bytes[0]), (char *) &pgno, sizeof(pgno)); 1066 return (_bt_getbig(t, pgno, &(c->c_key), &ignore)); 1067 } else { 1068 if ((c->c_key = (char *) malloc(d->d_ksize)) == (char *) NULL) 1069 return (RET_ERROR); 1070 1071 bcopy(&(d->d_bytes[0]), c->c_key, d->d_ksize); 1072 } 1073 return (RET_SUCCESS); 1074 } 1075 1076 /* 1077 * _BT_GETBIG -- Get big data from indirect pages. 1078 * 1079 * This routine chases indirect blocks for the big object at the 1080 * specified page to a buffer, and return the address of the buffer. 1081 * 1082 * Parameters: 1083 * t -- btree with the indirect blocks 1084 * pgno -- page number that starts the chain 1085 * p -- (char **) to get the address of the buffer containing 1086 * the key or datum. 1087 * sz -- pointer to an int to get the size of the instantiated 1088 * object. 1089 * 1090 * Returns: 1091 * RET_SUCCESS, RET_ERROR. 1092 * 1093 * Side Effects: 1094 * None. 1095 */ 1096 1097 static int 1098 _bt_getbig(t, pgno, p, sz) 1099 BTREE_P t; 1100 pgno_t pgno; 1101 char **p; 1102 int *sz; 1103 { 1104 pgno_t save; 1105 size_t nbytes; 1106 size_t nhere; 1107 BTHEADER *h; 1108 char *top, *from, *where; 1109 1110 save = t->bt_curpage->h_pgno; 1111 if (_bt_getpage(t, pgno) == RET_ERROR) 1112 return (RET_ERROR); 1113 1114 h = t->bt_curpage; 1115 1116 bcopy((char *) &(h->h_linp[0]), (char *) &nbytes, sizeof(nbytes)); 1117 1118 if ((*p = (char *) malloc(nbytes)) == (char *) NULL) 1119 return (RET_ERROR); 1120 1121 *sz = nbytes; 1122 from = ((char *) (&h->h_linp[0])) + sizeof(nbytes); 1123 top = ((char *) h) + t->bt_psize; 1124 1125 /* need more space for data? */ 1126 1127 where = *p; 1128 1129 while (nbytes > 0) { 1130 nhere = (int) (top - from); 1131 if (nhere > nbytes) { 1132 (void) bcopy(from, where, nbytes); 1133 nbytes = 0; 1134 } else { 1135 (void) bcopy(from, where, nhere); 1136 where += nhere; 1137 nbytes -= nhere; 1138 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 1139 return (RET_ERROR); 1140 h = t->bt_curpage; 1141 top = ((char *) h) + t->bt_psize; 1142 from = (char *) &(h->h_linp[0]); 1143 } 1144 } 1145 1146 if (_bt_getpage(t, save) == RET_ERROR) 1147 return (RET_ERROR); 1148 1149 return (RET_SUCCESS); 1150 } 1151 1152 /* 1153 * _BT_DELINDIR -- Delete a chain of indirect blocks from the btree. 1154 * 1155 * When a large item is deleted from the tree, this routine puts the 1156 * space that it occupied onto the free list for later reuse. The 1157 * bt_free entry in the btree structure points at the head of this list. 1158 * This value is also stored on disk in the btree's metadata. 1159 * 1160 * Parameters: 1161 * t -- btree from which to delete pages 1162 * chain -- page number that starts the chain. 1163 * 1164 * Returns: 1165 * RET_SUCCESS, RET_ERROR. 1166 * 1167 * Side Effects: 1168 * Invalidates the current on-disk version of the btree's 1169 * metadata (if any), and updates the free list appropriately. 1170 */ 1171 1172 static int 1173 _bt_delindir(t, chain) 1174 BTREE_P t; 1175 pgno_t chain; 1176 { 1177 BTHEADER *h; 1178 pgno_t save; 1179 pgno_t oldfree; 1180 1181 h = t->bt_curpage; 1182 save = h->h_pgno; 1183 if (_bt_getpage(t, chain) == RET_ERROR) 1184 return (RET_ERROR); 1185 1186 /* 1187 * If some internal node is pointing at this chain, don't 1188 * delete it. 1189 */ 1190 1191 if (t->bt_curpage->h_flags & F_PRESERVE) { 1192 if (_bt_getpage(t, save) == RET_ERROR) 1193 return (RET_ERROR); 1194 return (RET_SUCCESS); 1195 } 1196 1197 /* if there's nothing on the free list, this is easy... */ 1198 if (t->bt_free == P_NONE) { 1199 t->bt_free = chain; 1200 } else { 1201 oldfree = t->bt_free; 1202 1203 /* find the end of the data chain for the deleted datum */ 1204 t->bt_free = chain; 1205 do { 1206 if (_bt_getpage(t, chain) == RET_ERROR) 1207 return (RET_ERROR); 1208 h = t->bt_curpage; 1209 if (h->h_nextpg != P_NONE) 1210 chain = h->h_nextpg; 1211 } while (h->h_nextpg != P_NONE); 1212 1213 /* link freed pages into free list */ 1214 h->h_nextpg = oldfree; 1215 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1216 return (RET_ERROR); 1217 if (_bt_getpage(t, oldfree) == RET_ERROR) 1218 return (RET_ERROR); 1219 h = t->bt_curpage; 1220 h->h_prevpg = chain; 1221 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1222 return (RET_ERROR); 1223 } 1224 1225 /* restore the tree's current page pointer */ 1226 if (_bt_getpage(t, save) == RET_ERROR) 1227 return (RET_ERROR); 1228 1229 /* we have trashed the tree metadata; rewrite it later */ 1230 t->bt_flags &= ~BTF_METAOK; 1231 1232 return (RET_SUCCESS); 1233 } 1234 1235 /* 1236 * _BT_FIRST -- Find the first item in the tree that matches the supplied 1237 * key. 1238 * 1239 * This routine supports deletion. When the user supplies a key to 1240 * be deleted, we find the first one, and iteratively delete all the 1241 * matching ones that follow it. 1242 * 1243 * Parameters: 1244 * t -- btree in which to find first occurrence 1245 * key -- key to find 1246 * 1247 * Returns: 1248 * The BTITEM for the matching item. If there's no match, 1249 * this may point to the first item > than the supplied key, 1250 * or off the end of the page. 1251 * 1252 * Warnings: 1253 * The BTITEM returned is in static space and will be overwritten 1254 * by the next search of any kind in any btree. 1255 */ 1256 1257 static BTITEM * 1258 _bt_first(t, key) 1259 BTREE_P t; 1260 DBT *key; 1261 { 1262 BTHEADER *h; 1263 BTITEM *item; 1264 index_t next; 1265 int r; 1266 1267 /* find any matching item */ 1268 item = _bt_search(t, key); 1269 h = t->bt_curpage; 1270 next = NEXTINDEX(h); 1271 1272 /* if we're off the end of the page, search failed and we're done */ 1273 if (item->bti_index >= next) 1274 return (item); 1275 1276 /* as long as we have an exact match, walk backwards */ 1277 while ((r = _bt_cmp(t, key->data, item->bti_index)) == 0) { 1278 1279 /* at start of page? */ 1280 if (item->bti_index == 0) { 1281 1282 /* if no prev page, we're done */ 1283 if (h->h_prevpg == P_NONE) 1284 return (item); 1285 1286 /* walk backward, skipping empty pages */ 1287 do { 1288 if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 1289 return ((BTITEM *) NULL); 1290 h = t->bt_curpage; 1291 } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE); 1292 1293 if (NEXTINDEX(h) != 0) 1294 item->bti_index = NEXTINDEX(h) - 1; 1295 else 1296 item->bti_index = 0; 1297 1298 item->bti_pgno = h->h_pgno; 1299 } else { 1300 item->bti_index--; 1301 } 1302 } 1303 1304 /* if we went too far backwards, step forward one entry */ 1305 if (r > 0) { 1306 if (++(item->bti_index) >= NEXTINDEX(h) 1307 && h->h_nextpg != P_NONE) { 1308 1309 /* walk forward, skipping empty pages */ 1310 do { 1311 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 1312 return ((BTITEM *) NULL); 1313 h = t->bt_curpage; 1314 } while (h->h_nextpg != P_NONE && NEXTINDEX(h) == 0); 1315 1316 item->bti_index = 0; 1317 item->bti_pgno = h->h_pgno; 1318 } 1319 } 1320 1321 /* got it */ 1322 return (item); 1323 } 1324 1325 /* 1326 * _BT_SEARCH, _BT_SEARCHR -- Search for a particular key in the tree. 1327 * 1328 * Parameters: 1329 * t -- btree in which to search 1330 * key -- key to find 1331 * 1332 * Returns: 1333 * BTITEM for matching item, if any, or the BTITEM for the 1334 * location of the key, if it were in the tree. 1335 * 1336 * Warnings: 1337 * The BTITEM returned is in static memory, and will be 1338 * overwritten by the next search of any kind in any tree. 1339 */ 1340 1341 static BTITEM * 1342 _bt_search(t, key) 1343 BTREE_P t; 1344 DBT *key; 1345 { 1346 /* we want to start all of our searches at the root */ 1347 if (_bt_getpage(t, P_ROOT) == RET_ERROR) 1348 return ((BTITEM *) NULL); 1349 1350 return (_bt_searchr(t, key)); 1351 } 1352 1353 static BTITEM * 1354 _bt_searchr(t, key) 1355 BTREE_P t; 1356 DBT *key; 1357 { 1358 BTHEADER *h = t->bt_curpage; 1359 index_t index; 1360 IDATUM *id; 1361 DATUM *d; 1362 static BTITEM item; 1363 1364 /* do a binary search on the current page */ 1365 index = _bt_binsrch(t, &(key->data[0])); 1366 1367 /* 1368 * At this point, the binary search terminated because the endpoints 1369 * got too close together, or we have a match. Figure out which 1370 * case applies and decide what to do based on the page type. 1371 */ 1372 if (h->h_flags & F_LEAF) { 1373 item.bti_pgno = h->h_pgno; 1374 item.bti_index = index; 1375 if (index < NEXTINDEX(h)) 1376 d = (DATUM *) GETDATUM(h,index); 1377 else 1378 d = (DATUM *) NULL; 1379 1380 item.bti_datum = d; 1381 return(&item); 1382 } else { 1383 id = (IDATUM *) GETDATUM(h, index); 1384 _bt_push(t, h->h_pgno); 1385 if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 1386 return ((BTITEM *) NULL); 1387 return (_bt_searchr(t, key)); 1388 } 1389 } 1390 1391 /* 1392 * BT_GETPAGE -- Make pgno the current page of the btree. 1393 * 1394 * This routine is just a wrapper that decides whether to call the 1395 * memory or disk-based routine to do the work. 1396 * 1397 * Parameters: 1398 * t -- btree in which to get page 1399 * pgno -- page number to get 1400 * 1401 * Returns: 1402 * RET_SUCCESS or RET_ERROR. 1403 */ 1404 1405 static int 1406 _bt_getpage(t, pgno) 1407 BTREE_P t; 1408 pgno_t pgno; 1409 { 1410 #ifdef DEBUG 1411 if (pgno == P_NONE) 1412 _punt(); 1413 #endif /* DEBUG */ 1414 1415 /* see if we can get away without doing any work */ 1416 if (t->bt_curpage != (BTHEADER *) NULL) { 1417 if (t->bt_curpage->h_pgno == pgno) 1418 return (RET_SUCCESS); 1419 } 1420 1421 if (t->bt_fname == (char *) NULL) 1422 return (_bt_getmpage(t, pgno)); 1423 else 1424 return (_bt_getdpage(t, pgno)); 1425 } 1426 1427 /* 1428 * _BT_GETMPAGE -- Make pgno the current page of the btree. 1429 * 1430 * This routine gets pages for in-memory btrees. 1431 * 1432 * Parameters: 1433 * t -- btree in which to get page 1434 * pgno -- page number to get 1435 * 1436 * Returns: 1437 * RET_SUCCESS or RET_ERROR. 1438 */ 1439 1440 static int 1441 _bt_getmpage(t, pgno) 1442 register BTREE_P t; 1443 pgno_t pgno; 1444 { 1445 int htindex; 1446 int nbytes; 1447 BTHEADER *h; 1448 HTBUCKET *b; 1449 1450 if (t->bt_curpage == (BTHEADER *) NULL) { 1451 if (pgno != P_ROOT) { 1452 errno = EBADF; 1453 return (RET_ERROR); 1454 } 1455 1456 t->bt_npages++; 1457 h = (BTHEADER *) malloc((unsigned) t->bt_psize); 1458 if (h == (BTHEADER *) NULL) 1459 return (RET_ERROR); 1460 1461 h->h_pgno = P_ROOT; 1462 h->h_flags = F_LEAF; 1463 h->h_lower = (index_t) 1464 (((char *) &(h->h_linp[0])) - ((char *) h)); 1465 h->h_upper = t->bt_psize; 1466 h->h_prevpg = h->h_nextpg = P_NONE; 1467 1468 t->bt_curpage = h; 1469 1470 /* get the root page into the hash table */ 1471 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1472 return (RET_ERROR); 1473 } 1474 1475 htindex = HASHKEY(pgno); 1476 1477 for (b = t->bt_s.bt_ht[htindex]; 1478 b != (HTBUCKET *) NULL; 1479 b = b->ht_next) { 1480 if (b->ht_pgno == pgno) { 1481 t->bt_curpage = b->ht_page; 1482 return (RET_SUCCESS); 1483 } 1484 } 1485 return (RET_ERROR); 1486 } 1487 1488 /* 1489 * _BT_GETDPAGE -- Make pgno the current page of the btree. 1490 * 1491 * This routine gets pages for disk btrees. 1492 * 1493 * Because disk btree pages must be readable across machine architectures, 1494 * the btree code writes integers out in network format. This routine 1495 * converts them back to host format before returning the page. 1496 * 1497 * Parameters: 1498 * t -- btree in which to get page 1499 * pgno -- page number to get 1500 * 1501 * Returns: 1502 * RET_SUCCESS, RET_ERROR. 1503 */ 1504 1505 static int 1506 _bt_getdpage(t, pgno) 1507 register BTREE_P t; 1508 pgno_t pgno; 1509 { 1510 BTHEADER *h; 1511 char *cache; 1512 long pos; 1513 int n, nbytes; 1514 1515 /* if we have an lru cache, let the cache code do the work */ 1516 if (ISCACHE(t)) { 1517 cache = t->bt_s.bt_d.d_cache; 1518 1519 /* release the old page */ 1520 if (t->bt_curpage != (BTHEADER *) NULL) { 1521 pgno_t opgno = t->bt_curpage->h_pgno; 1522 t->bt_curpage->h_flags &= ~F_DIRTY; 1523 1524 if (lruwrite(cache, opgno) < 0) 1525 return (RET_ERROR); 1526 1527 lrurelease(cache, opgno); 1528 } 1529 1530 if (pgno > t->bt_npages) { 1531 if ((h = (BTHEADER *) lrugetnew(cache, pgno, &nbytes)) 1532 == (BTHEADER *) NULL) 1533 return (RET_ERROR); 1534 t->bt_npages = pgno; 1535 } else { 1536 if ((h = (BTHEADER *) lruget(cache, pgno, &nbytes)) 1537 == (BTHEADER *) NULL) 1538 return (RET_ERROR); 1539 } 1540 1541 /* init this page, if necessary */ 1542 if (nbytes == 0) { 1543 h->h_pgno = pgno; 1544 h->h_flags = F_LEAF; 1545 h->h_lower = (index_t) 1546 (((char *) &(h->h_linp[0])) - ((char *) h)); 1547 h->h_upper = t->bt_psize; 1548 h->h_prevpg = h->h_nextpg = P_NONE; 1549 } 1550 1551 t->bt_curpage = h; 1552 1553 return (RET_SUCCESS); 1554 } 1555 1556 /* sync the current page, if necessary */ 1557 if (t->bt_curpage != (BTHEADER *) NULL) { 1558 if (t->bt_curpage->h_flags & F_DIRTY) 1559 if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR) 1560 return (RET_ERROR); 1561 } else { 1562 if (t->bt_npages == 0) 1563 t->bt_npages = 1; 1564 1565 /* if no current page, get space for one */ 1566 if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize)) 1567 == (BTHEADER *) NULL) { 1568 return (RET_ERROR); 1569 } 1570 } 1571 1572 n = t->bt_psize; 1573 pos = (long) (pgno * n); 1574 1575 /* seek to correct location in file */ 1576 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) { 1577 return (RET_ERROR); 1578 } 1579 1580 /* read the page */ 1581 if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) { 1582 1583 /* 1584 * If we didn't get a full page, we must have gotten no 1585 * data at all -- in which case we're trying to read a 1586 * root page that doesn't exist yet. This is the only 1587 * case in which this is okay. If this happens, construct 1588 * an empty root page by hand. 1589 */ 1590 if (nbytes != 0 || pgno != P_ROOT) { 1591 errno = EBADF; 1592 return (RET_ERROR); 1593 } 1594 1595 h = (BTHEADER *) t->bt_curpage; 1596 h->h_pgno = pgno; 1597 h->h_flags = F_LEAF; 1598 h->h_lower = (index_t) 1599 (((char *) &(h->h_linp[0])) - ((char *) h)); 1600 h->h_upper = t->bt_psize; 1601 h->h_prevpg = h->h_nextpg = P_NONE; 1602 } else 1603 (void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder); 1604 1605 return (RET_SUCCESS); 1606 } 1607 1608 /* 1609 * _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from 1610 * the host-independent format stored on disk. 1611 * 1612 * Parameters: 1613 * h -- page to convert 1614 * _lorder -- byte order for pages (stored as a char * in the 1615 * cache, and passed around as a magic cookie). 1616 * 1617 * Returns: 1618 * RET_SUCCESS (lru code requires a return value). 1619 * 1620 * Side Effects: 1621 * Layout of tree metadata on the page is changed in place. 1622 * 1623 * Warnings: 1624 * Everywhere else in the code, the types pgno_t and index_t 1625 * are opaque. These two routines know what they really 1626 * are. 1627 */ 1628 1629 static int 1630 _bt_pgout(h, _lorder) 1631 BTHEADER *h; 1632 char *_lorder; 1633 { 1634 int i; 1635 int top; 1636 int lorder; 1637 DATUM *d; 1638 IDATUM *id; 1639 size_t chain; 1640 1641 lorder = (int) _lorder; 1642 if (lorder == BYTE_ORDER) 1643 return (RET_SUCCESS); 1644 1645 if (h->h_flags & F_LEAF) { 1646 if (h->h_flags & F_CONT) { 1647 if (h->h_prevpg == P_NONE) { 1648 size_t longsz; 1649 1650 (void) bcopy((char *) &(h->h_linp[0]), 1651 (char *) &longsz, 1652 sizeof(longsz)); 1653 BLSWAP(longsz); 1654 (void) bcopy((char *) &longsz, 1655 (char *) &(h->h_linp[0]), 1656 sizeof(longsz)); 1657 } 1658 } else { 1659 top = NEXTINDEX(h); 1660 for (i = 0; i < top; i++) { 1661 d = (DATUM *) GETDATUM(h, i); 1662 if (d->d_flags & D_BIGKEY) { 1663 (void) bcopy((char *) &(d->d_bytes[0]), 1664 (char *) &chain, 1665 sizeof(chain)); 1666 BLSWAP(chain); 1667 (void) bcopy((char *) &chain, 1668 (char *) &(d->d_bytes[0]), 1669 sizeof(chain)); 1670 } 1671 if (d->d_flags & D_BIGDATA) { 1672 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 1673 (char *) &chain, 1674 sizeof(chain)); 1675 BLSWAP(chain); 1676 (void) bcopy((char *) &chain, 1677 (char *) &(d->d_bytes[d->d_ksize]), 1678 sizeof(chain)); 1679 } 1680 BLSWAP(d->d_dsize); 1681 BLSWAP(d->d_ksize); 1682 BLSWAP(d->d_flags); 1683 BLSWAP(h->h_linp[i]); 1684 } 1685 } 1686 } else { 1687 top = NEXTINDEX(h); 1688 for (i = 0; i < top; i++) { 1689 id = (IDATUM *) GETDATUM(h, i); 1690 BLSWAP(id->i_size); 1691 BLSWAP(id->i_pgno); 1692 BLSWAP(id->i_flags); 1693 if (id->i_flags & D_BIGKEY) { 1694 (void) bcopy((char *) &(id->i_bytes[0]), 1695 (char *) &chain, 1696 sizeof(chain)); 1697 BLSWAP(chain); 1698 (void) bcopy((char *) &chain, 1699 (char *) &(id->i_bytes[0]), 1700 sizeof(chain)); 1701 } 1702 BLSWAP(h->h_linp[i]); 1703 } 1704 } 1705 BLSWAP(h->h_flags); 1706 BLSWAP(h->h_pgno); 1707 BLSWAP(h->h_prevpg); 1708 BLSWAP(h->h_nextpg); 1709 BLSWAP(h->h_lower); 1710 BLSWAP(h->h_upper); 1711 1712 return (RET_SUCCESS); 1713 } 1714 1715 static int 1716 _bt_pgin(h, _lorder) 1717 BTHEADER *h; 1718 char *_lorder; 1719 { 1720 int i; 1721 int top; 1722 int lorder; 1723 DATUM *d; 1724 IDATUM *id; 1725 size_t chain; 1726 1727 /* 1728 * If btree pages are to be stored in the host byte order, don't 1729 * bother swapping. 1730 */ 1731 lorder = (int) _lorder; 1732 if (lorder == BYTE_ORDER) 1733 return (RET_SUCCESS); 1734 1735 BLSWAP(h->h_upper); 1736 BLSWAP(h->h_lower); 1737 BLSWAP(h->h_nextpg); 1738 BLSWAP(h->h_prevpg); 1739 BLSWAP(h->h_pgno); 1740 BLSWAP(h->h_flags); 1741 1742 if (h->h_flags & F_LEAF) { 1743 if (h->h_flags & F_CONT) { 1744 if (h->h_prevpg == P_NONE) { 1745 size_t longsz; 1746 1747 (void) bcopy((char *) &(h->h_linp[0]), 1748 (char *) &longsz, 1749 sizeof(longsz)); 1750 BLSWAP(longsz); 1751 (void) bcopy((char *) &longsz, 1752 (char *) &(h->h_linp[0]), 1753 sizeof(longsz)); 1754 } 1755 } else { 1756 top = NEXTINDEX(h); 1757 for (i = 0; i < top; i++) { 1758 BLSWAP(h->h_linp[i]); 1759 d = (DATUM *) GETDATUM(h, i); 1760 BLSWAP(d->d_dsize); 1761 BLSWAP(d->d_ksize); 1762 BLSWAP(d->d_flags); 1763 if (d->d_flags & D_BIGKEY) { 1764 (void) bcopy((char *) &(d->d_bytes[0]), 1765 (char *) &chain, 1766 sizeof(chain)); 1767 BLSWAP(chain); 1768 (void) bcopy((char *) &chain, 1769 (char *) &(d->d_bytes[0]), 1770 sizeof(chain)); 1771 } 1772 if (d->d_flags & D_BIGDATA) { 1773 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 1774 (char *) &chain, 1775 sizeof(chain)); 1776 BLSWAP(chain); 1777 (void) bcopy((char *) &chain, 1778 (char *) &(d->d_bytes[d->d_ksize]), 1779 sizeof(chain)); 1780 } 1781 } 1782 } 1783 } else { 1784 top = NEXTINDEX(h); 1785 for (i = 0; i < top; i++) { 1786 BLSWAP(h->h_linp[i]); 1787 id = (IDATUM *) GETDATUM(h, i); 1788 BLSWAP(id->i_size); 1789 BLSWAP(id->i_pgno); 1790 BLSWAP(id->i_flags); 1791 if (id->i_flags & D_BIGKEY) { 1792 (void) bcopy((char *) &(id->i_bytes[0]), 1793 (char *) &chain, 1794 sizeof(chain)); 1795 BLSWAP(chain); 1796 (void) bcopy((char *) &chain, 1797 (char *) &(id->i_bytes[0]), 1798 sizeof(chain)); 1799 } 1800 } 1801 } 1802 return (RET_SUCCESS); 1803 } 1804 1805 /* 1806 * BT_SYNC -- sync the btree to disk. 1807 * 1808 * Parameters: 1809 * tree -- btree to sync. 1810 * 1811 * Returns: 1812 * RET_SUCCESS, RET_ERROR. 1813 */ 1814 1815 bt_sync(tree) 1816 BTREE tree; 1817 { 1818 BTREE_P t = (BTREE_P) tree; 1819 BTHEADER *h; 1820 pgno_t pgno; 1821 1822 /* if this is an in-memory btree, syncing is a no-op */ 1823 if (!ISDISK(t)) 1824 return (RET_SUCCESS); 1825 1826 h = (BTHEADER *) t->bt_curpage; 1827 h->h_flags &= ~F_DIRTY; 1828 1829 if (ISCACHE(t)) { 1830 pgno = t->bt_curpage->h_pgno; 1831 if (_bt_write(t, h, RELEASE) == RET_ERROR) 1832 return(RET_ERROR); 1833 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 1834 return (RET_ERROR); 1835 if (_bt_getpage(t, pgno) == RET_ERROR) 1836 return (RET_ERROR); 1837 } else { 1838 if (_bt_write(t, h, NORELEASE) == RET_ERROR) 1839 return (RET_ERROR); 1840 } 1841 1842 return (fsync(t->bt_s.bt_d.d_fd)); 1843 } 1844 1845 /* 1846 * _BT_INSERT -- Insert a new user datum in the btree. 1847 * 1848 * This routine is called by bt_put, the public interface, once the 1849 * location for the new item is known. We do the work here, and 1850 * handle splits if necessary. 1851 * 1852 * Parameters: 1853 * t -- btree in which to do the insertion. 1854 * item -- BTITEM describing location of new datum 1855 * key -- key to insert 1856 * data -- data associated with key 1857 * flag -- magic cookie passed recursively to bt_put if we 1858 * have to do a split 1859 * 1860 * Returns: 1861 * RET_SUCCESS, RET_ERROR. 1862 */ 1863 1864 static int 1865 _bt_insert(t, item, key, data, flag) 1866 BTREE_P t; 1867 BTITEM *item; 1868 DBT *key; 1869 DBT *data; 1870 int flag; 1871 { 1872 index_t index; 1873 BTHEADER *h; 1874 DATUM *d; 1875 int nbytes; 1876 int status; 1877 pgno_t pgno; 1878 int keysize, datasize; 1879 int bigkey, bigdata; 1880 1881 if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1882 return (RET_ERROR); 1883 h = t->bt_curpage; 1884 1885 if (TOOBIG(t, data->size)) { 1886 bigdata = TRUE; 1887 datasize = sizeof(pgno_t); 1888 } else { 1889 bigdata = FALSE; 1890 datasize = data->size; 1891 } 1892 1893 if (TOOBIG(t, key->size)) { 1894 bigkey = TRUE; 1895 keysize = sizeof(pgno_t); 1896 } else { 1897 bigkey = FALSE; 1898 keysize = key->size; 1899 } 1900 1901 nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char)); 1902 nbytes = LONGALIGN(nbytes) + sizeof(index_t); 1903 1904 /* if there's not enough room here, split the page */ 1905 if ((h->h_upper - h->h_lower) < nbytes) { 1906 if (_bt_split(t) == RET_ERROR) 1907 return (RET_ERROR); 1908 1909 /* okay, try again */ 1910 return (bt_put(t, key, data, flag)); 1911 } 1912 1913 /* put together a leaf page datum from the key/data pair */ 1914 index = item->bti_index; 1915 nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char)); 1916 1917 if ((d = (DATUM *) malloc((unsigned) nbytes)) == (DATUM *) NULL) 1918 return (RET_ERROR); 1919 1920 d->d_ksize = keysize; 1921 d->d_dsize = datasize; 1922 d->d_flags = 0; 1923 1924 if (bigkey) { 1925 if (_bt_indirect(t, key, &pgno) == RET_ERROR) 1926 return (RET_ERROR); 1927 (void) bcopy((char *) &pgno, &(d->d_bytes[0]), sizeof(pgno)); 1928 d->d_flags |= D_BIGKEY; 1929 if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1930 return (RET_ERROR); 1931 } else { 1932 if (d->d_ksize > 0) { 1933 (void) bcopy((char *) key->data, 1934 (char *) &(d->d_bytes[0]), 1935 (int) d->d_ksize); 1936 } 1937 } 1938 1939 if (bigdata) { 1940 if (_bt_indirect(t, data, &pgno) == RET_ERROR) 1941 return (RET_ERROR); 1942 (void) bcopy((char *) &pgno, 1943 &(d->d_bytes[keysize]), 1944 sizeof(pgno)); 1945 d->d_flags |= D_BIGDATA; 1946 if (_bt_getpage(t, item->bti_pgno) == RET_ERROR) 1947 return (RET_ERROR); 1948 } else { 1949 if (d->d_dsize > 0) { 1950 (void) bcopy((char *) data->data, 1951 (char *) &(d->d_bytes[keysize]), 1952 (int) d->d_dsize); 1953 } 1954 } 1955 1956 /* do the insertion */ 1957 status = _bt_insertat(t, d, index); 1958 1959 (void) free((char *) d); 1960 1961 return (status); 1962 } 1963 1964 /* 1965 * _BT_INDIRECT -- Write a series of indirect pages for big objects. 1966 * 1967 * A chain of indirect pages looks like 1968 * 1969 * +-------------------+ +---------------------+ 1970 * |hdr|size| | |hdr| | 1971 * +---+----+ | +---+ | 1972 * | ... data ... | | ... data ... | ... 1973 * | | | | 1974 * +-------------------+ +---------------------+ 1975 * 1976 * where hdr is a standard btree page header (with the indirect bit 1977 * set), size on the first page is the real size of the datum, and 1978 * data are bytes of the datum, split across as many pages as necessary. 1979 * Indirect pages are chained together with the h_prevpg and h_nextpg 1980 * entries of the page header struct. 1981 * 1982 * A single DBT is written per chain, so space on the last page is 1983 * wasted. 1984 * 1985 * We return the page number of the start of the chain. 1986 * 1987 * When a big object is deleted from a tree, the space that it occupied 1988 * is placed on a free list for later reuse. This routine checks that 1989 * free list before allocating new pages to the big datum being inserted. 1990 * 1991 * Parameters: 1992 * t -- btree in which to store indirect blocks 1993 * data -- DBT with the big datum in it 1994 * pgno -- place to put the starting page number of the chain 1995 * 1996 * Returns: 1997 * RET_SUCCESS, RET_ERROR. 1998 * 1999 * Side Effects: 2000 * Current page is changed on return. 2001 */ 2002 2003 static int 2004 _bt_indirect(t, data, pgno) 2005 BTREE_P t; 2006 DBT *data; 2007 pgno_t *pgno; 2008 { 2009 pgno_t save; 2010 pgno_t prev; 2011 char *top; 2012 char *where; 2013 char *from; 2014 size_t dsize; 2015 pgno_t nextchn; 2016 int ischain; 2017 BTHEADER *cur; 2018 2019 /* get set for first page in chain */ 2020 prev = P_NONE; 2021 dsize = data->size; 2022 from = (char *) data->data; 2023 2024 /* if there are blocks on the free list, use them first */ 2025 if ((*pgno = t->bt_free) == P_NONE) { 2026 if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL) 2027 return (RET_ERROR); 2028 2029 ischain = 0; 2030 *pgno = cur->h_pgno = ++(t->bt_npages); 2031 } else { 2032 if (_bt_getpage(t, *pgno) == RET_ERROR) 2033 return (RET_ERROR); 2034 ischain = 1; 2035 cur = t->bt_curpage; 2036 } 2037 2038 cur->h_flags = F_CONT|F_LEAF; 2039 (void) bcopy((char *) &dsize, (char *) &cur->h_linp[0], sizeof(size_t)); 2040 where = ((char *) (&cur->h_linp[0])) + sizeof(size_t); 2041 2042 /* fill and write pages in the chain */ 2043 for (;;) { 2044 int nhere; 2045 2046 top = ((char *) cur) + t->bt_psize; 2047 cur->h_prevpg = prev; 2048 nextchn = cur->h_nextpg; 2049 nhere = (int) (top - where); 2050 2051 if (nhere >= dsize) { 2052 (void) bcopy(from, where, (int) dsize); 2053 cur->h_nextpg = P_NONE; 2054 dsize = 0; 2055 } else { 2056 (void) bcopy(from, where, (int) nhere); 2057 dsize -= nhere; 2058 from += nhere; 2059 if (nextchn == P_NONE) 2060 cur->h_nextpg = t->bt_npages + 1; 2061 prev = cur->h_pgno; 2062 } 2063 2064 /* current page is ready to go; write it out */ 2065 if (_bt_write(t, cur, RELEASE) == RET_ERROR) 2066 return (RET_ERROR); 2067 2068 /* free the current page, if appropriate */ 2069 if (ISDISK(t) && !ISCACHE(t) && !ischain) { 2070 (void) free ((char *) cur); 2071 } 2072 2073 /* done? */ 2074 if (dsize == 0) 2075 break; 2076 2077 /* allocate another page */ 2078 if (nextchn == P_NONE) { 2079 if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL) 2080 return (RET_ERROR); 2081 ischain = 0; 2082 cur->h_pgno = ++(t->bt_npages); 2083 } else { 2084 if (_bt_getpage(t, nextchn) == RET_ERROR) 2085 return (RET_ERROR); 2086 ischain = 1; 2087 cur = t->bt_curpage; 2088 } 2089 cur->h_flags = F_LEAF | F_CONT; 2090 2091 where = (char *) (&cur->h_linp[0]); 2092 } 2093 2094 /* if we used pages from the free list, record changes to it */ 2095 if (*pgno == t->bt_free) { 2096 t->bt_free = nextchn; 2097 t->bt_flags &= ~BTF_METAOK; 2098 } 2099 2100 return (RET_SUCCESS); 2101 } 2102 2103 /* 2104 * _BT_SPLIT -- Split a page into two pages. 2105 * 2106 * Splits are caused by insertions, and propogate up the tree in 2107 * the usual way. The root page is always page 1 in the file on 2108 * disk, so root splits are handled specially. On entry to this 2109 * routine, t->bt_curpage is the page to be split. 2110 * 2111 * Parameters: 2112 * t -- btree in which to do split. 2113 * 2114 * Returns: 2115 * RET_SUCCESS, RET_ERROR. 2116 * 2117 * Side Effects: 2118 * Changes the notion of the current page. 2119 */ 2120 2121 static 2122 _bt_split(t) 2123 BTREE_P t; 2124 { 2125 BTHEADER *h; 2126 BTHEADER *left, *right; 2127 BTHEADER *next; 2128 pgno_t nextpgno, parent; 2129 int nbytes, len; 2130 IDATUM *id; 2131 DATUM *d; 2132 char *src; 2133 IDATUM *new; 2134 pgno_t oldchain; 2135 u_char flags; 2136 2137 h = (BTHEADER *) t->bt_curpage; 2138 2139 /* split root page specially, since it must remain page 1 */ 2140 if (h->h_pgno == P_ROOT) { 2141 return (_bt_splitroot(t)); 2142 } 2143 2144 /* 2145 * This is a little complicated. We go to some trouble to 2146 * figure out which of the three possible cases -- in-memory tree, 2147 * disk tree (no cache), and disk tree (cache) -- we have, in order 2148 * to avoid unnecessary copying. If we have a disk cache, then we 2149 * have to do some extra copying, though, since the cache code 2150 * manages buffers externally to this code. 2151 */ 2152 2153 if (ISDISK(t) && ISCACHE(t)) { 2154 if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize)) 2155 == (BTHEADER *) NULL) 2156 return (RET_ERROR); 2157 left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE; 2158 left->h_flags = t->bt_curpage->h_flags; 2159 left->h_lower = (index_t) 2160 (((char *) &(left->h_linp[0])) - ((char *) left)); 2161 left->h_upper = t->bt_psize; 2162 2163 } else { 2164 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 2165 return (RET_ERROR); 2166 } 2167 left->h_pgno = h->h_pgno; 2168 2169 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 2170 return (RET_ERROR); 2171 right->h_pgno = ++(t->bt_npages); 2172 2173 /* now do the split */ 2174 if (_bt_dopsplit(t, left, right) == RET_ERROR) 2175 return (RET_ERROR); 2176 2177 right->h_prevpg = left->h_pgno; 2178 nextpgno = right->h_nextpg = h->h_nextpg; 2179 left->h_nextpg = right->h_pgno; 2180 left->h_prevpg = h->h_prevpg; 2181 2182 /* okay, now use the left half of the page as the new page */ 2183 if (ISDISK(t) && ISCACHE(t)) { 2184 (void) bcopy((char *) left, (char *) t->bt_curpage, 2185 (int) t->bt_psize); 2186 (void) free ((char *) left); 2187 left = t->bt_curpage; 2188 } else { 2189 (void) free((char *) t->bt_curpage); 2190 t->bt_curpage = left; 2191 } 2192 2193 /* 2194 * Write the new pages out. We need them to stay where they are 2195 * until we're done updating the parent pages. 2196 */ 2197 2198 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 2199 return (RET_ERROR); 2200 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 2201 return (RET_ERROR); 2202 2203 /* update 'prev' pointer of old neighbor of left */ 2204 if (nextpgno != P_NONE) { 2205 if (_bt_getpage(t, nextpgno) == RET_ERROR) 2206 return (RET_ERROR); 2207 h = t->bt_curpage; 2208 h->h_prevpg = right->h_pgno; 2209 h->h_flags |= F_DIRTY; 2210 } 2211 2212 if ((parent = _bt_pop(t)) != P_NONE) { 2213 if (right->h_flags & F_LEAF) { 2214 d = (DATUM *) GETDATUM(right, 0); 2215 len = d->d_ksize; 2216 if (d->d_flags & D_BIGKEY) { 2217 bcopy(&(d->d_bytes[0]), 2218 (char *) &oldchain, 2219 sizeof(oldchain)); 2220 if (_bt_markchain(t, oldchain) == RET_ERROR) 2221 return (RET_ERROR); 2222 src = (char *) &oldchain; 2223 flags = D_BIGKEY; 2224 } else { 2225 src = (char *) &(d->d_bytes[0]); 2226 flags = 0; 2227 } 2228 } else { 2229 id = (IDATUM *) GETDATUM(right, 0); 2230 len = id->i_size; 2231 flags = id->i_flags; 2232 src = (char *) &(id->i_bytes[0]); 2233 } 2234 nbytes = len + (sizeof(IDATUM) - sizeof(char)); 2235 new = (IDATUM *) malloc((unsigned) nbytes); 2236 if (new == (IDATUM *) NULL) 2237 return (RET_ERROR); 2238 new->i_size = len; 2239 new->i_pgno = right->h_pgno; 2240 new->i_flags = flags; 2241 if (len > 0) 2242 (void) bcopy(src, (char *) &(new->i_bytes[0]), len); 2243 2244 nbytes = LONGALIGN(nbytes) + sizeof(index_t); 2245 if (_bt_getpage(t, parent) == RET_ERROR) 2246 return (RET_ERROR); 2247 2248 h = t->bt_curpage; 2249 2250 /* 2251 * Split the parent if we need to, then reposition the 2252 * tree's current page pointer for the new datum. 2253 */ 2254 if ((h->h_upper - h->h_lower) < nbytes) { 2255 if (_bt_split(t) == RET_ERROR) 2256 return (RET_ERROR); 2257 if (_bt_reposition(t, new, parent, right->h_prevpg) 2258 == RET_ERROR) 2259 return (RET_ERROR); 2260 } 2261 2262 /* okay, now insert the new idatum */ 2263 if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR) 2264 return (RET_ERROR); 2265 } 2266 2267 /* 2268 * Okay, split is done; don't need right page stapled down anymore. 2269 * The page we call 'left' above is the new version of the old 2270 * (split) page, so we can't release it. 2271 */ 2272 2273 if (_bt_release(t, right) == RET_ERROR) 2274 return (RET_ERROR); 2275 if (ISDISK(t) && !ISCACHE(t)) 2276 (void) free((char *) right); 2277 2278 return (RET_SUCCESS); 2279 } 2280 2281 /* 2282 * _BT_MARKCHAIN -- Mark a chain of pages as used by an internal node. 2283 * 2284 * Chains of indirect blocks pointed to by leaf nodes get reclaimed 2285 * when the item that points to them gets deleted. Chains pointed 2286 * to by internal nodes never get deleted. This routine marks a 2287 * chain as pointed to by an internal node. 2288 * 2289 * Parameters: 2290 * t -- tree in which to mark 2291 * chain -- number of first page in chain 2292 * 2293 * Returns: 2294 * RET_SUCCESS, RET_ERROR. 2295 * 2296 * Side Effects: 2297 * None. 2298 */ 2299 2300 static int 2301 _bt_markchain(t, chain) 2302 BTREE_P t; 2303 pgno_t chain; 2304 { 2305 pgno_t save; 2306 2307 save = t->bt_curpage->h_pgno; 2308 2309 if (_bt_getpage(t, chain) == RET_ERROR) 2310 return (RET_ERROR); 2311 2312 t->bt_curpage->h_flags |= (F_DIRTY|F_PRESERVE); 2313 2314 if (_bt_getpage(t, save) == RET_ERROR) 2315 return (RET_ERROR); 2316 2317 return (RET_SUCCESS); 2318 } 2319 2320 /* 2321 * _BT_REPOSITION -- Reposition the current page pointer of a btree. 2322 * 2323 * After splitting a node in the tree in order to make room for 2324 * an insertion, we need to figure out which page after the split 2325 * should get the item we want to insert. This routine positions 2326 * the tree's current page pointer appropriately. 2327 * 2328 * Parameters: 2329 * t -- tree to position 2330 * new -- the item we want to insert 2331 * parent -- parent of the node that we just split 2332 * prev -- page number of item directly to the left of 2333 * new's position in the tree. 2334 * 2335 * Returns: 2336 * RET_SUCCESS, RET_ERROR. 2337 * 2338 * Side Effects: 2339 * None. 2340 */ 2341 2342 static int 2343 _bt_reposition(t, new, parent, prev) 2344 BTREE_P t; 2345 IDATUM *new; 2346 pgno_t parent; 2347 pgno_t prev; 2348 { 2349 index_t i, next; 2350 IDATUM *idx; 2351 2352 if (parent == P_ROOT) { 2353 2354 /* 2355 * If we just split the root page, then there are guaranteed 2356 * to be exactly two IDATUMs on it. Look at both of them 2357 * to see if they point to the page that we want. 2358 */ 2359 2360 if (_bt_getpage(t, parent) == RET_ERROR) 2361 return (RET_ERROR); 2362 2363 next = NEXTINDEX(t->bt_curpage); 2364 for (i = 0; i < next; i++) { 2365 idx = (IDATUM *) GETDATUM(t->bt_curpage, i); 2366 if (_bt_getpage(t, idx->i_pgno) == RET_ERROR) 2367 return (RET_ERROR); 2368 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2369 return (RET_SUCCESS); 2370 if (_bt_getpage(t, parent) == RET_ERROR) 2371 return (RET_ERROR); 2372 } 2373 } else { 2374 2375 /* 2376 * Get the parent page -- which is where the new item would 2377 * have gone -- and figure out whether the new item now goes 2378 * on the parent, or the page immediately to the right of 2379 * the parent. 2380 */ 2381 2382 if (_bt_getpage(t, parent) == RET_ERROR) 2383 return (RET_ERROR); 2384 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2385 return (RET_SUCCESS); 2386 if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR) 2387 return (RET_ERROR); 2388 if (_bt_isonpage(t, new, prev) == RET_SUCCESS) 2389 return (RET_SUCCESS); 2390 } 2391 return (RET_ERROR); 2392 } 2393 2394 /* 2395 * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page? 2396 * 2397 * This routine is used by _bt_reposition to decide whether the current 2398 * page is the correct one on which to insert a new item. 2399 * 2400 * Parameters: 2401 * t -- tree to check 2402 * new -- the item that will be inserted (used for binary search) 2403 * prev -- page number of page whose IDATUM is immediately to 2404 * the left of new's position in the tree. 2405 * 2406 * Returns: 2407 * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE). 2408 */ 2409 2410 static int 2411 _bt_isonpage(t, new, prev) 2412 BTREE_P t; 2413 IDATUM *new; 2414 pgno_t prev; 2415 { 2416 BTHEADER *h = (BTHEADER *) t->bt_curpage; 2417 index_t i, next; 2418 IDATUM *idx; 2419 2420 i = _bt_binsrch(t, &(new->i_bytes[0])); 2421 while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0) 2422 --i; 2423 next = NEXTINDEX(h); 2424 idx = (IDATUM *) GETDATUM(h, i); 2425 while (i < next && idx->i_pgno != prev) { 2426 i++; 2427 idx = (IDATUM *) GETDATUM(h, i); 2428 } 2429 if (idx->i_pgno == prev) 2430 return (RET_SUCCESS); 2431 else 2432 return (RET_ERROR); 2433 } 2434 2435 /* 2436 * _BT_SPLITROOT -- Split the root of a btree. 2437 * 2438 * The root page for a btree is always page one. This means that in 2439 * order to split the root, we need to do extra work. 2440 * 2441 * Parameters: 2442 * t -- tree to split 2443 * 2444 * Returns: 2445 * RET_SUCCESS, RET_ERROR. 2446 * 2447 * Side Effects: 2448 * Splits root upward in the usual way, adding two new pages 2449 * to the tree (rather than just one, as in usual splits). 2450 */ 2451 2452 static 2453 _bt_splitroot(t) 2454 BTREE_P t; 2455 { 2456 BTHEADER *h = t->bt_curpage; 2457 BTHEADER *left, *right; 2458 DATUM *d; 2459 IDATUM *id; 2460 char *where; 2461 BTHEADER *where_h; 2462 pgno_t lastpg; 2463 char *src, *dest; 2464 int len, nbytes; 2465 u_long was_leaf; 2466 pgno_t oldchain; 2467 u_char flags; 2468 2469 /* get two new pages for the split */ 2470 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL) 2471 return (RET_ERROR); 2472 left->h_pgno = ++(t->bt_npages); 2473 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL) 2474 return (RET_ERROR); 2475 right->h_pgno = ++(t->bt_npages); 2476 2477 /* do the split */ 2478 if (_bt_dopsplit(t, left, right) == RET_ERROR) 2479 return (RET_ERROR); 2480 2481 /* connect the new pages correctly */ 2482 right->h_prevpg = left->h_pgno; 2483 left->h_nextpg = right->h_pgno; 2484 2485 /* 2486 * Write the child pages out now. We need them to remain 2487 * where they are until we finish updating parent pages, 2488 * however. 2489 */ 2490 2491 if (_bt_write(t, left, NORELEASE) == RET_ERROR) 2492 return (RET_ERROR); 2493 if (_bt_write(t, right, NORELEASE) == RET_ERROR) 2494 return (RET_ERROR); 2495 2496 /* now change the root page into an internal page */ 2497 was_leaf = (h->h_flags & F_LEAF); 2498 h->h_flags &= ~F_LEAF; 2499 h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h)); 2500 h->h_upper = (index_t) t->bt_psize; 2501 (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower)); 2502 2503 /* put two new keys on root page */ 2504 where_h = left; 2505 while (where_h) { 2506 DATUM *data; 2507 IDATUM *idata; 2508 2509 if (was_leaf) { 2510 data = (DATUM *) GETDATUM(where_h, 0); 2511 2512 if (where_h == left) { 2513 len = 0; /* first key in tree is null */ 2514 } else { 2515 if (data->d_flags & D_BIGKEY) { 2516 bcopy(&(data->d_bytes[0]), 2517 (char *) &oldchain, 2518 sizeof(oldchain)); 2519 if (_bt_markchain(t, oldchain) == RET_ERROR) 2520 return (RET_ERROR); 2521 src = (char *) &oldchain; 2522 flags = D_BIGKEY; 2523 } else { 2524 src = (char *) &(data->d_bytes[0]); 2525 flags = 0; 2526 } 2527 len = data->d_ksize; 2528 } 2529 } else { 2530 idata = (IDATUM *) GETDATUM(where_h, 0); 2531 len = idata->i_size; 2532 flags = idata->i_flags; 2533 src = &(idata->i_bytes[0]); 2534 } 2535 dest = ((char *) h) + h->h_upper; 2536 nbytes = len + (sizeof (IDATUM) - sizeof(char)); 2537 dest -= LONGALIGN(nbytes); 2538 id = (IDATUM *) dest; 2539 id->i_size = len; 2540 id->i_pgno = where_h->h_pgno; 2541 id->i_flags = flags; 2542 if (len > 0) 2543 (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len); 2544 dest -= ((int) h); 2545 h->h_linp[NEXTINDEX(h)] = (index_t) dest; 2546 h->h_upper = (index_t) dest; 2547 h->h_lower += sizeof(index_t); 2548 2549 /* next page */ 2550 if (where_h == left) 2551 where_h = right; 2552 else 2553 where_h = (BTHEADER *) NULL; 2554 } 2555 2556 if (_bt_release(t, left) == RET_ERROR) 2557 return (RET_ERROR); 2558 if (_bt_release(t, right) == RET_ERROR) 2559 return (RET_ERROR); 2560 2561 /* 2562 * That's it, split is done. If we're doing a non-cached disk 2563 * btree, we can free up the pages we allocated, as they're on 2564 * disk, now. 2565 */ 2566 2567 if (ISDISK(t) && !ISCACHE(t)) { 2568 (void) free ((char *) left); 2569 (void) free ((char *) right); 2570 } 2571 2572 h->h_flags |= F_DIRTY; 2573 2574 return (RET_SUCCESS); 2575 } 2576 2577 /* 2578 * _BT_DOPSPLIT -- Do the work of splitting a page 2579 * 2580 * This routine takes two page pointers and splits the data on the 2581 * current page of the btree between them. 2582 * 2583 * We do a lot of work here to handle duplicate keys on a page; we 2584 * have to place these keys carefully to guarantee that later searches 2585 * will find them correctly. See comments in the code below for details. 2586 * 2587 * Parameters: 2588 * t -- tree to split 2589 * left -- pointer to page to get left half of the data 2590 * right -- pointer to page to get right half of the data 2591 * 2592 * Returns: 2593 * None. 2594 */ 2595 2596 static 2597 _bt_dopsplit(t, left, right) 2598 BTREE_P t; 2599 BTHEADER *left; 2600 BTHEADER *right; 2601 { 2602 BTHEADER *h = t->bt_curpage; 2603 size_t psize; 2604 char *where; 2605 BTHEADER *where_h; 2606 index_t where_i; 2607 int nbytes, dsize, fixedsize, freespc; 2608 index_t i; 2609 index_t save_lower, save_upper, save_i; 2610 index_t switch_i; 2611 char *save_key; 2612 DATUM *d; 2613 CURSOR *c; 2614 index_t top; 2615 int free_save; 2616 pgno_t chain; 2617 int ignore; 2618 2619 /* 2620 * Our strategy is to put half the bytes on each page. We figure 2621 * out how many bytes we have total, and then move items until 2622 * the last item moved put at least 50% of the data on the left 2623 * page. 2624 */ 2625 save_key = (char *) NULL; 2626 psize = (int) t->bt_psize; 2627 where = ((char *) left) + psize; 2628 where_h = left; 2629 where_i = 0; 2630 nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h)); 2631 freespc = nbytes; 2632 2633 top = NEXTINDEX(h); 2634 if (h->h_flags & F_LEAF) 2635 fixedsize = (sizeof(DATUM) - sizeof(char)); 2636 else 2637 fixedsize = (sizeof(IDATUM) - sizeof(char)); 2638 2639 save_key = (char *) NULL; 2640 2641 /* move data */ 2642 for (i = 0; i < top; i++) { 2643 2644 /* 2645 * Internal and leaf pages have different layouts for 2646 * data items, but in both cases the first entry in the 2647 * data item is a size_t. 2648 */ 2649 d = (DATUM *) GETDATUM(h,i); 2650 if (h->h_flags & F_LEAF) { 2651 dsize = d->d_ksize + d->d_dsize + fixedsize; 2652 } else { 2653 dsize = d->d_ksize + fixedsize; 2654 } 2655 2656 /* 2657 * If a page contains duplicate keys, we have to be 2658 * careful about splits. A sequence of duplicate keys 2659 * may not begin in the middle of one page and end in 2660 * the middle of another; it must begin on a page boundary, 2661 * in order for searches on the internal nodes to work 2662 * correctly. 2663 */ 2664 if (where_h == left) { 2665 if (save_key == (char *) NULL) { 2666 if (h->h_flags & F_LEAF) { 2667 if (d->d_flags & D_BIGKEY) { 2668 free_save = TRUE; 2669 bcopy(&(d->d_bytes[0]), 2670 (char *) &chain, 2671 sizeof(chain)); 2672 if (_bt_getbig(t, chain, 2673 &save_key, 2674 &ignore) 2675 == RET_ERROR) 2676 return (RET_ERROR); 2677 } else { 2678 free_save = FALSE; 2679 save_key = (char *) &(d->d_bytes[0]); 2680 } 2681 } else { 2682 IDATUM *id = (IDATUM *) d; 2683 2684 if (id->i_flags & D_BIGKEY) { 2685 free_save = TRUE; 2686 bcopy(&(id->i_bytes[0]), 2687 (char *) &chain, 2688 sizeof(chain)); 2689 if (_bt_getbig(t, chain, 2690 &save_key, 2691 &ignore) 2692 == RET_ERROR) 2693 return (RET_ERROR); 2694 } else { 2695 free_save = FALSE; 2696 save_key = (char *) 2697 &(id->i_bytes[0]); 2698 } 2699 } 2700 save_i = 0; 2701 save_lower = where_h->h_lower; 2702 save_upper = where_h->h_upper; 2703 } else { 2704 if (_bt_cmp(t, save_key, i) != 0) { 2705 save_lower = where_h->h_lower; 2706 save_upper = where_h->h_upper; 2707 save_i = i; 2708 } 2709 if (h->h_flags & F_LEAF) { 2710 if (free_save) 2711 (void) free(save_key); 2712 if (d->d_flags & D_BIGKEY) { 2713 free_save = TRUE; 2714 bcopy(&(d->d_bytes[0]), 2715 (char *) &chain, 2716 sizeof(chain)); 2717 if (_bt_getbig(t, chain, 2718 &save_key, 2719 &ignore) 2720 == RET_ERROR) 2721 return (RET_ERROR); 2722 } else { 2723 free_save = FALSE; 2724 save_key = (char *) &(d->d_bytes[0]); 2725 } 2726 } else { 2727 IDATUM *id = (IDATUM *) d; 2728 2729 if (id->i_flags & D_BIGKEY) { 2730 free_save = TRUE; 2731 bcopy(&(id->i_bytes[0]), 2732 (char *) &chain, 2733 sizeof(chain)); 2734 if (_bt_getbig(t, chain, 2735 &save_key, 2736 &ignore) 2737 == RET_ERROR) 2738 return (RET_ERROR); 2739 } else { 2740 free_save = FALSE; 2741 save_key = (char *) 2742 &(id->i_bytes[0]); 2743 } 2744 } 2745 } 2746 } 2747 2748 /* copy data and update page state */ 2749 where -= LONGALIGN(dsize); 2750 (void) bcopy((char *) d, (char *) where, dsize); 2751 where_h->h_upper = where_h->h_linp[where_i] = 2752 (index_t) (where - (int) where_h); 2753 where_h->h_lower += sizeof(index_t); 2754 where_i++; 2755 2756 /* if we've moved half, switch to the right-hand page */ 2757 nbytes -= LONGALIGN(dsize) + sizeof(index_t); 2758 if ((freespc - nbytes) > nbytes) { 2759 nbytes = 2 * freespc; 2760 2761 /* identical keys go on the same page */ 2762 if (save_i > 0) { 2763 /* i gets incremented at loop bottom... */ 2764 i = save_i - 1; 2765 where_h->h_lower = save_lower; 2766 where_h->h_upper = save_upper; 2767 } 2768 where = ((char *) right) + psize; 2769 where_h = right; 2770 switch_i = where_i; 2771 where_i = 0; 2772 } 2773 } 2774 2775 /* 2776 * If there was an active scan on the database, and we just 2777 * split the page that the cursor was on, we may need to 2778 * adjust the cursor to point to the same entry as before the 2779 * split. 2780 */ 2781 2782 c = &(t->bt_cursor); 2783 if ((t->bt_flags & BTF_SEQINIT) 2784 && (c->c_pgno == h->h_pgno) 2785 && (c->c_index >= switch_i)) { 2786 c->c_pgno = where_h->h_pgno; 2787 c->c_index -= where_i; 2788 } 2789 return (RET_SUCCESS); 2790 } 2791 2792 /* 2793 * _BT_INSERTI -- Insert IDATUM on current page in the btree. 2794 * 2795 * This routine handles insertions to internal pages after splits 2796 * lower in the tree. On entry, t->bt_curpage is the page to get 2797 * the new IDATUM. We are also given pgno, the page number of the 2798 * IDATUM that is immediately left of the new IDATUM's position. 2799 * This guarantees that the IDATUM for the right half of the page 2800 * after a split goes next to the IDATUM for its left half. 2801 * 2802 * Parameters: 2803 * t -- tree in which to do insertion. 2804 * id -- new IDATUM to insert 2805 * pgno -- page number of IDATUM left of id's position 2806 * 2807 * Returns: 2808 * RET_SUCCESS, RET_ERROR. 2809 */ 2810 2811 static int 2812 _bt_inserti(t, id, pgno) 2813 BTREE_P t; 2814 IDATUM *id; 2815 pgno_t pgno; 2816 { 2817 BTHEADER *h = t->bt_curpage; 2818 index_t next, i; 2819 IDATUM *idx; 2820 char *key; 2821 pgno_t chain; 2822 int free_key; 2823 int ignore; 2824 2825 if (id->i_flags & D_BIGKEY) { 2826 free_key = TRUE; 2827 bcopy(&(id->i_bytes[0]), (char *) &chain, sizeof(chain)); 2828 if (_bt_getbig(t, chain, &key, &ignore) == RET_ERROR) 2829 return (RET_ERROR); 2830 } else { 2831 free_key = FALSE; 2832 key = &(id->i_bytes[0]); 2833 } 2834 i = _bt_binsrch(t, key); 2835 2836 next = NEXTINDEX(h); 2837 while (i < next && _bt_cmp(t, key, i) >= 0) 2838 i++; 2839 2840 if (free_key) 2841 (void) free(key); 2842 2843 /* okay, now we're close; find adjacent IDATUM */ 2844 for (;;) { 2845 idx = (IDATUM *) GETDATUM(h,i); 2846 if (idx->i_pgno == pgno) { 2847 i++; 2848 break; 2849 } 2850 --i; 2851 } 2852 2853 /* correctly positioned, do the insertion */ 2854 return (_bt_insertat(t, id, i)); 2855 } 2856 2857 /* 2858 * _BT_INSERTAT -- Insert a datum at a given location on the current page. 2859 * 2860 * This routine does insertions on both leaf and internal pages. 2861 * 2862 * Parameters: 2863 * t -- tree in which to do insertion. 2864 * p -- DATUM or IDATUM to insert. 2865 * index -- index in line pointer array to put this item. 2866 * 2867 * Returns: 2868 * RET_SUCCESS, RET_ERROR. 2869 * 2870 * Side Effects: 2871 * Will rearrange line pointers to make space for the new 2872 * entry. This means that any scans currently active are 2873 * invalid after this. 2874 * 2875 * Warnings: 2876 * There must be sufficient room for the new item on the page. 2877 */ 2878 2879 static int 2880 _bt_insertat(t, p, index) 2881 BTREE_P t; 2882 char *p; 2883 index_t index; 2884 { 2885 IDATUM *id = (IDATUM *) p; 2886 DATUM *d = (DATUM *) p; 2887 BTHEADER *h; 2888 CURSOR *c; 2889 index_t nxtindex; 2890 char *src, *dest; 2891 int nbytes; 2892 2893 /* insertion may confuse an active scan. fix it. */ 2894 c = &(t->bt_cursor); 2895 if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno) 2896 if (_bt_fixscan(t, index, d, INSERT) == RET_ERROR) 2897 return (RET_ERROR); 2898 2899 h = t->bt_curpage; 2900 nxtindex = (index_t) NEXTINDEX(h); 2901 2902 /* 2903 * If we're inserting at the middle of the line pointer array, 2904 * copy pointers that will follow the new one up on the page. 2905 */ 2906 2907 if (index < nxtindex) { 2908 src = (char *) &(h->h_linp[index]); 2909 dest = (char *) &(h->h_linp[index + 1]); 2910 nbytes = (h->h_lower - (src - ((char *) h))) 2911 + sizeof(h->h_linp[0]); 2912 (void) bcopy(src, dest, nbytes); 2913 } 2914 2915 /* compute size and copy data to page */ 2916 if (h->h_flags & F_LEAF) { 2917 nbytes = d->d_ksize + d->d_dsize 2918 + (sizeof(DATUM) - sizeof(char)); 2919 } else { 2920 nbytes = id->i_size + (sizeof(IDATUM) - sizeof(char)); 2921 } 2922 dest = (((char *) h) + h->h_upper) - LONGALIGN(nbytes); 2923 (void) bcopy((char *) p, dest, nbytes); 2924 2925 /* update statistics */ 2926 dest -= (int) h; 2927 h->h_linp[index] = (index_t) dest; 2928 h->h_upper = (index_t) dest; 2929 h->h_lower += sizeof(index_t); 2930 2931 /* we're done */ 2932 h->h_flags |= F_DIRTY; 2933 2934 return (RET_SUCCESS); 2935 } 2936 2937 /* 2938 * _BT_BINSRCH -- Do a binary search for a given key on the current page. 2939 * 2940 * Searches on internal pages are handled slightly differently from 2941 * searches on leaf pages. This is because internal page searches 2942 * find the largest item <= key in the tree, and leaf searches find 2943 * the smallest item >= key. This guarantees that leaf page searches 2944 * leave us pointing at the item's correct position, and internal 2945 * searches descend the tree correctly. 2946 * 2947 * Parameters: 2948 * t -- tree to search 2949 * key -- key we're looking for 2950 * 2951 * Returns: 2952 * Index of the line pointer array entry for the (closest) 2953 * match to key on the current page, with "closest" as defined 2954 * above. 2955 */ 2956 2957 static index_t 2958 _bt_binsrch(t, key) 2959 BTREE_P t; 2960 char *key; 2961 { 2962 index_t lbound, ubound, cur; 2963 BTHEADER *h = t->bt_curpage; 2964 IDATUM *id; 2965 int match = 0; 2966 int r; 2967 2968 lbound = 0; 2969 ubound = NEXTINDEX(h); 2970 if (ubound > 0) 2971 --ubound; 2972 2973 /* do a binary search on the current page */ 2974 while ((ubound - lbound) > 1) { 2975 cur = lbound + ((ubound - lbound) / 2); 2976 r = _bt_cmp(t, key, cur); 2977 2978 if (r > 0) 2979 lbound = cur + 1; 2980 else if (r < 0) 2981 ubound = cur; 2982 else { 2983 match++; 2984 break; 2985 } 2986 } 2987 2988 /* 2989 * At this point, the binary search terminated because the endpoints 2990 * got too close together, or we have a match. Figure out which 2991 * case applies, decide what to do based on the page type (leaf or 2992 * internal), and do the right thing. 2993 */ 2994 if (match) { 2995 return (cur); 2996 } else if (ubound != lbound) { 2997 if (h->h_flags & F_LEAF) { 2998 r = _bt_cmp(t, key, lbound); 2999 if (r <= 0) { 3000 return (lbound); 3001 } 3002 } else { 3003 r = _bt_cmp(t, key, ubound); 3004 3005 /* for internal nodes, move as far left as possible */ 3006 if (r < 0) { 3007 r = _bt_cmp(t, key, lbound); 3008 if (r < 0 && lbound > 0) 3009 --lbound; 3010 return (lbound); 3011 } else { 3012 return (ubound); 3013 } 3014 } 3015 } 3016 3017 if (h->h_flags & F_LEAF) { 3018 if (ubound < NEXTINDEX(h)) { 3019 r = _bt_cmp(t, key, ubound); 3020 if (r > 0) 3021 ubound++; 3022 } 3023 } else { 3024 /* for internal pages, move as far left as possible */ 3025 if (ubound == NEXTINDEX(h)) 3026 ubound--; 3027 3028 while (_bt_cmp(t, key, ubound) < 0) 3029 ubound--; 3030 } 3031 return (ubound); 3032 } 3033 3034 /* 3035 * BT_SEQ -- Sequential scan interface. 3036 * 3037 * This routine supports sequential scans on the btree. If called with 3038 * flags set to R_CURSOR, or if no seq scan has been initialized in the 3039 * current tree, we initialize the scan. Otherwise, we advance the scan 3040 * and return the next item. 3041 * 3042 * Scans can be either keyed or non-keyed. Keyed scans basically have 3043 * a starting point somewhere in the middle of the tree. Non-keyed 3044 * scans start at an endpoint. Also, scans can be backward (descending 3045 * order), forward (ascending order), or no movement (keep returning 3046 * the same item). 3047 * 3048 * Flags is checked every time we enter the routine, so the user can 3049 * change directions on an active scan if desired. The key argument 3050 * is examined only when we initialize the scan, in order to position 3051 * it properly. 3052 * 3053 * Items are returned via the key and data arguments passed in. 3054 * 3055 * Parameters: 3056 * tree -- btree in which to do scan 3057 * key -- key, used to position scan on initialization, and 3058 * used to return key components to the user. 3059 * data -- used to return data components to the user. 3060 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 3061 * R_PREV. 3062 * 3063 * Returns: 3064 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 3065 * exists in the tree in the specified direction. 3066 * 3067 * Side Effects: 3068 * Changes the btree's notion of the current position in the 3069 * scan. 3070 * 3071 * Warnings: 3072 * The key and data items returned are static and will be 3073 * overwritten by the next bt_get or bt_seq. 3074 */ 3075 3076 bt_seq(tree, key, data, flags) 3077 BTREE tree; 3078 DBT *key; 3079 DBT *data; 3080 int flags; 3081 { 3082 BTREE_P t = (BTREE_P) tree; 3083 BTHEADER *h; 3084 DATUM *d; 3085 int status; 3086 3087 /* do we need to initialize the scan? */ 3088 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 3089 || !(t->bt_flags & BTF_SEQINIT)) { 3090 3091 /* initialize it */ 3092 status = _bt_seqinit(t, key, flags); 3093 } else { 3094 /* just advance the current scan pointer */ 3095 status = _bt_seqadvance(t, flags); 3096 } 3097 3098 key->size = data->size = 0; 3099 key->data = data->data = (char *) NULL; 3100 3101 h = t->bt_curpage; 3102 3103 /* is there a valid item at the current scan location? */ 3104 if (status == RET_SPECIAL) { 3105 if (flags == R_NEXT) { 3106 if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 3107 if (NEXTINDEX(h) > 0) 3108 t->bt_cursor.c_index = NEXTINDEX(h) - 1; 3109 else 3110 t->bt_cursor.c_index = 0; 3111 } 3112 } else { 3113 t->bt_cursor.c_index = 0; 3114 } 3115 return (RET_SPECIAL); 3116 } else if (status == RET_ERROR) 3117 return (RET_ERROR); 3118 3119 /* okay, return the data */ 3120 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 3121 3122 return (_bt_buildret(t, d, data, key)); 3123 } 3124 3125 /* 3126 * _BT_BUILDRET -- Build return key/data pair as a result of search or scan. 3127 * 3128 * This routine manages the statically allocated buffers in which we 3129 * return data to the user. 3130 * 3131 * Parameters: 3132 * t -- btree from which to return datum 3133 * d -- DATUM to be returned to the user. 3134 * data -- data argument supplied by user for return 3135 * key -- key argument supplied by user for return 3136 * 3137 * Returns: 3138 * RET_SUCCESS, RET_ERROR. 3139 * 3140 * Side Effects: 3141 * May free and reallocate static buffers, if the data 3142 * we want to return is bigger than the space we have to 3143 * do so. 3144 */ 3145 3146 static int 3147 _bt_buildret(t, d, data, key) 3148 BTREE_P t; 3149 DATUM *d; 3150 DBT *data; 3151 DBT *key; 3152 { 3153 static int _data_s = 0; 3154 static int _key_s = 0; 3155 static char *_data = (char *) NULL; 3156 static char *_key = (char *) NULL; 3157 char *from, *where, *top; 3158 pgno_t save; 3159 pgno_t chain; 3160 size_t nbytes; 3161 size_t nhere; 3162 BTHEADER *h; 3163 3164 if (d->d_flags & D_BIGKEY) { 3165 if (_key != (char *) NULL) 3166 (void) free(_key); 3167 (void) bcopy((char *) &(d->d_bytes[0]), 3168 (char *) &chain, 3169 sizeof(chain)); 3170 if (_bt_getbig(t, chain, &_key, &_key_s) == RET_ERROR) 3171 return (RET_ERROR); 3172 key->data = _key; 3173 key->size = _key_s; 3174 } else { 3175 /* need more space for key? */ 3176 if (d->d_ksize > _key_s) { 3177 if (_key != (char *) NULL) 3178 (void) free (_key); 3179 if ((_key = (char *) malloc((unsigned) d->d_ksize)) 3180 == (char *) NULL) 3181 return (RET_ERROR); 3182 _key_s = d->d_ksize; 3183 } 3184 3185 key->data = _key; 3186 if ((key->size = d->d_ksize) > 0) 3187 (void) bcopy(&(d->d_bytes[0]), 3188 _key, 3189 (int) d->d_ksize); 3190 } 3191 3192 if (d->d_flags & D_BIGDATA) { 3193 if (_data != (char *) NULL) 3194 (void) free(_data); 3195 (void) bcopy(&(d->d_bytes[d->d_ksize]), 3196 (char *) &chain, 3197 sizeof(chain)); 3198 if (_bt_getbig(t, chain, &_data, &_data_s) == RET_ERROR) 3199 return (RET_ERROR); 3200 data->data = _data; 3201 data->size = _data_s; 3202 } else { 3203 /* need more space for data? */ 3204 if (d->d_dsize > _data_s) { 3205 if (_data != (char *) NULL) 3206 (void) free (_data); 3207 if ((_data = (char *) malloc((unsigned) d->d_dsize)) 3208 == (char *) NULL) 3209 return (RET_ERROR); 3210 _data_s = d->d_dsize; 3211 } 3212 3213 data->data = _data; 3214 3215 if ((data->size = d->d_dsize) > 0) 3216 (void) bcopy(&(d->d_bytes[d->d_ksize]), 3217 _data, 3218 d->d_dsize); 3219 } 3220 3221 return (RET_SUCCESS); 3222 } 3223 3224 /* 3225 * _BT_SEQINIT -- Initialize a sequential scan on the btree. 3226 * 3227 * Sets the tree's notion of the current scan location correctly 3228 * given a key and a direction. 3229 * 3230 * Parameters: 3231 * t -- tree in which to initialize scan 3232 * key -- key for initial scan position 3233 * flags -- R_NEXT, R_PREV 3234 * 3235 * Returns: 3236 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there's no data 3237 * in the tree to scan. 3238 * 3239 * Side Effects: 3240 * Changes current scan position for the tree. Almost certainly 3241 * changes current page, as well. Sets BTF_SEQINIT bit in tree 3242 * flags, so that we know we've initialized a scan. 3243 */ 3244 3245 static int 3246 _bt_seqinit(t, key, flags) 3247 BTREE_P t; 3248 DBT *key; 3249 int flags; 3250 { 3251 BTITEM *item; 3252 BTHEADER *h; 3253 CURSOR *c; 3254 IDATUM *id; 3255 pgno_t pgno; 3256 index_t index; 3257 index_t last; 3258 3259 /* 3260 * Figure out if we really have to search for the key that the 3261 * user supplied. If key is null, then this is an unkeyed scan 3262 * and we can just start from an endpoint. 3263 */ 3264 3265 c = &(t->bt_cursor); 3266 3267 if (flags == R_CURSOR) { 3268 if (key->data != (char *) NULL) { 3269 3270 /* key supplied, find it */ 3271 item = _bt_search(t, key); 3272 c->c_index = item->bti_index; 3273 c->c_pgno = t->bt_curpage->h_pgno; 3274 } else { 3275 errno = EINVAL; 3276 return (RET_ERROR); 3277 } 3278 3279 } else { 3280 3281 /* 3282 * Unkeyed scan. For backward scans, find the last item 3283 * in the tree; for forward scans, find the first item. 3284 */ 3285 3286 if (_bt_getpage(t, P_ROOT) == RET_ERROR) 3287 return (RET_ERROR); 3288 h = t->bt_curpage; 3289 if (flags == R_LAST || flags == R_PREV) { 3290 3291 /* backward scan */ 3292 while (!(h->h_flags & F_LEAF)) { 3293 last = NEXTINDEX(h) - 1; 3294 id = (IDATUM *) GETDATUM(h,last); 3295 if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 3296 return (RET_ERROR); 3297 h = t->bt_curpage; 3298 } 3299 3300 /* skip empty pages */ 3301 while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE) { 3302 if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 3303 return (RET_ERROR); 3304 h = t->bt_curpage; 3305 } 3306 3307 c->c_pgno = h->h_pgno; 3308 if (NEXTINDEX(h) > 0) 3309 c->c_index = NEXTINDEX(h) - 1; 3310 else 3311 c->c_index = 0; 3312 } else if (flags == R_FIRST || flags == R_NEXT) { 3313 /* forward scan */ 3314 while (!(h->h_flags & F_LEAF)) { 3315 id = (IDATUM *) GETDATUM(h,0); 3316 if (_bt_getpage(t, id->i_pgno) == RET_ERROR) 3317 return (RET_ERROR); 3318 h = t->bt_curpage; 3319 } 3320 3321 /* skip empty pages */ 3322 while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE) { 3323 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 3324 return (RET_ERROR); 3325 h = t->bt_curpage; 3326 } 3327 3328 c->c_pgno = h->h_pgno; 3329 c->c_index = 0; 3330 } else { 3331 /* no flags passed in */ 3332 errno = EINVAL; 3333 return (RET_ERROR); 3334 } 3335 } 3336 3337 /* okay, scan is initialized */ 3338 t->bt_flags |= BTF_SEQINIT; 3339 3340 if (c->c_index == NEXTINDEX(t->bt_curpage)) 3341 return (RET_SPECIAL); 3342 3343 return (RET_SUCCESS); 3344 } 3345 3346 /* 3347 * _BT_SEQADVANCE -- Advance the sequential scan on this tree. 3348 * 3349 * Moves the current location pointer for the scan on this tree one 3350 * spot in the requested direction. 3351 * 3352 * Parameters: 3353 * t -- btree being scanned 3354 * flags -- for R_NEXT, R_PREV 3355 * 3356 * Returns: 3357 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there is no 3358 * more data in the specified direction. 3359 * 3360 * Side Effects: 3361 * May change current page. 3362 */ 3363 3364 static int 3365 _bt_seqadvance(t, flags) 3366 BTREE_P t; 3367 int flags; 3368 { 3369 BTHEADER *h; 3370 CURSOR *c; 3371 index_t index; 3372 3373 c = &(t->bt_cursor); 3374 index = c->c_index; 3375 3376 if (_bt_getpage(t, c->c_pgno) == RET_ERROR) 3377 return (RET_ERROR); 3378 h = t->bt_curpage; 3379 3380 /* by the time we get here, don't need the cursor key anymore */ 3381 if (c->c_key != (char *) NULL) 3382 (void) free(c->c_key); 3383 3384 if (flags == R_NEXT) { 3385 3386 /* 3387 * This is a forward scan. If the cursor is pointing 3388 * at a virtual record (that is, it was pointing at 3389 * a record that got deleted), then we should return 3390 * the record it's pointing at now. Otherwise, we 3391 * should advance the scan. In either case, we need 3392 * to be careful not to run off the end of the current 3393 * page. 3394 */ 3395 3396 if (c->c_flags & CRSR_BEFORE) { 3397 3398 if (index >= NEXTINDEX(h)) { 3399 /* out of items on this page, get next page */ 3400 if (h->h_nextpg == P_NONE) { 3401 /* tell caller we're done... */ 3402 c->c_index = NEXTINDEX(h); 3403 return (RET_SPECIAL); 3404 } 3405 3406 /* skip empty pages */ 3407 do { 3408 if (_bt_getpage(t, h->h_nextpg) 3409 == RET_ERROR) { 3410 c->c_index = NEXTINDEX(h); 3411 return (RET_ERROR); 3412 } 3413 h = t->bt_curpage; 3414 c->c_pgno = h->h_pgno; 3415 } while (NEXTINDEX(h) == 0 3416 && h->h_nextpg != P_NONE); 3417 3418 if (NEXTINDEX(h) == 0) { 3419 /* tell caller we're done */ 3420 c->c_index = NEXTINDEX(h); 3421 return (RET_SPECIAL); 3422 } 3423 index = 0; 3424 } 3425 c->c_flags &= ~CRSR_BEFORE; 3426 3427 } else if (++index >= NEXTINDEX(h)) { 3428 3429 /* out of items on this page, get next page */ 3430 if (h->h_nextpg == P_NONE) { 3431 /* tell caller we're done... */ 3432 c->c_index = NEXTINDEX(h); 3433 return (RET_SPECIAL); 3434 } 3435 3436 /* skip empty pages */ 3437 do { 3438 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) { 3439 c->c_index = NEXTINDEX(h); 3440 return (RET_ERROR); 3441 } 3442 h = t->bt_curpage; 3443 c->c_pgno = h->h_pgno; 3444 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 3445 3446 if (NEXTINDEX(h) == 0) { 3447 /* tell caller we're done */ 3448 c->c_index = NEXTINDEX(h); 3449 return (RET_SPECIAL); 3450 } 3451 index = 0; 3452 } 3453 } else if (flags == R_PREV) { 3454 3455 /* for backward scans, life is substantially easier */ 3456 c->c_flags &= ~CRSR_BEFORE; 3457 if (c->c_key != (char *) NULL) { 3458 (void) free(c->c_key); 3459 c->c_key = (char *) NULL; 3460 } 3461 3462 if (index == 0) { 3463 3464 /* we may be done */ 3465 c->c_index = 0; 3466 3467 /* out of items on this page, get next page */ 3468 if (h->h_prevpg == P_NONE) 3469 return (RET_SPECIAL); 3470 3471 /* skip empty pages */ 3472 do { 3473 if (_bt_getpage(t, h->h_prevpg) == RET_ERROR) 3474 return (RET_ERROR); 3475 h = t->bt_curpage; 3476 c->c_pgno = h->h_pgno; 3477 } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE); 3478 3479 if (NEXTINDEX(h) == 0) 3480 return (RET_SPECIAL); 3481 3482 index = NEXTINDEX(h) - 1; 3483 } else 3484 --index; 3485 } else { 3486 /* must specify a direction */ 3487 errno = EINVAL; 3488 return (RET_ERROR); 3489 } 3490 3491 c->c_index = index; 3492 return (RET_SUCCESS); 3493 } 3494 3495 /* 3496 * BT_CLOSE -- Close a btree 3497 * 3498 * Parameters: 3499 * tree -- tree to close 3500 * 3501 * Returns: 3502 * RET_SUCCESS, RET_ERROR. 3503 * 3504 * Side Effects: 3505 * Frees space occupied by the tree. 3506 */ 3507 3508 int 3509 bt_close(tree) 3510 BTREE tree; 3511 { 3512 BTREE_P t = (BTREE_P) tree; 3513 int i; 3514 BTHEADER *h; 3515 char *cache; 3516 struct HTBUCKET *b, *sb; 3517 HTABLE ht; 3518 int fd; 3519 3520 if (t->bt_cursor.c_key != (char *) NULL) 3521 (void) free(t->bt_cursor.c_key); 3522 3523 if (!ISDISK(t)) { 3524 /* in-memory tree, release hash table memory */ 3525 ht = t->bt_s.bt_ht; 3526 for (i = 0; i < HTSIZE; i++) { 3527 if ((b = ht[i]) == (struct HTBUCKET *) NULL) 3528 break; 3529 do { 3530 sb = b; 3531 (void) free((char *) b->ht_page); 3532 b = b->ht_next; 3533 (void) free((char *) sb); 3534 } while (b != (struct HTBUCKET *) NULL); 3535 } 3536 (void) free ((char *) ht); 3537 (void) free ((char *) t); 3538 return (RET_SUCCESS); 3539 } 3540 3541 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 3542 if (_bt_wrtmeta(t) == RET_ERROR) 3543 return (RET_ERROR); 3544 } 3545 3546 if (t->bt_curpage != (BTHEADER *) NULL) { 3547 h = t->bt_curpage; 3548 if (h->h_flags & F_DIRTY) { 3549 if (_bt_write(t, h, RELEASE) == RET_ERROR) 3550 return (RET_ERROR); 3551 } else { 3552 if (_bt_release(t, h) == RET_ERROR) 3553 return (RET_ERROR); 3554 } 3555 3556 /* flush and free the cache, if there is one */ 3557 if (ISCACHE(t)) { 3558 cache = t->bt_s.bt_d.d_cache; 3559 lrusync(cache); 3560 lrufree(cache); 3561 } 3562 (void) free ((char *) h); 3563 } 3564 3565 fd = t->bt_s.bt_d.d_fd; 3566 (void) free ((char *) t); 3567 return (close(fd)); 3568 } 3569 3570 /* 3571 * _BT_ALLOCPG -- allocate a new page in the btree. 3572 * 3573 * This is called when we split a page, to get space to do the split. 3574 * For disk btrees, these pages are released when the split is done. 3575 * For memory btrees, they are not. 3576 * 3577 * Parameters: 3578 * t -- tree in which to do split 3579 * 3580 * Returns: 3581 * Pointer to the newly-allocated page 3582 */ 3583 3584 static BTHEADER * 3585 _bt_allocpg(t) 3586 BTREE_P t; 3587 { 3588 BTHEADER *h = t->bt_curpage; 3589 BTHEADER *nh; 3590 char *cache; 3591 int nbytes; 3592 3593 /* if we have a cache, let the cache code do the work */ 3594 if (ISDISK(t) && ISCACHE(t)) { 3595 nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache, 3596 t->bt_npages + 1, 3597 &nbytes); 3598 } else { 3599 nh = (BTHEADER *) malloc((unsigned) t->bt_psize); 3600 } 3601 3602 if (nh != (BTHEADER *) NULL) { 3603 nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE; 3604 nh->h_flags = h->h_flags; 3605 nh->h_lower = (index_t) 3606 (((char *) &(nh->h_linp[0])) - ((char *) nh)); 3607 nh->h_upper = t->bt_psize; 3608 } 3609 3610 return (nh); 3611 } 3612 3613 /* 3614 * _BT_WRITE -- Write a specific page to a btree file. 3615 * 3616 * Parameters: 3617 * t -- btree to get the page 3618 * h -- page to write 3619 * relflag -- (int) this page may/may not be released 3620 * 3621 * Returns: 3622 * RET_SUCCESS, RET_ERROR. 3623 * 3624 * Side Effects: 3625 * Writes a metadata page if none has been written yet. 3626 */ 3627 3628 static int 3629 _bt_write(t, h, relflag) 3630 BTREE_P t; 3631 BTHEADER *h; 3632 int relflag; 3633 { 3634 long pos; 3635 int htindex; 3636 HTBUCKET *b; 3637 char *cache; 3638 BTMETA m; 3639 pgno_t pgno; 3640 3641 h->h_flags &= ~F_DIRTY; 3642 if (ISDISK(t)) { 3643 3644 /* if we haven't done so yet, write the metadata */ 3645 if (!(t->bt_flags & BTF_METAOK)) { 3646 if (_bt_wrtmeta(t) == RET_ERROR) 3647 return (RET_ERROR); 3648 } 3649 3650 pgno = h->h_pgno; 3651 3652 3653 /* if we have a cache, let the cache code do the work */ 3654 if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) { 3655 if (lruwrite(cache, pgno) == RET_ERROR) 3656 return (RET_ERROR); 3657 if (relflag && lrurelease(cache, pgno) == RET_ERROR) 3658 return (RET_ERROR); 3659 3660 } else { 3661 (void) _bt_pgout(h, (char *) t->bt_lorder); 3662 /* now write the current page */ 3663 pos = (long) (pgno * t->bt_psize); 3664 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) 3665 return (RET_ERROR); 3666 if (write(t->bt_s.bt_d.d_fd, h, t->bt_psize) 3667 < t->bt_psize) 3668 return (RET_ERROR); 3669 if (!relflag) 3670 (void) _bt_pgin(h, (char *) t->bt_lorder); 3671 } 3672 } else { 3673 /* in-memory btree */ 3674 htindex = HASHKEY(h->h_pgno); 3675 3676 /* see if we need to overwrite existing entry */ 3677 for (b = t->bt_s.bt_ht[htindex]; 3678 b != (HTBUCKET *) NULL; 3679 b = b->ht_next) { 3680 if (b->ht_pgno == h->h_pgno) { 3681 b->ht_page = h; 3682 return (RET_SUCCESS); 3683 } 3684 } 3685 3686 /* new entry, write it */ 3687 b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET)); 3688 if (b == (HTBUCKET *) NULL) 3689 return (RET_ERROR); 3690 3691 b->ht_pgno = h->h_pgno; 3692 b->ht_page = h; 3693 b->ht_next = t->bt_s.bt_ht[htindex]; 3694 t->bt_s.bt_ht[htindex] = b; 3695 } 3696 return (RET_SUCCESS); 3697 } 3698 3699 /* 3700 * _BT_RELEASE -- Release a locked-down cache page 3701 * 3702 * During page splits, we want to force pages out to the cache 3703 * before we actually release them, in some cases. This routine 3704 * releases such a page when it is no longer needed. 3705 * 3706 * Parameters: 3707 * t -- btree in which to release page 3708 * h -- page to release 3709 * 3710 * Returns: 3711 * RET_SUCCESS, RET_ERROR. 3712 * 3713 * Side Effects: 3714 * None. 3715 */ 3716 3717 static int 3718 _bt_release(t, h) 3719 BTREE_P t; 3720 BTHEADER *h; 3721 { 3722 if (ISDISK(t) && ISCACHE(t)) { 3723 if (lrurelease(t->bt_s.bt_d.d_cache, h->h_pgno) < 0) 3724 return (RET_ERROR); 3725 } 3726 return (RET_SUCCESS); 3727 } 3728 3729 /* 3730 * _BT_WRTMETA -- Write metadata to the btree. 3731 * 3732 * Parameters: 3733 * t -- tree to which to write 3734 * 3735 * Returns: 3736 * RET_SUCCESS, RET_ERROR. 3737 */ 3738 3739 static int 3740 _bt_wrtmeta(t) 3741 BTREE_P t; 3742 { 3743 BTMETA m; 3744 3745 if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l) 3746 return (RET_ERROR); 3747 3748 /* lorder has to be in host-independent format */ 3749 m.m_lorder = (u_long) htonl((long) t->bt_lorder); 3750 3751 m.m_magic = BTREEMAGIC; 3752 m.m_version = BTREEVERSION; 3753 m.m_psize = t->bt_psize; 3754 m.m_free = t->bt_free; 3755 m.m_flags = t->bt_flags & BTF_NODUPS; 3756 3757 if (t->bt_lorder != BYTE_ORDER) { 3758 BLSWAP(m.m_magic); 3759 BLSWAP(m.m_version); 3760 BLSWAP(m.m_psize); 3761 BLSWAP(m.m_free); 3762 BLSWAP(m.m_flags); 3763 } 3764 3765 if (write(t->bt_s.bt_d.d_fd, &m, sizeof(m)) 3766 != sizeof(m)) { 3767 return (RET_ERROR); 3768 } 3769 3770 t->bt_flags |= BTF_METAOK; 3771 3772 return (RET_SUCCESS); 3773 } 3774 3775 #ifdef DEBUG 3776 void 3777 _btdump(tree) 3778 BTREE tree; 3779 { 3780 BTREE_P t = (BTREE_P) tree; 3781 DATUM *d; 3782 IDATUM *id; 3783 BTHEADER *h; 3784 pgno_t npages; 3785 pgno_t i; 3786 index_t cur, top; 3787 3788 npages = t->bt_npages; 3789 (void) printf("\"%s\" fd %d pgsz %d curpg %d @ 0x%lx", 3790 t->bt_fname, t->bt_s.bt_d.d_fd, 3791 t->bt_psize, t->bt_curpage); 3792 (void) printf("npg %d cmp 0x%lx flags=(", npages, t->bt_compare); 3793 if (t->bt_flags & BTF_SEQINIT) 3794 (void) printf("BTF_SEQINIT"); 3795 (void) printf(")\n"); 3796 3797 for (i = P_ROOT; i <= npages; i++) { 3798 if (_bt_getpage(t, i) == RET_ERROR) 3799 _punt(); 3800 h = t->bt_curpage; 3801 top = NEXTINDEX(h); 3802 (void) printf(" page %d:\n", i); 3803 (void) printf("\tpgno %d prev %d next %d\n", 3804 h->h_pgno, h->h_prevpg, h->h_nextpg); 3805 (void) printf("\tlower %d upper %d nextind %d flags (", 3806 h->h_lower, h->h_upper, top); 3807 if (h->h_flags & F_LEAF) 3808 (void) printf("F_LEAF"); 3809 else 3810 (void) printf("<internal>"); 3811 if (h->h_flags & F_DIRTY) 3812 (void) printf("|F_DIRTY"); 3813 if (h->h_flags & F_PRESERVE) 3814 (void) printf("|F_PRESERVE"); 3815 if (h->h_flags & F_CONT) { 3816 (void) printf("|F_CONT)"); 3817 if (h->h_prevpg == P_NONE) { 3818 size_t longsz; 3819 (void) bcopy((char *) &(h->h_linp[0]), 3820 (char *) &longsz, 3821 sizeof(longsz)); 3822 printf("\n\t\t(chain start, data length %ld)", 3823 longsz); 3824 } 3825 printf("\n"); 3826 continue; 3827 } 3828 (void) printf(")\n"); 3829 for (cur = 0; cur < top; cur++) { 3830 (void) printf("\t [%d] off %d ", cur, h->h_linp[cur]); 3831 if (h->h_flags & F_LEAF) { 3832 d = (DATUM *) GETDATUM(h,cur); 3833 (void) printf("ksize %d", d->d_ksize); 3834 if (d->d_flags & D_BIGKEY) 3835 (void) printf(" (indirect)"); 3836 (void) printf("; dsize %d", d->d_dsize); 3837 if (d->d_flags & D_BIGDATA) 3838 (void) printf(" (indirect)"); 3839 } else { 3840 id = (IDATUM *) GETDATUM(h,cur); 3841 (void) printf("size %d pgno %d", 3842 id->i_size, id->i_pgno); 3843 if (id->i_flags & D_BIGKEY) 3844 (void) printf(" (indirect)"); 3845 } 3846 (void) printf("\n"); 3847 } 3848 (void) printf("\n"); 3849 } 3850 } 3851 #endif /* DEBUG */ 3852 3853 /* 3854 * _BT_CMP -- Compare a key to a given item on the current page. 3855 * 3856 * This routine is a wrapper for the user's comparison function. 3857 * 3858 * Parameters: 3859 * t -- tree in which to do comparison 3860 * p -- pointer to one argument for the comparison function 3861 * n -- index of item to supply second arg to comparison function 3862 * 3863 * Returns: 3864 * < 0 if p is < item at n 3865 * = 0 if p is = item at n 3866 * > 0 if p is > item at n 3867 */ 3868 3869 static int 3870 _bt_cmp(t, p, n) 3871 BTREE_P t; 3872 char *p; 3873 index_t n; 3874 { 3875 BTHEADER *h; 3876 IDATUM *id; 3877 DATUM *d; 3878 char *arg; 3879 int ignore; 3880 int free_arg; 3881 pgno_t chain; 3882 int r; 3883 3884 h = t->bt_curpage; 3885 3886 /* 3887 * The left-most key at any level of the tree on internal pages 3888 * is guaranteed (artificially, by the code here) to be less than 3889 * any user key. This saves us from having to update the leftmost 3890 * key when the user inserts a new key in the tree smaller than 3891 * anything we've seen yet. 3892 */ 3893 3894 if (h->h_prevpg == P_NONE && !(h->h_flags & F_LEAF) && n == 0) 3895 return (1); 3896 3897 if (h->h_flags & F_LEAF) { 3898 d = (DATUM *) GETDATUM(h,n); 3899 if (d->d_flags & D_BIGKEY) { 3900 free_arg = TRUE; 3901 bcopy(&(d->d_bytes[0]), (char *) &chain, sizeof(chain)); 3902 if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR) 3903 return (RET_ERROR); 3904 } else { 3905 free_arg = FALSE; 3906 arg = &(d->d_bytes[0]); 3907 } 3908 } else { 3909 id = (IDATUM *) GETDATUM(h,n); 3910 if (id->i_flags & D_BIGKEY) { 3911 free_arg = TRUE; 3912 bcopy(&(id->i_bytes[0]), 3913 (char *) &chain, 3914 sizeof(chain)); 3915 if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR) 3916 return (RET_ERROR); 3917 } else { 3918 free_arg = FALSE; 3919 arg = &(id->i_bytes[0]); 3920 } 3921 } 3922 r = (*(t->bt_compare))(p, arg); 3923 3924 if (free_arg) 3925 (void) free(arg); 3926 3927 return (r); 3928 } 3929 3930 /* 3931 * _BT_PUSH/_BT_POP -- Push/pop a parent page number on the parent stack. 3932 * 3933 * When we descend the tree, we keep track of parent pages in order 3934 * to handle splits on insertions. 3935 * 3936 * Parameters: 3937 * t -- tree for which to push parent 3938 * pgno -- page number to push (_bt_push only) 3939 * 3940 * Returns: 3941 * None. 3942 */ 3943 3944 static 3945 _bt_push(t, pgno) 3946 BTREE_P t; 3947 pgno_t pgno; 3948 { 3949 BTSTACK *new; 3950 3951 new = (BTSTACK *) malloc((unsigned) sizeof(BTSTACK)); 3952 new->bts_pgno = pgno; 3953 new->bts_next = t->bt_stack; 3954 t->bt_stack = new; 3955 } 3956 3957 static 3958 _bt_pop(t) 3959 BTREE_P t; 3960 { 3961 BTSTACK *s; 3962 pgno_t p = P_NONE; 3963 3964 if ((s = t->bt_stack) != (BTSTACK *) NULL) { 3965 p = s->bts_pgno; 3966 t->bt_stack = s->bts_next; 3967 (void) free ((char *) s); 3968 } 3969 return (p); 3970 } 3971 3972 #ifdef DEBUG 3973 _punt() 3974 { 3975 int pid; 3976 3977 pid = getpid(); 3978 (void) kill(pid, SIGILL); 3979 } 3980 #endif /* DEBUG */ 3981