1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_open.c 5.4 (Berkeley) 02/18/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 /* 16 * btree.c -- implementation of btree access method for 4.4BSD. 17 * 18 * The design here is based on that of the btree access method used 19 * in the Postgres database system at UC Berkeley. The implementation 20 * is wholly independent of the Postgres code. 21 * 22 * This implementation supports btrees on disk (supply a filename) or 23 * in memory (don't). Public interfaces defined here are: 24 * 25 * btree_open() -- wrapper; returns a filled DB struct for 26 * a btree. 27 * 28 * bt_open() -- open a new or existing btree. 29 * bt_get() -- fetch data from a tree by key. 30 * bt_seq() -- do a sequential scan on a tree. 31 * bt_put() -- add data to a tree by key. 32 * bt_delete() -- remove data from a tree by key. 33 * bt_close() -- close a btree. 34 * bt_sync() -- force btree pages to disk (disk trees only). 35 */ 36 37 #include <sys/param.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/signal.h> 41 #include <sys/errno.h> 42 #include <sys/types.h> 43 #include <db.h> 44 #include "btree.h" 45 46 BTREEINFO _DefaultBTInfo = { 47 0, /* flags */ 48 0, /* cachesize */ 49 0, /* psize */ 50 strcmp, /* compare */ 51 0 52 }; 53 54 /* 55 * BTREE_OPEN -- Wrapper routine to open a btree. 56 * 57 * Creates and fills a DB struct, and calls the routine that actually 58 * opens the btree. 59 * 60 * Parameters: 61 * f: filename to open 62 * flags: flag bits passed to open 63 * mode: permission bits, used if O_CREAT specified 64 * b: BTREEINFO pointer 65 * 66 * Returns: 67 * Filled-in DBT on success; NULL on failure, with errno 68 * set as appropriate. 69 * 70 * Side Effects: 71 * Allocates memory for the DB struct. 72 */ 73 74 DB * 75 btree_open(f, flags, mode, b) 76 char *f; 77 int flags; 78 int mode; 79 BTREEINFO *b; 80 { 81 DB *db; 82 BTREE t; 83 84 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 85 return ((DB *) NULL); 86 87 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 88 (void) free ((char *) db); 89 return ((DB *) NULL); 90 } 91 92 db->internal = (char *) t; 93 db->close = bt_close; 94 db->delete = bt_delete; 95 db->get = bt_get; 96 db->put = bt_put; 97 db->seq = bt_seq; 98 db->sync = bt_sync; 99 100 return (db); 101 } 102 103 /* 104 * BT_OPEN -- Open a btree. 105 * 106 * This routine creates the correct kind (disk or in-memory) of 107 * btree and initializes it as required. 108 * 109 * Parameters: 110 * f -- name of btree (NULL for in-memory btrees) 111 * flags -- flags passed to open() 112 * mode -- mode passed to open () 113 * b -- BTREEINFO structure, describing btree 114 * 115 * Returns: 116 * (Opaque) pointer to the btree. On failure, returns NULL 117 * with errno set as appropriate. 118 * 119 * Side Effects: 120 * Allocates memory, opens files. 121 */ 122 123 BTREE 124 bt_open(f, flags, mode, b) 125 char *f; 126 int flags; 127 int mode; 128 BTREEINFO *b; 129 { 130 BTREE_P t; 131 HTABLE ht; 132 int nbytes; 133 int fd; 134 CURSOR *c; 135 BTMETA m; 136 struct stat buf; 137 138 /* use the default info if none was provided */ 139 if (b == (BTREEINFO *) NULL) 140 b = &_DefaultBTInfo; 141 142 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 143 return ((BTREE) NULL); 144 145 if (b->compare) 146 t->bt_compare = b->compare; 147 else 148 t->bt_compare = strcmp; 149 150 t->bt_fname = f; 151 t->bt_curpage = (BTHEADER *) NULL; 152 t->bt_free = P_NONE; 153 c = &(t->bt_cursor); 154 c->c_pgno = P_NONE; 155 c->c_index = 0; 156 c->c_flags = (u_char) NULL; 157 c->c_key = (char *) NULL; 158 t->bt_stack = (BTSTACK *) NULL; 159 t->bt_flags = 0; 160 161 /* 162 * If no file name was supplied, this is an in-memory btree. 163 * Otherwise, it's a disk-based btree. 164 */ 165 if (f == (char *) NULL) { 166 /* in memory */ 167 if ((t->bt_psize = b->psize) < MINPSIZE) { 168 if (t->bt_psize != 0) { 169 (void) free ((char *) t); 170 errno = EINVAL; 171 return ((BTREE) NULL); 172 } 173 t->bt_psize = getpagesize(); 174 } 175 176 nbytes = HTSIZE * sizeof(HTBUCKET *); 177 if ((ht = (HTABLE) malloc((unsigned) nbytes)) 178 == (HTABLE) NULL) { 179 (void) free((char *) t); 180 return ((BTREE) NULL); 181 } 182 (void) bzero((char *) ht, nbytes); 183 t->bt_s.bt_ht = ht; 184 t->bt_npages = 0; 185 t->bt_lorder = BYTE_ORDER; 186 if (!(b->flags & R_DUP)) 187 t->bt_flags |= BTF_NODUPS; 188 } else { 189 /* on disk */ 190 if ((fd = open(f, O_RDONLY, 0)) < 0) { 191 /* doesn't exist yet, be sure page is big enough */ 192 if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 193 && b->psize != 0) { 194 (void) free((char *) t); 195 errno = EINVAL; 196 return ((BTREE) NULL); 197 } 198 if (b->lorder == 0) 199 b->lorder = BYTE_ORDER; 200 201 if (b->lorder != BIG_ENDIAN 202 && b->lorder != LITTLE_ENDIAN) { 203 (void) free((char *) t); 204 errno = EINVAL; 205 return ((BTREE) NULL); 206 } 207 t->bt_lorder = b->lorder; 208 if (!(b->flags & R_DUP)) 209 t->bt_flags |= BTF_NODUPS; 210 } else { 211 /* exists, get page size from file */ 212 if (read(fd, &m, sizeof(m)) < sizeof(m)) { 213 (void) close(fd); 214 (void) free((char *) t); 215 errno = EINVAL; 216 return ((BTREE) NULL); 217 } 218 219 /* lorder always stored in host-independent format */ 220 NTOHL(m.m_lorder); 221 if (m.m_lorder != BIG_ENDIAN 222 && m.m_lorder != LITTLE_ENDIAN) { 223 (void) free((char *) t); 224 errno = EINVAL; 225 return ((BTREE) NULL); 226 } 227 t->bt_lorder = m.m_lorder; 228 229 if (t->bt_lorder != BYTE_ORDER) { 230 BLSWAP(m.m_magic); 231 BLSWAP(m.m_version); 232 BLSWAP(m.m_psize); 233 BLSWAP(m.m_free); 234 BLSWAP(m.m_flags); 235 } 236 237 if (m.m_magic != BTREEMAGIC 238 || m.m_version != BTREEVERSION 239 || m.m_psize < MINPSIZE) { 240 (void) close(fd); 241 (void) free((char *) t); 242 #ifndef EFTYPE 243 #define EFTYPE -100 244 #endif 245 errno = EFTYPE; 246 return ((BTREE) NULL); 247 } 248 t->bt_psize = m.m_psize; 249 t->bt_free = m.m_free; 250 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 251 (void) close(fd); 252 } 253 254 /* now open the file the way the user wanted it */ 255 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 256 (void) free ((char *) t); 257 return ((BTREE) NULL); 258 } 259 260 /* access method files are always close-on-exec */ 261 if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) { 262 (void) close(t->bt_s.bt_d.d_fd); 263 (void) free ((char *) t); 264 return ((BTREE) NULL); 265 } 266 267 /* get number of pages, page size if necessary */ 268 (void) fstat(t->bt_s.bt_d.d_fd, &buf); 269 if (t->bt_psize == 0) 270 t->bt_psize = buf.st_blksize; 271 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 272 273 /* page zero is metadata, doesn't count */ 274 if (t->bt_npages > 0) 275 --(t->bt_npages); 276 277 if (b->cachesize == 0) 278 b->cachesize = DEFCACHE; 279 280 /* get an lru buffer cache, if the user asked for one */ 281 if ((b->cachesize / t->bt_psize) > 0) { 282 BTDISK *d = &(t->bt_s.bt_d); 283 284 d->d_cache = lruinit(d->d_fd, 285 (int) (b->cachesize / t->bt_psize), 286 (int) t->bt_psize, 287 (char *) t->bt_lorder, 288 _bt_pgin, _bt_pgout); 289 290 if (d->d_cache == (char *) NULL) { 291 (void) free((char *) t); 292 return ((BTREE) NULL); 293 } 294 } 295 } 296 297 /* remember if tree was opened for write */ 298 if (((flags & O_WRONLY) == O_WRONLY) 299 || ((flags & O_RDWR) == O_RDWR)) 300 t->bt_flags |= BTF_ISWRITE; 301 302 return ((BTREE) t); 303 } 304 305 /* 306 * BT_GET -- Get an entry from a btree. 307 * 308 * Does a key lookup in the tree to find the specified key, and returns 309 * the key/data pair found. 310 * 311 * Parameters: 312 * tree -- btree in which to do lookup 313 * key -- key to find 314 * data -- pointer to DBT in which to return data 315 * flag -- ignored 316 * 317 * Returns: 318 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 319 * found. If key is not found, nothing is stored in the 320 * return DBT 'data'. 321 * 322 * Side Effects: 323 * None. 324 * 325 * Warnings: 326 * Return data is statically allocated, and will be overwritten 327 * at the next call. 328 */ 329 330 int 331 bt_get(tree, key, data, flag) 332 BTREE tree; 333 DBT *key; 334 DBT *data; 335 int flag; 336 { 337 BTREE_P t = (BTREE_P) tree; 338 BTHEADER *h; 339 DATUM *d; 340 BTITEM *item; 341 342 #ifdef lint 343 flag = flag; 344 #endif /* lint */ 345 346 /* lookup */ 347 item = _bt_search(t, key); 348 349 /* clear parent pointer stack */ 350 while (_bt_pop(t) != P_NONE) 351 continue; 352 353 if (item == (BTITEM *) NULL) 354 return (RET_ERROR); 355 356 h = (BTHEADER *) t->bt_curpage; 357 data->size = 0; 358 data->data = (char *) NULL; 359 360 /* match? */ 361 if (VALIDITEM(t, item) 362 && _bt_cmp(t, key->data, item->bti_index) == 0) { 363 d = (DATUM *) GETDATUM(h, item->bti_index); 364 return (_bt_buildret(t, d, data, key)); 365 } 366 367 /* nope */ 368 return (RET_SPECIAL); 369 } 370 371 /* 372 * BT_PUT -- Add an entry to a btree. 373 * 374 * The specified (key, data) pair is added to the tree. If the tree 375 * was created for unique keys only, then duplicates will not be 376 * entered. If the requested key exists in the tree, it will be over- 377 * written unless the flags parameter is R_NOOVERWRITE, in which case 378 * the update will not be done. If duplicate keys are permitted in the 379 * tree, duplicates will be inserted and will not overwrite existing 380 * keys. Nodes are split as required. 381 * 382 * Parameters: 383 * tree -- btree in which to put the new entry 384 * key -- key component to add 385 * data -- data corresponding to key 386 * flag -- R_NOOVERWRITE or zero. 387 * 388 * Returns: 389 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 390 * NOOVERWRITE flag was set and the specified key exists 391 * in the database. 392 * 393 * Side Effects: 394 * None. 395 */ 396 397 int 398 bt_put(tree, key, data, flag) 399 BTREE tree; 400 DBT *key; 401 DBT *data; 402 int flag; 403 { 404 BTREE_P t = (BTREE_P) tree; 405 BTITEM *item; 406 407 /* look for this key in the tree */ 408 item = _bt_search(t, key); 409 410 /* 411 * If this tree was originally created without R_DUP, then duplicate 412 * keys are not allowed. We need to check this at insertion time. 413 */ 414 415 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 416 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 417 if (_bt_delone(t, item->bti_index) == RET_ERROR) { 418 while (_bt_pop(t) != P_NONE) 419 continue; 420 return (RET_ERROR); 421 } 422 } 423 } 424 425 return (_bt_insert(t, item, key, data, flag)); 426 } 427 428 /* 429 * BT_DELETE -- delete a key from the tree. 430 * 431 * Deletes all keys (and their associated data items) matching the 432 * supplied key from the tree. If the flags entry is R_CURSOR, then 433 * the current item in the active scan is deleted. 434 * 435 * Parameters: 436 * tree -- btree from which to delete key 437 * key -- key to delete 438 * flags -- R_CURSOR or zero 439 * 440 * Returns: 441 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 442 * key was not in the tree. 443 * 444 * Side Effects: 445 * None. 446 */ 447 448 int 449 bt_delete(tree, key, flags) 450 BTREE tree; 451 DBT *key; 452 int flags; 453 { 454 BTREE_P t = (BTREE_P) tree; 455 BTHEADER *h; 456 BTITEM *item; 457 int ndel = 0; 458 459 if (flags == R_CURSOR) 460 return (_bt_crsrdel(t)); 461 462 /* find the first matching key in the tree */ 463 item = _bt_first(t, key); 464 h = t->bt_curpage; 465 466 /* don't need the descent stack for deletes */ 467 while (_bt_pop(t) != P_NONE) 468 continue; 469 470 /* delete all matching keys */ 471 for (;;) { 472 while (VALIDITEM(t, item) 473 && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 474 if (_bt_delone(t, item->bti_index) == RET_ERROR) 475 return (RET_ERROR); 476 ndel++; 477 } 478 479 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 480 break; 481 482 /* next page, if necessary */ 483 do { 484 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 485 return (RET_ERROR); 486 h = t->bt_curpage; 487 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 488 489 item->bti_pgno = h->h_pgno; 490 item->bti_index = 0; 491 492 if (!VALIDITEM(t, item) 493 || _bt_cmp(t, key->data, item->bti_index) != 0) 494 break; 495 } 496 497 /* flush changes to disk */ 498 if (ISDISK(t)) { 499 if (h->h_flags & F_DIRTY) { 500 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 501 return (RET_ERROR); 502 } 503 } 504 505 if (ndel == 0) 506 return (RET_SPECIAL); 507 508 return (RET_SUCCESS); 509 } 510 511 /* 512 * BT_SYNC -- sync the btree to disk. 513 * 514 * Parameters: 515 * tree -- btree to sync. 516 * 517 * Returns: 518 * RET_SUCCESS, RET_ERROR. 519 */ 520 521 bt_sync(tree) 522 BTREE tree; 523 { 524 BTREE_P t = (BTREE_P) tree; 525 BTHEADER *h; 526 pgno_t pgno; 527 528 /* if this is an in-memory btree, syncing is a no-op */ 529 if (!ISDISK(t)) 530 return (RET_SUCCESS); 531 532 h = (BTHEADER *) t->bt_curpage; 533 h->h_flags &= ~F_DIRTY; 534 535 if (ISCACHE(t)) { 536 pgno = t->bt_curpage->h_pgno; 537 if (_bt_write(t, h, RELEASE) == RET_ERROR) 538 return(RET_ERROR); 539 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 540 return (RET_ERROR); 541 if (_bt_getpage(t, pgno) == RET_ERROR) 542 return (RET_ERROR); 543 } else { 544 if (_bt_write(t, h, NORELEASE) == RET_ERROR) 545 return (RET_ERROR); 546 } 547 548 return (fsync(t->bt_s.bt_d.d_fd)); 549 } 550 551 /* 552 * BT_SEQ -- Sequential scan interface. 553 * 554 * This routine supports sequential scans on the btree. If called with 555 * flags set to R_CURSOR, or if no seq scan has been initialized in the 556 * current tree, we initialize the scan. Otherwise, we advance the scan 557 * and return the next item. 558 * 559 * Scans can be either keyed or non-keyed. Keyed scans basically have 560 * a starting point somewhere in the middle of the tree. Non-keyed 561 * scans start at an endpoint. Also, scans can be backward (descending 562 * order), forward (ascending order), or no movement (keep returning 563 * the same item). 564 * 565 * Flags is checked every time we enter the routine, so the user can 566 * change directions on an active scan if desired. The key argument 567 * is examined only when we initialize the scan, in order to position 568 * it properly. 569 * 570 * Items are returned via the key and data arguments passed in. 571 * 572 * Parameters: 573 * tree -- btree in which to do scan 574 * key -- key, used to position scan on initialization, and 575 * used to return key components to the user. 576 * data -- used to return data components to the user. 577 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 578 * R_PREV. 579 * 580 * Returns: 581 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 582 * exists in the tree in the specified direction. 583 * 584 * Side Effects: 585 * Changes the btree's notion of the current position in the 586 * scan. 587 * 588 * Warnings: 589 * The key and data items returned are static and will be 590 * overwritten by the next bt_get or bt_seq. 591 */ 592 593 int 594 bt_seq(tree, key, data, flags) 595 BTREE tree; 596 DBT *key; 597 DBT *data; 598 int flags; 599 { 600 BTREE_P t = (BTREE_P) tree; 601 BTHEADER *h; 602 DATUM *d; 603 int status; 604 605 /* do we need to initialize the scan? */ 606 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 607 || !(t->bt_flags & BTF_SEQINIT)) { 608 609 /* initialize it */ 610 status = _bt_seqinit(t, key, flags); 611 } else { 612 /* just advance the current scan pointer */ 613 status = _bt_seqadvance(t, flags); 614 } 615 616 key->size = data->size = 0; 617 key->data = data->data = (char *) NULL; 618 619 h = t->bt_curpage; 620 621 /* is there a valid item at the current scan location? */ 622 if (status == RET_SPECIAL) { 623 if (flags == R_NEXT) { 624 if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 625 if (NEXTINDEX(h) > 0) 626 t->bt_cursor.c_index = NEXTINDEX(h) - 1; 627 else 628 t->bt_cursor.c_index = 0; 629 } 630 } else { 631 t->bt_cursor.c_index = 0; 632 } 633 return (RET_SPECIAL); 634 } else if (status == RET_ERROR) 635 return (RET_ERROR); 636 637 /* okay, return the data */ 638 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 639 640 return (_bt_buildret(t, d, data, key)); 641 } 642 643 /* 644 * BT_CLOSE -- Close a btree 645 * 646 * Parameters: 647 * tree -- tree to close 648 * 649 * Returns: 650 * RET_SUCCESS, RET_ERROR. 651 * 652 * Side Effects: 653 * Frees space occupied by the tree. 654 */ 655 656 int 657 bt_close(tree) 658 BTREE tree; 659 { 660 BTREE_P t = (BTREE_P) tree; 661 int i; 662 BTHEADER *h; 663 char *cache; 664 struct HTBUCKET *b, *sb; 665 HTABLE ht; 666 int fd; 667 668 if (t->bt_cursor.c_key != (char *) NULL) 669 (void) free(t->bt_cursor.c_key); 670 671 if (!ISDISK(t)) { 672 /* in-memory tree, release hash table memory */ 673 ht = t->bt_s.bt_ht; 674 for (i = 0; i < HTSIZE; i++) { 675 if ((b = ht[i]) == (struct HTBUCKET *) NULL) 676 break; 677 do { 678 sb = b; 679 (void) free((char *) b->ht_page); 680 b = b->ht_next; 681 (void) free((char *) sb); 682 } while (b != (struct HTBUCKET *) NULL); 683 } 684 (void) free ((char *) ht); 685 (void) free ((char *) t); 686 return (RET_SUCCESS); 687 } 688 689 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 690 if (_bt_wrtmeta(t) == RET_ERROR) 691 return (RET_ERROR); 692 } 693 694 if (t->bt_curpage != (BTHEADER *) NULL) { 695 h = t->bt_curpage; 696 if (h->h_flags & F_DIRTY) { 697 if (_bt_write(t, h, RELEASE) == RET_ERROR) 698 return (RET_ERROR); 699 } else { 700 if (_bt_release(t, h) == RET_ERROR) 701 return (RET_ERROR); 702 } 703 704 /* flush and free the cache, if there is one */ 705 if (ISCACHE(t)) { 706 cache = t->bt_s.bt_d.d_cache; 707 if (lrusync(cache) == RET_ERROR) 708 return (RET_ERROR); 709 lrufree(cache); 710 } 711 (void) free ((char *) h); 712 } 713 714 fd = t->bt_s.bt_d.d_fd; 715 (void) free ((char *) t); 716 return (close(fd)); 717 } 718