1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_open.c 5.3 (Berkeley) 01/23/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 /* 16 * btree.c -- implementation of btree access method for 4.4BSD. 17 * 18 * The design here is based on that of the btree access method used 19 * in the Postgres database system at UC Berkeley. The implementation 20 * is wholly independent of the Postgres code. 21 * 22 * This implementation supports btrees on disk (supply a filename) or 23 * in memory (don't). Public interfaces defined here are: 24 * 25 * btree_open() -- wrapper; returns a filled DB struct for 26 * a btree. 27 * 28 * bt_open() -- open a new or existing btree. 29 * bt_get() -- fetch data from a tree by key. 30 * bt_seq() -- do a sequential scan on a tree. 31 * bt_put() -- add data to a tree by key. 32 * bt_delete() -- remove data from a tree by key. 33 * bt_close() -- close a btree. 34 * bt_sync() -- force btree pages to disk (disk trees only). 35 */ 36 37 #include <sys/param.h> 38 #include <sys/file.h> 39 #include <sys/stat.h> 40 #include <sys/signal.h> 41 #include <sys/errno.h> 42 #include <sys/types.h> 43 #include <db.h> 44 #include "btree.h" 45 46 BTREEINFO _DefaultBTInfo = { 47 0, /* flags */ 48 0, /* cachesize */ 49 0, /* psize */ 50 strcmp, /* compare */ 51 0 52 }; 53 54 /* 55 * BTREE_OPEN -- Wrapper routine to open a btree. 56 * 57 * Creates and fills a DB struct, and calls the routine that actually 58 * opens the btree. 59 * 60 * Parameters: 61 * f: filename to open 62 * flags: flag bits passed to open 63 * mode: permission bits, used if O_CREAT specified 64 * b: BTREEINFO pointer 65 * 66 * Returns: 67 * Filled-in DBT on success; NULL on failure, with errno 68 * set as appropriate. 69 * 70 * Side Effects: 71 * Allocates memory for the DB struct. 72 */ 73 74 DB * 75 btree_open(f, flags, mode, b) 76 char *f; 77 int flags; 78 int mode; 79 BTREEINFO *b; 80 { 81 DB *db; 82 BTREE t; 83 84 if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL) 85 return ((DB *) NULL); 86 87 if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) { 88 (void) free ((char *) db); 89 return ((DB *) NULL); 90 } 91 92 db->internal = (char *) t; 93 db->close = bt_close; 94 db->delete = bt_delete; 95 db->get = bt_get; 96 db->put = bt_put; 97 db->seq = bt_seq; 98 db->sync = bt_sync; 99 100 return (db); 101 } 102 103 /* 104 * BT_OPEN -- Open a btree. 105 * 106 * This routine creates the correct kind (disk or in-memory) of 107 * btree and initializes it as required. 108 * 109 * Parameters: 110 * f -- name of btree (NULL for in-memory btrees) 111 * flags -- flags passed to open() 112 * mode -- mode passed to open () 113 * b -- BTREEINFO structure, describing btree 114 * 115 * Returns: 116 * (Opaque) pointer to the btree. On failure, returns NULL 117 * with errno set as appropriate. 118 * 119 * Side Effects: 120 * Allocates memory, opens files. 121 */ 122 123 BTREE 124 bt_open(f, flags, mode, b) 125 char *f; 126 int flags; 127 int mode; 128 BTREEINFO *b; 129 { 130 BTREE_P t; 131 HTABLE ht; 132 int nbytes; 133 int fd; 134 CURSOR *c; 135 BTMETA m; 136 struct stat buf; 137 138 /* use the default info if none was provided */ 139 if (b == (BTREEINFO *) NULL) 140 b = &_DefaultBTInfo; 141 142 if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL) 143 return ((BTREE) NULL); 144 145 if (b->compare) 146 t->bt_compare = b->compare; 147 else 148 t->bt_compare = strcmp; 149 150 t->bt_fname = f; 151 t->bt_curpage = (BTHEADER *) NULL; 152 t->bt_free = P_NONE; 153 c = &(t->bt_cursor); 154 c->c_pgno = P_NONE; 155 c->c_index = 0; 156 c->c_flags = (u_char) NULL; 157 c->c_key = (char *) NULL; 158 t->bt_stack = (BTSTACK *) NULL; 159 t->bt_flags = 0; 160 161 /* 162 * If no file name was supplied, this is an in-memory btree. 163 * Otherwise, it's a disk-based btree. 164 */ 165 if (f == (char *) NULL) { 166 /* in memory */ 167 if ((t->bt_psize = b->psize) < MINPSIZE) { 168 if (t->bt_psize != 0) { 169 (void) free ((char *) t); 170 errno = EINVAL; 171 return ((BTREE) NULL); 172 } 173 t->bt_psize = getpagesize(); 174 } 175 176 nbytes = HTSIZE * sizeof(HTBUCKET *); 177 if ((ht = (HTABLE) malloc((unsigned) nbytes)) 178 == (HTABLE) NULL) { 179 (void) free((char *) t); 180 return ((BTREE) NULL); 181 } 182 (void) bzero((char *) ht, nbytes); 183 t->bt_s.bt_ht = ht; 184 t->bt_npages = 0; 185 t->bt_lorder = BYTE_ORDER; 186 if (!(b->flags & R_DUP)) 187 t->bt_flags |= BTF_NODUPS; 188 } else { 189 /* on disk */ 190 if ((fd = open(f, O_RDONLY, 0)) < 0) { 191 /* doesn't exist yet, be sure page is big enough */ 192 if ((t->bt_psize = b->psize) < sizeof(BTHEADER) 193 && b->psize != 0) { 194 (void) free((char *) t); 195 errno = EINVAL; 196 return ((BTREE) NULL); 197 } 198 if (b->lorder == 0) 199 b->lorder = BYTE_ORDER; 200 201 if (b->lorder != BIG_ENDIAN 202 && b->lorder != LITTLE_ENDIAN) { 203 (void) free((char *) t); 204 errno = EINVAL; 205 return ((BTREE) NULL); 206 } 207 t->bt_lorder = b->lorder; 208 if (!(b->flags & R_DUP)) 209 t->bt_flags |= BTF_NODUPS; 210 } else { 211 /* exists, get page size from file */ 212 if (read(fd, &m, sizeof(m)) < sizeof(m)) { 213 (void) close(fd); 214 (void) free((char *) t); 215 errno = EINVAL; 216 return ((BTREE) NULL); 217 } 218 219 /* lorder always stored in host-independent format */ 220 NTOHL(m.m_lorder); 221 if (m.m_lorder != BIG_ENDIAN 222 && m.m_lorder != LITTLE_ENDIAN) { 223 (void) free((char *) t); 224 errno = EINVAL; 225 return ((BTREE) NULL); 226 } 227 t->bt_lorder = m.m_lorder; 228 229 if (t->bt_lorder != BYTE_ORDER) { 230 BLSWAP(m.m_magic); 231 BLSWAP(m.m_version); 232 BLSWAP(m.m_psize); 233 BLSWAP(m.m_free); 234 BLSWAP(m.m_flags); 235 } 236 237 if (m.m_magic != BTREEMAGIC 238 || m.m_version != BTREEVERSION 239 || m.m_psize < MINPSIZE) { 240 (void) close(fd); 241 (void) free((char *) t); 242 #ifndef EFTYPE 243 #define EFTYPE -100 244 #endif 245 errno = EFTYPE; 246 return ((BTREE) NULL); 247 } 248 t->bt_psize = m.m_psize; 249 t->bt_free = m.m_free; 250 t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK; 251 (void) close(fd); 252 } 253 254 /* now open the file the way the user wanted it */ 255 if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) { 256 (void) free ((char *) t); 257 return ((BTREE) NULL); 258 } 259 260 /* get number of pages, page size if necessary */ 261 (void) fstat(t->bt_s.bt_d.d_fd, &buf); 262 if (t->bt_psize == 0) 263 t->bt_psize = buf.st_blksize; 264 t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize); 265 266 /* page zero is metadata, doesn't count */ 267 if (t->bt_npages > 0) 268 --(t->bt_npages); 269 270 if (b->cachesize == 0) 271 b->cachesize = DEFCACHE; 272 273 /* get an lru buffer cache, if the user asked for one */ 274 if ((b->cachesize / t->bt_psize) > 0) { 275 BTDISK *d = &(t->bt_s.bt_d); 276 277 d->d_cache = lruinit(d->d_fd, 278 (int) (b->cachesize / t->bt_psize), 279 (int) t->bt_psize, 280 (char *) t->bt_lorder, 281 _bt_pgin, _bt_pgout); 282 283 if (d->d_cache == (char *) NULL) { 284 (void) free((char *) t); 285 return ((BTREE) NULL); 286 } 287 } 288 } 289 290 /* remember if tree was opened for write */ 291 if (((flags & O_WRONLY) == O_WRONLY) 292 || ((flags & O_RDWR) == O_RDWR)) 293 t->bt_flags |= BTF_ISWRITE; 294 295 return ((BTREE) t); 296 } 297 298 /* 299 * BT_GET -- Get an entry from a btree. 300 * 301 * Does a key lookup in the tree to find the specified key, and returns 302 * the key/data pair found. 303 * 304 * Parameters: 305 * tree -- btree in which to do lookup 306 * key -- key to find 307 * data -- pointer to DBT in which to return data 308 * flag -- ignored 309 * 310 * Returns: 311 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not 312 * found. If key is not found, nothing is stored in the 313 * return DBT 'data'. 314 * 315 * Side Effects: 316 * None. 317 * 318 * Warnings: 319 * Return data is statically allocated, and will be overwritten 320 * at the next call. 321 */ 322 323 int 324 bt_get(tree, key, data, flag) 325 BTREE tree; 326 DBT *key; 327 DBT *data; 328 int flag; 329 { 330 BTREE_P t = (BTREE_P) tree; 331 BTHEADER *h; 332 DATUM *d; 333 BTITEM *item; 334 335 #ifdef lint 336 flag = flag; 337 #endif /* lint */ 338 339 /* lookup */ 340 item = _bt_search(t, key); 341 if (item == (BTITEM *) NULL) 342 return (RET_ERROR); 343 344 /* clear parent pointer stack */ 345 while (_bt_pop(t) != P_NONE) 346 continue; 347 348 h = (BTHEADER *) t->bt_curpage; 349 data->size = 0; 350 data->data = (char *) NULL; 351 352 /* match? */ 353 if (VALIDITEM(t, item) 354 && _bt_cmp(t, key->data, item->bti_index) == 0) { 355 d = (DATUM *) GETDATUM(h, item->bti_index); 356 return (_bt_buildret(t, d, data, key)); 357 } 358 359 /* nope */ 360 return (RET_SPECIAL); 361 } 362 363 /* 364 * BT_PUT -- Add an entry to a btree. 365 * 366 * The specified (key, data) pair is added to the tree. If the tree 367 * was created for unique keys only, then duplicates will not be 368 * entered. If the requested key exists in the tree, it will be over- 369 * written unless the flags parameter is R_NOOVERWRITE, in which case 370 * the update will not be done. If duplicate keys are permitted in the 371 * tree, duplicates will be inserted and will not overwrite existing 372 * keys. Nodes are split as required. 373 * 374 * Parameters: 375 * tree -- btree in which to put the new entry 376 * key -- key component to add 377 * data -- data corresponding to key 378 * flag -- R_NOOVERWRITE or zero. 379 * 380 * Returns: 381 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the 382 * NOOVERWRITE flag was set and the specified key exists 383 * in the database. 384 * 385 * Side Effects: 386 * None. 387 */ 388 389 int 390 bt_put(tree, key, data, flag) 391 BTREE tree; 392 DBT *key; 393 DBT *data; 394 int flag; 395 { 396 BTREE_P t = (BTREE_P) tree; 397 BTITEM *item; 398 399 /* look for this key in the tree */ 400 item = _bt_search(t, key); 401 402 /* 403 * If this tree was originally created without R_DUP, then duplicate 404 * keys are not allowed. We need to check this at insertion time. 405 */ 406 407 if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) { 408 if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) { 409 if (_bt_delone(t, item->bti_index) == RET_ERROR) 410 return (RET_ERROR); 411 } 412 } 413 414 return (_bt_insert(t, item, key, data, flag)); 415 } 416 417 /* 418 * BT_DELETE -- delete a key from the tree. 419 * 420 * Deletes all keys (and their associated data items) matching the 421 * supplied key from the tree. If the flags entry is R_CURSOR, then 422 * the current item in the active scan is deleted. 423 * 424 * Parameters: 425 * tree -- btree from which to delete key 426 * key -- key to delete 427 * flags -- R_CURSOR or zero 428 * 429 * Returns: 430 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified 431 * key was not in the tree. 432 * 433 * Side Effects: 434 * None. 435 */ 436 437 int 438 bt_delete(tree, key, flags) 439 BTREE tree; 440 DBT *key; 441 int flags; 442 { 443 BTREE_P t = (BTREE_P) tree; 444 BTHEADER *h; 445 BTITEM *item; 446 int ndel = 0; 447 448 if (flags == R_CURSOR) 449 return (_bt_crsrdel(t)); 450 451 /* find the first matching key in the tree */ 452 item = _bt_first(t, key); 453 h = t->bt_curpage; 454 455 /* delete all matching keys */ 456 for (;;) { 457 while (VALIDITEM(t, item) 458 && (_bt_cmp(t, key->data, item->bti_index) == 0)) { 459 if (_bt_delone(t, item->bti_index) == RET_ERROR) 460 return (RET_ERROR); 461 ndel++; 462 } 463 464 if (VALIDITEM(t, item) || h->h_nextpg == P_NONE) 465 break; 466 467 /* next page, if necessary */ 468 do { 469 if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) 470 return (RET_ERROR); 471 h = t->bt_curpage; 472 } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE); 473 474 item->bti_pgno = h->h_pgno; 475 item->bti_index = 0; 476 477 if (!VALIDITEM(t, item) 478 || _bt_cmp(t, key->data, item->bti_index) != 0) 479 break; 480 } 481 482 /* clean up the parent stack */ 483 while (_bt_pop(t) != P_NONE) 484 continue; 485 486 /* flush changes to disk */ 487 if (ISDISK(t)) { 488 if (h->h_flags & F_DIRTY) { 489 if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR) 490 return (RET_ERROR); 491 } 492 } 493 494 if (ndel == 0) 495 return (RET_SPECIAL); 496 497 return (RET_SUCCESS); 498 } 499 500 /* 501 * BT_SYNC -- sync the btree to disk. 502 * 503 * Parameters: 504 * tree -- btree to sync. 505 * 506 * Returns: 507 * RET_SUCCESS, RET_ERROR. 508 */ 509 510 bt_sync(tree) 511 BTREE tree; 512 { 513 BTREE_P t = (BTREE_P) tree; 514 BTHEADER *h; 515 pgno_t pgno; 516 517 /* if this is an in-memory btree, syncing is a no-op */ 518 if (!ISDISK(t)) 519 return (RET_SUCCESS); 520 521 h = (BTHEADER *) t->bt_curpage; 522 h->h_flags &= ~F_DIRTY; 523 524 if (ISCACHE(t)) { 525 pgno = t->bt_curpage->h_pgno; 526 if (_bt_write(t, h, RELEASE) == RET_ERROR) 527 return(RET_ERROR); 528 if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR) 529 return (RET_ERROR); 530 if (_bt_getpage(t, pgno) == RET_ERROR) 531 return (RET_ERROR); 532 } else { 533 if (_bt_write(t, h, NORELEASE) == RET_ERROR) 534 return (RET_ERROR); 535 } 536 537 return (fsync(t->bt_s.bt_d.d_fd)); 538 } 539 540 /* 541 * BT_SEQ -- Sequential scan interface. 542 * 543 * This routine supports sequential scans on the btree. If called with 544 * flags set to R_CURSOR, or if no seq scan has been initialized in the 545 * current tree, we initialize the scan. Otherwise, we advance the scan 546 * and return the next item. 547 * 548 * Scans can be either keyed or non-keyed. Keyed scans basically have 549 * a starting point somewhere in the middle of the tree. Non-keyed 550 * scans start at an endpoint. Also, scans can be backward (descending 551 * order), forward (ascending order), or no movement (keep returning 552 * the same item). 553 * 554 * Flags is checked every time we enter the routine, so the user can 555 * change directions on an active scan if desired. The key argument 556 * is examined only when we initialize the scan, in order to position 557 * it properly. 558 * 559 * Items are returned via the key and data arguments passed in. 560 * 561 * Parameters: 562 * tree -- btree in which to do scan 563 * key -- key, used to position scan on initialization, and 564 * used to return key components to the user. 565 * data -- used to return data components to the user. 566 * flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or 567 * R_PREV. 568 * 569 * Returns: 570 * RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data 571 * exists in the tree in the specified direction. 572 * 573 * Side Effects: 574 * Changes the btree's notion of the current position in the 575 * scan. 576 * 577 * Warnings: 578 * The key and data items returned are static and will be 579 * overwritten by the next bt_get or bt_seq. 580 */ 581 582 int 583 bt_seq(tree, key, data, flags) 584 BTREE tree; 585 DBT *key; 586 DBT *data; 587 int flags; 588 { 589 BTREE_P t = (BTREE_P) tree; 590 BTHEADER *h; 591 DATUM *d; 592 int status; 593 594 /* do we need to initialize the scan? */ 595 if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST 596 || !(t->bt_flags & BTF_SEQINIT)) { 597 598 /* initialize it */ 599 status = _bt_seqinit(t, key, flags); 600 } else { 601 /* just advance the current scan pointer */ 602 status = _bt_seqadvance(t, flags); 603 } 604 605 key->size = data->size = 0; 606 key->data = data->data = (char *) NULL; 607 608 h = t->bt_curpage; 609 610 /* is there a valid item at the current scan location? */ 611 if (status == RET_SPECIAL) { 612 if (flags == R_NEXT) { 613 if (t->bt_cursor.c_index >= NEXTINDEX(h)) { 614 if (NEXTINDEX(h) > 0) 615 t->bt_cursor.c_index = NEXTINDEX(h) - 1; 616 else 617 t->bt_cursor.c_index = 0; 618 } 619 } else { 620 t->bt_cursor.c_index = 0; 621 } 622 return (RET_SPECIAL); 623 } else if (status == RET_ERROR) 624 return (RET_ERROR); 625 626 /* okay, return the data */ 627 d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index); 628 629 return (_bt_buildret(t, d, data, key)); 630 } 631 632 /* 633 * BT_CLOSE -- Close a btree 634 * 635 * Parameters: 636 * tree -- tree to close 637 * 638 * Returns: 639 * RET_SUCCESS, RET_ERROR. 640 * 641 * Side Effects: 642 * Frees space occupied by the tree. 643 */ 644 645 int 646 bt_close(tree) 647 BTREE tree; 648 { 649 BTREE_P t = (BTREE_P) tree; 650 int i; 651 BTHEADER *h; 652 char *cache; 653 struct HTBUCKET *b, *sb; 654 HTABLE ht; 655 int fd; 656 657 if (t->bt_cursor.c_key != (char *) NULL) 658 (void) free(t->bt_cursor.c_key); 659 660 if (!ISDISK(t)) { 661 /* in-memory tree, release hash table memory */ 662 ht = t->bt_s.bt_ht; 663 for (i = 0; i < HTSIZE; i++) { 664 if ((b = ht[i]) == (struct HTBUCKET *) NULL) 665 break; 666 do { 667 sb = b; 668 (void) free((char *) b->ht_page); 669 b = b->ht_next; 670 (void) free((char *) sb); 671 } while (b != (struct HTBUCKET *) NULL); 672 } 673 (void) free ((char *) ht); 674 (void) free ((char *) t); 675 return (RET_SUCCESS); 676 } 677 678 if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) { 679 if (_bt_wrtmeta(t) == RET_ERROR) 680 return (RET_ERROR); 681 } 682 683 if (t->bt_curpage != (BTHEADER *) NULL) { 684 h = t->bt_curpage; 685 if (h->h_flags & F_DIRTY) { 686 if (_bt_write(t, h, RELEASE) == RET_ERROR) 687 return (RET_ERROR); 688 } else { 689 if (_bt_release(t, h) == RET_ERROR) 690 return (RET_ERROR); 691 } 692 693 /* flush and free the cache, if there is one */ 694 if (ISCACHE(t)) { 695 cache = t->bt_s.bt_d.d_cache; 696 if (lrusync(cache) == RET_ERROR) 697 return (RET_ERROR); 698 lrufree(cache); 699 } 700 (void) free ((char *) h); 701 } 702 703 fd = t->bt_s.bt_d.d_fd; 704 (void) free ((char *) t); 705 return (close(fd)); 706 } 707