1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_get.c 5.1 (Berkeley) 01/23/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 #include <sys/param.h> 16 #include <sys/types.h> 17 #include <sys/errno.h> 18 #include <sys/file.h> 19 #include <db.h> 20 #include "btree.h" 21 22 /* 23 * BT_GETPAGE -- Make pgno the current page of the btree. 24 * 25 * This routine is just a wrapper that decides whether to call the 26 * memory or disk-based routine to do the work. 27 * 28 * Parameters: 29 * t -- btree in which to get page 30 * pgno -- page number to get 31 * 32 * Returns: 33 * RET_SUCCESS or RET_ERROR. 34 */ 35 36 int 37 _bt_getpage(t, pgno) 38 BTREE_P t; 39 pgno_t pgno; 40 { 41 #ifdef DEBUG 42 if (pgno == P_NONE) 43 _punt(); 44 #endif /* DEBUG */ 45 46 /* see if we can get away without doing any work */ 47 if (t->bt_curpage != (BTHEADER *) NULL) { 48 if (t->bt_curpage->h_pgno == pgno) 49 return (RET_SUCCESS); 50 } 51 52 if (t->bt_fname == (char *) NULL) 53 return (_bt_getmpage(t, pgno)); 54 else 55 return (_bt_getdpage(t, pgno)); 56 } 57 58 /* 59 * _BT_GETMPAGE -- Make pgno the current page of the btree. 60 * 61 * This routine gets pages for in-memory btrees. 62 * 63 * Parameters: 64 * t -- btree in which to get page 65 * pgno -- page number to get 66 * 67 * Returns: 68 * RET_SUCCESS or RET_ERROR. 69 */ 70 71 int 72 _bt_getmpage(t, pgno) 73 register BTREE_P t; 74 pgno_t pgno; 75 { 76 int htindex; 77 BTHEADER *h; 78 HTBUCKET *b; 79 80 if (t->bt_curpage == (BTHEADER *) NULL) { 81 if (pgno != P_ROOT) { 82 errno = EBADF; 83 return (RET_ERROR); 84 } 85 86 t->bt_npages++; 87 h = (BTHEADER *) malloc((unsigned) t->bt_psize); 88 if (h == (BTHEADER *) NULL) 89 return (RET_ERROR); 90 91 h->h_pgno = P_ROOT; 92 h->h_flags = F_LEAF; 93 h->h_lower = (index_t) 94 (((char *) &(h->h_linp[0])) - ((char *) h)); 95 h->h_upper = t->bt_psize; 96 h->h_prevpg = h->h_nextpg = P_NONE; 97 98 t->bt_curpage = h; 99 100 /* get the root page into the hash table */ 101 if (_bt_write(t, h, RELEASE) == RET_ERROR) 102 return (RET_ERROR); 103 } 104 105 htindex = HASHKEY(pgno); 106 107 for (b = t->bt_s.bt_ht[htindex]; 108 b != (HTBUCKET *) NULL; 109 b = b->ht_next) { 110 if (b->ht_pgno == pgno) { 111 t->bt_curpage = b->ht_page; 112 return (RET_SUCCESS); 113 } 114 } 115 return (RET_ERROR); 116 } 117 118 /* 119 * _BT_GETDPAGE -- Make pgno the current page of the btree. 120 * 121 * This routine gets pages for disk btrees. 122 * 123 * Because disk btree pages must be readable across machine architectures, 124 * the btree code writes integers out in network format. This routine 125 * converts them back to host format before returning the page. 126 * 127 * Parameters: 128 * t -- btree in which to get page 129 * pgno -- page number to get 130 * 131 * Returns: 132 * RET_SUCCESS, RET_ERROR. 133 */ 134 135 int 136 _bt_getdpage(t, pgno) 137 register BTREE_P t; 138 pgno_t pgno; 139 { 140 BTHEADER *h; 141 char *cache; 142 long pos; 143 int n, nbytes; 144 145 /* if we have an lru cache, let the cache code do the work */ 146 if (ISCACHE(t)) { 147 cache = t->bt_s.bt_d.d_cache; 148 149 /* release the old page */ 150 if (t->bt_curpage != (BTHEADER *) NULL) { 151 pgno_t opgno = t->bt_curpage->h_pgno; 152 t->bt_curpage->h_flags &= ~F_DIRTY; 153 154 if (lruwrite(cache, (int) opgno) < 0) 155 return (RET_ERROR); 156 157 if (lrurelease(cache, (int) opgno) < 0) 158 return (RET_ERROR); 159 } 160 161 if (pgno > t->bt_npages) { 162 if ((h = (BTHEADER *) lrugetnew(cache, (int)pgno, &nbytes)) 163 == (BTHEADER *) NULL) 164 return (RET_ERROR); 165 t->bt_npages = pgno; 166 } else { 167 if ((h = (BTHEADER *) lruget(cache, (int)pgno, &nbytes)) 168 == (BTHEADER *) NULL) 169 return (RET_ERROR); 170 } 171 172 /* init this page, if necessary */ 173 if (nbytes == 0) { 174 h->h_pgno = pgno; 175 h->h_flags = F_LEAF; 176 h->h_lower = (index_t) 177 (((char *) &(h->h_linp[0])) - ((char *) h)); 178 h->h_upper = t->bt_psize; 179 h->h_prevpg = h->h_nextpg = P_NONE; 180 } 181 182 t->bt_curpage = h; 183 184 return (RET_SUCCESS); 185 } 186 187 /* sync the current page, if necessary */ 188 if (t->bt_curpage != (BTHEADER *) NULL) { 189 if (t->bt_curpage->h_flags & F_DIRTY) 190 if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR) 191 return (RET_ERROR); 192 } else { 193 if (t->bt_npages == 0) 194 t->bt_npages = 1; 195 196 /* if no current page, get space for one */ 197 if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize)) 198 == (BTHEADER *) NULL) { 199 return (RET_ERROR); 200 } 201 } 202 203 n = t->bt_psize; 204 pos = (long) (pgno * n); 205 206 /* seek to correct location in file */ 207 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) { 208 return (RET_ERROR); 209 } 210 211 /* read the page */ 212 if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) { 213 214 /* 215 * If we didn't get a full page, we must have gotten no 216 * data at all -- in which case we're trying to read a 217 * root page that doesn't exist yet. This is the only 218 * case in which this is okay. If this happens, construct 219 * an empty root page by hand. 220 */ 221 if (nbytes != 0 || pgno != P_ROOT) { 222 errno = EBADF; 223 return (RET_ERROR); 224 } 225 226 h = (BTHEADER *) t->bt_curpage; 227 h->h_pgno = pgno; 228 h->h_flags = F_LEAF; 229 h->h_lower = (index_t) 230 (((char *) &(h->h_linp[0])) - ((char *) h)); 231 h->h_upper = t->bt_psize; 232 h->h_prevpg = h->h_nextpg = P_NONE; 233 } else 234 (void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder); 235 236 return (RET_SUCCESS); 237 } 238 239 /* 240 * _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from 241 * the host-independent format stored on disk. 242 * 243 * Parameters: 244 * h -- page to convert 245 * _lorder -- byte order for pages (stored as a char * in the 246 * cache, and passed around as a magic cookie). 247 * 248 * Returns: 249 * RET_SUCCESS (lru code requires a return value). 250 * 251 * Side Effects: 252 * Layout of tree metadata on the page is changed in place. 253 * 254 * Warnings: 255 * Everywhere else in the code, the types pgno_t and index_t 256 * are opaque. These two routines know what they really 257 * are. 258 */ 259 260 int 261 _bt_pgout(h, _lorder) 262 BTHEADER *h; 263 char *_lorder; 264 { 265 int i; 266 int top; 267 int lorder; 268 DATUM *d; 269 IDATUM *id; 270 size_t chain; 271 272 lorder = (int) _lorder; 273 if (lorder == BYTE_ORDER) 274 return (RET_SUCCESS); 275 276 if (h->h_flags & F_LEAF) { 277 if (h->h_flags & F_CONT) { 278 if (h->h_prevpg == P_NONE) { 279 size_t longsz; 280 281 (void) bcopy((char *) &(h->h_linp[0]), 282 (char *) &longsz, 283 sizeof(longsz)); 284 BLSWAP(longsz); 285 (void) bcopy((char *) &longsz, 286 (char *) &(h->h_linp[0]), 287 sizeof(longsz)); 288 } 289 } else { 290 top = NEXTINDEX(h); 291 for (i = 0; i < top; i++) { 292 d = (DATUM *) GETDATUM(h, i); 293 if (d->d_flags & D_BIGKEY) { 294 (void) bcopy((char *) &(d->d_bytes[0]), 295 (char *) &chain, 296 sizeof(chain)); 297 BLSWAP(chain); 298 (void) bcopy((char *) &chain, 299 (char *) &(d->d_bytes[0]), 300 sizeof(chain)); 301 } 302 if (d->d_flags & D_BIGDATA) { 303 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 304 (char *) &chain, 305 sizeof(chain)); 306 BLSWAP(chain); 307 (void) bcopy((char *) &chain, 308 (char *) &(d->d_bytes[d->d_ksize]), 309 sizeof(chain)); 310 } 311 BLSWAP(d->d_dsize); 312 BLSWAP(d->d_ksize); 313 BLSWAP(d->d_flags); 314 BLSWAP(h->h_linp[i]); 315 } 316 } 317 } else { 318 top = NEXTINDEX(h); 319 for (i = 0; i < top; i++) { 320 id = (IDATUM *) GETDATUM(h, i); 321 BLSWAP(id->i_size); 322 BLSWAP(id->i_pgno); 323 BLSWAP(id->i_flags); 324 if (id->i_flags & D_BIGKEY) { 325 (void) bcopy((char *) &(id->i_bytes[0]), 326 (char *) &chain, 327 sizeof(chain)); 328 BLSWAP(chain); 329 (void) bcopy((char *) &chain, 330 (char *) &(id->i_bytes[0]), 331 sizeof(chain)); 332 } 333 BLSWAP(h->h_linp[i]); 334 } 335 } 336 BLSWAP(h->h_flags); 337 BLSWAP(h->h_pgno); 338 BLSWAP(h->h_prevpg); 339 BLSWAP(h->h_nextpg); 340 BLSWAP(h->h_lower); 341 BLSWAP(h->h_upper); 342 343 return (RET_SUCCESS); 344 } 345 346 int 347 _bt_pgin(h, _lorder) 348 BTHEADER *h; 349 char *_lorder; 350 { 351 int i; 352 int top; 353 int lorder; 354 DATUM *d; 355 IDATUM *id; 356 size_t chain; 357 358 /* 359 * If btree pages are to be stored in the host byte order, don't 360 * bother swapping. 361 */ 362 lorder = (int) _lorder; 363 if (lorder == BYTE_ORDER) 364 return (RET_SUCCESS); 365 366 BLSWAP(h->h_upper); 367 BLSWAP(h->h_lower); 368 BLSWAP(h->h_nextpg); 369 BLSWAP(h->h_prevpg); 370 BLSWAP(h->h_pgno); 371 BLSWAP(h->h_flags); 372 373 if (h->h_flags & F_LEAF) { 374 if (h->h_flags & F_CONT) { 375 if (h->h_prevpg == P_NONE) { 376 size_t longsz; 377 378 (void) bcopy((char *) &(h->h_linp[0]), 379 (char *) &longsz, 380 sizeof(longsz)); 381 BLSWAP(longsz); 382 (void) bcopy((char *) &longsz, 383 (char *) &(h->h_linp[0]), 384 sizeof(longsz)); 385 } 386 } else { 387 top = NEXTINDEX(h); 388 for (i = 0; i < top; i++) { 389 BLSWAP(h->h_linp[i]); 390 d = (DATUM *) GETDATUM(h, i); 391 BLSWAP(d->d_dsize); 392 BLSWAP(d->d_ksize); 393 BLSWAP(d->d_flags); 394 if (d->d_flags & D_BIGKEY) { 395 (void) bcopy((char *) &(d->d_bytes[0]), 396 (char *) &chain, 397 sizeof(chain)); 398 BLSWAP(chain); 399 (void) bcopy((char *) &chain, 400 (char *) &(d->d_bytes[0]), 401 sizeof(chain)); 402 } 403 if (d->d_flags & D_BIGDATA) { 404 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 405 (char *) &chain, 406 sizeof(chain)); 407 BLSWAP(chain); 408 (void) bcopy((char *) &chain, 409 (char *) &(d->d_bytes[d->d_ksize]), 410 sizeof(chain)); 411 } 412 } 413 } 414 } else { 415 top = NEXTINDEX(h); 416 for (i = 0; i < top; i++) { 417 BLSWAP(h->h_linp[i]); 418 id = (IDATUM *) GETDATUM(h, i); 419 BLSWAP(id->i_size); 420 BLSWAP(id->i_pgno); 421 BLSWAP(id->i_flags); 422 if (id->i_flags & D_BIGKEY) { 423 (void) bcopy((char *) &(id->i_bytes[0]), 424 (char *) &chain, 425 sizeof(chain)); 426 BLSWAP(chain); 427 (void) bcopy((char *) &chain, 428 (char *) &(id->i_bytes[0]), 429 sizeof(chain)); 430 } 431 } 432 } 433 return (RET_SUCCESS); 434 } 435 436 /* 437 * _BT_ALLOCPG -- allocate a new page in the btree. 438 * 439 * This is called when we split a page, to get space to do the split. 440 * For disk btrees, these pages are released when the split is done. 441 * For memory btrees, they are not. 442 * 443 * Parameters: 444 * t -- tree in which to do split 445 * 446 * Returns: 447 * Pointer to the newly-allocated page 448 */ 449 450 BTHEADER * 451 _bt_allocpg(t) 452 BTREE_P t; 453 { 454 BTHEADER *h = t->bt_curpage; 455 BTHEADER *nh; 456 int nbytes; 457 458 /* if we have a cache, let the cache code do the work */ 459 if (ISDISK(t) && ISCACHE(t)) { 460 nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache, 461 (int) (t->bt_npages + 1), 462 &nbytes); 463 } else { 464 nh = (BTHEADER *) malloc((unsigned) t->bt_psize); 465 } 466 467 if (nh != (BTHEADER *) NULL) { 468 nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE; 469 nh->h_flags = h->h_flags; 470 nh->h_lower = (index_t) 471 (((char *) &(nh->h_linp[0])) - ((char *) nh)); 472 nh->h_upper = t->bt_psize; 473 } 474 475 return (nh); 476 } 477 478 /* 479 * _BT_WRITE -- Write a specific page to a btree file. 480 * 481 * Parameters: 482 * t -- btree to get the page 483 * h -- page to write 484 * relflag -- (int) this page may/may not be released 485 * 486 * Returns: 487 * RET_SUCCESS, RET_ERROR. 488 * 489 * Side Effects: 490 * Writes a metadata page if none has been written yet. 491 */ 492 493 int 494 _bt_write(t, h, relflag) 495 BTREE_P t; 496 BTHEADER *h; 497 int relflag; 498 { 499 long pos; 500 int htindex; 501 HTBUCKET *b; 502 char *cache; 503 pgno_t pgno; 504 505 h->h_flags &= ~F_DIRTY; 506 if (ISDISK(t)) { 507 508 /* if we haven't done so yet, write the metadata */ 509 if (!(t->bt_flags & BTF_METAOK)) { 510 if (_bt_wrtmeta(t) == RET_ERROR) 511 return (RET_ERROR); 512 } 513 514 pgno = h->h_pgno; 515 516 517 /* if we have a cache, let the cache code do the work */ 518 if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) { 519 if (lruwrite(cache, (int) pgno) < 0) 520 return (RET_ERROR); 521 if (relflag && lrurelease(cache, (int) pgno) < 0) 522 return (RET_ERROR); 523 524 } else { 525 (void) _bt_pgout(h, (char *) t->bt_lorder); 526 /* now write the current page */ 527 pos = (long) (pgno * t->bt_psize); 528 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) 529 return (RET_ERROR); 530 if (write(t->bt_s.bt_d.d_fd, (char *) h, (int) t->bt_psize) 531 < t->bt_psize) 532 return (RET_ERROR); 533 if (!relflag) 534 (void) _bt_pgin(h, (char *) t->bt_lorder); 535 } 536 } else { 537 /* in-memory btree */ 538 htindex = HASHKEY(h->h_pgno); 539 540 /* see if we need to overwrite existing entry */ 541 for (b = t->bt_s.bt_ht[htindex]; 542 b != (HTBUCKET *) NULL; 543 b = b->ht_next) { 544 if (b->ht_pgno == h->h_pgno) { 545 b->ht_page = h; 546 return (RET_SUCCESS); 547 } 548 } 549 550 /* new entry, write it */ 551 b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET)); 552 if (b == (HTBUCKET *) NULL) 553 return (RET_ERROR); 554 555 b->ht_pgno = h->h_pgno; 556 b->ht_page = h; 557 b->ht_next = t->bt_s.bt_ht[htindex]; 558 t->bt_s.bt_ht[htindex] = b; 559 } 560 return (RET_SUCCESS); 561 } 562 563 /* 564 * _BT_RELEASE -- Release a locked-down cache page 565 * 566 * During page splits, we want to force pages out to the cache 567 * before we actually release them, in some cases. This routine 568 * releases such a page when it is no longer needed. 569 * 570 * Parameters: 571 * t -- btree in which to release page 572 * h -- page to release 573 * 574 * Returns: 575 * RET_SUCCESS, RET_ERROR. 576 * 577 * Side Effects: 578 * None. 579 */ 580 581 int 582 _bt_release(t, h) 583 BTREE_P t; 584 BTHEADER *h; 585 { 586 if (ISDISK(t) && ISCACHE(t)) { 587 if (lrurelease(t->bt_s.bt_d.d_cache, (int) (h->h_pgno)) < 0) 588 return (RET_ERROR); 589 } 590 return (RET_SUCCESS); 591 } 592 593 /* 594 * _BT_WRTMETA -- Write metadata to the btree. 595 * 596 * Parameters: 597 * t -- tree to which to write 598 * 599 * Returns: 600 * RET_SUCCESS, RET_ERROR. 601 */ 602 603 int 604 _bt_wrtmeta(t) 605 BTREE_P t; 606 { 607 BTMETA m; 608 609 if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l) 610 return (RET_ERROR); 611 612 /* lorder has to be in host-independent format */ 613 m.m_lorder = (u_long) htonl((long) t->bt_lorder); 614 615 m.m_magic = BTREEMAGIC; 616 m.m_version = BTREEVERSION; 617 m.m_psize = t->bt_psize; 618 m.m_free = t->bt_free; 619 m.m_flags = t->bt_flags & BTF_NODUPS; 620 621 if (t->bt_lorder != BYTE_ORDER) { 622 BLSWAP(m.m_magic); 623 BLSWAP(m.m_version); 624 BLSWAP(m.m_psize); 625 BLSWAP(m.m_free); 626 BLSWAP(m.m_flags); 627 } 628 629 if (write(t->bt_s.bt_d.d_fd, (char *) &m, sizeof(m)) 630 != sizeof(m)) { 631 return (RET_ERROR); 632 } 633 634 t->bt_flags |= BTF_METAOK; 635 636 return (RET_SUCCESS); 637 } 638