1 /* $NetBSD: bt_delete.c,v 1.12 2003/08/07 16:42:40 agc Exp $ */ 2 3 /*- 4 * Copyright (c) 1990, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Mike Olson. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/cdefs.h> 36 #if defined(LIBC_SCCS) && !defined(lint) 37 #if 0 38 static char sccsid[] = "@(#)bt_delete.c 8.13 (Berkeley) 7/28/94"; 39 #else 40 __RCSID("$NetBSD: bt_delete.c,v 1.12 2003/08/07 16:42:40 agc Exp $"); 41 #endif 42 #endif /* LIBC_SCCS and not lint */ 43 44 #include "namespace.h" 45 #include <sys/types.h> 46 47 #include <errno.h> 48 #include <stdio.h> 49 #include <string.h> 50 51 #include <db.h> 52 #include "btree.h" 53 54 static int __bt_bdelete __P((BTREE *, const DBT *)); 55 static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int)); 56 static int __bt_pdelete __P((BTREE *, PAGE *)); 57 static int __bt_relink __P((BTREE *, PAGE *)); 58 static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *)); 59 60 /* 61 * __bt_delete 62 * Delete the item(s) referenced by a key. 63 * 64 * Return RET_SPECIAL if the key is not found. 65 */ 66 int 67 __bt_delete(dbp, key, flags) 68 const DB *dbp; 69 const DBT *key; 70 u_int flags; 71 { 72 BTREE *t; 73 CURSOR *c; 74 PAGE *h; 75 int status; 76 77 t = dbp->internal; 78 79 /* Toss any page pinned across calls. */ 80 if (t->bt_pinned != NULL) { 81 mpool_put(t->bt_mp, t->bt_pinned, 0); 82 t->bt_pinned = NULL; 83 } 84 85 /* Check for change to a read-only tree. */ 86 if (F_ISSET(t, B_RDONLY)) { 87 errno = EPERM; 88 return (RET_ERROR); 89 } 90 91 switch (flags) { 92 case 0: 93 status = __bt_bdelete(t, key); 94 break; 95 case R_CURSOR: 96 /* 97 * If flags is R_CURSOR, delete the cursor. Must already 98 * have started a scan and not have already deleted it. 99 */ 100 c = &t->bt_cursor; 101 if (F_ISSET(c, CURS_INIT)) { 102 if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) 103 return (RET_SPECIAL); 104 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 105 return (RET_ERROR); 106 107 /* 108 * If the page is about to be emptied, we'll need to 109 * delete it, which means we have to acquire a stack. 110 */ 111 if (NEXTINDEX(h) == 1) 112 if (__bt_stkacq(t, &h, &t->bt_cursor)) 113 return (RET_ERROR); 114 115 status = __bt_dleaf(t, NULL, h, (u_int)c->pg.index); 116 117 if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { 118 if (__bt_pdelete(t, h)) 119 return (RET_ERROR); 120 } else 121 mpool_put(t->bt_mp, h, 122 (u_int)(status == RET_SUCCESS ? 123 MPOOL_DIRTY : 0)); 124 break; 125 } 126 /* FALLTHROUGH */ 127 default: 128 errno = EINVAL; 129 return (RET_ERROR); 130 } 131 if (status == RET_SUCCESS) 132 F_SET(t, B_MODIFIED); 133 return (status); 134 } 135 136 /* 137 * __bt_stkacq -- 138 * Acquire a stack so we can delete a cursor entry. 139 * 140 * Parameters: 141 * t: tree 142 * hp: pointer to current, pinned PAGE pointer 143 * c: pointer to the cursor 144 * 145 * Returns: 146 * 0 on success, 1 on failure 147 */ 148 static int 149 __bt_stkacq(t, hp, c) 150 BTREE *t; 151 PAGE **hp; 152 CURSOR *c; 153 { 154 BINTERNAL *bi; 155 EPG *e; 156 EPGNO *parent; 157 PAGE *h; 158 indx_t idx = 0; /* Pacify gcc */ 159 pgno_t pgno; 160 recno_t nextpg, prevpg; 161 int exact, level; 162 163 /* 164 * Find the first occurrence of the key in the tree. Toss the 165 * currently locked page so we don't hit an already-locked page. 166 */ 167 h = *hp; 168 mpool_put(t->bt_mp, h, 0); 169 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 170 return (1); 171 h = e->page; 172 173 /* See if we got it in one shot. */ 174 if (h->pgno == c->pg.pgno) 175 goto ret; 176 177 /* 178 * Move right, looking for the page. At each move we have to move 179 * up the stack until we don't have to move to the next page. If 180 * we have to change pages at an internal level, we have to fix the 181 * stack back up. 182 */ 183 while (h->pgno != c->pg.pgno) { 184 if ((nextpg = h->nextpg) == P_INVALID) 185 break; 186 mpool_put(t->bt_mp, h, 0); 187 188 /* Move up the stack. */ 189 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 190 /* Get the parent page. */ 191 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 192 return (1); 193 194 /* Move to the next index. */ 195 if (parent->index != NEXTINDEX(h) - 1) { 196 idx = parent->index + 1; 197 BT_PUSH(t, h->pgno, idx); 198 break; 199 } 200 mpool_put(t->bt_mp, h, 0); 201 } 202 203 /* Restore the stack. */ 204 while (level--) { 205 /* Push the next level down onto the stack. */ 206 bi = GETBINTERNAL(h, idx); 207 pgno = bi->pgno; 208 BT_PUSH(t, pgno, 0); 209 210 /* Lose the currently pinned page. */ 211 mpool_put(t->bt_mp, h, 0); 212 213 /* Get the next level down. */ 214 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 215 return (1); 216 idx = 0; 217 } 218 mpool_put(t->bt_mp, h, 0); 219 if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) 220 return (1); 221 } 222 223 if (h->pgno == c->pg.pgno) 224 goto ret; 225 226 /* Reacquire the original stack. */ 227 mpool_put(t->bt_mp, h, 0); 228 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 229 return (1); 230 h = e->page; 231 232 /* 233 * Move left, looking for the page. At each move we have to move 234 * up the stack until we don't have to change pages to move to the 235 * next page. If we have to change pages at an internal level, we 236 * have to fix the stack back up. 237 */ 238 while (h->pgno != c->pg.pgno) { 239 if ((prevpg = h->prevpg) == P_INVALID) 240 break; 241 mpool_put(t->bt_mp, h, 0); 242 243 /* Move up the stack. */ 244 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 245 /* Get the parent page. */ 246 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 247 return (1); 248 249 /* Move to the next index. */ 250 if (parent->index != 0) { 251 idx = parent->index - 1; 252 BT_PUSH(t, h->pgno, idx); 253 break; 254 } 255 mpool_put(t->bt_mp, h, 0); 256 } 257 258 /* Restore the stack. */ 259 while (level--) { 260 /* Push the next level down onto the stack. */ 261 bi = GETBINTERNAL(h, idx); 262 pgno = bi->pgno; 263 264 /* Lose the currently pinned page. */ 265 mpool_put(t->bt_mp, h, 0); 266 267 /* Get the next level down. */ 268 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 269 return (1); 270 271 idx = NEXTINDEX(h) - 1; 272 BT_PUSH(t, pgno, idx); 273 } 274 mpool_put(t->bt_mp, h, 0); 275 if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) 276 return (1); 277 } 278 279 280 ret: mpool_put(t->bt_mp, h, 0); 281 return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); 282 } 283 284 /* 285 * __bt_bdelete -- 286 * Delete all key/data pairs matching the specified key. 287 * 288 * Parameters: 289 * t: tree 290 * key: key to delete 291 * 292 * Returns: 293 * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. 294 */ 295 static int 296 __bt_bdelete(t, key) 297 BTREE *t; 298 const DBT *key; 299 { 300 EPG *e; 301 PAGE *h; 302 int deleted, exact, redo; 303 304 deleted = 0; 305 306 /* Find any matching record; __bt_search pins the page. */ 307 loop: if ((e = __bt_search(t, key, &exact)) == NULL) 308 return (deleted ? RET_SUCCESS : RET_ERROR); 309 if (!exact) { 310 mpool_put(t->bt_mp, e->page, 0); 311 return (deleted ? RET_SUCCESS : RET_SPECIAL); 312 } 313 314 /* 315 * Delete forward, then delete backward, from the found key. If 316 * there are duplicates and we reach either side of the page, do 317 * the key search again, so that we get them all. 318 */ 319 redo = 0; 320 h = e->page; 321 do { 322 if (__bt_dleaf(t, key, h, (u_int)e->index)) { 323 mpool_put(t->bt_mp, h, 0); 324 return (RET_ERROR); 325 } 326 if (F_ISSET(t, B_NODUPS)) { 327 if (NEXTINDEX(h) == 0) { 328 if (__bt_pdelete(t, h)) 329 return (RET_ERROR); 330 } else 331 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 332 return (RET_SUCCESS); 333 } 334 deleted = 1; 335 } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); 336 337 /* Check for right-hand edge of the page. */ 338 if (e->index == NEXTINDEX(h)) 339 redo = 1; 340 341 /* Delete from the key to the beginning of the page. */ 342 while (e->index-- > 0) { 343 if (__bt_cmp(t, key, e) != 0) 344 break; 345 if (__bt_dleaf(t, key, h, (u_int)e->index) == RET_ERROR) { 346 mpool_put(t->bt_mp, h, 0); 347 return (RET_ERROR); 348 } 349 if (e->index == 0) 350 redo = 1; 351 } 352 353 /* Check for an empty page. */ 354 if (NEXTINDEX(h) == 0) { 355 if (__bt_pdelete(t, h)) 356 return (RET_ERROR); 357 goto loop; 358 } 359 360 /* Put the page. */ 361 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 362 363 if (redo) 364 goto loop; 365 return (RET_SUCCESS); 366 } 367 368 /* 369 * __bt_pdelete -- 370 * Delete a single page from the tree. 371 * 372 * Parameters: 373 * t: tree 374 * h: leaf page 375 * 376 * Returns: 377 * RET_SUCCESS, RET_ERROR. 378 * 379 * Side-effects: 380 * mpool_put's the page 381 */ 382 static int 383 __bt_pdelete(t, h) 384 BTREE *t; 385 PAGE *h; 386 { 387 BINTERNAL *bi; 388 PAGE *pg; 389 EPGNO *parent; 390 indx_t cnt, idx, *ip, offset; 391 u_int32_t nksize; 392 char *from; 393 394 /* 395 * Walk the parent page stack -- a LIFO stack of the pages that were 396 * traversed when we searched for the page where the delete occurred. 397 * Each stack entry is a page number and a page index offset. The 398 * offset is for the page traversed on the search. We've just deleted 399 * a page, so we have to delete the key from the parent page. 400 * 401 * If the delete from the parent page makes it empty, this process may 402 * continue all the way up the tree. We stop if we reach the root page 403 * (which is never deleted, it's just not worth the effort) or if the 404 * delete does not empty the page. 405 */ 406 while ((parent = BT_POP(t)) != NULL) { 407 /* Get the parent page. */ 408 if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 409 return (RET_ERROR); 410 411 idx = parent->index; 412 bi = GETBINTERNAL(pg, idx); 413 414 /* Free any overflow pages. */ 415 if (bi->flags & P_BIGKEY && 416 __ovfl_delete(t, bi->bytes) == RET_ERROR) { 417 mpool_put(t->bt_mp, pg, 0); 418 return (RET_ERROR); 419 } 420 421 /* 422 * Free the parent if it has only the one key and it's not the 423 * root page. If it's the rootpage, turn it back into an empty 424 * leaf page. 425 */ 426 if (NEXTINDEX(pg) == 1) { 427 if (pg->pgno == P_ROOT) { 428 pg->lower = BTDATAOFF; 429 pg->upper = t->bt_psize; 430 pg->flags = P_BLEAF; 431 } else { 432 if (__bt_relink(t, pg) || __bt_free(t, pg)) 433 return (RET_ERROR); 434 continue; 435 } 436 } else { 437 /* Pack remaining key items at the end of the page. */ 438 nksize = NBINTERNAL(bi->ksize); 439 from = (char *)(void *)pg + pg->upper; 440 memmove(from + nksize, from, 441 (size_t)((char *)(void *)bi - from)); 442 pg->upper += nksize; 443 444 /* Adjust indices' offsets, shift the indices down. */ 445 offset = pg->linp[idx]; 446 for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip) 447 if (ip[0] < offset) 448 ip[0] += nksize; 449 for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip) 450 ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; 451 pg->lower -= sizeof(indx_t); 452 } 453 454 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 455 break; 456 } 457 458 /* Free the leaf page, as long as it wasn't the root. */ 459 if (h->pgno == P_ROOT) { 460 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 461 return (RET_SUCCESS); 462 } 463 return (__bt_relink(t, h) || __bt_free(t, h)); 464 } 465 466 /* 467 * __bt_dleaf -- 468 * Delete a single record from a leaf page. 469 * 470 * Parameters: 471 * t: tree 472 * key: referenced key 473 * h: page 474 * idx: index on page to delete 475 * 476 * Returns: 477 * RET_SUCCESS, RET_ERROR. 478 */ 479 int 480 __bt_dleaf(t, key, h, idx) 481 BTREE *t; 482 const DBT *key; 483 PAGE *h; 484 u_int idx; 485 { 486 BLEAF *bl; 487 indx_t cnt, *ip, offset; 488 u_int32_t nbytes; 489 void *to; 490 char *from; 491 492 /* If this record is referenced by the cursor, delete the cursor. */ 493 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 494 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 495 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx && 496 __bt_curdel(t, key, h, idx)) 497 return (RET_ERROR); 498 499 /* If the entry uses overflow pages, make them available for reuse. */ 500 to = bl = GETBLEAF(h, idx); 501 if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) 502 return (RET_ERROR); 503 if (bl->flags & P_BIGDATA && 504 __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) 505 return (RET_ERROR); 506 507 /* Pack the remaining key/data items at the end of the page. */ 508 nbytes = NBLEAF(bl); 509 from = (char *)(void *)h + h->upper; 510 memmove(from + nbytes, from, (size_t)((char *)(void *)to - from)); 511 h->upper += nbytes; 512 513 /* Adjust the indices' offsets, shift the indices down. */ 514 offset = h->linp[idx]; 515 for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip) 516 if (ip[0] < offset) 517 ip[0] += nbytes; 518 for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip) 519 ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; 520 h->lower -= sizeof(indx_t); 521 522 /* If the cursor is on this page, adjust it as necessary. */ 523 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 524 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 525 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx) 526 --t->bt_cursor.pg.index; 527 528 return (RET_SUCCESS); 529 } 530 531 /* 532 * __bt_curdel -- 533 * Delete the cursor. 534 * 535 * Parameters: 536 * t: tree 537 * key: referenced key (or NULL) 538 * h: page 539 * idx: index on page to delete 540 * 541 * Returns: 542 * RET_SUCCESS, RET_ERROR. 543 */ 544 static int 545 __bt_curdel(t, key, h, idx) 546 BTREE *t; 547 const DBT *key; 548 PAGE *h; 549 u_int idx; 550 { 551 CURSOR *c; 552 EPG e; 553 PAGE *pg; 554 int curcopy, status; 555 556 /* 557 * If there are duplicates, move forward or backward to one. 558 * Otherwise, copy the key into the cursor area. 559 */ 560 c = &t->bt_cursor; 561 F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); 562 563 curcopy = 0; 564 if (!F_ISSET(t, B_NODUPS)) { 565 /* 566 * We're going to have to do comparisons. If we weren't 567 * provided a copy of the key, i.e. the user is deleting 568 * the current cursor position, get one. 569 */ 570 if (key == NULL) { 571 e.page = h; 572 e.index = idx; 573 if ((status = __bt_ret(t, &e, 574 &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) 575 return (status); 576 curcopy = 1; 577 key = &c->key; 578 } 579 /* Check previous key, if not at the beginning of the page. */ 580 if (idx > 0) { 581 e.page = h; 582 e.index = idx - 1; 583 if (__bt_cmp(t, key, &e) == 0) { 584 F_SET(c, CURS_BEFORE); 585 goto dup2; 586 } 587 } 588 /* Check next key, if not at the end of the page. */ 589 if (idx < NEXTINDEX(h) - 1) { 590 e.page = h; 591 e.index = idx + 1; 592 if (__bt_cmp(t, key, &e) == 0) { 593 F_SET(c, CURS_AFTER); 594 goto dup2; 595 } 596 } 597 /* Check previous key if at the beginning of the page. */ 598 if (idx == 0 && h->prevpg != P_INVALID) { 599 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 600 return (RET_ERROR); 601 e.page = pg; 602 e.index = NEXTINDEX(pg) - 1; 603 if (__bt_cmp(t, key, &e) == 0) { 604 F_SET(c, CURS_BEFORE); 605 goto dup1; 606 } 607 mpool_put(t->bt_mp, pg, 0); 608 } 609 /* Check next key if at the end of the page. */ 610 if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { 611 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 612 return (RET_ERROR); 613 e.page = pg; 614 e.index = 0; 615 if (__bt_cmp(t, key, &e) == 0) { 616 F_SET(c, CURS_AFTER); 617 dup1: mpool_put(t->bt_mp, pg, 0); 618 dup2: c->pg.pgno = e.page->pgno; 619 c->pg.index = e.index; 620 return (RET_SUCCESS); 621 } 622 mpool_put(t->bt_mp, pg, 0); 623 } 624 } 625 e.page = h; 626 e.index = idx; 627 if (curcopy || (status = 628 __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { 629 F_SET(c, CURS_ACQUIRE); 630 return (RET_SUCCESS); 631 } 632 return (status); 633 } 634 635 /* 636 * __bt_relink -- 637 * Link around a deleted page. 638 * 639 * Parameters: 640 * t: tree 641 * h: page to be deleted 642 */ 643 static int 644 __bt_relink(t, h) 645 BTREE *t; 646 PAGE *h; 647 { 648 PAGE *pg; 649 650 if (h->nextpg != P_INVALID) { 651 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 652 return (RET_ERROR); 653 pg->prevpg = h->prevpg; 654 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 655 } 656 if (h->prevpg != P_INVALID) { 657 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 658 return (RET_ERROR); 659 pg->nextpg = h->nextpg; 660 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 661 } 662 return (0); 663 } 664