/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.76 2008/08/06 15:38:58 dillon Exp $
 */

/*
 * HAMMER B-Tree index
 *
 * HAMMER implements a modified B+Tree.  In documentation this will
 * simply be referred to as the HAMMER B-Tree.  Basically a HAMMER B-Tree
 * looks like a B+Tree (a B-Tree which stores its records only at the leaves
 * of the tree), but adds two additional boundary elements which describe
 * the left-most and right-most element a node is able to represent.  In
 * other words, we have boundary elements at the two ends of a B-Tree node
 * instead of sub-tree pointers.
 *
 * A B-Tree internal node looks like this:
 *
 *      B N N N N N N B   <-- boundary and internal elements
 *       S S S S S S S    <-- subtree pointers
 *
 * A B-Tree leaf node basically looks like this:
 *
 *      L L L L L L L L   <-- leaf elements
 *
 * The radix for an internal node is 1 less than a leaf but we get a
 * number of significant benefits for our troubles.
 *
 * The big benefit to using a B-Tree containing boundary information
 * is that it is possible to cache pointers into the middle of the tree
 * and not have to start searches, insertions, OR deletions at the root
 * node.  In particular, searches are able to progress in a definitive
 * direction from any point in the tree without revisiting nodes.  This
 * greatly improves the efficiency of many operations, most especially
 * record appends.
 *
 * B-Trees also make the stacking of trees fairly straightforward.
 *
 * INSERTIONS:  A search performed with the intention of doing
 * an insert will guarantee that the terminal leaf node is not full by
 * splitting full nodes.  Splits occur top-down during the dive down the
 * B-Tree.
 *
 * DELETIONS: A deletion makes no attempt to proactively balance the
 * tree and will recursively remove nodes that become empty.  If a
 * deadlock occurs a deletion may not be able to remove an empty leaf.
 * Deletions never allow internal nodes to become empty (that would blow
 * up the boundaries).
 */
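/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): the boundary invariant described above.  A node whose bounds
 * are [left_bound, right_bound) covers a key iff the key compares >=
 * the left boundary and < the right boundary; this is the same test
 * the cursor-up loop in btree_search() performs.
 */
#if 0
static __inline int
example_node_covers_key(hammer_base_elm_t left_bound,
                        hammer_base_elm_t right_bound,
                        hammer_base_elm_t key)
{
        return (hammer_btree_cmp(key, left_bound) >= 0 &&
                hammer_btree_cmp(key, right_bound) < 0);
}
#endif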
#include "hammer.h"
#include <sys/buf.h>
#include <sys/buf2.h>

static int btree_search(hammer_cursor_t cursor, int flags);
static int btree_split_internal(hammer_cursor_t cursor);
static int btree_split_leaf(hammer_cursor_t cursor);
static int btree_remove(hammer_cursor_t cursor);
static int btree_node_is_full(hammer_node_ondisk_t node);
static int hammer_btree_mirror_propagate(hammer_cursor_t cursor,
                        hammer_tid_t mirror_tid);
static void hammer_make_separator(hammer_base_elm_t key1,
                        hammer_base_elm_t key2, hammer_base_elm_t dest);
static void hammer_cursor_mirror_filter(hammer_cursor_t cursor);

/*
 * Iterate records after a search.  The cursor is iterated forwards past
 * the current record until a record matching the key-range requirements
 * is found.  ENOENT is returned if the iteration goes past the ending
 * key.
 *
 * The iteration is inclusive of key_beg and can be inclusive or exclusive
 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
 *
 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid
 * may be modified by B-Tree functions.
 *
 * cursor->key_beg may or may not be modified by this function during
 * the iteration.  XXX future - in case of an inverted lock we may have
 * to reinitiate the lookup and set key_beg to properly pick up where we
 * left off.
 *
 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor
 * was reverse indexed due to being moved to a parent while unlocked,
 * and something else might have inserted an element outside the iteration
 * range.  When this case occurs the iterator just keeps iterating until
 * it gets back into the iteration range (instead of asserting).
 *
 * NOTE! EDEADLK *CANNOT* be returned by this procedure.
 */
int
hammer_btree_iterate(hammer_cursor_t cursor)
{
        hammer_node_ondisk_t node;
        hammer_btree_elm_t elm;
        hammer_mount_t hmp;
        int error = 0;
        int r;
        int s;

        /*
         * Skip past the current record
         */
        hmp = cursor->trans->hmp;
        node = cursor->node->ondisk;
        if (node == NULL)
                return(ENOENT);
        if (cursor->index < node->count &&
            (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
                ++cursor->index;
        }

        /*
         * HAMMER can wind up being cpu-bound.
         */
        if (++hmp->check_yield > hammer_yield_check) {
                hmp->check_yield = 0;
                lwkt_user_yield();
        }

        /*
         * Loop until an element is found or we are done.
         */
        for (;;) {
                /*
                 * We iterate up the tree and then index over one element
                 * while we are at the last element in the current node.
                 *
                 * If we are at the root of the filesystem, cursor_up
                 * returns ENOENT.
                 *
                 * XXX this could be optimized by storing the information in
                 * the parent reference.
                 *
                 * XXX we can lose the node lock temporarily, this could mess
                 * up our scan.
                 */
                ++hammer_stats_btree_iterations;
                hammer_flusher_clean_loose_ios(hmp);

                if (cursor->index == node->count) {
                        if (hammer_debug_btree) {
                                kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n",
                                        (long long)cursor->node->node_offset,
                                        cursor->index,
                                        (long long)(cursor->parent ? cursor->parent->node_offset : -1),
                                        cursor->parent_index,
                                        curthread);
                        }
                        KKASSERT(cursor->parent == NULL ||
                                 cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset);
                        error = hammer_cursor_up(cursor);
                        if (error)
                                break;
                        /* reload stale pointer */
                        node = cursor->node->ondisk;
                        KKASSERT(cursor->index != node->count);

                        /*
                         * If we are reblocking we want to return internal
                         * nodes.  Note that the internal node will be
                         * returned multiple times, on each upward recursion
                         * from its children.  The caller selects which
                         * revisit it cares about (usually first or last only).
                         */
                        if (cursor->flags & HAMMER_CURSOR_REBLOCKING) {
                                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                                return(0);
                        }
                        ++cursor->index;
                        continue;
                }

                /*
                 * Check internal or leaf element.  Determine if the record
                 * at the cursor has gone beyond the end of our range.
                 *
                 * We recurse down through internal nodes.
                 */
                if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        elm = &node->elms[cursor->index];

                        r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
                        s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
                        if (hammer_debug_btree) {
                                kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d (td=%p)\n",
                                        (long long)cursor->node->node_offset,
                                        cursor->index,
                                        (long long)elm[0].internal.base.obj_id,
                                        elm[0].internal.base.rec_type,
                                        (long long)elm[0].internal.base.key,
                                        elm[0].internal.base.localization,
                                        r,
                                        curthread
                                );
                                kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
                                        (long long)cursor->node->node_offset,
                                        cursor->index + 1,
                                        (long long)elm[1].internal.base.obj_id,
                                        elm[1].internal.base.rec_type,
                                        (long long)elm[1].internal.base.key,
                                        elm[1].internal.base.localization,
                                        s
                                );
                        }

                        if (r < 0) {
                                error = ENOENT;
                                break;
                        }
                        if (r == 0 && (cursor->flags &
                                       HAMMER_CURSOR_END_INCLUSIVE) == 0) {
                                error = ENOENT;
                                break;
                        }

                        /*
                         * Better not be zero
                         */
                        KKASSERT(elm->internal.subtree_offset != 0);

                        if (s <= 0) {
                                /*
                                 * If running the mirror filter see if we
                                 * can skip one or more entire sub-trees.
                                 * If we can we return the internal node
                                 * and the caller processes the skipped
                                 * range (see mirror_read).
                                 */
                                if (cursor->flags &
                                    HAMMER_CURSOR_MIRROR_FILTERED) {
                                        if (elm->internal.mirror_tid <
                                            cursor->cmirror->mirror_tid) {
                                                hammer_cursor_mirror_filter(cursor);
                                                return(0);
                                        }
                                }
                        } else {
                                /*
                                 * Normally it would be impossible for the
                                 * cursor to have gotten back-indexed,
                                 * but it can happen if a node is deleted
                                 * and the cursor is moved to its parent
                                 * internal node.  ITERATE_CHECK will be set.
                                 */
                                KKASSERT(cursor->flags &
                                         HAMMER_CURSOR_ITERATE_CHECK);
                                kprintf("hammer_btree_iterate: "
                                        "DEBUG: Caught parent seek "
                                        "in internal iteration\n");
                        }

                        error = hammer_cursor_down(cursor);
                        if (error)
                                break;
                        KKASSERT(cursor->index == 0);
                        /* reload stale pointer */
                        node = cursor->node->ondisk;
                        continue;
                } else {
                        elm = &node->elms[cursor->index];
                        r = hammer_btree_cmp(&cursor->key_end, &elm->base);
                        if (hammer_debug_btree) {
                                kprintf("ELEMENT  %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n",
                                        (long long)cursor->node->node_offset,
                                        cursor->index,
                                        (elm[0].leaf.base.btype ?
                                         elm[0].leaf.base.btype : '?'),
                                        (long long)elm[0].leaf.base.obj_id,
                                        elm[0].leaf.base.rec_type,
                                        (long long)elm[0].leaf.base.key,
                                        elm[0].leaf.base.localization,
                                        r
                                );
                        }
                        if (r < 0) {
                                error = ENOENT;
                                break;
                        }

                        /*
                         * We support both end-inclusive and
                         * end-exclusive searches.
                         */
                        if (r == 0 &&
                            (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
                                error = ENOENT;
                                break;
                        }

                        /*
                         * If ITERATE_CHECK is set an unlocked cursor may
                         * have been moved to a parent and the iterate can
                         * happen upon elements that are not in the requested
                         * range.
                         */
                        if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) {
                                s = hammer_btree_cmp(&cursor->key_beg,
                                                     &elm->base);
                                if (s > 0) {
                                        kprintf("hammer_btree_iterate: "
                                                "DEBUG: Caught parent seek "
                                                "in leaf iteration\n");
                                        ++cursor->index;
                                        continue;
                                }
                        }
                        cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;

                        /*
                         * Return the element
                         */
                        switch(elm->leaf.base.btype) {
                        case HAMMER_BTREE_TYPE_RECORD:
                                if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
                                    hammer_btree_chkts(cursor->asof, &elm->base)) {
                                        ++cursor->index;
                                        continue;
                                }
                                error = 0;
                                break;
                        default:
                                error = EINVAL;
                                break;
                        }
                        if (error)
                                break;
                }
                /*
                 * node pointer invalid after loop
                 */

                /*
                 * Return entry
                 */
                if (hammer_debug_btree) {
                        int i = cursor->index;
                        hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
                        kprintf("ITERATE  %p:%d %016llx %02x %016llx lo=%02x\n",
                                cursor->node, i,
                                (long long)elm->internal.base.obj_id,
                                elm->internal.base.rec_type,
                                (long long)elm->internal.base.key,
                                elm->internal.base.localization
                        );
                }
                return(0);
        }
        return(error);
}
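/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): the typical forward scan built on hammer_btree_first() and
 * hammer_btree_iterate().  Cursor initialization and key_beg/key_end
 * setup are assumed to have been done by the caller.
 */
#if 0
static int
example_forward_scan(hammer_cursor_t cursor)
{
        int error;

        error = hammer_btree_first(cursor);
        while (error == 0) {
                /* cursor->node/cursor->index reference the current record */
                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                error = hammer_btree_iterate(cursor);
        }
        return (error); /* ENOENT when the scan passes key_end */
}
#endif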
/*
 * We hit an internal element that we could skip as part of a mirroring
 * scan.  Calculate the entire range being skipped.
 *
 * It is important to include any gaps between the parent's left_bound
 * and the node's left_bound, and the same goes for the right side.
 */
static void
hammer_cursor_mirror_filter(hammer_cursor_t cursor)
{
        struct hammer_cmirror *cmirror;
        hammer_node_ondisk_t ondisk;
        hammer_btree_elm_t elm;

        ondisk = cursor->node->ondisk;
        cmirror = cursor->cmirror;

        /*
         * Calculate the skipped range
         */
        elm = &ondisk->elms[cursor->index];
        if (cursor->index == 0)
                cmirror->skip_beg = *cursor->left_bound;
        else
                cmirror->skip_beg = elm->internal.base;
        while (cursor->index < ondisk->count) {
                if (elm->internal.mirror_tid >= cmirror->mirror_tid)
                        break;
                ++cursor->index;
                ++elm;
        }
        if (cursor->index == ondisk->count)
                cmirror->skip_end = *cursor->right_bound;
        else
                cmirror->skip_end = elm->internal.base;

        /*
         * Clip the returned result.
         */
        if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0)
                cmirror->skip_beg = cursor->key_beg;
        if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0)
                cmirror->skip_end = cursor->key_end;
}
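/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): how a mirror_read-style consumer might use the skip range
 * computed above.  When the forward iteration returns an internal node
 * with HAMMER_CURSOR_MIRROR_FILTERED set, everything in
 * [skip_beg, skip_end) has a mirror_tid below the requested threshold;
 * example_emit_skip() is a hypothetical consumer hook, not a real API.
 */
#if 0
static void
example_consume_skip(hammer_cursor_t cursor)
{
        struct hammer_cmirror *cmirror = cursor->cmirror;

        /* hypothetical hook; the real consumer lives in mirror_read */
        example_emit_skip(&cmirror->skip_beg, &cmirror->skip_end);
}
#endif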
/*
 * Iterate in the reverse direction.  This is used by the pruning code to
 * avoid overlapping records.
 */
int
hammer_btree_iterate_reverse(hammer_cursor_t cursor)
{
        hammer_node_ondisk_t node;
        hammer_btree_elm_t elm;
        int error = 0;
        int r;
        int s;

        /* mirror filtering not supported for reverse iteration */
        KKASSERT ((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0);

        /*
         * Skip past the current record.  For various reasons the cursor
         * may end up set to -1 or set to point at the end of the current
         * node.  These cases must be addressed.
         */
        node = cursor->node->ondisk;
        if (node == NULL)
                return(ENOENT);
        if (cursor->index != -1 &&
            (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
                --cursor->index;
        }
        if (cursor->index == cursor->node->ondisk->count)
                --cursor->index;

        /*
         * Loop until an element is found or we are done.
         */
        for (;;) {
                ++hammer_stats_btree_iterations;
                hammer_flusher_clean_loose_ios(cursor->trans->hmp);

                /*
                 * We iterate up the tree and then index over one element
                 * while we are at the last element in the current node.
                 */
                if (cursor->index == -1) {
                        error = hammer_cursor_up(cursor);
                        if (error) {
                                cursor->index = 0; /* sanity */
                                break;
                        }
                        /* reload stale pointer */
                        node = cursor->node->ondisk;
                        KKASSERT(cursor->index != node->count);
                        --cursor->index;
                        continue;
                }

                /*
                 * Check internal or leaf element.  Determine if the record
                 * at the cursor has gone beyond the end of our range.
                 *
                 * We recurse down through internal nodes.
                 */
                KKASSERT(cursor->index != node->count);
                if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        elm = &node->elms[cursor->index];
                        r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
                        s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
                        if (hammer_debug_btree) {
                                kprintf("BRACKETL %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
                                        (long long)cursor->node->node_offset,
                                        cursor->index,
                                        (long long)elm[0].internal.base.obj_id,
                                        elm[0].internal.base.rec_type,
                                        (long long)elm[0].internal.base.key,
                                        elm[0].internal.base.localization,
                                        r
                                );
                                kprintf("BRACKETR %016llx[%d] %016llx %02x %016llx lo=%02x %d\n",
                                        (long long)cursor->node->node_offset,
                                        cursor->index + 1,
                                        (long long)elm[1].internal.base.obj_id,
                                        elm[1].internal.base.rec_type,
                                        (long long)elm[1].internal.base.key,
                                        elm[1].internal.base.localization,
                                        s
                                );
                        }

                        if (s >= 0) {
                                error = ENOENT;
                                break;
                        }

                        /*
                         * It shouldn't be possible to seek past key_end,
                         * even if the cursor got moved to a parent.
                         */
                        KKASSERT(r >= 0);

                        /*
                         * Better not be zero
                         */
                        KKASSERT(elm->internal.subtree_offset != 0);

                        error = hammer_cursor_down(cursor);
                        if (error)
                                break;
                        KKASSERT(cursor->index == 0);
                        /* reload stale pointer */
                        node = cursor->node->ondisk;

                        /* this can assign -1 if the leaf was empty */
                        cursor->index = node->count - 1;
                        continue;
                } else {
                        elm = &node->elms[cursor->index];
                        s = hammer_btree_cmp(&cursor->key_beg, &elm->base);
                        if (hammer_debug_btree) {
                                kprintf("ELEMENT  %016llx:%d %c %016llx %02x %016llx lo=%02x %d\n",
                                        (long long)cursor->node->node_offset,
                                        cursor->index,
                                        (elm[0].leaf.base.btype ?
                                         elm[0].leaf.base.btype : '?'),
                                        (long long)elm[0].leaf.base.obj_id,
                                        elm[0].leaf.base.rec_type,
                                        (long long)elm[0].leaf.base.key,
                                        elm[0].leaf.base.localization,
                                        s
                                );
                        }
                        if (s > 0) {
                                error = ENOENT;
                                break;
                        }

                        /*
                         * It shouldn't be possible to seek past key_end,
                         * even if the cursor got moved to a parent.
                         */
                        cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;

                        /*
                         * Return the element
                         */
                        switch(elm->leaf.base.btype) {
                        case HAMMER_BTREE_TYPE_RECORD:
                                if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
                                    hammer_btree_chkts(cursor->asof, &elm->base)) {
                                        --cursor->index;
                                        continue;
                                }
                                error = 0;
                                break;
                        default:
                                error = EINVAL;
                                break;
                        }
                        if (error)
                                break;
                }
                /*
                 * node pointer invalid after loop
                 */

                /*
                 * Return entry
                 */
                if (hammer_debug_btree) {
                        int i = cursor->index;
                        hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
                        kprintf("ITERATE  %p:%d %016llx %02x %016llx lo=%02x\n",
                                cursor->node, i,
                                (long long)elm->internal.base.obj_id,
                                elm->internal.base.rec_type,
                                (long long)elm->internal.base.key,
                                elm->internal.base.localization
                        );
                }
                return(0);
        }
        return(error);
}
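/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): the reverse scan used by consumers such as the pruner, built
 * on hammer_btree_last() and hammer_btree_iterate_reverse().  Cursor
 * setup is assumed.
 */
#if 0
static int
example_reverse_scan(hammer_cursor_t cursor)
{
        int error;

        error = hammer_btree_last(cursor);
        while (error == 0) {
                /* process the record at cursor->node/cursor->index */
                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                error = hammer_btree_iterate_reverse(cursor);
        }
        return (error); /* ENOENT when the scan passes key_beg */
}
#endif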
/*
 * Lookup cursor->key_beg.  0 is returned on success, ENOENT if the entry
 * could not be found, EDEADLK if inserting and a retry is needed, and a
 * fatal error otherwise.  When retrying, the caller must terminate the
 * cursor and reinitialize it.  EDEADLK cannot be returned if not inserting.
 *
 * The cursor is suitably positioned for a deletion on success, and suitably
 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was
 * specified.
 *
 * The cursor may begin anywhere, the search will traverse the tree in
 * either direction to locate the requested element.
 *
 * Most of the logic implementing historical searches is handled here.  We
 * do an initial lookup with create_tid set to the asof TID.  Due to the
 * way records are laid out, a backwards iteration may be required if
 * ENOENT is returned to locate the historical record.  Here's the
 * problem:
 *
 * create_tid:    10      15       20
 *                   LEAF1   LEAF2
 * records:         (11)        (18)
 *
 * Let's say we want to do a lookup AS-OF timestamp 17.  We will traverse
 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is
 * not visible and thus causes ENOENT to be returned.  We really need
 * to check record 11 in LEAF1.  If it also fails then the search fails
 * (e.g. it might represent the range 11-16 and thus still not match our
 * AS-OF timestamp of 17).  Note that LEAF1 could be empty, requiring
 * further iterations.
 *
 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK
 * and the cursor->create_check TID if an iteration might be needed.
 * In the above example create_check would be set to 14.
 */
int
hammer_btree_lookup(hammer_cursor_t cursor)
{
        int error;

        cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;
        KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 ||
                  cursor->trans->sync_lock_refs > 0);
        ++hammer_stats_btree_lookups;
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
                cursor->key_beg.create_tid = cursor->asof;
                for (;;) {
                        cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK;
                        error = btree_search(cursor, 0);
                        if (error != ENOENT ||
                            (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) {
                                /*
                                 * Stop if no error.
                                 * Stop if the error is other than ENOENT.
                                 * Stop if ENOENT and not special case.
                                 */
                                break;
                        }
                        if (hammer_debug_btree) {
                                kprintf("CREATE_CHECK %016llx\n",
                                        (long long)cursor->create_check);
                        }
                        cursor->key_beg.create_tid = cursor->create_check;
                        /* loop */
                }
        } else {
                error = btree_search(cursor, 0);
        }
        if (error == 0)
                error = hammer_btree_extract(cursor, cursor->flags);
        return(error);
}

/*
 * Execute the logic required to start an iteration.  The first record
 * located within the specified range is returned and iteration control
 * flags are adjusted for successive hammer_btree_iterate() calls.
 *
 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate
 * in a loop without worrying about it.  Higher-level merged searches will
 * adjust the flag appropriately.
 */
int
hammer_btree_first(hammer_cursor_t cursor)
{
        int error;

        error = hammer_btree_lookup(cursor);
        if (error == ENOENT) {
                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                error = hammer_btree_iterate(cursor);
        }
        cursor->flags |= HAMMER_CURSOR_ATEDISK;
        return(error);
}
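/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): a historical (as-of) lookup.  The caller fills in key_beg and
 * the asof TID; hammer_btree_lookup() performs the create_check
 * iteration described above internally.
 */
#if 0
static int
example_asof_lookup(hammer_cursor_t cursor, hammer_tid_t asof_tid)
{
        cursor->asof = asof_tid;
        cursor->flags |= HAMMER_CURSOR_ASOF;
        /* key_beg.create_tid is overwritten by the lookup */
        return (hammer_btree_lookup(cursor));
}
#endif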
/*
 * Similarly but for an iteration in the reverse direction.
 *
 * Set ATEDISK when iterating backwards to skip the current entry,
 * which after an ENOENT lookup will be pointing beyond our end point.
 *
 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse
 * in a loop without worrying about it.  Higher-level merged searches will
 * adjust the flag appropriately.
 */
int
hammer_btree_last(hammer_cursor_t cursor)
{
        struct hammer_base_elm save;
        int error;

        save = cursor->key_beg;
        cursor->key_beg = cursor->key_end;
        error = hammer_btree_lookup(cursor);
        cursor->key_beg = save;
        if (error == ENOENT ||
            (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                error = hammer_btree_iterate_reverse(cursor);
        }
        cursor->flags |= HAMMER_CURSOR_ATEDISK;
        return(error);
}

/*
 * Extract the record and/or data associated with the cursor's current
 * position.  Any prior record or data stored in the cursor is replaced.
 * The cursor must be positioned at a leaf node.
 *
 * NOTE: All extractions occur at the leaf of the B-Tree.
 */
int
hammer_btree_extract(hammer_cursor_t cursor, int flags)
{
        hammer_node_ondisk_t node;
        hammer_btree_elm_t elm;
        hammer_off_t data_off;
        hammer_mount_t hmp;
        int32_t data_len;
        int error;

        /*
         * The case where the data reference resolves to the same buffer
         * as the record reference must be handled.
         */
        node = cursor->node->ondisk;
        elm = &node->elms[cursor->index];
        cursor->data = NULL;
        hmp = cursor->node->hmp;

        /*
         * There is nothing to extract for an internal element.
         */
        if (node->type == HAMMER_BTREE_TYPE_INTERNAL)
                return(EINVAL);

        /*
         * Only record types have data.
         */
        KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
        cursor->leaf = &elm->leaf;

        if ((flags & HAMMER_CURSOR_GET_DATA) == 0)
                return(0);
        if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
                return(0);
        data_off = elm->leaf.data_offset;
        data_len = elm->leaf.data_len;
        if (data_off == 0)
                return(0);

        /*
         * Load the data
         */
        KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE);
        cursor->data = hammer_bread_ext(hmp, data_off, data_len,
                                        &error, &cursor->data_buffer);

        /*
         * Mark the data buffer as not being meta-data if it isn't
         * meta-data (sometimes bulk data is accessed via a volume
         * block device).
         */
        if (error == 0) {
                switch(elm->leaf.base.rec_type) {
                case HAMMER_RECTYPE_DATA:
                case HAMMER_RECTYPE_DB:
                        hammer_io_notmeta(cursor->data_buffer);
                        break;
                default:
                        break;
                }
        }

        /*
         * Deal with CRC errors on the extracted data.
         */
        if (error == 0 &&
            hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) {
                kprintf("CRC DATA @ %016llx/%d FAILED\n",
                        (long long)elm->leaf.data_offset, elm->leaf.data_len);
                if (hammer_debug_critical)
                        Debugger("CRC FAILED: DATA");
                if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM)
                        error = EDOM;   /* less critical (mirroring) */
                else
                        error = EIO;    /* critical */
        }
        return(error);
}
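/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): asking the lookup path to extract data.  hammer_btree_lookup()
 * calls hammer_btree_extract() with cursor->flags on success, so a
 * caller that sets HAMMER_CURSOR_GET_DATA gets cursor->leaf and
 * cursor->data filled in (cursor->data stays NULL for records without
 * data).
 */
#if 0
static int
example_get_record(hammer_cursor_t cursor)
{
        int error;

        cursor->flags |= HAMMER_CURSOR_GET_DATA;
        error = hammer_btree_lookup(cursor);
        if (error == 0 && cursor->data != NULL) {
                /* cursor->leaf and cursor->data are valid here */
        }
        return (error);
}
#endif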
/*
 * Insert a leaf element into the B-Tree at the current cursor position.
 * The cursor is positioned such that the elements at and beyond the
 * cursor are shifted to make room for the new record.
 *
 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
 * flag set and that call must return ENOENT before this function can be
 * called.
 *
 * The caller may depend on the cursor's exclusive lock after return to
 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE).
 *
 * ENOSPC is returned if there is no room to insert a new record.
 */
int
hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm,
                    int *doprop)
{
        hammer_node_ondisk_t node;
        int i;
        int error;

        *doprop = 0;
        if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
                return(error);
        ++hammer_stats_btree_inserts;

        /*
         * Insert the element at the leaf node and update the count in the
         * parent.  It is possible for parent to be NULL, indicating that
         * the filesystem's ROOT B-Tree node is a leaf itself.  The root
         * inode can never be deleted so the leaf should never be empty.
         *
         * Remember that the right-hand boundary is not included in the
         * count.
         */
        hammer_modify_node_all(cursor->trans, cursor->node);
        node = cursor->node->ondisk;
        i = cursor->index;
        KKASSERT(elm->base.btype != 0);
        KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
        KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
        if (i != node->count) {
                bcopy(&node->elms[i], &node->elms[i+1],
                      (node->count - i) * sizeof(*elm));
        }
        node->elms[i].leaf = *elm;
        ++node->count;
        hammer_cursor_inserted_element(cursor->node, i);

        /*
         * Update the leaf node's aggregate mirror_tid for mirroring
         * support.
         */
        if (node->mirror_tid < elm->base.delete_tid) {
                node->mirror_tid = elm->base.delete_tid;
                *doprop = 1;
        }
        if (node->mirror_tid < elm->base.create_tid) {
                node->mirror_tid = elm->base.create_tid;
                *doprop = 1;
        }
        hammer_modify_node_done(cursor->node);

        /*
         * Debugging sanity checks.
         */
        KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0);
        KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0);
        if (i) {
                KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0);
        }
        if (i != node->count - 1)
                KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0);

        return(0);
}
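/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): the insert protocol described above.  The lookup must be made
 * with HAMMER_CURSOR_INSERT and must return ENOENT before
 * hammer_btree_insert() may be called; treating a successful lookup as
 * EEXIST is this example's choice, not a rule from the original.  If
 * *doprop is set the caller is expected to propagate the updated
 * mirror_tid towards the root.
 */
#if 0
static int
example_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t leaf)
{
        int error;
        int doprop;

        cursor->key_beg = leaf->base;
        cursor->flags |= HAMMER_CURSOR_INSERT;
        error = hammer_btree_lookup(cursor);
        if (error == ENOENT)
                error = hammer_btree_insert(cursor, leaf, &doprop);
        else if (error == 0)
                error = EEXIST;
        return (error);
}
#endif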
/*
 * Delete a record from the B-Tree at the current cursor position.
 * The cursor is positioned such that the current element is the one
 * to be deleted.
 *
 * On return the cursor will be positioned after the deleted element and
 * MAY point to an internal node.  It will be suitable for the continuation
 * of an iteration but not for an insertion or deletion.
 *
 * Deletions will attempt to partially rebalance the B-Tree in an upward
 * direction, but will terminate rather than deadlock.  Empty internal
 * nodes are never allowed by a deletion; a deletion which deadlocks may
 * end up leaving us with an empty leaf.  The pruner will clean up and
 * rebalance the tree.
 *
 * This function can return EDEADLK, requiring the caller to retry the
 * operation after clearing the deadlock.
 */
int
hammer_btree_delete(hammer_cursor_t cursor)
{
        hammer_node_ondisk_t ondisk;
        hammer_node_t node;
        hammer_node_t parent;
        int error;
        int i;

        KKASSERT (cursor->trans->sync_lock_refs > 0);
        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                return(error);
        ++hammer_stats_btree_deletes;

        /*
         * Delete the element from the leaf node.
         *
         * Remember that leaf nodes do not have boundaries.
         */
        node = cursor->node;
        ondisk = node->ondisk;
        i = cursor->index;

        KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
        KKASSERT(i >= 0 && i < ondisk->count);
        hammer_modify_node_all(cursor->trans, node);
        if (i + 1 != ondisk->count) {
                bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
                      (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
        }
        --ondisk->count;
        hammer_modify_node_done(node);
        hammer_cursor_deleted_element(node, i);

        /*
         * Validate local parent
         */
        if (ondisk->parent) {
                parent = cursor->parent;

                KKASSERT(parent != NULL);
                KKASSERT(parent->node_offset == ondisk->parent);
        }

        /*
         * If the leaf becomes empty it must be detached from the parent,
         * potentially recursing through to the filesystem root.
         *
         * This may reposition the cursor at one of the parents of the
         * current node.
         *
         * Ignore deadlock errors, that simply means that btree_remove
         * was unable to recurse and had to leave us with an empty leaf.
         */
        KKASSERT(cursor->index <= ondisk->count);
        if (ondisk->count == 0) {
                error = btree_remove(cursor);
                if (error == EDEADLK)
                        error = 0;
        } else {
                error = 0;
        }
        KKASSERT(cursor->parent == NULL ||
                 cursor->parent_index < cursor->parent->ondisk->count);
        return(error);
}
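/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): deleting the current record with the EDEADLK retry convention.
 * Re-positioning of the cursor between attempts (terminating and
 * reinitializing it) is elided here.
 */
#if 0
static int
example_delete_current(hammer_cursor_t cursor)
{
        int error;

        do {
                error = hammer_btree_lookup(cursor);
                if (error == 0)
                        error = hammer_btree_delete(cursor);
        } while (error == EDEADLK);
        return (error);
}
#endif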
/*
 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE
 *
 * Search the filesystem B-Tree for cursor->key_beg, return the matching node.
 *
 * The search can begin ANYWHERE in the B-Tree.  As a first step the search
 * iterates up the tree as necessary to properly position itself prior to
 * actually doing the search.
 *
 * INSERTIONS: The search will split full nodes and leaves on its way down
 * and guarantee that the leaf it ends up on is not full.  If we run out
 * of space the search continues to the leaf (to position the cursor for
 * the spike), but ENOSPC is returned.
 *
 * The search is only guaranteed to end up on a leaf if an error code of 0
 * is returned, or if inserting and an error code of ENOENT is returned.
 * Otherwise it can stop at an internal node.  On success a search returns
 * a leaf node.
 *
 * COMPLEXITY WARNING!  This is the core B-Tree search code for the entire
 * filesystem, and it is not simple code.  Please note the following facts:
 *
 * - Internal node recursions have a boundary on the left AND right.  The
 *   right boundary is non-inclusive.  The create_tid is a generic part
 *   of the key for internal nodes.
 *
 * - Leaf nodes contain only terminal elements now.
 *
 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a
 *   historical search.  ASOF and INSERT are mutually exclusive.  When
 *   doing an as-of lookup btree_search() checks for a right-edge boundary
 *   case.  If while recursing down the left-edge differs from the key
 *   by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along
 *   with cursor->create_check.  This is used by btree_lookup() to iterate.
 *   The iteration is backwards because as-of searches can wind up going
 *   down the wrong branch of the B-Tree.
 */
static
int
btree_search(hammer_cursor_t cursor, int flags)
{
        hammer_node_ondisk_t node;
        hammer_btree_elm_t elm;
        int error;
        int enospc = 0;
        int i;
        int r;
        int s;

        flags |= cursor->flags;
        ++hammer_stats_btree_searches;

        if (hammer_debug_btree) {
                kprintf("SEARCH   %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td = %p)\n",
                        (long long)cursor->node->node_offset,
                        cursor->index,
                        (long long)cursor->key_beg.obj_id,
                        cursor->key_beg.rec_type,
                        (long long)cursor->key_beg.key,
                        (long long)cursor->key_beg.create_tid,
                        cursor->key_beg.localization,
                        curthread
                );
                if (cursor->parent)
                        kprintf("SEARCHP  %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n",
                                (long long)cursor->parent->node_offset,
                                cursor->parent_index,
                                (long long)cursor->left_bound->obj_id,
                                (long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id,
                                (long long)cursor->right_bound->obj_id,
                                (long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id,
                                cursor->left_bound,
                                &cursor->parent->ondisk->elms[cursor->parent_index],
                                cursor->right_bound,
                                &cursor->parent->ondisk->elms[cursor->parent_index+1]
                        );
        }

        /*
         * Move our cursor up the tree until we find a node whose range covers
         * the key we are trying to locate.
         *
         * The left bound is inclusive, the right bound is non-inclusive.
         * It is ok to cursor up too far.
         */
        for (;;) {
                r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound);
                s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound);
                if (r >= 0 && s < 0)
                        break;
                KKASSERT(cursor->parent);
                ++hammer_stats_btree_iterations;
                error = hammer_cursor_up(cursor);
                if (error)
                        goto done;
        }

        /*
         * The delete-checks below are based on node, not parent.  Set the
         * initial delete-check based on the parent.
         */
        if (r == 1) {
                KKASSERT(cursor->left_bound->create_tid != 1);
                cursor->create_check = cursor->left_bound->create_tid - 1;
                cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
        }

        /*
         * We better have ended up with a node somewhere.
         */
        KKASSERT(cursor->node != NULL);

        /*
         * If we are inserting we can't start at a full node if the parent
         * is also full (because there is no way to split the node),
         * continue running up the tree until the requirement is satisfied
         * or we hit the root of the filesystem.
         *
         * (If inserting we aren't doing an as-of search so we don't have
         * to worry about create_check).
         */
        while ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
                if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
                        if (btree_node_is_full(cursor->node->ondisk) == 0)
                                break;
                } else {
                        if (btree_node_is_full(cursor->node->ondisk) == 0)
                                break;
                }
                if (cursor->node->ondisk->parent == 0 ||
                    cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) {
                        break;
                }
                ++hammer_stats_btree_iterations;
                error = hammer_cursor_up(cursor);
                /* node may have become stale */
                if (error)
                        goto done;
        }
        /*
         * Push down through internal nodes to locate the requested key.
         */
        node = cursor->node->ondisk;
        while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
                /*
                 * Scan the node to find the subtree index to push down into.
                 * We go one-past, then back-up.
                 *
                 * We must proactively remove deleted elements which may
                 * have been left over from a deadlocked btree_remove().
                 *
                 * The left and right boundaries are included in the loop
                 * in order to detect edge cases.
                 *
                 * If the separator only differs by create_tid (r == 1)
                 * and we are doing an as-of search, we may end up going
                 * down a branch to the left of the one containing the
                 * desired key.  This requires numerous special cases.
                 */
                ++hammer_stats_btree_iterations;
                if (hammer_debug_btree) {
                        kprintf("SEARCH-I %016llx count=%d\n",
                                (long long)cursor->node->node_offset,
                                node->count);
                }

                /*
                 * Try to shortcut the search before dropping into the
                 * linear loop.  Locate the first node where r <= 1.
                 */
                i = hammer_btree_search_node(&cursor->key_beg, node);
                while (i <= node->count) {
                        ++hammer_stats_btree_elements;
                        elm = &node->elms[i];
                        r = hammer_btree_cmp(&cursor->key_beg, &elm->base);
                        if (hammer_debug_btree > 2) {
                                kprintf(" IELM %p %d r=%d\n",
                                        &node->elms[i], i, r);
                        }
                        if (r < 0)
                                break;
                        if (r == 1) {
                                KKASSERT(elm->base.create_tid != 1);
                                cursor->create_check = elm->base.create_tid - 1;
                                cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
                        }
                        ++i;
                }
                if (hammer_debug_btree) {
                        kprintf("SEARCH-I preI=%d/%d r=%d\n",
                                i, node->count, r);
                }

                /*
                 * These cases occur when the parent's idea of the boundary
                 * is wider than the child's idea of the boundary, and
                 * require special handling.  If not inserting we can
                 * terminate the search early for these cases but the
                 * child's boundaries cannot be unconditionally modified.
                 */
                if (i == 0) {
                        /*
                         * If i == 0 the search terminated to the LEFT of the
                         * left_boundary but to the RIGHT of the parent's left
                         * boundary.
                         */
                        u_int8_t save;

                        elm = &node->elms[0];

                        /*
                         * If we aren't inserting we can stop here.
                         */
                        if ((flags & (HAMMER_CURSOR_INSERT |
                                      HAMMER_CURSOR_PRUNING)) == 0) {
                                cursor->index = 0;
                                return(ENOENT);
                        }

                        /*
                         * Correct a left-hand boundary mismatch.
                         *
                         * We can only do this if we can upgrade the lock,
                         * and synchronized as a background cursor (i.e.
                         * inserting or pruning).
                         *
                         * WARNING: We can only do this if inserting, i.e.
                         * we are running on the backend.
                         */
                        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                                return(error);
                        KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
                        hammer_modify_node_field(cursor->trans, cursor->node,
                                                 elms[0]);
                        save = node->elms[0].base.btype;
                        node->elms[0].base = *cursor->left_bound;
                        node->elms[0].base.btype = save;
                        hammer_modify_node_done(cursor->node);
                } else if (i == node->count + 1) {
                        /*
                         * If i == node->count + 1 the search terminated to
                         * the RIGHT of the right boundary but to the LEFT
                         * of the parent's right boundary.  If we aren't
                         * inserting we can stop here.
                         *
                         * Note that the last element in this case is
                         * elms[i-2] prior to adjustments to 'i'.
                         */
                        --i;
                        if ((flags & (HAMMER_CURSOR_INSERT |
                                      HAMMER_CURSOR_PRUNING)) == 0) {
                                cursor->index = i;
                                return (ENOENT);
                        }

                        /*
                         * Correct a right-hand boundary mismatch.
                         * (actual push-down record is i-2 prior to
                         * adjustments to i).
                         *
                         * We can only do this if we can upgrade the lock,
                         * and synchronized as a background cursor (i.e.
                         * inserting or pruning).
                         *
                         * WARNING: We can only do this if inserting, i.e.
                         * we are running on the backend.
                         */
                        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                                return(error);
                        elm = &node->elms[i];
                        KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
                        hammer_modify_node(cursor->trans, cursor->node,
                                           &elm->base, sizeof(elm->base));
                        elm->base = *cursor->right_bound;
                        hammer_modify_node_done(cursor->node);
                        --i;
                } else {
                        /*
                         * The push-down index is now i - 1.  If we had
                         * terminated on the right boundary this will point
                         * us at the last element.
                         */
                        --i;
                }
                cursor->index = i;
                elm = &node->elms[i];

                if (hammer_debug_btree) {
                        kprintf("RESULT-I %016llx[%d] %016llx %02x "
                                "key=%016llx cre=%016llx lo=%02x\n",
                                (long long)cursor->node->node_offset,
                                i,
                                (long long)elm->internal.base.obj_id,
                                elm->internal.base.rec_type,
                                (long long)elm->internal.base.key,
                                (long long)elm->internal.base.create_tid,
                                elm->internal.base.localization
                        );
                }

                /*
                 * We better have a valid subtree offset.
                 */
                KKASSERT(elm->internal.subtree_offset != 0);

                /*
                 * Handle insertion and deletion requirements.
                 *
                 * If inserting split full nodes.  The split code will
                 * adjust cursor->node and cursor->index if the current
                 * index winds up in the new node.
                 *
                 * If inserting and a left or right edge case was detected,
                 * we cannot correct the left or right boundary and must
                 * prepend and append an empty leaf node in order to make
                 * the boundary correction.
                 *
                 * If we run out of space we set enospc and continue on
                 * to a leaf to provide the spike code with a good point
                 * of entry.
                 */
                if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
                        if (btree_node_is_full(node)) {
                                error = btree_split_internal(cursor);
                                if (error) {
                                        if (error != ENOSPC)
                                                goto done;
                                        enospc = 1;
                                }
                                /*
                                 * reload stale pointers
                                 */
                                i = cursor->index;
                                node = cursor->node->ondisk;
                        }
                }

                /*
                 * Push down (push into new node, existing node becomes
                 * the parent) and continue the search.
                 */
                error = hammer_cursor_down(cursor);
                /* node may have become stale */
                if (error)
                        goto done;
                node = cursor->node->ondisk;
        }

        /*
         * We are at a leaf, do a linear search of the key array.
         *
         * On success the index is set to the matching element and 0
         * is returned.
         *
         * On failure the index is set to the insertion point and ENOENT
         * is returned.
         *
         * Boundaries are not stored in leaf nodes, so the index can wind
         * up to the left of element 0 (index == 0) or past the end of
         * the array (index == node->count).  It is also possible that the
         * leaf might be empty.
         */
        ++hammer_stats_btree_iterations;
        KKASSERT (node->type == HAMMER_BTREE_TYPE_LEAF);
        KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
        if (hammer_debug_btree) {
                kprintf("SEARCH-L %016llx count=%d\n",
                        (long long)cursor->node->node_offset,
                        node->count);
        }

        /*
         * Try to shortcut the search before dropping into the
         * linear loop.  Locate the first node where r <= 1.
         */
        i = hammer_btree_search_node(&cursor->key_beg, node);
        while (i < node->count) {
                ++hammer_stats_btree_elements;
                elm = &node->elms[i];

                r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base);

                if (hammer_debug_btree > 1)
                        kprintf("  ELM %p %d r=%d\n", &node->elms[i], i, r);

                /*
                 * We are at a record element.  Stop if we've flipped past
                 * key_beg, not counting the create_tid test.  Allow the
                 * r == 1 case (key_beg > element but differs only by its
                 * create_tid) to fall through to the AS-OF check.
                 */
                KKASSERT (elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);

                if (r < 0)
                        goto failed;
                if (r > 1) {
                        ++i;
                        continue;
                }

                /*
                 * Check our as-of timestamp against the element.
                 */
                if (flags & HAMMER_CURSOR_ASOF) {
                        if (hammer_btree_chkts(cursor->asof,
                                               &node->elms[i].base) != 0) {
                                ++i;
                                continue;
                        }
                        /* success */
                } else {
                        if (r > 0) {    /* can only be +1 */
                                ++i;
                                continue;
                        }
                        /* success */
                }
                cursor->index = i;
                error = 0;
                if (hammer_debug_btree) {
                        kprintf("RESULT-L %016llx[%d] (SUCCESS)\n",
                                (long long)cursor->node->node_offset, i);
                }
                goto done;
        }

        /*
         * The search of the leaf node failed.  i is the insertion point.
         */
failed:
        if (hammer_debug_btree) {
                kprintf("RESULT-L %016llx[%d] (FAILED)\n",
                        (long long)cursor->node->node_offset, i);
        }

        /*
         * No exact match was found, i is now at the insertion point.
         *
         * If inserting split a full leaf before returning.  This
         * may have the side effect of adjusting cursor->node and
         * cursor->index.
         */
        cursor->index = i;
        if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 &&
             btree_node_is_full(node)) {
                error = btree_split_leaf(cursor);
                if (error) {
                        if (error != ENOSPC)
                                goto done;
                        enospc = 1;
                }
                /*
                 * reload stale pointers
                 */
                /* NOT USED
                i = cursor->index;
                node = &cursor->node->internal;
                */
        }

        /*
         * We reached a leaf but did not find the key we were looking for.
         * If this is an insert we will be properly positioned for an insert
         * (ENOENT) or spike (ENOSPC) operation.
         */
        error = enospc ? ENOSPC : ENOENT;
done:
        return(error);
}
/*
 * Heuristic search for the first element whose comparison is <= 1.  May
 * return an index whose compare result is > 1 but may only return an index
 * whose compare result is <= 1 if it is the first element with that result.
 */
int
hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node)
{
        int b;
        int s;
        int i;
        int r;

        /*
         * Don't bother if the node does not have very many elements
         */
        b = 0;
        s = node->count;
        while (s - b > 4) {
                i = b + (s - b) / 2;
                ++hammer_stats_btree_elements;
                r = hammer_btree_cmp(elm, &node->elms[i].leaf.base);
                if (r <= 1) {
                        s = i;
                } else {
                        b = i;
                }
        }
        return(b);
}
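/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): how the heuristic above is used.  hammer_btree_search_node()
 * only narrows the starting index; callers such as btree_search() still
 * run a linear scan from the returned index.
 */
#if 0
static int
example_node_scan(hammer_base_elm_t key, hammer_node_ondisk_t node)
{
        int i;

        for (i = hammer_btree_search_node(key, node); i < node->count; ++i) {
                if (hammer_btree_cmp(key, &node->elms[i].leaf.base) <= 1)
                        break;
        }
        return (i);     /* first index whose compare result is <= 1 */
}
#endif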
/************************************************************************
 *                      SPLITTING AND MERGING                           *
 ************************************************************************
 *
 * These routines do all the dirty work required to split and merge nodes.
 */

/*
 * Split an internal node into two nodes and move the separator at the split
 * point to the parent.
 *
 * (cursor->node, cursor->index) indicates the element the caller intends
 * to push into.  We will adjust node and index if that element winds
 * up in the split node.
 *
 * If we are at the root of the filesystem a new root must be created with
 * two elements, one pointing to the original root and one pointing to the
 * newly allocated split node.
 */
static
int
btree_split_internal(hammer_cursor_t cursor)
{
        hammer_node_ondisk_t ondisk;
        hammer_node_t node;
        hammer_node_t parent;
        hammer_node_t new_node;
        hammer_btree_elm_t elm;
        hammer_btree_elm_t parent_elm;
        struct hammer_node_lock lockroot;
        hammer_mount_t hmp = cursor->trans->hmp;
        hammer_off_t hint;
        int parent_index;
        int made_root;
        int split;
        int error;
        int i;
        const int esize = sizeof(*elm);

        hammer_node_lock_init(&lockroot, cursor->node);
        error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
        if (error)
                goto done;
        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                goto done;
        ++hammer_stats_btree_splits;

        /*
         * Calculate the split point.  If the insertion point is at the
         * end of the leaf we adjust the split point significantly to the
         * right to try to optimize node fill and flag it.  If we hit
         * that same leaf again our heuristic failed and we don't try
         * to optimize node fill (it could lead to a degenerate case).
         */
        node = cursor->node;
        ondisk = node->ondisk;
        KKASSERT(ondisk->count > 4);
        if (cursor->index == ondisk->count &&
            (node->flags & HAMMER_NODE_NONLINEAR) == 0) {
                split = (ondisk->count + 1) * 3 / 4;
                node->flags |= HAMMER_NODE_NONLINEAR;
        } else {
                /*
                 * We are splitting but elms[split] will be promoted to
                 * the parent, leaving the right hand node with one less
                 * element.  If the insertion point will be on the
                 * left-hand side adjust the split point to give the
                 * right hand side one additional node.
                 */
                split = (ondisk->count + 1) / 2;
                if (cursor->index <= split)
                        --split;
        }

        /*
         * If we are at the root of the filesystem, create a new root node
         * with 1 element and split normally.  Avoid making major
         * modifications until we know the whole operation will work.
         */
        if (ondisk->parent == 0) {
                parent = hammer_alloc_btree(cursor->trans, node->node_offset,
                                            &error);
                if (parent == NULL)
                        goto done;
                hammer_lock_ex(&parent->lock);
                hammer_modify_node_noundo(cursor->trans, parent);
                ondisk = parent->ondisk;
                ondisk->count = 1;
                ondisk->parent = 0;
                ondisk->mirror_tid = node->ondisk->mirror_tid;
                ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
                ondisk->elms[0].base = hmp->root_btree_beg;
                ondisk->elms[0].base.btype = node->ondisk->type;
                ondisk->elms[0].internal.subtree_offset = node->node_offset;
                ondisk->elms[1].base = hmp->root_btree_end;
                hammer_modify_node_done(parent);
                /* ondisk->elms[1].base.btype - not used */
                made_root = 1;
                parent_index = 0;       /* index of current node in parent */
        } else {
                made_root = 0;
                parent = cursor->parent;
                parent_index = cursor->parent_index;
        }

        /*
         * Calculate a hint for the allocation of the new B-Tree node.
         * The most likely expansion is coming from the insertion point
         * at cursor->index, so try to localize the allocation of our
         * new node to accommodate that sub-tree.
         *
         * Use the right-most sub-tree when expanding on the right edge.
         * This is a very common case when copying a directory tree.
         */
        if (cursor->index == ondisk->count)
                hint = ondisk->elms[cursor->index - 1].internal.subtree_offset;
        else
                hint = ondisk->elms[cursor->index].internal.subtree_offset;

        /*
         * Split node into new_node at the split point.
         *
         *  B O O O P N N B     <-- P = node->elms[split] (index 4)
         *   0 1 2 3 4 5 6      <-- subtree indices
         *
         *       x x P x x
         *        s S S s
         *         /   \
         *  B O O O B    B N N B     <--- inner boundary points are 'P'
         *   0 1 2 3      4 5 6
         */
        new_node = hammer_alloc_btree(cursor->trans, hint, &error);
        if (new_node == NULL) {
                if (made_root) {
                        hammer_unlock(&parent->lock);
                        hammer_delete_node(cursor->trans, parent);
                        hammer_rel_node(parent);
                }
                goto done;
        }
        hammer_lock_ex(&new_node->lock);

        /*
         * Create the new node.  P becomes the left-hand boundary in the
         * new node.  Copy the right-hand boundary as well.
         *
         * elm is the new separator.
         */
        hammer_modify_node_noundo(cursor->trans, new_node);
        hammer_modify_node_all(cursor->trans, node);
        ondisk = node->ondisk;
        elm = &ondisk->elms[split];
        bcopy(elm, &new_node->ondisk->elms[0],
              (ondisk->count - split + 1) * esize);
        new_node->ondisk->count = ondisk->count - split;
        new_node->ondisk->parent = parent->node_offset;
        new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
        new_node->ondisk->mirror_tid = ondisk->mirror_tid;
        KKASSERT(ondisk->type == new_node->ondisk->type);
        hammer_cursor_split_node(node, new_node, split);

        /*
         * Cleanup the original node.  Elm (P) becomes the new boundary,
         * its subtree_offset was moved to the new node.  If we had created
         * a new root its parent pointer may have changed.
         */
        elm->internal.subtree_offset = 0;
        ondisk->count = split;

        /*
         * Insert the separator into the parent, fixup the parent's
         * reference to the original node, and reference the new node.
         * The separator is P.
         *
         * Remember that base.count does not include the right-hand boundary.
         */
        hammer_modify_node_all(cursor->trans, parent);
        ondisk = parent->ondisk;
        KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
        parent_elm = &ondisk->elms[parent_index+1];
        bcopy(parent_elm, parent_elm + 1,
              (ondisk->count - parent_index) * esize);
        parent_elm->internal.base = elm->base;  /* separator P */
        parent_elm->internal.base.btype = new_node->ondisk->type;
        parent_elm->internal.subtree_offset = new_node->node_offset;
        parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid;
        ++ondisk->count;
        hammer_modify_node_done(parent);
        hammer_cursor_inserted_element(parent, parent_index + 1);

        /*
         * The children of new_node need their parent pointer set to new_node.
         * The children have already been locked by
         * hammer_btree_lock_children().
         */
        for (i = 0; i < new_node->ondisk->count; ++i) {
                elm = &new_node->ondisk->elms[i];
                error = btree_set_parent(cursor->trans, new_node, elm);
                if (error) {
                        panic("btree_split_internal: btree-fixup problem");
                }
        }
        hammer_modify_node_done(new_node);

        /*
         * The filesystem's root B-Tree pointer may have to be updated.
         */
        if (made_root) {
                hammer_volume_t volume;

                volume = hammer_get_root_volume(hmp, &error);
                KKASSERT(error == 0);

                hammer_modify_volume_field(cursor->trans, volume,
                                           vol0_btree_root);
                volume->ondisk->vol0_btree_root = parent->node_offset;
                hammer_modify_volume_done(volume);
                node->ondisk->parent = parent->node_offset;
                if (cursor->parent) {
                        hammer_unlock(&cursor->parent->lock);
                        hammer_rel_node(cursor->parent);
                }
                cursor->parent = parent;        /* lock'd and ref'd */
                hammer_rel_volume(volume, 0);
        }
        hammer_modify_node_done(node);

        /*
         * Ok, now adjust the cursor depending on which element the original
         * index was pointing at.  If we are >= the split point the push node
         * is now in the new node.
         *
         * NOTE: If we are at the split point itself we cannot stay with the
         * original node because the push index will point at the right-hand
         * boundary, which is illegal.
         *
         * NOTE: The cursor's parent or parent_index must be adjusted for
         * the case where a new parent (new root) was created, and the case
         * where the cursor is now pointing at the split node.
         */
        if (cursor->index >= split) {
                cursor->parent_index = parent_index + 1;
                cursor->index -= split;
                hammer_unlock(&cursor->node->lock);
                hammer_rel_node(cursor->node);
                cursor->node = new_node;        /* locked and ref'd */
        } else {
                cursor->parent_index = parent_index;
                hammer_unlock(&new_node->lock);
                hammer_rel_node(new_node);
        }

        /*
         * Fixup left and right bounds
         */
        parent_elm = &parent->ondisk->elms[cursor->parent_index];
        cursor->left_bound = &parent_elm[0].internal.base;
        cursor->right_bound = &parent_elm[1].internal.base;
        KKASSERT(hammer_btree_cmp(cursor->left_bound,
                 &cursor->node->ondisk->elms[0].internal.base) <= 0);
        KKASSERT(hammer_btree_cmp(cursor->right_bound,
                 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);

done:
        hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
        hammer_cursor_downgrade(cursor);
        return (error);
}
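/*
 * Illustrative sketch (not part of the original source, kept compiled
 * out): the split-point heuristic shared by btree_split_internal() and
 * btree_split_leaf().  An append-style insert splits asymmetrically
 * (3/4) once; a later split of the same node falls back to the even
 * split because HAMMER_NODE_NONLINEAR is then set.
 */
#if 0
static int
example_split_point(hammer_node_t node, int index)
{
        hammer_node_ondisk_t ondisk = node->ondisk;
        int split;

        if (index == ondisk->count &&
            (node->flags & HAMMER_NODE_NONLINEAR) == 0) {
                split = (ondisk->count + 1) * 3 / 4;
                node->flags |= HAMMER_NODE_NONLINEAR;
        } else {
                split = (ondisk->count + 1) / 2;
        }
        return (split);
}
#endif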
/*
 * Same as the above, but splits a full leaf node.
 */
static
int
btree_split_leaf(hammer_cursor_t cursor)
{
        hammer_node_ondisk_t ondisk;
        hammer_node_t parent;
        hammer_node_t leaf;
        hammer_mount_t hmp;
        hammer_node_t new_leaf;
        hammer_btree_elm_t elm;
        hammer_btree_elm_t parent_elm;
        hammer_base_elm_t mid_boundary;
        hammer_off_t hint;
        int parent_index;
        int made_root;
        int split;
        int error;
        const size_t esize = sizeof(*elm);

        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                return(error);
        ++hammer_stats_btree_splits;

        KKASSERT(hammer_btree_cmp(cursor->left_bound,
                 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
        KKASSERT(hammer_btree_cmp(cursor->right_bound,
                 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);

        /*
         * Calculate the split point.  If the insertion point is at the
         * end of the leaf we adjust the split point significantly to the
         * right to try to optimize node fill and flag it.  If we hit
         * that same leaf again our heuristic failed and we don't try
         * to optimize node fill (it could lead to a degenerate case).
         *
         * Spikes are made up of two leaf elements which cannot be
         * safely split.
         */
        leaf = cursor->node;
        ondisk = leaf->ondisk;
        KKASSERT(ondisk->count > 4);
        if (cursor->index == ondisk->count &&
            (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) {
                split = (ondisk->count + 1) * 3 / 4;
                leaf->flags |= HAMMER_NODE_NONLINEAR;
        } else {
                split = (ondisk->count + 1) / 2;
        }

#if 0
        /*
         * If the insertion point is at the split point shift the
         * split point left so we don't have to worry about
         */
        if (cursor->index == split)
                --split;
#endif
        KKASSERT(split > 0 && split < ondisk->count);

        error = 0;
        hmp = leaf->hmp;

        elm = &ondisk->elms[split];

        KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0);
        KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
        KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
        KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0);

        /*
         * If we are at the root of the tree, create a new root node with
         * 1 element and split normally.  Avoid making major modifications
         * until we know the whole operation will work.
         */
        if (ondisk->parent == 0) {
                parent = hammer_alloc_btree(cursor->trans, leaf->node_offset,
                                            &error);
                if (parent == NULL)
                        goto done;
                hammer_lock_ex(&parent->lock);
                hammer_modify_node_noundo(cursor->trans, parent);
                ondisk = parent->ondisk;
                ondisk->count = 1;
                ondisk->parent = 0;
                ondisk->mirror_tid = leaf->ondisk->mirror_tid;
                ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
                ondisk->elms[0].base = hmp->root_btree_beg;
                ondisk->elms[0].base.btype = leaf->ondisk->type;
                ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
                ondisk->elms[1].base = hmp->root_btree_end;
                /* ondisk->elms[1].base.btype = not used */
                hammer_modify_node_done(parent);
                made_root = 1;
                parent_index = 0;       /* insertion point in parent */
        } else {
                made_root = 0;
                parent = cursor->parent;
                parent_index = cursor->parent_index;
        }

        /*
         * Calculate a hint for the allocation of the new B-Tree leaf node.
	 * For now just try to localize it within the same big-block as
	 * the current leaf.
	 *
	 * If the insertion point is at the end of the leaf we recognize a
	 * likely append sequence of some sort (data, meta-data, inodes,
	 * whatever).  Set the hint to zero to allocate out of linear space
	 * instead of trying to completely fill previously hinted space.
	 *
	 * This also sets the stage for recursive splits to localize using
	 * the new space.
	 */
	ondisk = leaf->ondisk;
	if (cursor->index == ondisk->count)
		hint = 0;
	else
		hint = leaf->node_offset;

	/*
	 * Split leaf into new_leaf at the split point.  Select a separator
	 * value in between the two leaves, biased towards the right leaf,
	 * since comparisons use an 'elm >= separator' inequality.
	 *
	 *	 L L L L L L L L
	 *
	 *	      x x P x x
	 *	       s S S s
	 *	        /   \
	 *	 L L L L     L L L L
	 */
	new_leaf = hammer_alloc_btree(cursor->trans, hint, &error);
	if (new_leaf == NULL) {
		if (made_root) {
			hammer_unlock(&parent->lock);
			hammer_delete_node(cursor->trans, parent);
			hammer_rel_node(parent);
		}
		goto done;
	}
	hammer_lock_ex(&new_leaf->lock);

	/*
	 * Create the new node and copy the leaf elements from the split
	 * point on to the new node.
	 */
	hammer_modify_node_all(cursor->trans, leaf);
	hammer_modify_node_noundo(cursor->trans, new_leaf);
	ondisk = leaf->ondisk;
	elm = &ondisk->elms[split];
	bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize);
	new_leaf->ondisk->count = ondisk->count - split;
	new_leaf->ondisk->parent = parent->node_offset;
	new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
	new_leaf->ondisk->mirror_tid = ondisk->mirror_tid;
	KKASSERT(ondisk->type == new_leaf->ondisk->type);
	hammer_modify_node_done(new_leaf);
	hammer_cursor_split_node(leaf, new_leaf, split);

	/*
	 * Clean up the original node.  Because this is a leaf node and
	 * leaf nodes do not have a right-hand boundary, there
	 * aren't any special edge cases to clean up.  We just fixup the
	 * count.
	 */
	ondisk->count = split;

	/*
	 * Insert the separator into the parent, fixup the parent's
	 * reference to the original node, and reference the new node.
	 * The separator is P.
	 *
	 * Remember that base.count does not include the right-hand boundary.
	 * We are copying parent_index+1 to parent_index+2, not +0 to +1.
	 */
	hammer_modify_node_all(cursor->trans, parent);
	ondisk = parent->ondisk;
	KKASSERT(split != 0);
	KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
	parent_elm = &ondisk->elms[parent_index+1];
	bcopy(parent_elm, parent_elm + 1,
	      (ondisk->count - parent_index) * esize);

	hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
	parent_elm->internal.base.btype = new_leaf->ondisk->type;
	parent_elm->internal.subtree_offset = new_leaf->node_offset;
	parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid;
	mid_boundary = &parent_elm->base;
	++ondisk->count;
	hammer_modify_node_done(parent);
	hammer_cursor_inserted_element(parent, parent_index + 1);

	/*
	 * The filesystem's root B-Tree pointer may have to be updated.
1957 */ 1958 if (made_root) { 1959 hammer_volume_t volume; 1960 1961 volume = hammer_get_root_volume(hmp, &error); 1962 KKASSERT(error == 0); 1963 1964 hammer_modify_volume_field(cursor->trans, volume, 1965 vol0_btree_root); 1966 volume->ondisk->vol0_btree_root = parent->node_offset; 1967 hammer_modify_volume_done(volume); 1968 leaf->ondisk->parent = parent->node_offset; 1969 if (cursor->parent) { 1970 hammer_unlock(&cursor->parent->lock); 1971 hammer_rel_node(cursor->parent); 1972 } 1973 cursor->parent = parent; /* lock'd and ref'd */ 1974 hammer_rel_volume(volume, 0); 1975 } 1976 hammer_modify_node_done(leaf); 1977 1978 /* 1979 * Ok, now adjust the cursor depending on which element the original 1980 * index was pointing at. If we are >= the split point the push node 1981 * is now in the new node. 1982 * 1983 * NOTE: If we are at the split point itself we need to select the 1984 * old or new node based on where key_beg's insertion point will be. 1985 * If we pick the wrong side the inserted element will wind up in 1986 * the wrong leaf node and outside that node's bounds. 1987 */ 1988 if (cursor->index > split || 1989 (cursor->index == split && 1990 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1991 cursor->parent_index = parent_index + 1; 1992 cursor->index -= split; 1993 hammer_unlock(&cursor->node->lock); 1994 hammer_rel_node(cursor->node); 1995 cursor->node = new_leaf; 1996 } else { 1997 cursor->parent_index = parent_index; 1998 hammer_unlock(&new_leaf->lock); 1999 hammer_rel_node(new_leaf); 2000 } 2001 2002 /* 2003 * Fixup left and right bounds 2004 */ 2005 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 2006 cursor->left_bound = &parent_elm[0].internal.base; 2007 cursor->right_bound = &parent_elm[1].internal.base; 2008 2009 /* 2010 * Assert that the bounds are correct. 2011 */ 2012 KKASSERT(hammer_btree_cmp(cursor->left_bound, 2013 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 2014 KKASSERT(hammer_btree_cmp(cursor->right_bound, 2015 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 2016 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 2017 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 2018 2019 done: 2020 hammer_cursor_downgrade(cursor); 2021 return (error); 2022 } 2023 2024 #if 0 2025 2026 /* 2027 * Recursively correct the right-hand boundary's create_tid to (tid) as 2028 * long as the rest of the key matches. We have to recurse upward in 2029 * the tree as well as down the left side of each parent's right node. 2030 * 2031 * Return EDEADLK if we were only partially successful, forcing the caller 2032 * to try again. The original cursor is not modified. This routine can 2033 * also fail with EDEADLK if it is forced to throw away a portion of its 2034 * record history. 2035 * 2036 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 2037 */ 2038 struct hammer_rhb { 2039 TAILQ_ENTRY(hammer_rhb) entry; 2040 hammer_node_t node; 2041 int index; 2042 }; 2043 2044 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 2045 2046 int 2047 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 2048 { 2049 struct hammer_mount *hmp; 2050 struct hammer_rhb_list rhb_list; 2051 hammer_base_elm_t elm; 2052 hammer_node_t orig_node; 2053 struct hammer_rhb *rhb; 2054 int orig_index; 2055 int error; 2056 2057 TAILQ_INIT(&rhb_list); 2058 hmp = cursor->trans->hmp; 2059 2060 /* 2061 * Save our position so we can restore it on return. This also 2062 * gives us a stable 'elm'. 
	 */
	orig_node = cursor->node;
	hammer_ref_node(orig_node);
	hammer_lock_sh(&orig_node->lock);
	orig_index = cursor->index;
	elm = &orig_node->ondisk->elms[orig_index].base;

	/*
	 * Now build a list of parents going up, allocating a rhb
	 * structure for each one.
	 */
	while (cursor->parent) {
		/*
		 * Stop if we no longer have any right-bounds to fix up
		 */
		if (elm->obj_id != cursor->right_bound->obj_id ||
		    elm->rec_type != cursor->right_bound->rec_type ||
		    elm->key != cursor->right_bound->key) {
			break;
		}

		/*
		 * Stop if the right-hand bound's create_tid does not
		 * need to be corrected.
		 */
		if (cursor->right_bound->create_tid >= tid)
			break;

		rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
		rhb->node = cursor->parent;
		rhb->index = cursor->parent_index;
		hammer_ref_node(rhb->node);
		hammer_lock_sh(&rhb->node->lock);
		TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);

		hammer_cursor_up(cursor);
	}

	/*
	 * Now safely adjust the right-hand bound for each rhb.  This may
	 * also require taking the right side of the tree and iterating down
	 * ITS left side.
	 */
	error = 0;
	while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
		error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
		if (error)
			break;
		TAILQ_REMOVE(&rhb_list, rhb, entry);
		hammer_unlock(&rhb->node->lock);
		hammer_rel_node(rhb->node);
		kfree(rhb, hmp->m_misc);

		switch (cursor->node->ondisk->type) {
		case HAMMER_BTREE_TYPE_INTERNAL:
			/*
			 * The right-boundary for the parent at an internal
			 * node is one element to the right of the element
			 * whose right boundary needs adjusting.  We must
			 * then traverse down the left side correcting any
			 * left bounds (which may now be too far to the
			 * left).
			 */
			++cursor->index;
			error = hammer_btree_correct_lhb(cursor, tid);
			break;
		default:
			panic("hammer_btree_correct_rhb(): Bad node type");
			error = EINVAL;
			break;
		}
	}

	/*
	 * Cleanup
	 */
	while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
		TAILQ_REMOVE(&rhb_list, rhb, entry);
		hammer_unlock(&rhb->node->lock);
		hammer_rel_node(rhb->node);
		kfree(rhb, hmp->m_misc);
	}
	error = hammer_cursor_seek(cursor, orig_node, orig_index);
	hammer_unlock(&orig_node->lock);
	hammer_rel_node(orig_node);
	return (error);
}

/*
 * Similar to rhb (in fact, rhb calls lhb), but corrects the left-hand
 * bound going downward starting at the current cursor position.
 *
 * This function does not restore the cursor after use.
 */
int
hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid)
{
	struct hammer_rhb_list rhb_list;
	hammer_base_elm_t elm;
	hammer_base_elm_t cmp;
	struct hammer_rhb *rhb;
	struct hammer_mount *hmp;
	int error;

	TAILQ_INIT(&rhb_list);
	hmp = cursor->trans->hmp;

	cmp = &cursor->node->ondisk->elms[cursor->index].base;

	/*
	 * Record the node and traverse down the left-hand side for all
	 * matching records needing a boundary correction.
	 */
	error = 0;
	for (;;) {
		rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
		rhb->node = cursor->node;
		rhb->index = cursor->index;
		hammer_ref_node(rhb->node);
		hammer_lock_sh(&rhb->node->lock);
		TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);

		if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			/*
			 * Nothing to traverse down if we are at the right
			 * boundary of an internal node.
			 */
			if (cursor->index == cursor->node->ondisk->count)
				break;
		} else {
			elm = &cursor->node->ondisk->elms[cursor->index].base;
			if (elm->btype == HAMMER_BTREE_TYPE_RECORD)
				break;
			panic("Illegal leaf record type %02x", elm->btype);
		}
		error = hammer_cursor_down(cursor);
		if (error)
			break;

		elm = &cursor->node->ondisk->elms[cursor->index].base;
		if (elm->obj_id != cmp->obj_id ||
		    elm->rec_type != cmp->rec_type ||
		    elm->key != cmp->key) {
			break;
		}
		if (elm->create_tid >= tid)
			break;
	}

	/*
	 * Now we can safely adjust the left-hand boundary from the bottom-up.
	 * The last element we remove from the list is the caller's right-hand
	 * boundary, which must also be adjusted.
	 */
	while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
		error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
		if (error)
			break;
		TAILQ_REMOVE(&rhb_list, rhb, entry);
		hammer_unlock(&rhb->node->lock);
		hammer_rel_node(rhb->node);
		kfree(rhb, hmp->m_misc);

		elm = &cursor->node->ondisk->elms[cursor->index].base;
		if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
			hammer_modify_node(cursor->trans, cursor->node,
					   &elm->create_tid,
					   sizeof(elm->create_tid));
			elm->create_tid = tid;
			hammer_modify_node_done(cursor->node);
		} else {
			panic("hammer_btree_correct_lhb(): Bad element type");
		}
	}

	/*
	 * Cleanup
	 */
	while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
		TAILQ_REMOVE(&rhb_list, rhb, entry);
		hammer_unlock(&rhb->node->lock);
		hammer_rel_node(rhb->node);
		kfree(rhb, hmp->m_misc);
	}
	return (error);
}

#endif

/*
 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at
 * (cursor->node).  Returns 0 on success, EDEADLK if we could not complete
 * the operation due to a deadlock, or some other error.
 *
 * This routine is initially called with an empty leaf and may be
 * recursively called with single-element internal nodes.
 *
 * Note that when removing empty leaves we must test and update mirror_tid,
 * because another thread may have deadlocked against us (or against someone
 * else) while trying to propagate it upward, and it cannot retry once the
 * node has been deleted.
 *
 * On return the cursor may end up pointing to an internal node, suitable
 * for further iteration but not for an immediate insertion or deletion.
 */
static int
btree_remove(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_btree_elm_t elm;
	hammer_node_t node;
	hammer_node_t parent;
	const int esize = sizeof(*elm);
	int error;

	node = cursor->node;

	/*
	 * When deleting the root of the filesystem convert it to
	 * an empty leaf node.  Internal nodes cannot be empty.
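	 *
	 * (The recursion below enforces the same invariant everywhere
	 * else: a parent that would become empty is removed first, so
	 * an internal node is never left with zero elements.  See the
	 * parent->ondisk->count == 1 case.)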
2283 */ 2284 ondisk = node->ondisk; 2285 if (ondisk->parent == 0) { 2286 KKASSERT(cursor->parent == NULL); 2287 hammer_modify_node_all(cursor->trans, node); 2288 KKASSERT(ondisk == node->ondisk); 2289 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2290 ondisk->count = 0; 2291 hammer_modify_node_done(node); 2292 cursor->index = 0; 2293 return(0); 2294 } 2295 2296 parent = cursor->parent; 2297 2298 /* 2299 * Attempt to remove the parent's reference to the child. If the 2300 * parent would become empty we have to recurse. If we fail we 2301 * leave the parent pointing to an empty leaf node. 2302 * 2303 * We have to recurse successfully before we can delete the internal 2304 * node as it is illegal to have empty internal nodes. Even though 2305 * the operation may be aborted we must still fixup any unlocked 2306 * cursors as if we had deleted the element prior to recursing 2307 * (by calling hammer_cursor_deleted_element()) so those cursors 2308 * are properly forced up the chain by the recursion. 2309 */ 2310 if (parent->ondisk->count == 1) { 2311 /* 2312 * This special cursor_up_locked() call leaves the original 2313 * node exclusively locked and referenced, leaves the 2314 * original parent locked (as the new node), and locks the 2315 * new parent. It can return EDEADLK. 2316 * 2317 * We cannot call hammer_cursor_removed_node() until we are 2318 * actually able to remove the node. If we did then tracked 2319 * cursors in the middle of iterations could be repointed 2320 * to a parent node. If this occurs they could end up 2321 * scanning newly inserted records into the node (that could 2322 * not be deleted) when they push down again. 2323 * 2324 * Due to the way the recursion works the final parent is left 2325 * in cursor->parent after the recursion returns. Each 2326 * layer on the way back up is thus able to call 2327 * hammer_cursor_removed_node() and 'jump' the node up to 2328 * the (same) final parent. 2329 * 2330 * NOTE! The local variable 'parent' is invalid after we 2331 * call hammer_cursor_up_locked(). 2332 */ 2333 error = hammer_cursor_up_locked(cursor); 2334 parent = NULL; 2335 2336 if (error == 0) { 2337 hammer_cursor_deleted_element(cursor->node, 0); 2338 error = btree_remove(cursor); 2339 if (error == 0) { 2340 KKASSERT(node != cursor->node); 2341 hammer_cursor_removed_node( 2342 node, cursor->node, 2343 cursor->index); 2344 hammer_modify_node_all(cursor->trans, node); 2345 ondisk = node->ondisk; 2346 ondisk->type = HAMMER_BTREE_TYPE_DELETED; 2347 ondisk->count = 0; 2348 hammer_modify_node_done(node); 2349 hammer_flush_node(node, 0); 2350 hammer_delete_node(cursor->trans, node); 2351 } else { 2352 /* 2353 * Defer parent removal because we could not 2354 * get the lock, just let the leaf remain 2355 * empty. 2356 */ 2357 /**/ 2358 } 2359 hammer_unlock(&node->lock); 2360 hammer_rel_node(node); 2361 } else { 2362 /* 2363 * Defer parent removal because we could not 2364 * get the lock, just let the leaf remain 2365 * empty. 2366 */ 2367 /**/ 2368 } 2369 } else { 2370 KKASSERT(parent->ondisk->count > 1); 2371 2372 hammer_modify_node_all(cursor->trans, parent); 2373 ondisk = parent->ondisk; 2374 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); 2375 2376 elm = &ondisk->elms[cursor->parent_index]; 2377 KKASSERT(elm->internal.subtree_offset == node->node_offset); 2378 KKASSERT(ondisk->count > 0); 2379 2380 /* 2381 * We must retain the highest mirror_tid. The deleted 2382 * range is now encompassed by the element to the left. 
		 * If we are already at the left edge the new left edge
		 * inherits mirror_tid.
		 *
		 * Note that bounds of the parent to our parent may create
		 * a gap to the left of our left-most node or to the right
		 * of our right-most node.  The gap is silently included
		 * in the mirror_tid's area of effect from the point of view
		 * of the scan.
		 */
		if (cursor->parent_index) {
			if (elm[-1].internal.mirror_tid <
			    elm[0].internal.mirror_tid) {
				elm[-1].internal.mirror_tid =
					elm[0].internal.mirror_tid;
			}
		} else {
			if (elm[1].internal.mirror_tid <
			    elm[0].internal.mirror_tid) {
				elm[1].internal.mirror_tid =
					elm[0].internal.mirror_tid;
			}
		}

		/*
		 * Delete the subtree reference in the parent.  Include
		 * boundary element at end.
		 */
		bcopy(&elm[1], &elm[0],
		      (ondisk->count - cursor->parent_index) * esize);
		--ondisk->count;
		hammer_modify_node_done(parent);
		hammer_cursor_removed_node(node, parent, cursor->parent_index);
		hammer_cursor_deleted_element(parent, cursor->parent_index);
		hammer_flush_node(node, 0);
		hammer_delete_node(cursor->trans, node);

		/*
		 * cursor->node is now invalid; cursor up to make it
		 * valid again.  We have to flag the condition in case
		 * another thread wiggles an insertion in during an
		 * iteration.
		 */
		cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
		error = hammer_cursor_up(cursor);
	}
	return (error);
}

/*
 * Propagate cursor->trans->tid up the B-Tree starting at the current
 * cursor position using pseudofs info gleaned from the passed inode.
 *
 * The passed inode has no relationship to the cursor position other
 * than being in the same pseudofs as the insertion or deletion we
 * are propagating the mirror_tid for.
 *
 * WARNING! Because we push and pop the passed cursor, it may be
 *	    modified by other B-Tree operations while it is unlocked;
 *	    things like the node and leaf pointers and the indexes
 *	    might change.
 */
void
hammer_btree_do_propagation(hammer_cursor_t cursor,
			    hammer_pseudofs_inmem_t pfsm,
			    hammer_btree_leaf_elm_t leaf)
{
	hammer_cursor_t ncursor;
	hammer_tid_t mirror_tid;
	int error;

	/*
	 * We do not propagate a mirror_tid if the filesystem was mounted
	 * in no-mirror mode.
	 */
	if (cursor->trans->hmp->master_id < 0)
		return;

	/*
	 * This is a bit of a hack because we cannot deadlock or return
	 * EDEADLK here.  The related operation has already completed and
	 * we must propagate the mirror_tid now regardless.
	 *
	 * Generate a new cursor which inherits the original's locks and
	 * unlock the original.  Use the new cursor to propagate the
	 * mirror_tid.  Then clean up the new cursor and reacquire locks
	 * on the original.
	 *
	 * hammer_dup_cursor() cannot dup locks.  The dup inherits the
	 * original's locks and the original is tracked and must be
	 * re-locked.
	 */
	mirror_tid = cursor->node->ondisk->mirror_tid;
	KKASSERT(mirror_tid != 0);
	ncursor = hammer_push_cursor(cursor);
	error = hammer_btree_mirror_propagate(ncursor, mirror_tid);
	KKASSERT(error == 0);
	hammer_pop_cursor(cursor, ncursor);
	/* WARNING: cursor's leaf pointer may change after pop */
}

/*
 * Propagate a mirror TID update upwards through the B-Tree to the root.
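 *
 * A sketch of the overall flow: hammer_btree_do_propagation() pushes
 * the cursor, and this function then walks it upward one level at a
 * time, stamping first the parent element's mirror_tid and then the
 * node's aggregate mirror_tid, stopping early once it reaches a node
 * that is already up to date.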
2486 * 2487 * A locked internal node must be passed in. The node will remain locked 2488 * on return. 2489 * 2490 * This function syncs mirror_tid at the specified internal node's element, 2491 * adjusts the node's aggregation mirror_tid, and then recurses upwards. 2492 */ 2493 static int 2494 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid) 2495 { 2496 hammer_btree_internal_elm_t elm; 2497 hammer_node_t node; 2498 int error; 2499 2500 for (;;) { 2501 error = hammer_cursor_up(cursor); 2502 if (error == 0) 2503 error = hammer_cursor_upgrade(cursor); 2504 2505 /* 2506 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the 2507 * cursor will still be properly positioned for 2508 * mirror propagation, just not for iterations. 2509 */ 2510 while (error == EDEADLK) { 2511 hammer_recover_cursor(cursor); 2512 error = hammer_cursor_upgrade(cursor); 2513 } 2514 if (error) 2515 break; 2516 2517 /* 2518 * If the cursor deadlocked it could end up at a leaf 2519 * after we lost the lock. 2520 */ 2521 node = cursor->node; 2522 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL) 2523 continue; 2524 2525 /* 2526 * Adjust the node's element 2527 */ 2528 elm = &node->ondisk->elms[cursor->index].internal; 2529 if (elm->mirror_tid >= mirror_tid) 2530 break; 2531 hammer_modify_node(cursor->trans, node, &elm->mirror_tid, 2532 sizeof(elm->mirror_tid)); 2533 elm->mirror_tid = mirror_tid; 2534 hammer_modify_node_done(node); 2535 if (hammer_debug_general & 0x0002) { 2536 kprintf("mirror_propagate: propagate " 2537 "%016llx @%016llx:%d\n", 2538 (long long)mirror_tid, 2539 (long long)node->node_offset, 2540 cursor->index); 2541 } 2542 2543 2544 /* 2545 * Adjust the node's mirror_tid aggregator 2546 */ 2547 if (node->ondisk->mirror_tid >= mirror_tid) 2548 return(0); 2549 hammer_modify_node_field(cursor->trans, node, mirror_tid); 2550 node->ondisk->mirror_tid = mirror_tid; 2551 hammer_modify_node_done(node); 2552 if (hammer_debug_general & 0x0002) { 2553 kprintf("mirror_propagate: propagate " 2554 "%016llx @%016llx\n", 2555 (long long)mirror_tid, 2556 (long long)node->node_offset); 2557 } 2558 } 2559 if (error == ENOENT) 2560 error = 0; 2561 return(error); 2562 } 2563 2564 hammer_node_t 2565 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node, 2566 int *parent_indexp, int *errorp, int try_exclusive) 2567 { 2568 hammer_node_t parent; 2569 hammer_btree_elm_t elm; 2570 int i; 2571 2572 /* 2573 * Get the node 2574 */ 2575 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp); 2576 if (*errorp) { 2577 KKASSERT(parent == NULL); 2578 return(NULL); 2579 } 2580 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0); 2581 2582 /* 2583 * Lock the node 2584 */ 2585 if (try_exclusive) { 2586 if (hammer_lock_ex_try(&parent->lock)) { 2587 hammer_rel_node(parent); 2588 *errorp = EDEADLK; 2589 return(NULL); 2590 } 2591 } else { 2592 hammer_lock_sh(&parent->lock); 2593 } 2594 2595 /* 2596 * Figure out which element in the parent is pointing to the 2597 * child. 
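	 *
	 * The child's left-most element gives an approximate starting
	 * index via hammer_btree_search_node(); a linear scan from
	 * there locates the element whose subtree_offset matches the
	 * child.  An empty child forces the scan to start at index 0.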
2598 */ 2599 if (node->ondisk->count) { 2600 i = hammer_btree_search_node(&node->ondisk->elms[0].base, 2601 parent->ondisk); 2602 } else { 2603 i = 0; 2604 } 2605 while (i < parent->ondisk->count) { 2606 elm = &parent->ondisk->elms[i]; 2607 if (elm->internal.subtree_offset == node->node_offset) 2608 break; 2609 ++i; 2610 } 2611 if (i == parent->ondisk->count) { 2612 hammer_unlock(&parent->lock); 2613 panic("Bad B-Tree link: parent %p node %p\n", parent, node); 2614 } 2615 *parent_indexp = i; 2616 KKASSERT(*errorp == 0); 2617 return(parent); 2618 } 2619 2620 /* 2621 * The element (elm) has been moved to a new internal node (node). 2622 * 2623 * If the element represents a pointer to an internal node that node's 2624 * parent must be adjusted to the element's new location. 2625 * 2626 * XXX deadlock potential here with our exclusive locks 2627 */ 2628 int 2629 btree_set_parent(hammer_transaction_t trans, hammer_node_t node, 2630 hammer_btree_elm_t elm) 2631 { 2632 hammer_node_t child; 2633 int error; 2634 2635 error = 0; 2636 2637 switch(elm->base.btype) { 2638 case HAMMER_BTREE_TYPE_INTERNAL: 2639 case HAMMER_BTREE_TYPE_LEAF: 2640 child = hammer_get_node(trans, elm->internal.subtree_offset, 2641 0, &error); 2642 if (error == 0) { 2643 hammer_modify_node_field(trans, child, parent); 2644 child->ondisk->parent = node->node_offset; 2645 hammer_modify_node_done(child); 2646 hammer_rel_node(child); 2647 } 2648 break; 2649 default: 2650 break; 2651 } 2652 return(error); 2653 } 2654 2655 /* 2656 * Initialize the root of a recursive B-Tree node lock list structure. 2657 */ 2658 void 2659 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2660 { 2661 TAILQ_INIT(&parent->list); 2662 parent->parent = NULL; 2663 parent->node = node; 2664 parent->index = -1; 2665 parent->count = node->ondisk->count; 2666 parent->copy = NULL; 2667 parent->flags = 0; 2668 } 2669 2670 /* 2671 * Initialize a cache of hammer_node_lock's including space allocated 2672 * for node copies. 2673 * 2674 * This is used by the rebalancing code to preallocate the copy space 2675 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER 2676 * locks, otherwise we can blow out the pageout daemon's emergency 2677 * reserve and deadlock it. 2678 * 2679 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache. 2680 * The flag is set when the item is pulled off the cache for use. 2681 */ 2682 void 2683 hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache, 2684 int depth) 2685 { 2686 hammer_node_lock_t item; 2687 int count; 2688 2689 for (count = 1; depth; --depth) 2690 count *= HAMMER_BTREE_LEAF_ELMS; 2691 bzero(lcache, sizeof(*lcache)); 2692 TAILQ_INIT(&lcache->list); 2693 while (count) { 2694 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO); 2695 item->copy = kmalloc(sizeof(*item->copy), 2696 hmp->m_misc, M_WAITOK); 2697 TAILQ_INIT(&item->list); 2698 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2699 --count; 2700 } 2701 } 2702 2703 void 2704 hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache) 2705 { 2706 hammer_node_lock_t item; 2707 2708 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) { 2709 TAILQ_REMOVE(&lcache->list, item, entry); 2710 KKASSERT(item->copy); 2711 KKASSERT(TAILQ_EMPTY(&item->list)); 2712 kfree(item->copy, hmp->m_misc); 2713 kfree(item, hmp->m_misc); 2714 } 2715 KKASSERT(lcache->copy == NULL); 2716 } 2717 2718 /* 2719 * Exclusively lock all the children of node. 
This is used by the split 2720 * code to prevent anyone from accessing the children of a cursor node 2721 * while we fix-up its parent offset. 2722 * 2723 * If we don't lock the children we can really mess up cursors which block 2724 * trying to cursor-up into our node. 2725 * 2726 * On failure EDEADLK (or some other error) is returned. If a deadlock 2727 * error is returned the cursor is adjusted to block on termination. 2728 * 2729 * The caller is responsible for managing parent->node, the root's node 2730 * is usually aliased from a cursor. 2731 */ 2732 int 2733 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2734 hammer_node_lock_t parent, 2735 hammer_node_lock_t lcache) 2736 { 2737 hammer_node_t node; 2738 hammer_node_lock_t item; 2739 hammer_node_ondisk_t ondisk; 2740 hammer_btree_elm_t elm; 2741 hammer_node_t child; 2742 struct hammer_mount *hmp; 2743 int error; 2744 int i; 2745 2746 node = parent->node; 2747 ondisk = node->ondisk; 2748 error = 0; 2749 hmp = cursor->trans->hmp; 2750 2751 /* 2752 * We really do not want to block on I/O with exclusive locks held, 2753 * pre-get the children before trying to lock the mess. This is 2754 * only done one-level deep for now. 2755 */ 2756 for (i = 0; i < ondisk->count; ++i) { 2757 ++hammer_stats_btree_elements; 2758 elm = &ondisk->elms[i]; 2759 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF && 2760 elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) { 2761 continue; 2762 } 2763 child = hammer_get_node(cursor->trans, 2764 elm->internal.subtree_offset, 2765 0, &error); 2766 if (child) 2767 hammer_rel_node(child); 2768 } 2769 2770 /* 2771 * Do it for real 2772 */ 2773 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2774 ++hammer_stats_btree_elements; 2775 elm = &ondisk->elms[i]; 2776 2777 switch(elm->base.btype) { 2778 case HAMMER_BTREE_TYPE_INTERNAL: 2779 case HAMMER_BTREE_TYPE_LEAF: 2780 KKASSERT(elm->internal.subtree_offset != 0); 2781 child = hammer_get_node(cursor->trans, 2782 elm->internal.subtree_offset, 2783 0, &error); 2784 break; 2785 default: 2786 child = NULL; 2787 break; 2788 } 2789 if (child) { 2790 if (hammer_lock_ex_try(&child->lock) != 0) { 2791 if (cursor->deadlk_node == NULL) { 2792 cursor->deadlk_node = child; 2793 hammer_ref_node(cursor->deadlk_node); 2794 } 2795 error = EDEADLK; 2796 hammer_rel_node(child); 2797 } else { 2798 if (lcache) { 2799 item = TAILQ_FIRST(&lcache->list); 2800 KKASSERT(item != NULL); 2801 item->flags |= HAMMER_NODE_LOCK_LCACHE; 2802 TAILQ_REMOVE(&lcache->list, 2803 item, entry); 2804 } else { 2805 item = kmalloc(sizeof(*item), 2806 hmp->m_misc, 2807 M_WAITOK|M_ZERO); 2808 TAILQ_INIT(&item->list); 2809 } 2810 2811 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2812 item->parent = parent; 2813 item->node = child; 2814 item->index = i; 2815 item->count = child->ondisk->count; 2816 2817 /* 2818 * Recurse (used by the rebalancing code) 2819 */ 2820 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2821 error = hammer_btree_lock_children( 2822 cursor, 2823 depth - 1, 2824 item, 2825 lcache); 2826 } 2827 } 2828 } 2829 } 2830 if (error) 2831 hammer_btree_unlock_children(hmp, parent, lcache); 2832 return(error); 2833 } 2834 2835 /* 2836 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2837 * including the parent. 
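 *
 * A sketch of the expected calling sequence (the rebalancing code is
 * the intended consumer; this is illustrative, not a requirement):
 *
 *	hammer_node_lock_init(&lockroot, cursor->node);
 *	hammer_btree_lock_children(cursor, depth, &lockroot, lcache);
 *	hammer_btree_lock_copy(cursor, &lockroot);
 *	... modify the copies, setting HAMMER_NODE_LOCK_UPDATED ...
 *	hammer_btree_sync_copy(cursor, &lockroot);
 *	hammer_btree_unlock_children(hmp, &lockroot, lcache);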
 */
void
hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
{
	hammer_mount_t hmp = cursor->trans->hmp;
	hammer_node_lock_t item;

	if (parent->copy == NULL) {
		KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0);
		parent->copy = kmalloc(sizeof(*parent->copy),
				       hmp->m_misc, M_WAITOK);
	}
	KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0);
	*parent->copy = *parent->node->ondisk;
	TAILQ_FOREACH(item, &parent->list, entry) {
		hammer_btree_lock_copy(cursor, item);
	}
}

/*
 * Recursively sync modified copies to the media.
 */
int
hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
{
	hammer_node_lock_t item;
	int count = 0;

	if (parent->flags & HAMMER_NODE_LOCK_UPDATED) {
		++count;
		hammer_modify_node_all(cursor->trans, parent->node);
		*parent->node->ondisk = *parent->copy;
		hammer_modify_node_done(parent->node);
		if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) {
			hammer_flush_node(parent->node, 0);
			hammer_delete_node(cursor->trans, parent->node);
		}
	}
	TAILQ_FOREACH(item, &parent->list, entry) {
		count += hammer_btree_sync_copy(cursor, item);
	}
	return(count);
}

/*
 * Release previously obtained node locks.  The caller is responsible for
 * cleaning up parent->node itself (it's usually just aliased from a
 * cursor), but this function will take care of the copies.
 *
 * NOTE: The root node is not placed in the lcache and node->copy is not
 *	 deallocated when lcache != NULL.
 */
void
hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent,
			     hammer_node_lock_t lcache)
{
	hammer_node_lock_t item;
	hammer_node_ondisk_t copy;

	while ((item = TAILQ_FIRST(&parent->list)) != NULL) {
		TAILQ_REMOVE(&parent->list, item, entry);
		hammer_btree_unlock_children(hmp, item, lcache);
		hammer_unlock(&item->node->lock);
		hammer_rel_node(item->node);
		if (lcache) {
			/*
			 * NOTE: When placing the item back in the lcache
			 *	 the flag is cleared by the bzero().
			 *	 Remaining fields are cleared as a safety
			 *	 measure.
			 */
			KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE);
			KKASSERT(TAILQ_EMPTY(&item->list));
			copy = item->copy;
			bzero(item, sizeof(*item));
			TAILQ_INIT(&item->list);
			item->copy = copy;
			if (copy)
				bzero(copy, sizeof(*copy));
			TAILQ_INSERT_TAIL(&lcache->list, item, entry);
		} else {
			kfree(item, hmp->m_misc);
		}
	}
	if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) {
		kfree(parent->copy, hmp->m_misc);
		parent->copy = NULL;	/* safety */
	}
}

/************************************************************************
 *			   MISCELLANEOUS SUPPORT			*
 ************************************************************************/

/*
 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp).
 *
 * Note that for this particular function a return value of -1, 0, or +1
 * can denote a match if create_tid is otherwise discounted.  A create_tid
 * of zero is considered to be 'infinity' in comparisons.
 *
 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
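 *
 * Worked examples (assumed keys): two elements differing only in
 * obj_id compare as -4 or +4; elements matching through the key field
 * where key1->create_tid is 0 (infinity) and key2->create_tid is not
 * compare as +1, which an as-of lookup may still treat as a match.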
 */
int
hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
{
	if (key1->localization < key2->localization)
		return(-5);
	if (key1->localization > key2->localization)
		return(5);

	if (key1->obj_id < key2->obj_id)
		return(-4);
	if (key1->obj_id > key2->obj_id)
		return(4);

	if (key1->rec_type < key2->rec_type)
		return(-3);
	if (key1->rec_type > key2->rec_type)
		return(3);

	if (key1->key < key2->key)
		return(-2);
	if (key1->key > key2->key)
		return(2);

	/*
	 * A create_tid of zero indicates a record which is undeletable
	 * and must be considered to have a value of positive infinity.
	 */
	if (key1->create_tid == 0) {
		if (key2->create_tid == 0)
			return(0);
		return(1);
	}
	if (key2->create_tid == 0)
		return(-1);
	if (key1->create_tid < key2->create_tid)
		return(-1);
	if (key1->create_tid > key2->create_tid)
		return(1);
	return(0);
}

/*
 * Test a timestamp against an element to determine whether the
 * element is visible.  A timestamp of 0 means 'infinity'.
 */
int
hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base)
{
	if (asof == 0) {
		if (base->delete_tid)
			return(1);
		return(0);
	}
	if (asof < base->create_tid)
		return(-1);
	if (base->delete_tid && asof >= base->delete_tid)
		return(1);
	return(0);
}

/*
 * Create a separator halfway between key1 and key2.  For fields just
 * one unit apart, the separator will match key2.  key1 is on the
 * left-hand side and key2 is on the right-hand side.
 *
 * key2 must be >= the separator.  It is ok for the separator to match key2.
 *
 * NOTE: Even if key1 does not match key2, the separator may wind up
 * matching key2.
 *
 * NOTE: It might be beneficial to just scrap this whole mess and simply
 * set the separator to key2.
 */
#define MAKE_SEPARATOR(key1, key2, dest, field)	\
	dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);

static void
hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
		      hammer_base_elm_t dest)
{
	bzero(dest, sizeof(*dest));

	dest->rec_type = key2->rec_type;
	dest->key = key2->key;
	dest->obj_id = key2->obj_id;
	dest->create_tid = key2->create_tid;

	MAKE_SEPARATOR(key1, key2, dest, localization);
	if (key1->localization == key2->localization) {
		MAKE_SEPARATOR(key1, key2, dest, obj_id);
		if (key1->obj_id == key2->obj_id) {
			MAKE_SEPARATOR(key1, key2, dest, rec_type);
			if (key1->rec_type == key2->rec_type) {
				MAKE_SEPARATOR(key1, key2, dest, key);
				/*
				 * Don't bother creating a separator for
				 * create_tid, which also conveniently avoids
				 * having to handle the create_tid == 0
				 * (infinity) case.  Just leave create_tid
				 * set to key2.
				 *
				 * Worst case, dest matches key2 exactly,
				 * which is acceptable.
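				 *
				 * Worked example (assumed values):
				 * with key1->key = 0x10 and
				 * key2->key = 0x20, MAKE_SEPARATOR
				 * yields dest->key = 0x10 +
				 * ((0x20 - 0x10 + 1) >> 1) = 0x18.
				 * With keys one unit apart the
				 * separator collapses to key2->key.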
3044 */ 3045 } 3046 } 3047 } 3048 } 3049 3050 #undef MAKE_SEPARATOR 3051 3052 /* 3053 * Return whether a generic internal or leaf node is full 3054 */ 3055 static int 3056 btree_node_is_full(hammer_node_ondisk_t node) 3057 { 3058 switch(node->type) { 3059 case HAMMER_BTREE_TYPE_INTERNAL: 3060 if (node->count == HAMMER_BTREE_INT_ELMS) 3061 return(1); 3062 break; 3063 case HAMMER_BTREE_TYPE_LEAF: 3064 if (node->count == HAMMER_BTREE_LEAF_ELMS) 3065 return(1); 3066 break; 3067 default: 3068 panic("illegal btree subtype"); 3069 } 3070 return(0); 3071 } 3072 3073 #if 0 3074 static int 3075 btree_max_elements(u_int8_t type) 3076 { 3077 if (type == HAMMER_BTREE_TYPE_LEAF) 3078 return(HAMMER_BTREE_LEAF_ELMS); 3079 if (type == HAMMER_BTREE_TYPE_INTERNAL) 3080 return(HAMMER_BTREE_INT_ELMS); 3081 panic("btree_max_elements: bad type %d\n", type); 3082 } 3083 #endif 3084 3085 void 3086 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 3087 { 3088 hammer_btree_elm_t elm; 3089 int i; 3090 3091 kprintf("node %p count=%d parent=%016llx type=%c\n", 3092 ondisk, ondisk->count, 3093 (long long)ondisk->parent, ondisk->type); 3094 3095 /* 3096 * Dump both boundary elements if an internal node 3097 */ 3098 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 3099 for (i = 0; i <= ondisk->count; ++i) { 3100 elm = &ondisk->elms[i]; 3101 hammer_print_btree_elm(elm, ondisk->type, i); 3102 } 3103 } else { 3104 for (i = 0; i < ondisk->count; ++i) { 3105 elm = &ondisk->elms[i]; 3106 hammer_print_btree_elm(elm, ondisk->type, i); 3107 } 3108 } 3109 } 3110 3111 void 3112 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 3113 { 3114 kprintf(" %2d", i); 3115 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 3116 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 3117 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 3118 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 3119 kprintf("\trec_type = %04x\n", elm->base.rec_type); 3120 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 3121 kprintf("\tbtype = %02x (%c)\n", 3122 elm->base.btype, 3123 (elm->base.btype ? elm->base.btype : '?')); 3124 kprintf("\tlocalization = %02x\n", elm->base.localization); 3125 3126 switch(type) { 3127 case HAMMER_BTREE_TYPE_INTERNAL: 3128 kprintf("\tsubtree_off = %016llx\n", 3129 (long long)elm->internal.subtree_offset); 3130 break; 3131 case HAMMER_BTREE_TYPE_RECORD: 3132 kprintf("\tdata_offset = %016llx\n", 3133 (long long)elm->leaf.data_offset); 3134 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3135 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3136 break; 3137 } 3138 } 3139
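
#if 0

/*
 * Example (not compiled): a minimal full-tree scan showing how the
 * iteration machinery in this file is typically driven.  This is a
 * sketch only; hammer_btree_example_scan() is not part of HAMMER,
 * and real consumers (see hammer_object.c) set up far more specific
 * key ranges.
 */
static int
hammer_btree_example_scan(hammer_transaction_t trans)
{
	struct hammer_cursor cursor;
	int error;

	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error)
		return(error);

	/*
	 * Scan the entire B-Tree, inclusive of the ending key.
	 */
	cursor.key_beg = trans->hmp->root_btree_beg;
	cursor.key_end = trans->hmp->root_btree_end;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_btree_first(&cursor);
	while (error == 0) {
		/* cursor.node/cursor.index reference the current element */
		error = hammer_btree_iterate(&cursor);
	}
	if (error == ENOENT)
		error = 0;
	hammer_done_cursor(&cursor);
	return(error);
}

#endif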