/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER B-Tree index
 *
 * HAMMER implements a modified B+Tree.  In documentation this will
 * simply be referred to as the HAMMER B-Tree.  Basically a HAMMER B-Tree
 * looks like a B+Tree (a B-Tree which stores its records only at the leaves
 * of the tree), but adds two additional boundary elements which describe
 * the left-most and right-most element a node is able to represent.  In
 * other words, we have boundary elements at the two ends of a B-Tree node
 * with no valid sub-tree pointer for the right-most element.
 *
 * A B-Tree internal node looks like this:
 *
 *	B N N N N N N B   <-- boundary and internal elements
 *	 S S S S S S S    <-- subtree pointers
 *
 * A B-Tree leaf node basically looks like this:
 *
 *	L L L L L L L L   <-- leaf elements
 *
 * The radix for an internal node is one less than a leaf's, but we get
 * a number of significant benefits for our troubles.
 * The left-hand boundary (B in the left) is integrated into the first
 * element so it doesn't require 2 elements to accommodate boundaries.
 *
 * The big benefit to using a B-Tree containing boundary information
 * is that it is possible to cache pointers into the middle of the tree
 * and not have to start searches, insertions, OR deletions at the root
 * node.  In particular, searches are able to progress in a definitive
 * direction from any point in the tree without revisiting nodes.  This
 * greatly improves the efficiency of many operations, most especially
 * record appends.
 *
 * B-Trees also make the stacking of trees fairly straightforward.
 *
 * INSERTIONS:  A search performed with the intention of doing
 * an insert will guarantee that the terminal leaf node is not full by
 * splitting full nodes.  Splits occur top-down during the dive down the
 * B-Tree.
 *
 * DELETIONS: A deletion makes no attempt to proactively balance the
 * tree and will recursively remove nodes that become empty.  If a
 * deadlock occurs a deletion may not be able to remove an empty leaf.
 * Deletions never allow internal nodes to become empty (that would blow
 * up the boundaries).
 */
#include "hammer.h"
#include <sys/buf.h>
#include <sys/buf2.h>

static int btree_search(hammer_cursor_t cursor, int flags);
static int btree_split_internal(hammer_cursor_t cursor);
static int btree_split_leaf(hammer_cursor_t cursor);
static int btree_remove(hammer_cursor_t cursor);
static __inline int btree_node_is_full(hammer_node_ondisk_t node);
static __inline int btree_max_elements(u_int8_t type);
static int hammer_btree_mirror_propagate(hammer_cursor_t cursor,
			hammer_tid_t mirror_tid);
static void hammer_make_separator(hammer_base_elm_t key1,
			hammer_base_elm_t key2, hammer_base_elm_t dest);
static void hammer_cursor_mirror_filter(hammer_cursor_t cursor);
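
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * boundary elements described above are what make cached mid-tree cursor
 * positions usable.  An internal node can service a search without
 * re-descending from the root iff the key falls between its two boundary
 * elements (left bound inclusive, right bound exclusive), which is the
 * same range test btree_search() performs when it cursors up the tree.
 */
#if 0
static __inline int
btree_node_covers_key(hammer_node_ondisk_t node, hammer_base_elm_t key)
{
	/* elms[0] is the left boundary, elms[count] the right boundary */
	return (hammer_btree_cmp(key, &node->elms[0].base) >= 0 &&
		hammer_btree_cmp(key, &node->elms[node->count].base) < 0);
}
#endif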

/*
 * Iterate records after a search.  The cursor is iterated forwards past
 * the current record until a record matching the key-range requirements
 * is found.  ENOENT is returned if the iteration goes past the ending
 * key.
 *
 * The iteration is inclusive of key_beg and can be inclusive or exclusive
 * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
 *
 * When doing an as-of search (cursor->asof != 0), key_beg.create_tid
 * may be modified by B-Tree functions.
 *
 * cursor->key_beg may or may not be modified by this function during
 * the iteration.  XXX future - in case of an inverted lock we may have
 * to reinitiate the lookup and set key_beg to properly pick up where we
 * left off.
 *
 * If HAMMER_CURSOR_ITERATE_CHECK is set it is possible that the cursor
 * was reverse indexed due to being moved to a parent while unlocked,
 * and something else might have inserted an element outside the iteration
 * range.  When this case occurs the iterator just keeps iterating until
 * it gets back into the iteration range (instead of asserting).
 *
 * NOTE!  EDEADLK *CANNOT* be returned by this procedure.
 */
int
hammer_btree_iterate(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error = 0;
	int r;
	int s;

	/*
	 * Skip past the current record
	 */
	hmp = cursor->trans->hmp;
	node = cursor->node->ondisk;
	if (node == NULL)
		return(ENOENT);
	if (cursor->index < node->count &&
	    (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
		++cursor->index;
	}

	/*
	 * HAMMER can wind up being cpu-bound.
	 */
	if (++hmp->check_yield > hammer_yield_check) {
		hmp->check_yield = 0;
		lwkt_user_yield();
	}

	/*
	 * Loop until an element is found or we are done.
	 */
	for (;;) {
		/*
		 * We iterate up the tree and then index over one element
		 * while we are at the last element in the current node.
		 *
		 * If we are at the root of the filesystem, cursor_up
		 * returns ENOENT.
		 *
		 * XXX this could be optimized by storing the information in
		 * the parent reference.
		 *
		 * XXX we can lose the node lock temporarily, this could mess
		 * up our scan.
		 */
		++hammer_stats_btree_iterations;
		hammer_flusher_clean_loose_ios(hmp);

		if (cursor->index == node->count) {
			if (hammer_debug_btree) {
				kprintf("BRACKETU %016llx[%d] -> %016llx[%d] (td=%p)\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(long long)(cursor->parent ? cursor->parent->node_offset : -1),
					cursor->parent_index,
					curthread);
			}
			KKASSERT(cursor->parent == NULL ||
				 cursor->parent->ondisk->elms[cursor->parent_index].internal.subtree_offset == cursor->node->node_offset);
			error = hammer_cursor_up(cursor);
			if (error)
				break;
			/* reload stale pointer */
			node = cursor->node->ondisk;
			KKASSERT(cursor->index != node->count);

			/*
			 * If we are reblocking we want to return internal
			 * nodes.  Note that the internal node will be
			 * returned multiple times, on each upward recursion
			 * from its children.  The caller selects which
			 * revisit it cares about (usually first or last only).
			 */
			if (cursor->flags & HAMMER_CURSOR_REBLOCKING) {
				cursor->flags |= HAMMER_CURSOR_ATEDISK;
				return(0);
			}
			++cursor->index;
			continue;
		}

		/*
		 * Check internal or leaf element.  Determine if the record
		 * at the cursor has gone beyond the end of our range.
		 *
		 * We recurse down through internal nodes.
		 */
		if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
			elm = &node->elms[cursor->index];

			r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
			s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
			if (hammer_debug_btree) {
				kprintf("BRACKETL %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d (td=%p)\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(long long)elm[0].internal.base.obj_id,
					elm[0].internal.base.rec_type,
					(long long)elm[0].internal.base.key,
					elm[0].internal.base.localization,
					r,
					curthread
				);
				kprintf("BRACKETR %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index + 1,
					(long long)elm[1].internal.base.obj_id,
					elm[1].internal.base.rec_type,
					(long long)elm[1].internal.base.key,
					elm[1].internal.base.localization,
					s
				);
			}

			if (r < 0) {
				error = ENOENT;
				break;
			}
			if (r == 0 && (cursor->flags &
				       HAMMER_CURSOR_END_INCLUSIVE) == 0) {
				error = ENOENT;
				break;
			}

			/*
			 * Better not be zero
			 */
			KKASSERT(elm->internal.subtree_offset != 0);

			if (s <= 0) {
				/*
				 * If running the mirror filter see if we
				 * can skip one or more entire sub-trees.
				 * If we can we return the internal node
				 * and the caller processes the skipped
				 * range (see mirror_read).
				 */
				if (cursor->flags &
				    HAMMER_CURSOR_MIRROR_FILTERED) {
					if (elm->internal.mirror_tid <
					    cursor->cmirror->mirror_tid) {
						hammer_cursor_mirror_filter(cursor);
						return(0);
					}
				}
			} else {
				/*
				 * Normally it would be impossible for the
				 * cursor to have gotten back-indexed,
				 * but it can happen if a node is deleted
				 * and the cursor is moved to its parent
				 * internal node.  ITERATE_CHECK will be set.
				 */
				KKASSERT(cursor->flags &
					 HAMMER_CURSOR_ITERATE_CHECK);
				kprintf("hammer_btree_iterate: "
					"DEBUG: Caught parent seek "
					"in internal iteration\n");
			}

			error = hammer_cursor_down(cursor);
			if (error)
				break;
			KKASSERT(cursor->index == 0);
			/* reload stale pointer */
			node = cursor->node->ondisk;
			continue;
		} else {
			elm = &node->elms[cursor->index];
			r = hammer_btree_cmp(&cursor->key_end, &elm->base);
			if (hammer_debug_btree) {
				kprintf("ELEMENT  %016llx:%d %c %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(elm[0].leaf.base.btype ?
					 elm[0].leaf.base.btype : '?'),
					(long long)elm[0].leaf.base.obj_id,
					elm[0].leaf.base.rec_type,
					(long long)elm[0].leaf.base.key,
					elm[0].leaf.base.localization,
					r
				);
			}
			if (r < 0) {
				error = ENOENT;
				break;
			}

			/*
			 * We support both end-inclusive and
			 * end-exclusive searches.
			 */
			if (r == 0 &&
			    (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
				error = ENOENT;
				break;
			}

			/*
			 * If ITERATE_CHECK is set an unlocked cursor may
			 * have been moved to a parent and the iterate can
			 * happen upon elements that are not in the requested
			 * range.
			 */
			if (cursor->flags & HAMMER_CURSOR_ITERATE_CHECK) {
				s = hammer_btree_cmp(&cursor->key_beg,
						     &elm->base);
				if (s > 0) {
					kprintf("hammer_btree_iterate: "
						"DEBUG: Caught parent seek "
						"in leaf iteration\n");
					++cursor->index;
					continue;
				}
			}
			cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;

			/*
			 * Return the element
			 */
			switch(elm->leaf.base.btype) {
			case HAMMER_BTREE_TYPE_RECORD:
				if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
				    hammer_btree_chkts(cursor->asof, &elm->base)) {
					++cursor->index;
					continue;
				}
				error = 0;
				break;
			default:
				error = EINVAL;
				break;
			}
			if (error)
				break;
		}

		/*
		 * Return entry
		 */
		if (hammer_debug_btree) {
			int i = cursor->index;
			hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
			kprintf("ITERATE  %p:%d %016llx %02x "
				"key=%016llx lo=%02x\n",
				cursor->node, i,
				(long long)elm->internal.base.obj_id,
				elm->internal.base.rec_type,
				(long long)elm->internal.base.key,
				elm->internal.base.localization
			);
		}
		return(0);
	}
	return(error);
}
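
/*
 * Hedged usage sketch (hypothetical caller, not part of this file): the
 * canonical low-level scan pattern implied by the comments above is a
 * lookup via hammer_btree_first() followed by hammer_btree_iterate()
 * calls until ENOENT signals the end of the key range.  It assumes the
 * cursor's key_beg/key_end and END_INCLUSIVE flag were set up by the
 * caller.
 */
#if 0
static int
example_forward_scan(hammer_cursor_t cursor)
{
	int error;

	error = hammer_btree_first(cursor);
	while (error == 0) {
		/* position is valid; extract the record and its data */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
		if (error)
			break;
		/* ... process cursor->leaf and cursor->data ... */
		cursor->flags |= HAMMER_CURSOR_ATEDISK;	/* record consumed */
		error = hammer_btree_iterate(cursor);
	}
	if (error == ENOENT)
		error = 0;		/* clean end of range */
	return (error);
}
#endif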

/*
 * We hit an internal element that we could skip as part of a mirroring
 * scan.  Calculate the entire range being skipped.
 *
 * It is important to include any gaps between the parent's left_bound
 * and the node's left_bound, and same goes for the right side.
 */
static void
hammer_cursor_mirror_filter(hammer_cursor_t cursor)
{
	struct hammer_cmirror *cmirror;
	hammer_node_ondisk_t ondisk;
	hammer_btree_elm_t elm;

	ondisk = cursor->node->ondisk;
	cmirror = cursor->cmirror;

	/*
	 * Calculate the skipped range
	 */
	elm = &ondisk->elms[cursor->index];
	if (cursor->index == 0)
		cmirror->skip_beg = *cursor->left_bound;
	else
		cmirror->skip_beg = elm->internal.base;
	while (cursor->index < ondisk->count) {
		if (elm->internal.mirror_tid >= cmirror->mirror_tid)
			break;
		++cursor->index;
		++elm;
	}
	if (cursor->index == ondisk->count)
		cmirror->skip_end = *cursor->right_bound;
	else
		cmirror->skip_end = elm->internal.base;

	/*
	 * Clip the returned result.
	 */
	if (hammer_btree_cmp(&cmirror->skip_beg, &cursor->key_beg) < 0)
		cmirror->skip_beg = cursor->key_beg;
	if (hammer_btree_cmp(&cmirror->skip_end, &cursor->key_end) > 0)
		cmirror->skip_end = cursor->key_end;
}

/*
 * Iterate in the reverse direction.  This is used by the pruning code to
 * avoid overlapping records.
 */
int
hammer_btree_iterate_reverse(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error = 0;
	int r;
	int s;

	/* mirror filtering not supported for reverse iteration */
	KKASSERT((cursor->flags & HAMMER_CURSOR_MIRROR_FILTERED) == 0);

	/*
	 * Skip past the current record.  For various reasons the cursor
	 * may end up set to -1 or set to point at the end of the current
	 * node.  These cases must be addressed.
	 */
	node = cursor->node->ondisk;
	if (node == NULL)
		return(ENOENT);
	if (cursor->index != -1 &&
	    (cursor->flags & HAMMER_CURSOR_ATEDISK)) {
		--cursor->index;
	}
	if (cursor->index == cursor->node->ondisk->count)
		--cursor->index;

	/*
	 * HAMMER can wind up being cpu-bound.
	 */
	hmp = cursor->trans->hmp;
	if (++hmp->check_yield > hammer_yield_check) {
		hmp->check_yield = 0;
		lwkt_user_yield();
	}

	/*
	 * Loop until an element is found or we are done.
	 */
	for (;;) {
		++hammer_stats_btree_iterations;
		hammer_flusher_clean_loose_ios(hmp);

		/*
		 * We iterate up the tree and then index over one element
		 * while we are at the last element in the current node.
		 */
		if (cursor->index == -1) {
			error = hammer_cursor_up(cursor);
			if (error) {
				cursor->index = 0; /* sanity */
				break;
			}
			/* reload stale pointer */
			node = cursor->node->ondisk;
			KKASSERT(cursor->index != node->count);
			--cursor->index;
			continue;
		}

		/*
		 * Check internal or leaf element.  Determine if the record
		 * at the cursor has gone beyond the end of our range.
		 *
		 * We recurse down through internal nodes.
		 */
		KKASSERT(cursor->index != node->count);
		if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
			elm = &node->elms[cursor->index];

			r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
			s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
			if (hammer_debug_btree) {
				kprintf("BRACKETL %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d (td=%p)\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(long long)elm[0].internal.base.obj_id,
					elm[0].internal.base.rec_type,
					(long long)elm[0].internal.base.key,
					elm[0].internal.base.localization,
					r,
					curthread
				);
				kprintf("BRACKETR %016llx[%d] %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index + 1,
					(long long)elm[1].internal.base.obj_id,
					elm[1].internal.base.rec_type,
					(long long)elm[1].internal.base.key,
					elm[1].internal.base.localization,
					s
				);
			}

			if (s >= 0) {
				error = ENOENT;
				break;
			}

			/*
			 * It should not be possible to seek past key_end,
			 * even if the cursor got moved to a parent.
			 */
			KKASSERT(r >= 0);

			/*
			 * Better not be zero
			 */
			KKASSERT(elm->internal.subtree_offset != 0);

			error = hammer_cursor_down(cursor);
			if (error)
				break;
			KKASSERT(cursor->index == 0);
			/* reload stale pointer */
			node = cursor->node->ondisk;

			/* this can assign -1 if the leaf was empty */
			cursor->index = node->count - 1;
			continue;
		} else {
			elm = &node->elms[cursor->index];
			s = hammer_btree_cmp(&cursor->key_beg, &elm->base);
			if (hammer_debug_btree) {
				kprintf("ELEMENTR %016llx:%d %c %016llx %02x "
					"key=%016llx lo=%02x %d\n",
					(long long)cursor->node->node_offset,
					cursor->index,
					(elm[0].leaf.base.btype ?
					 elm[0].leaf.base.btype : '?'),
					(long long)elm[0].leaf.base.obj_id,
					elm[0].leaf.base.rec_type,
					(long long)elm[0].leaf.base.key,
					elm[0].leaf.base.localization,
					s
				);
			}
			if (s > 0) {
				error = ENOENT;
				break;
			}

			/*
			 * It should not be possible to seek past key_end,
			 * even if the cursor got moved to a parent.
			 */
			cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;

			/*
			 * Return the element
			 */
			switch(elm->leaf.base.btype) {
			case HAMMER_BTREE_TYPE_RECORD:
				if ((cursor->flags & HAMMER_CURSOR_ASOF) &&
				    hammer_btree_chkts(cursor->asof, &elm->base)) {
					--cursor->index;
					continue;
				}
				error = 0;
				break;
			default:
				error = EINVAL;
				break;
			}
			if (error)
				break;
		}

		/*
		 * Return entry
		 */
		if (hammer_debug_btree) {
			int i = cursor->index;
			hammer_btree_elm_t elm = &cursor->node->ondisk->elms[i];
			kprintf("ITERATER %p:%d %016llx %02x "
				"key=%016llx lo=%02x\n",
				cursor->node, i,
				(long long)elm->internal.base.obj_id,
				elm->internal.base.rec_type,
				(long long)elm->internal.base.key,
				elm->internal.base.localization
			);
		}
		return(0);
	}
	return(error);
}

/*
 * Lookup cursor->key_beg.  0 is returned on success, ENOENT if the entry
 * could not be found, EDEADLK if inserting and a retry is needed, and a
 * fatal error otherwise.  When retrying, the caller must terminate the
 * cursor and reinitialize it.  EDEADLK cannot be returned if not inserting.
 *
 * The cursor is suitably positioned for a deletion on success, and suitably
 * positioned for an insertion on ENOENT if HAMMER_CURSOR_INSERT was
 * specified.
 *
 * The cursor may begin anywhere, the search will traverse the tree in
 * either direction to locate the requested element.
 *
 * Most of the logic implementing historical searches is handled here.  We
 * do an initial lookup with create_tid set to the asof TID.  Due to the
 * way records are laid out, a backwards iteration may be required if
 * ENOENT is returned to locate the historical record.  Here's the
 * problem:
 *
 * create_tid:    10      15       20
 *		      LEAF1   LEAF2
 * records:          (11)        (18)
 *
 * Lets say we want to do a lookup AS-OF timestamp 17.  We will traverse
 * LEAF2 but the only record in LEAF2 has a create_tid of 18, which is
 * not visible and thus causes ENOENT to be returned.  We really need
 * to check record 11 in LEAF1.  If it also fails then the search fails
 * (e.g. it might represent the range 11-16 and thus still not match our
 * AS-OF timestamp of 17).  Note that LEAF1 could be empty, requiring
 * further iterations.
 *
 * If this case occurs btree_search() will set HAMMER_CURSOR_CREATE_CHECK
 * and the cursor->create_check TID if an iteration might be needed.
 * In the above example create_check would be set to 14, one less than
 * LEAF2's left boundary of 15, so the retry re-runs the lookup as-of 14
 * and lands in LEAF1.
 */
int
hammer_btree_lookup(hammer_cursor_t cursor)
{
	int error;

	cursor->flags &= ~HAMMER_CURSOR_ITERATE_CHECK;
	KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0 ||
		 cursor->trans->sync_lock_refs > 0);
	++hammer_stats_btree_lookups;
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
		cursor->key_beg.create_tid = cursor->asof;
		for (;;) {
			cursor->flags &= ~HAMMER_CURSOR_CREATE_CHECK;
			error = btree_search(cursor, 0);
			if (error != ENOENT ||
			    (cursor->flags & HAMMER_CURSOR_CREATE_CHECK) == 0) {
				/*
				 * Stop if no error.
				 * Stop if error other than ENOENT.
				 * Stop if ENOENT and not special case.
				 */
				break;
			}
			if (hammer_debug_btree) {
				kprintf("CREATE_CHECK %016llx\n",
					(long long)cursor->create_check);
			}
			cursor->key_beg.create_tid = cursor->create_check;
			/* loop */
		}
	} else {
		error = btree_search(cursor, 0);
	}
	if (error == 0)
		error = hammer_btree_extract(cursor, cursor->flags);
	return(error);
}

/*
 * Execute the logic required to start an iteration.  The first record
 * located within the specified range is returned and iteration control
 * flags are adjusted for successive hammer_btree_iterate() calls.
 *
 * Set ATEDISK so a low-level caller can call btree_first/btree_iterate
 * in a loop without worrying about it.  Higher-level merged searches will
 * adjust the flag appropriately.
 */
int
hammer_btree_first(hammer_cursor_t cursor)
{
	int error;

	error = hammer_btree_lookup(cursor);
	if (error == ENOENT) {
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		error = hammer_btree_iterate(cursor);
	}
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	return(error);
}

/*
 * Similarly but for an iteration in the reverse direction.
 *
 * Set ATEDISK when iterating backwards to skip the current entry,
 * which after an ENOENT lookup will be pointing beyond our end point.
 *
 * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse
 * in a loop without worrying about it.  Higher-level merged searches will
 * adjust the flag appropriately.
 */
int
hammer_btree_last(hammer_cursor_t cursor)
{
	struct hammer_base_elm save;
	int error;

	save = cursor->key_beg;
	cursor->key_beg = cursor->key_end;
	error = hammer_btree_lookup(cursor);
	cursor->key_beg = save;
	if (error == ENOENT ||
	    (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		error = hammer_btree_iterate_reverse(cursor);
	}
	cursor->flags |= HAMMER_CURSOR_ATEDISK;
	return(error);
}
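
/*
 * Hedged usage sketch (hypothetical caller, not part of this file): the
 * reverse analogue of a forward scan, conceptually what the pruning code
 * does: position at the last record in the range with hammer_btree_last()
 * and walk backwards until ENOENT.
 */
#if 0
static int
example_reverse_scan(hammer_cursor_t cursor)
{
	int error;

	error = hammer_btree_last(cursor);
	while (error == 0) {
		/* ... process cursor->leaf ... */
		cursor->flags |= HAMMER_CURSOR_ATEDISK;	/* record consumed */
		error = hammer_btree_iterate_reverse(cursor);
	}
	if (error == ENOENT)
		error = 0;		/* clean end of range */
	return (error);
}
#endif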

/*
 * Extract the record and/or data associated with the cursor's current
 * position.  Any prior record or data stored in the cursor is replaced.
 *
 * NOTE: All extractions occur at the leaf of the B-Tree.
 */
int
hammer_btree_extract(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	hammer_off_t data_off;
	hammer_mount_t hmp;
	int32_t data_len;
	int error;

	/*
	 * The case where the data reference resolves to the same buffer
	 * as the record reference must be handled.
	 */
	node = cursor->node->ondisk;
	elm = &node->elms[cursor->index];
	cursor->data = NULL;
	hmp = cursor->node->hmp;

	/*
	 * There is nothing to extract for an internal element.
	 */
	if (node->type == HAMMER_BTREE_TYPE_INTERNAL)
		return(EINVAL);

	/*
	 * Only record types have data.
	 */
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
	cursor->leaf = &elm->leaf;

	if ((flags & HAMMER_CURSOR_GET_DATA) == 0)
		return(0);
	if (elm->leaf.base.btype != HAMMER_BTREE_TYPE_RECORD)
		return(0);
	data_off = elm->leaf.data_offset;
	data_len = elm->leaf.data_len;
	if (data_off == 0)
		return(0);

	/*
	 * Load the data
	 */
	KKASSERT(data_len >= 0 && data_len <= HAMMER_XBUFSIZE);
	cursor->data = hammer_bread_ext(hmp, data_off, data_len,
					&error, &cursor->data_buffer);

	/*
	 * Mark the data buffer as not being meta-data if it isn't
	 * meta-data (sometimes bulk data is accessed via a volume
	 * block device).
	 */
	if (error == 0) {
		switch(elm->leaf.base.rec_type) {
		case HAMMER_RECTYPE_DATA:
		case HAMMER_RECTYPE_DB:
			if ((data_off & HAMMER_ZONE_LARGE_DATA) == 0)
				break;
			if (hammer_double_buffer == 0 ||
			    (cursor->flags & HAMMER_CURSOR_NOSWAPCACHE)) {
				hammer_io_notmeta(cursor->data_buffer);
			}
			break;
		default:
			break;
		}
	}

	/*
	 * Deal with CRC errors on the extracted data.
	 */
	if (error == 0 &&
	    hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) {
		kprintf("CRC DATA @ %016llx/%d FAILED\n",
			(long long)elm->leaf.data_offset, elm->leaf.data_len);
		if (hammer_debug_critical)
			Debugger("CRC FAILED: DATA");
		if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM)
			error = EDOM;	/* less critical (mirroring) */
		else
			error = EIO;	/* critical */
	}
	return(error);
}

/*
 * Insert a leaf element into the B-Tree at the current cursor position.
 * The cursor is positioned such that the element at and beyond the cursor
 * are shifted to make room for the new record.
 *
 * The caller must call hammer_btree_lookup() with the HAMMER_CURSOR_INSERT
 * flag set and that call must return ENOENT before this function can be
 * called.  ENOSPC is returned if there is no room to insert a new record.
 *
 * The caller may depend on the cursor's exclusive lock after return to
 * interlock frontend visibility (see HAMMER_RECF_CONVERT_DELETE).
 */
int
hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm,
		    int *doprop)
{
	hammer_node_ondisk_t node;
	int i;
	int error;

	*doprop = 0;
	if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
		return(error);
	++hammer_stats_btree_inserts;

	/*
	 * Insert the element into the leaf node and update its count.
	 * It is possible for parent to be NULL, indicating that the
	 * filesystem's ROOT B-Tree node is a leaf itself.  The root
	 * inode can never be deleted so the leaf should never be empty.
	 *
	 * Remember that leaf nodes do not have boundaries.
	 */
	hammer_modify_node_all(cursor->trans, cursor->node);
	node = cursor->node->ondisk;
	i = cursor->index;
	KKASSERT(elm->base.btype != 0);
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
	KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS);
	if (i != node->count) {
		bcopy(&node->elms[i], &node->elms[i+1],
		      (node->count - i) * sizeof(*elm));
	}
	node->elms[i].leaf = *elm;
	++node->count;
	hammer_cursor_inserted_element(cursor->node, i);

	/*
	 * Update the leaf node's aggregate mirror_tid for mirroring
	 * support.
	 */
	if (node->mirror_tid < elm->base.delete_tid) {
		node->mirror_tid = elm->base.delete_tid;
		*doprop = 1;
	}
	if (node->mirror_tid < elm->base.create_tid) {
		node->mirror_tid = elm->base.create_tid;
		*doprop = 1;
	}
	hammer_modify_node_done(cursor->node);

	/*
	 * Debugging sanity checks.
	 */
	KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->base) > 0);
	if (i) {
		KKASSERT(hammer_btree_cmp(&node->elms[i-1].leaf.base, &elm->base) < 0);
	}
	if (i != node->count - 1)
		KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->base) > 0);

	return(0);
}
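
/*
 * Hedged usage sketch (hypothetical helper, not part of this file): the
 * lookup/insert handshake described above.  A lookup with
 * HAMMER_CURSOR_INSERT must return ENOENT before hammer_btree_insert()
 * may be called; the caller is assumed to be the backend holding the
 * sync lock (see the sync_lock_refs assertion in hammer_btree_lookup()).
 */
#if 0
static int
example_insert_record(hammer_cursor_t cursor, hammer_btree_leaf_elm_t leaf)
{
	int error;
	int doprop;

	cursor->key_beg = leaf->base;
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (error == ENOENT)
		error = hammer_btree_insert(cursor, leaf, &doprop);
	else if (error == 0)
		error = EEXIST;		/* record already present */
	/* if doprop is set the caller propagates mirror_tid upward */
	return (error);
}
#endif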

/*
 * Delete a record from the B-Tree at the current cursor position.
 * The cursor is positioned such that the current element is the one
 * to be deleted.
 *
 * On return the cursor will be positioned after the deleted element and
 * MAY point to an internal node.  It will be suitable for the continuation
 * of an iteration but not for an insertion or deletion.
 *
 * Deletions will attempt to partially rebalance the B-Tree in an upward
 * direction, but will terminate rather than deadlock.  Empty internal
 * nodes are never allowed by a deletion, so a deadlock may end up leaving
 * us with an empty leaf.  The pruner will clean up and rebalance the tree.
 *
 * This function can return EDEADLK, requiring the caller to retry the
 * operation after clearing the deadlock.
 */
int
hammer_btree_delete(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t parent __debugvar;
	int error;
	int i;

	KKASSERT(cursor->trans->sync_lock_refs > 0);
	if ((error = hammer_cursor_upgrade(cursor)) != 0)
		return(error);
	++hammer_stats_btree_deletes;

	/*
	 * Delete the element from the leaf node.
	 *
	 * Remember that leaf nodes do not have boundaries.
	 */
	node = cursor->node;
	ondisk = node->ondisk;
	i = cursor->index;

	KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF);
	KKASSERT(i >= 0 && i < ondisk->count);
	hammer_modify_node_all(cursor->trans, node);
	if (i + 1 != ondisk->count) {
		bcopy(&ondisk->elms[i+1], &ondisk->elms[i],
		      (ondisk->count - i - 1) * sizeof(ondisk->elms[0]));
	}
	--ondisk->count;
	hammer_modify_node_done(node);
	hammer_cursor_deleted_element(node, i);

	/*
	 * Validate local parent
	 */
	if (ondisk->parent) {
		parent = cursor->parent;

		KKASSERT(parent != NULL);
		KKASSERT(parent->node_offset == ondisk->parent);
	}

	/*
	 * If the leaf becomes empty it must be detached from the parent,
	 * potentially recursing through to the filesystem root.
	 *
	 * This may reposition the cursor at one of the parents of the
	 * current node.
	 *
	 * Ignore deadlock errors, which simply mean that btree_remove
	 * was unable to recurse and had to leave us with an empty leaf.
	 */
	KKASSERT(cursor->index <= ondisk->count);
	if (ondisk->count == 0) {
		error = btree_remove(cursor);
		if (error == EDEADLK)
			error = 0;
	} else {
		error = 0;
	}
	KKASSERT(cursor->parent == NULL ||
		 cursor->parent_index < cursor->parent->ondisk->count);
	return(error);
}

/*
 * PRIMARY B-TREE SEARCH SUPPORT PROCEDURE
 *
 * Search the filesystem B-Tree for cursor->key_beg, return the matching node.
 *
 * The search can begin ANYWHERE in the B-Tree.  As a first step the search
 * iterates up the tree as necessary to properly position itself prior to
 * actually doing the search.
 *
 * INSERTIONS: The search will split full nodes and leaves on its way down
 * and guarantee that the leaf it ends up on is not full.  If we run out
 * of space the search continues to the leaf, but ENOSPC is returned.
 *
 * The search is only guaranteed to end up on a leaf if an error code of 0
 * is returned, or if inserting and an error code of ENOENT is returned.
 * Otherwise it can stop at an internal node.  On success a search returns
 * a leaf node.
 *
 * COMPLEXITY WARNING!  This is the core B-Tree search code for the entire
 * filesystem, and it is not simple code.  Please note the following facts:
 *
 * - Internal node recursions have a boundary on the left AND right.  The
 *   right boundary is non-inclusive.  The create_tid is a generic part
 *   of the key for internal nodes.
 *
 * - Filesystem lookups typically set HAMMER_CURSOR_ASOF, indicating a
 *   historical search.  ASOF and INSERT are mutually exclusive.  When
 *   doing an as-of lookup btree_search() checks for a right-edge boundary
 *   case.  If while recursing down the left-edge differs from the key
 *   by ONLY its create_tid, HAMMER_CURSOR_CREATE_CHECK is set along
 *   with cursor->create_check.  This is used by btree_lookup() to iterate.
 *   The iteration is performed backwards because as-of searches can wind
 *   up going down the wrong branch of the B-Tree.
 */
static
int
btree_search(hammer_cursor_t cursor, int flags)
{
	hammer_node_ondisk_t node;
	hammer_btree_elm_t elm;
	int error;
	int enospc = 0;
	int i;
	int r;
	int s;

	flags |= cursor->flags;
	++hammer_stats_btree_searches;

	if (hammer_debug_btree) {
		kprintf("SEARCH   %016llx[%d] %016llx %02x key=%016llx cre=%016llx lo=%02x (td=%p)\n",
			(long long)cursor->node->node_offset,
			cursor->index,
			(long long)cursor->key_beg.obj_id,
			cursor->key_beg.rec_type,
			(long long)cursor->key_beg.key,
			(long long)cursor->key_beg.create_tid,
			cursor->key_beg.localization,
			curthread
		);
		if (cursor->parent)
			kprintf("SEARCHP  %016llx[%d] (%016llx/%016llx %016llx/%016llx) (%p/%p %p/%p)\n",
				(long long)cursor->parent->node_offset,
				cursor->parent_index,
				(long long)cursor->left_bound->obj_id,
				(long long)cursor->parent->ondisk->elms[cursor->parent_index].internal.base.obj_id,
				(long long)cursor->right_bound->obj_id,
				(long long)cursor->parent->ondisk->elms[cursor->parent_index+1].internal.base.obj_id,
				cursor->left_bound,
				&cursor->parent->ondisk->elms[cursor->parent_index],
				cursor->right_bound,
				&cursor->parent->ondisk->elms[cursor->parent_index+1]
			);
	}

	/*
	 * Move our cursor up the tree until we find a node whose range covers
	 * the key we are trying to locate.
	 *
	 * The left bound is inclusive, the right bound is non-inclusive.
	 * It is ok to cursor up too far.
	 */
	for (;;) {
		r = hammer_btree_cmp(&cursor->key_beg, cursor->left_bound);
		s = hammer_btree_cmp(&cursor->key_beg, cursor->right_bound);
		if (r >= 0 && s < 0)
			break;
		KKASSERT(cursor->parent);
		++hammer_stats_btree_iterations;
		error = hammer_cursor_up(cursor);
		if (error)
			goto done;
	}

	/*
	 * The delete-checks below are based on node, not parent.  Set the
	 * initial delete-check based on the parent.
	 */
	if (r == 1) {
		KKASSERT(cursor->left_bound->create_tid != 1);
		cursor->create_check = cursor->left_bound->create_tid - 1;
		cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
	}

	/*
	 * We better have ended up with a node somewhere.
	 */
	KKASSERT(cursor->node != NULL);

	/*
	 * If we are inserting we can't start at a full node if the parent
	 * is also full (because there is no way to split the node),
	 * continue running up the tree until the requirement is satisfied
	 * or we hit the root of the filesystem.
	 *
	 * (If inserting we aren't doing an as-of search so we don't have
	 * to worry about create_check).
	 */
	while (flags & HAMMER_CURSOR_INSERT) {
		if (btree_node_is_full(cursor->node->ondisk) == 0)
			break;
		if (cursor->node->ondisk->parent == 0 ||
		    cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) {
			break;
		}
		++hammer_stats_btree_iterations;
		error = hammer_cursor_up(cursor);
		/* node may have become stale */
		if (error)
			goto done;
	}

	/*
	 * Push down through internal nodes to locate the requested key.
	 */
	node = cursor->node->ondisk;
	while (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
		/*
		 * Scan the node to find the subtree index to push down into.
		 * We go one-past, then back-up.
		 *
		 * We must proactively remove deleted elements which may
		 * have been left over from a deadlocked btree_remove().
		 *
		 * The left and right boundaries are included in the loop
		 * in order to detect edge cases.
		 *
		 * If the separator only differs by create_tid (r == 1)
		 * and we are doing an as-of search, we may end up going
		 * down a branch to the left of the one containing the
		 * desired key.  This requires numerous special cases.
		 */
		++hammer_stats_btree_iterations;
		if (hammer_debug_btree) {
			kprintf("SEARCH-I %016llx count=%d\n",
				(long long)cursor->node->node_offset,
				node->count);
		}

		/*
		 * Try to shortcut the search before dropping into the
		 * linear loop.  Locate the first node where r <= 1.
		 */
		i = hammer_btree_search_node(&cursor->key_beg, node);
		while (i <= node->count) {
			++hammer_stats_btree_elements;
			elm = &node->elms[i];
			r = hammer_btree_cmp(&cursor->key_beg, &elm->base);
			if (hammer_debug_btree > 2) {
				kprintf(" IELM %p %d r=%d\n",
					&node->elms[i], i, r);
			}
			if (r < 0)
				break;
			if (r == 1) {
				KKASSERT(elm->base.create_tid != 1);
				cursor->create_check = elm->base.create_tid - 1;
				cursor->flags |= HAMMER_CURSOR_CREATE_CHECK;
			}
			++i;
		}
		if (hammer_debug_btree) {
			kprintf("SEARCH-I preI=%d/%d r=%d\n",
				i, node->count, r);
		}

		/*
		 * These cases occur when the parent's idea of the boundary
		 * is wider than the child's idea of the boundary, and
		 * require special handling.  If not inserting we can
		 * terminate the search early for these cases but the
		 * child's boundaries cannot be unconditionally modified.
		 */
		if (i == 0) {
			/*
			 * If i == 0 the search terminated to the LEFT of the
			 * left_boundary but to the RIGHT of the parent's left
			 * boundary.
			 */
			u_int8_t save;

			elm = &node->elms[0];

			/*
			 * If we aren't inserting we can stop here.
			 */
			if ((flags & (HAMMER_CURSOR_INSERT |
				      HAMMER_CURSOR_PRUNING)) == 0) {
				cursor->index = 0;
				return(ENOENT);
			}

			/*
			 * Correct a left-hand boundary mismatch.
			 *
			 * We can only do this if we can upgrade the lock,
			 * and synchronized as a background cursor (i.e.
			 * inserting or pruning).
			 *
			 * WARNING: We can only do this if inserting, i.e.
			 * we are running on the backend.
			 */
			if ((error = hammer_cursor_upgrade(cursor)) != 0)
				return(error);
			KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
			hammer_modify_node_field(cursor->trans, cursor->node,
						 elms[0]);
			save = node->elms[0].base.btype;
			node->elms[0].base = *cursor->left_bound;
			node->elms[0].base.btype = save;
			hammer_modify_node_done(cursor->node);
		} else if (i == node->count + 1) {
			/*
			 * If i == node->count + 1 the search terminated to
			 * the RIGHT of the right boundary but to the LEFT
			 * of the parent's right boundary.  If we aren't
			 * inserting we can stop here.
			 *
			 * Note that the last element in this case is
			 * elms[i-2] prior to adjustments to 'i'.
			 */
			--i;
			if ((flags & (HAMMER_CURSOR_INSERT |
				      HAMMER_CURSOR_PRUNING)) == 0) {
				cursor->index = i;
				return (ENOENT);
			}

			/*
			 * Correct a right-hand boundary mismatch.
			 * (actual push-down record is i-2 prior to
			 * adjustments to i).
			 *
			 * We can only do this if we can upgrade the lock,
			 * and synchronized as a background cursor (i.e.
			 * inserting or pruning).
			 *
			 * WARNING: We can only do this if inserting, i.e.
			 * we are running on the backend.
			 */
			if ((error = hammer_cursor_upgrade(cursor)) != 0)
				return(error);
			elm = &node->elms[i];
			KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
			hammer_modify_node(cursor->trans, cursor->node,
					   &elm->base, sizeof(elm->base));
			elm->base = *cursor->right_bound;
			hammer_modify_node_done(cursor->node);
			--i;
		} else {
			/*
			 * The push-down index is now i - 1.  If we had
			 * terminated on the right boundary this will point
			 * us at the last element.
			 */
			--i;
		}
		cursor->index = i;
		elm = &node->elms[i];

		if (hammer_debug_btree) {
			kprintf("RESULT-I %016llx[%d] %016llx %02x "
				"key=%016llx cre=%016llx lo=%02x\n",
				(long long)cursor->node->node_offset,
				i,
				(long long)elm->internal.base.obj_id,
				elm->internal.base.rec_type,
				(long long)elm->internal.base.key,
				(long long)elm->internal.base.create_tid,
				elm->internal.base.localization
			);
		}

		/*
		 * We better have a valid subtree offset.
		 */
		KKASSERT(elm->internal.subtree_offset != 0);

		/*
		 * Handle insertion and deletion requirements.
		 *
		 * If inserting split full nodes.  The split code will
		 * adjust cursor->node and cursor->index if the current
		 * index winds up in the new node.
		 *
		 * If inserting and a left or right edge case was detected,
		 * we cannot correct the left or right boundary and must
		 * prepend and append an empty leaf node in order to make
		 * the boundary correction.
		 *
		 * If we run out of space we set enospc but continue on
		 * to a leaf.
		 */
		if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0) {
			if (btree_node_is_full(node)) {
				error = btree_split_internal(cursor);
				if (error) {
					if (error != ENOSPC)
						goto done;
					enospc = 1;
				}
				/*
				 * reload stale pointers
				 */
				i = cursor->index;
				node = cursor->node->ondisk;
			}
		}

		/*
		 * Push down (push into new node, existing node becomes
		 * the parent) and continue the search.
		 */
		error = hammer_cursor_down(cursor);
		/* node may have become stale */
		if (error)
			goto done;
		node = cursor->node->ondisk;
	}

	/*
	 * We are at a leaf, do a linear search of the key array.
	 *
	 * On success the index is set to the matching element and 0
	 * is returned.
	 *
	 * On failure the index is set to the insertion point and ENOENT
	 * is returned.
	 *
	 * Boundaries are not stored in leaf nodes, so the index can wind
	 * up to the left of element 0 (index == 0) or past the end of
	 * the array (index == node->count).  It is also possible that the
	 * leaf might be empty.
	 */
	++hammer_stats_btree_iterations;
	KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
	KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS);
	if (hammer_debug_btree) {
		kprintf("SEARCH-L %016llx count=%d\n",
			(long long)cursor->node->node_offset,
			node->count);
	}

	/*
	 * Try to shortcut the search before dropping into the
	 * linear loop.  Locate the first node where r <= 1.
	 */
	i = hammer_btree_search_node(&cursor->key_beg, node);
	while (i < node->count) {
		++hammer_stats_btree_elements;
		elm = &node->elms[i];

		r = hammer_btree_cmp(&cursor->key_beg, &elm->leaf.base);

		if (hammer_debug_btree > 1)
			kprintf("  ELM %p %d r=%d\n", &node->elms[i], i, r);

		/*
		 * We are at a record element.  Stop if we've flipped past
		 * key_beg, not counting the create_tid test.  Allow the
		 * r == 1 case (key_beg > element but differs only by its
		 * create_tid) to fall through to the AS-OF check.
		 */
		KKASSERT(elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);

		if (r < 0)
			goto failed;
		if (r > 1) {
			++i;
			continue;
		}

		/*
		 * Check our as-of timestamp against the element.
		 */
		if (flags & HAMMER_CURSOR_ASOF) {
			if (hammer_btree_chkts(cursor->asof,
					       &node->elms[i].base) != 0) {
				++i;
				continue;
			}
			/* success */
		} else {
			if (r > 0) {	/* can only be +1 */
				++i;
				continue;
			}
			/* success */
		}
		cursor->index = i;
		error = 0;
		if (hammer_debug_btree) {
			kprintf("RESULT-L %016llx[%d] (SUCCESS)\n",
				(long long)cursor->node->node_offset, i);
		}
		goto done;
	}

	/*
	 * The search of the leaf node failed.  i is the insertion point.
	 */
failed:
	if (hammer_debug_btree) {
		kprintf("RESULT-L %016llx[%d] (FAILED)\n",
			(long long)cursor->node->node_offset, i);
	}

	/*
	 * No exact match was found, i is now at the insertion point.
	 *
	 * If inserting split a full leaf before returning.  This
	 * may have the side effect of adjusting cursor->node and
	 * cursor->index.
	 */
	cursor->index = i;
	if ((flags & HAMMER_CURSOR_INSERT) && enospc == 0 &&
	     btree_node_is_full(node)) {
		error = btree_split_leaf(cursor);
		if (error) {
			if (error != ENOSPC)
				goto done;
			enospc = 1;
		}
		/*
		 * reload stale pointers
		 */
		/* NOT USED
		i = cursor->index;
		node = &cursor->node->internal;
		*/
	}

	/*
	 * We reached a leaf but did not find the key we were looking for.
	 * If this is an insert we will be properly positioned for an insert
	 * (ENOENT) or unable to insert (ENOSPC).
	 */
	error = enospc ? ENOSPC : ENOENT;
done:
	return(error);
}

/*
 * Heuristic search for the first element whose comparison is <= 1.  May
 * return an index whose compare result is > 1, but may only return an
 * index whose compare result is <= 1 if it is the first element with
 * that result.
 */
int
hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node)
{
	int b;
	int s;
	int i;
	int r;

	/*
	 * Don't bother if the node does not have very many elements
	 */
	b = 0;
	s = node->count;
	while (s - b > 4) {
		i = b + (s - b) / 2;
		++hammer_stats_btree_elements;
		r = hammer_btree_cmp(elm, &node->elms[i].leaf.base);
		if (r <= 1) {
			s = i;
		} else {
			b = i;
		}
	}
	return(b);
}
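
/*
 * Hedged sanity sketch (hypothetical, not compiled in): the contract of
 * hammer_btree_search_node() is that, for a sorted node, every element
 * in front of the returned index compares > 1 against the key, so the
 * linear scans in btree_search() cannot miss a match by starting there.
 */
#if 0
static void
example_search_node_check(hammer_base_elm_t key, hammer_node_ondisk_t node)
{
	int i = hammer_btree_search_node(key, node);

	while (--i >= 0)
		KKASSERT(hammer_btree_cmp(key, &node->elms[i].leaf.base) > 1);
}
#endif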

/************************************************************************
 *			SPLITTING AND MERGING				*
 ************************************************************************
 *
 * These routines do all the dirty work required to split and merge nodes.
 */

/*
 * Split an internal node into two nodes and move the separator at the split
 * point to the parent.
 *
 * (cursor->node, cursor->index) indicates the element the caller intends
 * to push into.  We will adjust node and index if that element winds
 * up in the split node.
 *
 * If we are at the root of the filesystem a new root must be created with
 * two elements, one pointing to the original root and one pointing to the
 * newly allocated split node.
 */
static
int
btree_split_internal(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t node;
	hammer_node_t parent;
	hammer_node_t new_node;
	hammer_btree_elm_t elm;
	hammer_btree_elm_t parent_elm;
	struct hammer_node_lock lockroot;
	hammer_mount_t hmp = cursor->trans->hmp;
	int parent_index;
	int made_root;
	int split;
	int error;
	int i;
	const int esize = sizeof(*elm);

	hammer_node_lock_init(&lockroot, cursor->node);
	error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
	if (error)
		goto done;
	if ((error = hammer_cursor_upgrade(cursor)) != 0)
		goto done;
	++hammer_stats_btree_splits;

	/*
	 * Calculate the split point.  If the insertion point is at the
	 * end of the leaf we adjust the split point significantly to the
	 * right to try to optimize node fill and flag it.  If we hit
	 * that same leaf again our heuristic failed and we don't try
	 * to optimize node fill (it could lead to a degenerate case).
	 */
	node = cursor->node;
	ondisk = node->ondisk;
	KKASSERT(ondisk->count > 4);
	if (cursor->index == ondisk->count &&
	    (node->flags & HAMMER_NODE_NONLINEAR) == 0) {
		split = (ondisk->count + 1) * 3 / 4;
		node->flags |= HAMMER_NODE_NONLINEAR;
	} else {
		/*
		 * We are splitting but elms[split] will be promoted to
		 * the parent, leaving the right hand node with one less
		 * element.  If the insertion point will be on the
		 * left-hand side adjust the split point to give the
		 * right hand side one additional node.
		 */
		split = (ondisk->count + 1) / 2;
		if (cursor->index <= split)
			--split;
	}
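
	/*
	 * Worked example (illustrative numbers only): with a count of 15,
	 * an append (cursor->index == 15, NONLINEAR not yet set) selects
	 * split = 16 * 3 / 4 = 12, packing the left node more fully.
	 * Any other insertion point selects split = 16 / 2 = 8, reduced
	 * to 7 when the insertion point is at or left of the midpoint so
	 * the right node keeps one extra element after elms[split] is
	 * promoted to the parent.
	 */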

	/*
	 * If we are at the root of the filesystem, create a new root node
	 * with 1 element and split normally.  Avoid making major
	 * modifications until we know the whole operation will work.
	 */
	if (ondisk->parent == 0) {
		parent = hammer_alloc_btree(cursor->trans, 0, &error);
		if (parent == NULL)
			goto done;
		hammer_lock_ex(&parent->lock);
		hammer_modify_node_noundo(cursor->trans, parent);
		ondisk = parent->ondisk;
		ondisk->count = 1;
		ondisk->parent = 0;
		ondisk->mirror_tid = node->ondisk->mirror_tid;
		ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
		ondisk->elms[0].base = hmp->root_btree_beg;
		ondisk->elms[0].base.btype = node->ondisk->type;
		ondisk->elms[0].internal.subtree_offset = node->node_offset;
		ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid;
		ondisk->elms[1].base = hmp->root_btree_end;
		hammer_modify_node_done(parent);
		/* ondisk->elms[1].base.btype - not used */
		made_root = 1;
		parent_index = 0;	/* index of current node in parent */
	} else {
		made_root = 0;
		parent = cursor->parent;
		parent_index = cursor->parent_index;
	}

	/*
	 * Split node into new_node at the split point.
	 *
	 *  B O O O P N N B	<-- P = node->elms[split] (index 4)
	 *  0 1 2 3 4 5 6	<-- subtree indices
	 *
	 *		   x x P x x
	 *		    s S S s
	 *		   /	   \
	 *  B O O O B    B N N B	<--- inner boundary points are 'P'
	 *  0 1 2 3      4 5 6
	 */
	new_node = hammer_alloc_btree(cursor->trans, 0, &error);
	if (new_node == NULL) {
		if (made_root) {
			hammer_unlock(&parent->lock);
			hammer_delete_node(cursor->trans, parent);
			hammer_rel_node(parent);
		}
		goto done;
	}
	hammer_lock_ex(&new_node->lock);

	/*
	 * Create the new node.  P becomes the left-hand boundary in the
	 * new node.  Copy the right-hand boundary as well.
	 *
	 * elm is the new separator.
	 */
	hammer_modify_node_noundo(cursor->trans, new_node);
	hammer_modify_node_all(cursor->trans, node);
	ondisk = node->ondisk;
	elm = &ondisk->elms[split];
	bcopy(elm, &new_node->ondisk->elms[0],
	      (ondisk->count - split + 1) * esize);
	new_node->ondisk->count = ondisk->count - split;
	new_node->ondisk->parent = parent->node_offset;
	new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
	new_node->ondisk->mirror_tid = ondisk->mirror_tid;
	KKASSERT(ondisk->type == new_node->ondisk->type);
	hammer_cursor_split_node(node, new_node, split);

	/*
	 * Cleanup the original node.  Elm (P) becomes the new boundary,
	 * its subtree_offset was moved to the new node.  If we had created
	 * a new root its parent pointer may have changed.
	 */
	elm->internal.subtree_offset = 0;
	ondisk->count = split;

	/*
	 * Insert the separator into the parent, fixup the parent's
	 * reference to the original node, and reference the new node.
	 * The separator is P.
	 *
	 * Remember that base.count does not include the right-hand boundary.
	 */
	hammer_modify_node_all(cursor->trans, parent);
	ondisk = parent->ondisk;
	KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
	parent_elm = &ondisk->elms[parent_index+1];
	bcopy(parent_elm, parent_elm + 1,
	      (ondisk->count - parent_index) * esize);
	parent_elm->internal.base = elm->base;	/* separator P */
	parent_elm->internal.base.btype = new_node->ondisk->type;
	parent_elm->internal.subtree_offset = new_node->node_offset;
	parent_elm->internal.mirror_tid = new_node->ondisk->mirror_tid;
	++ondisk->count;
	hammer_modify_node_done(parent);
	hammer_cursor_inserted_element(parent, parent_index + 1);

	/*
	 * The children of new_node need their parent pointer set to new_node.
	 * The children have already been locked by
	 * hammer_btree_lock_children().
	 */
	for (i = 0; i < new_node->ondisk->count; ++i) {
		elm = &new_node->ondisk->elms[i];
		error = btree_set_parent(cursor->trans, new_node, elm);
		if (error) {
			panic("btree_split_internal: btree-fixup problem");
		}
	}
	hammer_modify_node_done(new_node);

	/*
	 * The filesystem's root B-Tree pointer may have to be updated.
	 */
	if (made_root) {
		hammer_volume_t volume;

		volume = hammer_get_root_volume(hmp, &error);
		KKASSERT(error == 0);

		hammer_modify_volume_field(cursor->trans, volume,
					   vol0_btree_root);
		volume->ondisk->vol0_btree_root = parent->node_offset;
		hammer_modify_volume_done(volume);
		node->ondisk->parent = parent->node_offset;
		if (cursor->parent) {
			hammer_unlock(&cursor->parent->lock);
			hammer_rel_node(cursor->parent);
		}
		cursor->parent = parent;	/* lock'd and ref'd */
		hammer_rel_volume(volume, 0);
	}
	hammer_modify_node_done(node);

	/*
	 * Ok, now adjust the cursor depending on which element the original
	 * index was pointing at.  If we are >= the split point the push node
	 * is now in the new node.
	 *
	 * NOTE: If we are at the split point itself we cannot stay with the
	 * original node because the push index will point at the right-hand
	 * boundary, which is illegal.
	 *
	 * NOTE: The cursor's parent or parent_index must be adjusted for
	 * the case where a new parent (new root) was created, and the case
	 * where the cursor is now pointing at the split node.
	 */
	if (cursor->index >= split) {
		cursor->parent_index = parent_index + 1;
		cursor->index -= split;
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = new_node;	/* locked and ref'd */
	} else {
		cursor->parent_index = parent_index;
		hammer_unlock(&new_node->lock);
		hammer_rel_node(new_node);
	}

	/*
	 * Fixup left and right bounds
	 */
	parent_elm = &parent->ondisk->elms[cursor->parent_index];
	cursor->left_bound = &parent_elm[0].internal.base;
	cursor->right_bound = &parent_elm[1].internal.base;
	KKASSERT(hammer_btree_cmp(cursor->left_bound,
		 &cursor->node->ondisk->elms[0].internal.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound,
		 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);

done:
	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
	hammer_cursor_downgrade(cursor);
	return (error);
}

/*
 * Same as the above, but splits a full leaf node.
 */
static
int
btree_split_leaf(hammer_cursor_t cursor)
{
	hammer_node_ondisk_t ondisk;
	hammer_node_t parent;
	hammer_node_t leaf;
	hammer_mount_t hmp;
	hammer_node_t new_leaf;
	hammer_btree_elm_t elm;
	hammer_btree_elm_t parent_elm;
	hammer_base_elm_t mid_boundary;
	int parent_index;
	int made_root;
	int split;
	int error;
	const size_t esize = sizeof(*elm);

	if ((error = hammer_cursor_upgrade(cursor)) != 0)
		return(error);
	++hammer_stats_btree_splits;

	KKASSERT(hammer_btree_cmp(cursor->left_bound,
		 &cursor->node->ondisk->elms[0].leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound,
		 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0);

	/*
	 * Calculate the split point.  If the insertion point is at the
	 * end of the leaf we adjust the split point significantly to the
	 * right to try to optimize node fill and flag it.  If we hit
	 * that same leaf again our heuristic failed and we don't try
	 * to optimize node fill (it could lead to a degenerate case).
	 */
	leaf = cursor->node;
	ondisk = leaf->ondisk;
	KKASSERT(ondisk->count > 4);
	if (cursor->index == ondisk->count &&
	    (leaf->flags & HAMMER_NODE_NONLINEAR) == 0) {
		split = (ondisk->count + 1) * 3 / 4;
		leaf->flags |= HAMMER_NODE_NONLINEAR;
	} else {
		split = (ondisk->count + 1) / 2;
	}

#if 0
	/*
	 * If the insertion point is at the split point shift the
	 * split point left so we don't have to worry about it.
	 */
	if (cursor->index == split)
		--split;
#endif
	KKASSERT(split > 0 && split < ondisk->count);

	error = 0;
	hmp = leaf->hmp;

	elm = &ondisk->elms[split];

	KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm[-1].leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->left_bound, &elm->leaf.base) <= 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm->leaf.base) > 0);
	KKASSERT(hammer_btree_cmp(cursor->right_bound, &elm[1].leaf.base) > 0);

	/*
	 * If we are at the root of the tree, create a new root node with
	 * 1 element and split normally.  Avoid making major modifications
	 * until we know the whole operation will work.
	 */
	if (ondisk->parent == 0) {
		parent = hammer_alloc_btree(cursor->trans, 0, &error);
		if (parent == NULL)
			goto done;
		hammer_lock_ex(&parent->lock);
		hammer_modify_node_noundo(cursor->trans, parent);
		ondisk = parent->ondisk;
		ondisk->count = 1;
		ondisk->parent = 0;
		ondisk->mirror_tid = leaf->ondisk->mirror_tid;
		ondisk->type = HAMMER_BTREE_TYPE_INTERNAL;
		ondisk->elms[0].base = hmp->root_btree_beg;
		ondisk->elms[0].base.btype = leaf->ondisk->type;
		ondisk->elms[0].internal.subtree_offset = leaf->node_offset;
		ondisk->elms[0].internal.mirror_tid = ondisk->mirror_tid;
		ondisk->elms[1].base = hmp->root_btree_end;
		/* ondisk->elms[1].base.btype = not used */
		hammer_modify_node_done(parent);
		made_root = 1;
		parent_index = 0;	/* insertion point in parent */
	} else {
		made_root = 0;
		parent = cursor->parent;
		parent_index = cursor->parent_index;
	}

	/*
	 * Split leaf into new_leaf at the split point.  Select a separator
	 * value in-between the two leaves but with a bias toward the right
	 * leaf since comparisons use an 'elm >= separator' inequality.
	 *
	 *  L L L L L L L L
	 *
	 *	   x x P x x
	 *	    s S S s
	 *	   /	   \
	 *  L L L L     L L L L
	 */
	new_leaf = hammer_alloc_btree(cursor->trans, 0, &error);
	if (new_leaf == NULL) {
		if (made_root) {
			hammer_unlock(&parent->lock);
			hammer_delete_node(cursor->trans, parent);
			hammer_rel_node(parent);
		}
		goto done;
	}
	hammer_lock_ex(&new_leaf->lock);

	/*
	 * Create the new node and copy the leaf elements from the split
	 * point on to the new node.
	 */
1875 */ 1876 hammer_modify_node_all(cursor->trans, leaf); 1877 hammer_modify_node_noundo(cursor->trans, new_leaf); 1878 ondisk = leaf->ondisk; 1879 elm = &ondisk->elms[split]; 1880 bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); 1881 new_leaf->ondisk->count = ondisk->count - split; 1882 new_leaf->ondisk->parent = parent->node_offset; 1883 new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; 1884 new_leaf->ondisk->mirror_tid = ondisk->mirror_tid; 1885 KKASSERT(ondisk->type == new_leaf->ondisk->type); 1886 hammer_modify_node_done(new_leaf); 1887 hammer_cursor_split_node(leaf, new_leaf, split); 1888 1889 /* 1890 * Cleanup the original node. Because this is a leaf node and 1891 * leaf nodes do not have a right-hand boundary, there 1892 * aren't any special edge cases to clean up. We just fixup the 1893 * count. 1894 */ 1895 ondisk->count = split; 1896 1897 /* 1898 * Insert the separator into the parent, fixup the parent's 1899 * reference to the original node, and reference the new node. 1900 * The separator is P. 1901 * 1902 * Remember that base.count does not include the right-hand boundary. 1903 * We are copying parent_index+1 to parent_index+2, not +0 to +1. 1904 */ 1905 hammer_modify_node_all(cursor->trans, parent); 1906 ondisk = parent->ondisk; 1907 KKASSERT(split != 0); 1908 KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS); 1909 parent_elm = &ondisk->elms[parent_index+1]; 1910 bcopy(parent_elm, parent_elm + 1, 1911 (ondisk->count - parent_index) * esize); 1912 1913 hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); 1914 parent_elm->internal.base.btype = new_leaf->ondisk->type; 1915 parent_elm->internal.subtree_offset = new_leaf->node_offset; 1916 parent_elm->internal.mirror_tid = new_leaf->ondisk->mirror_tid; 1917 mid_boundary = &parent_elm->base; 1918 ++ondisk->count; 1919 hammer_modify_node_done(parent); 1920 hammer_cursor_inserted_element(parent, parent_index + 1); 1921 1922 /* 1923 * The filesystem's root B-Tree pointer may have to be updated. 1924 */ 1925 if (made_root) { 1926 hammer_volume_t volume; 1927 1928 volume = hammer_get_root_volume(hmp, &error); 1929 KKASSERT(error == 0); 1930 1931 hammer_modify_volume_field(cursor->trans, volume, 1932 vol0_btree_root); 1933 volume->ondisk->vol0_btree_root = parent->node_offset; 1934 hammer_modify_volume_done(volume); 1935 leaf->ondisk->parent = parent->node_offset; 1936 if (cursor->parent) { 1937 hammer_unlock(&cursor->parent->lock); 1938 hammer_rel_node(cursor->parent); 1939 } 1940 cursor->parent = parent; /* lock'd and ref'd */ 1941 hammer_rel_volume(volume, 0); 1942 } 1943 hammer_modify_node_done(leaf); 1944 1945 /* 1946 * Ok, now adjust the cursor depending on which element the original 1947 * index was pointing at. If we are >= the split point the push node 1948 * is now in the new node. 1949 * 1950 * NOTE: If we are at the split point itself we need to select the 1951 * old or new node based on where key_beg's insertion point will be. 1952 * If we pick the wrong side the inserted element will wind up in 1953 * the wrong leaf node and outside that node's bounds. 
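 *
 * That is: index > split always moves to the new leaf and index <
 * split always stays with the original; index == split moves to the
 * new leaf only when key_beg compares >= the new separator
 * (mid_boundary).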
1954 */ 1955 if (cursor->index > split || 1956 (cursor->index == split && 1957 hammer_btree_cmp(&cursor->key_beg, mid_boundary) >= 0)) { 1958 cursor->parent_index = parent_index + 1; 1959 cursor->index -= split; 1960 hammer_unlock(&cursor->node->lock); 1961 hammer_rel_node(cursor->node); 1962 cursor->node = new_leaf; 1963 } else { 1964 cursor->parent_index = parent_index; 1965 hammer_unlock(&new_leaf->lock); 1966 hammer_rel_node(new_leaf); 1967 } 1968 1969 /* 1970 * Fixup left and right bounds 1971 */ 1972 parent_elm = &parent->ondisk->elms[cursor->parent_index]; 1973 cursor->left_bound = &parent_elm[0].internal.base; 1974 cursor->right_bound = &parent_elm[1].internal.base; 1975 1976 /* 1977 * Assert that the bounds are correct. 1978 */ 1979 KKASSERT(hammer_btree_cmp(cursor->left_bound, 1980 &cursor->node->ondisk->elms[0].leaf.base) <= 0); 1981 KKASSERT(hammer_btree_cmp(cursor->right_bound, 1982 &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); 1983 KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->key_beg) <= 0); 1984 KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->key_beg) > 0); 1985 1986 done: 1987 hammer_cursor_downgrade(cursor); 1988 return (error); 1989 } 1990 1991 #if 0 1992 1993 /* 1994 * Recursively correct the right-hand boundary's create_tid to (tid) as 1995 * long as the rest of the key matches. We have to recurse upward in 1996 * the tree as well as down the left side of each parent's right node. 1997 * 1998 * Return EDEADLK if we were only partially successful, forcing the caller 1999 * to try again. The original cursor is not modified. This routine can 2000 * also fail with EDEADLK if it is forced to throw away a portion of its 2001 * record history. 2002 * 2003 * The caller must pass a downgraded cursor to us (otherwise we can't dup it). 2004 */ 2005 struct hammer_rhb { 2006 TAILQ_ENTRY(hammer_rhb) entry; 2007 hammer_node_t node; 2008 int index; 2009 }; 2010 2011 TAILQ_HEAD(hammer_rhb_list, hammer_rhb); 2012 2013 int 2014 hammer_btree_correct_rhb(hammer_cursor_t cursor, hammer_tid_t tid) 2015 { 2016 struct hammer_mount *hmp; 2017 struct hammer_rhb_list rhb_list; 2018 hammer_base_elm_t elm; 2019 hammer_node_t orig_node; 2020 struct hammer_rhb *rhb; 2021 int orig_index; 2022 int error; 2023 2024 TAILQ_INIT(&rhb_list); 2025 hmp = cursor->trans->hmp; 2026 2027 /* 2028 * Save our position so we can restore it on return. This also 2029 * gives us a stable 'elm'. 2030 */ 2031 orig_node = cursor->node; 2032 hammer_ref_node(orig_node); 2033 hammer_lock_sh(&orig_node->lock); 2034 orig_index = cursor->index; 2035 elm = &orig_node->ondisk->elms[orig_index].base; 2036 2037 /* 2038 * Now build a list of parents going up, allocating a rhb 2039 * structure for each one. 2040 */ 2041 while (cursor->parent) { 2042 /* 2043 * Stop if we no longer have any right-bounds to fix up 2044 */ 2045 if (elm->obj_id != cursor->right_bound->obj_id || 2046 elm->rec_type != cursor->right_bound->rec_type || 2047 elm->key != cursor->right_bound->key) { 2048 break; 2049 } 2050 2051 /* 2052 * Stop if the right-hand bound's create_tid does not 2053 * need to be corrected. 
2054 */
2055 if (cursor->right_bound->create_tid >= tid)
2056 break;
2057
2058 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
2059 rhb->node = cursor->parent;
2060 rhb->index = cursor->parent_index;
2061 hammer_ref_node(rhb->node);
2062 hammer_lock_sh(&rhb->node->lock);
2063 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
2064
2065 hammer_cursor_up(cursor);
2066 }
2067
2068 /*
2069 * Now safely adjust the right-hand bound for each rhb. This may
2070 * also require taking the right side of the tree and iterating down
2071 * ITS left side.
2072 */
2073 error = 0;
2074 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2075 error = hammer_cursor_seek(cursor, rhb->node, rhb->index);
2076 if (error)
2077 break;
2078 TAILQ_REMOVE(&rhb_list, rhb, entry);
2079 hammer_unlock(&rhb->node->lock);
2080 hammer_rel_node(rhb->node);
2081 kfree(rhb, hmp->m_misc);
2082
2083 switch (cursor->node->ondisk->type) {
2084 case HAMMER_BTREE_TYPE_INTERNAL:
2085 /*
2086 * Right-boundary for parent at internal node
2087 * is one element to the right of the element whose
2088 * right boundary needs adjusting. We must then
2089 * traverse down the left side correcting any left
2090 * bounds (which may now be too far to the left).
2091 */
2092 ++cursor->index;
2093 error = hammer_btree_correct_lhb(cursor, tid);
2094 break;
2095 default:
2096 panic("hammer_btree_correct_rhb(): Bad node type");
2097 error = EINVAL;
2098 break;
2099 }
2100 }
2101
2102 /*
2103 * Cleanup
2104 */
2105 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) {
2106 TAILQ_REMOVE(&rhb_list, rhb, entry);
2107 hammer_unlock(&rhb->node->lock);
2108 hammer_rel_node(rhb->node);
2109 kfree(rhb, hmp->m_misc);
2110 }
2111 error = hammer_cursor_seek(cursor, orig_node, orig_index);
2112 hammer_unlock(&orig_node->lock);
2113 hammer_rel_node(orig_node);
2114 return (error);
2115 }
2116
2117 /*
2118 * Similar to rhb (in fact, rhb calls lhb), but corrects the left-hand
2119 * bound going downward starting at the current cursor position.
2120 *
2121 * This function does not restore the cursor after use.
2122 */
2123 int
2124 hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid)
2125 {
2126 struct hammer_rhb_list rhb_list;
2127 hammer_base_elm_t elm;
2128 hammer_base_elm_t cmp;
2129 struct hammer_rhb *rhb;
2130 struct hammer_mount *hmp;
2131 int error;
2132
2133 TAILQ_INIT(&rhb_list);
2134 hmp = cursor->trans->hmp;
2135
2136 cmp = &cursor->node->ondisk->elms[cursor->index].base;
2137
2138 /*
2139 * Record the node and traverse down the left-hand side for all
2140 * matching records needing a boundary correction.
2141 */
2142 error = 0;
2143 for (;;) {
2144 rhb = kmalloc(sizeof(*rhb), hmp->m_misc, M_WAITOK|M_ZERO);
2145 rhb->node = cursor->node;
2146 rhb->index = cursor->index;
2147 hammer_ref_node(rhb->node);
2148 hammer_lock_sh(&rhb->node->lock);
2149 TAILQ_INSERT_HEAD(&rhb_list, rhb, entry);
2150
2151 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
2152 /*
2153 * Nothing to traverse down if we are at the right
2154 * boundary of an internal node.
2155 */ 2156 if (cursor->index == cursor->node->ondisk->count) 2157 break; 2158 } else { 2159 elm = &cursor->node->ondisk->elms[cursor->index].base; 2160 if (elm->btype == HAMMER_BTREE_TYPE_RECORD) 2161 break; 2162 panic("Illegal leaf record type %02x", elm->btype); 2163 } 2164 error = hammer_cursor_down(cursor); 2165 if (error) 2166 break; 2167 2168 elm = &cursor->node->ondisk->elms[cursor->index].base; 2169 if (elm->obj_id != cmp->obj_id || 2170 elm->rec_type != cmp->rec_type || 2171 elm->key != cmp->key) { 2172 break; 2173 } 2174 if (elm->create_tid >= tid) 2175 break; 2176 2177 } 2178 2179 /* 2180 * Now we can safely adjust the left-hand boundary from the bottom-up. 2181 * The last element we remove from the list is the caller's right hand 2182 * boundary, which must also be adjusted. 2183 */ 2184 while (error == 0 && (rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2185 error = hammer_cursor_seek(cursor, rhb->node, rhb->index); 2186 if (error) 2187 break; 2188 TAILQ_REMOVE(&rhb_list, rhb, entry); 2189 hammer_unlock(&rhb->node->lock); 2190 hammer_rel_node(rhb->node); 2191 kfree(rhb, hmp->m_misc); 2192 2193 elm = &cursor->node->ondisk->elms[cursor->index].base; 2194 if (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 2195 hammer_modify_node(cursor->trans, cursor->node, 2196 &elm->create_tid, 2197 sizeof(elm->create_tid)); 2198 elm->create_tid = tid; 2199 hammer_modify_node_done(cursor->node); 2200 } else { 2201 panic("hammer_btree_correct_lhb(): Bad element type"); 2202 } 2203 } 2204 2205 /* 2206 * Cleanup 2207 */ 2208 while ((rhb = TAILQ_FIRST(&rhb_list)) != NULL) { 2209 TAILQ_REMOVE(&rhb_list, rhb, entry); 2210 hammer_unlock(&rhb->node->lock); 2211 hammer_rel_node(rhb->node); 2212 kfree(rhb, hmp->m_misc); 2213 } 2214 return (error); 2215 } 2216 2217 #endif 2218 2219 /* 2220 * Attempt to remove the locked, empty or want-to-be-empty B-Tree node at 2221 * (cursor->node). Returns 0 on success, EDEADLK if we could not complete 2222 * the operation due to a deadlock, or some other error. 2223 * 2224 * This routine is initially called with an empty leaf and may be 2225 * recursively called with single-element internal nodes. 2226 * 2227 * It should also be noted that when removing empty leaves we must be sure 2228 * to test and update mirror_tid because another thread may have deadlocked 2229 * against us (or someone) trying to propagate it up and cannot retry once 2230 * the node has been deleted. 2231 * 2232 * On return the cursor may end up pointing to an internal node, suitable 2233 * for further iteration but not for an immediate insertion or deletion. 2234 */ 2235 static int 2236 btree_remove(hammer_cursor_t cursor) 2237 { 2238 hammer_node_ondisk_t ondisk; 2239 hammer_btree_elm_t elm; 2240 hammer_node_t node; 2241 hammer_node_t parent; 2242 const int esize = sizeof(*elm); 2243 int error; 2244 2245 node = cursor->node; 2246 2247 /* 2248 * When deleting the root of the filesystem convert it to 2249 * an empty leaf node. Internal nodes cannot be empty. 2250 */ 2251 ondisk = node->ondisk; 2252 if (ondisk->parent == 0) { 2253 KKASSERT(cursor->parent == NULL); 2254 hammer_modify_node_all(cursor->trans, node); 2255 KKASSERT(ondisk == node->ondisk); 2256 ondisk->type = HAMMER_BTREE_TYPE_LEAF; 2257 ondisk->count = 0; 2258 hammer_modify_node_done(node); 2259 cursor->index = 0; 2260 return(0); 2261 } 2262 2263 parent = cursor->parent; 2264 2265 /* 2266 * Attempt to remove the parent's reference to the child. If the 2267 * parent would become empty we have to recurse. 
If we fail we
2268 * leave the parent pointing to an empty leaf node.
2269 *
2270 * We have to recurse successfully before we can delete the internal
2271 * node as it is illegal to have empty internal nodes. Even though
2272 * the operation may be aborted we must still fixup any unlocked
2273 * cursors as if we had deleted the element prior to recursing
2274 * (by calling hammer_cursor_deleted_element()) so those cursors
2275 * are properly forced up the chain by the recursion.
2276 */
2277 if (parent->ondisk->count == 1) {
2278 /*
2279 * This special cursor_up_locked() call leaves the original
2280 * node exclusively locked and referenced, leaves the
2281 * original parent locked (as the new node), and locks the
2282 * new parent. It can return EDEADLK.
2283 *
2284 * We cannot call hammer_cursor_removed_node() until we are
2285 * actually able to remove the node. If we did then tracked
2286 * cursors in the middle of iterations could be repointed
2287 * to a parent node. If this occurs they could end up
2288 * scanning records newly inserted into the node (which could
2289 * not be deleted) when they push down again.
2290 *
2291 * Due to the way the recursion works the final parent is left
2292 * in cursor->parent after the recursion returns. Each
2293 * layer on the way back up is thus able to call
2294 * hammer_cursor_removed_node() and 'jump' the node up to
2295 * the (same) final parent.
2296 *
2297 * NOTE! The local variable 'parent' is invalid after we
2298 * call hammer_cursor_up_locked().
2299 */
2300 error = hammer_cursor_up_locked(cursor);
2301 parent = NULL;
2302
2303 if (error == 0) {
2304 hammer_cursor_deleted_element(cursor->node, 0);
2305 error = btree_remove(cursor);
2306 if (error == 0) {
2307 KKASSERT(node != cursor->node);
2308 hammer_cursor_removed_node(
2309 node, cursor->node,
2310 cursor->index);
2311 hammer_modify_node_all(cursor->trans, node);
2312 ondisk = node->ondisk;
2313 ondisk->type = HAMMER_BTREE_TYPE_DELETED;
2314 ondisk->count = 0;
2315 hammer_modify_node_done(node);
2316 hammer_flush_node(node, 0);
2317 hammer_delete_node(cursor->trans, node);
2318 } else {
2319 /*
2320 * Defer parent removal because we could not
2321 * get the lock, just let the leaf remain
2322 * empty.
2323 */
2324 /**/
2325 }
2326 hammer_unlock(&node->lock);
2327 hammer_rel_node(node);
2328 } else {
2329 /*
2330 * Defer parent removal because we could not
2331 * get the lock, just let the leaf remain
2332 * empty.
2333 */
2334 /**/
2335 }
2336 } else {
2337 KKASSERT(parent->ondisk->count > 1);
2338
2339 hammer_modify_node_all(cursor->trans, parent);
2340 ondisk = parent->ondisk;
2341 KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
2342
2343 elm = &ondisk->elms[cursor->parent_index];
2344 KKASSERT(elm->internal.subtree_offset == node->node_offset);
2345 KKASSERT(ondisk->count > 0);
2346
2347 /*
2348 * We must retain the highest mirror_tid. The deleted
2349 * range is now encompassed by the element to the left.
2350 * If we are already at the left edge the new left edge
2351 * inherits mirror_tid.
2352 *
2353 * Note that the bounds of the parent of our parent may create
2354 * a gap to the left of our left-most node or to the right
2355 * of our right-most node. The gap is silently included
2356 * in the mirror_tid's area of effect from the point of view
2357 * of the scan.
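 *
 * For example, if the element being deleted carries mirror_tid
 * 0x50 while its left neighbor only carries 0x30, the neighbor is
 * bumped to 0x50 below so a mirroring scan still reports 0x50
 * across the deleted range.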
2358 */
2359 if (cursor->parent_index) {
2360 if (elm[-1].internal.mirror_tid <
2361 elm[0].internal.mirror_tid) {
2362 elm[-1].internal.mirror_tid =
2363 elm[0].internal.mirror_tid;
2364 }
2365 } else {
2366 if (elm[1].internal.mirror_tid <
2367 elm[0].internal.mirror_tid) {
2368 elm[1].internal.mirror_tid =
2369 elm[0].internal.mirror_tid;
2370 }
2371 }
2372
2373 /*
2374 * Delete the subtree reference in the parent. Include
2375 * boundary element at end.
2376 */
2377 bcopy(&elm[1], &elm[0],
2378 (ondisk->count - cursor->parent_index) * esize);
2379 --ondisk->count;
2380 hammer_modify_node_done(parent);
2381 hammer_cursor_removed_node(node, parent, cursor->parent_index);
2382 hammer_cursor_deleted_element(parent, cursor->parent_index);
2383 hammer_flush_node(node, 0);
2384 hammer_delete_node(cursor->trans, node);
2385
2386 /*
2387 * cursor->node is invalid, cursor up to make the cursor
2388 * valid again. We have to flag the condition in case
2389 * another thread wiggles an insertion in during an
2390 * iteration.
2391 */
2392 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
2393 error = hammer_cursor_up(cursor);
2394 }
2395 return (error);
2396 }
2397
2398 /*
2399 * Propagate cursor->trans->tid up the B-Tree starting at the current
2400 * cursor position using pseudofs info gleaned from the passed inode.
2401 *
2402 * The passed inode has no relationship to the cursor position other
2403 * than being in the same pseudofs as the insertion or deletion we
2404 * are propagating the mirror_tid for.
2405 *
2406 * WARNING! Because we push and pop the passed cursor, it may be
2407 * modified by other B-Tree operations while it is unlocked
2408 * and things like the node & leaf pointers, and indexes might
2409 * change.
2410 */
2411 void
2412 hammer_btree_do_propagation(hammer_cursor_t cursor,
2413 hammer_pseudofs_inmem_t pfsm,
2414 hammer_btree_leaf_elm_t leaf)
2415 {
2416 hammer_cursor_t ncursor;
2417 hammer_tid_t mirror_tid;
2418 int error __debugvar;
2419
2420 /*
2421 * We do not propagate a mirror_tid if the filesystem was mounted
2422 * in no-mirror mode.
2423 */
2424 if (cursor->trans->hmp->master_id < 0)
2425 return;
2426
2427 /*
2428 * This is a bit of a hack because we cannot deadlock or return
2429 * EDEADLK here. The related operation has already completed and
2430 * we must propagate the mirror_tid now regardless.
2431 *
2432 * Generate a new cursor which inherits the original's locks and
2433 * unlock the original. Use the new cursor to propagate the
2434 * mirror_tid. Then clean up the new cursor and reacquire locks
2435 * on the original.
2436 *
2437 * hammer_dup_cursor() cannot dup locks. The dup inherits the
2438 * original's locks and the original is tracked and must be
2439 * re-locked.
2440 */
2441 mirror_tid = cursor->node->ondisk->mirror_tid;
2442 KKASSERT(mirror_tid != 0);
2443 ncursor = hammer_push_cursor(cursor);
2444 error = hammer_btree_mirror_propagate(ncursor, mirror_tid);
2445 KKASSERT(error == 0);
2446 hammer_pop_cursor(cursor, ncursor);
2447 /* WARNING: cursor's leaf pointer may change after pop */
2448 }
2449
2450
2451 /*
2452 * Propagate a mirror TID update upwards through the B-Tree to the root.
2453 *
2454 * A locked internal node must be passed in. The node will remain locked
2455 * on return.
2456 *
2457 * This function syncs mirror_tid at the specified internal node's element,
2458 * adjusts the node's aggregation mirror_tid, and then recurses upwards.
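 *
 * Per level, the adjustment below is effectively (pseudocode):
 *
 *	elm->mirror_tid = max(elm->mirror_tid, mirror_tid);
 *	node->ondisk->mirror_tid = max(node->ondisk->mirror_tid,
 *				       mirror_tid);
 *
 * with an early stop at the first level already >= mirror_tid, since
 * the aggregation is monotonic going up the tree.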
2459 */
2460 static int
2461 hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid)
2462 {
2463 hammer_btree_internal_elm_t elm;
2464 hammer_node_t node;
2465 int error;
2466
2467 for (;;) {
2468 error = hammer_cursor_up(cursor);
2469 if (error == 0)
2470 error = hammer_cursor_upgrade(cursor);
2471
2472 /*
2473 * We can ignore HAMMER_CURSOR_ITERATE_CHECK, the
2474 * cursor will still be properly positioned for
2475 * mirror propagation, just not for iterations.
2476 */
2477 while (error == EDEADLK) {
2478 hammer_recover_cursor(cursor);
2479 error = hammer_cursor_upgrade(cursor);
2480 }
2481 if (error)
2482 break;
2483
2484 /*
2485 * If the cursor deadlocked it could end up at a leaf
2486 * after we lost the lock.
2487 */
2488 node = cursor->node;
2489 if (node->ondisk->type != HAMMER_BTREE_TYPE_INTERNAL)
2490 continue;
2491
2492 /*
2493 * Adjust the node's element
2494 */
2495 elm = &node->ondisk->elms[cursor->index].internal;
2496 if (elm->mirror_tid >= mirror_tid)
2497 break;
2498 hammer_modify_node(cursor->trans, node, &elm->mirror_tid,
2499 sizeof(elm->mirror_tid));
2500 elm->mirror_tid = mirror_tid;
2501 hammer_modify_node_done(node);
2502 if (hammer_debug_general & 0x0002) {
2503 kprintf("mirror_propagate: propagate "
2504 "%016llx @%016llx:%d\n",
2505 (long long)mirror_tid,
2506 (long long)node->node_offset,
2507 cursor->index);
2508 }
2509
2510
2511 /*
2512 * Adjust the node's mirror_tid aggregator
2513 */
2514 if (node->ondisk->mirror_tid >= mirror_tid)
2515 return(0);
2516 hammer_modify_node_field(cursor->trans, node, mirror_tid);
2517 node->ondisk->mirror_tid = mirror_tid;
2518 hammer_modify_node_done(node);
2519 if (hammer_debug_general & 0x0002) {
2520 kprintf("mirror_propagate: propagate "
2521 "%016llx @%016llx\n",
2522 (long long)mirror_tid,
2523 (long long)node->node_offset);
2524 }
2525 }
2526 if (error == ENOENT)
2527 error = 0;
2528 return(error);
2529 }
2530
2531 hammer_node_t
2532 hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node,
2533 int *parent_indexp, int *errorp, int try_exclusive)
2534 {
2535 hammer_node_t parent;
2536 hammer_btree_elm_t elm;
2537 int i;
2538
2539 /*
2540 * Get the node
2541 */
2542 parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp);
2543 if (*errorp) {
2544 KKASSERT(parent == NULL);
2545 return(NULL);
2546 }
2547 KKASSERT ((parent->flags & HAMMER_NODE_DELETED) == 0);
2548
2549 /*
2550 * Lock the node
2551 */
2552 if (try_exclusive) {
2553 if (hammer_lock_ex_try(&parent->lock)) {
2554 hammer_rel_node(parent);
2555 *errorp = EDEADLK;
2556 return(NULL);
2557 }
2558 } else {
2559 hammer_lock_sh(&parent->lock);
2560 }
2561
2562 /*
2563 * Figure out which element in the parent is pointing to the
2564 * child.
2565 */
2566 if (node->ondisk->count) {
2567 i = hammer_btree_search_node(&node->ondisk->elms[0].base,
2568 parent->ondisk);
2569 } else {
2570 i = 0;
2571 }
2572 while (i < parent->ondisk->count) {
2573 elm = &parent->ondisk->elms[i];
2574 if (elm->internal.subtree_offset == node->node_offset)
2575 break;
2576 ++i;
2577 }
2578 if (i == parent->ondisk->count) {
2579 hammer_unlock(&parent->lock);
2580 panic("Bad B-Tree link: parent %p node %p", parent, node);
2581 }
2582 *parent_indexp = i;
2583 KKASSERT(*errorp == 0);
2584 return(parent);
2585 }
2586
2587 /*
2588 * The element (elm) has been moved to a new internal node (node).
2589 *
2590 * If the element represents a pointer to an internal or leaf node,
2591 * that node's parent must be adjusted to the element's new location.
2592 * 2593 * XXX deadlock potential here with our exclusive locks 2594 */ 2595 int 2596 btree_set_parent(hammer_transaction_t trans, hammer_node_t node, 2597 hammer_btree_elm_t elm) 2598 { 2599 hammer_node_t child; 2600 int error; 2601 2602 error = 0; 2603 2604 switch(elm->base.btype) { 2605 case HAMMER_BTREE_TYPE_INTERNAL: 2606 case HAMMER_BTREE_TYPE_LEAF: 2607 child = hammer_get_node(trans, elm->internal.subtree_offset, 2608 0, &error); 2609 if (error == 0) { 2610 hammer_modify_node_field(trans, child, parent); 2611 child->ondisk->parent = node->node_offset; 2612 hammer_modify_node_done(child); 2613 hammer_rel_node(child); 2614 } 2615 break; 2616 default: 2617 break; 2618 } 2619 return(error); 2620 } 2621 2622 /* 2623 * Initialize the root of a recursive B-Tree node lock list structure. 2624 */ 2625 void 2626 hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node) 2627 { 2628 TAILQ_INIT(&parent->list); 2629 parent->parent = NULL; 2630 parent->node = node; 2631 parent->index = -1; 2632 parent->count = node->ondisk->count; 2633 parent->copy = NULL; 2634 parent->flags = 0; 2635 } 2636 2637 /* 2638 * Initialize a cache of hammer_node_lock's including space allocated 2639 * for node copies. 2640 * 2641 * This is used by the rebalancing code to preallocate the copy space 2642 * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER 2643 * locks, otherwise we can blow out the pageout daemon's emergency 2644 * reserve and deadlock it. 2645 * 2646 * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache. 2647 * The flag is set when the item is pulled off the cache for use. 2648 */ 2649 void 2650 hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache, 2651 int depth) 2652 { 2653 hammer_node_lock_t item; 2654 int count; 2655 2656 for (count = 1; depth; --depth) 2657 count *= HAMMER_BTREE_LEAF_ELMS; 2658 bzero(lcache, sizeof(*lcache)); 2659 TAILQ_INIT(&lcache->list); 2660 while (count) { 2661 item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO); 2662 item->copy = kmalloc(sizeof(*item->copy), 2663 hmp->m_misc, M_WAITOK); 2664 TAILQ_INIT(&item->list); 2665 TAILQ_INSERT_TAIL(&lcache->list, item, entry); 2666 --count; 2667 } 2668 } 2669 2670 void 2671 hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache) 2672 { 2673 hammer_node_lock_t item; 2674 2675 while ((item = TAILQ_FIRST(&lcache->list)) != NULL) { 2676 TAILQ_REMOVE(&lcache->list, item, entry); 2677 KKASSERT(item->copy); 2678 KKASSERT(TAILQ_EMPTY(&item->list)); 2679 kfree(item->copy, hmp->m_misc); 2680 kfree(item, hmp->m_misc); 2681 } 2682 KKASSERT(lcache->copy == NULL); 2683 } 2684 2685 /* 2686 * Exclusively lock all the children of node. This is used by the split 2687 * code to prevent anyone from accessing the children of a cursor node 2688 * while we fix-up its parent offset. 2689 * 2690 * If we don't lock the children we can really mess up cursors which block 2691 * trying to cursor-up into our node. 2692 * 2693 * On failure EDEADLK (or some other error) is returned. If a deadlock 2694 * error is returned the cursor is adjusted to block on termination. 2695 * 2696 * The caller is responsible for managing parent->node, the root's node 2697 * is usually aliased from a cursor. 
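 *
 * A sketch of the typical calling sequence (cf. the split code above,
 * which pairs these calls around its parent fixups):
 *
 *	struct hammer_node_lock lockroot;
 *
 *	hammer_node_lock_init(&lockroot, cursor->node);
 *	error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
 *	if (error == 0) {
 *		... operate on the exclusively locked children ...
 *	}
 *	hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);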
2698 */ 2699 int 2700 hammer_btree_lock_children(hammer_cursor_t cursor, int depth, 2701 hammer_node_lock_t parent, 2702 hammer_node_lock_t lcache) 2703 { 2704 hammer_node_t node; 2705 hammer_node_lock_t item; 2706 hammer_node_ondisk_t ondisk; 2707 hammer_btree_elm_t elm; 2708 hammer_node_t child; 2709 struct hammer_mount *hmp; 2710 int error; 2711 int i; 2712 2713 node = parent->node; 2714 ondisk = node->ondisk; 2715 error = 0; 2716 hmp = cursor->trans->hmp; 2717 2718 /* 2719 * We really do not want to block on I/O with exclusive locks held, 2720 * pre-get the children before trying to lock the mess. This is 2721 * only done one-level deep for now. 2722 */ 2723 for (i = 0; i < ondisk->count; ++i) { 2724 ++hammer_stats_btree_elements; 2725 elm = &ondisk->elms[i]; 2726 if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF && 2727 elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) { 2728 continue; 2729 } 2730 child = hammer_get_node(cursor->trans, 2731 elm->internal.subtree_offset, 2732 0, &error); 2733 if (child) 2734 hammer_rel_node(child); 2735 } 2736 2737 /* 2738 * Do it for real 2739 */ 2740 for (i = 0; error == 0 && i < ondisk->count; ++i) { 2741 ++hammer_stats_btree_elements; 2742 elm = &ondisk->elms[i]; 2743 2744 switch(elm->base.btype) { 2745 case HAMMER_BTREE_TYPE_INTERNAL: 2746 case HAMMER_BTREE_TYPE_LEAF: 2747 KKASSERT(elm->internal.subtree_offset != 0); 2748 child = hammer_get_node(cursor->trans, 2749 elm->internal.subtree_offset, 2750 0, &error); 2751 break; 2752 default: 2753 child = NULL; 2754 break; 2755 } 2756 if (child) { 2757 if (hammer_lock_ex_try(&child->lock) != 0) { 2758 if (cursor->deadlk_node == NULL) { 2759 cursor->deadlk_node = child; 2760 hammer_ref_node(cursor->deadlk_node); 2761 } 2762 error = EDEADLK; 2763 hammer_rel_node(child); 2764 } else { 2765 if (lcache) { 2766 item = TAILQ_FIRST(&lcache->list); 2767 KKASSERT(item != NULL); 2768 item->flags |= HAMMER_NODE_LOCK_LCACHE; 2769 TAILQ_REMOVE(&lcache->list, 2770 item, entry); 2771 } else { 2772 item = kmalloc(sizeof(*item), 2773 hmp->m_misc, 2774 M_WAITOK|M_ZERO); 2775 TAILQ_INIT(&item->list); 2776 } 2777 2778 TAILQ_INSERT_TAIL(&parent->list, item, entry); 2779 item->parent = parent; 2780 item->node = child; 2781 item->index = i; 2782 item->count = child->ondisk->count; 2783 2784 /* 2785 * Recurse (used by the rebalancing code) 2786 */ 2787 if (depth > 1 && elm->base.btype == HAMMER_BTREE_TYPE_INTERNAL) { 2788 error = hammer_btree_lock_children( 2789 cursor, 2790 depth - 1, 2791 item, 2792 lcache); 2793 } 2794 } 2795 } 2796 } 2797 if (error) 2798 hammer_btree_unlock_children(hmp, parent, lcache); 2799 return(error); 2800 } 2801 2802 /* 2803 * Create an in-memory copy of all B-Tree nodes listed, recursively, 2804 * including the parent. 2805 */ 2806 void 2807 hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent) 2808 { 2809 hammer_mount_t hmp = cursor->trans->hmp; 2810 hammer_node_lock_t item; 2811 2812 if (parent->copy == NULL) { 2813 KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0); 2814 parent->copy = kmalloc(sizeof(*parent->copy), 2815 hmp->m_misc, M_WAITOK); 2816 } 2817 KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0); 2818 *parent->copy = *parent->node->ondisk; 2819 TAILQ_FOREACH(item, &parent->list, entry) { 2820 hammer_btree_lock_copy(cursor, item); 2821 } 2822 } 2823 2824 /* 2825 * Recursively sync modified copies to the media. 
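 *
 * Returns the number of nodes synced. Only copies flagged
 * HAMMER_NODE_LOCK_UPDATED are written back; a copy whose type was
 * set to HAMMER_BTREE_TYPE_DELETED additionally flushes and deletes
 * the underlying node.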
2826 */
2827 int
2828 hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
2829 {
2830 hammer_node_lock_t item;
2831 int count = 0;
2832
2833 if (parent->flags & HAMMER_NODE_LOCK_UPDATED) {
2834 ++count;
2835 hammer_modify_node_all(cursor->trans, parent->node);
2836 *parent->node->ondisk = *parent->copy;
2837 hammer_modify_node_done(parent->node);
2838 if (parent->copy->type == HAMMER_BTREE_TYPE_DELETED) {
2839 hammer_flush_node(parent->node, 0);
2840 hammer_delete_node(cursor->trans, parent->node);
2841 }
2842 }
2843 TAILQ_FOREACH(item, &parent->list, entry) {
2844 count += hammer_btree_sync_copy(cursor, item);
2845 }
2846 return(count);
2847 }
2848
2849 /*
2850 * Release previously obtained node locks. The caller is responsible for
2851 * cleaning up parent->node itself (it's usually just aliased from a cursor),
2852 * but this function will take care of the copies.
2853 *
2854 * NOTE: The root node is not placed in the lcache and node->copy is not
2855 * deallocated when lcache != NULL.
2856 */
2857 void
2858 hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent,
2859 hammer_node_lock_t lcache)
2860 {
2861 hammer_node_lock_t item;
2862 hammer_node_ondisk_t copy;
2863
2864 while ((item = TAILQ_FIRST(&parent->list)) != NULL) {
2865 TAILQ_REMOVE(&parent->list, item, entry);
2866 hammer_btree_unlock_children(hmp, item, lcache);
2867 hammer_unlock(&item->node->lock);
2868 hammer_rel_node(item->node);
2869 if (lcache) {
2870 /*
2871 * NOTE: When placing the item back in the lcache
2872 * the flag is cleared by the bzero().
2873 * Remaining fields are cleared as a safety
2874 * measure.
2875 */
2876 KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE);
2877 KKASSERT(TAILQ_EMPTY(&item->list));
2878 copy = item->copy;
2879 bzero(item, sizeof(*item));
2880 TAILQ_INIT(&item->list);
2881 item->copy = copy;
2882 if (copy)
2883 bzero(copy, sizeof(*copy));
2884 TAILQ_INSERT_TAIL(&lcache->list, item, entry);
2885 } else {
2886 kfree(item, hmp->m_misc);
2887 }
2888 }
2889 if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) {
2890 kfree(parent->copy, hmp->m_misc);
2891 parent->copy = NULL; /* safety */
2892 }
2893 }
2894
2895 /************************************************************************
2896 * MISCELLANEOUS SUPPORT *
2897 ************************************************************************/
2898
2899 /*
2900 * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp).
2901 *
2902 * Note that for this particular function a return value of -1, 0, or +1
2903 * can denote a match if create_tid is otherwise discounted. A create_tid
2904 * of zero is considered to be 'infinity' in comparisons.
2905 *
2906 * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
2907 */
2908 int
2909 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
2910 {
2911 if (key1->localization < key2->localization)
2912 return(-5);
2913 if (key1->localization > key2->localization)
2914 return(5);
2915
2916 if (key1->obj_id < key2->obj_id)
2917 return(-4);
2918 if (key1->obj_id > key2->obj_id)
2919 return(4);
2920
2921 if (key1->rec_type < key2->rec_type)
2922 return(-3);
2923 if (key1->rec_type > key2->rec_type)
2924 return(3);
2925
2926 if (key1->key < key2->key)
2927 return(-2);
2928 if (key1->key > key2->key)
2929 return(2);
2930
2931 /*
2932 * A create_tid of zero indicates a record which is undeletable
2933 * and must be considered to have a value of positive infinity.
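 *
 * For example, two keys identical except for key1->create_tid == 10
 * and key2->create_tid == 0 compare as key1 < key2 (return -1),
 * because the zero create_tid sorts as positive infinity.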
2934 */
2935 if (key1->create_tid == 0) {
2936 if (key2->create_tid == 0)
2937 return(0);
2938 return(1);
2939 }
2940 if (key2->create_tid == 0)
2941 return(-1);
2942 if (key1->create_tid < key2->create_tid)
2943 return(-1);
2944 if (key1->create_tid > key2->create_tid)
2945 return(1);
2946 return(0);
2947 }
2948
2949 /*
2950 * Test a timestamp against an element to determine whether the
2951 * element is visible. A timestamp of 0 means 'infinity'.
2952 */
2953 int
2954 hammer_btree_chkts(hammer_tid_t asof, hammer_base_elm_t base)
2955 {
2956 if (asof == 0) {
2957 if (base->delete_tid)
2958 return(1);
2959 return(0);
2960 }
2961 if (asof < base->create_tid)
2962 return(-1);
2963 if (base->delete_tid && asof >= base->delete_tid)
2964 return(1);
2965 return(0);
2966 }
2967
2968 /*
2969 * Create a separator halfway between key1 and key2. For fields just
2970 * one unit apart, the separator will match key2. key1 is on the left-hand
2971 * side and key2 is on the right-hand side.
2972 *
2973 * key2 must be >= the separator. It is ok for the separator to match key2.
2974 *
2975 * NOTE: Even if key1 does not match key2, the separator may wind up matching
2976 * key2.
2977 *
2978 * NOTE: It might be beneficial to scrap this whole mess and just
2979 * set the separator to key2.
2980 */
2981 #define MAKE_SEPARATOR(key1, key2, dest, field) \
2982 dest->field = key1->field + ((key2->field - key1->field + 1) >> 1);
2983
2984 static void
2985 hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2,
2986 hammer_base_elm_t dest)
2987 {
2988 bzero(dest, sizeof(*dest));
2989
2990 dest->rec_type = key2->rec_type;
2991 dest->key = key2->key;
2992 dest->obj_id = key2->obj_id;
2993 dest->create_tid = key2->create_tid;
2994
2995 MAKE_SEPARATOR(key1, key2, dest, localization);
2996 if (key1->localization == key2->localization) {
2997 MAKE_SEPARATOR(key1, key2, dest, obj_id);
2998 if (key1->obj_id == key2->obj_id) {
2999 MAKE_SEPARATOR(key1, key2, dest, rec_type);
3000 if (key1->rec_type == key2->rec_type) {
3001 MAKE_SEPARATOR(key1, key2, dest, key);
3002 /*
3003 * Don't bother creating a separator for
3004 * create_tid, which also conveniently avoids
3005 * having to handle the create_tid == 0
3006 * (infinity) case. Just leave create_tid
3007 * set to key2.
3008 *
3009 * Worst case, dest matches key2 exactly,
3010 * which is acceptable.
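 *
 * (For the fields that do get a computed separator the
 * arithmetic is straightforward: key1->key = 10 and
 * key2->key = 20 give 10 + ((20 - 10 + 1) >> 1) = 15,
 * while the adjacent pair 10 and 11 give 11, matching
 * key2 as documented above.)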
3011 */ 3012 } 3013 } 3014 } 3015 } 3016 3017 #undef MAKE_SEPARATOR 3018 3019 /* 3020 * Return whether a generic internal or leaf node is full 3021 */ 3022 static __inline 3023 int 3024 btree_node_is_full(hammer_node_ondisk_t node) 3025 { 3026 return(btree_max_elements(node->type) == node->count); 3027 } 3028 3029 static __inline 3030 int 3031 btree_max_elements(u_int8_t type) 3032 { 3033 if (type == HAMMER_BTREE_TYPE_LEAF) 3034 return(HAMMER_BTREE_LEAF_ELMS); 3035 if (type == HAMMER_BTREE_TYPE_INTERNAL) 3036 return(HAMMER_BTREE_INT_ELMS); 3037 panic("btree_max_elements: bad type %d", type); 3038 } 3039 3040 void 3041 hammer_print_btree_node(hammer_node_ondisk_t ondisk) 3042 { 3043 hammer_btree_elm_t elm; 3044 int i; 3045 3046 kprintf("node %p count=%d parent=%016llx type=%c\n", 3047 ondisk, ondisk->count, 3048 (long long)ondisk->parent, ondisk->type); 3049 3050 /* 3051 * Dump both boundary elements if an internal node 3052 */ 3053 if (ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) { 3054 for (i = 0; i <= ondisk->count; ++i) { 3055 elm = &ondisk->elms[i]; 3056 hammer_print_btree_elm(elm, ondisk->type, i); 3057 } 3058 } else { 3059 for (i = 0; i < ondisk->count; ++i) { 3060 elm = &ondisk->elms[i]; 3061 hammer_print_btree_elm(elm, ondisk->type, i); 3062 } 3063 } 3064 } 3065 3066 void 3067 hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i) 3068 { 3069 kprintf(" %2d", i); 3070 kprintf("\tobj_id = %016llx\n", (long long)elm->base.obj_id); 3071 kprintf("\tkey = %016llx\n", (long long)elm->base.key); 3072 kprintf("\tcreate_tid = %016llx\n", (long long)elm->base.create_tid); 3073 kprintf("\tdelete_tid = %016llx\n", (long long)elm->base.delete_tid); 3074 kprintf("\trec_type = %04x\n", elm->base.rec_type); 3075 kprintf("\tobj_type = %02x\n", elm->base.obj_type); 3076 kprintf("\tbtype = %02x (%c)\n", 3077 elm->base.btype, 3078 (elm->base.btype ? elm->base.btype : '?')); 3079 kprintf("\tlocalization = %02x\n", elm->base.localization); 3080 3081 switch(type) { 3082 case HAMMER_BTREE_TYPE_INTERNAL: 3083 kprintf("\tsubtree_off = %016llx\n", 3084 (long long)elm->internal.subtree_offset); 3085 break; 3086 case HAMMER_BTREE_TYPE_RECORD: 3087 kprintf("\tdata_offset = %016llx\n", 3088 (long long)elm->leaf.data_offset); 3089 kprintf("\tdata_len = %08x\n", elm->leaf.data_len); 3090 kprintf("\tdata_crc = %08x\n", elm->leaf.data_crc); 3091 break; 3092 } 3093 } 3094