/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER B-Tree index - cursor support routines
 */
#include "hammer.h"

static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive);

/*
 * Initialize a fresh cursor using the B-Tree node cache.  If the cache
 * is not available, initialize a fresh cursor at the root of the filesystem.
 */
int
hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
		   hammer_node_cache_t cache, hammer_inode_t ip)
{
	hammer_volume_t volume;
	hammer_node_t node;
	hammer_mount_t hmp;
	u_int tticks;
	int error;

	bzero(cursor, sizeof(*cursor));

	cursor->trans = trans;
	hmp = trans->hmp;

	/*
	 * As the number of inodes queued to the flusher increases we use
	 * time-domain multiplexing to control read vs flush performance.
	 * We have to do it here, before acquiring any ip or node locks,
	 * to avoid deadlocking or excessively delaying the flusher.
	 *
	 * The full time period is hammer_tdmux_ticks, typically 1/5 of
	 * a second.
	 *
	 * inode allocation begins to get restrained at 2/4 the limit
	 * via the "hmrrcm" mechanism in hammer_inode.  We want to begin
	 * limiting read activity before that to try to avoid processes
	 * stalling out in "hmrrcm".
	 */
	tticks = hammer_tdmux_ticks;
	if (trans->type != HAMMER_TRANS_FLS && tticks &&
	    hmp->count_reclaims > hammer_limit_reclaims / tticks &&
	    hmp->count_reclaims > hammer_autoflush * 2 &&
	    hammer_flusher_running(hmp)) {
		u_int rticks;
		u_int xticks;
		u_int dummy;

		/*
		 * 0 ... xticks ... tticks
		 *
		 * rticks is the calculated position, xticks is the demarc
		 * where values below xticks are reserved for the flusher
		 * and values >= xticks may be used by the frontend.
		 *
		 * At least one tick is always made available for the
		 * frontend.
		 */
		rticks = (u_int)ticks % tticks;
		xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims;

		/*
		 * Ensure rticks and xticks are stable
		 */
		cpu_ccfence();
		if (rticks < xticks) {
			if (hammer_debug_general & 0x0004)
				kprintf("rt %3u, xt %3u, tt %3u\n",
					rticks, xticks, tticks);
			tsleep(&dummy, 0, "htdmux", xticks - rticks);
		}
	}

	/*
	 * If the cursor operation is on behalf of an inode, lock
	 * the inode.
	 *
	 * When acquiring a shared lock on an inode on which the backend
	 * flusher deadlocked, wait up to hammer_tdmux_ticks (typically
	 * 1/5 second) for the deadlock to clear.
	 */
	if ((cursor->ip = ip) != NULL) {
		++ip->cursor_ip_refs;
		if (trans->type == HAMMER_TRANS_FLS) {
			hammer_lock_ex(&ip->lock);
		} else {
#if 0
			if (ip->cursor_exclreq_count) {
				tsleep(&ip->cursor_exclreq_count, 0,
				       "hstag1", hammer_tdmux_ticks);
			}
#endif
			hammer_lock_sh(&ip->lock);
		}
	}

	/*
	 * Step 1 - acquire a locked node from the cache if possible
	 */
	if (cache && cache->node) {
		node = hammer_ref_node_safe(trans, cache, &error);
		if (error == 0) {
			hammer_lock_sh(&node->lock);
			if (node->flags & HAMMER_NODE_DELETED) {
				hammer_unlock(&node->lock);
				hammer_rel_node(node);
				node = NULL;
			}
		}
		if (node == NULL)
			++hammer_stats_btree_root_iterations;
	} else {
		node = NULL;
		++hammer_stats_btree_root_iterations;
	}

	/*
	 * Step 2 - If we couldn't get a node from the cache, get
	 * the one from the root of the filesystem.
	 */
	while (node == NULL) {
		volume = hammer_get_root_volume(hmp, &error);
		if (error)
			break;
		node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
				       0, &error);
		hammer_rel_volume(volume, 0);
		if (error)
			break;
		/*
		 * When the frontend acquires the root b-tree node while the
		 * backend is deadlocked on it, wait up to hammer_tdmux_ticks
		 * (typically 1/5 second) for the deadlock to clear.
		 */
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag3", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);

		/*
		 * If someone got in before we could lock the node, retry.
		 */
		if (node->flags & HAMMER_NODE_DELETED) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
		if (volume->ondisk->vol0_btree_root != node->node_offset) {
			hammer_unlock(&node->lock);
			hammer_rel_node(node);
			node = NULL;
			continue;
		}
	}

	/*
	 * Step 3 - finish initializing the cursor by acquiring the parent
	 */
	cursor->node = node;
	if (error == 0)
		error = hammer_load_cursor_parent(cursor, 0);
	KKASSERT(error == 0);
	/* if (error) hammer_done_cursor(cursor); */
	return(error);
}

/*
 * Normalize a cursor.  Sometimes cursors can be left in a state
 * where node is NULL.  If the cursor is in this state, cursor up.
 */
void
hammer_normalize_cursor(hammer_cursor_t cursor)
{
	if (cursor->node == NULL) {
		KKASSERT(cursor->parent != NULL);
		hammer_cursor_up(cursor);
	}
}

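/*
 * Illustrative sketch of the typical cursor lifecycle (not lifted from any
 * particular caller; the key setup is a placeholder and the node cache
 * argument may be a per-inode cache or NULL):
 *
 *	struct hammer_cursor cursor;
 *	int error;
 *
 *	error = hammer_init_cursor(trans, &cursor, cache, ip);
 *	cursor.key_beg = ...;			(caller-specific key range)
 *	error = hammer_btree_lookup(&cursor);
 *	...
 *	hammer_done_cursor(&cursor);
 */
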
/*
 * We are finished with a cursor.  We NULL out various fields as a sanity
 * check, in case the structure is inappropriately used afterwards.
 */
void
hammer_done_cursor(hammer_cursor_t cursor)
{
	hammer_inode_t ip;

	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	if (cursor->node) {
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = NULL;
	}
	if (cursor->data_buffer) {
		hammer_rel_buffer(cursor->data_buffer, 0);
		cursor->data_buffer = NULL;
	}
	if ((ip = cursor->ip) != NULL) {
		KKASSERT(ip->cursor_ip_refs > 0);
		--ip->cursor_ip_refs;
		hammer_unlock(&ip->lock);
		cursor->ip = NULL;
	}
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * If we deadlocked this node will be referenced.  Do a quick
	 * lock/unlock to wait for the deadlock condition to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && cursor->trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}

	cursor->data = NULL;
	cursor->leaf = NULL;
	cursor->left_bound = NULL;
	cursor->right_bound = NULL;
	cursor->trans = NULL;
}

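/*
 * Illustrative sketch of the EDEADLK retry convention used by callers that
 * modify the B-Tree (some_btree_operation is a placeholder; real callers
 * also re-seek their keys before retrying):
 *
 *	retry:
 *		hammer_init_cursor(trans, &cursor, cache, ip);
 *		error = some_btree_operation(&cursor);
 *		hammer_done_cursor(&cursor);	(waits on deadlk_node / deadlk_rec)
 *		if (error == EDEADLK)
 *			goto retry;
 */
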
/*
 * Upgrade cursor->node and cursor->parent to exclusive locks.  This
 * function can return EDEADLK.
 *
 * The lock must already be either held shared or already held exclusively
 * by us.
 *
 * We upgrade the parent first as it is the most likely to collide first
 * with the downward traversal that the frontend typically does.
 *
 * If we fail to upgrade the lock and cursor->deadlk_node is NULL,
 * we add another reference to the node that failed and set
 * cursor->deadlk_node so hammer_done_cursor() can block on it.
 */
int
hammer_cursor_upgrade(hammer_cursor_t cursor)
{
	int error;

	if (cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	} else {
		error = 0;
	}
	if (error == 0) {
		error = hammer_lock_upgrade(&cursor->node->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->node;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#if 0
	error = hammer_lock_upgrade(&cursor->node->lock, 1);
	if (error && cursor->deadlk_node == NULL) {
		cursor->deadlk_node = cursor->node;
		hammer_ref_node(cursor->deadlk_node);
	} else if (error == 0 && cursor->parent) {
		error = hammer_lock_upgrade(&cursor->parent->lock, 1);
		if (error && cursor->deadlk_node == NULL) {
			cursor->deadlk_node = cursor->parent;
			hammer_ref_node(cursor->deadlk_node);
		}
	}
#endif
	return(error);
}

int
hammer_cursor_upgrade_node(hammer_cursor_t cursor)
{
	int error;

	error = hammer_lock_upgrade(&cursor->node->lock, 1);
	if (error && cursor->deadlk_node == NULL) {
		cursor->deadlk_node = cursor->node;
		hammer_ref_node(cursor->deadlk_node);
	}
	return(error);
}

/*
 * Downgrade cursor->node and cursor->parent to shared locks.
 */
void
hammer_cursor_downgrade(hammer_cursor_t cursor)
{
	if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
		hammer_lock_downgrade(&cursor->node->lock, 1);
	if (cursor->parent &&
	    hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
		hammer_lock_downgrade(&cursor->parent->lock, 1);
	}
}

/*
 * Upgrade and downgrade pairs of cursors.  This is used by the dedup
 * code which must deal with two cursors.  A special function is needed
 * because some of the nodes may be shared between the two cursors,
 * resulting in share counts > 1 which will normally cause an upgrade
 * to fail.
 */
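/*
 * Helper for hammer_cursor_upgrade2() and hammer_cursor_downgrade2():
 * record node in array[]/counts[], bumping the per-node share count if the
 * node was already collected, and return the number of distinct nodes
 * collected so far.
 */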
static __noinline
int
collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node)
{
	int i;

	for (i = 0; i < n; ++i) {
		if (array[i] == node)
			break;
	}
	if (i == n) {
		array[i] = node;
		counts[i] = 1;
		++n;
	} else {
		++counts[i];
	}
	return(n);
}

int
hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
{
	hammer_node_t nodes[4];
	int counts[4];
	int error;
	int i;
	int n;

	n = collect_node(nodes, counts, 0, cursor1->node);
	if (cursor1->parent)
		n = collect_node(nodes, counts, n, cursor1->parent);
	n = collect_node(nodes, counts, n, cursor2->node);
	if (cursor2->parent)
		n = collect_node(nodes, counts, n, cursor2->parent);

	error = 0;
	for (i = 0; i < n; ++i) {
		error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]);
		if (error)
			break;
	}
	if (error) {
		while (--i >= 0)
			hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
	}
	return (error);
}

void
hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
{
	hammer_node_t nodes[4];
	int counts[4];
	int i;
	int n;

	n = collect_node(nodes, counts, 0, cursor1->node);
	if (cursor1->parent)
		n = collect_node(nodes, counts, n, cursor1->parent);
	n = collect_node(nodes, counts, n, cursor2->node);
	if (cursor2->parent)
		n = collect_node(nodes, counts, n, cursor2->parent);

	for (i = 0; i < n; ++i)
		hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
}

/*
 * Seek the cursor to the specified node and index.
 *
 * The caller must ref the node prior to calling this routine and release
 * it after it returns.  If the seek succeeds the cursor will gain its own
 * ref on the node.
 */
int
hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
{
	int error;

	hammer_cursor_downgrade(cursor);
	error = 0;

	if (cursor->node != node) {
		hammer_unlock(&cursor->node->lock);
		hammer_rel_node(cursor->node);
		cursor->node = node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);

		if (cursor->parent) {
			hammer_unlock(&cursor->parent->lock);
			hammer_rel_node(cursor->parent);
			cursor->parent = NULL;
			cursor->parent_index = 0;
		}
		error = hammer_load_cursor_parent(cursor, 0);
	}
	cursor->index = index;
	return (error);
}

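/*
 * Note for reference (restating the invariant maintained below): a cursor's
 * left_bound/right_bound always bracket the key space covered by
 * cursor->node.  They point into the parent's element array, or at the
 * mount's root_btree_beg/root_btree_end when the node is the root.
 */
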
/*
 * Load the parent of cursor->node into cursor->parent.
 */
static
int
hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
{
	hammer_mount_t hmp;
	hammer_node_t parent;
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;
	int parent_index;

	hmp = cursor->trans->hmp;

	if (cursor->node->ondisk->parent) {
		node = cursor->node;
		parent = hammer_btree_get_parent(cursor->trans, node,
						 &parent_index,
						 &error, try_exclusive);
		if (error == 0) {
			elm = &parent->ondisk->elms[parent_index];
			cursor->parent = parent;
			cursor->parent_index = parent_index;
			cursor->left_bound = &elm[0].internal.base;
			cursor->right_bound = &elm[1].internal.base;
		}
	} else {
		cursor->parent = NULL;
		cursor->parent_index = 0;
		cursor->left_bound = &hmp->root_btree_beg;
		cursor->right_bound = &hmp->root_btree_end;
		error = 0;
	}
	return(error);
}

/*
 * Cursor up to our parent node.  Return ENOENT if we are at the root of
 * the filesystem.
 */
int
hammer_cursor_up(hammer_cursor_t cursor)
{
	int error;

	hammer_cursor_downgrade(cursor);

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	/*
	 * Set the node to its parent.
	 */
	hammer_unlock(&cursor->node->lock);
	hammer_rel_node(cursor->node);
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}

/*
 * Special cursor up given a locked cursor.  The original node is not
 * unlocked or released and the cursor is not downgraded.
 *
 * This function can fail with EDEADLK.
 *
 * This function is only run when recursively deleting parent nodes
 * to get rid of an empty leaf.
 */
int
hammer_cursor_up_locked(hammer_cursor_t cursor)
{
	hammer_node_t save;
	int error;
	int save_index;

	/*
	 * If the parent is NULL we are at the root of the B-Tree and
	 * return ENOENT.
	 */
	if (cursor->parent == NULL)
		return (ENOENT);

	save = cursor->node;
	save_index = cursor->index;

	/*
	 * Set the node to its parent.
	 */
	cursor->node = cursor->parent;
	cursor->index = cursor->parent_index;
	cursor->parent = NULL;
	cursor->parent_index = 0;

	/*
	 * Load the new parent, attempt to exclusively lock it.  Note that
	 * we are still holding the old parent (now cursor->node)
	 * exclusively locked.
	 *
	 * This can return EDEADLK.  Undo the operation on any error.
	 * These up sequences can occur during iterations so be sure to
	 * restore the index.
	 */
	error = hammer_load_cursor_parent(cursor, 1);
	if (error) {
		cursor->parent = cursor->node;
		cursor->parent_index = cursor->index;
		cursor->node = save;
		cursor->index = save_index;
	}
	return(error);
}

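/*
 * Illustrative sketch of a descent using hammer_cursor_down() (simplified;
 * the real lookup code in hammer_btree.c also handles boundary cases,
 * create_check, and EDEADLK):
 *
 *	while (cursor->node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL) {
 *		cursor->index = (index of the matching internal element);
 *		error = hammer_cursor_down(cursor);
 *		if (error)
 *			break;
 *	}
 */
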
/*
 * Cursor down through the current node, which must be an internal node.
 *
 * This routine adjusts the cursor and sets index to 0.
 */
int
hammer_cursor_down(hammer_cursor_t cursor)
{
	hammer_node_t node;
	hammer_btree_elm_t elm;
	int error;

	/*
	 * The current node becomes the current parent
	 */
	hammer_cursor_downgrade(cursor);
	node = cursor->node;
	KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
	}
	cursor->parent = node;
	cursor->parent_index = cursor->index;
	cursor->node = NULL;
	cursor->index = 0;

	/*
	 * Extract element to push into at (node,index), set bounds.
	 */
	elm = &node->ondisk->elms[cursor->parent_index];

	/*
	 * Ok, push down into elm.  If elm specifies an internal or leaf
	 * node the current node must be an internal node.
	 */
	switch(elm->base.btype) {
	case HAMMER_BTREE_TYPE_INTERNAL:
	case HAMMER_BTREE_TYPE_LEAF:
		KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
		KKASSERT(elm->internal.subtree_offset != 0);
		cursor->left_bound = &elm[0].internal.base;
		cursor->right_bound = &elm[1].internal.base;
		node = hammer_get_node(cursor->trans,
				       elm->internal.subtree_offset, 0, &error);
		if (error == 0) {
			KASSERT(elm->base.btype == node->ondisk->type,
				("BTYPE MISMATCH %c %c NODE %p",
				 elm->base.btype, node->ondisk->type, node));
			if (node->ondisk->parent != cursor->parent->node_offset)
				panic("node %p %016llx vs %016llx",
				      node,
				      (long long)node->ondisk->parent,
				      (long long)cursor->parent->node_offset);
			KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
		}
		break;
	default:
		panic("hammer_cursor_down: illegal btype %02x (%c)",
		      elm->base.btype,
		      (elm->base.btype ? elm->base.btype : '?'));
		break;
	}

	/*
	 * If no error occurred we can lock the new child node.  If the
	 * node is deadlock flagged wait up to hammer_tdmux_ticks (typically
	 * 1/5 second) for the deadlock to clear.  Otherwise a large number
	 * of concurrent readers can continuously stall the flusher.
	 *
	 * We specifically do this in the cursor_down() code in order to
	 * deal with frontend top-down searches smashing against bottom-up
	 * flusher-based mirror updates.  These collisions typically occur
	 * above the inode in the B-Tree and are not covered by the
	 * ip->cursor_exclreq_count logic.
	 */
	if (error == 0) {
#if 0
		if (node->cursor_exclreq_count &&
		    cursor->trans->type != HAMMER_TRANS_FLS) {
			tsleep(&node->cursor_exclreq_count, 0,
			       "hstag2", hammer_tdmux_ticks);
		}
#endif
		hammer_lock_sh(&node->lock);
		KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);
		cursor->node = node;
		cursor->index = 0;
	}
	return(error);
}

/************************************************************************
 *				DEADLOCK RECOVERY			*
 ************************************************************************
 *
 * These are the new deadlock recovery functions.  Currently they are only
 * used for the mirror propagation and physical node removal cases but
 * ultimately the intention is to use them for all deadlock recovery
 * operations.
 *
 * WARNING!  The contents of the cursor may be modified while unlocked.
 *	     Passive modifications include adjusting the node, parent,
 *	     indexes, and leaf pointer.
 *
 *	     An outright removal of the element the cursor was pointing at
 *	     will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set,
 *	     which chains to causing the HAMMER_CURSOR_RETEST to be set
 *	     when the cursor is locked again.
 */
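/*
 * Illustrative sketch of how the tracking functions below are used around
 * a blocking operation (the blocking step is a placeholder):
 *
 *	hammer_unlock_cursor(cursor);		(cursor becomes tracked)
 *	... block on I/O or a lock ...
 *	error = hammer_lock_cursor(cursor);	(relock, reacquire the parent)
 *	if (cursor->flags & HAMMER_CURSOR_RETEST)
 *		... re-test the current element before continuing ...
 */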
void
hammer_unlock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;

	KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
	KKASSERT(cursor->node);

	/*
	 * Release the cursor's locks and track B-Tree operations on node.
	 * While being tracked our cursor can be modified by other threads
	 * and the node may be replaced.
	 */
	if (cursor->parent) {
		hammer_unlock(&cursor->parent->lock);
		hammer_rel_node(cursor->parent);
		cursor->parent = NULL;
	}
	node = cursor->node;
	cursor->flags |= HAMMER_CURSOR_TRACKED;
	TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry);
	hammer_unlock(&node->lock);
}

/*
 * Get the cursor heated up again.  The cursor's node may have
 * changed and we might have to locate the new parent.
 *
 * If the exact element we were on got deleted RIPOUT will be
 * set and we must clear ATEDISK so an iteration does not skip
 * the element after it.
 */
int
hammer_lock_cursor(hammer_cursor_t cursor)
{
	hammer_node_t node;
	int error;

	KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED);

	/*
	 * Relock the node
	 */
	for (;;) {
		node = cursor->node;
		hammer_ref_node(node);
		hammer_lock_sh(&node->lock);
		if (cursor->node == node) {
			hammer_rel_node(node);
			break;
		}
		hammer_unlock(&node->lock);
		hammer_rel_node(node);
	}

	/*
	 * Untrack the cursor, clean up, and re-establish the parent node.
	 */
	TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
	cursor->flags &= ~HAMMER_CURSOR_TRACKED;

	/*
	 * If a ripout has occurred iterations must re-test the (new)
	 * current element.  Clearing ATEDISK prevents the element from
	 * being skipped and RETEST causes it to be re-tested.
	 */
	if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
		cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		cursor->flags |= HAMMER_CURSOR_RETEST;
	}
	error = hammer_load_cursor_parent(cursor, 0);
	return(error);
}

/*
 * Recover from a deadlocked cursor, tracking any node removals or
 * replacements.  If the cursor's current node is removed by another
 * thread (via btree_remove()) the cursor will be seeked upwards.
 *
 * The caller is working a modifying operation and must be holding the
 * sync lock (shared).  We do not release the sync lock because this
 * would break atomicity.
 */
int
hammer_recover_cursor(hammer_cursor_t cursor)
{
	hammer_transaction_t trans __debugvar;
#if 0
	hammer_inode_t ip;
#endif
	int error;

	hammer_unlock_cursor(cursor);

#if 0
	ip = cursor->ip;
#endif
	trans = cursor->trans;
	KKASSERT(trans->sync_lock_refs > 0);

	/*
	 * Wait for the deadlock to clear.
	 *
	 * Maintain exclreq_count / wakeup as necessary to notify new
	 * entrants into ip.  We continue to hold the fs_token so our
	 * EDEADLK retry loop should get its chance before another thread
	 * steals the lock.
	 */
	if (cursor->deadlk_node) {
#if 0
		if (ip && trans->type == HAMMER_TRANS_FLS)
			++ip->cursor_exclreq_count;
		++cursor->deadlk_node->cursor_exclreq_count;
#endif
		hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
		hammer_unlock(&cursor->deadlk_node->lock);
#if 0
		if (--cursor->deadlk_node->cursor_exclreq_count == 0)
			wakeup(&cursor->deadlk_node->cursor_exclreq_count);
		if (ip && trans->type == HAMMER_TRANS_FLS) {
			if (--ip->cursor_exclreq_count == 0)
				wakeup(&ip->cursor_exclreq_count);
		}
#endif
		hammer_rel_node(cursor->deadlk_node);
		cursor->deadlk_node = NULL;
	}
	if (cursor->deadlk_rec) {
		hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
		hammer_rel_mem_record(cursor->deadlk_rec);
		cursor->deadlk_rec = NULL;
	}
	error = hammer_lock_cursor(cursor);
	return(error);
}

/*
 * Dup ocursor to ncursor.  ncursor inherits ocursor's locks and ocursor
 * is effectively unlocked and becomes tracked.  If ocursor was not locked
 * then ncursor also inherits the tracking.
 *
 * After the caller finishes working with ncursor it must be cleaned up
 * with hammer_done_cursor(), and the caller must re-lock ocursor.
 */
hammer_cursor_t
hammer_push_cursor(hammer_cursor_t ocursor)
{
	hammer_cursor_t ncursor;
	hammer_inode_t ip;
	hammer_node_t node;
	hammer_mount_t hmp;

	hmp = ocursor->trans->hmp;
	ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO);
	bcopy(ocursor, ncursor, sizeof(*ocursor));

	node = ocursor->node;
	hammer_ref_node(node);
	if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) {
		ocursor->flags |= HAMMER_CURSOR_TRACKED;
		TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry);
	}
	if (ncursor->parent)
		ocursor->parent = NULL;
	ocursor->data_buffer = NULL;
	ocursor->leaf = NULL;
	ocursor->data = NULL;
	if (ncursor->flags & HAMMER_CURSOR_TRACKED)
		TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry);
	if ((ip = ncursor->ip) != NULL) {
		++ip->cursor_ip_refs;
	}
	if (ncursor->iprec)
		hammer_ref(&ncursor->iprec->lock);
	return(ncursor);
}

/*
 * Destroy ncursor and restore ocursor
 *
 * This is a temporary hack for the release.  We can't afford to lose
 * the IP lock until the IP object scan code is able to deal with it,
 * so have ocursor inherit it back.
 */
void
hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor)
{
	hammer_mount_t hmp;
	hammer_inode_t ip;

	hmp = ncursor->trans->hmp;
	ip = ncursor->ip;
	ncursor->ip = NULL;
	if (ip)
		--ip->cursor_ip_refs;
	hammer_done_cursor(ncursor);
	kfree(ncursor, hmp->m_misc);
	KKASSERT(ocursor->ip == ip);
	hammer_lock_cursor(ocursor);
}

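/*
 * Illustrative sketch of the push/pop pairing (the work done on the pushed
 * cursor is a placeholder):
 *
 *	ncursor = hammer_push_cursor(ocursor);	(ocursor becomes tracked)
 *	... work with ncursor ...
 *	hammer_pop_cursor(ocursor, ncursor);	(ncursor freed, ocursor relocked)
 */
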
/*
 * onode is being replaced by nnode by the reblocking code.
 */
void
hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;

	while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) {
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		KKASSERT(cursor->node == onode);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = &nndisk->elms[cursor->index].leaf;
		cursor->node = nnode;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
	}
}

/*
 * We have removed <node> from the parent and collapsed the parent.
 *
 * Cursors in deadlock recovery are seeked upward to the parent so the
 * btree_remove() recursion works properly even though we have marked
 * the cursor as requiring a reseek.
 *
 * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK,
 * meaning the cursor is no longer definitively pointing at an element
 * within its iteration (if the cursor is being used to iterate).  The
 * iteration code will take this into account instead of asserting if the
 * cursor is outside the iteration range.
 */
void
hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	KKASSERT(parent != NULL);
	ondisk = node->ondisk;

	while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
		KKASSERT(cursor->node == node);
		KKASSERT(cursor->index == 0);
		TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = NULL;
		cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
		cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
		cursor->node = parent;
		cursor->index = index;
		hammer_ref_node(parent);
		hammer_rel_node(node);
	}
}

/*
 * node was split at (onode, index) with elements >= index moved to nnode.
 */
void
hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;

again:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index < index)
			continue;
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
			cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
		cursor->node = nnode;
		cursor->index -= index;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again;
	}
}

/*
 * An element was moved from one node to another or within a node.  The
 * index may also represent the end of the node (index == numelements).
 *
 * {oparent,pindex} is the parent node's pointer to onode/oindex.
 *
 * This is used by the rebalancing code.  This is not an insertion or
 * deletion and any additional elements, including the degenerate case at
 * the end of the node, will be dealt with by additional distinct calls.
 */
void
hammer_cursor_moved_element(hammer_node_t oparent, int pindex,
			    hammer_node_t onode, int oindex,
			    hammer_node_t nnode, int nindex)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;
	hammer_node_ondisk_t nndisk;

	/*
	 * Adjust any cursors pointing at the element
	 */
	ondisk = onode->ondisk;
	nndisk = nnode->ondisk;
again1:
	TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == onode);
		if (cursor->index != oindex)
			continue;
		TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		hammer_ref_node(nnode);
		hammer_rel_node(onode);
		goto again1;
	}

	/*
	 * When moving the first element of onode to a different node any
	 * cursor which is pointing at (oparent,pindex) must be repointed
	 * to nnode and ATEDISK must be cleared.
	 *
	 * This prevents cursors from losing track due to insertions.
	 * Insertions temporarily release the cursor in order to update
	 * the mirror_tids.  It primarily affects the mirror_write code.
	 * The other code paths generally only do a single insertion and
	 * then relookup or drop the cursor.
	 */
	if (onode == nnode || oindex)
		return;
	ondisk = oparent->ondisk;
again2:
	TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == oparent);
		if (cursor->index != pindex)
			continue;
		kprintf("HAMMER debug: shifted cursor pointing at parent\n"
			"parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n",
			(intmax_t)oparent->node_offset, pindex,
			(intmax_t)onode->node_offset, oindex,
			(intmax_t)nnode->node_offset, nindex);
		TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry);
		TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
		if (cursor->leaf == &ondisk->elms[oindex].leaf)
			cursor->leaf = &nndisk->elms[nindex].leaf;
		cursor->node = nnode;
		cursor->index = nindex;
		cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		hammer_ref_node(nnode);
		hammer_rel_node(oparent);
		goto again2;
	}
}

/*
 * The B-Tree element pointing to the specified node was moved from (oparent)
 * to (nparent, nindex).  We must locate any tracked cursors pointing at
 * node and adjust their parent accordingly.
 *
 * This is used by the rebalancing code when packing elements causes an
 * element to shift from one node to another.
 */
void
hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent,
			     hammer_node_t nparent, int nindex)
{
	hammer_cursor_t cursor;

again:
	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == node);
		if (cursor->parent == oparent) {
			cursor->parent = nparent;
			cursor->parent_index = nindex;
			hammer_ref_node(nparent);
			hammer_rel_node(oparent);
			goto again;
		}
	}
}

/*
 * Deleted element at (node, index)
 *
 * Shift indexes >= index
 */
void
hammer_cursor_deleted_element(hammer_node_t node, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	ondisk = node->ondisk;

	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == node);
		if (cursor->index == index) {
			cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
			if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
				cursor->leaf = NULL;
		} else if (cursor->index > index) {
			if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
				cursor->leaf = &ondisk->elms[cursor->index - 1].leaf;
			--cursor->index;
		}
	}
}

/*
 * Inserted element at (node, index)
 *
 * Shift indexes >= index
 */
void
hammer_cursor_inserted_element(hammer_node_t node, int index)
{
	hammer_cursor_t cursor;
	hammer_node_ondisk_t ondisk;

	ondisk = node->ondisk;

	TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
		KKASSERT(cursor->node == node);
		if (cursor->index >= index) {
			if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
				cursor->leaf = &ondisk->elms[cursor->index + 1].leaf;
			++cursor->index;
		}
	}
}

/*
 * Invalidate the cached data buffer associated with a cursor.
 *
 * This needs to be done when the underlying block is being freed, else the
 * referenced buffer can prevent the related buffer cache buffer from being
 * properly invalidated.
 */
void
hammer_cursor_invalidate_cache(hammer_cursor_t cursor)
{
	if (cursor->data_buffer) {
		hammer_rel_buffer(cursor->data_buffer, 0);
		cursor->data_buffer = NULL;
		cursor->data = NULL;
	}
}