1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. 
It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * hammer2_cluster_lock() 59 * hammer2_cluster_parent() 60 * 61 * - Most complex functions, quorum management on transaction ids. 62 * 63 * - Locking and data accesses must be internally asynchronous. 64 * 65 * - Validate and manage cache coherency primitives (cache state 66 * is stored in chain topologies but must be validated by these 67 * functions). 68 * 69 * (2) Lookups and Scans 70 * hammer2_cluster_lookup() 71 * hammer2_cluster_next() 72 * 73 * - Depend on locking & data retrieval functions, but still complex. 74 * 75 * - Must do quorum management on transaction ids. 76 * 77 * - Lookup and Iteration ops Must be internally asynchronous. 78 * 79 * (3) Modifying Operations 80 * hammer2_cluster_create() 81 * hammer2_cluster_rename() 82 * hammer2_cluster_delete() 83 * hammer2_cluster_modify() 84 * hammer2_cluster_modsync() 85 * 86 * - Can usually punt on failures, operation continues unless quorum 87 * is lost. If quorum is lost, must wait for resynchronization 88 * (depending on the management mode). 
89 * 90 * - Must disconnect node on failures (also not flush), remount, and 91 * resynchronize. 92 * 93 * - Network links (via kdmsg) are relatively easy to issue as the 94 * complex underworkings of hammer2_chain.c don't have to messed 95 * with (the protocol is at a higher level than block-level). 96 * 97 * - Multiple local disk nodes (i.e. block devices) are another matter. 98 * Chain operations have to be dispatched to per-node threads (xN) 99 * because we can't asynchronize potentially very complex chain 100 * operations in hammer2_chain.c (it would be a huge mess). 101 * 102 * (these threads are also used to terminate incoming kdmsg ops from 103 * other machines). 104 * 105 * - Single-node filesystems do not use threads and will simply call 106 * hammer2_chain.c functions directly. This short-cut is handled 107 * at the base of each cluster function. 108 */ 109 #include <sys/cdefs.h> 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/types.h> 113 #include <sys/lock.h> 114 #include <sys/uuid.h> 115 116 #include "hammer2.h" 117 118 /* 119 * Returns TRUE if any chain in the cluster needs to be resized. 120 */ 121 int 122 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) 123 { 124 hammer2_chain_t *chain; 125 int i; 126 127 for (i = 0; i < cluster->nchains; ++i) { 128 chain = cluster->array[i].chain; 129 if (chain && chain->bytes != bytes) 130 return 1; 131 } 132 return 0; 133 } 134 135 uint8_t 136 hammer2_cluster_type(hammer2_cluster_t *cluster) 137 { 138 return(cluster->focus->bref.type); 139 } 140 141 int 142 hammer2_cluster_modified(hammer2_cluster_t *cluster) 143 { 144 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 145 } 146 147 /* 148 * Return a bref representative of the cluster. Any data offset is removed 149 * (since it would only be applicable to a particular chain in the cluster). 150 * 151 * However, the radix portion of data_off is used for many purposes and will 152 * be retained. 
153 */ 154 void 155 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 156 { 157 *bref = cluster->focus->bref; 158 bref->data_off &= HAMMER2_OFF_MASK_RADIX; 159 } 160 161 /* 162 * Return non-zero if the chain representing an inode has been flagged 163 * as having been unlinked. Allows the vnode reclaim to avoid loading 164 * the inode data from disk e.g. when unmount or recycling old, clean 165 * vnodes. 166 */ 167 int 168 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster) 169 { 170 hammer2_chain_t *chain; 171 int flags; 172 int i; 173 174 flags = 0; 175 for (i = 0; i < cluster->nchains; ++i) { 176 chain = cluster->array[i].chain; 177 if (chain) 178 flags |= chain->flags; 179 } 180 return (flags & HAMMER2_CHAIN_UNLINKED); 181 } 182 183 void 184 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 185 { 186 hammer2_chain_t *chain; 187 int i; 188 189 for (i = 0; i < cluster->nchains; ++i) { 190 chain = cluster->array[i].chain; 191 if (chain) 192 atomic_set_int(&chain->flags, flags); 193 } 194 } 195 196 void 197 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 198 { 199 hammer2_chain_t *chain; 200 int i; 201 202 for (i = 0; i < cluster->nchains; ++i) { 203 chain = cluster->array[i].chain; 204 if (chain) 205 atomic_clear_int(&chain->flags, flags); 206 } 207 } 208 209 void 210 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 211 { 212 hammer2_chain_t *chain; 213 int i; 214 215 for (i = 0; i < cluster->nchains; ++i) { 216 chain = cluster->array[i].chain; 217 if (chain) 218 hammer2_chain_setflush(trans, chain); 219 } 220 } 221 222 void 223 hammer2_cluster_setmethod_check(hammer2_trans_t *trans, 224 hammer2_cluster_t *cluster, 225 int check_algo) 226 { 227 hammer2_chain_t *chain; 228 int i; 229 230 for (i = 0; i < cluster->nchains; ++i) { 231 chain = cluster->array[i].chain; 232 if (chain) { 233 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 234 chain->bref.methods &= 
~HAMMER2_ENC_CHECK(-1); 235 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); 236 } 237 } 238 } 239 240 /* 241 * Create a cluster with one ref from the specified chain. The chain 242 * is not further referenced. The caller typically supplies a locked 243 * chain and transfers ownership to the cluster. 244 * 245 * The returned cluster will be focused on the chain (strictly speaking, 246 * the focus should be NULL if the chain is not locked but we do not check 247 * for this condition). 248 * 249 * We fake the flags. 250 */ 251 hammer2_cluster_t * 252 hammer2_cluster_from_chain(hammer2_chain_t *chain) 253 { 254 hammer2_cluster_t *cluster; 255 256 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 257 cluster->array[0].chain = chain; 258 cluster->nchains = 1; 259 cluster->focus = chain; 260 cluster->pmp = chain->pmp; 261 cluster->refs = 1; 262 cluster->flags = HAMMER2_CLUSTER_LOCKED | 263 HAMMER2_CLUSTER_WRHARD | 264 HAMMER2_CLUSTER_RDHARD | 265 HAMMER2_CLUSTER_MSYNCED | 266 HAMMER2_CLUSTER_SSYNCED; 267 268 return cluster; 269 } 270 271 /* 272 * Add a reference to a cluster. 273 * 274 * We must also ref the underlying chains in order to allow ref/unlock 275 * sequences to later re-lock. 276 */ 277 void 278 hammer2_cluster_ref(hammer2_cluster_t *cluster) 279 { 280 hammer2_chain_t *chain; 281 int i; 282 283 atomic_add_int(&cluster->refs, 1); 284 for (i = 0; i < cluster->nchains; ++i) { 285 chain = cluster->array[i].chain; 286 if (chain) 287 hammer2_chain_ref(chain); 288 } 289 } 290 291 /* 292 * Drop the caller's reference to the cluster. When the ref count drops to 293 * zero this function frees the cluster and drops all underlying chains. 294 * 295 * In-progress read I/Os are typically detached from the cluster once the 296 * first one returns (the remaining stay attached to the DIOs but are then 297 * ignored and drop naturally). 
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_drop(chain);
			/*
			 * NOTE(review): refs is tested non-atomically here;
			 * this is only safe if the caller holds the last
			 * reference — TODO confirm against callers.
			 */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;
		}
	}
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;		/* safety XXX chg to assert */
		kfree(cluster, M_HAMMER2);
		/* cluster is invalid */
	}
}

/*
 * Briefly sleep on the cluster's focus chain (1 tick) to allow other
 * threads to make progress before the caller retries an operation.
 */
void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
	tsleep(cluster->focus, 0, "h2clcw", 1);
}

/*
 * Lock and ref a cluster.  This adds a ref to the cluster and its chains
 * and then locks them, modified by various RESOLVE flags.
 *
 * The act of locking a cluster sets its focus.
 *
 * The chains making up the cluster may be narrowed down based on quorum
 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
 * narrowed down to a single chain as long as the entire subtopology is known
 * to be intact.  So, for example, we can narrow a read-only op to a single
 * fast SLAVE but if we focus a CACHE chain we must still retain at least
 * a SLAVE to ensure that the subtopology can be accessed.
 *
 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
 * to be maintained once the topology is validated as-of the top level of
 * the operation.
 *
 * If a failure occurs the operation must be aborted by higher-level code and
 * retried. XXX
 */
void
hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
{
	hammer2_chain_t *chain;
	int i;

	/* cannot be on inode-embedded cluster template, must be on copy */
	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
		kprintf("hammer2_cluster_lock: cluster %p already locked!\n",
			cluster);
	} else {
		/* an unlocked cluster must not have a focus yet */
		KKASSERT(cluster->focus == NULL);
	}
	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

	/* NOREF means the caller is transfering an already-held ref to us */
	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
		atomic_add_int(&cluster->refs, 1);

	/*
	 * Lock chains and resolve state.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		hammer2_chain_lock(chain, how);
	}

	hammer2_cluster_resolve(cluster);
}

/*
 * Resolve the cluster's composite state from its underlying chains:
 * compute the quorum, select the focus chain, and update the cluster
 * flags/error accordingly.  Called with the chains locked.
 */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	int focus_pfs_type;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nslaves;
	int nquorum;
	int i;

	cluster->error = 0;

	quorum_tid = 0;
	focus_pfs_type = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;
	nmasters = 0;
	nslaves = 0;

	/*
	 * Calculate quorum
	 */
	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;

	/*
	 * Pass 1: count masters/slaves and determine the highest
	 * mirror_tid shared by the most masters (the quorum tid).
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			if (quorum_tid < chain->bref.mirror_tid ||
			    nmasters == 0) {
				nmasters = 1;
				quorum_tid = chain->bref.mirror_tid;
			} else if (quorum_tid == chain->bref.mirror_tid) {
				++nmasters;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.
			 */
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus = chain;
			cluster->error = chain->error;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2: choose the focus chain based on the quorum computed in
	 * pass 1.  Soft master/slave (directly mounted) override the
	 * normal master/slave focus selection.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the master mirror_tid must match
			 * the quorum's mirror_tid.
			 *
			 * Do not select an errored master.
			 */
			if (nmasters >= nquorum &&
			    chain->error == 0 &&
			    quorum_tid == chain->bref.mirror_tid) {
				nflags |= HAMMER2_CLUSTER_WRHARD;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (cluster->focus == NULL ||
				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
					cluster->focus = chain;
					cluster->error = chain->error;
				}
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave mirror_tid must match the
			 * quorum's mirror_tid.
			 *
			 * Do not select an errored slave.
			 */
			if (nmasters >= nquorum &&
			    chain->error == 0 &&
			    quorum_tid == chain->bref.mirror_tid) {
				++nslaves;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				if (cluster->focus == NULL) {
					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
					cluster->focus = chain;
					cluster->error = chain->error;
				}
			}
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
			cluster->focus = chain;
			cluster->error = chain->error;
			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one (but it loses to a soft master).
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
				cluster->focus = chain;
				cluster->error = chain->error;
				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
			}
			break;
		default:
			break;
		}
	}

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 *
	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
	 * to determine if reading or writing is possible.  If writing, the
	 * cluster still requires a call to hammer2_cluster_modify() first.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}

/*
 * Copy a cluster, returned a ref'd cluster.  All underlying chains
 * are also ref'd, but not locked.
 *
 * The cluster focus is not set because the cluster is not yet locked
 * (and the originating cluster does not have to be locked either).
 */
hammer2_cluster_t *
hammer2_cluster_copy(hammer2_cluster_t *ocluster)
{
	hammer2_pfs_t *pmp = ocluster->pmp;
	hammer2_cluster_t *ncluster;
	hammer2_chain_t *chain;
	int i;

	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
	ncluster->pmp = pmp;
	ncluster->nchains = ocluster->nchains;
	ncluster->refs = 1;
	ncluster->flags = 0;	/* cluster not locked */

	for (i = 0; i < ocluster->nchains; ++i) {
		chain = ocluster->array[i].chain;
		ncluster->array[i].chain = chain;
		if (chain)
			hammer2_chain_ref(chain);
	}
	return (ncluster);
}

/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) {
		kprintf("hammer2_cluster_unlock: cluster %p not locked\n",
			cluster);
	}
	/* KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); */
	KKASSERT(cluster->refs > 0);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_unlock(chain);
			/*
			 * NOTE(review): non-atomic refs test; safe only if
			 * the caller holds the last ref — TODO confirm.
			 */
			if (cluster->refs == 1)
				cluster->array[i].chain = NULL;	/* safety */
		}
	}
	cluster->focus = NULL;

	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}

/*
 * Resize the cluster's physical storage allocation in-place.  This may
 * replace the cluster's chains.
 */
void
hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int nradix, int flags)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
	KKASSERT(cparent->nchains == cluster->nchains);

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			/* each live chain must have a live parent element */
			KKASSERT(cparent->array[i].chain);
			hammer2_chain_resize(trans, ip,
					     cparent->array[i].chain, chain,
					     nradix, flags);
		}
	}
}

/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in chain is a localized copy of the chain previously acquired
 * when the inode was locked (and possibly replaced in the mean time), and
 * must also be updated.  In fact, we update it first and then synchronize
 * the inode's cluster cache.
 *
 * Returns a pointer to the writable inode data of the cluster's focus.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}

/*
 * Adjust the cluster's chains to allow modification and adjust the
 * focus.  Data will be accessible on return.
 *
 * If our focused master errors on modify, re-resolve the cluster to
 * try to select a different master.
 */
void
hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	int resolve_again;
	int i;

	resolve_again = 0;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain) {
			hammer2_chain_modify(trans, chain, flags);
			if (cluster->focus == chain &&
			    chain->error) {
				cluster->error = chain->error;
				resolve_again = 1;
			}
		}
	}
	if (resolve_again)
		hammer2_cluster_resolve(cluster);
}

/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i].chain;
		if (scan == NULL || scan == focus)
			continue;
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Indirect inode: copy only the meta-data
				 * portion, the u area holds blockrefs which
				 * differ per node.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through to full copy (DIRECTDATA inode) */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}

/*
 * Lookup initialization/completion API.  Returns a new, independently
 * locked cluster aliasing cparent's chains for use as a lookup base.
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
	hammer2_cluster_t *cluster;
	int i;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = cparent->pmp;			/* can be NULL */
	cluster->flags = 0;	/* cluster not locked (yet) */
	/* cluster->focus = NULL; already null */

	for (i = 0; i < cparent->nchains; ++i)
		cluster->array[i].chain = cparent->array[i].chain;
	cluster->nchains = cparent->nchains;

	/*
	 * Independently lock (this will also give cluster 1 ref)
	 */
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
					      HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	}
	return (cluster);
}

/*
 * Complete a lookup sequence started with hammer2_cluster_lookup_init().
 * NULL is tolerated.
 */
void
hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
{
	if (cparent)
		hammer2_cluster_unlock(cparent);
}

/*
 * Locate first match or overlap under parent, return a new cluster.
 * Returns NULL (after dropping the new cluster) if no chain matched.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		       hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_pfs_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int null_count;
	int bref_keybits;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
		cluster->flags |= HAMMER2_CLUSTER_LOCKED;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i].chain == NULL) {
			++null_count;
			continue;
		}
		chain = hammer2_chain_lookup(&cparent->array[i].chain,
					     &key_next,
					     key_beg, key_end,
					     &cparent->array[i].cache_index,
					     flags);
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			int ddflag = (chain->bref.type ==
				      HAMMER2_BREF_TYPE_INODE);

			/*
			 * Set default focus (first non-NULL result), and
			 * record its bref fields so later results can be
			 * asserted identical.
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				cluster->ddflag = ddflag;
				cluster->focus = chain;
			}

			/*
			 * Override default focus to follow the parent.
			 */
			if (cparent->focus == cparent->array[i].chain)
				cluster->focus = chain;

			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(cluster->ddflag == ddflag);
		}
		/* track the lowest next-key across all nodes */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;
	hammer2_cluster_resolve(cluster);

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}

/*
 * Locate next match or overlap under parent, replace cluster.
 * Returns NULL (after dropping the cluster) if no chain matched.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int null_count;
	int bref_keybits;
	int i;
	uint8_t bref_type;
	u_int bytes;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;
	cluster->ddflag = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i].chain;
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL) {
			/* parent element gone, release our chain element */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i].chain, chain,
					   &key_next, key_beg, key_end,
					   &cparent->array[i].cache_index,
					   flags);
		cluster->array[i].chain = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			int ddflag = (chain->bref.type ==
				      HAMMER2_BREF_TYPE_INODE);
			/* set default focus on first non-NULL result */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				cluster->ddflag = ddflag;
				cluster->focus = chain;
			}

			/*
			 * Override default focus to follow the parent.
			 */
			if (cparent->focus == cparent->array[i].chain)
				cluster->focus = chain;

			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(cluster->ddflag == ddflag);
		}
		/* track the lowest next-key across all nodes */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	cluster->nchains = i;
	hammer2_cluster_resolve(cluster);

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

/*
 * Create a new cluster using the specified key
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t **clusterp,
		       hammer2_key_t key, int keybits,
		       int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfs_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
		cluster->flags = HAMMER2_CLUSTER_LOCKED;
	}
	cluster->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
1001 */ 1002 for (i = 0; i < cparent->nchains; ++i) { 1003 if (*clusterp && cluster->array[i].chain == NULL) { 1004 continue; 1005 } 1006 error = hammer2_chain_create(trans, &cparent->array[i].chain, 1007 &cluster->array[i].chain, pmp, 1008 key, keybits, 1009 type, bytes, flags); 1010 KKASSERT(error == 0); 1011 if (cluster->focus == NULL) 1012 cluster->focus = cluster->array[i].chain; 1013 if (cparent->focus == cparent->array[i].chain) 1014 cluster->focus = cluster->array[i].chain; 1015 } 1016 cluster->nchains = i; 1017 *clusterp = cluster; 1018 hammer2_cluster_resolve(cluster); 1019 1020 return error; 1021 } 1022 1023 /* 1024 * Rename a cluster to a new parent. 1025 * 1026 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields 1027 * are used from a passed-in non-NULL bref pointer. All other fields 1028 * are extracted from the original chain for each chain in the 1029 * iteration. 1030 */ 1031 void 1032 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 1033 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1034 int flags) 1035 { 1036 hammer2_chain_t *chain; 1037 hammer2_blockref_t xbref; 1038 int i; 1039 1040 cluster->focus = NULL; 1041 cparent->focus = NULL; 1042 1043 for (i = 0; i < cluster->nchains; ++i) { 1044 chain = cluster->array[i].chain; 1045 if (chain) { 1046 if (bref) { 1047 xbref = chain->bref; 1048 xbref.key = bref->key; 1049 xbref.keybits = bref->keybits; 1050 hammer2_chain_rename(trans, &xbref, 1051 &cparent->array[i].chain, 1052 chain, flags); 1053 } else { 1054 hammer2_chain_rename(trans, NULL, 1055 &cparent->array[i].chain, 1056 chain, flags); 1057 } 1058 KKASSERT(cluster->array[i].chain == chain); /*remove*/ 1059 } 1060 } 1061 } 1062 1063 /* 1064 * Mark a cluster deleted 1065 */ 1066 void 1067 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 1068 hammer2_cluster_t *cluster, int flags) 1069 { 1070 hammer2_chain_t *chain; 1071 hammer2_chain_t *parent; 1072 int i; 1073 1074 
if (cparent == NULL) { 1075 kprintf("cparent is NULL\n"); 1076 return; 1077 } 1078 1079 for (i = 0; i < cluster->nchains; ++i) { 1080 parent = (i < cparent->nchains) ? 1081 cparent->array[i].chain : NULL; 1082 chain = cluster->array[i].chain; 1083 if (chain == NULL) 1084 continue; 1085 if (chain->parent != parent) { 1086 kprintf("hammer2_cluster_delete: parent " 1087 "mismatch chain=%p parent=%p against=%p\n", 1088 chain, chain->parent, parent); 1089 } else { 1090 hammer2_chain_delete(trans, parent, chain, flags); 1091 } 1092 } 1093 } 1094 1095 /* 1096 * Create a snapshot of the specified {parent, ochain} with the specified 1097 * label. The originating hammer2_inode must be exclusively locked for 1098 * safety. 1099 * 1100 * The ioctl code has already synced the filesystem. 1101 */ 1102 int 1103 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster, 1104 hammer2_ioc_pfs_t *pfs) 1105 { 1106 hammer2_dev_t *hmp; 1107 hammer2_cluster_t *ncluster; 1108 const hammer2_inode_data_t *ripdata; 1109 hammer2_inode_data_t *wipdata; 1110 hammer2_chain_t *nchain; 1111 hammer2_inode_t *nip; 1112 size_t name_len; 1113 hammer2_key_t lhc; 1114 struct vattr vat; 1115 #if 0 1116 uuid_t opfs_clid; 1117 #endif 1118 int error; 1119 int i; 1120 1121 kprintf("snapshot %s\n", pfs->name); 1122 1123 name_len = strlen(pfs->name); 1124 lhc = hammer2_dirhash(pfs->name, name_len); 1125 1126 /* 1127 * Get the clid 1128 */ 1129 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata; 1130 #if 0 1131 opfs_clid = ripdata->pfs_clid; 1132 #endif 1133 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */ 1134 1135 /* 1136 * Create the snapshot directory under the super-root 1137 * 1138 * Set PFS type, generate a unique filesystem id, and generate 1139 * a cluster id. Use the same clid when snapshotting a PFS root, 1140 * which theoretically allows the snapshot to be used as part of 1141 * the same cluster (perhaps as a cache). 
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster,
				   HAMMER2_INSERT_PFSROOT, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER;
		wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
		wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);

		/*
		 * Give the snapshot its own private cluster.  As a snapshot
		 * no further synchronization with the original cluster will
		 * be done.
		 */
#if 0
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);
#endif
		kern_uuidgen(&wipdata->pfs_clid, 1);

		/* Mark every chain of the new cluster as a PFS root */
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				hammer2_flush(trans, nchain);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}

/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.  The new parent's focus follows the child's focus
 * and the parent is always resolved.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	cparent = hammer2_cluster_copy(cluster);

	for (i = 0; i < cparent->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		/*
		 * Calculate parent for each element.  Old chain has an extra
		 * ref for cparent but the lock remains with cluster.
		 */
		chain = cparent->array[i].chain;
		if (chain == NULL)
			continue;
		while ((rchain = chain->parent) != NULL) {
			/*
			 * Lock-order dance: ref the parent, release the
			 * child lock, lock parent-then-child, drop the
			 * temporary ref, and re-verify that the parent
			 * linkage did not change while the child was
			 * unlocked.  Retry if it did.
			 */
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_drop(rchain);
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
		}
		if (cluster->focus == chain)
			cparent->focus = rchain;
		cparent->array[i].chain = rchain;
		hammer2_chain_drop(chain);
	}
	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
	hammer2_cluster_resolve(cparent);

	return cparent;
}

/************************************************************************
 *			      CLUSTER I/O				*
 ************************************************************************
 *
 *
 * WARNING! blockref[] array data is not universal.  These functions should
 *	    only be used to access universal data.
 *
 * NOTE!    The rdata call will wait for at least one of the chain I/Os to
 *	    complete if necessary.  The I/O's should have already been
 *	    initiated by the cluster_lock/chain_lock operation.
 *
 *	    The cluster must already be in a modified state before wdata
 *	    is called.  The data will already be available for this case.
 */
const hammer2_media_data_t *
hammer2_cluster_rdata(hammer2_cluster_t *cluster)
{
	/* Read access goes through the cluster's focus chain */
	return(cluster->focus->data);
}

hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
	/* Write access requires the cluster to already be modified */
	KKASSERT(hammer2_cluster_modified(cluster));
	return(cluster->focus->data);
}

/*
 * Load cluster data asynchronously with callback.
 *
 * The callback is made for the first validated data found, or NULL
 * if no valid data is available.
 *
 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
 *	 in the inode with an exclusive lock held), the chain structure may be
 *	 shared.
 */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
	hammer2_chain_t *chain;
	hammer2_iocb_t *iocb;
	hammer2_dev_t *hmp;
	hammer2_blockref_t *bref;
	int i;

	/*
	 * Try to find a chain whose data is already resolved.  If none can
	 * be found, start with the first chain.
	 */
	chain = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain && chain->data)
			break;
	}
	if (i == cluster->nchains) {
		/*
		 * NOTE(review): falls back to array[0] unconditionally;
		 * chain can be NULL here if the cluster has no usable
		 * first chain, which would fault on the chain->data test
		 * below -- verify callers guarantee at least one chain.
		 */
		chain = cluster->array[0].chain;
		i = 0;
	}

	/*
	 * Set up the embedded iocb describing this request.  lbase encodes
	 * the chain index selected above.
	 */
	iocb = &cluster->iocb;
	iocb->callback = callback;
	iocb->dio = NULL;	/* for already-validated case */
	iocb->cluster = cluster;
	iocb->chain = chain;
	iocb->ptr = ptr;
	iocb->lbase = (off_t)i;
	iocb->flags = 0;
	iocb->error = 0;

	/*
	 * Data already validated, no I/O needed; call back synchronously.
	 */
	if (chain->data) {
		callback(iocb);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 *
	 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
	 *	      matches hammer2_devblksize()?  Or does the freemap's
	 *	      pre-zeroing handle the case for us?
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

#if 0
	/* handled by callback? <- TODO XXX even needed for loads? */
	/*
	 * The getblk() optimization for a 100% overwrite can only be used
	 * if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		iocb->dio = dio;
		callback(iocb);
		return;
	}
#endif

	/*
	 * Otherwise issue a read; completion is delivered through the iocb.
	 */
	hammer2_adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}

/************************************************************************
 *			      NODE FAILURES				*
 ************************************************************************
 *
 * A node failure can occur for numerous reasons.
 *
 *	- A read I/O may fail
 *	- A write I/O may fail
 *	- An unexpected chain might be found (or be missing)
 *	- A node might disconnect temporarily and reconnect later
 *	  (for example, a USB stick could get pulled, or a node might
 *	  be programmatically disconnected).
 *	- A node might run out of space during a modifying operation.
1387 * 1388 * When a read failure or an unexpected chain state is found, the chain and 1389 * parent chain at the failure point for the nodes involved (the nodes 1390 * which we determine to be in error) are flagged as failed and removed 1391 * from the cluster. The node itself is allowed to remain active. The 1392 * highest common point (usually a parent chain) is queued to the 1393 * resynchronization thread for action. 1394 * 1395 * When a write I/O fails or a node runs out of space, we first adjust 1396 * as if a read failure occurs but we further disable flushes on the 1397 * ENTIRE node. Concurrent modifying transactions are allowed to complete 1398 * but any new modifying transactions will automatically remove the node 1399 * from consideration in all related cluster structures and not generate 1400 * any new modified chains. The ROOT chain for the failed node(s) is queued 1401 * to the resynchronization thread for action. 1402 * 1403 * A temporary disconnect is handled as if a write failure occurred. 1404 * 1405 * Any of these failures might or might not stall related high level VNOPS, 1406 * depending on what has failed, what nodes remain, the type of cluster, 1407 * and the operating state of the cluster. 1408 * 1409 * FLUSH ON WRITE-DISABLED NODES 1410 * 1411 * A flush on a write-disabled node is not allowed to write anything because 1412 * we cannot safely update the mirror_tid anywhere on the failed node. The 1413 * synchronization thread uses mirror_tid to calculate incremental resyncs. 1414 * Dirty meta-data related to the failed node is thrown away. 1415 * 1416 * Dirty buffer cache buffers and inodes are only thrown away if they can be 1417 * retired... that is, if the filesystem still has enough nodes to complete 1418 * the operation. 
1419 */ 1420 1421 /************************************************************************ 1422 * SYNCHRONIZATION THREAD * 1423 ************************************************************************ 1424 * 1425 * This thread is responsible for [re]synchronizing the cluster representing 1426 * a PFS. Any out-of-sync or failed node starts this thread on a 1427 * node-by-node basis when the failure is detected. 1428 * 1429 * Clusters needing resynchronization are queued at the highest point 1430 * where the parent on the failed node is still valid, or a special 1431 * incremental scan from the ROOT is queued if no parent exists. This 1432 * thread is also responsible for waiting for reconnections of the failed 1433 * node if the cause was due to a disconnect, and waiting for space to be 1434 * freed up if the cause was due to running out of space. 1435 * 1436 * If the cause is due to a node running out of space, this thread will also 1437 * remove older (unlocked) snapshots to make new space, recover space, and 1438 * then start resynchronization. 1439 * 1440 * Each resynchronization pass virtually snapshots the PFS on the good nodes 1441 * and synchronizes using that snapshot against the target node. This 1442 * ensures a consistent chain topology and also avoids interference between 1443 * the resynchronization thread and frontend operations. 1444 * 1445 * Since these are per-node threads it is possible to resynchronize several 1446 * nodes at once. 1447 */ 1448