1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * hammer2_cluster_lock() 59 * hammer2_cluster_parent() 60 * 61 * - Most complex functions, quorum management on transaction ids. 62 * 63 * - Locking and data accesses must be internally asynchronous. 64 * 65 * - Validate and manage cache coherency primitives (cache state 66 * is stored in chain topologies but must be validated by these 67 * functions). 68 * 69 * (2) Lookups and Scans 70 * hammer2_cluster_lookup() 71 * hammer2_cluster_next() 72 * 73 * - Depend on locking & data retrieval functions, but still complex. 74 * 75 * - Must do quorum management on transaction ids. 76 * 77 * - Lookup and Iteration ops Must be internally asynchronous. 
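 *
 *        A typical front-end scan built on these functions looks like the
 *        following sketch (flags, locking modes and error handling elided;
 *        resuming at key_next is the usual convention):
 *
 *              cparent = hammer2_cluster_lookup_init(..., 0);
 *              cluster = hammer2_cluster_lookup(cparent, &key_next,
 *                                               key_beg, key_end, 0);
 *              while (cluster) {
 *                      (process the cluster's focus)
 *                      cluster = hammer2_cluster_next(cparent, cluster,
 *                                                     &key_next, key_next,
 *                                                     key_end, 0);
 *              }
 *              hammer2_cluster_lookup_done(cparent);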
 *
 * (3) Modifying Operations
 *      hammer2_cluster_create()
 *      hammer2_cluster_rename()
 *      hammer2_cluster_delete()
 *      hammer2_cluster_modify()
 *      hammer2_cluster_modsync()
 *
 *      - Can usually punt on failures, operation continues unless quorum
 *        is lost.  If quorum is lost, must wait for resynchronization
 *        (depending on the management mode).
 *
 *      - Must disconnect node on failures (also not flush), remount, and
 *        resynchronize.
 *
 *      - Network links (via kdmsg) are relatively easy to issue as the
 *        complex underworkings of hammer2_chain.c don't have to be messed
 *        with (the protocol is at a higher level than block-level).
 *
 *      - Multiple local disk nodes (i.e. block devices) are another matter.
 *        Chain operations have to be dispatched to per-node threads (xN)
 *        because we can't asynchronize potentially very complex chain
 *        operations in hammer2_chain.c (it would be a huge mess).
 *
 *        (these threads are also used to terminate incoming kdmsg ops from
 *        other machines).
 *
 *      - Single-node filesystems do not use threads and will simply call
 *        hammer2_chain.c functions directly.  This short-cut is handled
 *        at the base of each cluster function.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Returns non-zero if any chain in the cluster needs to be resized.
 * Errored elements are not used in the calculation.
 */
int
hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
        for (i = 0; i < cluster->nchains; ++i) {
                if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
                        continue;
                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
                if (chain->error)
                        continue;
                if (chain->bytes != bytes)
                        return 1;
        }
        return 0;
}

/*
 * Returns the bref type of the cluster's focus.
 *
 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
 * The cluster must be locked.
 */
uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
        KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
        if (cluster->error == 0)
                return(cluster->focus->bref.type);
        return 0;
}

/*
 * Returns non-zero if the cluster's focus is flagged as being modified.
 *
 * If the cluster is errored, returns 0.
 */
int
hammer2_cluster_modified(hammer2_cluster_t *cluster)
{
        KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
        if (cluster->error == 0)
                return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
        return 0;
}

/*
 * Returns the bref of the cluster's focus, sans any data-offset information
 * (since offset information is per-node and wouldn't be useful).
 *
 * Callers use this function to access modify_tid, mirror_tid, type,
 * key, and keybits.
 *
 * If the cluster is errored, returns an empty bref.
 * The cluster must be locked.
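 *
 * For example (illustrative sketch):
 *
 *      hammer2_blockref_t bref;
 *
 *      hammer2_cluster_bref(cluster, &bref);
 *      if (bref.type == HAMMER2_BREF_TYPE_INODE)
 *              (bref.key, bref.keybits and bref.modify_tid are usable;
 *               bref.data_off has been zeroed and must not be used)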
181 */ 182 void 183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 184 { 185 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 186 if (cluster->error == 0) { 187 *bref = cluster->focus->bref; 188 bref->data_off = 0; 189 } else { 190 bzero(bref, sizeof(*bref)); 191 } 192 } 193 194 /* 195 * Return non-zero if the chain representing an inode has been flagged 196 * as having been unlinked. Allows the vnode reclaim to avoid loading 197 * the inode data from disk e.g. when unmount or recycling old, clean 198 * vnodes. 199 * 200 * The cluster does not need to be locked. 201 * The focus cannot be used since the cluster might not be locked. 202 */ 203 int 204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster) 205 { 206 hammer2_chain_t *chain; 207 int flags; 208 int i; 209 210 flags = 0; 211 for (i = 0; i < cluster->nchains; ++i) { 212 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 213 continue; 214 chain = cluster->array[i].chain; 215 if (chain) 216 flags |= chain->flags; 217 } 218 return (flags & HAMMER2_CHAIN_UNLINKED); 219 } 220 221 /* 222 * Set a bitmask of flags in all chains related to a cluster. 223 * The cluster should probably be locked. 224 * 225 * XXX Only operate on FEMOD elements? 226 */ 227 void 228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 229 { 230 hammer2_chain_t *chain; 231 int i; 232 233 for (i = 0; i < cluster->nchains; ++i) { 234 chain = cluster->array[i].chain; 235 if (chain) 236 atomic_set_int(&chain->flags, flags); 237 } 238 } 239 240 /* 241 * Set a bitmask of flags in all chains related to a cluster. 242 * The cluster should probably be locked. 243 * 244 * XXX Only operate on FEMOD elements? 245 */ 246 void 247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 248 { 249 hammer2_chain_t *chain; 250 int i; 251 252 for (i = 0; i < cluster->nchains; ++i) { 253 chain = cluster->array[i].chain; 254 if (chain) 255 atomic_clear_int(&chain->flags, flags); 256 } 257 } 258 259 /* 260 * Flag the cluster for flushing recursively up to the root. Despite the 261 * work it does, this is relatively benign. It just makes sure that the 262 * flusher has top-down visibility to this cluster. 263 * 264 * Errored chains are not flagged for flushing. 265 * 266 * The cluster should probably be locked. 267 */ 268 void 269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 270 { 271 hammer2_chain_t *chain; 272 int i; 273 274 for (i = 0; i < cluster->nchains; ++i) { 275 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 276 continue; 277 chain = cluster->array[i].chain; 278 if (chain == NULL) 279 continue; 280 if (chain->error) 281 continue; 282 hammer2_chain_setflush(trans, chain); 283 } 284 } 285 286 /* 287 * Set the check mode for the cluster. 288 * Errored elements of the cluster are ignored. 289 * 290 * The cluster must be locked and modified. 
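 *
 * Typical usage (sketch; check_algo is one of the HAMMER2_CHECK_* values
 * from the on-media format definitions):
 *
 *      hammer2_cluster_modify(trans, cluster, 0);
 *      hammer2_cluster_setmethod_check(trans, cluster, check_algo);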
 */
void
hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
                                hammer2_cluster_t *cluster,
                                int check_algo)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
        for (i = 0; i < cluster->nchains; ++i) {
                if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
                        cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
                        continue;
                }
                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
                if (chain->error)
                        continue;
                KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
                chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
                chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
        }
}

/*
 * Create a degenerate cluster with one ref from a single locked chain.
 * The returned cluster will be focused on the chain and inherit its
 * error state.
 *
 * The chain's lock and reference are transferred to the new cluster, so
 * the caller should not try to unlock the chain separately.
 *
 * We fake the flags.
 */
hammer2_cluster_t *
hammer2_cluster_from_chain(hammer2_chain_t *chain)
{
        hammer2_cluster_t *cluster;

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->array[0].chain = chain;
        cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
        cluster->nchains = 1;
        cluster->focus = chain;
        cluster->focus_index = 0;
        cluster->pmp = chain->pmp;
        cluster->refs = 1;
        cluster->error = chain->error;
        cluster->flags = HAMMER2_CLUSTER_LOCKED |
                         HAMMER2_CLUSTER_WRHARD |
                         HAMMER2_CLUSTER_RDHARD |
                         HAMMER2_CLUSTER_MSYNCED |
                         HAMMER2_CLUSTER_SSYNCED;

        return cluster;
}

/*
 * Add a reference to a cluster and its underlying chains.
 *
 * We must also ref the underlying chains in order to allow ref/unlock
 * sequences to later re-lock.
 */
void
hammer2_cluster_ref(hammer2_cluster_t *cluster)
{
        atomic_add_int(&cluster->refs, 1);
}

/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cluster->refs > 0);
        if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
                cluster->focus = NULL;          /* safety XXX chg to assert */
                cluster->focus_index = 0;

                for (i = 0; i < cluster->nchains; ++i) {
                        chain = cluster->array[i].chain;
                        if (chain) {
                                hammer2_chain_drop(chain);
                                cluster->array[i].chain = NULL; /* safety */
                        }
                }
                cluster->nchains = 0;   /* safety */

                kfree(cluster, M_HAMMER2);
                /* cluster is invalid */
        }
}

void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
        tsleep(cluster->focus, 0, "h2clcw", 1);
}

/*
 * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
 *
 * WARNING! This function expects the caller to handle resolution of the
 *          cluster.  We never re-resolve the cluster in this function,
 *          because it might be used to temporarily unlock/relock a cparent
 *          in an iteration or recursion, and the cparent's elements do not
 *          necessarily match.
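 *
 *          E.g. a temporary unlock/relock around blocking work (sketch; the
 *          caller must hold a ref across the sequence and decides whether
 *          re-resolution is needed afterwards):
 *
 *              hammer2_cluster_unlock(cparent);
 *              (... blocking work ...)
 *              hammer2_cluster_lock(cparent, HAMMER2_RESOLVE_ALWAYS);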
409 */ 410 void 411 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how) 412 { 413 hammer2_chain_t *chain; 414 int i; 415 416 /* cannot be on inode-embedded cluster template, must be on copy */ 417 KKASSERT(cluster->refs > 0); 418 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0); 419 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) { 420 panic("hammer2_cluster_lock: cluster %p already locked!\n", 421 cluster); 422 } 423 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 424 425 /* 426 * Lock chains and resolve state. 427 */ 428 for (i = 0; i < cluster->nchains; ++i) { 429 if (i == idx) 430 continue; 431 chain = cluster->array[i].chain; 432 if (chain == NULL) 433 continue; 434 hammer2_chain_lock(chain, how); 435 } 436 } 437 438 void 439 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 440 { 441 hammer2_cluster_lock_except(cluster, -1, how); 442 } 443 444 /* 445 * Calculate the clustering state for the cluster and set its focus. 446 * This routine must be called with care. For example, it should not 447 * normally be called after relocking a non-leaf cluster because parent 448 * clusters help iterations and each element might be at a slightly different 449 * indirect node (each node's topology is independently indexed). 450 * 451 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 452 * operations. Typically this is only set on a quorum of MASTERs or 453 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 454 * is present, this bit is *not* set on a quorum of MASTERs. The 455 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 456 * that create/modify/delete elements use it. 457 * 458 * The chains making up the cluster may be narrowed down based on quorum 459 * acceptability, and if RESOLVE_RDONLY is specified the chains can be 460 * narrowed down to a single chain as long as the entire subtopology is known 461 * to be intact. So, for example, we can narrow a read-only op to a single 462 * fast SLAVE but if we focus a CACHE chain we must still retain at least 463 * a SLAVE to ensure that the subtopology can be accessed. 464 * 465 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need 466 * to be maintained once the topology is validated as-of the top level of 467 * the operation. 468 * 469 * If a failure occurs the operation must be aborted by higher-level code and 470 * retried. XXX 471 */ 472 void 473 hammer2_cluster_resolve(hammer2_cluster_t *cluster) 474 { 475 hammer2_chain_t *chain; 476 hammer2_chain_t *focus; 477 hammer2_pfs_t *pmp; 478 hammer2_tid_t quorum_tid; 479 hammer2_tid_t last_best_quorum_tid; 480 int focus_pfs_type; 481 uint32_t nflags; 482 int ttlmasters; 483 int ttlslaves; 484 int nmasters; 485 int nslaves; 486 int nquorum; 487 int smpresent; 488 int i; 489 490 cluster->error = 0; 491 cluster->focus = NULL; 492 493 focus_pfs_type = 0; 494 nflags = 0; 495 ttlmasters = 0; 496 ttlslaves = 0; 497 nmasters = 0; 498 nslaves = 0; 499 500 /* 501 * Calculate quorum 502 */ 503 pmp = cluster->pmp; 504 KKASSERT(pmp != NULL || cluster->nchains == 0); 505 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 506 smpresent = 0; 507 508 /* 509 * Pass 1 510 * 511 * NOTE: A NULL chain is not necessarily an error, it could be 512 * e.g. a lookup failure or the end of an iteration. 513 * Process normally. 
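 *
 * Pass 1 counts the total masters and slaves (whether errored or not) and
 * folds in the SOFT_MASTER/SOFT_SLAVE/SUPROOT flags.  The quorum computed
 * above is a simple majority, e.g. pfs_nmasters == 3 gives nquorum == 2.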
514 */ 515 for (i = 0; i < cluster->nchains; ++i) { 516 chain = cluster->array[i].chain; 517 if (chain && chain->error) { 518 if (cluster->focus == NULL || cluster->focus == chain) { 519 /* error will be overridden by valid focus */ 520 cluster->error = chain->error; 521 } 522 523 /* 524 * Must count total masters and slaves whether the 525 * chain is errored or not. 526 */ 527 switch (cluster->pmp->pfs_types[i]) { 528 case HAMMER2_PFSTYPE_MASTER: 529 ++ttlmasters; 530 break; 531 case HAMMER2_PFSTYPE_SLAVE: 532 ++ttlslaves; 533 break; 534 } 535 continue; 536 } 537 switch (cluster->pmp->pfs_types[i]) { 538 case HAMMER2_PFSTYPE_MASTER: 539 ++ttlmasters; 540 break; 541 case HAMMER2_PFSTYPE_SLAVE: 542 ++ttlslaves; 543 break; 544 case HAMMER2_PFSTYPE_SOFT_MASTER: 545 nflags |= HAMMER2_CLUSTER_WRSOFT; 546 nflags |= HAMMER2_CLUSTER_RDSOFT; 547 smpresent = 1; 548 break; 549 case HAMMER2_PFSTYPE_SOFT_SLAVE: 550 nflags |= HAMMER2_CLUSTER_RDSOFT; 551 break; 552 case HAMMER2_PFSTYPE_SUPROOT: 553 /* 554 * Degenerate cluster representing the super-root 555 * topology on a single device. Fake stuff so 556 * cluster ops work as expected. 557 */ 558 nflags |= HAMMER2_CLUSTER_WRHARD; 559 nflags |= HAMMER2_CLUSTER_RDHARD; 560 cluster->focus_index = i; 561 cluster->focus = chain; 562 cluster->error = chain ? chain->error : 0; 563 break; 564 default: 565 break; 566 } 567 } 568 569 /* 570 * Pass 2 571 * 572 * Resolve masters. Calculate nmasters for the highest matching 573 * TID, if a quorum cannot be attained try the next lower matching 574 * TID until we exhaust TIDs. 575 * 576 * NOTE: A NULL chain is not necessarily an error, it could be 577 * e.g. a lookup failure or the end of an iteration. 578 * Process normally. 579 */ 580 last_best_quorum_tid = HAMMER2_TID_MAX; 581 quorum_tid = 0; /* fix gcc warning */ 582 583 while (nmasters < nquorum && last_best_quorum_tid != 0) { 584 nmasters = 0; 585 quorum_tid = 0; 586 587 for (i = 0; i < cluster->nchains; ++i) { 588 if (cluster->pmp->pfs_types[i] != 589 HAMMER2_PFSTYPE_MASTER) { 590 continue; 591 } 592 chain = cluster->array[i].chain; 593 594 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 595 /* 596 * Invalid as in unsynchronized, cannot be 597 * used to calculate the quorum. 598 */ 599 } else if (chain == NULL && quorum_tid == 0) { 600 /* 601 * NULL chain on master matches NULL chains 602 * on other masters. 603 */ 604 ++nmasters; 605 } else if (quorum_tid < last_best_quorum_tid && 606 chain != NULL && 607 (quorum_tid < chain->bref.modify_tid || 608 nmasters == 0)) { 609 /* 610 * Better TID located, reset nmasters count. 611 */ 612 nmasters = 1; 613 quorum_tid = chain->bref.modify_tid; 614 } else if (chain && 615 quorum_tid == chain->bref.modify_tid) { 616 /* 617 * TID matches current collection. 618 */ 619 ++nmasters; 620 } 621 } 622 if (nmasters >= nquorum) 623 break; 624 last_best_quorum_tid = quorum_tid; 625 } 626 627 /* 628 * Pass 3 629 * 630 * NOTE: A NULL chain is not necessarily an error, it could be 631 * e.g. a lookup failure or the end of an iteration. 632 * Process normally. 
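 *
 * Worked example: with three masters whose modify_tids are {5, 5, 4} and
 * nquorum == 2, Pass 2 settles on quorum_tid == 5 with nmasters == 2.  In
 * Pass 3 the two up-to-date masters become eligible for FEMOD (absent a
 * soft master) while the stale third one only contributes CLUSTER_UNHARD.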
633 */ 634 for (i = 0; i < cluster->nchains; ++i) { 635 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 636 chain = cluster->array[i].chain; 637 if (chain && chain->error) { 638 if (cluster->focus == NULL || cluster->focus == chain) { 639 /* error will be overridden by valid focus */ 640 cluster->error = chain->error; 641 } 642 continue; 643 } 644 645 switch (cluster->pmp->pfs_types[i]) { 646 case HAMMER2_PFSTYPE_MASTER: 647 /* 648 * We must have enough up-to-date masters to reach 649 * a quorum and the master modify_tid must match 650 * the quorum's modify_tid. 651 * 652 * Do not select an errored or out-of-sync master. 653 */ 654 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 655 nflags |= HAMMER2_CLUSTER_UNHARD; 656 } else if (nmasters >= nquorum && 657 (chain == NULL || chain->error == 0) && 658 ((chain == NULL && quorum_tid == 0) || 659 (chain != NULL && quorum_tid == 660 chain->bref.modify_tid))) { 661 nflags |= HAMMER2_CLUSTER_WRHARD; 662 nflags |= HAMMER2_CLUSTER_RDHARD; 663 if (!smpresent) { 664 cluster->array[i].flags |= 665 HAMMER2_CITEM_FEMOD; 666 } 667 if (cluster->focus == NULL || 668 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) { 669 focus_pfs_type = HAMMER2_PFSTYPE_MASTER; 670 cluster->focus_index = i; 671 cluster->focus = chain; /* NULL ok */ 672 cluster->error = chain ? chain->error : 673 0; 674 } 675 } else if (chain == NULL || chain->error == 0) { 676 nflags |= HAMMER2_CLUSTER_UNHARD; 677 } 678 break; 679 case HAMMER2_PFSTYPE_SLAVE: 680 /* 681 * We must have enough up-to-date masters to reach 682 * a quorum and the slave modify_tid must match the 683 * quorum's modify_tid. 684 * 685 * Do not select an errored slave. 686 */ 687 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 688 nflags |= HAMMER2_CLUSTER_UNHARD; 689 } else if (nmasters >= nquorum && 690 (chain == NULL || chain->error == 0) && 691 ((chain == NULL && quorum_tid == 0) || 692 (chain && quorum_tid == 693 chain->bref.modify_tid))) { 694 ++nslaves; 695 nflags |= HAMMER2_CLUSTER_RDHARD; 696 #if 0 697 /* XXX optimize for RESOLVE_RDONLY */ 698 if (cluster->focus == NULL) { 699 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE; 700 cluster->focus_index = i; 701 cluster->focus = chain; /* NULL ok */ 702 cluster->error = chain ? chain->error : 703 0; 704 } 705 #endif 706 } else if (chain == NULL || chain->error == 0) { 707 nflags |= HAMMER2_CLUSTER_UNSOFT; 708 } 709 break; 710 case HAMMER2_PFSTYPE_SOFT_MASTER: 711 /* 712 * Directly mounted soft master always wins. There 713 * should be only one. 714 */ 715 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER); 716 cluster->focus_index = i; 717 cluster->focus = chain; 718 cluster->error = chain ? chain->error : 0; 719 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER; 720 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 721 break; 722 case HAMMER2_PFSTYPE_SOFT_SLAVE: 723 /* 724 * Directly mounted soft slave always wins. There 725 * should be only one. 726 */ 727 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE); 728 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) { 729 cluster->focus_index = i; 730 cluster->focus = chain; 731 cluster->error = chain ? chain->error : 0; 732 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE; 733 } 734 break; 735 case HAMMER2_PFSTYPE_SUPROOT: 736 /* 737 * spmp (degenerate case) 738 */ 739 KKASSERT(i == 0); 740 cluster->focus_index = i; 741 cluster->focus = chain; 742 cluster->error = chain ? 
chain->error : 0; 743 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT; 744 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 745 break; 746 default: 747 break; 748 } 749 } 750 751 /* 752 * Focus now set, adjust ddflag. Skip this pass if the focus 753 * is bad or if we are at the PFS root (the bref won't match at 754 * the PFS root, obviously). 755 */ 756 focus = cluster->focus; 757 if (focus) { 758 cluster->ddflag = 759 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); 760 } else { 761 cluster->ddflag = 0; 762 goto skip4; 763 } 764 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 765 goto skip4; 766 767 /* 768 * Pass 4 769 * 770 * Validate the elements that were not marked invalid. They should 771 * match. 772 */ 773 for (i = 0; i < cluster->nchains; ++i) { 774 int ddflag; 775 776 chain = cluster->array[i].chain; 777 778 if (chain == NULL) 779 continue; 780 if (chain == focus) 781 continue; 782 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 783 continue; 784 785 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 786 if (chain->bref.type != focus->bref.type || 787 chain->bref.key != focus->bref.key || 788 chain->bref.keybits != focus->bref.keybits || 789 chain->bref.modify_tid != focus->bref.modify_tid || 790 chain->bytes != focus->bytes || 791 ddflag != cluster->ddflag) { 792 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 793 if (hammer2_debug & 1) 794 kprintf("cluster_resolve: matching modify_tid failed " 795 "bref test: idx=%d type=%02x/%02x " 796 "key=%016jx/%d-%016jx/%d " 797 "mod=%016jx/%016jx bytes=%u/%u\n", 798 i, 799 chain->bref.type, focus->bref.type, 800 chain->bref.key, chain->bref.keybits, 801 focus->bref.key, focus->bref.keybits, 802 chain->bref.modify_tid, focus->bref.modify_tid, 803 chain->bytes, focus->bytes); 804 if (hammer2_debug & 0x4000) 805 panic("cluster_resolve"); 806 /* flag issue and force resync? */ 807 } 808 } 809 skip4: 810 811 if (ttlslaves == 0) 812 nflags |= HAMMER2_CLUSTER_NOSOFT; 813 if (ttlmasters == 0) 814 nflags |= HAMMER2_CLUSTER_NOHARD; 815 816 /* 817 * Set SSYNCED or MSYNCED for slaves and masters respectively if 818 * all available nodes (even if 0 are available) are fully 819 * synchronized. This is used by the synchronization thread to 820 * determine if there is work it could potentially accomplish. 821 */ 822 if (nslaves == ttlslaves) 823 nflags |= HAMMER2_CLUSTER_SSYNCED; 824 if (nmasters == ttlmasters) 825 nflags |= HAMMER2_CLUSTER_MSYNCED; 826 827 /* 828 * Determine if the cluster was successfully locked for the 829 * requested operation and generate an error code. The cluster 830 * will not be locked (or ref'd) if an error is returned. 831 * 832 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok() 833 * to determine if reading or writing is possible. If writing, the 834 * cluster still requires a call to hammer2_cluster_modify() first. 835 */ 836 atomic_set_int(&cluster->flags, nflags); 837 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 838 } 839 840 /* 841 * This is used by the sync thread to force non-NULL elements of a copy 842 * of the pmp->iroot cluster to be good which is required to prime the 843 * sync. 844 */ 845 void 846 hammer2_cluster_forcegood(hammer2_cluster_t *cluster) 847 { 848 int i; 849 850 for (i = 0; i < cluster->nchains; ++i) { 851 if (cluster->array[i].chain) 852 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 853 } 854 } 855 856 /* 857 * Copy a cluster, returned a ref'd cluster. All underlying chains 858 * are also ref'd, but not locked. Focus state is also copied. 
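 *
 * Typical save/restore usage (sketch):
 *
 *      ncluster = hammer2_cluster_copy(ocluster);
 *      (... ocluster may be unlocked, relocked or iterated ...)
 *      hammer2_cluster_drop(ncluster);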
859 * 860 * Original cluster does not have to be locked but usually is. 861 * New cluster will not be flagged as locked. 862 * 863 * Callers using this function to initialize a new cluster from an inode 864 * generally lock and resolve the resulting cluster. 865 * 866 * Callers which use this function to save/restore a cluster structure 867 * generally retain the focus state and do not re-resolve it. Caller should 868 * not try to re-resolve internal (cparent) node state during an iteration 869 * as the individual tracking elements of cparent in an iteration may not 870 * match even though they are correct. 871 */ 872 hammer2_cluster_t * 873 hammer2_cluster_copy(hammer2_cluster_t *ocluster) 874 { 875 hammer2_pfs_t *pmp = ocluster->pmp; 876 hammer2_cluster_t *ncluster; 877 hammer2_chain_t *chain; 878 int i; 879 880 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 881 ncluster->pmp = pmp; 882 ncluster->nchains = ocluster->nchains; 883 ncluster->refs = 1; 884 885 for (i = 0; i < ocluster->nchains; ++i) { 886 chain = ocluster->array[i].chain; 887 ncluster->array[i].chain = chain; 888 ncluster->array[i].flags = ocluster->array[i].flags; 889 if (chain) 890 hammer2_chain_ref(chain); 891 } 892 ncluster->focus_index = ocluster->focus_index; 893 ncluster->focus = ocluster->focus; 894 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED | 895 HAMMER2_CLUSTER_INODE); 896 897 return (ncluster); 898 } 899 900 /* 901 * Unlock a cluster. Refcount and focus is maintained. 902 */ 903 void 904 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx) 905 { 906 hammer2_chain_t *chain; 907 int i; 908 909 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 910 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 911 cluster); 912 } 913 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 914 KKASSERT(cluster->refs > 0); 915 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 916 917 for (i = 0; i < cluster->nchains; ++i) { 918 if (i == idx) 919 continue; 920 chain = cluster->array[i].chain; 921 if (chain) 922 hammer2_chain_unlock(chain); 923 } 924 } 925 926 void 927 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 928 { 929 hammer2_cluster_unlock_except(cluster, -1); 930 } 931 932 /* 933 * Resize the cluster's physical storage allocation in-place. This may 934 * replace the cluster's chains. 935 */ 936 void 937 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 938 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 939 int nradix, int flags) 940 { 941 hammer2_chain_t *chain; 942 int i; 943 944 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ 945 KKASSERT(cparent->nchains == cluster->nchains); 946 947 for (i = 0; i < cluster->nchains; ++i) { 948 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 949 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 950 continue; 951 } 952 chain = cluster->array[i].chain; 953 if (chain) { 954 KKASSERT(cparent->array[i].chain); 955 hammer2_chain_resize(trans, ip, 956 cparent->array[i].chain, chain, 957 nradix, flags); 958 } 959 } 960 } 961 962 /* 963 * Set an inode's cluster modified, marking the related chains RW and 964 * duplicating them if necessary. 965 * 966 * The passed-in chain is a localized copy of the chain previously acquired 967 * when the inode was locked (and possilby replaced in the mean time), and 968 * must also be updated. In fact, we update it first and then synchronize 969 * the inode's cluster cache. 
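 *
 * Typical usage (sketch; the field assignment is a placeholder):
 *
 *      wipdata = hammer2_cluster_modify_ip(trans, ip, cluster, 0);
 *      wipdata->(inode fields) = ...;
 *      hammer2_cluster_modsync(cluster);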
970 */ 971 hammer2_inode_data_t * 972 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 973 hammer2_cluster_t *cluster, int flags) 974 { 975 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); 976 hammer2_cluster_modify(trans, cluster, flags); 977 978 hammer2_inode_repoint(ip, NULL, cluster); 979 if (ip->vp) 980 vsetisdirty(ip->vp); 981 return (&hammer2_cluster_wdata(cluster)->ipdata); 982 } 983 984 /* 985 * Adjust the cluster's chains to allow modification and adjust the 986 * focus. Data will be accessible on return. 987 * 988 * If our focused master errors on modify, re-resolve the cluster to 989 * try to select a different master. 990 */ 991 void 992 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster, 993 int flags) 994 { 995 hammer2_chain_t *chain; 996 int resolve_again; 997 int i; 998 999 resolve_again = 0; 1000 for (i = 0; i < cluster->nchains; ++i) { 1001 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1002 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1003 continue; 1004 } 1005 chain = cluster->array[i].chain; 1006 if (chain == NULL) 1007 continue; 1008 if (chain->error) 1009 continue; 1010 hammer2_chain_modify(trans, chain, flags); 1011 if (cluster->focus == chain && chain->error) { 1012 cluster->error = chain->error; 1013 resolve_again = 1; 1014 } 1015 } 1016 if (resolve_again) 1017 hammer2_cluster_resolve(cluster); 1018 } 1019 1020 /* 1021 * Synchronize modifications from the focus to other chains in a cluster. 1022 * Convenient because nominal API users can just modify the contents of the 1023 * focus (at least for non-blockref data). 1024 * 1025 * Nominal front-end operations only edit non-block-table data in a single 1026 * chain. This code copies such modifications to the other chains in the 1027 * cluster. Blocktable modifications are handled on a chain-by-chain basis 1028 * by both the frontend and the backend and will explode in fireworks if 1029 * blindly copied. 1030 */ 1031 void 1032 hammer2_cluster_modsync(hammer2_cluster_t *cluster) 1033 { 1034 hammer2_chain_t *focus; 1035 hammer2_chain_t *scan; 1036 const hammer2_inode_data_t *ripdata; 1037 hammer2_inode_data_t *wipdata; 1038 int i; 1039 1040 focus = cluster->focus; 1041 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED); 1042 1043 for (i = 0; i < cluster->nchains; ++i) { 1044 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 1045 continue; 1046 scan = cluster->array[i].chain; 1047 if (scan == NULL || scan == focus) 1048 continue; 1049 if (scan->error) 1050 continue; 1051 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED); 1052 KKASSERT(focus->bytes == scan->bytes && 1053 focus->bref.type == scan->bref.type); 1054 switch(focus->bref.type) { 1055 case HAMMER2_BREF_TYPE_INODE: 1056 ripdata = &focus->data->ipdata; 1057 wipdata = &scan->data->ipdata; 1058 if ((ripdata->op_flags & 1059 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1060 bcopy(ripdata, wipdata, 1061 offsetof(hammer2_inode_data_t, u)); 1062 break; 1063 } 1064 /* fall through to full copy */ 1065 case HAMMER2_BREF_TYPE_DATA: 1066 bcopy(focus->data, scan->data, focus->bytes); 1067 break; 1068 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1069 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1070 case HAMMER2_BREF_TYPE_FREEMAP: 1071 case HAMMER2_BREF_TYPE_VOLUME: 1072 panic("hammer2_cluster_modsync: illegal node type"); 1073 /* NOT REACHED */ 1074 break; 1075 default: 1076 panic("hammer2_cluster_modsync: unknown node type"); 1077 break; 1078 } 1079 } 1080 } 1081 1082 /* 1083 * Lookup initialization/completion API. 
Returns a locked, fully resolved 1084 * cluster with one ref. 1085 */ 1086 hammer2_cluster_t * 1087 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) 1088 { 1089 hammer2_cluster_t *cluster; 1090 1091 cluster = hammer2_cluster_copy(cparent); 1092 if (flags & HAMMER2_LOOKUP_SHARED) { 1093 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | 1094 HAMMER2_RESOLVE_SHARED); 1095 } else { 1096 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 1097 } 1098 hammer2_cluster_resolve(cluster); 1099 1100 return (cluster); 1101 } 1102 1103 void 1104 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) 1105 { 1106 if (cparent) { 1107 hammer2_cluster_unlock(cparent); 1108 hammer2_cluster_drop(cparent); 1109 } 1110 } 1111 1112 /* 1113 * Locate first match or overlap under parent, return a new, locked, resolved 1114 * cluster with one ref. 1115 * 1116 * Must never be called with HAMMER2_LOOKUP_MATCHIND. 1117 */ 1118 hammer2_cluster_t * 1119 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp, 1120 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 1121 { 1122 hammer2_pfs_t *pmp; 1123 hammer2_cluster_t *cluster; 1124 hammer2_chain_t *chain; 1125 hammer2_key_t key_accum; 1126 hammer2_key_t key_next; 1127 int null_count; 1128 int rflags; 1129 int i; 1130 1131 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0); 1132 1133 pmp = cparent->pmp; /* can be NULL */ 1134 key_accum = *key_nextp; 1135 null_count = 0; 1136 if (flags & HAMMER2_LOOKUP_SHARED) 1137 rflags = HAMMER2_RESOLVE_SHARED; 1138 else 1139 rflags = 0; 1140 1141 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 1142 cluster->pmp = pmp; /* can be NULL */ 1143 cluster->refs = 1; 1144 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1145 cluster->flags |= HAMMER2_CLUSTER_LOCKED; 1146 1147 /* 1148 * Iterating earlier cluster elements with later elements still 1149 * locked is a problem, so we have to unlock the parent and then 1150 * re-lock as we go. 1151 */ 1152 hammer2_cluster_unlock(cparent); 1153 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1154 1155 /* 1156 * Pass-1, issue lookups. 1157 */ 1158 for (i = 0; i < cparent->nchains; ++i) { 1159 cluster->array[i].flags = cparent->array[i].flags; 1160 key_next = *key_nextp; 1161 1162 /* 1163 * Always relock the parent as we go. 1164 */ 1165 if (cparent->array[i].chain) { 1166 hammer2_chain_lock(cparent->array[i].chain, rflags); 1167 } 1168 1169 /* 1170 * Nothing to base the lookup, or parent was not synchronized. 1171 */ 1172 if (cparent->array[i].chain == NULL || 1173 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) { 1174 ++null_count; 1175 continue; 1176 } 1177 1178 chain = hammer2_chain_lookup(&cparent->array[i].chain, 1179 &key_next, 1180 key_beg, key_end, 1181 &cparent->array[i].cache_index, 1182 flags); 1183 cluster->array[i].chain = chain; 1184 if (chain == NULL) { 1185 ++null_count; 1186 } 1187 if (key_accum > key_next) 1188 key_accum = key_next; 1189 } 1190 1191 /* 1192 * Cleanup 1193 */ 1194 cluster->nchains = i; 1195 *key_nextp = key_accum; 1196 1197 /* 1198 * The cluster must be resolved, out of sync elements may be present. 1199 * 1200 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL. 
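 *
 * That is, if every per-node lookup came up empty, or no focus could be
 * resolved and HAMMER2_LOOKUP_ALLNODES was not requested, the cluster is
 * unlocked (unless HAMMER2_LOOKUP_NOLOCK was specified), dropped, and NULL
 * is returned to the caller.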
1201 */ 1202 if (null_count != i) 1203 hammer2_cluster_resolve(cluster); 1204 if (null_count == i || 1205 (cluster->focus == NULL && 1206 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) { 1207 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1208 hammer2_cluster_unlock(cluster); 1209 hammer2_cluster_drop(cluster); 1210 cluster = NULL; 1211 } 1212 1213 return (cluster); 1214 } 1215 1216 /* 1217 * Locate next match or overlap under parent, replace the passed-in cluster. 1218 * The returned cluster is a new, locked, resolved cluster with one ref. 1219 * 1220 * Must never be called with HAMMER2_LOOKUP_MATCHIND. 1221 */ 1222 hammer2_cluster_t * 1223 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1224 hammer2_key_t *key_nextp, 1225 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 1226 { 1227 hammer2_chain_t *ochain; 1228 hammer2_chain_t *nchain; 1229 hammer2_key_t key_accum; 1230 hammer2_key_t key_next; 1231 int parent_index; 1232 int cluster_index; 1233 int null_count; 1234 int rflags; 1235 int i; 1236 1237 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0); 1238 1239 key_accum = *key_nextp; 1240 null_count = 0; 1241 parent_index = cparent->focus_index; /* save prior focus */ 1242 cluster_index = cluster->focus_index; 1243 if (flags & HAMMER2_LOOKUP_SHARED) 1244 rflags = HAMMER2_RESOLVE_SHARED; 1245 else 1246 rflags = 0; 1247 1248 cluster->focus = NULL; /* XXX needed any more? */ 1249 /*cparent->focus = NULL;*/ 1250 cluster->focus_index = 0; /* XXX needed any more? */ 1251 /*cparent->focus_index = 0;*/ 1252 1253 cluster->ddflag = 0; 1254 1255 /* 1256 * The parent is always locked on entry, the iterator may be locked 1257 * depending on flags. 1258 * 1259 * We must temporarily unlock the passed-in clusters to avoid a 1260 * deadlock between elements of the cluster with other threads. 1261 * We will fixup the lock in the loop. 1262 * 1263 * Note that this will clear the focus. 1264 * 1265 * Reflag the clusters as locked, because we will relock them 1266 * as we go. 1267 */ 1268 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) { 1269 hammer2_cluster_unlock(cluster); 1270 cluster->flags |= HAMMER2_CLUSTER_LOCKED; 1271 } 1272 hammer2_cluster_unlock(cparent); 1273 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1274 1275 for (i = 0; i < cparent->nchains; ++i) { 1276 key_next = *key_nextp; 1277 ochain = cluster->array[i].chain; 1278 1279 /* 1280 * Always relock the parent as we go. 1281 */ 1282 if (cparent->array[i].chain) 1283 hammer2_chain_lock(cparent->array[i].chain, rflags); 1284 1285 /* 1286 * Nothing to iterate from. These cases can occur under 1287 * normal operations. For example, during synchronization 1288 * a slave might reach the end of its scan while records 1289 * are still left on the master(s). 1290 */ 1291 if (ochain == NULL) { 1292 ++null_count; 1293 continue; 1294 } 1295 if (cparent->array[i].chain == NULL || 1296 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) || 1297 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) { 1298 /* ochain has not yet been relocked */ 1299 hammer2_chain_drop(ochain); 1300 cluster->array[i].chain = NULL; 1301 ++null_count; 1302 continue; 1303 } 1304 1305 /* 1306 * Relock the child if necessary. Parent and child will then 1307 * be locked as expected by hammer2_chain_next() and flags. 
1308 */ 1309 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1310 hammer2_chain_lock(ochain, rflags); 1311 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain, 1312 &key_next, key_beg, key_end, 1313 &cparent->array[i].cache_index, 1314 flags); 1315 /* ochain now invalid but can still be used for focus check */ 1316 if (parent_index == i) { 1317 cparent->focus_index = i; 1318 cparent->focus = cparent->array[i].chain; 1319 } 1320 1321 cluster->array[i].chain = nchain; 1322 if (nchain == NULL) { 1323 ++null_count; 1324 } 1325 if (key_accum > key_next) 1326 key_accum = key_next; 1327 } 1328 1329 /* 1330 * Cleanup 1331 */ 1332 cluster->nchains = i; 1333 *key_nextp = key_accum; 1334 1335 /* 1336 * The cluster must be resolved, out of sync elements may be present. 1337 * 1338 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL. 1339 */ 1340 if (null_count != i) 1341 hammer2_cluster_resolve(cluster); 1342 if (null_count == i || 1343 (cluster->focus == NULL && 1344 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) { 1345 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1346 hammer2_cluster_unlock(cluster); 1347 hammer2_cluster_drop(cluster); 1348 cluster = NULL; 1349 } 1350 return(cluster); 1351 } 1352 1353 /* 1354 * Advance just one chain in the cluster and recalculate the invalid bit. 1355 * The cluster index is allowed to be flagged invalid on input and is 1356 * recalculated on return. 1357 * 1358 * (used during synchronization to advance past a chain being deleted). 1359 * 1360 * The chain being advanced must not be the focus and the clusters in 1361 * question must have already passed normal cluster_lookup/cluster_next 1362 * checks. 1363 * 1364 * The cluster always remains intact on return, so void function. 1365 */ 1366 void 1367 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent, 1368 hammer2_cluster_t *cluster, 1369 hammer2_key_t *key_nextp, 1370 hammer2_key_t key_beg, 1371 hammer2_key_t key_end, 1372 int i, int flags) 1373 { 1374 hammer2_chain_t *ochain; 1375 hammer2_chain_t *nchain; 1376 hammer2_chain_t *focus; 1377 hammer2_key_t key_accum; 1378 hammer2_key_t key_next; 1379 int ddflag; 1380 1381 key_accum = *key_nextp; 1382 key_next = *key_nextp; 1383 ochain = cluster->array[i].chain; 1384 if (ochain == NULL) 1385 goto done; 1386 KKASSERT(ochain != cluster->focus); 1387 1388 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain, 1389 &key_next, key_beg, key_end, 1390 &cparent->array[i].cache_index, 1391 flags); 1392 /* ochain now invalid */ 1393 if (cparent->focus_index == i) 1394 cparent->focus = cparent->array[i].chain; 1395 1396 /* 1397 * Install nchain. Note that nchain can be NULL, and can also 1398 * be in an unlocked state depending on flags. 1399 */ 1400 cluster->array[i].chain = nchain; 1401 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1402 1403 if (key_accum > key_next) 1404 key_accum = key_next; 1405 1406 focus = cluster->focus; 1407 if (focus == NULL) 1408 goto done; 1409 if (nchain == NULL) 1410 goto done; 1411 #if 0 1412 if (nchain == focus) /* ASSERTED NOT TRUE */ 1413 ... 
1414 #endif 1415 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE); 1416 if (nchain->bref.type != focus->bref.type || 1417 nchain->bref.key != focus->bref.key || 1418 nchain->bref.keybits != focus->bref.keybits || 1419 nchain->bref.modify_tid != focus->bref.modify_tid || 1420 nchain->bytes != focus->bytes || 1421 ddflag != cluster->ddflag) { 1422 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1423 } 1424 1425 done: 1426 *key_nextp = key_accum; 1427 #if 0 1428 /* 1429 * For now don't re-resolve cluster->flags. 1430 */ 1431 hammer2_cluster_resolve(cluster); 1432 #endif 1433 } 1434 1435 /* 1436 * Create a new cluster using the specified key 1437 */ 1438 int 1439 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 1440 hammer2_cluster_t **clusterp, 1441 hammer2_key_t key, int keybits, 1442 int type, size_t bytes, int flags) 1443 { 1444 hammer2_cluster_t *cluster; 1445 hammer2_pfs_t *pmp; 1446 int error; 1447 int i; 1448 1449 pmp = trans->pmp; /* can be NULL */ 1450 1451 if ((cluster = *clusterp) == NULL) { 1452 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, 1453 M_WAITOK | M_ZERO); 1454 cluster->pmp = pmp; /* can be NULL */ 1455 cluster->refs = 1; 1456 cluster->flags = HAMMER2_CLUSTER_LOCKED; 1457 } 1458 cluster->focus_index = 0; 1459 cluster->focus = NULL; 1460 1461 /* 1462 * NOTE: cluster->array[] entries can initially be NULL. If 1463 * *clusterp is supplied, skip NULL entries, otherwise 1464 * create new chains. 1465 */ 1466 for (i = 0; i < cparent->nchains; ++i) { 1467 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1468 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1469 continue; 1470 } 1471 if (*clusterp) { 1472 if ((cluster->array[i].flags & 1473 HAMMER2_CITEM_FEMOD) == 0) { 1474 cluster->array[i].flags |= 1475 HAMMER2_CITEM_INVALID; 1476 continue; 1477 } 1478 if (cluster->array[i].chain == NULL) 1479 continue; 1480 } 1481 error = hammer2_chain_create(trans, &cparent->array[i].chain, 1482 &cluster->array[i].chain, pmp, 1483 key, keybits, 1484 type, bytes, flags); 1485 if (cparent->focus_index == i) 1486 cparent->focus = cparent->array[i].chain; 1487 KKASSERT(error == 0); 1488 if (cluster->focus == NULL) { 1489 cluster->focus_index = i; 1490 cluster->focus = cluster->array[i].chain; 1491 } 1492 if (cparent->focus == cparent->array[i].chain) { 1493 cluster->focus_index = i; 1494 cluster->focus = cluster->array[i].chain; 1495 } 1496 } 1497 cluster->nchains = i; 1498 *clusterp = cluster; 1499 hammer2_cluster_resolve(cluster); 1500 1501 return error; 1502 } 1503 1504 /* 1505 * Rename a cluster to a new parent. 1506 * 1507 * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(), 1508 * So the data_off field is not relevant. Only the key and 1509 * keybits are used. 
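 *
 * Typical usage when only the key changes (sketch; new_key is a
 * placeholder):
 *
 *      hammer2_cluster_bref(cluster, &bref);
 *      bref.key = new_key;
 *      hammer2_cluster_rename(trans, &bref, cparent, cluster, 0);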
1510 */ 1511 void 1512 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 1513 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1514 int flags) 1515 { 1516 hammer2_chain_t *chain; 1517 hammer2_blockref_t xbref; 1518 int i; 1519 1520 #if 0 1521 cluster->focus = NULL; 1522 cparent->focus = NULL; 1523 cluster->focus_index = 0; 1524 cparent->focus_index = 0; 1525 #endif 1526 1527 for (i = 0; i < cluster->nchains; ++i) { 1528 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1529 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1530 continue; 1531 } 1532 chain = cluster->array[i].chain; 1533 if (chain) { 1534 if (bref) { 1535 xbref = chain->bref; 1536 xbref.key = bref->key; 1537 xbref.keybits = bref->keybits; 1538 hammer2_chain_rename(trans, &xbref, 1539 &cparent->array[i].chain, 1540 chain, flags); 1541 } else { 1542 hammer2_chain_rename(trans, NULL, 1543 &cparent->array[i].chain, 1544 chain, flags); 1545 } 1546 if (cparent->focus_index == i) 1547 cparent->focus = cparent->array[i].chain; 1548 KKASSERT(cluster->array[i].chain == chain); /*remove*/ 1549 } 1550 } 1551 } 1552 1553 /* 1554 * Mark a cluster deleted 1555 */ 1556 void 1557 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 1558 hammer2_cluster_t *cluster, int flags) 1559 { 1560 hammer2_chain_t *chain; 1561 hammer2_chain_t *parent; 1562 int i; 1563 1564 if (cparent == NULL) { 1565 kprintf("cparent is NULL\n"); 1566 return; 1567 } 1568 1569 for (i = 0; i < cluster->nchains; ++i) { 1570 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1571 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1572 continue; 1573 } 1574 parent = cparent->array[i].chain; 1575 chain = cluster->array[i].chain; 1576 if (chain == NULL) 1577 continue; 1578 if (chain->parent != parent) { 1579 kprintf("hammer2_cluster_delete: parent " 1580 "mismatch chain=%p parent=%p against=%p\n", 1581 chain, chain->parent, parent); 1582 } else { 1583 hammer2_chain_delete(trans, parent, chain, flags); 1584 } 1585 } 1586 } 1587 1588 /* 1589 * Create a snapshot of the specified {parent, ochain} with the specified 1590 * label. The originating hammer2_inode must be exclusively locked for 1591 * safety. 1592 * 1593 * The ioctl code has already synced the filesystem. 1594 */ 1595 int 1596 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster, 1597 hammer2_ioc_pfs_t *pfs) 1598 { 1599 hammer2_dev_t *hmp; 1600 hammer2_cluster_t *ncluster; 1601 const hammer2_inode_data_t *ripdata; 1602 hammer2_inode_data_t *wipdata; 1603 hammer2_chain_t *nchain; 1604 hammer2_inode_t *nip; 1605 size_t name_len; 1606 hammer2_key_t lhc; 1607 struct vattr vat; 1608 #if 0 1609 uuid_t opfs_clid; 1610 #endif 1611 int error; 1612 int i; 1613 1614 kprintf("snapshot %s\n", pfs->name); 1615 1616 name_len = strlen(pfs->name); 1617 lhc = hammer2_dirhash(pfs->name, name_len); 1618 1619 /* 1620 * Get the clid 1621 */ 1622 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata; 1623 #if 0 1624 opfs_clid = ripdata->pfs_clid; 1625 #endif 1626 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */ 1627 1628 /* 1629 * Create the snapshot directory under the super-root 1630 * 1631 * Set PFS type, generate a unique filesystem id, and generate 1632 * a cluster id. Use the same clid when snapshotting a PFS root, 1633 * which theoretically allows the snapshot to be used as part of 1634 * the same cluster (perhaps as a cache). 1635 * 1636 * Copy the (flushed) blockref array. 
Theoretically we could use 1637 * chain_duplicate() but it becomes difficult to disentangle 1638 * the shared core so for now just brute-force it. 1639 */ 1640 VATTR_NULL(&vat); 1641 vat.va_type = VDIR; 1642 vat.va_mode = 0755; 1643 ncluster = NULL; 1644 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat, 1645 proc0.p_ucred, pfs->name, name_len, 1646 &ncluster, 1647 HAMMER2_INSERT_PFSROOT, &error); 1648 1649 if (nip) { 1650 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0); 1651 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER; 1652 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT; 1653 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT; 1654 kern_uuidgen(&wipdata->pfs_fsid, 1); 1655 1656 /* 1657 * Give the snapshot its own private cluster. As a snapshot 1658 * no further synchronization with the original cluster will 1659 * be done. 1660 */ 1661 #if 0 1662 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 1663 wipdata->pfs_clid = opfs_clid; 1664 else 1665 kern_uuidgen(&wipdata->pfs_clid, 1); 1666 #endif 1667 kern_uuidgen(&wipdata->pfs_clid, 1); 1668 1669 for (i = 0; i < ncluster->nchains; ++i) { 1670 if ((ncluster->array[i].flags & 1671 HAMMER2_CITEM_FEMOD) == 0) { 1672 ncluster->array[i].flags |= 1673 HAMMER2_CITEM_INVALID; 1674 continue; 1675 } 1676 nchain = ncluster->array[i].chain; 1677 if (nchain) 1678 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 1679 } 1680 #if 0 1681 /* XXX can't set this unless we do an explicit flush, which 1682 we also need a pmp assigned to do, else the flush code 1683 won't flush ncluster because it thinks it is crossing a 1684 flush boundary */ 1685 hammer2_cluster_set_chainflags(ncluster, 1686 HAMMER2_CHAIN_PFSBOUNDARY); 1687 #endif 1688 1689 /* XXX hack blockset copy */ 1690 /* XXX doesn't work with real cluster */ 1691 KKASSERT(ocluster->nchains == 1); 1692 wipdata->u.blockset = ripdata->u.blockset; 1693 hammer2_cluster_modsync(ncluster); 1694 for (i = 0; i < ncluster->nchains; ++i) { 1695 nchain = ncluster->array[i].chain; 1696 if (nchain) 1697 hammer2_flush(trans, nchain, 1); 1698 } 1699 hammer2_inode_unlock(nip, ncluster); 1700 } 1701 return (error); 1702 } 1703 1704 /* 1705 * Return locked parent cluster given a locked child. The child remains 1706 * locked on return. The new parent's focus follows the child's focus 1707 * and the parent is always resolved. 1708 * 1709 * We must temporarily unlock the passed-in cluster to avoid a deadlock 1710 * between elements of the cluster. 1711 * 1712 * We must not try to hammer2_cluster_resolve() cparent. The individual 1713 * parent chains for the nodes are the correct parents for the cluster but 1714 * do not necessarily match, so resolve would likely implode. 1715 */ 1716 hammer2_cluster_t * 1717 hammer2_cluster_parent(hammer2_cluster_t *cluster) 1718 { 1719 hammer2_cluster_t *cparent; 1720 int i; 1721 1722 cparent = hammer2_cluster_copy(cluster); 1723 hammer2_cluster_unlock(cluster); 1724 1725 for (i = 0; i < cparent->nchains; ++i) { 1726 hammer2_chain_t *chain; 1727 hammer2_chain_t *rchain; 1728 1729 /* 1730 * Calculate parent for each element. Old chain has an extra 1731 * ref for cparent but the lock remains with cluster. 
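 *
 * The ref/lock/recheck loop below closes the race against the chain being
 * repointed to a different parent while we block on the parent lock; if
 * chain->parent changed we unlock, drop, and retry with the new parent.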
1732 */ 1733 chain = cparent->array[i].chain; 1734 if (chain == NULL) 1735 continue; 1736 while ((rchain = chain->parent) != NULL) { 1737 hammer2_chain_ref(rchain); 1738 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS); 1739 if (chain->parent == rchain) 1740 break; 1741 hammer2_chain_unlock(rchain); 1742 hammer2_chain_drop(rchain); 1743 } 1744 cparent->array[i].chain = rchain; 1745 hammer2_chain_drop(chain); 1746 } 1747 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1748 /* hammer2_cluster_resolve(cparent); */ 1749 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 1750 1751 return cparent; 1752 } 1753 1754 /************************************************************************ 1755 * CLUSTER I/O * 1756 ************************************************************************ 1757 * 1758 * 1759 * WARNING! blockref[] array data is not universal. These functions should 1760 * only be used to access universal data. 1761 * 1762 * NOTE! The rdata call will wait for at least one of the chain I/Os to 1763 * complete if necessary. The I/O's should have already been 1764 * initiated by the cluster_lock/chain_lock operation. 1765 * 1766 * The cluster must already be in a modified state before wdata 1767 * is called. The data will already be available for this case. 1768 */ 1769 const hammer2_media_data_t * 1770 hammer2_cluster_rdata(hammer2_cluster_t *cluster) 1771 { 1772 return(cluster->focus->data); 1773 } 1774 1775 hammer2_media_data_t * 1776 hammer2_cluster_wdata(hammer2_cluster_t *cluster) 1777 { 1778 KKASSERT(hammer2_cluster_modified(cluster)); 1779 return(cluster->focus->data); 1780 } 1781 1782 /* 1783 * Load cluster data asynchronously with callback. 1784 * 1785 * The callback is made for the first validated data found, or NULL 1786 * if no valid data is available. 1787 * 1788 * NOTE! The cluster structure is either unique or serialized (e.g. embedded 1789 * in the inode with an exclusive lock held), the chain structure may be 1790 * shared. 1791 */ 1792 void 1793 hammer2_cluster_load_async(hammer2_cluster_t *cluster, 1794 void (*callback)(hammer2_iocb_t *iocb), void *ptr) 1795 { 1796 hammer2_chain_t *chain; 1797 hammer2_iocb_t *iocb; 1798 hammer2_dev_t *hmp; 1799 hammer2_blockref_t *bref; 1800 int i; 1801 1802 i = cluster->focus_index; 1803 chain = cluster->focus; 1804 1805 iocb = &cluster->iocb; 1806 iocb->callback = callback; 1807 iocb->dio = NULL; /* for already-validated case */ 1808 iocb->cluster = cluster; 1809 iocb->chain = chain; 1810 iocb->ptr = ptr; 1811 iocb->lbase = (off_t)i; 1812 iocb->flags = 0; 1813 iocb->error = 0; 1814 1815 /* 1816 * Data already validated 1817 */ 1818 if (chain->data) { 1819 callback(iocb); 1820 return; 1821 } 1822 1823 /* 1824 * We must resolve to a device buffer, either by issuing I/O or 1825 * by creating a zero-fill element. We do not mark the buffer 1826 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1827 * API must still be used to do that). 1828 * 1829 * The device buffer is variable-sized in powers of 2 down 1830 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1831 * chunk always contains buffers of the same size. (XXX) 1832 * 1833 * The minimum physical IO size may be larger than the variable 1834 * block size. 1835 * 1836 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes 1837 * matches hammer2_devblksize()? Or does the freemap's 1838 * pre-zeroing handle the case for us? 1839 */ 1840 bref = &chain->bref; 1841 hmp = chain->hmp; 1842 1843 #if 0 1844 /* handled by callback? 
<- TODO XXX even needed for loads? */ 1845 /* 1846 * The getblk() optimization for a 100% overwrite can only be used 1847 * if the physical block size matches the request. 1848 */ 1849 if ((chain->flags & HAMMER2_CHAIN_INITIAL) && 1850 chain->bytes == hammer2_devblksize(chain->bytes)) { 1851 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio); 1852 KKASSERT(error == 0); 1853 iocb->dio = dio; 1854 callback(iocb); 1855 return; 1856 } 1857 #endif 1858 1859 /* 1860 * Otherwise issue a read 1861 */ 1862 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1863 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb); 1864 } 1865
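
/*
 * Illustrative sketch only, kept under #if 0 so it is not compiled: a
 * minimal front-end scan driven by the lookup/next API above.  The function
 * name is hypothetical; error handling, transaction setup, and the
 * per-element work are elided.
 */
#if 0
static void
hammer2_cluster_example_scan(hammer2_cluster_t *cparent_in,
                             hammer2_key_t key_beg, hammer2_key_t key_end)
{
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_blockref_t bref;
        hammer2_key_t key_next;

        /* Lock and resolve a private copy of the parent for the scan */
        cparent = hammer2_cluster_lookup_init(cparent_in,
                                              HAMMER2_LOOKUP_SHARED);

        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         key_beg, key_end,
                                         HAMMER2_LOOKUP_SHARED);
        while (cluster) {
                /*
                 * Only universal (non-blockref) data should be accessed
                 * through the focus; the bref copy is safe for keys/tids.
                 */
                hammer2_cluster_bref(cluster, &bref);
                /* ... process the element identified by bref.key ... */

                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                               key_next, key_end,
                                               HAMMER2_LOOKUP_SHARED);
        }
        hammer2_cluster_lookup_done(cparent);
}
#endif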