1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * hammer2_cluster_lock() 59 * hammer2_cluster_parent() 60 * 61 * - Most complex functions, quorum management on transaction ids. 62 * 63 * - Locking and data accesses must be internally asynchronous. 64 * 65 * - Validate and manage cache coherency primitives (cache state 66 * is stored in chain topologies but must be validated by these 67 * functions). 68 * 69 * (2) Lookups and Scans 70 * hammer2_cluster_lookup() 71 * hammer2_cluster_next() 72 * 73 * - Depend on locking & data retrieval functions, but still complex. 74 * 75 * - Must do quorum management on transaction ids. 76 * 77 * - Lookup and Iteration ops Must be internally asynchronous. 78 * 79 * (3) Modifying Operations 80 * hammer2_cluster_create() 81 * hammer2_cluster_rename() 82 * hammer2_cluster_delete() 83 * hammer2_cluster_modify() 84 * hammer2_cluster_modsync() 85 * 86 * - Can usually punt on failures, operation continues unless quorum 87 * is lost. If quorum is lost, must wait for resynchronization 88 * (depending on the management mode). 89 * 90 * - Must disconnect node on failures (also not flush), remount, and 91 * resynchronize. 92 * 93 * - Network links (via kdmsg) are relatively easy to issue as the 94 * complex underworkings of hammer2_chain.c don't have to messed 95 * with (the protocol is at a higher level than block-level). 96 * 97 * - Multiple local disk nodes (i.e. block devices) are another matter. 98 * Chain operations have to be dispatched to per-node threads (xN) 99 * because we can't asynchronize potentially very complex chain 100 * operations in hammer2_chain.c (it would be a huge mess). 101 * 102 * (these threads are also used to terminate incoming kdmsg ops from 103 * other machines). 104 * 105 * - Single-node filesystems do not use threads and will simply call 106 * hammer2_chain.c functions directly. This short-cut is handled 107 * at the base of each cluster function. 108 */ 109 #include <sys/cdefs.h> 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/types.h> 113 #include <sys/lock.h> 114 #include <sys/uuid.h> 115 116 #include "hammer2.h" 117 118 /* 119 * Returns non-zero if any chain in the cluster needs to be resized. 120 * Errored elements are not used in the calculation. 121 */ 122 int 123 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) 124 { 125 hammer2_chain_t *chain; 126 int i; 127 128 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 129 for (i = 0; i < cluster->nchains; ++i) { 130 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 131 continue; 132 chain = cluster->array[i].chain; 133 if (chain == NULL) 134 continue; 135 if (chain->error) 136 continue; 137 if (chain->bytes != bytes) 138 return 1; 139 } 140 return 0; 141 } 142 143 /* 144 * Returns the bref type of the cluster's foucs. 145 * 146 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0). 147 * The cluster must be locked. 148 */ 149 uint8_t 150 hammer2_cluster_type(hammer2_cluster_t *cluster) 151 { 152 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 153 if (cluster->error == 0) 154 return(cluster->focus->bref.type); 155 return 0; 156 } 157 158 /* 159 * Returns non-zero if the cluster's focus is flagged as being modified. 160 * 161 * If the cluster is errored, returns 0. 162 */ 163 int 164 hammer2_cluster_modified(hammer2_cluster_t *cluster) 165 { 166 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 167 if (cluster->error == 0) 168 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 169 return 0; 170 } 171 172 /* 173 * Returns the bref of the cluster's focus, sans any data-offset information 174 * (since offset information is per-node and wouldn't be useful). 175 * 176 * Callers use this function to access modify_tid, mirror_tid, type, 177 * key, and keybits. 178 * 179 * If the cluster is errored, returns an empty bref. 180 * The cluster must be locked. 181 */ 182 void 183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 184 { 185 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 186 if (cluster->error == 0) { 187 *bref = cluster->focus->bref; 188 bref->data_off = 0; 189 } else { 190 bzero(bref, sizeof(*bref)); 191 } 192 } 193 194 /* 195 * Return non-zero if the chain representing an inode has been flagged 196 * as having been unlinked. Allows the vnode reclaim to avoid loading 197 * the inode data from disk e.g. when unmount or recycling old, clean 198 * vnodes. 199 * 200 * The cluster does not need to be locked. 201 * The focus cannot be used since the cluster might not be locked. 202 */ 203 int 204 hammer2_cluster_isunlinked(hammer2_cluster_t *cluster) 205 { 206 hammer2_chain_t *chain; 207 int flags; 208 int i; 209 210 flags = 0; 211 for (i = 0; i < cluster->nchains; ++i) { 212 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 213 continue; 214 chain = cluster->array[i].chain; 215 if (chain) 216 flags |= chain->flags; 217 } 218 return (flags & HAMMER2_CHAIN_UNLINKED); 219 } 220 221 /* 222 * Set a bitmask of flags in all chains related to a cluster. 223 * The cluster should probably be locked. 224 * 225 * XXX Only operate on FEMOD elements? 226 */ 227 void 228 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 229 { 230 hammer2_chain_t *chain; 231 int i; 232 233 for (i = 0; i < cluster->nchains; ++i) { 234 chain = cluster->array[i].chain; 235 if (chain) 236 atomic_set_int(&chain->flags, flags); 237 } 238 } 239 240 /* 241 * Set a bitmask of flags in all chains related to a cluster. 242 * The cluster should probably be locked. 243 * 244 * XXX Only operate on FEMOD elements? 245 */ 246 void 247 hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 248 { 249 hammer2_chain_t *chain; 250 int i; 251 252 for (i = 0; i < cluster->nchains; ++i) { 253 chain = cluster->array[i].chain; 254 if (chain) 255 atomic_clear_int(&chain->flags, flags); 256 } 257 } 258 259 /* 260 * Flag the cluster for flushing recursively up to the root. Despite the 261 * work it does, this is relatively benign. It just makes sure that the 262 * flusher has top-down visibility to this cluster. 263 * 264 * Errored chains are not flagged for flushing. 265 * 266 * The cluster should probably be locked. 267 */ 268 void 269 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 270 { 271 hammer2_chain_t *chain; 272 int i; 273 274 for (i = 0; i < cluster->nchains; ++i) { 275 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 276 continue; 277 chain = cluster->array[i].chain; 278 if (chain == NULL) 279 continue; 280 if (chain->error) 281 continue; 282 hammer2_chain_setflush(trans, chain); 283 } 284 } 285 286 /* 287 * Set the check mode for the cluster. 288 * Errored elements of the cluster are ignored. 289 * 290 * The cluster must be locked and modified. 291 */ 292 void 293 hammer2_cluster_setmethod_check(hammer2_trans_t *trans, 294 hammer2_cluster_t *cluster, 295 int check_algo) 296 { 297 hammer2_chain_t *chain; 298 int i; 299 300 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 301 for (i = 0; i < cluster->nchains; ++i) { 302 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 303 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 304 continue; 305 } 306 chain = cluster->array[i].chain; 307 if (chain == NULL) 308 continue; 309 if (chain->error) 310 continue; 311 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 312 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1); 313 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); 314 } 315 } 316 317 /* 318 * Create a degenerate cluster with one ref from a single locked chain. 319 * The returned cluster will be focused on the chain and inherit its 320 * error state. 321 * 322 * The chain's lock and reference are transfered to the new cluster, so 323 * the caller should not try to unlock the chain separately. 324 * 325 * We fake the flags. 326 */ 327 hammer2_cluster_t * 328 hammer2_cluster_from_chain(hammer2_chain_t *chain) 329 { 330 hammer2_cluster_t *cluster; 331 332 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 333 cluster->array[0].chain = chain; 334 cluster->array[0].flags = HAMMER2_CITEM_FEMOD; 335 cluster->nchains = 1; 336 cluster->focus = chain; 337 cluster->focus_index = 0; 338 cluster->pmp = chain->pmp; 339 cluster->refs = 1; 340 cluster->error = chain->error; 341 cluster->flags = HAMMER2_CLUSTER_LOCKED | 342 HAMMER2_CLUSTER_WRHARD | 343 HAMMER2_CLUSTER_RDHARD | 344 HAMMER2_CLUSTER_MSYNCED | 345 HAMMER2_CLUSTER_SSYNCED; 346 347 return cluster; 348 } 349 350 /* 351 * Add a reference to a cluster and its underlying chains. 352 * 353 * We must also ref the underlying chains in order to allow ref/unlock 354 * sequences to later re-lock. 355 */ 356 void 357 hammer2_cluster_ref(hammer2_cluster_t *cluster) 358 { 359 atomic_add_int(&cluster->refs, 1); 360 #if 0 361 hammer2_chain_t *chain; 362 int i; 363 364 for (i = 0; i < cluster->nchains; ++i) { 365 chain = cluster->array[i].chain; 366 if (chain) 367 hammer2_chain_ref(chain); 368 } 369 #endif 370 } 371 372 /* 373 * Drop the caller's reference to the cluster. When the ref count drops to 374 * zero this function frees the cluster and drops all underlying chains. 375 * 376 * In-progress read I/Os are typically detached from the cluster once the 377 * first one returns (the remaining stay attached to the DIOs but are then 378 * ignored and drop naturally). 379 */ 380 void 381 hammer2_cluster_drop(hammer2_cluster_t *cluster) 382 { 383 hammer2_chain_t *chain; 384 int i; 385 386 KKASSERT(cluster->refs > 0); 387 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 388 cluster->focus = NULL; /* safety XXX chg to assert */ 389 cluster->focus_index = 0; 390 391 for (i = 0; i < cluster->nchains; ++i) { 392 chain = cluster->array[i].chain; 393 if (chain) { 394 hammer2_chain_drop(chain); 395 cluster->array[i].chain = NULL; /* safety */ 396 } 397 } 398 cluster->nchains = 0; /* safety */ 399 400 kfree(cluster, M_HAMMER2); 401 /* cluster is invalid */ 402 } 403 } 404 405 void 406 hammer2_cluster_wait(hammer2_cluster_t *cluster) 407 { 408 tsleep(cluster->focus, 0, "h2clcw", 1); 409 } 410 411 /* 412 * Lock and ref a cluster. This adds a ref to the cluster and its chains 413 * and then locks them, modified by various RESOLVE flags. 414 * 415 * The act of locking a cluster sets its focus. Note that cluster elements 416 * flagged with HAMMER2_CITEM_INVALID cannot be set as a focus. Locking a 417 * cluster does not adjust this flag since exact matches only matter for leafs 418 * (parents can depend on minor differences in topology). 419 * 420 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 421 * operations. Typically this is only set on a quorum of MASTERs or 422 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 423 * is present, this bit is *not* set on a quorum of MASTERs. The 424 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 425 * that create/modify/delete elements use it. 426 * 427 * The chains making up the cluster may be narrowed down based on quorum 428 * acceptability, and if RESOLVE_RDONLY is specified the chains can be 429 * narrowed down to a single chain as long as the entire subtopology is known 430 * to be intact. So, for example, we can narrow a read-only op to a single 431 * fast SLAVE but if we focus a CACHE chain we must still retain at least 432 * a SLAVE to ensure that the subtopology can be accessed. 433 * 434 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need 435 * to be maintained once the topology is validated as-of the top level of 436 * the operation. 437 * 438 * If a failure occurs the operation must be aborted by higher-level code and 439 * retried. XXX 440 */ 441 void 442 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 443 { 444 hammer2_chain_t *chain; 445 int i; 446 447 /* cannot be on inode-embedded cluster template, must be on copy */ 448 KKASSERT(cluster->refs > 0); 449 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0); 450 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) { 451 panic("hammer2_cluster_lock: cluster %p already locked!\n", 452 cluster); 453 } else { 454 KKASSERT(cluster->focus == NULL); 455 } 456 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 457 458 /* 459 * Lock chains and resolve state. 460 */ 461 for (i = 0; i < cluster->nchains; ++i) { 462 chain = cluster->array[i].chain; 463 if (chain == NULL) 464 continue; 465 hammer2_chain_lock(chain, how); 466 } 467 468 hammer2_cluster_resolve(cluster); 469 } 470 471 void 472 hammer2_cluster_resolve(hammer2_cluster_t *cluster) 473 { 474 hammer2_chain_t *chain; 475 hammer2_pfs_t *pmp; 476 hammer2_tid_t quorum_tid; 477 hammer2_tid_t last_best_quorum_tid; 478 int focus_pfs_type; 479 uint32_t nflags; 480 int ttlmasters; 481 int ttlslaves; 482 int nmasters; 483 int nslaves; 484 int nquorum; 485 int smpresent; 486 int i; 487 488 cluster->error = 0; 489 cluster->focus = NULL; 490 491 focus_pfs_type = 0; 492 nflags = 0; 493 ttlmasters = 0; 494 ttlslaves = 0; 495 nmasters = 0; 496 nslaves = 0; 497 498 /* 499 * Calculate quorum 500 */ 501 pmp = cluster->pmp; 502 KKASSERT(pmp != NULL || cluster->nchains == 0); 503 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 504 smpresent = 0; 505 506 /* 507 * Pass 1 508 * 509 * NOTE: A NULL chain is not necessarily an error, it could be 510 * e.g. a lookup failure or the end of an iteration. 511 * Process normally. 512 */ 513 for (i = 0; i < cluster->nchains; ++i) { 514 chain = cluster->array[i].chain; 515 if (chain && chain->error) { 516 if (cluster->focus == NULL || cluster->focus == chain) { 517 /* error will be overridden by valid focus */ 518 cluster->error = chain->error; 519 } 520 521 /* 522 * Must count total masters and slaves whether the 523 * chain is errored or not. 524 */ 525 switch (cluster->pmp->pfs_types[i]) { 526 case HAMMER2_PFSTYPE_MASTER: 527 ++ttlmasters; 528 break; 529 case HAMMER2_PFSTYPE_SLAVE: 530 ++ttlslaves; 531 break; 532 } 533 continue; 534 } 535 switch (cluster->pmp->pfs_types[i]) { 536 case HAMMER2_PFSTYPE_MASTER: 537 ++ttlmasters; 538 break; 539 case HAMMER2_PFSTYPE_SLAVE: 540 ++ttlslaves; 541 break; 542 case HAMMER2_PFSTYPE_SOFT_MASTER: 543 nflags |= HAMMER2_CLUSTER_WRSOFT; 544 nflags |= HAMMER2_CLUSTER_RDSOFT; 545 smpresent = 1; 546 break; 547 case HAMMER2_PFSTYPE_SOFT_SLAVE: 548 nflags |= HAMMER2_CLUSTER_RDSOFT; 549 break; 550 case HAMMER2_PFSTYPE_SUPROOT: 551 /* 552 * Degenerate cluster representing the super-root 553 * topology on a single device. Fake stuff so 554 * cluster ops work as expected. 555 */ 556 nflags |= HAMMER2_CLUSTER_WRHARD; 557 nflags |= HAMMER2_CLUSTER_RDHARD; 558 cluster->focus_index = i; 559 cluster->focus = chain; 560 cluster->error = chain ? chain->error : 0; 561 break; 562 default: 563 break; 564 } 565 } 566 567 /* 568 * Pass 2 569 * 570 * Resolve masters. Calculate nmasters for the highest matching 571 * TID, if a quorum cannot be attained try the next lower matching 572 * TID until we exhaust TIDs. 573 * 574 * NOTE: A NULL chain is not necessarily an error, it could be 575 * e.g. a lookup failure or the end of an iteration. 576 * Process normally. 577 */ 578 last_best_quorum_tid = HAMMER2_TID_MAX; 579 quorum_tid = 0; /* fix gcc warning */ 580 581 while (nmasters < nquorum && last_best_quorum_tid != 0) { 582 nmasters = 0; 583 quorum_tid = 0; 584 585 for (i = 0; i < cluster->nchains; ++i) { 586 if (cluster->pmp->pfs_types[i] != 587 HAMMER2_PFSTYPE_MASTER) { 588 continue; 589 } 590 chain = cluster->array[i].chain; 591 592 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 593 /* 594 * Invalid as in unsynchronized, cannot be 595 * used to calculate the quorum. 596 */ 597 } else if (chain == NULL && quorum_tid == 0) { 598 /* 599 * NULL chain on master matches NULL chains 600 * on other masters. 601 */ 602 ++nmasters; 603 } else if (quorum_tid < last_best_quorum_tid && 604 chain != NULL && 605 (quorum_tid < chain->bref.modify_tid || 606 nmasters == 0)) { 607 /* 608 * Better TID located, reset nmasters count. 609 */ 610 nmasters = 1; 611 quorum_tid = chain->bref.modify_tid; 612 } else if (chain && 613 quorum_tid == chain->bref.modify_tid) { 614 /* 615 * TID matches current collection. 616 */ 617 ++nmasters; 618 } 619 } 620 if (nmasters >= nquorum) 621 break; 622 last_best_quorum_tid = quorum_tid; 623 } 624 625 /* 626 * Pass 3 627 * 628 * NOTE: A NULL chain is not necessarily an error, it could be 629 * e.g. a lookup failure or the end of an iteration. 630 * Process normally. 631 */ 632 for (i = 0; i < cluster->nchains; ++i) { 633 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 634 chain = cluster->array[i].chain; 635 if (chain && chain->error) { 636 if (cluster->focus == NULL || cluster->focus == chain) { 637 /* error will be overridden by valid focus */ 638 cluster->error = chain->error; 639 } 640 continue; 641 } 642 643 switch (cluster->pmp->pfs_types[i]) { 644 case HAMMER2_PFSTYPE_MASTER: 645 /* 646 * We must have enough up-to-date masters to reach 647 * a quorum and the master modify_tid must match 648 * the quorum's modify_tid. 649 * 650 * Do not select an errored or out-of-sync master. 651 */ 652 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 653 nflags |= HAMMER2_CLUSTER_UNHARD; 654 } else if (nmasters >= nquorum && 655 (chain == NULL || chain->error == 0) && 656 ((chain == NULL && quorum_tid == 0) || 657 (chain != NULL && quorum_tid == 658 chain->bref.modify_tid))) { 659 nflags |= HAMMER2_CLUSTER_WRHARD; 660 nflags |= HAMMER2_CLUSTER_RDHARD; 661 if (!smpresent) { 662 cluster->array[i].flags |= 663 HAMMER2_CITEM_FEMOD; 664 } 665 if (cluster->focus == NULL || 666 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) { 667 focus_pfs_type = HAMMER2_PFSTYPE_MASTER; 668 cluster->focus_index = i; 669 cluster->focus = chain; /* NULL ok */ 670 cluster->error = chain ? chain->error : 671 0; 672 } 673 } else if (chain == NULL || chain->error == 0) { 674 nflags |= HAMMER2_CLUSTER_UNHARD; 675 } 676 break; 677 case HAMMER2_PFSTYPE_SLAVE: 678 /* 679 * We must have enough up-to-date masters to reach 680 * a quorum and the slave modify_tid must match the 681 * quorum's modify_tid. 682 * 683 * Do not select an errored slave. 684 */ 685 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 686 nflags |= HAMMER2_CLUSTER_UNHARD; 687 } else if (nmasters >= nquorum && 688 (chain == NULL || chain->error == 0) && 689 ((chain == NULL && quorum_tid == 0) || 690 (chain && quorum_tid == 691 chain->bref.modify_tid))) { 692 ++nslaves; 693 nflags |= HAMMER2_CLUSTER_RDHARD; 694 #if 0 695 /* XXX optimize for RESOLVE_RDONLY */ 696 if (cluster->focus == NULL) { 697 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE; 698 cluster->focus_index = i; 699 cluster->focus = chain; /* NULL ok */ 700 cluster->error = chain ? chain->error : 701 0; 702 } 703 #endif 704 } else if (chain == NULL || chain->error == 0) { 705 nflags |= HAMMER2_CLUSTER_UNSOFT; 706 } 707 break; 708 case HAMMER2_PFSTYPE_SOFT_MASTER: 709 /* 710 * Directly mounted soft master always wins. There 711 * should be only one. 712 */ 713 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER); 714 cluster->focus_index = i; 715 cluster->focus = chain; 716 cluster->error = chain ? chain->error : 0; 717 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER; 718 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 719 break; 720 case HAMMER2_PFSTYPE_SOFT_SLAVE: 721 /* 722 * Directly mounted soft slave always wins. There 723 * should be only one. 724 */ 725 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE); 726 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) { 727 cluster->focus_index = i; 728 cluster->focus = chain; 729 cluster->error = chain ? chain->error : 0; 730 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE; 731 } 732 break; 733 case HAMMER2_PFSTYPE_SUPROOT: 734 /* 735 * spmp (degenerate case) 736 */ 737 KKASSERT(i == 0); 738 cluster->focus_index = i; 739 cluster->focus = chain; 740 cluster->error = chain ? chain->error : 0; 741 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT; 742 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 743 break; 744 default: 745 break; 746 } 747 } 748 749 if (ttlslaves == 0) 750 nflags |= HAMMER2_CLUSTER_NOSOFT; 751 if (ttlmasters == 0) 752 nflags |= HAMMER2_CLUSTER_NOHARD; 753 754 /* 755 * Set SSYNCED or MSYNCED for slaves and masters respectively if 756 * all available nodes (even if 0 are available) are fully 757 * synchronized. This is used by the synchronization thread to 758 * determine if there is work it could potentially accomplish. 759 */ 760 if (nslaves == ttlslaves) 761 nflags |= HAMMER2_CLUSTER_SSYNCED; 762 if (nmasters == ttlmasters) 763 nflags |= HAMMER2_CLUSTER_MSYNCED; 764 765 /* 766 * Determine if the cluster was successfully locked for the 767 * requested operation and generate an error code. The cluster 768 * will not be locked (or ref'd) if an error is returned. 769 * 770 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok() 771 * to determine if reading or writing is possible. If writing, the 772 * cluster still requires a call to hammer2_cluster_modify() first. 773 */ 774 atomic_set_int(&cluster->flags, nflags); 775 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 776 } 777 778 /* 779 * Copy a cluster, returned a ref'd cluster. All underlying chains 780 * are also ref'd, but not locked. 781 * 782 * The cluster focus is not set because the cluster is not yet locked 783 * (and the originating cluster does not have to be locked either). 784 */ 785 hammer2_cluster_t * 786 hammer2_cluster_copy(hammer2_cluster_t *ocluster) 787 { 788 hammer2_pfs_t *pmp = ocluster->pmp; 789 hammer2_cluster_t *ncluster; 790 hammer2_chain_t *chain; 791 int i; 792 793 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 794 ncluster->pmp = pmp; 795 ncluster->nchains = ocluster->nchains; 796 ncluster->refs = 1; 797 ncluster->flags = 0; /* cluster not locked */ 798 799 for (i = 0; i < ocluster->nchains; ++i) { 800 chain = ocluster->array[i].chain; 801 ncluster->array[i].chain = chain; 802 if (chain) 803 hammer2_chain_ref(chain); 804 } 805 return (ncluster); 806 } 807 808 /* 809 * Unlock and deref a cluster. The cluster is destroyed if this is the 810 * last ref. 811 */ 812 void 813 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 814 { 815 hammer2_chain_t *chain; 816 int i; 817 818 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 819 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 820 cluster); 821 } 822 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 823 KKASSERT(cluster->refs > 0); 824 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 825 826 for (i = 0; i < cluster->nchains; ++i) { 827 chain = cluster->array[i].chain; 828 if (chain) 829 hammer2_chain_unlock(chain); 830 } 831 cluster->focus_index = 0; 832 cluster->focus = NULL; 833 } 834 835 /* 836 * Resize the cluster's physical storage allocation in-place. This may 837 * replace the cluster's chains. 838 */ 839 void 840 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 841 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 842 int nradix, int flags) 843 { 844 hammer2_chain_t *chain; 845 int i; 846 847 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ 848 KKASSERT(cparent->nchains == cluster->nchains); 849 850 for (i = 0; i < cluster->nchains; ++i) { 851 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 852 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 853 continue; 854 } 855 chain = cluster->array[i].chain; 856 if (chain) { 857 KKASSERT(cparent->array[i].chain); 858 hammer2_chain_resize(trans, ip, 859 cparent->array[i].chain, chain, 860 nradix, flags); 861 } 862 } 863 } 864 865 /* 866 * Set an inode's cluster modified, marking the related chains RW and 867 * duplicating them if necessary. 868 * 869 * The passed-in chain is a localized copy of the chain previously acquired 870 * when the inode was locked (and possilby replaced in the mean time), and 871 * must also be updated. In fact, we update it first and then synchronize 872 * the inode's cluster cache. 873 */ 874 hammer2_inode_data_t * 875 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 876 hammer2_cluster_t *cluster, int flags) 877 { 878 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); 879 hammer2_cluster_modify(trans, cluster, flags); 880 881 hammer2_inode_repoint(ip, NULL, cluster); 882 if (ip->vp) 883 vsetisdirty(ip->vp); 884 return (&hammer2_cluster_wdata(cluster)->ipdata); 885 } 886 887 /* 888 * Adjust the cluster's chains to allow modification and adjust the 889 * focus. Data will be accessible on return. 890 * 891 * If our focused master errors on modify, re-resolve the cluster to 892 * try to select a different master. 893 */ 894 void 895 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster, 896 int flags) 897 { 898 hammer2_chain_t *chain; 899 int resolve_again; 900 int i; 901 902 resolve_again = 0; 903 for (i = 0; i < cluster->nchains; ++i) { 904 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 905 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 906 continue; 907 } 908 chain = cluster->array[i].chain; 909 if (chain == NULL) 910 continue; 911 if (chain->error) 912 continue; 913 hammer2_chain_modify(trans, chain, flags); 914 if (cluster->focus == chain && chain->error) { 915 cluster->error = chain->error; 916 resolve_again = 1; 917 } 918 } 919 if (resolve_again) 920 hammer2_cluster_resolve(cluster); 921 } 922 923 /* 924 * Synchronize modifications from the focus to other chains in a cluster. 925 * Convenient because nominal API users can just modify the contents of the 926 * focus (at least for non-blockref data). 927 * 928 * Nominal front-end operations only edit non-block-table data in a single 929 * chain. This code copies such modifications to the other chains in the 930 * cluster. Blocktable modifications are handled on a chain-by-chain basis 931 * by both the frontend and the backend and will explode in fireworks if 932 * blindly copied. 933 */ 934 void 935 hammer2_cluster_modsync(hammer2_cluster_t *cluster) 936 { 937 hammer2_chain_t *focus; 938 hammer2_chain_t *scan; 939 const hammer2_inode_data_t *ripdata; 940 hammer2_inode_data_t *wipdata; 941 int i; 942 943 focus = cluster->focus; 944 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED); 945 946 for (i = 0; i < cluster->nchains; ++i) { 947 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 948 continue; 949 scan = cluster->array[i].chain; 950 if (scan == NULL || scan == focus) 951 continue; 952 if (scan->error) 953 continue; 954 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED); 955 KKASSERT(focus->bytes == scan->bytes && 956 focus->bref.type == scan->bref.type); 957 switch(focus->bref.type) { 958 case HAMMER2_BREF_TYPE_INODE: 959 ripdata = &focus->data->ipdata; 960 wipdata = &scan->data->ipdata; 961 if ((ripdata->op_flags & 962 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 963 bcopy(ripdata, wipdata, 964 offsetof(hammer2_inode_data_t, u)); 965 break; 966 } 967 /* fall through to full copy */ 968 case HAMMER2_BREF_TYPE_DATA: 969 bcopy(focus->data, scan->data, focus->bytes); 970 break; 971 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 972 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 973 case HAMMER2_BREF_TYPE_FREEMAP: 974 case HAMMER2_BREF_TYPE_VOLUME: 975 panic("hammer2_cluster_modsync: illegal node type"); 976 /* NOT REACHED */ 977 break; 978 default: 979 panic("hammer2_cluster_modsync: unknown node type"); 980 break; 981 } 982 } 983 } 984 985 /* 986 * Lookup initialization/completion API. Returns a locked cluster with 1 ref. 987 */ 988 hammer2_cluster_t * 989 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) 990 { 991 hammer2_cluster_t *cluster; 992 993 cluster = hammer2_cluster_copy(cparent); 994 if (flags & HAMMER2_LOOKUP_SHARED) { 995 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | 996 HAMMER2_RESOLVE_SHARED); 997 } else { 998 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 999 } 1000 return (cluster); 1001 } 1002 1003 void 1004 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) 1005 { 1006 if (cparent) { 1007 hammer2_cluster_unlock(cparent); 1008 hammer2_cluster_drop(cparent); 1009 } 1010 } 1011 1012 /* 1013 * Locate first match or overlap under parent, return a new cluster 1014 */ 1015 hammer2_cluster_t * 1016 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp, 1017 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 1018 { 1019 hammer2_pfs_t *pmp; 1020 hammer2_cluster_t *cluster; 1021 hammer2_chain_t *chain; 1022 hammer2_chain_t *focus; 1023 hammer2_key_t key_accum; 1024 hammer2_key_t key_next; 1025 int null_count; 1026 int i; 1027 1028 pmp = cparent->pmp; /* can be NULL */ 1029 key_accum = *key_nextp; 1030 null_count = 0; 1031 1032 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 1033 cluster->pmp = pmp; /* can be NULL */ 1034 cluster->refs = 1; 1035 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1036 cluster->flags |= HAMMER2_CLUSTER_LOCKED; 1037 1038 /* 1039 * Pass-1, issue lookup and find focus. 1040 */ 1041 for (i = 0; i < cparent->nchains; ++i) { 1042 cluster->array[i].flags = cparent->array[i].flags; 1043 key_next = *key_nextp; 1044 1045 /* 1046 * Nothing to base the lookup, or parent was not synchronized. 1047 */ 1048 if (cparent->array[i].chain == NULL || 1049 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) { 1050 ++null_count; 1051 continue; 1052 } 1053 1054 chain = hammer2_chain_lookup(&cparent->array[i].chain, 1055 &key_next, 1056 key_beg, key_end, 1057 &cparent->array[i].cache_index, 1058 flags); 1059 if (cparent->focus_index == i) 1060 cparent->focus = cparent->array[i].chain; 1061 cluster->array[i].chain = chain; 1062 if (chain == NULL) { 1063 ++null_count; 1064 } else if (chain->error) { 1065 /* 1066 * Leave errored chain in cluster, but it cannot be 1067 * the cluster's focus. It is still possible for an 1068 * error'd chain to be synchronized (since we have 1069 * the bref), synchronization state will be handled 1070 * in pass-2. 1071 */ 1072 ; 1073 } else if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 1074 /* 1075 * Leave unsynchronized chain in cluster, but it cannot 1076 * be the cluster's focus. 1077 */ 1078 ; 1079 } else { 1080 int ddflag = (chain->bref.type == 1081 HAMMER2_BREF_TYPE_INODE); 1082 1083 if (cluster->focus == NULL) { 1084 cluster->focus_index = i; 1085 cluster->focus = chain; 1086 cluster->ddflag = ddflag; 1087 } 1088 if (cparent->focus == cparent->array[i].chain) { 1089 cluster->focus_index = i; 1090 cluster->focus = chain; 1091 cluster->ddflag = ddflag; 1092 } 1093 } 1094 if (key_accum > key_next) 1095 key_accum = key_next; 1096 } 1097 1098 /* 1099 * Pass-2 invalidate mismatches 1100 */ 1101 focus = cluster->focus; 1102 if (focus == NULL) 1103 goto done; 1104 1105 for (i = 0; i < cparent->nchains; ++i) { 1106 int ddflag; 1107 1108 chain = cluster->array[i].chain; 1109 1110 if (chain == NULL) 1111 continue; 1112 if (chain == focus) 1113 continue; 1114 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 1115 continue; 1116 1117 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 1118 if (chain->bref.type != focus->bref.type || 1119 chain->bref.key != focus->bref.key || 1120 chain->bref.keybits != focus->bref.keybits || 1121 chain->bref.modify_tid != focus->bref.modify_tid || 1122 chain->bytes != focus->bytes || 1123 ddflag != cluster->ddflag) { 1124 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1125 } 1126 } 1127 1128 /* 1129 * Resolve cluster flags. A lookup or locking failure could wind 1130 * up changing the cluster. 1131 */ 1132 done: 1133 *key_nextp = key_accum; 1134 cluster->nchains = i; 1135 hammer2_cluster_resolve(cluster); 1136 1137 if (null_count == i) { 1138 hammer2_cluster_drop(cluster); 1139 cluster = NULL; 1140 } 1141 1142 return (cluster); 1143 } 1144 1145 /* 1146 * Locate next match or overlap under parent, replace cluster 1147 */ 1148 hammer2_cluster_t * 1149 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1150 hammer2_key_t *key_nextp, 1151 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 1152 { 1153 hammer2_chain_t *ochain; 1154 hammer2_chain_t *nchain; 1155 hammer2_chain_t *focus; 1156 hammer2_key_t key_accum; 1157 hammer2_key_t key_next; 1158 int null_count; 1159 int i; 1160 1161 key_accum = *key_nextp; 1162 null_count = 0; 1163 cluster->focus = NULL; 1164 cparent->focus = NULL; 1165 cluster->focus_index = 0; 1166 cparent->focus_index = 0; 1167 1168 cluster->ddflag = 0; 1169 1170 for (i = 0; i < cparent->nchains; ++i) { 1171 key_next = *key_nextp; 1172 ochain = cluster->array[i].chain; 1173 1174 /* 1175 * Nothing to iterate from. These cases can occur under 1176 * normal operations. For example, during synchronization 1177 * a slave might reach the end of its scan while records 1178 * are still left on the master(s). 1179 */ 1180 if (ochain == NULL) { 1181 ++null_count; 1182 continue; 1183 } 1184 if (cparent->array[i].chain == NULL || 1185 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) || 1186 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) { 1187 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1188 hammer2_chain_unlock(ochain); 1189 hammer2_chain_drop(ochain); 1190 cluster->array[i].chain = NULL; 1191 ++null_count; 1192 continue; 1193 } 1194 1195 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain, 1196 &key_next, key_beg, key_end, 1197 &cparent->array[i].cache_index, 1198 flags); 1199 if (cparent->focus_index == i) 1200 cparent->focus = cparent->array[i].chain; 1201 /* ochain now invalid but can still be used for focus check */ 1202 1203 cluster->array[i].chain = nchain; 1204 if (nchain == NULL) { 1205 ++null_count; 1206 } else if (nchain->error) { 1207 /* 1208 * Leave errored chain in cluster, but it cannot be 1209 * the cluster's focus. 1210 */ 1211 ; 1212 } else { 1213 int ddflag = (nchain->bref.type == 1214 HAMMER2_BREF_TYPE_INODE); 1215 1216 /* 1217 * Possible new focus. 1218 */ 1219 if (cluster->focus == NULL) { 1220 cluster->ddflag = ddflag; 1221 cluster->focus_index = i; 1222 cluster->focus = nchain; 1223 } 1224 1225 /* 1226 * Fixup pre-existing focus. 1227 */ 1228 if (cluster->focus == ochain) { 1229 cluster->focus_index = i; 1230 cluster->focus = nchain; 1231 } 1232 } 1233 if (key_accum > key_next) 1234 key_accum = key_next; 1235 } 1236 1237 /* 1238 * Pass-2 invalidate mismatches 1239 */ 1240 focus = cluster->focus; 1241 if (focus == NULL) 1242 goto done; 1243 1244 for (i = 0; i < cparent->nchains; ++i) { 1245 int ddflag; 1246 1247 nchain = cluster->array[i].chain; 1248 1249 if (nchain == NULL) 1250 continue; 1251 if (nchain == focus) 1252 continue; 1253 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 1254 continue; 1255 1256 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE); 1257 if (nchain->bref.type != focus->bref.type || 1258 nchain->bref.key != focus->bref.key || 1259 nchain->bref.keybits != focus->bref.keybits || 1260 nchain->bref.modify_tid != focus->bref.modify_tid || 1261 nchain->bytes != focus->bytes || 1262 ddflag != cluster->ddflag) { 1263 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1264 } 1265 } 1266 1267 done: 1268 *key_nextp = key_accum; 1269 cluster->nchains = i; 1270 hammer2_cluster_resolve(cluster); 1271 1272 if (null_count == i) { 1273 hammer2_cluster_drop(cluster); 1274 cluster = NULL; 1275 } 1276 return(cluster); 1277 } 1278 1279 /* 1280 * Advance just one chain in the cluster and recalculate the invalid bit. 1281 * The cluster index is allowed to be flagged invalid on input and is 1282 * recalculated on return. 1283 * 1284 * (used during synchronization to advance past a chain being deleted). 1285 * 1286 * The chain being advanced must not be the focus and the clusters in 1287 * question must have already passed normal cluster_lookup/cluster_next 1288 * checks. 1289 * 1290 * The cluster always remains intact on return, so void function. 1291 */ 1292 void 1293 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent, 1294 hammer2_cluster_t *cluster, 1295 hammer2_key_t *key_nextp, 1296 hammer2_key_t key_beg, 1297 hammer2_key_t key_end, 1298 int i, int flags) 1299 { 1300 hammer2_chain_t *ochain; 1301 hammer2_chain_t *nchain; 1302 hammer2_chain_t *focus; 1303 hammer2_key_t key_accum; 1304 hammer2_key_t key_next; 1305 int ddflag; 1306 1307 key_accum = *key_nextp; 1308 key_next = *key_nextp; 1309 ochain = cluster->array[i].chain; 1310 if (ochain == NULL) 1311 goto done; 1312 KKASSERT(ochain != cluster->focus); 1313 1314 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain, 1315 &key_next, key_beg, key_end, 1316 &cparent->array[i].cache_index, 1317 flags); 1318 if (cparent->focus_index == i) 1319 cparent->focus = cparent->array[i].chain; 1320 /* ochain now invalid */ 1321 1322 /* 1323 * Install nchain. Note that nchain can be NULL, and can also 1324 * be in an unlocked state depending on flags. 1325 */ 1326 cluster->array[i].chain = nchain; 1327 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1328 1329 if (key_accum > key_next) 1330 key_accum = key_next; 1331 1332 focus = cluster->focus; 1333 if (focus == NULL) 1334 goto done; 1335 if (nchain == NULL) 1336 goto done; 1337 #if 0 1338 if (nchain == focus) /* ASSERTED NOT TRUE */ 1339 ... 1340 #endif 1341 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE); 1342 if (nchain->bref.type != focus->bref.type || 1343 nchain->bref.key != focus->bref.key || 1344 nchain->bref.keybits != focus->bref.keybits || 1345 nchain->bref.modify_tid != focus->bref.modify_tid || 1346 nchain->bytes != focus->bytes || 1347 ddflag != cluster->ddflag) { 1348 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1349 } 1350 1351 done: 1352 *key_nextp = key_accum; 1353 #if 0 1354 /* 1355 * For now don't re-resolve cluster->flags. 1356 */ 1357 hammer2_cluster_resolve(cluster); 1358 #endif 1359 } 1360 1361 /* 1362 * Create a new cluster using the specified key 1363 */ 1364 int 1365 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 1366 hammer2_cluster_t **clusterp, 1367 hammer2_key_t key, int keybits, 1368 int type, size_t bytes, int flags) 1369 { 1370 hammer2_cluster_t *cluster; 1371 hammer2_pfs_t *pmp; 1372 int error; 1373 int i; 1374 1375 pmp = trans->pmp; /* can be NULL */ 1376 1377 if ((cluster = *clusterp) == NULL) { 1378 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, 1379 M_WAITOK | M_ZERO); 1380 cluster->pmp = pmp; /* can be NULL */ 1381 cluster->refs = 1; 1382 cluster->flags = HAMMER2_CLUSTER_LOCKED; 1383 } 1384 cluster->focus_index = 0; 1385 cluster->focus = NULL; 1386 1387 /* 1388 * NOTE: cluster->array[] entries can initially be NULL. If 1389 * *clusterp is supplied, skip NULL entries, otherwise 1390 * create new chains. 1391 */ 1392 for (i = 0; i < cparent->nchains; ++i) { 1393 if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1394 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1395 continue; 1396 } 1397 if (*clusterp) { 1398 if ((cluster->array[i].flags & 1399 HAMMER2_CITEM_FEMOD) == 0) { 1400 cluster->array[i].flags |= 1401 HAMMER2_CITEM_INVALID; 1402 continue; 1403 } 1404 if (cluster->array[i].chain == NULL) 1405 continue; 1406 } 1407 error = hammer2_chain_create(trans, &cparent->array[i].chain, 1408 &cluster->array[i].chain, pmp, 1409 key, keybits, 1410 type, bytes, flags); 1411 if (cparent->focus_index == i) 1412 cparent->focus = cparent->array[i].chain; 1413 KKASSERT(error == 0); 1414 if (cluster->focus == NULL) { 1415 cluster->focus_index = i; 1416 cluster->focus = cluster->array[i].chain; 1417 } 1418 if (cparent->focus == cparent->array[i].chain) { 1419 cluster->focus_index = i; 1420 cluster->focus = cluster->array[i].chain; 1421 } 1422 } 1423 cluster->nchains = i; 1424 *clusterp = cluster; 1425 hammer2_cluster_resolve(cluster); 1426 1427 return error; 1428 } 1429 1430 /* 1431 * Rename a cluster to a new parent. 1432 * 1433 * WARNING! Any passed-in bref is probaly from hammer2_cluster_bref(), 1434 * So the data_off field is not relevant. Only the key and 1435 * keybits are used. 1436 */ 1437 void 1438 hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 1439 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1440 int flags) 1441 { 1442 hammer2_chain_t *chain; 1443 hammer2_blockref_t xbref; 1444 int i; 1445 1446 #if 0 1447 cluster->focus = NULL; 1448 cparent->focus = NULL; 1449 cluster->focus_index = 0; 1450 cparent->focus_index = 0; 1451 #endif 1452 1453 for (i = 0; i < cluster->nchains; ++i) { 1454 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1455 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1456 continue; 1457 } 1458 chain = cluster->array[i].chain; 1459 if (chain) { 1460 if (bref) { 1461 xbref = chain->bref; 1462 xbref.key = bref->key; 1463 xbref.keybits = bref->keybits; 1464 hammer2_chain_rename(trans, &xbref, 1465 &cparent->array[i].chain, 1466 chain, flags); 1467 } else { 1468 hammer2_chain_rename(trans, NULL, 1469 &cparent->array[i].chain, 1470 chain, flags); 1471 } 1472 if (cparent->focus_index == i) 1473 cparent->focus = cparent->array[i].chain; 1474 KKASSERT(cluster->array[i].chain == chain); /*remove*/ 1475 } 1476 } 1477 } 1478 1479 /* 1480 * Mark a cluster deleted 1481 */ 1482 void 1483 hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 1484 hammer2_cluster_t *cluster, int flags) 1485 { 1486 hammer2_chain_t *chain; 1487 hammer2_chain_t *parent; 1488 int i; 1489 1490 if (cparent == NULL) { 1491 kprintf("cparent is NULL\n"); 1492 return; 1493 } 1494 1495 for (i = 0; i < cluster->nchains; ++i) { 1496 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1497 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1498 continue; 1499 } 1500 parent = cparent->array[i].chain; 1501 chain = cluster->array[i].chain; 1502 if (chain == NULL) 1503 continue; 1504 if (chain->parent != parent) { 1505 kprintf("hammer2_cluster_delete: parent " 1506 "mismatch chain=%p parent=%p against=%p\n", 1507 chain, chain->parent, parent); 1508 } else { 1509 hammer2_chain_delete(trans, parent, chain, flags); 1510 } 1511 } 1512 } 1513 1514 /* 1515 * Create a snapshot of the specified {parent, ochain} with the specified 1516 * label. The originating hammer2_inode must be exclusively locked for 1517 * safety. 1518 * 1519 * The ioctl code has already synced the filesystem. 1520 */ 1521 int 1522 hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster, 1523 hammer2_ioc_pfs_t *pfs) 1524 { 1525 hammer2_dev_t *hmp; 1526 hammer2_cluster_t *ncluster; 1527 const hammer2_inode_data_t *ripdata; 1528 hammer2_inode_data_t *wipdata; 1529 hammer2_chain_t *nchain; 1530 hammer2_inode_t *nip; 1531 size_t name_len; 1532 hammer2_key_t lhc; 1533 struct vattr vat; 1534 #if 0 1535 uuid_t opfs_clid; 1536 #endif 1537 int error; 1538 int i; 1539 1540 kprintf("snapshot %s\n", pfs->name); 1541 1542 name_len = strlen(pfs->name); 1543 lhc = hammer2_dirhash(pfs->name, name_len); 1544 1545 /* 1546 * Get the clid 1547 */ 1548 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata; 1549 #if 0 1550 opfs_clid = ripdata->pfs_clid; 1551 #endif 1552 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */ 1553 1554 /* 1555 * Create the snapshot directory under the super-root 1556 * 1557 * Set PFS type, generate a unique filesystem id, and generate 1558 * a cluster id. Use the same clid when snapshotting a PFS root, 1559 * which theoretically allows the snapshot to be used as part of 1560 * the same cluster (perhaps as a cache). 1561 * 1562 * Copy the (flushed) blockref array. Theoretically we could use 1563 * chain_duplicate() but it becomes difficult to disentangle 1564 * the shared core so for now just brute-force it. 1565 */ 1566 VATTR_NULL(&vat); 1567 vat.va_type = VDIR; 1568 vat.va_mode = 0755; 1569 ncluster = NULL; 1570 nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat, 1571 proc0.p_ucred, pfs->name, name_len, 1572 &ncluster, 1573 HAMMER2_INSERT_PFSROOT, &error); 1574 1575 if (nip) { 1576 wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0); 1577 wipdata->pfs_type = HAMMER2_PFSTYPE_MASTER; 1578 wipdata->pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT; 1579 wipdata->op_flags |= HAMMER2_OPFLAG_PFSROOT; 1580 kern_uuidgen(&wipdata->pfs_fsid, 1); 1581 1582 /* 1583 * Give the snapshot its own private cluster. As a snapshot 1584 * no further synchronization with the original cluster will 1585 * be done. 1586 */ 1587 #if 0 1588 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 1589 wipdata->pfs_clid = opfs_clid; 1590 else 1591 kern_uuidgen(&wipdata->pfs_clid, 1); 1592 #endif 1593 kern_uuidgen(&wipdata->pfs_clid, 1); 1594 1595 for (i = 0; i < ncluster->nchains; ++i) { 1596 if ((ncluster->array[i].flags & 1597 HAMMER2_CITEM_FEMOD) == 0) { 1598 ncluster->array[i].flags |= 1599 HAMMER2_CITEM_INVALID; 1600 continue; 1601 } 1602 nchain = ncluster->array[i].chain; 1603 if (nchain) 1604 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 1605 } 1606 #if 0 1607 /* XXX can't set this unless we do an explicit flush, which 1608 we also need a pmp assigned to do, else the flush code 1609 won't flush ncluster because it thinks it is crossing a 1610 flush boundary */ 1611 hammer2_cluster_set_chainflags(ncluster, 1612 HAMMER2_CHAIN_PFSBOUNDARY); 1613 #endif 1614 1615 /* XXX hack blockset copy */ 1616 /* XXX doesn't work with real cluster */ 1617 KKASSERT(ocluster->nchains == 1); 1618 wipdata->u.blockset = ripdata->u.blockset; 1619 hammer2_cluster_modsync(ncluster); 1620 for (i = 0; i < ncluster->nchains; ++i) { 1621 nchain = ncluster->array[i].chain; 1622 if (nchain) 1623 hammer2_flush(trans, nchain); 1624 } 1625 hammer2_inode_unlock(nip, ncluster); 1626 } 1627 return (error); 1628 } 1629 1630 /* 1631 * Return locked parent cluster given a locked child. The child remains 1632 * locked on return. The new parent's focus follows the child's focus 1633 * and the parent is always resolved. 1634 */ 1635 hammer2_cluster_t * 1636 hammer2_cluster_parent(hammer2_cluster_t *cluster) 1637 { 1638 hammer2_cluster_t *cparent; 1639 int i; 1640 1641 cparent = hammer2_cluster_copy(cluster); 1642 1643 for (i = 0; i < cparent->nchains; ++i) { 1644 hammer2_chain_t *chain; 1645 hammer2_chain_t *rchain; 1646 1647 /* 1648 * Calculate parent for each element. Old chain has an extra 1649 * ref for cparent but the lock remains with cluster. 1650 */ 1651 chain = cparent->array[i].chain; 1652 if (chain == NULL) 1653 continue; 1654 while ((rchain = chain->parent) != NULL) { 1655 hammer2_chain_ref(rchain); 1656 hammer2_chain_unlock(chain); 1657 hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS); 1658 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 1659 if (chain->parent == rchain) 1660 break; 1661 hammer2_chain_unlock(rchain); 1662 hammer2_chain_drop(rchain); 1663 } 1664 if (cluster->focus == chain) { 1665 cparent->focus_index = i; 1666 cparent->focus = rchain; 1667 } 1668 cparent->array[i].chain = rchain; 1669 hammer2_chain_drop(chain); 1670 } 1671 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1672 hammer2_cluster_resolve(cparent); 1673 1674 return cparent; 1675 } 1676 1677 /************************************************************************ 1678 * CLUSTER I/O * 1679 ************************************************************************ 1680 * 1681 * 1682 * WARNING! blockref[] array data is not universal. These functions should 1683 * only be used to access universal data. 1684 * 1685 * NOTE! The rdata call will wait for at least one of the chain I/Os to 1686 * complete if necessary. The I/O's should have already been 1687 * initiated by the cluster_lock/chain_lock operation. 1688 * 1689 * The cluster must already be in a modified state before wdata 1690 * is called. The data will already be available for this case. 1691 */ 1692 const hammer2_media_data_t * 1693 hammer2_cluster_rdata(hammer2_cluster_t *cluster) 1694 { 1695 return(cluster->focus->data); 1696 } 1697 1698 hammer2_media_data_t * 1699 hammer2_cluster_wdata(hammer2_cluster_t *cluster) 1700 { 1701 KKASSERT(hammer2_cluster_modified(cluster)); 1702 return(cluster->focus->data); 1703 } 1704 1705 /* 1706 * Load cluster data asynchronously with callback. 1707 * 1708 * The callback is made for the first validated data found, or NULL 1709 * if no valid data is available. 1710 * 1711 * NOTE! The cluster structure is either unique or serialized (e.g. embedded 1712 * in the inode with an exclusive lock held), the chain structure may be 1713 * shared. 1714 */ 1715 void 1716 hammer2_cluster_load_async(hammer2_cluster_t *cluster, 1717 void (*callback)(hammer2_iocb_t *iocb), void *ptr) 1718 { 1719 hammer2_chain_t *chain; 1720 hammer2_iocb_t *iocb; 1721 hammer2_dev_t *hmp; 1722 hammer2_blockref_t *bref; 1723 int i; 1724 1725 /* 1726 * Try to find a chain whos data is already resolved. If none can 1727 * be found, start with the first chain. 1728 */ 1729 chain = NULL; 1730 for (i = 0; i < cluster->nchains; ++i) { 1731 chain = cluster->array[i].chain; 1732 if (chain && chain->data) 1733 break; 1734 } 1735 if (i == cluster->nchains) { 1736 chain = cluster->array[0].chain; 1737 i = 0; 1738 } 1739 1740 iocb = &cluster->iocb; 1741 iocb->callback = callback; 1742 iocb->dio = NULL; /* for already-validated case */ 1743 iocb->cluster = cluster; 1744 iocb->chain = chain; 1745 iocb->ptr = ptr; 1746 iocb->lbase = (off_t)i; 1747 iocb->flags = 0; 1748 iocb->error = 0; 1749 1750 /* 1751 * Data already validated 1752 */ 1753 if (chain->data) { 1754 callback(iocb); 1755 return; 1756 } 1757 1758 /* 1759 * We must resolve to a device buffer, either by issuing I/O or 1760 * by creating a zero-fill element. We do not mark the buffer 1761 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1762 * API must still be used to do that). 1763 * 1764 * The device buffer is variable-sized in powers of 2 down 1765 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1766 * chunk always contains buffers of the same size. (XXX) 1767 * 1768 * The minimum physical IO size may be larger than the variable 1769 * block size. 1770 * 1771 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes 1772 * matches hammer2_devblksize()? Or does the freemap's 1773 * pre-zeroing handle the case for us? 1774 */ 1775 bref = &chain->bref; 1776 hmp = chain->hmp; 1777 1778 #if 0 1779 /* handled by callback? <- TODO XXX even needed for loads? */ 1780 /* 1781 * The getblk() optimization for a 100% overwrite can only be used 1782 * if the physical block size matches the request. 1783 */ 1784 if ((chain->flags & HAMMER2_CHAIN_INITIAL) && 1785 chain->bytes == hammer2_devblksize(chain->bytes)) { 1786 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio); 1787 KKASSERT(error == 0); 1788 iocb->dio = dio; 1789 callback(iocb); 1790 return; 1791 } 1792 #endif 1793 1794 /* 1795 * Otherwise issue a read 1796 */ 1797 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1798 hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb); 1799 } 1800