1 /* 2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. 
It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * 59 * - Most complex functions, quorum management on transaction ids. 60 * 61 * - Locking and data accesses must be internally asynchronous. 62 * 63 * - Validate and manage cache coherency primitives (cache state 64 * is stored in chain topologies but must be validated by these 65 * functions). 66 * 67 * (2) Lookups and Scans 68 * hammer2_cluster_lookup() 69 * hammer2_cluster_next() 70 * 71 * - Depend on locking & data retrieval functions, but still complex. 72 * 73 * - Must do quorum management on transaction ids. 74 * 75 * - Lookup and Iteration ops Must be internally asynchronous. 76 * 77 * (3) Modifying Operations 78 * hammer2_cluster_create() 79 * 80 * - Can usually punt on failures, operation continues unless quorum 81 * is lost. If quorum is lost, must wait for resynchronization 82 * (depending on the management mode). 83 * 84 * - Must disconnect node on failures (also not flush), remount, and 85 * resynchronize. 
 *
 *	- Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to be messed
 *	  with (the protocol is at a higher level than block-level).
 *
 *	- Multiple local disk nodes (i.e. block devices) are another matter.
 *	  Chain operations have to be dispatched to per-node threads (xN)
 *	  because we can't asynchronize potentially very complex chain
 *	  operations in hammer2_chain.c (it would be a huge mess).
 *
 *	  (these threads are also used to terminate incoming kdmsg ops from
 *	  other machines).
 *
 *	- Single-node filesystems do not use threads and will simply call
 *	  hammer2_chain.c functions directly.  This short-cut is handled
 *	  at the base of each cluster function.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Returns the bref type of the cluster's focus.
 *
 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
 * The cluster must be locked.
 */
uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
	if (cluster->error == 0) {
		/* a non-errored cluster must have a resolved focus chain */
		KKASSERT(cluster->focus != NULL);
		return(cluster->focus->bref.type);
	}
	return 0;
}

/*
 * Returns the bref of the cluster's focus, sans any data-offset information
 * (since offset information is per-node and wouldn't be useful).
 *
 * Callers use this function to access modify_tid, mirror_tid, type,
 * key, and keybits.
 *
 * If the cluster is errored, returns an empty (zeroed) bref.
 * The cluster must be locked.
 */
void
hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
{
	if (cluster->error == 0) {
		KKASSERT(cluster->focus != NULL);
		*bref = cluster->focus->bref;
		/* data_off is node-specific, scrub it from the copy */
		bref->data_off = 0;
	} else {
		bzero(bref, sizeof(*bref));
	}
}

/*
 * Create a degenerate cluster with one ref from a single locked chain.
 * The returned cluster will be focused on the chain and inherit its
 * error state.
 *
 * The chain's lock and reference are transferred to the new cluster, so
 * the caller should not try to unlock the chain separately.
 *
 * We fake the flags (all the *SYNCED/*HARD bits are asserted so cluster
 * ops treat the single chain as fully valid).
 */
void
hammer2_dummy_xop_from_chain(hammer2_xop_head_t *xop, hammer2_chain_t *chain)
{
	hammer2_cluster_t *cluster;

	bzero(xop, sizeof(*xop));

	cluster = &xop->cluster;
	cluster->array[0].chain = chain;
	cluster->array[0].flags = HAMMER2_CITEM_FEMOD;
	cluster->nchains = 1;
	cluster->focus = chain;
	cluster->focus_index = 0;
	cluster->pmp = chain->pmp;
	cluster->refs = 1;
	cluster->error = chain->error;
	cluster->flags = HAMMER2_CLUSTER_LOCKED |
			 HAMMER2_CLUSTER_WRHARD |
			 HAMMER2_CLUSTER_RDHARD |
			 HAMMER2_CLUSTER_MSYNCED |
			 HAMMER2_CLUSTER_SSYNCED;
}

/*
 * Add a reference to a cluster.
 *
 * NOTE(review): only the cluster's own ref count is bumped here; the
 * underlying chains are assumed to already hold their refs from cluster
 * assembly (ref/unlock sequences can later re-lock via those chain refs).
 */
void
hammer2_cluster_ref(hammer2_cluster_t *cluster)
{
	atomic_add_int(&cluster->refs, 1);
}

/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	/* atomic_fetchadd returns the pre-decrement value; 1 -> last ref */
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;		/* safety XXX chg to assert */
		cluster->focus_index = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			chain = cluster->array[i].chain;
			if (chain) {
				hammer2_chain_drop(chain);
				cluster->array[i].chain = NULL;	/* safety */
			}
		}
		cluster->nchains = 0;		/* safety */

		kfree(cluster, M_HAMMER2);
		/* cluster is invalid */
	}
}

/*
 * Lock a cluster.  Cluster must already be referenced.  Focus is maintained.
 *
 * WARNING! This function expects the caller to handle resolution of the
 *	    cluster.  We never re-resolve the cluster in this function,
 *	    because it might be used to temporarily unlock/relock a cparent
 *	    in an iteration or recursion, and the cparent's elements do not
 *	    necessarily match.
 */
void
hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
{
	hammer2_chain_t *chain;
	int i;

	/* cannot be on inode-embedded cluster template, must be on copy */
	KKASSERT(cluster->refs > 0);
	KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0);
	if (cluster->flags & HAMMER2_CLUSTER_LOCKED) {
		panic("hammer2_cluster_lock: cluster %p already locked!\n",
		      cluster);
	}
	atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED);

	/*
	 * Lock chains and resolve state.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		hammer2_chain_lock(chain, how);
	}
}

/*
 * Calculate the clustering state for the cluster and set its focus.
 * This routine must be called with care.
 * For example, it should not normally be called after relocking a non-leaf
 * cluster because parent clusters help iterations and each element might be
 * at a slightly different indirect node (each node's topology is
 * independently indexed).
 *
 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal
 * operations.  Typically this is only set on a quorum of MASTERs or
 * on a SOFT_MASTER.  Also as a degenerate case on SUPROOT.  If a SOFT_MASTER
 * is present, this bit is *not* set on a quorum of MASTERs.  The
 * synchronization code ignores this bit, but all hammer2_cluster_*() calls
 * that create/modify/delete elements use it.
 *
 * The chains making up the cluster may be narrowed down based on quorum
 * acceptability, and if RESOLVE_RDONLY is specified the chains can be
 * narrowed down to a single chain as long as the entire subtopology is known
 * to be intact.  So, for example, we can narrow a read-only op to a single
 * fast SLAVE but if we focus a CACHE chain we must still retain at least
 * a SLAVE to ensure that the subtopology can be accessed.
 *
 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need
 * to be maintained once the topology is validated as-of the top level of
 * the operation.
 *
 * If a failure occurs the operation must be aborted by higher-level code and
 * retried. XXX
 */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	int focus_pfs_type;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nslaves;
	int nquorum;
	int smpresent;
	int i;

	cluster->error = 0;
	cluster->focus = NULL;

	focus_pfs_type = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;
	nmasters = 0;
	nslaves = 0;

	/*
	 * Calculate quorum (simple majority of configured masters).
	 */
	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;

	/*
	 * Pass 1
	 *
	 * Tally total masters/slaves and accumulate soft-node flags.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			++ttlmasters;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2
	 *
	 * Resolve masters.  Calculate nmasters for the highest matching
	 * TID, if a quorum cannot be attained try the next lower matching
	 * TID until we exhaust TIDs.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	last_best_quorum_tid = HAMMER2_TID_MAX;
	quorum_tid = 0;		/* fix gcc warning */

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		nmasters = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			/* only masters (and suproot) participate in quorum */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				break;
			default:
				continue;
			}
			chain = cluster->array[i].chain;

			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				/*
				 * Invalid as in unsynchronized, cannot be
				 * used to calculate the quorum.
				 */
			} else if (chain == NULL && quorum_tid == 0) {
				/*
				 * NULL chain on master matches NULL chains
				 * on other masters.
				 */
				++nmasters;
			} else if (quorum_tid < last_best_quorum_tid &&
				   chain != NULL &&
				   (quorum_tid < chain->bref.modify_tid ||
				    nmasters == 0)) {
				/*
				 * Better TID located, reset nmasters count.
				 */
				nmasters = 1;
				quorum_tid = chain->bref.modify_tid;
			} else if (chain &&
				   quorum_tid == chain->bref.modify_tid) {
				/*
				 * TID matches current collection.
				 */
				++nmasters;
			}
		}
		if (nmasters >= nquorum)
			break;
		/* no quorum at this TID, retry below it */
		last_best_quorum_tid = quorum_tid;
	}

	/*
	 * Pass 3
	 *
	 * Set FEMOD/focus per element based on the quorum TID found above.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}
			continue;
		}

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the master modify_tid must match
			 * the quorum's modify_tid.
			 *
			 * Do not select an errored or out-of-sync master.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain != NULL && quorum_tid ==
				     chain->bref.modify_tid))) {
				nflags |= HAMMER2_CLUSTER_WRHARD;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				/* FEMOD only if no soft master present */
				if (!smpresent) {
					cluster->array[i].flags |=
							HAMMER2_CITEM_FEMOD;
				}
				if (cluster->focus == NULL ||
				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
					cluster->focus_index = i;
					cluster->focus = chain;	/* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain && quorum_tid ==
				     chain->bref.modify_tid))) {
				++nslaves;
				nflags |= HAMMER2_CLUSTER_RDHARD;
#if 0
				/* XXX optimize for RESOLVE_RDONLY */
				if (cluster->focus == NULL) {
					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
					cluster->focus_index = i;
					cluster->focus = chain;	/* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
#endif
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNSOFT;
			}
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one (but a soft master, if present,
			 * still has priority for the focus).
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
				cluster->focus_index = i;
				cluster->focus = chain;
				cluster->error = chain ? chain->error : 0;
				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
			}
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			KKASSERT(i == 0);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match the focus bref-for-bref; mismatches are invalidated.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			if (hammer2_debug & 1)
			kprintf("cluster_resolve: matching modify_tid failed "
				"bref test: idx=%d type=%02x/%02x "
				"key=%016jx/%d-%016jx/%d "
				"mod=%016jx/%016jx bytes=%u/%u\n",
				i,
				chain->bref.type, focus->bref.type,
				chain->bref.key, chain->bref.keybits,
				focus->bref.key, focus->bref.keybits,
				chain->bref.modify_tid, focus->bref.modify_tid,
				chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_resolve");
			/* flag issue and force resync? */
		}
	}
skip4:

	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}

/*
 * This is used by the XOPS subsystem to calculate the state of
 * the collection and tell hammer2_xop_collect() what to do with it.
 * The collection can be in various states of desynchronization, the
 * caller specifically wants to resolve the passed-in key.
 *
 * Return values:
 *	0		- Quorum agreement, key is valid
 *
 *	ENOENT		- Quorum agreement, end of scan
 *
 *	ESRCH		- Quorum agreement, key is INVALID (caller should
 *			  skip key).
 *
 *	EIO		- Quorum agreement but all elements had errors.
 *
 *	EDEADLK		- No quorum agreement possible for key, a repair
 *			  may be needed.  Caller has to decide what to do,
 *			  possibly iterating the key or generating an EIO.
 *
 *	EINPROGRESS	- No quorum agreement yet, but agreement is still
 *			  possible if caller waits for more responses.  Caller
 *			  should not iterate key.
 *
 * NOTE! If the pmp is in HMNT2_LOCAL mode, the cluster check always succeeds.
 *
 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE
 */
int
hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nmasters_keymatch;
	int nslaves;
	int nquorum;
	int umasters;	/* unknown masters (still in progress) */
	int smpresent;
	int error;
	int i;

	cluster->error = 0;
	cluster->focus = NULL;

	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);

	/*
	 * Calculate quorum (simple majority of configured masters).
	 */
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;

	/*
	 * Pass 1
	 *
	 * Tally total masters/slaves; every element starts INVALID and is
	 * validated in later passes.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;

		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain && error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				/* XXX */
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_SUPROOT:
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			++ttlmasters;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = error;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2
	 *
	 * Resolve nmasters		- master nodes fully match
	 *
	 * Resolve umasters		- master nodes operation still
	 *				  in progress
	 *
	 * Resolve nmasters_keymatch	- master nodes match the passed-in
	 *				  key and may or may not match
	 *				  the quorum-agreed tid.
	 *
	 * The quorum-agreed TID is the highest matching TID.
	 */
	last_best_quorum_tid = HAMMER2_TID_MAX;
	umasters = 0;
	nmasters = 0;
	nmasters_keymatch = 0;
	quorum_tid = 0;		/* fix gcc warning */

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		umasters = 0;
		nmasters = 0;
		nmasters_keymatch = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			/* XXX SOFT smpresent handling */
			switch(cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_MASTER:
			case HAMMER2_PFSTYPE_SUPROOT:
				break;
			default:
				continue;
			}

			chain = cluster->array[i].chain;
			error = cluster->array[i].error;

			/*
			 * Skip elements still in progress.  umasters keeps
			 * track of masters that might still be in-progress.
			 */
			if (chain == NULL && (cluster->array[i].flags &
					      HAMMER2_CITEM_NULL) == 0) {
				++umasters;
				continue;
			}

			/*
			 * Key match?  ((hammer2_key_t)-1 acts as a
			 * wildcard key).
			 */
			if (flags & HAMMER2_CHECK_NULL) {
				if (chain == NULL) {
					++nmasters;
					++nmasters_keymatch;
					if (cluster->error == 0)
						cluster->error = error;
				}
			} else if (chain &&
				   (key == (hammer2_key_t)-1 ||
				    chain->bref.key == key)) {
				++nmasters_keymatch;

				if (chain->bref.modify_tid <
				     last_best_quorum_tid &&
				    quorum_tid < chain->bref.modify_tid) {
					/*
					 * Select new TID as master if better
					 * than any found so far in this loop,
					 * as long as it does not reach the
					 * best tid found in the previous loop.
					 */
					nmasters = 0;
					quorum_tid = chain->bref.modify_tid;
				}
				if (quorum_tid == chain->bref.modify_tid) {
					/*
					 * TID matches current collection.
					 *
					 * (error handled in next pass)
					 */
					++nmasters;
					if (chain->error == 0) {
						cluster->focus = chain;
						cluster->focus_index = i;
					}
				}
			}
		}
		if (nmasters >= nquorum)
			break;
		/* no quorum at this TID, retry below it */
		last_best_quorum_tid = quorum_tid;
	}

	/*
	kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n",
		nmasters, nquorum, nmasters_keymatch, umasters);
	*/

	/*
	 * Early return if we do not have enough masters.
	 */
	if (nmasters < nquorum) {
		if (nmasters + umasters >= nquorum)
			return HAMMER2_ERROR_EINPROGRESS;
		if (nmasters_keymatch < nquorum)
			return HAMMER2_ERROR_ESRCH;
		return HAMMER2_ERROR_EDEADLK;
	}

	/*
	 * Validated end of scan.
	 */
	if (flags & HAMMER2_CHECK_NULL) {
		if (cluster->error == 0)
			cluster->error = HAMMER2_ERROR_ENOENT;
		return cluster->error;
	}

	/*
	 * If we have a NULL focus at this point the agreeing quorum all
	 * had chain errors.
	 */
	if (cluster->focus == NULL)
		return HAMMER2_ERROR_EIO;

	/*
	 * Pass 3
	 *
	 * We have quorum agreement, validate elements, not end of scan.
	 */
	nslaves = 0;
	cluster->error = 0;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		error = cluster->array[i].error;
		if (chain == NULL ||
		    chain->bref.key != key ||
		    chain->bref.modify_tid != quorum_tid) {
			continue;
		}

		/*
		 * Quorum Match
		 *
		 * XXX for now, cumulative error.
		 */
		if (cluster->error == 0)
			cluster->error = error;

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			++nslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 *
			 * XXX
			 */
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 *
	 * focus is probably not locked and it isn't safe to test its
	 * content (e.g. focus->data, focus->dio, other content).  We
	 * do not synchronize the dio to the cpu here.  In fact, in numerous
	 * situations the frontend doesn't even need to access its dio/data,
	 * so synchronizing it here would be wasteful.
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match the focus bref-for-bref; mismatches are invalidated.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			/*
			 * NOTE(review): the message (and panic string below)
			 * say "cluster_resolve" but this is cluster_check;
			 * left unchanged to preserve log grep-ability.
			 */
			if (hammer2_debug & 1)
			kprintf("cluster_resolve: matching modify_tid failed "
				"bref test: idx=%d type=%02x/%02x "
				"key=%016jx/%d-%016jx/%d "
				"mod=%016jx/%016jx bytes=%u/%u\n",
				i,
				chain->bref.type, focus->bref.type,
				chain->bref.key, chain->bref.keybits,
				focus->bref.key, focus->bref.keybits,
				chain->bref.modify_tid, focus->bref.modify_tid,
				chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_resolve");
			/* flag issue and force resync? */
		}
	}
skip4:

	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);

	return cluster->error;
}

/*
 * This is used by the sync thread to force non-NULL elements of a copy
 * of the pmp->iroot cluster to be good which is required to prime the
 * sync.
 */
void
hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
{
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i].chain)
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
	}
}

/*
 * Unlock a cluster.  Refcount and focus is maintained.
1103 */ 1104 void 1105 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 1106 { 1107 hammer2_chain_t *chain; 1108 int i; 1109 1110 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 1111 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 1112 cluster); 1113 } 1114 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 1115 KKASSERT(cluster->refs > 0); 1116 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 1117 1118 for (i = 0; i < cluster->nchains; ++i) { 1119 chain = cluster->array[i].chain; 1120 if (chain) 1121 hammer2_chain_unlock(chain); 1122 } 1123 } 1124