1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * hammer2_cluster_lock() 59 * hammer2_cluster_parent() 60 * 61 * - Most complex functions, quorum management on transaction ids. 62 * 63 * - Locking and data accesses must be internally asynchronous. 64 * 65 * - Validate and manage cache coherency primitives (cache state 66 * is stored in chain topologies but must be validated by these 67 * functions). 68 * 69 * (2) Lookups and Scans 70 * hammer2_cluster_lookup() 71 * hammer2_cluster_next() 72 * 73 * - Depend on locking & data retrieval functions, but still complex. 74 * 75 * - Must do quorum management on transaction ids. 76 * 77 * - Lookup and Iteration ops Must be internally asynchronous. 78 * 79 * (3) Modifying Operations 80 * hammer2_cluster_create() 81 * hammer2_cluster_rename() 82 * hammer2_cluster_delete() 83 * hammer2_cluster_modify() 84 * hammer2_cluster_modsync() 85 * 86 * - Can usually punt on failures, operation continues unless quorum 87 * is lost. If quorum is lost, must wait for resynchronization 88 * (depending on the management mode). 89 * 90 * - Must disconnect node on failures (also not flush), remount, and 91 * resynchronize. 92 * 93 * - Network links (via kdmsg) are relatively easy to issue as the 94 * complex underworkings of hammer2_chain.c don't have to messed 95 * with (the protocol is at a higher level than block-level). 96 * 97 * - Multiple local disk nodes (i.e. block devices) are another matter. 98 * Chain operations have to be dispatched to per-node threads (xN) 99 * because we can't asynchronize potentially very complex chain 100 * operations in hammer2_chain.c (it would be a huge mess). 101 * 102 * (these threads are also used to terminate incoming kdmsg ops from 103 * other machines). 104 * 105 * - Single-node filesystems do not use threads and will simply call 106 * hammer2_chain.c functions directly. This short-cut is handled 107 * at the base of each cluster function. 108 */ 109 #include <sys/cdefs.h> 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/types.h> 113 #include <sys/lock.h> 114 #include <sys/uuid.h> 115 116 #include "hammer2.h" 117 118 /* 119 * Returns the bref type of the cluster's foucs. 120 * 121 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0). 122 * The cluster must be locked. 123 */ 124 uint8_t 125 hammer2_cluster_type(hammer2_cluster_t *cluster) 126 { 127 if (cluster->error == 0) { 128 KKASSERT(cluster->focus != NULL); 129 return(cluster->focus->bref.type); 130 } 131 return 0; 132 } 133 134 /* 135 * Returns non-zero if the cluster's focus is flagged as being modified. 136 * 137 * If the cluster is errored, returns 0. 138 */ 139 static 140 int 141 hammer2_cluster_modified(hammer2_cluster_t *cluster) 142 { 143 if (cluster->error == 0) { 144 KKASSERT(cluster->focus != NULL); 145 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 146 } 147 return 0; 148 } 149 150 /* 151 * Returns the bref of the cluster's focus, sans any data-offset information 152 * (since offset information is per-node and wouldn't be useful). 153 * 154 * Callers use this function to access modify_tid, mirror_tid, type, 155 * key, and keybits. 156 * 157 * If the cluster is errored, returns an empty bref. 158 * The cluster must be locked. 159 */ 160 void 161 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 162 { 163 if (cluster->error == 0) { 164 KKASSERT(cluster->focus != NULL); 165 *bref = cluster->focus->bref; 166 bref->data_off = 0; 167 } else { 168 bzero(bref, sizeof(*bref)); 169 } 170 } 171 172 /* 173 * Create a degenerate cluster with one ref from a single locked chain. 174 * The returned cluster will be focused on the chain and inherit its 175 * error state. 176 * 177 * The chain's lock and reference are transfered to the new cluster, so 178 * the caller should not try to unlock the chain separately. 179 * 180 * We fake the flags. 181 */ 182 hammer2_cluster_t * 183 hammer2_cluster_from_chain(hammer2_chain_t *chain) 184 { 185 hammer2_cluster_t *cluster; 186 187 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 188 cluster->array[0].chain = chain; 189 cluster->array[0].flags = HAMMER2_CITEM_FEMOD; 190 cluster->nchains = 1; 191 cluster->focus = chain; 192 cluster->focus_index = 0; 193 cluster->pmp = chain->pmp; 194 cluster->refs = 1; 195 cluster->error = chain->error; 196 cluster->flags = HAMMER2_CLUSTER_LOCKED | 197 HAMMER2_CLUSTER_WRHARD | 198 HAMMER2_CLUSTER_RDHARD | 199 HAMMER2_CLUSTER_MSYNCED | 200 HAMMER2_CLUSTER_SSYNCED; 201 202 return cluster; 203 } 204 205 /* 206 * Add a reference to a cluster and its underlying chains. 207 * 208 * We must also ref the underlying chains in order to allow ref/unlock 209 * sequences to later re-lock. 210 */ 211 void 212 hammer2_cluster_ref(hammer2_cluster_t *cluster) 213 { 214 atomic_add_int(&cluster->refs, 1); 215 } 216 217 /* 218 * Drop the caller's reference to the cluster. When the ref count drops to 219 * zero this function frees the cluster and drops all underlying chains. 220 * 221 * In-progress read I/Os are typically detached from the cluster once the 222 * first one returns (the remaining stay attached to the DIOs but are then 223 * ignored and drop naturally). 224 */ 225 void 226 hammer2_cluster_drop(hammer2_cluster_t *cluster) 227 { 228 hammer2_chain_t *chain; 229 int i; 230 231 KKASSERT(cluster->refs > 0); 232 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 233 cluster->focus = NULL; /* safety XXX chg to assert */ 234 cluster->focus_index = 0; 235 236 for (i = 0; i < cluster->nchains; ++i) { 237 chain = cluster->array[i].chain; 238 if (chain) { 239 hammer2_chain_drop(chain); 240 cluster->array[i].chain = NULL; /* safety */ 241 } 242 } 243 cluster->nchains = 0; /* safety */ 244 245 kfree(cluster, M_HAMMER2); 246 /* cluster is invalid */ 247 } 248 } 249 250 /* 251 * Lock a cluster. Cluster must already be referenced. Focus is maintained. 252 * 253 * WARNING! This function expects the caller to handle resolution of the 254 * cluster. We never re-resolve the cluster in this function, 255 * because it might be used to temporarily unlock/relock a cparent 256 * in an iteration or recursrion, and the cparents elements do not 257 * necessarily match. 258 */ 259 void 260 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how) 261 { 262 hammer2_chain_t *chain; 263 int i; 264 265 /* cannot be on inode-embedded cluster template, must be on copy */ 266 KKASSERT(cluster->refs > 0); 267 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0); 268 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) { 269 panic("hammer2_cluster_lock: cluster %p already locked!\n", 270 cluster); 271 } 272 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 273 274 /* 275 * Lock chains and resolve state. 276 */ 277 for (i = 0; i < cluster->nchains; ++i) { 278 if (i == idx) 279 continue; 280 chain = cluster->array[i].chain; 281 if (chain == NULL) 282 continue; 283 hammer2_chain_lock(chain, how); 284 } 285 } 286 287 void 288 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 289 { 290 hammer2_cluster_lock_except(cluster, -1, how); 291 } 292 293 /* 294 * Calculate the clustering state for the cluster and set its focus. 295 * This routine must be called with care. For example, it should not 296 * normally be called after relocking a non-leaf cluster because parent 297 * clusters help iterations and each element might be at a slightly different 298 * indirect node (each node's topology is independently indexed). 299 * 300 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 301 * operations. Typically this is only set on a quorum of MASTERs or 302 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 303 * is present, this bit is *not* set on a quorum of MASTERs. The 304 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 305 * that create/modify/delete elements use it. 306 * 307 * The chains making up the cluster may be narrowed down based on quorum 308 * acceptability, and if RESOLVE_RDONLY is specified the chains can be 309 * narrowed down to a single chain as long as the entire subtopology is known 310 * to be intact. So, for example, we can narrow a read-only op to a single 311 * fast SLAVE but if we focus a CACHE chain we must still retain at least 312 * a SLAVE to ensure that the subtopology can be accessed. 313 * 314 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need 315 * to be maintained once the topology is validated as-of the top level of 316 * the operation. 317 * 318 * If a failure occurs the operation must be aborted by higher-level code and 319 * retried. XXX 320 */ 321 void 322 hammer2_cluster_resolve(hammer2_cluster_t *cluster) 323 { 324 hammer2_chain_t *chain; 325 hammer2_chain_t *focus; 326 hammer2_pfs_t *pmp; 327 hammer2_tid_t quorum_tid; 328 hammer2_tid_t last_best_quorum_tid; 329 int focus_pfs_type; 330 uint32_t nflags; 331 int ttlmasters; 332 int ttlslaves; 333 int nmasters; 334 int nslaves; 335 int nquorum; 336 int smpresent; 337 int i; 338 339 cluster->error = 0; 340 cluster->focus = NULL; 341 342 focus_pfs_type = 0; 343 nflags = 0; 344 ttlmasters = 0; 345 ttlslaves = 0; 346 nmasters = 0; 347 nslaves = 0; 348 349 /* 350 * Calculate quorum 351 */ 352 pmp = cluster->pmp; 353 KKASSERT(pmp != NULL || cluster->nchains == 0); 354 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 355 smpresent = 0; 356 357 /* 358 * Pass 1 359 * 360 * NOTE: A NULL chain is not necessarily an error, it could be 361 * e.g. a lookup failure or the end of an iteration. 362 * Process normally. 363 */ 364 for (i = 0; i < cluster->nchains; ++i) { 365 chain = cluster->array[i].chain; 366 if (chain && chain->error) { 367 if (cluster->focus == NULL || cluster->focus == chain) { 368 /* error will be overridden by valid focus */ 369 cluster->error = chain->error; 370 } 371 372 /* 373 * Must count total masters and slaves whether the 374 * chain is errored or not. 375 */ 376 switch (cluster->pmp->pfs_types[i]) { 377 case HAMMER2_PFSTYPE_MASTER: 378 ++ttlmasters; 379 break; 380 case HAMMER2_PFSTYPE_SLAVE: 381 ++ttlslaves; 382 break; 383 } 384 continue; 385 } 386 switch (cluster->pmp->pfs_types[i]) { 387 case HAMMER2_PFSTYPE_MASTER: 388 ++ttlmasters; 389 break; 390 case HAMMER2_PFSTYPE_SLAVE: 391 ++ttlslaves; 392 break; 393 case HAMMER2_PFSTYPE_SOFT_MASTER: 394 nflags |= HAMMER2_CLUSTER_WRSOFT; 395 nflags |= HAMMER2_CLUSTER_RDSOFT; 396 smpresent = 1; 397 break; 398 case HAMMER2_PFSTYPE_SOFT_SLAVE: 399 nflags |= HAMMER2_CLUSTER_RDSOFT; 400 break; 401 case HAMMER2_PFSTYPE_SUPROOT: 402 /* 403 * Degenerate cluster representing the super-root 404 * topology on a single device. Fake stuff so 405 * cluster ops work as expected. 406 */ 407 nflags |= HAMMER2_CLUSTER_WRHARD; 408 nflags |= HAMMER2_CLUSTER_RDHARD; 409 cluster->focus_index = i; 410 cluster->focus = chain; 411 cluster->error = chain ? chain->error : 0; 412 break; 413 default: 414 break; 415 } 416 } 417 418 /* 419 * Pass 2 420 * 421 * Resolve masters. Calculate nmasters for the highest matching 422 * TID, if a quorum cannot be attained try the next lower matching 423 * TID until we exhaust TIDs. 424 * 425 * NOTE: A NULL chain is not necessarily an error, it could be 426 * e.g. a lookup failure or the end of an iteration. 427 * Process normally. 428 */ 429 last_best_quorum_tid = HAMMER2_TID_MAX; 430 quorum_tid = 0; /* fix gcc warning */ 431 432 while (nmasters < nquorum && last_best_quorum_tid != 0) { 433 nmasters = 0; 434 quorum_tid = 0; 435 436 for (i = 0; i < cluster->nchains; ++i) { 437 if (cluster->pmp->pfs_types[i] != 438 HAMMER2_PFSTYPE_MASTER) { 439 continue; 440 } 441 chain = cluster->array[i].chain; 442 443 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 444 /* 445 * Invalid as in unsynchronized, cannot be 446 * used to calculate the quorum. 447 */ 448 } else if (chain == NULL && quorum_tid == 0) { 449 /* 450 * NULL chain on master matches NULL chains 451 * on other masters. 452 */ 453 ++nmasters; 454 } else if (quorum_tid < last_best_quorum_tid && 455 chain != NULL && 456 (quorum_tid < chain->bref.modify_tid || 457 nmasters == 0)) { 458 /* 459 * Better TID located, reset nmasters count. 460 */ 461 nmasters = 1; 462 quorum_tid = chain->bref.modify_tid; 463 } else if (chain && 464 quorum_tid == chain->bref.modify_tid) { 465 /* 466 * TID matches current collection. 467 */ 468 ++nmasters; 469 } 470 } 471 if (nmasters >= nquorum) 472 break; 473 last_best_quorum_tid = quorum_tid; 474 } 475 476 /* 477 * Pass 3 478 * 479 * NOTE: A NULL chain is not necessarily an error, it could be 480 * e.g. a lookup failure or the end of an iteration. 481 * Process normally. 482 */ 483 for (i = 0; i < cluster->nchains; ++i) { 484 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 485 chain = cluster->array[i].chain; 486 if (chain && chain->error) { 487 if (cluster->focus == NULL || cluster->focus == chain) { 488 /* error will be overridden by valid focus */ 489 cluster->error = chain->error; 490 } 491 continue; 492 } 493 494 switch (cluster->pmp->pfs_types[i]) { 495 case HAMMER2_PFSTYPE_MASTER: 496 /* 497 * We must have enough up-to-date masters to reach 498 * a quorum and the master modify_tid must match 499 * the quorum's modify_tid. 500 * 501 * Do not select an errored or out-of-sync master. 502 */ 503 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 504 nflags |= HAMMER2_CLUSTER_UNHARD; 505 } else if (nmasters >= nquorum && 506 (chain == NULL || chain->error == 0) && 507 ((chain == NULL && quorum_tid == 0) || 508 (chain != NULL && quorum_tid == 509 chain->bref.modify_tid))) { 510 nflags |= HAMMER2_CLUSTER_WRHARD; 511 nflags |= HAMMER2_CLUSTER_RDHARD; 512 if (!smpresent) { 513 cluster->array[i].flags |= 514 HAMMER2_CITEM_FEMOD; 515 } 516 if (cluster->focus == NULL || 517 focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) { 518 focus_pfs_type = HAMMER2_PFSTYPE_MASTER; 519 cluster->focus_index = i; 520 cluster->focus = chain; /* NULL ok */ 521 cluster->error = chain ? chain->error : 522 0; 523 } 524 } else if (chain == NULL || chain->error == 0) { 525 nflags |= HAMMER2_CLUSTER_UNHARD; 526 } 527 break; 528 case HAMMER2_PFSTYPE_SLAVE: 529 /* 530 * We must have enough up-to-date masters to reach 531 * a quorum and the slave modify_tid must match the 532 * quorum's modify_tid. 533 * 534 * Do not select an errored slave. 535 */ 536 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) { 537 nflags |= HAMMER2_CLUSTER_UNHARD; 538 } else if (nmasters >= nquorum && 539 (chain == NULL || chain->error == 0) && 540 ((chain == NULL && quorum_tid == 0) || 541 (chain && quorum_tid == 542 chain->bref.modify_tid))) { 543 ++nslaves; 544 nflags |= HAMMER2_CLUSTER_RDHARD; 545 #if 0 546 /* XXX optimize for RESOLVE_RDONLY */ 547 if (cluster->focus == NULL) { 548 focus_pfs_type = HAMMER2_PFSTYPE_SLAVE; 549 cluster->focus_index = i; 550 cluster->focus = chain; /* NULL ok */ 551 cluster->error = chain ? chain->error : 552 0; 553 } 554 #endif 555 } else if (chain == NULL || chain->error == 0) { 556 nflags |= HAMMER2_CLUSTER_UNSOFT; 557 } 558 break; 559 case HAMMER2_PFSTYPE_SOFT_MASTER: 560 /* 561 * Directly mounted soft master always wins. There 562 * should be only one. 563 */ 564 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER); 565 cluster->focus_index = i; 566 cluster->focus = chain; 567 cluster->error = chain ? chain->error : 0; 568 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER; 569 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 570 break; 571 case HAMMER2_PFSTYPE_SOFT_SLAVE: 572 /* 573 * Directly mounted soft slave always wins. There 574 * should be only one. 575 */ 576 KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE); 577 if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) { 578 cluster->focus_index = i; 579 cluster->focus = chain; 580 cluster->error = chain ? chain->error : 0; 581 focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE; 582 } 583 break; 584 case HAMMER2_PFSTYPE_SUPROOT: 585 /* 586 * spmp (degenerate case) 587 */ 588 KKASSERT(i == 0); 589 cluster->focus_index = i; 590 cluster->focus = chain; 591 cluster->error = chain ? chain->error : 0; 592 focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT; 593 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 594 break; 595 default: 596 break; 597 } 598 } 599 600 /* 601 * Focus now set, adjust ddflag. Skip this pass if the focus 602 * is bad or if we are at the PFS root (the bref won't match at 603 * the PFS root, obviously). 604 */ 605 focus = cluster->focus; 606 if (focus) { 607 cluster->ddflag = 608 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); 609 } else { 610 cluster->ddflag = 0; 611 goto skip4; 612 } 613 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 614 goto skip4; 615 616 /* 617 * Pass 4 618 * 619 * Validate the elements that were not marked invalid. They should 620 * match. 621 */ 622 for (i = 0; i < cluster->nchains; ++i) { 623 int ddflag; 624 625 chain = cluster->array[i].chain; 626 627 if (chain == NULL) 628 continue; 629 if (chain == focus) 630 continue; 631 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 632 continue; 633 634 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 635 if (chain->bref.type != focus->bref.type || 636 chain->bref.key != focus->bref.key || 637 chain->bref.keybits != focus->bref.keybits || 638 chain->bref.modify_tid != focus->bref.modify_tid || 639 chain->bytes != focus->bytes || 640 ddflag != cluster->ddflag) { 641 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 642 if (hammer2_debug & 1) 643 kprintf("cluster_resolve: matching modify_tid failed " 644 "bref test: idx=%d type=%02x/%02x " 645 "key=%016jx/%d-%016jx/%d " 646 "mod=%016jx/%016jx bytes=%u/%u\n", 647 i, 648 chain->bref.type, focus->bref.type, 649 chain->bref.key, chain->bref.keybits, 650 focus->bref.key, focus->bref.keybits, 651 chain->bref.modify_tid, focus->bref.modify_tid, 652 chain->bytes, focus->bytes); 653 if (hammer2_debug & 0x4000) 654 panic("cluster_resolve"); 655 /* flag issue and force resync? */ 656 } 657 } 658 skip4: 659 660 if (ttlslaves == 0) 661 nflags |= HAMMER2_CLUSTER_NOSOFT; 662 if (ttlmasters == 0) 663 nflags |= HAMMER2_CLUSTER_NOHARD; 664 665 /* 666 * Set SSYNCED or MSYNCED for slaves and masters respectively if 667 * all available nodes (even if 0 are available) are fully 668 * synchronized. This is used by the synchronization thread to 669 * determine if there is work it could potentially accomplish. 670 */ 671 if (nslaves == ttlslaves) 672 nflags |= HAMMER2_CLUSTER_SSYNCED; 673 if (nmasters == ttlmasters) 674 nflags |= HAMMER2_CLUSTER_MSYNCED; 675 676 /* 677 * Determine if the cluster was successfully locked for the 678 * requested operation and generate an error code. The cluster 679 * will not be locked (or ref'd) if an error is returned. 680 * 681 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok() 682 * to determine if reading or writing is possible. If writing, the 683 * cluster still requires a call to hammer2_cluster_modify() first. 684 */ 685 atomic_set_int(&cluster->flags, nflags); 686 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 687 } 688 689 /* 690 * This is used by the XOPS subsystem to calculate the state of 691 * the collection and tell hammer2_xop_collect() what to do with it. 692 * The collection can be in various states of desynchronization, the 693 * caller specifically wants to resolve the passed-in key. 694 * 695 * Return values: 696 * 0 - Quorum agreement, key is valid 697 * 698 * ENOENT - Quorum agreement, end of scan 699 * 700 * ESRCH - Quorum agreement, key is INVALID (caller should 701 * skip key). 702 * 703 * EIO - Quorum agreement but all elements had errors. 704 * 705 * EDEADLK - No quorum agreement possible for key, a repair 706 * may be needed. Caller has to decide what to do, 707 * possibly iterating the key or generating an EIO. 708 * 709 * EINPROGRESS - No quorum agreement yet, but agreement is still 710 * possible if caller waits for more responses. Caller 711 * should not iterate key. 712 * 713 * XXX needs to handle SOFT_MASTER and SOFT_SLAVE 714 */ 715 int 716 hammer2_cluster_check(hammer2_cluster_t *cluster, hammer2_key_t key, int flags) 717 { 718 hammer2_chain_t *chain; 719 hammer2_chain_t *focus; 720 hammer2_pfs_t *pmp; 721 hammer2_tid_t quorum_tid; 722 hammer2_tid_t last_best_quorum_tid; 723 uint32_t nflags; 724 int ttlmasters; 725 int ttlslaves; 726 int nmasters; 727 int nmasters_keymatch; 728 int nslaves; 729 int nquorum; 730 int umasters; /* unknown masters (still in progress) */ 731 int smpresent; 732 int i; 733 734 cluster->error = 0; 735 cluster->focus = NULL; 736 737 nflags = 0; 738 ttlmasters = 0; 739 ttlslaves = 0; 740 nmasters = 0; 741 nmasters_keymatch = 0; 742 umasters = 0; 743 nslaves = 0; 744 745 /* 746 * Calculate quorum 747 */ 748 pmp = cluster->pmp; 749 KKASSERT(pmp != NULL || cluster->nchains == 0); 750 nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0; 751 smpresent = 0; 752 753 /* 754 * Pass 1 755 * 756 * NOTE: A NULL chain is not necessarily an error, it could be 757 * e.g. a lookup failure or the end of an iteration. 758 * Process normally. 759 */ 760 for (i = 0; i < cluster->nchains; ++i) { 761 cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD; 762 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 763 764 chain = cluster->array[i].chain; 765 if (chain && chain->error) { 766 if (cluster->focus == NULL || cluster->focus == chain) { 767 /* error will be overridden by valid focus */ 768 cluster->error = chain->error; 769 } 770 771 /* 772 * Must count total masters and slaves whether the 773 * chain is errored or not. 774 */ 775 switch (cluster->pmp->pfs_types[i]) { 776 case HAMMER2_PFSTYPE_MASTER: 777 ++ttlmasters; 778 break; 779 case HAMMER2_PFSTYPE_SLAVE: 780 ++ttlslaves; 781 break; 782 } 783 continue; 784 } 785 switch (cluster->pmp->pfs_types[i]) { 786 case HAMMER2_PFSTYPE_MASTER: 787 ++ttlmasters; 788 break; 789 case HAMMER2_PFSTYPE_SLAVE: 790 ++ttlslaves; 791 break; 792 case HAMMER2_PFSTYPE_SOFT_MASTER: 793 nflags |= HAMMER2_CLUSTER_WRSOFT; 794 nflags |= HAMMER2_CLUSTER_RDSOFT; 795 smpresent = 1; 796 break; 797 case HAMMER2_PFSTYPE_SOFT_SLAVE: 798 nflags |= HAMMER2_CLUSTER_RDSOFT; 799 break; 800 case HAMMER2_PFSTYPE_SUPROOT: 801 /* 802 * Degenerate cluster representing the super-root 803 * topology on a single device. Fake stuff so 804 * cluster ops work as expected. 805 */ 806 nflags |= HAMMER2_CLUSTER_WRHARD; 807 nflags |= HAMMER2_CLUSTER_RDHARD; 808 cluster->focus_index = i; 809 cluster->focus = chain; 810 cluster->error = chain ? chain->error : 0; 811 break; 812 default: 813 break; 814 } 815 } 816 817 /* 818 * Pass 2 819 * 820 * Resolve nmasters - master nodes fully match 821 * 822 * Resolve umasters - master nodes operation still 823 * in progress 824 * 825 * Resolve nmasters_keymatch - master nodes match the passed-in 826 * key and may or may not match 827 * the quorum-agreed tid. 828 * 829 * The quorum-agreed TID is the highest matching TID. 830 */ 831 last_best_quorum_tid = HAMMER2_TID_MAX; 832 quorum_tid = 0; /* fix gcc warning */ 833 834 while (nmasters < nquorum && last_best_quorum_tid != 0) { 835 nmasters = 0; 836 quorum_tid = 0; 837 838 for (i = 0; i < cluster->nchains; ++i) { 839 /* XXX SOFT smpresent handling */ 840 if (cluster->pmp->pfs_types[i] != 841 HAMMER2_PFSTYPE_MASTER) { 842 continue; 843 } 844 845 chain = cluster->array[i].chain; 846 847 /* 848 * Skip elements still in progress. umasters keeps 849 * track of masters that might still be in-progress. 850 */ 851 if (chain == NULL && (cluster->array[i].flags & 852 HAMMER2_CITEM_NULL) == 0) { 853 ++umasters; 854 continue; 855 } 856 857 /* 858 * Key match? 859 */ 860 if (flags & HAMMER2_CHECK_NULL) { 861 if (chain == NULL) { 862 ++nmasters; 863 ++nmasters_keymatch; 864 } 865 } else if (chain && 866 (key == (hammer2_key_t)-1 || 867 chain->bref.key == key)) { 868 ++nmasters_keymatch; 869 if (quorum_tid < last_best_quorum_tid && 870 (quorum_tid < chain->bref.modify_tid || 871 nmasters == 0)) { 872 /* 873 * Better TID located, reset 874 * nmasters count. 875 */ 876 nmasters = 0; 877 quorum_tid = chain->bref.modify_tid; 878 } 879 if (quorum_tid == chain->bref.modify_tid) { 880 /* 881 * TID matches current collection. 882 */ 883 ++nmasters; 884 if (chain->error == 0) { 885 cluster->focus = chain; 886 cluster->focus_index = i; 887 } 888 } 889 } 890 } 891 if (nmasters >= nquorum) 892 break; 893 last_best_quorum_tid = quorum_tid; 894 } 895 896 /* 897 kprintf("nmasters %d/%d nmaster_keymatch=%d umasters=%d\n", 898 nmasters, nquorum, nmasters_keymatch, umasters); 899 */ 900 901 /* 902 * Early return if we do not have enough masters. 903 */ 904 if (nmasters < nquorum) { 905 if (nmasters + umasters >= nquorum) 906 return EINPROGRESS; 907 if (nmasters_keymatch < nquorum) 908 return ESRCH; 909 return EDEADLK; 910 } 911 912 /* 913 * Validated end of scan. 914 */ 915 if (flags & HAMMER2_CHECK_NULL) 916 return ENOENT; 917 918 /* 919 * If we have a NULL focus at this point the agreeing quorum all 920 * had chain errors. 921 */ 922 if (cluster->focus == NULL) 923 return EIO; 924 925 /* 926 * Pass 3 927 * 928 * We have quorum agreement, validate elements, not end of scan. 929 */ 930 for (i = 0; i < cluster->nchains; ++i) { 931 chain = cluster->array[i].chain; 932 if (chain == NULL || 933 chain->bref.key != key || 934 chain->bref.modify_tid != quorum_tid) { 935 continue; 936 } 937 938 switch (cluster->pmp->pfs_types[i]) { 939 case HAMMER2_PFSTYPE_MASTER: 940 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 941 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 942 nflags |= HAMMER2_CLUSTER_WRHARD; 943 nflags |= HAMMER2_CLUSTER_RDHARD; 944 break; 945 case HAMMER2_PFSTYPE_SLAVE: 946 /* 947 * We must have enough up-to-date masters to reach 948 * a quorum and the slave modify_tid must match the 949 * quorum's modify_tid. 950 * 951 * Do not select an errored slave. 952 */ 953 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 954 nflags |= HAMMER2_CLUSTER_RDHARD; 955 ++nslaves; 956 break; 957 case HAMMER2_PFSTYPE_SOFT_MASTER: 958 /* 959 * Directly mounted soft master always wins. There 960 * should be only one. 961 */ 962 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 963 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 964 break; 965 case HAMMER2_PFSTYPE_SOFT_SLAVE: 966 /* 967 * Directly mounted soft slave always wins. There 968 * should be only one. 969 * 970 * XXX 971 */ 972 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 973 break; 974 case HAMMER2_PFSTYPE_SUPROOT: 975 /* 976 * spmp (degenerate case) 977 */ 978 cluster->array[i].flags |= HAMMER2_CITEM_FEMOD; 979 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 980 break; 981 default: 982 break; 983 } 984 } 985 986 /* 987 * Focus now set, adjust ddflag. Skip this pass if the focus 988 * is bad or if we are at the PFS root (the bref won't match at 989 * the PFS root, obviously). 990 */ 991 focus = cluster->focus; 992 if (focus) { 993 cluster->ddflag = 994 (cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE); 995 } else { 996 cluster->ddflag = 0; 997 goto skip4; 998 } 999 if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 1000 goto skip4; 1001 1002 /* 1003 * Pass 4 1004 * 1005 * Validate the elements that were not marked invalid. They should 1006 * match. 1007 */ 1008 for (i = 0; i < cluster->nchains; ++i) { 1009 int ddflag; 1010 1011 chain = cluster->array[i].chain; 1012 1013 if (chain == NULL) 1014 continue; 1015 if (chain == focus) 1016 continue; 1017 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) 1018 continue; 1019 1020 ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE); 1021 if (chain->bref.type != focus->bref.type || 1022 chain->bref.key != focus->bref.key || 1023 chain->bref.keybits != focus->bref.keybits || 1024 chain->bref.modify_tid != focus->bref.modify_tid || 1025 chain->bytes != focus->bytes || 1026 ddflag != cluster->ddflag) { 1027 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1028 if (hammer2_debug & 1) 1029 kprintf("cluster_resolve: matching modify_tid failed " 1030 "bref test: idx=%d type=%02x/%02x " 1031 "key=%016jx/%d-%016jx/%d " 1032 "mod=%016jx/%016jx bytes=%u/%u\n", 1033 i, 1034 chain->bref.type, focus->bref.type, 1035 chain->bref.key, chain->bref.keybits, 1036 focus->bref.key, focus->bref.keybits, 1037 chain->bref.modify_tid, focus->bref.modify_tid, 1038 chain->bytes, focus->bytes); 1039 if (hammer2_debug & 0x4000) 1040 panic("cluster_resolve"); 1041 /* flag issue and force resync? */ 1042 } 1043 } 1044 skip4: 1045 1046 if (ttlslaves == 0) 1047 nflags |= HAMMER2_CLUSTER_NOSOFT; 1048 if (ttlmasters == 0) 1049 nflags |= HAMMER2_CLUSTER_NOHARD; 1050 1051 /* 1052 * Set SSYNCED or MSYNCED for slaves and masters respectively if 1053 * all available nodes (even if 0 are available) are fully 1054 * synchronized. This is used by the synchronization thread to 1055 * determine if there is work it could potentially accomplish. 1056 */ 1057 if (nslaves == ttlslaves) 1058 nflags |= HAMMER2_CLUSTER_SSYNCED; 1059 if (nmasters == ttlmasters) 1060 nflags |= HAMMER2_CLUSTER_MSYNCED; 1061 1062 /* 1063 * Determine if the cluster was successfully locked for the 1064 * requested operation and generate an error code. The cluster 1065 * will not be locked (or ref'd) if an error is returned. 1066 * 1067 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok() 1068 * to determine if reading or writing is possible. If writing, the 1069 * cluster still requires a call to hammer2_cluster_modify() first. 1070 */ 1071 atomic_set_int(&cluster->flags, nflags); 1072 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags); 1073 1074 return 0; 1075 } 1076 1077 /* 1078 * This is used by the sync thread to force non-NULL elements of a copy 1079 * of the pmp->iroot cluster to be good which is required to prime the 1080 * sync. 1081 */ 1082 void 1083 hammer2_cluster_forcegood(hammer2_cluster_t *cluster) 1084 { 1085 int i; 1086 1087 for (i = 0; i < cluster->nchains; ++i) { 1088 if (cluster->array[i].chain) 1089 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1090 } 1091 } 1092 1093 /* 1094 * Copy a cluster, returned a ref'd cluster. All underlying chains 1095 * are also ref'd, but not locked. Focus state is also copied. 1096 * 1097 * Original cluster does not have to be locked but usually is. 1098 * New cluster will not be flagged as locked. 1099 * 1100 * Callers using this function to initialize a new cluster from an inode 1101 * generally lock and resolve the resulting cluster. 1102 * 1103 * Callers which use this function to save/restore a cluster structure 1104 * generally retain the focus state and do not re-resolve it. Caller should 1105 * not try to re-resolve internal (cparent) node state during an iteration 1106 * as the individual tracking elements of cparent in an iteration may not 1107 * match even though they are correct. 1108 */ 1109 hammer2_cluster_t * 1110 hammer2_cluster_copy(hammer2_cluster_t *ocluster) 1111 { 1112 hammer2_pfs_t *pmp = ocluster->pmp; 1113 hammer2_cluster_t *ncluster; 1114 hammer2_chain_t *chain; 1115 int i; 1116 1117 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 1118 ncluster->pmp = pmp; 1119 ncluster->nchains = ocluster->nchains; 1120 ncluster->refs = 1; 1121 1122 for (i = 0; i < ocluster->nchains; ++i) { 1123 chain = ocluster->array[i].chain; 1124 ncluster->array[i].chain = chain; 1125 ncluster->array[i].flags = ocluster->array[i].flags; 1126 if (chain) 1127 hammer2_chain_ref(chain); 1128 } 1129 ncluster->focus_index = ocluster->focus_index; 1130 ncluster->focus = ocluster->focus; 1131 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED | 1132 HAMMER2_CLUSTER_INODE); 1133 1134 return (ncluster); 1135 } 1136 1137 /* 1138 * Unlock a cluster. Refcount and focus is maintained. 1139 */ 1140 void 1141 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx) 1142 { 1143 hammer2_chain_t *chain; 1144 int i; 1145 1146 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 1147 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 1148 cluster); 1149 } 1150 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 1151 KKASSERT(cluster->refs > 0); 1152 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 1153 1154 for (i = 0; i < cluster->nchains; ++i) { 1155 if (i == idx) 1156 continue; 1157 chain = cluster->array[i].chain; 1158 if (chain) 1159 hammer2_chain_unlock(chain); 1160 } 1161 } 1162 1163 void 1164 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 1165 { 1166 hammer2_cluster_unlock_except(cluster, -1); 1167 } 1168 1169 /* 1170 * Set an inode's cluster modified, marking the related chains RW and 1171 * duplicating them if necessary. 1172 * 1173 * The passed-in chain is a localized copy of the chain previously acquired 1174 * when the inode was locked (and possilby replaced in the mean time), and 1175 * must also be updated. In fact, we update it first and then synchronize 1176 * the inode's cluster cache. 1177 */ 1178 hammer2_inode_data_t * 1179 hammer2_cluster_modify_ip(hammer2_inode_t *ip, 1180 hammer2_cluster_t *cluster, int flags) 1181 { 1182 hammer2_inode_modify(ip); 1183 hammer2_cluster_modify(cluster, flags); 1184 hammer2_inode_repoint(ip, NULL, cluster); 1185 return (&hammer2_cluster_wdata(cluster)->ipdata); 1186 } 1187 1188 /* 1189 * Adjust the cluster's chains to allow modification and adjust the 1190 * focus. Data will be accessible on return. 1191 * 1192 * If our focused master errors on modify, re-resolve the cluster to 1193 * try to select a different master. 1194 */ 1195 void 1196 hammer2_cluster_modify(hammer2_cluster_t *cluster, int flags) 1197 { 1198 hammer2_chain_t *chain; 1199 int resolve_again; 1200 int i; 1201 1202 resolve_again = 0; 1203 for (i = 0; i < cluster->nchains; ++i) { 1204 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1205 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1206 continue; 1207 } 1208 chain = cluster->array[i].chain; 1209 if (chain == NULL) 1210 continue; 1211 if (chain->error) 1212 continue; 1213 hammer2_chain_modify(chain, flags); 1214 if (cluster->focus == chain && chain->error) { 1215 cluster->error = chain->error; 1216 resolve_again = 1; 1217 } 1218 } 1219 if (resolve_again) 1220 hammer2_cluster_resolve(cluster); 1221 } 1222 1223 /* 1224 * Synchronize modifications from the focus to other chains in a cluster. 1225 * Convenient because nominal API users can just modify the contents of the 1226 * focus (at least for non-blockref data). 1227 * 1228 * Nominal front-end operations only edit non-block-table data in a single 1229 * chain. This code copies such modifications to the other chains in the 1230 * cluster. Blocktable modifications are handled on a chain-by-chain basis 1231 * by both the frontend and the backend and will explode in fireworks if 1232 * blindly copied. 1233 */ 1234 void 1235 hammer2_cluster_modsync(hammer2_cluster_t *cluster) 1236 { 1237 hammer2_chain_t *focus; 1238 hammer2_chain_t *scan; 1239 const hammer2_inode_data_t *ripdata; 1240 hammer2_inode_data_t *wipdata; 1241 int i; 1242 1243 focus = cluster->focus; 1244 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED); 1245 1246 for (i = 0; i < cluster->nchains; ++i) { 1247 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 1248 continue; 1249 scan = cluster->array[i].chain; 1250 if (scan == NULL || scan == focus) 1251 continue; 1252 if (scan->error) 1253 continue; 1254 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED); 1255 KKASSERT(focus->bytes == scan->bytes && 1256 focus->bref.type == scan->bref.type); 1257 switch(focus->bref.type) { 1258 case HAMMER2_BREF_TYPE_INODE: 1259 ripdata = &focus->data->ipdata; 1260 wipdata = &scan->data->ipdata; 1261 if ((ripdata->meta.op_flags & 1262 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1263 bcopy(ripdata, wipdata, 1264 offsetof(hammer2_inode_data_t, u)); 1265 break; 1266 } 1267 /* fall through to full copy */ 1268 case HAMMER2_BREF_TYPE_DATA: 1269 bcopy(focus->data, scan->data, focus->bytes); 1270 break; 1271 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1272 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1273 case HAMMER2_BREF_TYPE_FREEMAP: 1274 case HAMMER2_BREF_TYPE_VOLUME: 1275 panic("hammer2_cluster_modsync: illegal node type"); 1276 /* NOT REACHED */ 1277 break; 1278 default: 1279 panic("hammer2_cluster_modsync: unknown node type"); 1280 break; 1281 } 1282 } 1283 } 1284 1285 /* 1286 * Lookup initialization/completion API. Returns a locked, fully resolved 1287 * cluster with one ref. 1288 */ 1289 hammer2_cluster_t * 1290 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) 1291 { 1292 hammer2_cluster_t *cluster; 1293 1294 cluster = hammer2_cluster_copy(cparent); 1295 if (flags & HAMMER2_LOOKUP_SHARED) { 1296 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | 1297 HAMMER2_RESOLVE_SHARED); 1298 } else { 1299 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 1300 } 1301 hammer2_cluster_resolve(cluster); 1302 1303 return (cluster); 1304 } 1305 1306 void 1307 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) 1308 { 1309 if (cparent) { 1310 hammer2_cluster_unlock(cparent); 1311 hammer2_cluster_drop(cparent); 1312 } 1313 } 1314 1315 /* 1316 * Locate first match or overlap under parent, return a new, locked, resolved 1317 * cluster with one ref. 1318 * 1319 * Must never be called with HAMMER2_LOOKUP_MATCHIND. 1320 */ 1321 hammer2_cluster_t * 1322 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp, 1323 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 1324 { 1325 hammer2_pfs_t *pmp; 1326 hammer2_cluster_t *cluster; 1327 hammer2_chain_t *chain; 1328 hammer2_key_t key_accum; 1329 hammer2_key_t key_next; 1330 int null_count; 1331 int rflags; 1332 int i; 1333 1334 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0); 1335 1336 pmp = cparent->pmp; /* can be NULL */ 1337 key_accum = *key_nextp; 1338 null_count = 0; 1339 if (flags & HAMMER2_LOOKUP_SHARED) 1340 rflags = HAMMER2_RESOLVE_SHARED; 1341 else 1342 rflags = 0; 1343 1344 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 1345 cluster->pmp = pmp; /* can be NULL */ 1346 cluster->refs = 1; 1347 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1348 cluster->flags |= HAMMER2_CLUSTER_LOCKED; 1349 1350 /* 1351 * Iterating earlier cluster elements with later elements still 1352 * locked is a problem, so we have to unlock the parent and then 1353 * re-lock as we go. 1354 */ 1355 hammer2_cluster_unlock(cparent); 1356 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1357 1358 /* 1359 * Pass-1, issue lookups. 1360 */ 1361 for (i = 0; i < cparent->nchains; ++i) { 1362 cluster->array[i].flags = cparent->array[i].flags; 1363 key_next = *key_nextp; 1364 1365 /* 1366 * Always relock the parent as we go. 1367 */ 1368 if (cparent->array[i].chain) { 1369 hammer2_chain_lock(cparent->array[i].chain, rflags); 1370 } 1371 1372 /* 1373 * Nothing to base the lookup, or parent was not synchronized. 1374 */ 1375 if (cparent->array[i].chain == NULL || 1376 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) { 1377 ++null_count; 1378 continue; 1379 } 1380 1381 chain = hammer2_chain_lookup(&cparent->array[i].chain, 1382 &key_next, 1383 key_beg, key_end, 1384 &cparent->array[i].cache_index, 1385 flags); 1386 cluster->array[i].chain = chain; 1387 if (chain == NULL) { 1388 ++null_count; 1389 } 1390 if (key_accum > key_next) 1391 key_accum = key_next; 1392 } 1393 1394 /* 1395 * Cleanup 1396 */ 1397 cluster->nchains = i; 1398 *key_nextp = key_accum; 1399 1400 /* 1401 * The cluster must be resolved, out of sync elements may be present. 1402 * 1403 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL. 1404 */ 1405 if (null_count != i) 1406 hammer2_cluster_resolve(cluster); 1407 if (null_count == i || 1408 (cluster->focus == NULL && 1409 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) { 1410 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1411 hammer2_cluster_unlock(cluster); 1412 hammer2_cluster_drop(cluster); 1413 cluster = NULL; 1414 } 1415 1416 return (cluster); 1417 } 1418 1419 /* 1420 * Locate next match or overlap under parent, replace the passed-in cluster. 1421 * The returned cluster is a new, locked, resolved cluster with one ref. 1422 * 1423 * Must never be called with HAMMER2_LOOKUP_MATCHIND. 1424 */ 1425 hammer2_cluster_t * 1426 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 1427 hammer2_key_t *key_nextp, 1428 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 1429 { 1430 hammer2_chain_t *ochain; 1431 hammer2_chain_t *nchain; 1432 hammer2_key_t key_accum; 1433 hammer2_key_t key_next; 1434 int parent_index; 1435 int cluster_index; 1436 int null_count; 1437 int rflags; 1438 int i; 1439 1440 KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0); 1441 1442 key_accum = *key_nextp; 1443 null_count = 0; 1444 parent_index = cparent->focus_index; /* save prior focus */ 1445 cluster_index = cluster->focus_index; 1446 if (flags & HAMMER2_LOOKUP_SHARED) 1447 rflags = HAMMER2_RESOLVE_SHARED; 1448 else 1449 rflags = 0; 1450 1451 cluster->focus = NULL; /* XXX needed any more? */ 1452 /*cparent->focus = NULL;*/ 1453 cluster->focus_index = 0; /* XXX needed any more? */ 1454 /*cparent->focus_index = 0;*/ 1455 1456 cluster->ddflag = 0; 1457 1458 /* 1459 * The parent is always locked on entry, the iterator may be locked 1460 * depending on flags. 1461 * 1462 * We must temporarily unlock the passed-in clusters to avoid a 1463 * deadlock between elements of the cluster with other threads. 1464 * We will fixup the lock in the loop. 1465 * 1466 * Note that this will clear the focus. 1467 * 1468 * Reflag the clusters as locked, because we will relock them 1469 * as we go. 1470 */ 1471 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) { 1472 hammer2_cluster_unlock(cluster); 1473 cluster->flags |= HAMMER2_CLUSTER_LOCKED; 1474 } 1475 hammer2_cluster_unlock(cparent); 1476 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1477 1478 for (i = 0; i < cparent->nchains; ++i) { 1479 key_next = *key_nextp; 1480 ochain = cluster->array[i].chain; 1481 1482 /* 1483 * Always relock the parent as we go. 1484 */ 1485 if (cparent->array[i].chain) 1486 hammer2_chain_lock(cparent->array[i].chain, rflags); 1487 1488 /* 1489 * Nothing to iterate from. These cases can occur under 1490 * normal operations. For example, during synchronization 1491 * a slave might reach the end of its scan while records 1492 * are still left on the master(s). 1493 */ 1494 if (ochain == NULL) { 1495 ++null_count; 1496 continue; 1497 } 1498 if (cparent->array[i].chain == NULL || 1499 (cparent->array[i].flags & HAMMER2_CITEM_INVALID) || 1500 (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) { 1501 /* ochain has not yet been relocked */ 1502 hammer2_chain_drop(ochain); 1503 cluster->array[i].chain = NULL; 1504 ++null_count; 1505 continue; 1506 } 1507 1508 /* 1509 * Relock the child if necessary. Parent and child will then 1510 * be locked as expected by hammer2_chain_next() and flags. 1511 */ 1512 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1513 hammer2_chain_lock(ochain, rflags); 1514 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain, 1515 &key_next, key_beg, key_end, 1516 &cparent->array[i].cache_index, 1517 flags); 1518 /* ochain now invalid but can still be used for focus check */ 1519 if (parent_index == i) { 1520 cparent->focus_index = i; 1521 cparent->focus = cparent->array[i].chain; 1522 } 1523 1524 cluster->array[i].chain = nchain; 1525 if (nchain == NULL) { 1526 ++null_count; 1527 } 1528 if (key_accum > key_next) 1529 key_accum = key_next; 1530 } 1531 1532 /* 1533 * Cleanup 1534 */ 1535 cluster->nchains = i; 1536 *key_nextp = key_accum; 1537 1538 /* 1539 * The cluster must be resolved, out of sync elements may be present. 1540 * 1541 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL. 1542 */ 1543 if (null_count != i) 1544 hammer2_cluster_resolve(cluster); 1545 if (null_count == i || 1546 (cluster->focus == NULL && 1547 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) { 1548 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1549 hammer2_cluster_unlock(cluster); 1550 hammer2_cluster_drop(cluster); 1551 cluster = NULL; 1552 } 1553 return(cluster); 1554 } 1555 1556 /* 1557 * Advance just one chain in the cluster and recalculate the invalid bit. 1558 * The cluster index is allowed to be flagged invalid on input and is 1559 * recalculated on return. 1560 * 1561 * (used during synchronization to advance past a chain being deleted). 1562 * 1563 * The chain being advanced must not be the focus and the clusters in 1564 * question must have already passed normal cluster_lookup/cluster_next 1565 * checks. 1566 * 1567 * The cluster always remains intact on return, so void function. 1568 */ 1569 void 1570 hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent, 1571 hammer2_cluster_t *cluster, 1572 hammer2_key_t *key_nextp, 1573 hammer2_key_t key_beg, 1574 hammer2_key_t key_end, 1575 int i, int flags) 1576 { 1577 hammer2_chain_t *ochain; 1578 hammer2_chain_t *nchain; 1579 hammer2_chain_t *focus; 1580 hammer2_key_t key_accum; 1581 hammer2_key_t key_next; 1582 int ddflag; 1583 1584 key_accum = *key_nextp; 1585 key_next = *key_nextp; 1586 ochain = cluster->array[i].chain; 1587 if (ochain == NULL) 1588 goto done; 1589 KKASSERT(ochain != cluster->focus); 1590 1591 nchain = hammer2_chain_next(&cparent->array[i].chain, ochain, 1592 &key_next, key_beg, key_end, 1593 &cparent->array[i].cache_index, 1594 flags); 1595 /* ochain now invalid */ 1596 if (cparent->focus_index == i) 1597 cparent->focus = cparent->array[i].chain; 1598 1599 /* 1600 * Install nchain. Note that nchain can be NULL, and can also 1601 * be in an unlocked state depending on flags. 1602 */ 1603 cluster->array[i].chain = nchain; 1604 cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID; 1605 1606 if (key_accum > key_next) 1607 key_accum = key_next; 1608 1609 focus = cluster->focus; 1610 if (focus == NULL) 1611 goto done; 1612 if (nchain == NULL) 1613 goto done; 1614 #if 0 1615 if (nchain == focus) /* ASSERTED NOT TRUE */ 1616 ... 1617 #endif 1618 ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE); 1619 if (nchain->bref.type != focus->bref.type || 1620 nchain->bref.key != focus->bref.key || 1621 nchain->bref.keybits != focus->bref.keybits || 1622 nchain->bref.modify_tid != focus->bref.modify_tid || 1623 nchain->bytes != focus->bytes || 1624 ddflag != cluster->ddflag) { 1625 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1626 } 1627 1628 done: 1629 *key_nextp = key_accum; 1630 #if 0 1631 /* 1632 * For now don't re-resolve cluster->flags. 1633 */ 1634 hammer2_cluster_resolve(cluster); 1635 #endif 1636 } 1637 1638 /* 1639 * Mark a cluster deleted 1640 */ 1641 void 1642 hammer2_cluster_delete(hammer2_cluster_t *cparent, 1643 hammer2_cluster_t *cluster, int flags) 1644 { 1645 hammer2_chain_t *chain; 1646 hammer2_chain_t *parent; 1647 int i; 1648 1649 if (cparent == NULL) { 1650 kprintf("cparent is NULL\n"); 1651 return; 1652 } 1653 1654 for (i = 0; i < cluster->nchains; ++i) { 1655 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 1656 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 1657 continue; 1658 } 1659 parent = cparent->array[i].chain; 1660 chain = cluster->array[i].chain; 1661 if (chain == NULL) 1662 continue; 1663 if (chain->parent != parent) { 1664 kprintf("hammer2_cluster_delete: parent " 1665 "mismatch chain=%p parent=%p against=%p\n", 1666 chain, chain->parent, parent); 1667 } else { 1668 hammer2_chain_delete(parent, chain, flags); 1669 } 1670 } 1671 } 1672 1673 /* 1674 * Create a snapshot of the specified {parent, ochain} with the specified 1675 * label. The originating hammer2_inode must be exclusively locked for 1676 * safety. 1677 * 1678 * The ioctl code has already synced the filesystem. 1679 */ 1680 int 1681 hammer2_cluster_snapshot(hammer2_cluster_t *ocluster, 1682 hammer2_ioc_pfs_t *pmp) 1683 { 1684 hammer2_dev_t *hmp; 1685 const hammer2_inode_data_t *ripdata; 1686 hammer2_inode_data_t *wipdata; 1687 hammer2_chain_t *nchain; 1688 hammer2_inode_t *nip; 1689 size_t name_len; 1690 hammer2_key_t lhc; 1691 struct vattr vat; 1692 #if 0 1693 uuid_t opfs_clid; 1694 #endif 1695 int error; 1696 1697 kprintf("snapshot %s\n", pmp->name); 1698 1699 name_len = strlen(pmp->name); 1700 lhc = hammer2_dirhash(pmp->name, name_len); 1701 1702 /* 1703 * Get the clid 1704 */ 1705 ripdata = &hammer2_cluster_rdata(ocluster)->ipdata; 1706 #if 0 1707 opfs_clid = ripdata->meta.pfs_clid; 1708 #endif 1709 hmp = ocluster->focus->hmp; /* XXX find synchronized local disk */ 1710 1711 /* 1712 * Create the snapshot directory under the super-root 1713 * 1714 * Set PFS type, generate a unique filesystem id, and generate 1715 * a cluster id. Use the same clid when snapshotting a PFS root, 1716 * which theoretically allows the snapshot to be used as part of 1717 * the same cluster (perhaps as a cache). 1718 * 1719 * Copy the (flushed) blockref array. Theoretically we could use 1720 * chain_duplicate() but it becomes difficult to disentangle 1721 * the shared core so for now just brute-force it. 1722 */ 1723 VATTR_NULL(&vat); 1724 vat.va_type = VDIR; 1725 vat.va_mode = 0755; 1726 nip = hammer2_inode_create(hmp->spmp->iroot, &vat, proc0.p_ucred, 1727 pmp->name, name_len, 0, 1728 1, 0, 0, 1729 HAMMER2_INSERT_PFSROOT, &error); 1730 1731 if (nip) { 1732 hammer2_inode_modify(nip); 1733 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS); 1734 hammer2_chain_modify(nchain, 0); 1735 wipdata = &nchain->data->ipdata; 1736 1737 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER; 1738 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT; 1739 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT; 1740 kern_uuidgen(&nip->meta.pfs_fsid, 1); 1741 1742 /* 1743 * Give the snapshot its own private cluster id. As a 1744 * snapshot no further synchronization with the original 1745 * cluster will be done. 1746 */ 1747 #if 0 1748 if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY) 1749 nip->meta.pfs_clid = opfs_clid; 1750 else 1751 kern_uuidgen(&nip->meta.pfs_clid, 1); 1752 #endif 1753 kern_uuidgen(&nip->meta.pfs_clid, 1); 1754 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 1755 1756 /* XXX hack blockset copy */ 1757 /* XXX doesn't work with real cluster */ 1758 KKASSERT(ocluster->nchains == 1); 1759 wipdata->meta = nip->meta; 1760 wipdata->u.blockset = ripdata->u.blockset; 1761 hammer2_flush(nchain, 1); 1762 hammer2_chain_unlock(nchain); 1763 hammer2_chain_drop(nchain); 1764 hammer2_inode_unlock(nip); 1765 } 1766 return (error); 1767 } 1768 1769 /************************************************************************ 1770 * CLUSTER I/O * 1771 ************************************************************************ 1772 * 1773 * 1774 * WARNING! blockref[] array data is not universal. These functions should 1775 * only be used to access universal data. 1776 * 1777 * NOTE! The rdata call will wait for at least one of the chain I/Os to 1778 * complete if necessary. The I/O's should have already been 1779 * initiated by the cluster_lock/chain_lock operation. 1780 * 1781 * The cluster must already be in a modified state before wdata 1782 * is called. The data will already be available for this case. 1783 */ 1784 const hammer2_media_data_t * 1785 hammer2_cluster_rdata(hammer2_cluster_t *cluster) 1786 { 1787 KKASSERT(cluster->focus != NULL); 1788 return(cluster->focus->data); 1789 } 1790 1791 const hammer2_media_data_t * 1792 hammer2_cluster_rdata_bytes(hammer2_cluster_t *cluster, size_t *bytesp) 1793 { 1794 KKASSERT(cluster->focus != NULL); 1795 *bytesp = cluster->focus->bytes; 1796 return(cluster->focus->data); 1797 } 1798 1799 hammer2_media_data_t * 1800 hammer2_cluster_wdata(hammer2_cluster_t *cluster) 1801 { 1802 KKASSERT(cluster->focus != NULL); 1803 KKASSERT(hammer2_cluster_modified(cluster)); 1804 return(cluster->focus->data); 1805 } 1806