1 /* 2 * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information from different nodes into a single entity. 
It allows direct 37 * access to media data as long as it is not blockref array data (which 38 * will obviously have to be different at each node). 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 * 48 * WARNING! This module is *extremely* complex. It must issue asynchronous 49 * locks and I/O, do quorum and/or master-slave processing, and 50 * it must operate properly even if some nodes are broken (which 51 * can also mean indefinite locks). 52 * 53 * CLUSTER OPERATIONS 54 * 55 * Cluster operations can be broken down into three pieces: 56 * 57 * (1) Chain locking and data retrieval. 58 * hammer2_cluster_lock() 59 * hammer2_cluster_parent() 60 * 61 * - Most complex functions, quorum management on transaction ids. 62 * 63 * - Locking and data accesses must be internally asynchronous. 64 * 65 * - Validate and manage cache coherency primitives (cache state 66 * is stored in chain topologies but must be validated by these 67 * functions). 68 * 69 * (2) Lookups and Scans 70 * hammer2_cluster_lookup() 71 * hammer2_cluster_next() 72 * 73 * - Depend on locking & data retrieval functions, but still complex. 74 * 75 * - Must do quorum management on transaction ids. 76 * 77 * - Lookup and Iteration ops Must be internally asynchronous. 78 * 79 * (3) Modifying Operations 80 * hammer2_cluster_create() 81 * hammer2_cluster_rename() 82 * hammer2_cluster_delete() 83 * hammer2_cluster_modify() 84 * hammer2_cluster_modsync() 85 * 86 * - Can usually punt on failures, operation continues unless quorum 87 * is lost. If quorum is lost, must wait for resynchronization 88 * (depending on the management mode). 
 *
 *	- Must disconnect node on failures (also not flush), remount, and
 *	  resynchronize.
 *
 *	- Network links (via kdmsg) are relatively easy to issue as the
 *	  complex underworkings of hammer2_chain.c don't have to messed
 *	  with (the protocol is at a higher level than block-level).
 *
 *	- Multiple local disk nodes (i.e. block devices) are another matter.
 *	  Chain operations have to be dispatched to per-node threads (xN)
 *	  because we can't asynchronize potentially very complex chain
 *	  operations in hammer2_chain.c (it would be a huge mess).
 *
 *	  (these threads are also used to terminate incoming kdmsg ops from
 *	  other machines).
 *
 *	- Single-node filesystems do not use threads and will simply call
 *	  hammer2_chain.c functions directly.  This short-cut is handled
 *	  at the base of each cluster function.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Returns non-zero if any chain in the cluster needs to be resized.
 * Errored elements are not used in the calculation.
 *
 * Only front-end-modifiable (FEMOD) elements participate; missing (NULL)
 * or errored chains are skipped.  The cluster must be locked.
 */
int
hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED);
	for (i = 0; i < cluster->nchains; ++i) {
		/* only elements the front-end may modify are considered */
		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
			continue;
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		if (chain->error)
			continue;
		/* any size mismatch on a usable element forces a resize */
		if (chain->bytes != bytes)
			return 1;
	}
	return 0;
}

/*
 * Returns the bref type of the cluster's focus.
 *
 * If the cluster is errored, returns HAMMER2_BREF_TYPE_EMPTY (0).
 * The cluster must be locked.
148 */ 149 uint8_t 150 hammer2_cluster_type(hammer2_cluster_t *cluster) 151 { 152 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 153 if (cluster->error == 0) 154 return(cluster->focus->bref.type); 155 return 0; 156 } 157 158 /* 159 * Returns non-zero if the cluster's focus is flagged as being modified. 160 * 161 * If the cluster is errored, returns 0. 162 */ 163 int 164 hammer2_cluster_modified(hammer2_cluster_t *cluster) 165 { 166 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 167 if (cluster->error == 0) 168 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 169 return 0; 170 } 171 172 /* 173 * Returns the bref of the cluster's focus, sans any data-offset information 174 * (since offset information is per-node and wouldn't be useful). 175 * 176 * Callers use this function to access modify_tid, mirror_tid, type, 177 * key, and keybits. 178 * 179 * If the cluster is errored, returns an empty bref. 180 * The cluster must be locked. 181 */ 182 void 183 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 184 { 185 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 186 if (cluster->error == 0) { 187 *bref = cluster->focus->bref; 188 bref->data_off = 0; 189 } else { 190 bzero(bref, sizeof(*bref)); 191 } 192 } 193 194 /* 195 * Flag the cluster for flushing recursively up to the root. Despite the 196 * work it does, this is relatively benign. It just makes sure that the 197 * flusher has top-down visibility to this cluster. 198 * 199 * Errored chains are not flagged for flushing. 200 * 201 * The cluster should probably be locked. 
202 */ 203 void 204 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 205 { 206 hammer2_chain_t *chain; 207 int i; 208 209 for (i = 0; i < cluster->nchains; ++i) { 210 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) 211 continue; 212 chain = cluster->array[i].chain; 213 if (chain == NULL) 214 continue; 215 if (chain->error) 216 continue; 217 hammer2_chain_setflush(trans, chain); 218 } 219 } 220 221 /* 222 * Set the check mode for the cluster. 223 * Errored elements of the cluster are ignored. 224 * 225 * The cluster must be locked and modified. 226 */ 227 void 228 hammer2_cluster_setmethod_check(hammer2_trans_t *trans, 229 hammer2_cluster_t *cluster, 230 int check_algo) 231 { 232 hammer2_chain_t *chain; 233 int i; 234 235 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 236 for (i = 0; i < cluster->nchains; ++i) { 237 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 238 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 239 continue; 240 } 241 chain = cluster->array[i].chain; 242 if (chain == NULL) 243 continue; 244 if (chain->error) 245 continue; 246 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 247 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1); 248 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); 249 } 250 } 251 252 /* 253 * Create a degenerate cluster with one ref from a single locked chain. 254 * The returned cluster will be focused on the chain and inherit its 255 * error state. 256 * 257 * The chain's lock and reference are transfered to the new cluster, so 258 * the caller should not try to unlock the chain separately. 259 * 260 * We fake the flags. 
261 */ 262 hammer2_cluster_t * 263 hammer2_cluster_from_chain(hammer2_chain_t *chain) 264 { 265 hammer2_cluster_t *cluster; 266 267 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 268 cluster->array[0].chain = chain; 269 cluster->array[0].flags = HAMMER2_CITEM_FEMOD; 270 cluster->nchains = 1; 271 cluster->focus = chain; 272 cluster->focus_index = 0; 273 cluster->pmp = chain->pmp; 274 cluster->refs = 1; 275 cluster->error = chain->error; 276 cluster->flags = HAMMER2_CLUSTER_LOCKED | 277 HAMMER2_CLUSTER_WRHARD | 278 HAMMER2_CLUSTER_RDHARD | 279 HAMMER2_CLUSTER_MSYNCED | 280 HAMMER2_CLUSTER_SSYNCED; 281 282 return cluster; 283 } 284 285 /* 286 * Add a reference to a cluster and its underlying chains. 287 * 288 * We must also ref the underlying chains in order to allow ref/unlock 289 * sequences to later re-lock. 290 */ 291 void 292 hammer2_cluster_ref(hammer2_cluster_t *cluster) 293 { 294 atomic_add_int(&cluster->refs, 1); 295 } 296 297 /* 298 * Drop the caller's reference to the cluster. When the ref count drops to 299 * zero this function frees the cluster and drops all underlying chains. 300 * 301 * In-progress read I/Os are typically detached from the cluster once the 302 * first one returns (the remaining stay attached to the DIOs but are then 303 * ignored and drop naturally). 
304 */ 305 void 306 hammer2_cluster_drop(hammer2_cluster_t *cluster) 307 { 308 hammer2_chain_t *chain; 309 int i; 310 311 KKASSERT(cluster->refs > 0); 312 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 313 cluster->focus = NULL; /* safety XXX chg to assert */ 314 cluster->focus_index = 0; 315 316 for (i = 0; i < cluster->nchains; ++i) { 317 chain = cluster->array[i].chain; 318 if (chain) { 319 hammer2_chain_drop(chain); 320 cluster->array[i].chain = NULL; /* safety */ 321 } 322 } 323 cluster->nchains = 0; /* safety */ 324 325 kfree(cluster, M_HAMMER2); 326 /* cluster is invalid */ 327 } 328 } 329 330 void 331 hammer2_cluster_wait(hammer2_cluster_t *cluster) 332 { 333 tsleep(cluster->focus, 0, "h2clcw", 1); 334 } 335 336 /* 337 * Lock a cluster. Cluster must already be referenced. Focus is maintained. 338 * 339 * WARNING! This function expects the caller to handle resolution of the 340 * cluster. We never re-resolve the cluster in this function, 341 * because it might be used to temporarily unlock/relock a cparent 342 * in an iteration or recursrion, and the cparents elements do not 343 * necessarily match. 344 */ 345 void 346 hammer2_cluster_lock_except(hammer2_cluster_t *cluster, int idx, int how) 347 { 348 hammer2_chain_t *chain; 349 int i; 350 351 /* cannot be on inode-embedded cluster template, must be on copy */ 352 KKASSERT(cluster->refs > 0); 353 KKASSERT((cluster->flags & HAMMER2_CLUSTER_INODE) == 0); 354 if (cluster->flags & HAMMER2_CLUSTER_LOCKED) { 355 panic("hammer2_cluster_lock: cluster %p already locked!\n", 356 cluster); 357 } 358 atomic_set_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 359 360 /* 361 * Lock chains and resolve state. 
362 */ 363 for (i = 0; i < cluster->nchains; ++i) { 364 if (i == idx) 365 continue; 366 chain = cluster->array[i].chain; 367 if (chain == NULL) 368 continue; 369 hammer2_chain_lock(chain, how); 370 } 371 } 372 373 void 374 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 375 { 376 hammer2_cluster_lock_except(cluster, -1, how); 377 } 378 379 /* 380 * Calculate the clustering state for the cluster and set its focus. 381 * This routine must be called with care. For example, it should not 382 * normally be called after relocking a non-leaf cluster because parent 383 * clusters help iterations and each element might be at a slightly different 384 * indirect node (each node's topology is independently indexed). 385 * 386 * HAMMER2_CITEM_FEMOD flags which elements can be modified by normal 387 * operations. Typically this is only set on a quorum of MASTERs or 388 * on a SOFT_MASTER. Also as a degenerate case on SUPROOT. If a SOFT_MASTER 389 * is present, this bit is *not* set on a quorum of MASTERs. The 390 * synchronization code ignores this bit, but all hammer2_cluster_*() calls 391 * that create/modify/delete elements use it. 392 * 393 * The chains making up the cluster may be narrowed down based on quorum 394 * acceptability, and if RESOLVE_RDONLY is specified the chains can be 395 * narrowed down to a single chain as long as the entire subtopology is known 396 * to be intact. So, for example, we can narrow a read-only op to a single 397 * fast SLAVE but if we focus a CACHE chain we must still retain at least 398 * a SLAVE to ensure that the subtopology can be accessed. 399 * 400 * RESOLVE_RDONLY operations are effectively as-of so the quorum does not need 401 * to be maintained once the topology is validated as-of the top level of 402 * the operation. 403 * 404 * If a failure occurs the operation must be aborted by higher-level code and 405 * retried. 
 * XXX
 */
void
hammer2_cluster_resolve(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *focus;
	hammer2_pfs_t *pmp;
	hammer2_tid_t quorum_tid;
	hammer2_tid_t last_best_quorum_tid;
	int focus_pfs_type;
	uint32_t nflags;
	int ttlmasters;
	int ttlslaves;
	int nmasters;
	int nslaves;
	int nquorum;
	int smpresent;
	int i;

	cluster->error = 0;
	cluster->focus = NULL;

	focus_pfs_type = 0;
	nflags = 0;
	ttlmasters = 0;
	ttlslaves = 0;
	nmasters = 0;
	nslaves = 0;

	/*
	 * Calculate quorum (a strict majority of configured masters).
	 */
	pmp = cluster->pmp;
	KKASSERT(pmp != NULL || cluster->nchains == 0);
	nquorum = pmp ? pmp->pfs_nmasters / 2 + 1 : 0;
	smpresent = 0;

	/*
	 * Pass 1
	 *
	 * Tally master/slave totals and accumulate soft-master/soft-slave
	 * flags.  SUPROOT short-circuits the focus immediately.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}

			/*
			 * Must count total masters and slaves whether the
			 * chain is errored or not.
			 */
			switch (cluster->pmp->pfs_types[i]) {
			case HAMMER2_PFSTYPE_MASTER:
				++ttlmasters;
				break;
			case HAMMER2_PFSTYPE_SLAVE:
				++ttlslaves;
				break;
			}
			continue;
		}
		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			++ttlmasters;
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			++ttlslaves;
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			nflags |= HAMMER2_CLUSTER_WRSOFT;
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			smpresent = 1;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			nflags |= HAMMER2_CLUSTER_RDSOFT;
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * Degenerate cluster representing the super-root
			 * topology on a single device.  Fake stuff so
			 * cluster ops work as expected.
			 */
			nflags |= HAMMER2_CLUSTER_WRHARD;
			nflags |= HAMMER2_CLUSTER_RDHARD;
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			break;
		default:
			break;
		}
	}

	/*
	 * Pass 2
	 *
	 * Resolve masters.  Calculate nmasters for the highest matching
	 * TID, if a quorum cannot be attained try the next lower matching
	 * TID until we exhaust TIDs.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	last_best_quorum_tid = HAMMER2_TID_MAX;
	quorum_tid = 0;		/* fix gcc warning */

	while (nmasters < nquorum && last_best_quorum_tid != 0) {
		nmasters = 0;
		quorum_tid = 0;

		for (i = 0; i < cluster->nchains; ++i) {
			/* only masters participate in the quorum */
			if (cluster->pmp->pfs_types[i] !=
			    HAMMER2_PFSTYPE_MASTER) {
				continue;
			}
			chain = cluster->array[i].chain;

			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				/*
				 * Invalid as in unsynchronized, cannot be
				 * used to calculate the quorum.
				 */
			} else if (chain == NULL && quorum_tid == 0) {
				/*
				 * NULL chain on master matches NULL chains
				 * on other masters.
				 */
				++nmasters;
			} else if (quorum_tid < last_best_quorum_tid &&
				   chain != NULL &&
				   (quorum_tid < chain->bref.modify_tid ||
				    nmasters == 0)) {
				/*
				 * Better TID located, reset nmasters count.
				 */
				nmasters = 1;
				quorum_tid = chain->bref.modify_tid;
			} else if (chain &&
				   quorum_tid == chain->bref.modify_tid) {
				/*
				 * TID matches current collection.
				 */
				++nmasters;
			}
		}
		if (nmasters >= nquorum)
			break;
		/* retry with the next-lower candidate TID */
		last_best_quorum_tid = quorum_tid;
	}

	/*
	 * Pass 3
	 *
	 * Resolve the focus and FEMOD flags per element now that the
	 * quorum TID is known.
	 *
	 * NOTE: A NULL chain is not necessarily an error, it could be
	 *	 e.g. a lookup failure or the end of an iteration.
	 *	 Process normally.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		cluster->array[i].flags &= ~HAMMER2_CITEM_FEMOD;
		chain = cluster->array[i].chain;
		if (chain && chain->error) {
			if (cluster->focus == NULL || cluster->focus == chain) {
				/* error will be overridden by valid focus */
				cluster->error = chain->error;
			}
			continue;
		}

		switch (cluster->pmp->pfs_types[i]) {
		case HAMMER2_PFSTYPE_MASTER:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the master modify_tid must match
			 * the quorum's modify_tid.
			 *
			 * Do not select an errored or out-of-sync master.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain != NULL && quorum_tid ==
				     chain->bref.modify_tid))) {
				nflags |= HAMMER2_CLUSTER_WRHARD;
				nflags |= HAMMER2_CLUSTER_RDHARD;
				/* a present soft-master takes FEMOD instead */
				if (!smpresent) {
					cluster->array[i].flags |=
							HAMMER2_CITEM_FEMOD;
				}
				if (cluster->focus == NULL ||
				    focus_pfs_type == HAMMER2_PFSTYPE_SLAVE) {
					focus_pfs_type = HAMMER2_PFSTYPE_MASTER;
					cluster->focus_index = i;
					cluster->focus = chain; /* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			}
			break;
		case HAMMER2_PFSTYPE_SLAVE:
			/*
			 * We must have enough up-to-date masters to reach
			 * a quorum and the slave modify_tid must match the
			 * quorum's modify_tid.
			 *
			 * Do not select an errored slave.
			 */
			if (cluster->array[i].flags & HAMMER2_CITEM_INVALID) {
				nflags |= HAMMER2_CLUSTER_UNHARD;
			} else if (nmasters >= nquorum &&
				   (chain == NULL || chain->error == 0) &&
				   ((chain == NULL && quorum_tid == 0) ||
				    (chain && quorum_tid ==
				     chain->bref.modify_tid))) {
				++nslaves;
				nflags |= HAMMER2_CLUSTER_RDHARD;
#if 0
				/* XXX optimize for RESOLVE_RDONLY */
				if (cluster->focus == NULL) {
					focus_pfs_type = HAMMER2_PFSTYPE_SLAVE;
					cluster->focus_index = i;
					cluster->focus = chain; /* NULL ok */
					cluster->error = chain ? chain->error :
								 0;
				}
#endif
			} else if (chain == NULL || chain->error == 0) {
				nflags |= HAMMER2_CLUSTER_UNSOFT;
			}
			break;
		case HAMMER2_PFSTYPE_SOFT_MASTER:
			/*
			 * Directly mounted soft master always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		case HAMMER2_PFSTYPE_SOFT_SLAVE:
			/*
			 * Directly mounted soft slave always wins.  There
			 * should be only one.
			 */
			KKASSERT(focus_pfs_type != HAMMER2_PFSTYPE_SOFT_SLAVE);
			if (focus_pfs_type != HAMMER2_PFSTYPE_SOFT_MASTER) {
				cluster->focus_index = i;
				cluster->focus = chain;
				cluster->error = chain ? chain->error : 0;
				focus_pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
			}
			break;
		case HAMMER2_PFSTYPE_SUPROOT:
			/*
			 * spmp (degenerate case)
			 */
			KKASSERT(i == 0);
			cluster->focus_index = i;
			cluster->focus = chain;
			cluster->error = chain ? chain->error : 0;
			focus_pfs_type = HAMMER2_PFSTYPE_SUPROOT;
			cluster->array[i].flags |= HAMMER2_CITEM_FEMOD;
			break;
		default:
			break;
		}
	}

	/*
	 * Focus now set, adjust ddflag.  Skip this pass if the focus
	 * is bad or if we are at the PFS root (the bref won't match at
	 * the PFS root, obviously).
	 */
	focus = cluster->focus;
	if (focus) {
		cluster->ddflag =
			(cluster->focus->bref.type == HAMMER2_BREF_TYPE_INODE);
	} else {
		cluster->ddflag = 0;
		goto skip4;
	}
	if (cluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
		goto skip4;

	/*
	 * Pass 4
	 *
	 * Validate the elements that were not marked invalid.  They should
	 * match.
	 */
	for (i = 0; i < cluster->nchains; ++i) {
		int ddflag;

		chain = cluster->array[i].chain;

		if (chain == NULL)
			continue;
		if (chain == focus)
			continue;
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		ddflag = (chain->bref.type == HAMMER2_BREF_TYPE_INODE);
		if (chain->bref.type != focus->bref.type ||
		    chain->bref.key != focus->bref.key ||
		    chain->bref.keybits != focus->bref.keybits ||
		    chain->bref.modify_tid != focus->bref.modify_tid ||
		    chain->bytes != focus->bytes ||
		    ddflag != cluster->ddflag) {
			/* mismatching element drops out of the cluster */
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			if (hammer2_debug & 1)
			kprintf("cluster_resolve: matching modify_tid failed "
				"bref test: idx=%d type=%02x/%02x "
				"key=%016jx/%d-%016jx/%d "
				"mod=%016jx/%016jx bytes=%u/%u\n",
				i,
				chain->bref.type, focus->bref.type,
				chain->bref.key, chain->bref.keybits,
				focus->bref.key, focus->bref.keybits,
				chain->bref.modify_tid, focus->bref.modify_tid,
				chain->bytes, focus->bytes);
			if (hammer2_debug & 0x4000)
				panic("cluster_resolve");
			/* flag issue and force resync? */
		}
	}
skip4:

	/*
	 * NOTE(review): NOSOFT is paired with ttlslaves and NOHARD with
	 * ttlmasters here; verify the flag naming matches intent.
	 */
	if (ttlslaves == 0)
		nflags |= HAMMER2_CLUSTER_NOSOFT;
	if (ttlmasters == 0)
		nflags |= HAMMER2_CLUSTER_NOHARD;

	/*
	 * Set SSYNCED or MSYNCED for slaves and masters respectively if
	 * all available nodes (even if 0 are available) are fully
	 * synchronized.  This is used by the synchronization thread to
	 * determine if there is work it could potentially accomplish.
	 */
	if (nslaves == ttlslaves)
		nflags |= HAMMER2_CLUSTER_SSYNCED;
	if (nmasters == ttlmasters)
		nflags |= HAMMER2_CLUSTER_MSYNCED;

	/*
	 * Determine if the cluster was successfully locked for the
	 * requested operation and generate an error code.  The cluster
	 * will not be locked (or ref'd) if an error is returned.
	 *
	 * Caller can use hammer2_cluster_rdok() and hammer2_cluster_wrok()
	 * to determine if reading or writing is possible.  If writing, the
	 * cluster still requires a call to hammer2_cluster_modify() first.
	 */
	/* publish the recomputed state flags atomically */
	atomic_set_int(&cluster->flags, nflags);
	atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_ZFLAGS & ~nflags);
}

/*
 * This is used by the sync thread to force non-NULL elements of a copy
 * of the pmp->iroot cluster to be good which is required to prime the
 * sync.
 */
void
hammer2_cluster_forcegood(hammer2_cluster_t *cluster)
{
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i].chain)
			cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;
	}
}

/*
 * Copy a cluster, returning a ref'd cluster.  All underlying chains
 * are also ref'd, but not locked.  Focus state is also copied.
 *
 * Original cluster does not have to be locked but usually is.
 * New cluster will not be flagged as locked.
 *
 * Callers using this function to initialize a new cluster from an inode
 * generally lock and resolve the resulting cluster.
 *
 * Callers which use this function to save/restore a cluster structure
 * generally retain the focus state and do not re-resolve it.  Caller should
 * not try to re-resolve internal (cparent) node state during an iteration
 * as the individual tracking elements of cparent in an iteration may not
 * match even though they are correct.
806 */ 807 hammer2_cluster_t * 808 hammer2_cluster_copy(hammer2_cluster_t *ocluster) 809 { 810 hammer2_pfs_t *pmp = ocluster->pmp; 811 hammer2_cluster_t *ncluster; 812 hammer2_chain_t *chain; 813 int i; 814 815 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 816 ncluster->pmp = pmp; 817 ncluster->nchains = ocluster->nchains; 818 ncluster->refs = 1; 819 820 for (i = 0; i < ocluster->nchains; ++i) { 821 chain = ocluster->array[i].chain; 822 ncluster->array[i].chain = chain; 823 ncluster->array[i].flags = ocluster->array[i].flags; 824 if (chain) 825 hammer2_chain_ref(chain); 826 } 827 ncluster->focus_index = ocluster->focus_index; 828 ncluster->focus = ocluster->focus; 829 ncluster->flags = ocluster->flags & ~(HAMMER2_CLUSTER_LOCKED | 830 HAMMER2_CLUSTER_INODE); 831 832 return (ncluster); 833 } 834 835 /* 836 * Unlock a cluster. Refcount and focus is maintained. 837 */ 838 void 839 hammer2_cluster_unlock_except(hammer2_cluster_t *cluster, int idx) 840 { 841 hammer2_chain_t *chain; 842 int i; 843 844 if ((cluster->flags & HAMMER2_CLUSTER_LOCKED) == 0) { 845 kprintf("hammer2_cluster_unlock: cluster %p not locked\n", 846 cluster); 847 } 848 KKASSERT(cluster->flags & HAMMER2_CLUSTER_LOCKED); 849 KKASSERT(cluster->refs > 0); 850 atomic_clear_int(&cluster->flags, HAMMER2_CLUSTER_LOCKED); 851 852 for (i = 0; i < cluster->nchains; ++i) { 853 if (i == idx) 854 continue; 855 chain = cluster->array[i].chain; 856 if (chain) 857 hammer2_chain_unlock(chain); 858 } 859 } 860 861 void 862 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 863 { 864 hammer2_cluster_unlock_except(cluster, -1); 865 } 866 867 /* 868 * Resize the cluster's physical storage allocation in-place. This may 869 * replace the cluster's chains. 
870 */ 871 void 872 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 873 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 874 int nradix, int flags) 875 { 876 hammer2_chain_t *chain; 877 int i; 878 879 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ 880 KKASSERT(cparent->nchains == cluster->nchains); 881 882 for (i = 0; i < cluster->nchains; ++i) { 883 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 884 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 885 continue; 886 } 887 chain = cluster->array[i].chain; 888 if (chain) { 889 KKASSERT(cparent->array[i].chain); 890 hammer2_chain_resize(trans, ip, 891 cparent->array[i].chain, chain, 892 nradix, flags); 893 } 894 } 895 } 896 897 /* 898 * Set an inode's cluster modified, marking the related chains RW and 899 * duplicating them if necessary. 900 * 901 * The passed-in chain is a localized copy of the chain previously acquired 902 * when the inode was locked (and possilby replaced in the mean time), and 903 * must also be updated. In fact, we update it first and then synchronize 904 * the inode's cluster cache. 905 */ 906 hammer2_inode_data_t * 907 hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 908 hammer2_cluster_t *cluster, int flags) 909 { 910 hammer2_inode_modify(trans, ip); 911 hammer2_cluster_modify(trans, cluster, flags); 912 hammer2_inode_repoint(ip, NULL, cluster); 913 return (&hammer2_cluster_wdata(cluster)->ipdata); 914 } 915 916 /* 917 * Adjust the cluster's chains to allow modification and adjust the 918 * focus. Data will be accessible on return. 919 * 920 * If our focused master errors on modify, re-resolve the cluster to 921 * try to select a different master. 
922 */ 923 void 924 hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster, 925 int flags) 926 { 927 hammer2_chain_t *chain; 928 int resolve_again; 929 int i; 930 931 resolve_again = 0; 932 for (i = 0; i < cluster->nchains; ++i) { 933 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) { 934 cluster->array[i].flags |= HAMMER2_CITEM_INVALID; 935 continue; 936 } 937 chain = cluster->array[i].chain; 938 if (chain == NULL) 939 continue; 940 if (chain->error) 941 continue; 942 hammer2_chain_modify(trans, chain, flags); 943 if (cluster->focus == chain && chain->error) { 944 cluster->error = chain->error; 945 resolve_again = 1; 946 } 947 } 948 if (resolve_again) 949 hammer2_cluster_resolve(cluster); 950 } 951 952 /* 953 * Synchronize modifications from the focus to other chains in a cluster. 954 * Convenient because nominal API users can just modify the contents of the 955 * focus (at least for non-blockref data). 956 * 957 * Nominal front-end operations only edit non-block-table data in a single 958 * chain. This code copies such modifications to the other chains in the 959 * cluster. Blocktable modifications are handled on a chain-by-chain basis 960 * by both the frontend and the backend and will explode in fireworks if 961 * blindly copied. 
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		/* only front-end-modifiable elements receive the copy */
		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
			continue;
		scan = cluster->array[i].chain;
		if (scan == NULL || scan == focus)
			continue;
		if (scan->error)
			continue;
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->meta.op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Copy only the fixed meta area, not the
				 * u union (which may be blockref data).
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through to full copy */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			/* blocktable-bearing types must never be bulk-copied */
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}

/*
 * Lookup initialization/completion API.  Returns a locked, fully resolved
 * cluster with one ref.
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
	hammer2_cluster_t *cluster;

	cluster = hammer2_cluster_copy(cparent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
					      HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	}
	hammer2_cluster_resolve(cluster);

	return (cluster);
}

/*
 * Release a cluster returned by hammer2_cluster_lookup_init() (unlock
 * and drop).  NULL is tolerated.
 */
void
hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
{
	if (cparent) {
		hammer2_cluster_unlock(cparent);
		hammer2_cluster_drop(cparent);
	}
}

/*
 * Locate first match or overlap under parent, return a new, locked, resolved
 * cluster with one ref.
 *
 * Must never be called with HAMMER2_LOOKUP_MATCHIND.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		       hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_pfs_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int rflags;
	int i;

	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	if (flags & HAMMER2_LOOKUP_SHARED)
		rflags = HAMMER2_RESOLVE_SHARED;
	else
		rflags = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* pre-flag locked; individual chains come back locked from lookup */
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
		cluster->flags |= HAMMER2_CLUSTER_LOCKED;

	/*
	 * Iterating earlier cluster elements with later elements still
	 * locked is a problem, so we have to unlock the parent and then
	 * re-lock as we go.
1083 */ 1084 hammer2_cluster_unlock(cparent); 1085 cparent->flags |= HAMMER2_CLUSTER_LOCKED; 1086 1087 /* 1088 * Pass-1, issue lookups. 1089 */ 1090 for (i = 0; i < cparent->nchains; ++i) { 1091 cluster->array[i].flags = cparent->array[i].flags; 1092 key_next = *key_nextp; 1093 1094 /* 1095 * Always relock the parent as we go. 1096 */ 1097 if (cparent->array[i].chain) { 1098 hammer2_chain_lock(cparent->array[i].chain, rflags); 1099 } 1100 1101 /* 1102 * Nothing to base the lookup, or parent was not synchronized. 1103 */ 1104 if (cparent->array[i].chain == NULL || 1105 (cparent->array[i].flags & HAMMER2_CITEM_INVALID)) { 1106 ++null_count; 1107 continue; 1108 } 1109 1110 chain = hammer2_chain_lookup(&cparent->array[i].chain, 1111 &key_next, 1112 key_beg, key_end, 1113 &cparent->array[i].cache_index, 1114 flags); 1115 cluster->array[i].chain = chain; 1116 if (chain == NULL) { 1117 ++null_count; 1118 } 1119 if (key_accum > key_next) 1120 key_accum = key_next; 1121 } 1122 1123 /* 1124 * Cleanup 1125 */ 1126 cluster->nchains = i; 1127 *key_nextp = key_accum; 1128 1129 /* 1130 * The cluster must be resolved, out of sync elements may be present. 1131 * 1132 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL. 1133 */ 1134 if (null_count != i) 1135 hammer2_cluster_resolve(cluster); 1136 if (null_count == i || 1137 (cluster->focus == NULL && 1138 (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) { 1139 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 1140 hammer2_cluster_unlock(cluster); 1141 hammer2_cluster_drop(cluster); 1142 cluster = NULL; 1143 } 1144 1145 return (cluster); 1146 } 1147 1148 /* 1149 * Locate next match or overlap under parent, replace the passed-in cluster. 1150 * The returned cluster is a new, locked, resolved cluster with one ref. 1151 * 1152 * Must never be called with HAMMER2_LOOKUP_MATCHIND. 
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *ochain;
	hammer2_chain_t *nchain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int parent_index;
	int cluster_index;
	int null_count;
	int rflags;
	int i;

	KKASSERT((flags & HAMMER2_LOOKUP_MATCHIND) == 0);

	key_accum = *key_nextp;
	null_count = 0;
	parent_index = cparent->focus_index;	/* save prior focus */
	cluster_index = cluster->focus_index;
	/* XXX(review): cluster_index is saved but not used below - confirm */
	if (flags & HAMMER2_LOOKUP_SHARED)
		rflags = HAMMER2_RESOLVE_SHARED;
	else
		rflags = 0;

	cluster->focus = NULL;		/* XXX needed any more? */
	/*cparent->focus = NULL;*/
	cluster->focus_index = 0;	/* XXX needed any more? */
	/*cparent->focus_index = 0;*/

	cluster->ddflag = 0;

	/*
	 * The parent is always locked on entry, the iterator may be locked
	 * depending on flags.
	 *
	 * We must temporarily unlock the passed-in clusters to avoid a
	 * deadlock between elements of the cluster with other threads.
	 * We will fixup the lock in the loop.
	 *
	 * Note that this will clear the focus.
	 *
	 * Reflag the clusters as locked, because we will relock them
	 * as we go.
	 */
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
		hammer2_cluster_unlock(cluster);
		cluster->flags |= HAMMER2_CLUSTER_LOCKED;
	}
	hammer2_cluster_unlock(cparent);
	cparent->flags |= HAMMER2_CLUSTER_LOCKED;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		ochain = cluster->array[i].chain;

		/*
		 * Always relock the parent as we go.
		 */
		if (cparent->array[i].chain)
			hammer2_chain_lock(cparent->array[i].chain, rflags);

		/*
		 * Nothing to iterate from.  These cases can occur under
		 * normal operations.  For example, during synchronization
		 * a slave might reach the end of its scan while records
		 * are still left on the master(s).
		 */
		if (ochain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i].chain == NULL ||
		    (cparent->array[i].flags & HAMMER2_CITEM_INVALID) ||
		    (cluster->array[i].flags & HAMMER2_CITEM_INVALID)) {
			/* ochain has not yet been relocked */
			hammer2_chain_drop(ochain);
			cluster->array[i].chain = NULL;
			++null_count;
			continue;
		}

		/*
		 * Relock the child if necessary.  Parent and child will then
		 * be locked as expected by hammer2_chain_next() and flags.
		 */
		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
			hammer2_chain_lock(ochain, rflags);
		nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
					    &key_next, key_beg, key_end,
					    &cparent->array[i].cache_index,
					    flags);
		/* ochain now invalid but can still be used for focus check */
		if (parent_index == i) {
			cparent->focus_index = i;
			cparent->focus = cparent->array[i].chain;
		}

		cluster->array[i].chain = nchain;
		if (nchain == NULL) {
			++null_count;
		}
		/* accumulate the lowest key_next over all elements */
		if (key_accum > key_next)
			key_accum = key_next;
	}

	/*
	 * Cleanup
	 */
	cluster->nchains = i;
	*key_nextp = key_accum;

	/*
	 * The cluster must be resolved, out of sync elements may be present.
	 *
	 * If HAMMER2_LOOKUP_ALLNODES is not set focus must be non-NULL.
	 * On failure the cluster is unlocked (unless NOLOCK), dropped, and
	 * NULL is returned.
	 */
	if (null_count != i)
		hammer2_cluster_resolve(cluster);
	if (null_count == i ||
	    (cluster->focus == NULL &&
	     (flags & HAMMER2_LOOKUP_ALLNODES) == 0)) {
		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
			hammer2_cluster_unlock(cluster);
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

/*
 * Advance just one chain in the cluster and recalculate the invalid bit.
 * The cluster index is allowed to be flagged invalid on input and is
 * recalculated on return.
 *
 * (used during synchronization to advance past a chain being deleted).
 *
 * The chain being advanced must not be the focus and the clusters in
 * question must have already passed normal cluster_lookup/cluster_next
 * checks.
 *
 * The cluster always remains intact on return, so void function.
 */
void
hammer2_cluster_next_single_chain(hammer2_cluster_t *cparent,
				  hammer2_cluster_t *cluster,
				  hammer2_key_t *key_nextp,
				  hammer2_key_t key_beg,
				  hammer2_key_t key_end,
				  int i, int flags)
{
	hammer2_chain_t *ochain;
	hammer2_chain_t *nchain;
	hammer2_chain_t *focus;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int ddflag;

	key_accum = *key_nextp;
	key_next = *key_nextp;
	ochain = cluster->array[i].chain;
	if (ochain == NULL)
		goto done;
	KKASSERT(ochain != cluster->focus);

	nchain = hammer2_chain_next(&cparent->array[i].chain, ochain,
				    &key_next, key_beg, key_end,
				    &cparent->array[i].cache_index,
				    flags);
	/* ochain now invalid */
	if (cparent->focus_index == i)
		cparent->focus = cparent->array[i].chain;

	/*
	 * Install nchain.  Note that nchain can be NULL, and can also
	 * be in an unlocked state depending on flags.
	 */
	cluster->array[i].chain = nchain;
	cluster->array[i].flags &= ~HAMMER2_CITEM_INVALID;

	if (key_accum > key_next)
		key_accum = key_next;

	/*
	 * Recalculate the INVALID bit by comparing the new chain against
	 * the cluster focus.  If either is NULL there is nothing to
	 * compare against.
	 */
	focus = cluster->focus;
	if (focus == NULL)
		goto done;
	if (nchain == NULL)
		goto done;
#if 0
	if (nchain == focus)	/* ASSERTED NOT TRUE */
		...
#endif
	ddflag = (nchain->bref.type == HAMMER2_BREF_TYPE_INODE);
	if (nchain->bref.type != focus->bref.type ||
	    nchain->bref.key != focus->bref.key ||
	    nchain->bref.keybits != focus->bref.keybits ||
	    nchain->bref.modify_tid != focus->bref.modify_tid ||
	    nchain->bytes != focus->bytes ||
	    ddflag != cluster->ddflag) {
		cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
	}

done:
	*key_nextp = key_accum;
#if 0
	/*
	 * For now don't re-resolve cluster->flags.
	 */
	hammer2_cluster_resolve(cluster);
#endif
}

/*
 * Create a new cluster using the specified key
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t **clusterp,
		       hammer2_key_t key, int keybits,
		       int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfs_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	/*
	 * Allocate a fresh cluster if the caller did not supply one.
	 */
	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
		cluster->flags = HAMMER2_CLUSTER_LOCKED;
	}
	cluster->focus_index = 0;
	cluster->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		if ((cparent->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			continue;
		}
		if (*clusterp) {
			if ((cluster->array[i].flags &
			     HAMMER2_CITEM_FEMOD) == 0) {
				cluster->array[i].flags |=
					HAMMER2_CITEM_INVALID;
				continue;
			}
			if (cluster->array[i].chain == NULL)
				continue;
		}
		error = hammer2_chain_create(trans, &cparent->array[i].chain,
					     &cluster->array[i].chain, pmp,
					     key, keybits,
					     type, bytes, flags);
		if (cparent->focus_index == i)
			cparent->focus = cparent->array[i].chain;
		KKASSERT(error == 0);
		/*
		 * Set the new cluster's focus: first created chain by
		 * default, overridden by the element matching the parent's
		 * focus.
		 */
		if (cluster->focus == NULL) {
			cluster->focus_index = i;
			cluster->focus = cluster->array[i].chain;
		}
		if (cparent->focus == cparent->array[i].chain) {
			cluster->focus_index = i;
			cluster->focus = cluster->array[i].chain;
		}
	}
	cluster->nchains = i;
	*clusterp = cluster;
	hammer2_cluster_resolve(cluster);

	return error;
}

/*
 * Rename a cluster to a new parent.
 *
 * WARNING! Any passed-in bref is probably from hammer2_cluster_bref(),
 *	    so the data_off field is not relevant.  Only the key and
 *	    keybits are used.
 */
void
hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t xbref;
	int i;

#if 0
	cluster->focus = NULL;
	cparent->focus = NULL;
	cluster->focus_index = 0;
	cparent->focus_index = 0;
#endif

	for (i = 0; i < cluster->nchains; ++i) {
		/* only rename elements flagged FEMOD */
		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			continue;
		}
		chain = cluster->array[i].chain;
		if (chain) {
			if (bref) {
				/*
				 * Keep the chain's own bref but substitute
				 * the caller-supplied key/keybits.
				 */
				xbref = chain->bref;
				xbref.key = bref->key;
				xbref.keybits = bref->keybits;
				hammer2_chain_rename(trans, &xbref,
						     &cparent->array[i].chain,
						     chain, flags);
			} else {
				hammer2_chain_rename(trans, NULL,
						     &cparent->array[i].chain,
						     chain, flags);
			}
			if (cparent->focus_index == i)
				cparent->focus = cparent->array[i].chain;
			KKASSERT(cluster->array[i].chain == chain); /*remove*/
		}
	}
}

/*
 * Mark a cluster deleted
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t *cluster, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	int i;

	if (cparent == NULL) {
		kprintf("cparent is NULL\n");
		return;
	}

	for (i = 0; i < cluster->nchains; ++i) {
		/* only delete elements flagged FEMOD */
		if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0) {
			cluster->array[i].flags |= HAMMER2_CITEM_INVALID;
			continue;
		}
		parent = cparent->array[i].chain;
		chain = cluster->array[i].chain;
		if (chain == NULL)
			continue;
		/*
		 * Sanity check the topology; only delete the chain when its
		 * recorded parent matches the cluster parent's element.
		 */
		if (chain->parent != parent) {
			kprintf("hammer2_cluster_delete: parent "
				"mismatch chain=%p parent=%p against=%p\n",
				chain, chain->parent, parent);
		} else {
			hammer2_chain_delete(trans, parent, chain, flags);
		}
	}
}

/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
			 hammer2_ioc_pfs_t *pfs)
{
	hammer2_dev_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_chain_t *nchain;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
#if 0
	uuid_t opfs_clid;
#endif
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);
	/* XXX(review): lhc is computed but not used below - confirm */

	/*
	 * Get the clid
	 */
	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
#if 0
	opfs_clid = ripdata->meta.pfs_clid;
#endif
	hmp = ocluster->focus->hmp;	/* XXX find synchronized local disk */

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster,
				   HAMMER2_INSERT_PFSROOT, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
		wipdata->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
		wipdata->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
		kern_uuidgen(&wipdata->meta.pfs_fsid, 1);

		/*
		 * Give the snapshot its own private cluster.  As a snapshot
		 * no further synchronization with the original cluster will
		 * be done.
		 */
#if 0
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->meta.pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->meta.pfs_clid, 1);
#endif
		kern_uuidgen(&wipdata->meta.pfs_clid, 1);

		/* mark each FEMOD element's bref as a PFS root */
		for (i = 0; i < ncluster->nchains; ++i) {
			if ((ncluster->array[i].flags &
			     HAMMER2_CITEM_FEMOD) == 0) {
				ncluster->array[i].flags |=
					HAMMER2_CITEM_INVALID;
				continue;
			}
			nchain = ncluster->array[i].chain;
			if (nchain)
				nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
		}

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			nchain = ncluster->array[i].chain;
			if (nchain)
				hammer2_flush(trans, nchain, 1);
		}
		hammer2_inode_unlock(nip, ncluster);
	}
	return (error);
}

/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.  The new parent's focus follows the child's focus
 * and the parent is always resolved.
 *
 * We must temporarily unlock the passed-in cluster to avoid a deadlock
 * between elements of the cluster.
 *
 * We must not try to hammer2_cluster_resolve() cparent.  The individual
 * parent chains for the nodes are the correct parents for the cluster but
 * do not necessarily match, so resolve would likely implode.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	cparent = hammer2_cluster_copy(cluster);
	hammer2_cluster_unlock(cluster);

	for (i = 0; i < cparent->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		/*
		 * Calculate parent for each element.  Old chain has an extra
		 * ref for cparent but the lock remains with cluster.
		 */
		chain = cparent->array[i].chain;
		if (chain == NULL)
			continue;
		/*
		 * Ref and lock the parent, then re-check that it is still
		 * the chain's parent after the lock is acquired; retry
		 * until stable.
		 */
		while ((rchain = chain->parent) != NULL) {
			hammer2_chain_ref(rchain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
			hammer2_chain_drop(rchain);
		}
		cparent->array[i].chain = rchain;
		hammer2_chain_drop(chain);
	}
	cparent->flags |= HAMMER2_CLUSTER_LOCKED;
	/* hammer2_cluster_resolve(cparent); */
	hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);

	return cparent;
}

/************************************************************************
 *				CLUSTER I/O				*
 ************************************************************************
 *
 *
 * WARNING! blockref[] array data is not universal.  These functions should
 *	    only be used to access universal data.
 *
 * NOTE!    The rdata call will wait for at least one of the chain I/Os to
 *	    complete if necessary.  The I/O's should have already been
 *	    initiated by the cluster_lock/chain_lock operation.
 *
 *	    The cluster must already be in a modified state before wdata
 *	    is called.  The data will already be available for this case.
 */
const hammer2_media_data_t *
hammer2_cluster_rdata(hammer2_cluster_t *cluster)
{
	/* read access goes through the focus chain's data */
	return(cluster->focus->data);
}

hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
	/* write access requires the cluster to already be modified */
	KKASSERT(hammer2_cluster_modified(cluster));
	return(cluster->focus->data);
}

/*
 * Load cluster data asynchronously with callback.
 *
 * The callback is made for the first validated data found, or NULL
 * if no valid data is available.
 *
 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
 *	 in the inode with an exclusive lock held), the chain structure may be
 *	 shared.
 */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
	hammer2_chain_t *chain;
	hammer2_iocb_t *iocb;
	hammer2_dev_t *hmp;
	hammer2_blockref_t *bref;
	int i;

	i = cluster->focus_index;
	chain = cluster->focus;

	/*
	 * Set up the iocb embedded in the cluster (which is why the
	 * cluster must be unique or serialized, see NOTE above).
	 */
	iocb = &cluster->iocb;
	iocb->callback = callback;
	iocb->dio = NULL;		/* for already-validated case */
	iocb->cluster = cluster;
	iocb->chain = chain;
	iocb->ptr = ptr;
	iocb->lbase = (off_t)i;
	iocb->flags = 0;
	iocb->error = 0;

	/*
	 * Data already validated, make the callback immediately.
	 */
	if (chain->data) {
		callback(iocb);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 *
	 * XXX TODO - handle HAMMER2_CHAIN_INITIAL for case where chain->bytes
	 *	      matches hammer2_devblksize()?  Or does the freemap's
	 *	      pre-zeroing handle the case for us?
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

#if 0
	/* handled by callback? <- TODO XXX even needed for loads? */
	/*
	 * The getblk() optimization for a 100% overwrite can only be used
	 * if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		iocb->dio = dio;
		callback(iocb);
		return;
	}
#endif

	/*
	 * Otherwise issue a read; the iocb's callback fires on completion.
	 */
	hammer2_adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}