/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * The cluster module collects multiple chains representing the same
 * information into a single entity.  It allows direct access to media
 * data as long as it is not blockref array data, which in practice
 * means just inode and file data.
 *
 * This module also handles I/O dispatch, status rollup, and various
 * mastership arrangements including quorum operations.  It effectively
 * presents one topology to the vnops layer.
 *
 * Many of the API calls mimic chain API calls but operate on clusters
 * instead of chains.  Please see hammer2_chain.c for more complete
 * documentation of the API functions.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Returns TRUE if any chain in the cluster needs to be resized.
 */
int
hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain && chain->bytes != bytes)
                        return 1;
        }
        return 0;
}

uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
        return(cluster->focus->bref.type);
}

int
hammer2_cluster_modified(hammer2_cluster_t *cluster)
{
        return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
}

/*
 * Return a bref representative of the cluster.  Any data offset is removed
 * (since it would only be applicable to a particular chain in the cluster).
 *
 * However, the radix portion of data_off is used for many purposes and is
 * retained.
 */
void
hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
{
        *bref = cluster->focus->bref;
        bref->data_off &= HAMMER2_OFF_MASK_RADIX;
}

/*
 * Return non-zero if the chain representing an inode has been flagged
 * as having been unlinked.  This allows the vnode reclaim code to avoid
 * loading the inode data from disk, e.g. when unmounting or when
 * recycling old, clean vnodes.
 */
int
hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int flags;
        int i;

        flags = 0;
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        flags |= chain->flags;
        }
        return (flags & HAMMER2_CHAIN_UNLINKED);
}

void
hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        atomic_set_int(&chain->flags, flags);
        }
}

void
hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        atomic_clear_int(&chain->flags, flags);
        }
}

void
hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        hammer2_chain_setflush(trans, chain);
        }
}

void
hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
                                hammer2_cluster_t *cluster,
                                int check_algo)
{
        hammer2_chain_t *chain;
        int i;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
                        chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
                        chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
                }
        }
}

/*
 * Create a cluster with one ref from the specified chain.  The chain
 * is not further referenced.  The caller typically supplies a locked
 * chain and transfers ownership to the cluster.
 */
hammer2_cluster_t *
hammer2_cluster_from_chain(hammer2_chain_t *chain)
{
        hammer2_cluster_t *cluster;

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->array[0].chain = chain;
        cluster->nchains = 1;
        cluster->focus = chain;
        cluster->pmp = chain->pmp;
        cluster->refs = 1;

        return cluster;
}
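/*
 * Illustrative sketch (not part of the original source): wrapping a locked
 * chain in a single-chain cluster via hammer2_cluster_from_chain().  The
 * helper name below is hypothetical; the pattern assumes the caller holds
 * a ref'd, locked chain whose ownership is transferred to the cluster.
 */
#if 0
static hammer2_cluster_t *
example_wrap_chain(hammer2_chain_t *chain)
{
        hammer2_cluster_t *cluster;

        /* the cluster takes over the caller's ref and lock on chain */
        cluster = hammer2_cluster_from_chain(chain);

        /* ... use cluster->focus ... */

        return cluster;         /* disposed of later via unlock */
}
#endif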
/*
 * Allocates a cluster and its underlying chain structures.  The underlying
 * chains will be locked.  The cluster and underlying chains will have one
 * ref.
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
                      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *rcluster;
        hammer2_chain_t *chain;
        hammer2_chain_t *rchain;
#if 0
        u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
#endif
        int i;

        KKASSERT(pmp != NULL);

        /*
         * Construct the appropriate system structure.
         */
        switch(bref->type) {
        case HAMMER2_BREF_TYPE_INODE:
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
        case HAMMER2_BREF_TYPE_DATA:
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                /*
                 * Chains are really only associated with the hmp but we
                 * maintain a pmp association for per-mount memory tracking
                 * purposes.  The pmp can be NULL.
                 */
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
        case HAMMER2_BREF_TYPE_FREEMAP:
                chain = NULL;
                panic("hammer2_cluster_alloc: volume type illegal for op");
        default:
                chain = NULL;
                panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
                      bref->type);
        }

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->refs = 1;

        rcluster = &pmp->iroot->cluster;
        for (i = 0; i < rcluster->nchains; ++i) {
                rchain = rcluster->array[i].chain;
                chain = hammer2_chain_alloc(rchain->hmp, pmp, trans, bref);
#if 0
                chain->hmp = rchain->hmp;
                chain->bref = *bref;
                chain->bytes = bytes;
                chain->refs = 1;
                chain->flags = HAMMER2_CHAIN_ALLOCATED;
#endif

                /*
                 * NOTE: When loading a chain from backing store or creating
                 *       a snapshot, trans will be NULL and the caller is
                 *       responsible for setting these fields.
                 */
                cluster->array[i].chain = chain;
        }
        cluster->nchains = i;
        cluster->pmp = pmp;
        cluster->focus = cluster->array[0].chain;

        return (cluster);
}

/*
 * Add a reference to a cluster.
 *
 * We must also ref the underlying chains in order to allow ref/unlock
 * sequences to later re-lock.
 */
void
hammer2_cluster_ref(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        atomic_add_int(&cluster->refs, 1);
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain)
                        hammer2_chain_ref(chain);
        }
}

/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remainder stay attached to the DIOs but are then
 * ignored and drop naturally).
 */
void
hammer2_cluster_drop(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cluster->refs > 0);
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        hammer2_chain_drop(chain);
                        if (cluster->refs == 1)
                                cluster->array[i].chain = NULL; /* safety */
                }
        }
        if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
                cluster->focus = NULL;
                kfree(cluster, M_HAMMER2);
                /* cluster = NULL; safety */
        }
}

void
hammer2_cluster_wait(hammer2_cluster_t *cluster)
{
        tsleep(cluster->focus, 0, "h2clcw", 1);
}
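/*
 * Illustrative sketch (not part of the original source): the ref/unlock/
 * relock pattern the hammer2_cluster_ref() comment above alludes to.
 * Holding an extra ref keeps the cluster and its chains alive across the
 * unlocked window.  The helper name is hypothetical.
 */
#if 0
static void
example_ref_relock(hammer2_cluster_t *cluster)
{
        hammer2_cluster_ref(cluster);           /* pin across the unlock */
        hammer2_cluster_unlock(cluster);

        /* ... blocking work with no cluster lock held ... */

        hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
        hammer2_cluster_drop(cluster);          /* shed the extra ref */
}
#endif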
/*
 * Lock and ref a cluster.  This adds a ref to the cluster and its chains
 * and then locks them.
 */
int
hammer2_cluster_lock(hammer2_cluster_t *cluster, int how)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *tmp;
        int i;
        int error;

        error = 0;
        atomic_add_int(&cluster->refs, 1);
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        error = hammer2_chain_lock(chain, how);
                        if (error) {
                                while (--i >= 0) {
                                        tmp = cluster->array[i].chain;
                                        hammer2_chain_unlock(tmp);
                                }
                                atomic_add_int(&cluster->refs, -1);
                                break;
                        }
                }
        }
        return error;
}
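/*
 * Illustrative sketch (not part of the original source): honoring the
 * error return of hammer2_cluster_lock().  Per the unwind code above, a
 * failed lock releases its own ref and any chain locks it acquired, so
 * the error path needs no matching unlock.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_locked_access(hammer2_cluster_t *cluster)
{
        int error;

        error = hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
        if (error)
                return error;           /* no unlock required on failure */

        /* ... access cluster->focus ... */

        hammer2_cluster_unlock(cluster);
        return 0;
}
#endif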
/*
 * Replace the contents of dst with src, adding a reference to src's chains.
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *tmp;
        int i;

        KKASSERT(dst->refs == 1);
        dst->focus = NULL;

        for (i = 0; i < src->nchains; ++i) {
                chain = src->array[i].chain;
                if (chain) {
                        hammer2_chain_ref(chain);
                        if (i < dst->nchains &&
                            (tmp = dst->array[i].chain) != NULL) {
                                hammer2_chain_unlock(tmp);
                        }
                        dst->array[i].chain = chain;
                        if (dst->focus == NULL)
                                dst->focus = chain;
                }
        }
        while (i < dst->nchains) {
                chain = dst->array[i].chain;
                if (chain) {
                        hammer2_chain_unlock(chain);
                        dst->array[i].chain = NULL;
                }
                ++i;
        }
        dst->nchains = src->nchains;
}

/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  The destination must have one ref.
 *
 * Returns with the destination still holding one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *tmp;
        int i;

        KKASSERT(dst->refs == 1);

        dst->focus = NULL;
        for (i = 0; i < src->nchains; ++i) {
                chain = src->array[i].chain;
                if (chain) {
                        hammer2_chain_lock(chain, 0);
                        if (i < dst->nchains &&
                            (tmp = dst->array[i].chain) != NULL) {
                                hammer2_chain_unlock(tmp);
                        }
                        dst->array[i].chain = chain;
                        if (dst->focus == NULL)
                                dst->focus = chain;
                }
        }
        while (i < dst->nchains) {
                chain = dst->array[i].chain;
                if (chain) {
                        hammer2_chain_unlock(chain);
                        dst->array[i].chain = NULL;
                }
                ++i;
        }
        dst->nchains = src->nchains;
}

/*
 * Copy a cluster, returning a ref'd cluster.  The underlying chains are
 * ref'd but not locked.
 *
 * By default (copy_flags == 0) the chains are copied to the new cluster,
 * a reference is added to each of them, and the cluster is returned with
 * 1 ref.
 *
 * If HAMMER2_CLUSTER_COPY_NOCHAINS is specified the chains are not copied
 * over and the new cluster's chain array is left empty.
 *
 * If HAMMER2_CLUSTER_COPY_NOREF is specified no additional references are
 * made to the chains and the cluster is returned with 0 refs.  Callers
 * must ref the cluster and the chains before dropping it (typically by
 * locking it).
 */
hammer2_cluster_t *
hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
{
        hammer2_pfsmount_t *pmp = ocluster->pmp;
        hammer2_cluster_t *ncluster;
        hammer2_chain_t *chain;
        int i;

        ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
        ncluster->pmp = pmp;
        ncluster->nchains = ocluster->nchains;
        ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
        if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
                ncluster->focus = ocluster->focus;
                for (i = 0; i < ocluster->nchains; ++i) {
                        chain = ocluster->array[i].chain;
                        ncluster->array[i].chain = chain;
                        if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
                            chain) {
                                hammer2_chain_ref(chain);
                        }
                }
        }
        return (ncluster);
}

/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cluster->refs > 0);
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        hammer2_chain_unlock(chain);
                        if (cluster->refs == 1)
                                cluster->array[i].chain = NULL; /* safety */
                }
        }
        if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
                cluster->focus = NULL;
                kfree(cluster, M_HAMMER2);
                /* cluster = NULL; safety */
        }
}
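/*
 * Illustrative sketch (not part of the original source): taking an
 * unlocked snapshot of a cluster's topology with hammer2_cluster_copy()
 * using default flags (chains ref'd, cluster returned with 1 ref), then
 * releasing it with hammer2_cluster_drop().  Hypothetical helper name.
 */
#if 0
static void
example_copy_inspect(hammer2_cluster_t *cluster)
{
        hammer2_cluster_t *copy;

        copy = hammer2_cluster_copy(cluster, 0);
        /* ... inspect copy->nchains, copy->focus, etc., without locks ... */
        hammer2_cluster_drop(copy);
}
#endif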
/*
 * Resize the cluster's physical storage allocation in-place.  This may
 * replace the cluster's chains.
 */
void
hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                       int nradix, int flags)
{
        hammer2_chain_t *chain;
        int i;

        KKASSERT(cparent->pmp == cluster->pmp);         /* can be NULL */
        KKASSERT(cparent->nchains == cluster->nchains);

        cluster->focus = NULL;
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        KKASSERT(cparent->array[i].chain);
                        hammer2_chain_resize(trans, ip,
                                             cparent->array[i].chain, chain,
                                             nradix, flags);
                        if (cluster->focus == NULL)
                                cluster->focus = chain;
                }
        }
}

/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in cluster is a localized copy of the cluster previously
 * acquired when the inode was locked (and possibly replaced in the mean
 * time), and must also be updated.  In fact, we update it first and then
 * synchronize the inode's cluster cache.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
                          hammer2_cluster_t *cluster, int flags)
{
        atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        hammer2_cluster_modify(trans, cluster, flags);

        hammer2_inode_repoint(ip, NULL, cluster);
        if (ip->vp)
                vsetisdirty(ip->vp);
        return (&hammer2_cluster_wdata(cluster)->ipdata);
}

/*
 * Adjust the cluster's chains to allow modification and adjust the
 * focus.  Data will be accessible on return.
 */
void
hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
                       int flags)
{
        hammer2_chain_t *chain;
        int i;

        cluster->focus = NULL;
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        hammer2_chain_modify(trans, chain, flags);
                        if (cluster->focus == NULL)
                                cluster->focus = chain;
                }
        }
}

/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * This is convenient because nominal API users can just modify the
 * contents of the focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
        hammer2_chain_t *focus;
        hammer2_chain_t *scan;
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        int i;

        focus = cluster->focus;
        KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

        for (i = 0; i < cluster->nchains; ++i) {
                scan = cluster->array[i].chain;
                if (scan == NULL || scan == focus)
                        continue;
                KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
                KKASSERT(focus->bytes == scan->bytes &&
                         focus->bref.type == scan->bref.type);
                switch(focus->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        ripdata = &focus->data->ipdata;
                        wipdata = &scan->data->ipdata;
                        if ((ripdata->op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
                                bcopy(ripdata, wipdata,
                                      offsetof(hammer2_inode_data_t, u));
                                break;
                        }
                        /* fall through */
                case HAMMER2_BREF_TYPE_DATA:
                        bcopy(focus->data, scan->data, focus->bytes);
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                case HAMMER2_BREF_TYPE_FREEMAP:
                case HAMMER2_BREF_TYPE_VOLUME:
                        panic("hammer2_cluster_modsync: illegal node type");
                        /* NOT REACHED */
                        break;
                default:
                        panic("hammer2_cluster_modsync: unknown node type");
                        break;
                }
        }
}
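/*
 * Illustrative sketch (not part of the original source): the nominal
 * front-end edit cycle described above - mark the cluster modified, edit
 * only the focus copy of the (non-blockref) data, then propagate the edit
 * to the other chains with hammer2_cluster_modsync().  The helper name
 * and the edited field are hypothetical; the cluster is assumed to be
 * locked by the caller.
 */
#if 0
static void
example_edit_focus(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
{
        hammer2_inode_data_t *wipdata;

        hammer2_cluster_modify(trans, cluster, 0);
        wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
        wipdata->mtime = 0;                     /* hypothetical field edit */
        hammer2_cluster_modsync(cluster);
}
#endif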
/*
 * Lookup initialization/completion API.
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
        hammer2_cluster_t *cluster;
        int i;

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->pmp = cparent->pmp;                    /* can be NULL */
        /* cluster->focus = NULL; already null */

        for (i = 0; i < cparent->nchains; ++i) {
                cluster->array[i].chain = cparent->array[i].chain;
                if (cluster->focus == NULL)
                        cluster->focus = cluster->array[i].chain;
        }
        cluster->nchains = cparent->nchains;

        /*
         * Independently lock (this will also give the cluster 1 ref).
         */
        if (flags & HAMMER2_LOOKUP_SHARED) {
                hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
                                              HAMMER2_RESOLVE_SHARED);
        } else {
                hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
        }
        return (cluster);
}

void
hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
{
        if (cparent)
                hammer2_cluster_unlock(cparent);
}

/*
 * Locate the first match or overlap under the parent, returning a new
 * cluster.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
                       hammer2_key_t key_beg, hammer2_key_t key_end,
                       int flags, int *ddflagp)
{
        hammer2_pfsmount_t *pmp;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *chain;
        hammer2_key_t key_accum;
        hammer2_key_t key_next;
        hammer2_key_t bref_key;
        int bref_keybits;
        int null_count;
        int ddflag;
        int i;
        uint8_t bref_type;
        u_int bytes;

        pmp = cparent->pmp;                             /* can be NULL */
        key_accum = *key_nextp;
        null_count = 0;
        bref_type = 0;
        bref_key = 0;
        bref_keybits = 0;
        bytes = 0;

        cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
        cluster->pmp = pmp;                             /* can be NULL */
        cluster->refs = 1;
        /* cluster->focus = NULL; already null */
        cparent->focus = NULL;
        *ddflagp = 0;

        for (i = 0; i < cparent->nchains; ++i) {
                key_next = *key_nextp;
                if (cparent->array[i].chain == NULL) {
                        ++null_count;
                        continue;
                }
                chain = hammer2_chain_lookup(&cparent->array[i].chain,
                                             &key_next,
                                             key_beg, key_end,
                                             &cparent->array[i].cache_index,
                                             flags, &ddflag);
                if (cparent->focus == NULL)
                        cparent->focus = cparent->array[i].chain;
                cluster->array[i].chain = chain;
                if (chain == NULL) {
                        ++null_count;
                } else {
                        if (cluster->focus == NULL) {
                                bref_type = chain->bref.type;
                                bref_key = chain->bref.key;
                                bref_keybits = chain->bref.keybits;
                                bytes = chain->bytes;
                                *ddflagp = ddflag;
                                cluster->focus = chain;
                        }
                        KKASSERT(bref_type == chain->bref.type);
                        KKASSERT(bref_key == chain->bref.key);
                        KKASSERT(bref_keybits == chain->bref.keybits);
                        KKASSERT(bytes == chain->bytes);
                        KKASSERT(*ddflagp == ddflag);
                }
                if (key_accum > key_next)
                        key_accum = key_next;
        }
        *key_nextp = key_accum;
        cluster->nchains = i;

        if (null_count == i) {
                hammer2_cluster_drop(cluster);
                cluster = NULL;
        }

        return (cluster);
}

/*
 * Locate the next match or overlap under the parent, replacing the cluster.
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                     hammer2_key_t *key_nextp,
                     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
        hammer2_chain_t *chain;
        hammer2_key_t key_accum;
        hammer2_key_t key_next;
        int null_count;
        int i;

        key_accum = *key_nextp;
        null_count = 0;
        cluster->focus = NULL;
        cparent->focus = NULL;

        for (i = 0; i < cparent->nchains; ++i) {
                key_next = *key_nextp;
                chain = cluster->array[i].chain;
                if (chain == NULL) {
                        if (cparent->focus == NULL)
                                cparent->focus = cparent->array[i].chain;
                        ++null_count;
                        continue;
                }
                if (cparent->array[i].chain == NULL) {
                        if (flags & HAMMER2_LOOKUP_NOLOCK)
                                hammer2_chain_drop(chain);
                        else
                                hammer2_chain_unlock(chain);
                        ++null_count;
                        continue;
                }
                chain = hammer2_chain_next(&cparent->array[i].chain, chain,
                                           &key_next, key_beg, key_end,
                                           &cparent->array[i].cache_index,
                                           flags);
                if (cparent->focus == NULL)
                        cparent->focus = cparent->array[i].chain;
                cluster->array[i].chain = chain;
                if (chain == NULL) {
                        ++null_count;
                } else if (cluster->focus == NULL) {
                        cluster->focus = chain;
                }
                if (key_accum > key_next)
                        key_accum = key_next;
        }

        if (null_count == i) {
                hammer2_cluster_drop(cluster);
                cluster = NULL;
        }
        return(cluster);
}
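/*
 * Illustrative sketch (not part of the original source): the
 * lookup_init/lookup/next/lookup_done iteration pattern implemented by
 * the functions above.  The helper name and key range are hypothetical.
 */
#if 0
static void
example_iterate(hammer2_cluster_t *cparent0,
                hammer2_key_t key_beg, hammer2_key_t key_end)
{
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_key_t key_next;
        int ddflag;

        cparent = hammer2_cluster_lookup_init(cparent0, 0);
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         key_beg, key_end, 0, &ddflag);
        while (cluster) {
                /* ... examine cluster->focus ... */
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                               key_next, key_end, 0);
        }
        hammer2_cluster_lookup_done(cparent);
}
#endif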
#if 0
/*
 * XXX initial NULL cluster needs reworking (pass **clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a
 * key.  Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under the parent and then iterate with the passed-in chain (which this
 * function will unlock).
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                     int flags)
{
        hammer2_chain_t *chain;
        int null_count;
        int i;

        null_count = 0;

        for (i = 0; i < cparent->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain == NULL) {
                        ++null_count;
                        continue;
                }
                if (cparent->array[i].chain == NULL) {
                        if (flags & HAMMER2_LOOKUP_NOLOCK)
                                hammer2_chain_drop(chain);
                        else
                                hammer2_chain_unlock(chain);
                        ++null_count;
                        continue;
                }

                chain = hammer2_chain_scan(cparent->array[i].chain, chain,
                                           &cparent->array[i].cache_index,
                                           flags);
                cluster->array[i].chain = chain;
                if (chain == NULL)
                        ++null_count;
        }

        if (null_count == i) {
                hammer2_cluster_drop(cluster);
                cluster = NULL;
        }
        return(cluster);
}

#endif

/*
 * Create a new cluster using the specified key.
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                       hammer2_cluster_t **clusterp,
                       hammer2_key_t key, int keybits,
                       int type, size_t bytes, int flags)
{
        hammer2_cluster_t *cluster;
        hammer2_pfsmount_t *pmp;
        int error;
        int i;

        pmp = trans->pmp;                               /* can be NULL */

        if ((cluster = *clusterp) == NULL) {
                cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
                                  M_WAITOK | M_ZERO);
                cluster->pmp = pmp;                     /* can be NULL */
                cluster->refs = 1;
        }
        cluster->focus = NULL;
        cparent->focus = NULL;

        /*
         * NOTE: cluster->array[] entries can initially be NULL.  If
         *       *clusterp is supplied, skip NULL entries, otherwise
         *       create new chains.
         */
        for (i = 0; i < cparent->nchains; ++i) {
                if (*clusterp && cluster->array[i].chain == NULL) {
                        if (cparent->focus == NULL)
                                cparent->focus = cparent->array[i].chain;
                        continue;
                }
                error = hammer2_chain_create(trans, &cparent->array[i].chain,
                                             &cluster->array[i].chain, pmp,
                                             key, keybits,
                                             type, bytes, flags);
                KKASSERT(error == 0);
                if (cparent->focus == NULL)
                        cparent->focus = cparent->array[i].chain;
                if (cluster->focus == NULL)
                        cluster->focus = cluster->array[i].chain;
        }
        cluster->nchains = i;
        *clusterp = cluster;

        return error;
}
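/*
 * Illustrative sketch (not part of the original source): creating a new
 * inode-type cluster under a locked parent with hammer2_cluster_create().
 * The helper name, key, and flags are hypothetical.
 */
#if 0
static int
example_create_inode(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                     hammer2_key_t lhc)
{
        hammer2_cluster_t *cluster = NULL;
        int error;

        error = hammer2_cluster_create(trans, cparent, &cluster,
                                       lhc, 0,
                                       HAMMER2_BREF_TYPE_INODE,
                                       HAMMER2_INODE_BYTES, 0);
        if (error == 0)
                hammer2_cluster_unlock(cluster);
        return error;
}
#endif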
/*
 * Rename a cluster to a new parent.
 *
 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
 *          are used from a passed-in non-NULL bref pointer.  All other
 *          fields are extracted from the original chain for each chain in
 *          the iteration.
 */
void
hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
                       int flags)
{
        hammer2_chain_t *chain;
        hammer2_blockref_t xbref;
        int i;

        cluster->focus = NULL;
        cparent->focus = NULL;

        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain) {
                        if (bref) {
                                xbref = chain->bref;
                                xbref.key = bref->key;
                                xbref.keybits = bref->keybits;
                                hammer2_chain_rename(trans, &xbref,
                                                     &cparent->array[i].chain,
                                                     chain, flags);
                        } else {
                                hammer2_chain_rename(trans, NULL,
                                                     &cparent->array[i].chain,
                                                     chain, flags);
                        }
                        cluster->array[i].chain = chain;
                        if (cluster->focus == NULL)
                                cluster->focus = chain;
                        if (cparent->focus == NULL)
                                cparent->focus = cparent->array[i].chain;
                } else {
                        if (cparent->focus == NULL)
                                cparent->focus = cparent->array[i].chain;
                }
        }
}

/*
 * Mark a cluster deleted.
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                       hammer2_cluster_t *cluster, int flags)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *parent;
        int i;

        if (cparent == NULL) {
                kprintf("cparent is NULL\n");
                return;
        }

        for (i = 0; i < cluster->nchains; ++i) {
                parent = (i < cparent->nchains) ?
                         cparent->array[i].chain : NULL;
                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
                if (chain->parent != parent) {
                        kprintf("hammer2_cluster_delete: parent "
                                "mismatch chain=%p parent=%p against=%p\n",
                                chain, chain->parent, parent);
                } else {
                        hammer2_chain_delete(trans, parent, chain, flags);
                }
        }
}
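/*
 * Illustrative sketch (not part of the original source): renaming a
 * cluster to a new key.  Per the WARNING on hammer2_cluster_rename()
 * above, only key and keybits are consumed from the passed-in bref; the
 * other fields are taken from each chain.  Hypothetical helper name.
 */
#if 0
static void
example_rekey(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
              hammer2_cluster_t *cluster, hammer2_key_t new_key, int keybits)
{
        hammer2_blockref_t bref;

        bzero(&bref, sizeof(bref));
        bref.key = new_key;             /* only key/keybits are used */
        bref.keybits = keybits;
        hammer2_cluster_rename(trans, &bref, cparent, cluster, 0);
}
#endif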
/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
                         hammer2_ioc_pfs_t *pfs)
{
        hammer2_mount_t *hmp;
        hammer2_cluster_t *ncluster;
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        hammer2_chain_t *nchain;
        hammer2_inode_t *nip;
        size_t name_len;
        hammer2_key_t lhc;
        struct vattr vat;
        uuid_t opfs_clid;
        int error;
        int i;

        kprintf("snapshot %s\n", pfs->name);

        name_len = strlen(pfs->name);
        lhc = hammer2_dirhash(pfs->name, name_len);

        /*
         * Get the clid.
         */
        ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
        opfs_clid = ripdata->pfs_clid;
        hmp = ocluster->focus->hmp;

        /*
         * Create the snapshot directory under the super-root.
         *
         * Set the PFS type, generate a unique filesystem id, and generate
         * a cluster id.  Use the same clid when snapshotting a PFS root,
         * which theoretically allows the snapshot to be used as part of
         * the same cluster (perhaps as a cache).
         *
         * Copy the (flushed) blockref array.  Theoretically we could use
         * chain_duplicate() but it becomes difficult to disentangle
         * the shared core so for now just brute-force it.
         */
        VATTR_NULL(&vat);
        vat.va_type = VDIR;
        vat.va_mode = 0755;
        ncluster = NULL;
        nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
                                   proc0.p_ucred, pfs->name, name_len,
                                   &ncluster, &error);

        if (nip) {
                wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
                wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
                kern_uuidgen(&wipdata->pfs_fsid, 1);
                if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
                        wipdata->pfs_clid = opfs_clid;
                else
                        kern_uuidgen(&wipdata->pfs_clid, 1);

                for (i = 0; i < ncluster->nchains; ++i) {
                        nchain = ncluster->array[i].chain;
                        if (nchain)
                                nchain->bref.flags |=
                                        HAMMER2_BREF_FLAG_PFSROOT;
                }
#if 0
                /*
                 * XXX can't set this unless we do an explicit flush, which
                 *     we also need a pmp assigned to do, else the flush code
                 *     won't flush ncluster because it thinks it is crossing
                 *     a flush boundary.
                 */
                hammer2_cluster_set_chainflags(ncluster,
                                               HAMMER2_CHAIN_PFSBOUNDARY);
#endif

                /* XXX hack blockset copy */
                /* XXX doesn't work with a real cluster */
                KKASSERT(ocluster->nchains == 1);
                wipdata->u.blockset = ripdata->u.blockset;
                hammer2_cluster_modsync(ncluster);
                for (i = 0; i < ncluster->nchains; ++i) {
                        nchain = ncluster->array[i].chain;
                        if (nchain)
                                hammer2_flush(trans, nchain);
                }
                hammer2_inode_unlock_ex(nip, ncluster);
        }
        return (error);
}

/*
 * Return a locked parent cluster given a locked child.  The child remains
 * locked on return.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
        hammer2_cluster_t *cparent;
        int i;

        cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
        for (i = 0; i < cluster->nchains; ++i) {
                hammer2_chain_t *chain;
                hammer2_chain_t *rchain;

                chain = cluster->array[i].chain;
                if (chain == NULL)
                        continue;
                hammer2_chain_ref(chain);
                while ((rchain = chain->parent) != NULL) {
                        hammer2_chain_ref(rchain);
                        hammer2_chain_unlock(chain);
                        hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
                        hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                        hammer2_chain_drop(rchain);
                        if (chain->parent == rchain)
                                break;
                        hammer2_chain_unlock(rchain);
                }
                hammer2_chain_drop(chain);
                cparent->array[i].chain = rchain;
        }
        return cparent;
}
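/*
 * Illustrative sketch (not part of the original source): walking one
 * level up with hammer2_cluster_parent().  The child stays locked; the
 * returned parent cluster is locked and ref'd and is assumed here to be
 * released via hammer2_cluster_unlock().  Hypothetical helper name.
 */
#if 0
static void
example_visit_parent(hammer2_cluster_t *cluster)
{
        hammer2_cluster_t *cparent;

        cparent = hammer2_cluster_parent(cluster);
        /* ... examine cparent->focus while both levels are locked ... */
        hammer2_cluster_unlock(cparent);
}
#endif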
/************************************************************************
 *                              CLUSTER I/O                             *
 ************************************************************************
 *
 * WARNING! blockref[] array data is not universal.  These functions should
 *          only be used to access universal data.
 *
 * NOTE!    The rdata call will wait for at least one of the chain I/Os to
 *          complete if necessary.  The I/Os should have already been
 *          initiated by the cluster_lock/chain_lock operation.
 *
 *          The cluster must already be in a modified state before wdata
 *          is called.  The data will already be available for this case.
 */
const hammer2_media_data_t *
hammer2_cluster_rdata(hammer2_cluster_t *cluster)
{
        return(cluster->focus->data);
}

hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
        KKASSERT(hammer2_cluster_modified(cluster));
        return(cluster->focus->data);
}

/*
 * Load data asynchronously into an independent buffer - used to load
 * logical buffers from underlying device data.  The callback is made for
 * the first validated data found, or NULL if no valid data is available.
 *
 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
 *       in the inode with an exclusive lock held), but the chain structure
 *       may be shared.
 */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
                           void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
        hammer2_chain_t *chain;
        hammer2_iocb_t *iocb;
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        int i;

        /*
         * Try to find a chain whose data is already resolved.  If none can
         * be found, start with the first chain.
         */
        chain = NULL;
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i].chain;
                if (chain && chain->data)
                        break;
        }
        if (i == cluster->nchains) {
                chain = cluster->array[0].chain;
                i = 0;
        }

        iocb = &cluster->iocb;
        iocb->callback = callback;
        iocb->dio = NULL;               /* for already-validated case */
        iocb->cluster = cluster;
        iocb->chain = chain;
        iocb->ptr = ptr;
        iocb->lbase = (off_t)i;
        iocb->flags = 0;
        iocb->error = 0;

        /*
         * Data already validated.
         */
        if (chain->data) {
                callback(iocb);
                return;
        }

        /*
         * We must resolve to a device buffer, either by issuing I/O or
         * by creating a zero-fill element.  We do not mark the buffer
         * dirty when creating a zero-fill element (the hammer2_chain_modify()
         * API must still be used to do that).
         *
         * The device buffer is variable-sized in powers of 2 down
         * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
         * chunk always contains buffers of the same size. (XXX)
         *
         * The minimum physical IO size may be larger than the variable
         * block size.
         */
        bref = &chain->bref;
        hmp = chain->hmp;

#if 0
        /* handled by callback? <- TODO XXX even needed for loads? */
        /*
         * The getblk() optimization for a 100% overwrite can only be used
         * if the physical block size matches the request.
         */
        if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
            chain->bytes == hammer2_devblksize(chain->bytes)) {
                error = hammer2_io_new(hmp, bref->data_off, chain->bytes,
                                       &dio);
                KKASSERT(error == 0);
                iocb->dio = dio;
                callback(iocb);
                return;
        }
#endif

        /*
         * Otherwise issue a read.
         */
        hammer2_adjreadcounter(&chain->bref, chain->bytes);
        hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}
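/*
 * Illustrative sketch (not part of the original source): the shape of a
 * hammer2_cluster_load_async() callback.  The iocb carries the chain, the
 * dio (NULL in the already-validated case), and the caller's ptr.  The
 * callback body is hypothetical; real callbacks must also complete the
 * iocb per the I/O layer's rules.
 */
#if 0
static void
example_load_callback(hammer2_iocb_t *iocb)
{
        if (iocb->error) {
                /* ... report the failure to the waiter via iocb->ptr ... */
                return;
        }
        /* ... consume validated data via iocb->chain and/or iocb->dio ... */
}
#endif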
/************************************************************************
 *                            NODE FAILURES                             *
 ************************************************************************
 *
 * A node failure can occur for numerous reasons.
 *
 *      - A read I/O may fail.
 *      - A write I/O may fail.
 *      - An unexpected chain might be found (or be missing).
 *      - A node might disconnect temporarily and reconnect later
 *        (for example, a USB stick could get pulled, or a node might
 *        be programmatically disconnected).
 *      - A node might run out of space during a modifying operation.
 *
 * When a read failure or an unexpected chain state is found, the chain and
 * parent chain at the failure point for the nodes involved (the nodes
 * which we determine to be in error) are flagged as failed and removed
 * from the cluster.  The node itself is allowed to remain active.  The
 * highest common point (usually a parent chain) is queued to the
 * resynchronization thread for action.
 *
 * When a write I/O fails or a node runs out of space, we first adjust
 * as if a read failure had occurred, but we further disable flushes on
 * the ENTIRE node.  Concurrent modifying transactions are allowed to
 * complete, but any new modifying transactions will automatically remove
 * the node from consideration in all related cluster structures and not
 * generate any new modified chains.  The ROOT chain for the failed node(s)
 * is queued to the resynchronization thread for action.
 *
 * A temporary disconnect is handled as if a write failure had occurred.
 *
 * Any of these failures might or might not stall related high level VNOPS,
 * depending on what has failed, what nodes remain, the type of cluster,
 * and the operating state of the cluster.
 *
 * FLUSH ON WRITE-DISABLED NODES
 *
 * A flush on a write-disabled node is not allowed to write anything because
 * we cannot safely update the mirror_tid anywhere on the failed node.  The
 * synchronization thread uses mirror_tid to calculate incremental resyncs.
 * Dirty meta-data related to the failed node is thrown away.
 *
 * Dirty buffer cache buffers and inodes are only thrown away if they can be
 * retired... that is, if the filesystem still has enough nodes to complete
 * the operation.
 */

/************************************************************************
 *                        SYNCHRONIZATION THREAD                        *
 ************************************************************************
 *
 * This thread is responsible for [re]synchronizing the cluster representing
 * a PFS.  Any out-of-sync or failed node starts this thread on a
 * node-by-node basis when the failure is detected.
 *
 * Clusters needing resynchronization are queued at the highest point
 * where the parent on the failed node is still valid, or a special
 * incremental scan from the ROOT is queued if no parent exists.  This
 * thread is also responsible for waiting for reconnections of the failed
 * node if the cause was a disconnect, and for waiting for space to be
 * freed up if the cause was running out of space.
 *
 * If the cause is a node running out of space, this thread will also
 * remove older (unlocked) snapshots to make new space, recover space, and
 * then start resynchronization.
 *
 * Each resynchronization pass virtually snapshots the PFS on the good
 * nodes and synchronizes using that snapshot against the target node.
 * This ensures a consistent chain topology and also avoids interference
 * between the resynchronization thread and frontend operations.
 *
 * Since these are per-node threads it is possible to resynchronize several
 * nodes at once.
 */