1 /* 2 * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information into a single entity. It allows direct access to media 37 * data as long as it is not blockref array data. 
Meaning, basically,
 * just inode and file data.
 *
 * This module also handles I/O dispatch, status rollup, and various
 * mastership arrangements including quorum operations.  It effectively
 * presents one topology to the vnops layer.
 *
 * Many of the API calls mimic chain API calls but operate on clusters
 * instead of chains.  Please see hammer2_chain.c for more complete code
 * documentation of the API functions.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Returns TRUE if any chain in the cluster needs to be resized.
 *
 * NULL array slots are skipped; only live chains are compared against
 * the requested byte count.
 */
int
hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain && chain->bytes != bytes)
			return 1;
	}
	return 0;
}

/*
 * Return the blockref type of the cluster, taken from the focus chain.
 * All chains in a cluster are expected to share the same bref type.
 */
uint8_t
hammer2_cluster_type(hammer2_cluster_t *cluster)
{
	return(cluster->focus->bref.type);
}

/*
 * Return non-zero if the cluster's focus chain is flagged MODIFIED.
 * Only the focus is consulted.
 */
int
hammer2_cluster_modified(hammer2_cluster_t *cluster)
{
	return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
}

/*
 * Return a bref representative of the cluster.  Any data offset is removed
 * (since it would only be applicable to a particular chain in the cluster).
 *
 * However, the radix portion of data_off is used for many purposes and will
 * be retained.
 */
void
hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref)
{
	*bref = cluster->focus->bref;
	bref->data_off &= HAMMER2_OFF_MASK_RADIX;
}

/*
 * Return non-zero if the chain representing an inode has been flagged
 * as having been unlinked.  Allows the vnode reclaim to avoid loading
 * the inode data from disk e.g. when unmount or recycling old, clean
 * vnodes.
 */
int
hammer2_cluster_isunlinked(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int flags;
	int i;

	/*
	 * OR together the flags of all live chains; the UNLINKED bit is
	 * considered set if any member chain has it set.
	 */
	flags = 0;
	for (i = 0; i < cluster->nchains; ++i) {
		if ((chain = cluster->array[i]) != NULL)
			flags |= chain->flags;
	}
	return (flags & HAMMER2_CHAIN_UNLINKED);
}

/*
 * Atomically set (flags) in every live chain of the cluster.
 */
void
hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain)
			atomic_set_int(&chain->flags, flags);
	}
}

/*
 * Atomically clear (flags) in every live chain of the cluster.
 */
void
hammer2_cluster_clr_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain)
			atomic_clear_int(&chain->flags, flags);
	}
}

/*
 * Flag every live chain in the cluster for flush via
 * hammer2_chain_setflush().
 */
void
hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain)
			hammer2_chain_setflush(trans, chain);
	}
}

/*
 * Set the check-algorithm field in the bref methods of every live chain.
 * The chains must already be marked MODIFIED (asserted), since the method
 * change only matters for data that will be rewritten.
 */
void
hammer2_cluster_setmethod_check(hammer2_trans_t *trans,
				hammer2_cluster_t *cluster,
				int check_algo)
{
	hammer2_chain_t *chain;
	int i;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
			/* replace only the CHECK portion of the methods */
			chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1);
			chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo);
		}
	}
}

/*
 * Create a cluster with one ref from the specified chain.  The chain
 * is not further referenced.  The caller typically supplies a locked
 * chain and transfers ownership to the cluster.
 */
hammer2_cluster_t *
hammer2_cluster_from_chain(hammer2_chain_t *chain)
{
	hammer2_cluster_t *cluster;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->array[0] = chain;
	cluster->nchains = 1;
	cluster->focus = chain;
	cluster->pmp = chain->pmp;
	cluster->refs = 1;

	return cluster;
}

/*
 * Allocates a cluster and its underlying chain structures.  The underlying
 * chains will be locked.  The cluster and underlying chains will have one
 * ref.
 *
 * One chain is allocated per chain of pmp->iroot's cluster, using that
 * chain's hmp; the bref template is copied into each.
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *rcluster;
	hammer2_chain_t *chain;
#if 0
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
#endif
	int i;

	KKASSERT(pmp != NULL);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chain's are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/* panics; assignment only quiets the compiler */
		chain = NULL;
		panic("hammer2_cluster_alloc volume type illegal for op");
	default:
		chain = NULL;
		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->refs = 1;

	/* one chain per member of the PFS root cluster */
	rcluster = &pmp->iroot->cluster;
	for (i = 0; i < rcluster->nchains; ++i) {
		chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
					    pmp, trans, bref);
#if 0
		chain->hmp = rcluster->array[i]->hmp;
		chain->bref = *bref;
		chain->bytes = bytes;
		chain->refs = 1;
		chain->flags = HAMMER2_CHAIN_ALLOCATED;
#endif

		/*
		 * NOTE: When loading a chain from backing store or creating a
		 *	 snapshot, trans will be NULL and the caller is
		 *	 responsible for setting these fields.
		 */
		cluster->array[i] = chain;
	}
	cluster->nchains = i;
	cluster->pmp = pmp;
	cluster->focus = cluster->array[0];

	return (cluster);
}

/*
 * Add a reference to a cluster.
 *
 * We must also ref the underlying chains in order to allow ref/unlock
 * sequences to later re-lock.
 */
void
hammer2_cluster_ref(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	atomic_add_int(&cluster->refs, 1);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain)
			hammer2_chain_ref(chain);
	}
}

/*
 * Drop the caller's reference to the cluster.  When the ref count drops to
 * zero this function frees the cluster and drops all underlying chains.
 *
 * In-progress read I/Os are typically detached from the cluster once the
 * first one returns (the remaining stay attached to the DIOs but are then
 * ignored and drop naturally).
298 */ 299 void 300 hammer2_cluster_drop(hammer2_cluster_t *cluster) 301 { 302 hammer2_chain_t *chain; 303 int i; 304 305 KKASSERT(cluster->refs > 0); 306 for (i = 0; i < cluster->nchains; ++i) { 307 chain = cluster->array[i]; 308 if (chain) { 309 hammer2_chain_drop(chain); 310 if (cluster->refs == 1) 311 cluster->array[i] = NULL; 312 } 313 } 314 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 315 cluster->focus = NULL; 316 kfree(cluster, M_HAMMER2); 317 /* cluster = NULL; safety */ 318 } 319 } 320 321 void 322 hammer2_cluster_wait(hammer2_cluster_t *cluster) 323 { 324 tsleep(cluster->focus, 0, "h2clcw", 1); 325 } 326 327 /* 328 * Lock and ref a cluster. This adds a ref to the cluster and its chains 329 * and then locks them. 330 */ 331 int 332 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 333 { 334 hammer2_chain_t *chain; 335 int i; 336 int error; 337 338 error = 0; 339 atomic_add_int(&cluster->refs, 1); 340 for (i = 0; i < cluster->nchains; ++i) { 341 chain = cluster->array[i]; 342 if (chain) { 343 error = hammer2_chain_lock(chain, how); 344 if (error) { 345 while (--i >= 0) 346 hammer2_chain_unlock(cluster->array[i]); 347 atomic_add_int(&cluster->refs, -1); 348 break; 349 } 350 } 351 } 352 return error; 353 } 354 355 /* 356 * Replace the contents of dst with src, adding a reference to src's chains. 357 * dst is assumed to already have a ref and any chains present in dst are 358 * assumed to be locked and will be unlocked. 359 * 360 * If the chains in src are locked, only one of (src) or (dst) should be 361 * considered locked by the caller after return, not both. 
362 */ 363 void 364 hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src) 365 { 366 hammer2_chain_t *chain; 367 int i; 368 369 KKASSERT(dst->refs == 1); 370 dst->focus = NULL; 371 372 for (i = 0; i < src->nchains; ++i) { 373 chain = src->array[i]; 374 if (chain) { 375 hammer2_chain_ref(chain); 376 if (i < dst->nchains && dst->array[i]) 377 hammer2_chain_unlock(dst->array[i]); 378 dst->array[i] = chain; 379 if (dst->focus == NULL) 380 dst->focus = chain; 381 } 382 } 383 while (i < dst->nchains) { 384 chain = dst->array[i]; 385 if (chain) { 386 hammer2_chain_unlock(chain); 387 dst->array[i] = NULL; 388 } 389 ++i; 390 } 391 dst->nchains = src->nchains; 392 } 393 394 /* 395 * Replace the contents of the locked destination with the contents of the 396 * locked source. Destination must have one ref. 397 * 398 * Returns with the destination still with one ref and the copied chains 399 * with an additional lock (representing their state on the destination). 400 * The original chains associated with the destination are unlocked. 401 */ 402 void 403 hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src) 404 { 405 hammer2_chain_t *chain; 406 int i; 407 408 KKASSERT(dst->refs == 1); 409 410 dst->focus = NULL; 411 for (i = 0; i < src->nchains; ++i) { 412 chain = src->array[i]; 413 if (chain) { 414 hammer2_chain_lock(chain, 0); 415 if (i < dst->nchains && dst->array[i]) 416 hammer2_chain_unlock(dst->array[i]); 417 dst->array[i] = src->array[i]; 418 if (dst->focus == NULL) 419 dst->focus = chain; 420 } 421 } 422 while (i < dst->nchains) { 423 chain = dst->array[i]; 424 if (chain) { 425 hammer2_chain_unlock(chain); 426 dst->array[i] = NULL; 427 } 428 ++i; 429 } 430 dst->nchains = src->nchains; 431 } 432 433 /* 434 * Copy a cluster, returned a ref'd cluster. All underlying chains 435 * are also ref'd, but not locked. 
436 * 437 * If HAMMER2_CLUSTER_COPY_CHAINS is specified, the chains are copied 438 * to the new cluster and a reference is nominally added to them and to 439 * the cluster. The cluster will have 1 ref. 440 * 441 * If HAMMER2_CLUSTER_COPY_NOREF is specified along with CHAINS, the chains 442 * are copied but no additional references are made and the cluster will have 443 * 0 refs. Callers must ref the cluster and the chains before dropping it 444 * (typically by locking it). 445 * 446 * If flags are passed as 0 the copy is setup as if it contained the chains 447 * but the chains will not be copied over, and the cluster will have 0 refs. 448 * Callers must ref the cluster before dropping it (typically by locking it). 449 */ 450 hammer2_cluster_t * 451 hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags) 452 { 453 hammer2_pfsmount_t *pmp = ocluster->pmp; 454 hammer2_cluster_t *ncluster; 455 hammer2_chain_t *chain; 456 int i; 457 458 ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO); 459 ncluster->pmp = pmp; 460 ncluster->nchains = ocluster->nchains; 461 ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1; 462 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) { 463 ncluster->focus = ocluster->focus; 464 for (i = 0; i < ocluster->nchains; ++i) { 465 chain = ocluster->array[i]; 466 ncluster->array[i] = chain; 467 if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 && 468 chain) { 469 hammer2_chain_ref(chain); 470 } 471 } 472 } 473 return (ncluster); 474 } 475 476 /* 477 * Unlock and deref a cluster. The cluster is destroyed if this is the 478 * last ref. 
479 */ 480 void 481 hammer2_cluster_unlock(hammer2_cluster_t *cluster) 482 { 483 hammer2_chain_t *chain; 484 int i; 485 486 KKASSERT(cluster->refs > 0); 487 for (i = 0; i < cluster->nchains; ++i) { 488 chain = cluster->array[i]; 489 if (chain) { 490 hammer2_chain_unlock(chain); 491 if (cluster->refs == 1) 492 cluster->array[i] = NULL; /* safety */ 493 } 494 } 495 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 496 cluster->focus = NULL; 497 kfree(cluster, M_HAMMER2); 498 /* cluster = NULL; safety */ 499 } 500 } 501 502 /* 503 * Resize the cluster's physical storage allocation in-place. This may 504 * replace the cluster's chains. 505 */ 506 void 507 hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 508 hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 509 int nradix, int flags) 510 { 511 int i; 512 513 KKASSERT(cparent->pmp == cluster->pmp); /* can be NULL */ 514 KKASSERT(cparent->nchains == cluster->nchains); 515 516 cluster->focus = NULL; 517 for (i = 0; i < cluster->nchains; ++i) { 518 if (cluster->array[i]) { 519 KKASSERT(cparent->array[i]); 520 hammer2_chain_resize(trans, ip, 521 cparent->array[i], 522 cluster->array[i], 523 nradix, flags); 524 if (cluster->focus == NULL) 525 cluster->focus = cluster->array[i]; 526 } 527 } 528 } 529 530 /* 531 * Set an inode's cluster modified, marking the related chains RW and 532 * duplicating them if necessary. 533 * 534 * The passed-in chain is a localized copy of the chain previously acquired 535 * when the inode was locked (and possilby replaced in the mean time), and 536 * must also be updated. In fact, we update it first and then synchronize 537 * the inode's cluster cache. 
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	/* re-point the inode's cached cluster at the (possibly new) chains */
	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}

/*
 * Adjust the cluster's chains to allow modification and adjust the
 * focus.  Data will be accessible on return.
 */
void
hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
		       int flags)
{
	int i;

	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i]) {
			hammer2_chain_modify(trans, cluster->array[i], flags);
			/* focus becomes the first live (modified) chain */
			if (cluster->focus == NULL)
				cluster->focus = cluster->array[i];
		}
	}
}

/*
 * Synchronize modifications from the focus to other chains in a cluster.
 * Convenient because nominal API users can just modify the contents of the
 * focus (at least for non-blockref data).
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
 */
void
hammer2_cluster_modsync(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *focus;
	hammer2_chain_t *scan;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	int i;

	focus = cluster->focus;
	KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED);

	for (i = 0; i < cluster->nchains; ++i) {
		scan = cluster->array[i];
		if (scan == NULL || scan == focus)
			continue;
		KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(focus->bytes == scan->bytes &&
			 focus->bref.type == scan->bref.type);
		switch(focus->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			ripdata = &focus->data->ipdata;
			wipdata = &scan->data->ipdata;
			if ((ripdata->op_flags &
			    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				/*
				 * Copy only the meta-data portion; the
				 * embedded union u may be a blocktable.
				 */
				bcopy(ripdata, wipdata,
				      offsetof(hammer2_inode_data_t, u));
				break;
			}
			/* fall through */
		case HAMMER2_BREF_TYPE_DATA:
			bcopy(focus->data, scan->data, focus->bytes);
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_FREEMAP:
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_cluster_modsync: illegal node type");
			/* NOT REACHED */
			break;
		default:
			panic("hammer2_cluster_modsync: unknown node type");
			break;
		}
	}
}

/*
 * Lookup initialization/completion API
 *
 * Returns a new cluster mirroring cparent's chains, independently locked
 * (exclusive, or shared if HAMMER2_LOOKUP_SHARED), with one ref.
 */
hammer2_cluster_t *
hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags)
{
	hammer2_cluster_t *cluster;
	int i;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = cparent->pmp;			/* can be NULL */
	/* cluster->focus = NULL; already null */

	for (i = 0; i < cparent->nchains; ++i) {
		cluster->array[i] = cparent->array[i];
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i];
	}
	cluster->nchains = cparent->nchains;

	/*
	 * Independently lock (this will also give cluster 1 ref)
	 */
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS |
					      HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS);
	}
	return (cluster);
}

/*
 * Terminate a lookup sequence, unlocking/dropping the cluster acquired
 * by hammer2_cluster_lookup_init().  NULL is tolerated.
 */
void
hammer2_cluster_lookup_done(hammer2_cluster_t *cparent)
{
	if (cparent)
		hammer2_cluster_unlock(cparent);
}

/*
 * Locate first match or overlap under parent, return a new cluster
 *
 * Returns NULL if no chain in any member matched.  All matching member
 * chains are asserted to agree on bref type/key/keybits/bytes/ddflag.
 */
hammer2_cluster_t *
hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp,
		       hammer2_key_t key_beg, hammer2_key_t key_end,
		       int flags, int *ddflagp)
{
	hammer2_pfsmount_t *pmp;
	hammer2_cluster_t *cluster;
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	hammer2_key_t bref_key;
	int bref_keybits;
	int null_count;
	int ddflag;
	int i;
	uint8_t bref_type;
	u_int bytes;

	pmp = cparent->pmp;				/* can be NULL */
	key_accum = *key_nextp;
	null_count = 0;
	bref_type = 0;
	bref_key = 0;
	bref_keybits = 0;
	bytes = 0;

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->pmp = pmp;				/* can be NULL */
	cluster->refs = 1;
	/* cluster->focus = NULL; already null */
	cparent->focus = NULL;
	*ddflagp = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		if (cparent->array[i] == NULL) {
			++null_count;
			continue;
		}
		chain = hammer2_chain_lookup(&cparent->array[i], &key_next,
					     key_beg, key_end,
					     &cparent->cache_index[i],
					     flags, &ddflag);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else {
			/*
			 * First hit establishes the reference values the
			 * remaining member chains must agree with.
			 */
			if (cluster->focus == NULL) {
				bref_type = chain->bref.type;
				bref_key = chain->bref.key;
				bref_keybits = chain->bref.keybits;
				bytes = chain->bytes;
				*ddflagp = ddflag;
				cluster->focus = chain;
			}
			KKASSERT(bref_type == chain->bref.type);
			KKASSERT(bref_key == chain->bref.key);
			KKASSERT(bref_keybits == chain->bref.keybits);
			KKASSERT(bytes == chain->bytes);
			KKASSERT(*ddflagp == ddflag);
		}
		/* iteration resumes at the minimum key_next over members */
		if (key_accum > key_next)
			key_accum = key_next;
	}
	*key_nextp = key_accum;
	cluster->nchains = i;

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}

	return (cluster);
}

/*
 * Locate next match or overlap under parent, replace cluster
 */
hammer2_cluster_t *
hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end, int flags)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_accum;
	hammer2_key_t key_next;
	int null_count;
	int i;

	key_accum = *key_nextp;
	null_count = 0;
	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cparent->nchains; ++i) {
		key_next = *key_nextp;
		chain = cluster->array[i];
		if (chain == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			/* parent slot vanished; release the child chain */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}
		chain = hammer2_chain_next(&cparent->array[i], chain,
					   &key_next, key_beg, key_end,
					   &cparent->cache_index[i], flags);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		cluster->array[i] = chain;
		if (chain == NULL) {
			++null_count;
		} else if (cluster->focus == NULL) {
			cluster->focus = chain;
		}
		if (key_accum > key_next)
			key_accum = key_next;
	}

	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

#if 0
/*
 * XXX initial NULL cluster needs reworking (pass
**clusterp ?)
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
 */
hammer2_cluster_t *
hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		     int flags)
{
	hammer2_chain_t *chain;
	int null_count;
	int i;

	null_count = 0;

	for (i = 0; i < cparent->nchains; ++i) {
		chain = cluster->array[i];
		if (chain == NULL) {
			++null_count;
			continue;
		}
		if (cparent->array[i] == NULL) {
			/* parent slot vanished; release the child chain */
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_drop(chain);
			else
				hammer2_chain_unlock(chain);
			++null_count;
			continue;
		}

		chain = hammer2_chain_scan(cparent->array[i], chain,
					   &cparent->cache_index[i], flags);
		cluster->array[i] = chain;
		if (chain == NULL)
			++null_count;
	}

	/* all member iterations exhausted: drop and return NULL */
	if (null_count == i) {
		hammer2_cluster_drop(cluster);
		cluster = NULL;
	}
	return(cluster);
}

#endif

/*
 * Create a new cluster using the specified key
 *
 * If *clusterp is NULL a fresh cluster (1 ref) is allocated; otherwise the
 * supplied cluster is filled in.  One chain is created under each member
 * of cparent.
 */
int
hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t **clusterp,
		       hammer2_key_t key, int keybits,
		       int type, size_t bytes, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_pfsmount_t *pmp;
	int error;
	int i;

	pmp = trans->pmp;				/* can be NULL */

	if ((cluster = *clusterp) == NULL) {
		cluster = kmalloc(sizeof(*cluster), M_HAMMER2,
				  M_WAITOK | M_ZERO);
		cluster->pmp = pmp;			/* can be NULL */
		cluster->refs = 1;
	}
	cluster->focus = NULL;
	cparent->focus = NULL;

	/*
	 * NOTE: cluster->array[] entries can initially be NULL.  If
	 *	 *clusterp is supplied, skip NULL entries, otherwise
	 *	 create new chains.
	 */
	for (i = 0; i < cparent->nchains; ++i) {
		if (*clusterp && cluster->array[i] == NULL) {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
			continue;
		}
		error = hammer2_chain_create(trans, &cparent->array[i],
					     &cluster->array[i], pmp,
					     key, keybits,
					     type, bytes, flags);
		KKASSERT(error == 0);
		if (cparent->focus == NULL)
			cparent->focus = cparent->array[i];
		if (cluster->focus == NULL)
			cluster->focus = cluster->array[i];
	}
	cluster->nchains = i;
	*clusterp = cluster;

	return error;
}

/*
 * Rename a cluster to a new parent.
 *
 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
 *	    are used from a passed-in non-NULL bref pointer.  All other fields
 *	    are extracted from the original chain for each chain in the
 *	    iteration.
 */
void
hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t xbref;
	int i;

	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			if (bref) {
				/*
				 * Merge only key/keybits from the caller's
				 * bref into a copy of the chain's own bref.
				 */
				xbref = chain->bref;
				xbref.key = bref->key;
				xbref.keybits = bref->keybits;
				hammer2_chain_rename(trans, &xbref,
						     &cparent->array[i],
						     chain, flags);
			} else {
				hammer2_chain_rename(trans, NULL,
						     &cparent->array[i],
						     chain, flags);
			}
			cluster->array[i] = chain;
			if (cluster->focus == NULL)
				cluster->focus = chain;
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		} else {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		}
	}
}

/*
 * Mark a cluster deleted
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t *cluster, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	int i;

	if (cparent == NULL) {
		kprintf("cparent is NULL\n");
		return;
	}

	for (i = 0; i < cluster->nchains; ++i) {
		parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
		chain = cluster->array[i];
		if (chain == NULL)
			continue;
		/* sanity: only delete chains actually under this parent */
		if (chain->parent != parent) {
			kprintf("hammer2_cluster_delete: parent "
				"mismatch chain=%p parent=%p against=%p\n",
				chain, chain->parent, parent);
		} else {
			hammer2_chain_delete(trans, parent, chain, flags);
		}
	}
}

/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
			 hammer2_ioc_pfs_t *pfs)
{
	hammer2_mount_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
	uuid_t opfs_clid;
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);

	/*
	 * Get the clid
	 */
	ripdata = &hammer2_cluster_rdata(ocluster)->ipdata;
	opfs_clid = ripdata->pfs_clid;
	hmp = ocluster->focus->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);
		/* reuse the source clid only when snapshotting a PFS root */
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);

		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i]) {
				ncluster->array[i]->bref.flags |=
					HAMMER2_BREF_FLAG_PFSROOT;
			}
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ripdata->u.blockset;
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i])
				hammer2_flush(trans, ncluster->array[i]);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}

/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
	for (i = 0; i < cluster->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		chain = cluster->array[i];
		if (chain == NULL)
			continue;
		hammer2_chain_ref(chain);
		/*
		 * Lock-order dance: the parent must be locked before the
		 * child, so unlock the child, lock the parent, relock the
		 * child, then verify the parent did not change underneath
		 * us while the child was unlocked.  Retry if it did.
		 */
		while ((rchain = chain->parent) != NULL) {
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_drop(rchain);
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
		}
		hammer2_chain_drop(chain);
		cparent->array[i] = rchain;
	}
	return cparent;
}

/************************************************************************
 *			        CLUSTER I/O 				*
 ************************************************************************
 *
 *
 * WARNING! blockref[] array data is not universal.  These functions should
 *	    only be used to access universal data.
 *
 * NOTE!    The rdata call will wait for at least one of the chain I/Os to
 *	    complete if necessary.  The I/O's should have already been
 *	    initiated by the cluster_lock/chain_lock operation.
 *
 *	    The cluster must already be in a modified state before wdata
 *	    is called.  The data will already be available for this case.
 */
const hammer2_media_data_t *
hammer2_cluster_rdata(hammer2_cluster_t *cluster)
{
	return(cluster->focus->data);
}

hammer2_media_data_t *
hammer2_cluster_wdata(hammer2_cluster_t *cluster)
{
	KKASSERT(hammer2_cluster_modified(cluster));
	return(cluster->focus->data);
}

/*
 * Load async into independent buffer - used to load logical buffers from
 * underlying device data.  The callback is made for the first validated
 * data found, or NULL if no valid data is available.
 *
 * NOTE! The cluster structure is either unique or serialized (e.g. embedded
 *	 in the inode with an exclusive lock held), the chain structure may be
 *	 shared.
 */
void
hammer2_cluster_load_async(hammer2_cluster_t *cluster,
			   void (*callback)(hammer2_iocb_t *iocb), void *ptr)
{
	hammer2_chain_t *chain;
	hammer2_iocb_t *iocb;
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	int i;

	/*
	 * Try to find a chain whos data is already resolved.  If none can
	 * be found, start with the first chain.
	 */
	chain = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain && chain->data)
			break;
	}
	if (i == cluster->nchains) {
		chain = cluster->array[0];
		i = 0;
	}

	/* the iocb is embedded in the cluster (serialized, see NOTE above) */
	iocb = &cluster->iocb;
	iocb->callback = callback;
	iocb->dio = NULL;			/* for already-validated case */
	iocb->cluster = cluster;
	iocb->chain = chain;
	iocb->ptr = ptr;
	iocb->lbase = (off_t)i;
	iocb->flags = 0;
	iocb->error = 0;

	/*
	 * Data already validated
	 */
	if (chain->data) {
		callback(iocb);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

#if 0
	/* handled by callback? <- TODO XXX even needed for loads? */
	/*
	 * The getblk() optimization for a 100% overwrite can only be used
	 * if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		iocb->dio = dio;
		callback(iocb);
		return;
	}
#endif

	/*
	 * Otherwise issue a read
	 */
	hammer2_adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_getblk(hmp, bref->data_off, chain->bytes, iocb);
}

/************************************************************************
 *			       NODE FAILURES				*
 ************************************************************************
 *
 * A node failure can occur for numerous reasons.
 *
 *	- A read I/O may fail
 *	- A write I/O may fail
 *	- An unexpected chain might be found (or be missing)
 *	- A node might disconnect temporarily and reconnect later
 *	  (for example, a USB stick could get pulled, or a node might
 *	  be programmatically disconnected).
 *	- A node might run out of space during a modifying operation.
1251 * 1252 * When a read failure or an unexpected chain state is found, the chain and 1253 * parent chain at the failure point for the nodes involved (the nodes 1254 * which we determine to be in error) are flagged as failed and removed 1255 * from the cluster. The node itself is allowed to remain active. The 1256 * highest common point (usually a parent chain) is queued to the 1257 * resynchronization thread for action. 1258 * 1259 * When a write I/O fails or a node runs out of space, we first adjust 1260 * as if a read failure occurs but we further disable flushes on the 1261 * ENTIRE node. Concurrent modifying transactions are allowed to complete 1262 * but any new modifying transactions will automatically remove the node 1263 * from consideration in all related cluster structures and not generate 1264 * any new modified chains. The ROOT chain for the failed node(s) is queued 1265 * to the resynchronization thread for action. 1266 * 1267 * A temporary disconnect is handled as if a write failure occurred. 1268 * 1269 * Any of these failures might or might not stall related high level VNOPS, 1270 * depending on what has failed, what nodes remain, the type of cluster, 1271 * and the operating state of the cluster. 1272 * 1273 * FLUSH ON WRITE-DISABLED NODES 1274 * 1275 * A flush on a write-disabled node is not allowed to write anything because 1276 * we cannot safely update the mirror_tid anywhere on the failed node. The 1277 * synchronization thread uses mirror_tid to calculate incremental resyncs. 1278 * Dirty meta-data related to the failed node is thrown away. 1279 * 1280 * Dirty buffer cache buffers and inodes are only thrown away if they can be 1281 * retired... that is, if the filesystem still has enough nodes to complete 1282 * the operation. 
1283 */ 1284 1285 /************************************************************************ 1286 * SYNCHRONIZATION THREAD * 1287 ************************************************************************ 1288 * 1289 * This thread is responsible for [re]synchronizing the cluster representing 1290 * a PFS. Any out-of-sync or failed node starts this thread on a 1291 * node-by-node basis when the failure is detected. 1292 * 1293 * Clusters needing resynchronization are queued at the highest point 1294 * where the parent on the failed node is still valid, or a special 1295 * incremental scan from the ROOT is queued if no parent exists. This 1296 * thread is also responsible for waiting for reconnections of the failed 1297 * node if the cause was due to a disconnect, and waiting for space to be 1298 * freed up if the cause was due to running out of space. 1299 * 1300 * If the cause is due to a node running out of space, this thread will also 1301 * remove older (unlocked) snapshots to make new space, recover space, and 1302 * then start resynchronization. 1303 * 1304 * Each resynchronization pass virtually snapshots the PFS on the good nodes 1305 * and synchronizes using that snapshot against the target node. This 1306 * ensures a consistent chain topology and also avoids interference between 1307 * the resynchronization thread and frontend operations. 1308 * 1309 * Since these are per-node threads it is possible to resynchronize several 1310 * nodes at once. 1311 */ 1312