1 /* 2 * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * The cluster module collects multiple chains representing the same 36 * information into a single entity. It allows direct access to media 37 * data as long as it is not blockref array data. 
Meaning, basically, 38 * just inode and file data. 39 * 40 * This module also handles I/O dispatch, status rollup, and various 41 * mastership arrangements including quorum operations. It effectively 42 * presents one topology to the vnops layer. 43 * 44 * Many of the API calls mimic chain API calls but operate on clusters 45 * instead of chains. Please see hammer2_chain.c for more complete code 46 * documentation of the API functions. 47 */ 48 #include <sys/cdefs.h> 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/types.h> 52 #include <sys/lock.h> 53 #include <sys/uuid.h> 54 55 #include "hammer2.h" 56 57 /* 58 * Returns TRUE if any chain in the cluster needs to be resized. 59 */ 60 int 61 hammer2_cluster_need_resize(hammer2_cluster_t *cluster, int bytes) 62 { 63 hammer2_chain_t *chain; 64 int i; 65 66 for (i = 0; i < cluster->nchains; ++i) { 67 chain = cluster->array[i]; 68 if (chain && chain->bytes != bytes) 69 return 1; 70 } 71 return 0; 72 } 73 74 uint8_t 75 hammer2_cluster_type(hammer2_cluster_t *cluster) 76 { 77 return(cluster->focus->bref.type); 78 } 79 80 /* 81 * NOTE: When modifying a cluster object via hammer2_cluster_wdata() 82 * and hammer2_cluster_modsync(), remember that block array 83 * entries are not copied to the elements of the cluster. 84 */ 85 const hammer2_media_data_t * 86 hammer2_cluster_data(hammer2_cluster_t *cluster) 87 { 88 return(cluster->focus->data); 89 } 90 91 hammer2_media_data_t * 92 hammer2_cluster_wdata(hammer2_cluster_t *cluster) 93 { 94 return(cluster->focus->data); 95 } 96 97 int 98 hammer2_cluster_modified(hammer2_cluster_t *cluster) 99 { 100 return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0); 101 } 102 103 /* 104 * Return a bref representative of the cluster. Any data offset is removed 105 * (since it would only be applicable to a particular chain in the cluster). 106 * 107 * However, the radix portion of data_off is used for many purposes and will 108 * be retained. 
109 */ 110 void 111 hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref) 112 { 113 *bref = cluster->focus->bref; 114 bref->data_off &= HAMMER2_OFF_MASK_RADIX; 115 } 116 117 void 118 hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags) 119 { 120 hammer2_chain_t *chain; 121 int i; 122 123 for (i = 0; i < cluster->nchains; ++i) { 124 chain = cluster->array[i]; 125 if (chain) 126 atomic_set_int(&chain->flags, flags); 127 } 128 } 129 130 void 131 hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster) 132 { 133 hammer2_chain_t *chain; 134 int i; 135 136 for (i = 0; i < cluster->nchains; ++i) { 137 chain = cluster->array[i]; 138 if (chain) 139 hammer2_chain_setflush(trans, chain); 140 } 141 } 142 143 void 144 hammer2_cluster_setmethod_check(hammer2_trans_t *trans, 145 hammer2_cluster_t *cluster, 146 int check_algo) 147 { 148 hammer2_chain_t *chain; 149 int i; 150 151 for (i = 0; i < cluster->nchains; ++i) { 152 chain = cluster->array[i]; 153 if (chain) { 154 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 155 chain->bref.methods &= ~HAMMER2_ENC_CHECK(-1); 156 chain->bref.methods |= HAMMER2_ENC_CHECK(check_algo); 157 } 158 } 159 } 160 161 /* 162 * Create a cluster with one ref from the specified chain. The chain 163 * is not further referenced. The caller typically supplies a locked 164 * chain and transfers ownership to the cluster. 165 */ 166 hammer2_cluster_t * 167 hammer2_cluster_from_chain(hammer2_chain_t *chain) 168 { 169 hammer2_cluster_t *cluster; 170 171 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 172 cluster->array[0] = chain; 173 cluster->nchains = 1; 174 cluster->focus = chain; 175 cluster->pmp = chain->pmp; 176 cluster->refs = 1; 177 178 return cluster; 179 } 180 181 /* 182 * Allocates a cluster and its underlying chain structures. The underlying 183 * chains will be locked. The cluster and underlying chains will have one 184 * ref. 
 */
hammer2_cluster_t *
hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
		      hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *rcluster;
	hammer2_chain_t *chain;
#if 0
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
#endif
	int i;

	KKASSERT(pmp != NULL);

	/*
	 * Construct the appropriate system structure.  Only a subset of
	 * blockref types may be allocated through this path; volume and
	 * freemap roots panic, as does any unrecognized type.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chain's are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		chain = NULL;
		panic("hammer2_cluster_alloc volume type illegal for op");
		/* NOT REACHED */
	default:
		chain = NULL;
		panic("hammer2_cluster_alloc: unrecognized blockref type: %d",
		      bref->type);
		/* NOT REACHED */
	}

	cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO);
	cluster->refs = 1;

	/*
	 * Allocate one chain per node in the PFS root cluster, all sharing
	 * the supplied bref.  The cluster width mirrors pmp->iroot.
	 */
	rcluster = &pmp->iroot->cluster;
	for (i = 0; i < rcluster->nchains; ++i) {
		chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
					    pmp, trans, bref);
#if 0
		chain->hmp = rcluster->array[i]->hmp;
		chain->bref = *bref;
		chain->bytes = bytes;
		chain->refs = 1;
		chain->flags = HAMMER2_CHAIN_ALLOCATED;
#endif

		/*
		 * NOTE: When loading a chain from backing store or creating a
		 *	 snapshot, trans will be NULL and the caller is
		 *	 responsible for setting these fields.
		 */
		cluster->array[i] = chain;
	}
	cluster->nchains = i;
	cluster->pmp = pmp;
	cluster->focus = cluster->array[0];

	return (cluster);
}

/*
 * Add a reference to a cluster.
256 * 257 * We must also ref the underlying chains in order to allow ref/unlock 258 * sequences to later re-lock. 259 */ 260 void 261 hammer2_cluster_ref(hammer2_cluster_t *cluster) 262 { 263 hammer2_chain_t *chain; 264 int i; 265 266 atomic_add_int(&cluster->refs, 1); 267 for (i = 0; i < cluster->nchains; ++i) { 268 chain = cluster->array[i]; 269 if (chain) 270 hammer2_chain_ref(chain); 271 } 272 } 273 274 /* 275 * Drop the caller's reference to the cluster. When the ref count drops to 276 * zero this function frees the cluster and drops all underlying chains. 277 */ 278 void 279 hammer2_cluster_drop(hammer2_cluster_t *cluster) 280 { 281 hammer2_chain_t *chain; 282 int i; 283 284 KKASSERT(cluster->refs > 0); 285 for (i = 0; i < cluster->nchains; ++i) { 286 chain = cluster->array[i]; 287 if (chain) { 288 hammer2_chain_drop(chain); 289 if (cluster->refs == 1) 290 cluster->array[i] = NULL; 291 } 292 } 293 if (atomic_fetchadd_int(&cluster->refs, -1) == 1) { 294 cluster->focus = NULL; 295 kfree(cluster, M_HAMMER2); 296 /* cluster = NULL; safety */ 297 } 298 } 299 300 void 301 hammer2_cluster_wait(hammer2_cluster_t *cluster) 302 { 303 tsleep(cluster->focus, 0, "h2clcw", 1); 304 } 305 306 /* 307 * Lock and ref a cluster. This adds a ref to the cluster and its chains 308 * and then locks them. 309 */ 310 int 311 hammer2_cluster_lock(hammer2_cluster_t *cluster, int how) 312 { 313 hammer2_chain_t *chain; 314 int i; 315 int error; 316 317 error = 0; 318 atomic_add_int(&cluster->refs, 1); 319 for (i = 0; i < cluster->nchains; ++i) { 320 chain = cluster->array[i]; 321 if (chain) { 322 error = hammer2_chain_lock(chain, how); 323 if (error) { 324 while (--i >= 0) 325 hammer2_chain_unlock(cluster->array[i]); 326 atomic_add_int(&cluster->refs, -1); 327 break; 328 } 329 } 330 } 331 return error; 332 } 333 334 /* 335 * Replace the contents of dst with src, adding a reference to src's chains. 
 * dst is assumed to already have a ref and any chains present in dst are
 * assumed to be locked and will be unlocked.
 *
 * If the chains in src are locked, only one of (src) or (dst) should be
 * considered locked by the caller after return, not both.
 */
void
hammer2_cluster_replace(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);
	dst->focus = NULL;

	/*
	 * Install src's chains into dst, ref'ing each one and unlocking
	 * any chain previously occupying the same slot in dst.  The focus
	 * becomes the first non-NULL chain installed.
	 *
	 * NOTE(review): when src->array[i] is NULL, any chain already in
	 * dst->array[i] is left in place (and still locked) — confirm this
	 * is intended for sparse source clusters.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_ref(chain);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = chain;
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * If dst was wider than src, unlock and clear the leftover slots.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}

/*
 * Replace the contents of the locked destination with the contents of the
 * locked source.  Destination must have one ref.
 *
 * Returns with the destination still with one ref and the copied chains
 * with an additional lock (representing their state on the destination).
 * The original chains associated with the destination are unlocked.
 */
void
hammer2_cluster_replace_locked(hammer2_cluster_t *dst, hammer2_cluster_t *src)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(dst->refs == 1);

	dst->focus = NULL;
	/*
	 * Lock each source chain (the extra lock represents its state on
	 * the destination) and install it, unlocking any chain previously
	 * occupying the same slot in dst.
	 */
	for (i = 0; i < src->nchains; ++i) {
		chain = src->array[i];
		if (chain) {
			hammer2_chain_lock(chain, 0);
			if (i < dst->nchains && dst->array[i])
				hammer2_chain_unlock(dst->array[i]);
			dst->array[i] = src->array[i];
			if (dst->focus == NULL)
				dst->focus = chain;
		}
	}
	/*
	 * If dst was wider than src, unlock and clear the leftover slots.
	 */
	while (i < dst->nchains) {
		chain = dst->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			dst->array[i] = NULL;
		}
		++i;
	}
	dst->nchains = src->nchains;
}

/*
 * Copy a cluster, returned a ref'd cluster.  All underlying chains
 * are also ref'd, but not locked.
 *
 * By default the chains are copied to the new cluster, a reference is
 * added to each, and the cluster is returned with 1 ref.
 *
 * If HAMMER2_CLUSTER_COPY_NOREF is specified the chains are copied but
 * no additional references are made and the cluster will have 0 refs.
 * Callers must ref the cluster and the chains before dropping it
 * (typically by locking it).
 *
 * If HAMMER2_CLUSTER_COPY_NOCHAINS is specified the copy is set up as if
 * it contained the chains but the chains will not be copied over, and
 * neither they nor the cluster gain a reference from this path.
 */
hammer2_cluster_t *
hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
{
	hammer2_pfsmount_t *pmp = ocluster->pmp;
	hammer2_cluster_t *ncluster;
	hammer2_chain_t *chain;
	int i;

	ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
	ncluster->pmp = pmp;
	ncluster->nchains = ocluster->nchains;
	ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
	if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
		ncluster->focus = ocluster->focus;
		for (i = 0; i < ocluster->nchains; ++i) {
			chain = ocluster->array[i];
			ncluster->array[i] = chain;
			if ((copy_flags & HAMMER2_CLUSTER_COPY_NOREF) == 0 &&
			    chain) {
				hammer2_chain_ref(chain);
			}
		}
	}
	return (ncluster);
}

/*
 * Unlock and deref a cluster.  The cluster is destroyed if this is the
 * last ref.
 */
void
hammer2_cluster_unlock(hammer2_cluster_t *cluster)
{
	hammer2_chain_t *chain;
	int i;

	KKASSERT(cluster->refs > 0);
	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			hammer2_chain_unlock(chain);
			if (cluster->refs == 1)
				cluster->array[i] = NULL;	/* safety */
		}
	}
	if (atomic_fetchadd_int(&cluster->refs, -1) == 1) {
		cluster->focus = NULL;
		kfree(cluster, M_HAMMER2);
		/* cluster = NULL; safety */
	}
}

/*
 * Resize the cluster's physical storage allocation in-place.  This may
 * replace the cluster's chains.
 */
void
hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int nradix, int flags)
{
	int i;

	KKASSERT(cparent->pmp == cluster->pmp);		/* can be NULL */
	KKASSERT(cparent->nchains == cluster->nchains);

	/* focus is re-resolved to the first non-NULL chain */
	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i]) {
			KKASSERT(cparent->array[i]);
			hammer2_chain_resize(trans, ip,
					     cparent->array[i],
					     cluster->array[i],
					     nradix, flags);
			if (cluster->focus == NULL)
				cluster->focus = cluster->array[i];
		}
	}
}

/*
 * Set an inode's cluster modified, marking the related chains RW and
 * duplicating them if necessary.
 *
 * The passed-in chain is a localized copy of the chain previously acquired
 * when the inode was locked (and possibly replaced in the mean time), and
 * must also be updated.  In fact, we update it first and then synchronize
 * the inode's cluster cache.
 */
hammer2_inode_data_t *
hammer2_cluster_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			  hammer2_cluster_t *cluster, int flags)
{
	/* Flag the inode modified before touching the chains. */
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_cluster_modify(trans, cluster, flags);

	/* Re-point the inode at the (possibly replaced) cluster chains. */
	hammer2_inode_repoint(ip, NULL, cluster);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return (&hammer2_cluster_wdata(cluster)->ipdata);
}

/*
 * Adjust the cluster's chains to allow modification.
 */
void
hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
		       int flags)
{
	int i;

	/* focus is re-resolved to the first non-NULL chain */
	cluster->focus = NULL;
	for (i = 0; i < cluster->nchains; ++i) {
		if (cluster->array[i]) {
			hammer2_chain_modify(trans, cluster->array[i], flags);
			if (cluster->focus == NULL)
				cluster->focus = cluster->array[i];
		}
	}
}

/*
 * Synchronize modifications with other chains in a cluster.
 *
 * Nominal front-end operations only edit non-block-table data in a single
 * chain.  This code copies such modifications to the other chains in the
 * cluster.  Blocktable modifications are handled on a chain-by-chain basis
 * by both the frontend and the backend and will explode in fireworks if
 * blindly copied.
558 */ 559 void 560 hammer2_cluster_modsync(hammer2_cluster_t *cluster) 561 { 562 hammer2_chain_t *focus; 563 hammer2_chain_t *scan; 564 const hammer2_inode_data_t *ripdata; 565 hammer2_inode_data_t *wipdata; 566 int i; 567 568 focus = cluster->focus; 569 KKASSERT(focus->flags & HAMMER2_CHAIN_MODIFIED); 570 571 for (i = 0; i < cluster->nchains; ++i) { 572 scan = cluster->array[i]; 573 if (scan == NULL || scan == focus) 574 continue; 575 KKASSERT(scan->flags & HAMMER2_CHAIN_MODIFIED); 576 KKASSERT(focus->bytes == scan->bytes && 577 focus->bref.type == scan->bref.type); 578 switch(focus->bref.type) { 579 case HAMMER2_BREF_TYPE_INODE: 580 ripdata = &focus->data->ipdata; 581 wipdata = &scan->data->ipdata; 582 if ((ripdata->op_flags & 583 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 584 bcopy(ripdata, wipdata, 585 offsetof(hammer2_inode_data_t, u)); 586 break; 587 } 588 /* fall through */ 589 case HAMMER2_BREF_TYPE_DATA: 590 bcopy(focus->data, scan->data, focus->bytes); 591 break; 592 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 593 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 594 case HAMMER2_BREF_TYPE_FREEMAP: 595 case HAMMER2_BREF_TYPE_VOLUME: 596 panic("hammer2_cluster_modsync: illegal node type"); 597 /* NOT REACHED */ 598 break; 599 default: 600 panic("hammer2_cluster_modsync: unknown node type"); 601 break; 602 } 603 } 604 } 605 606 /* 607 * Lookup initialization/completion API 608 */ 609 hammer2_cluster_t * 610 hammer2_cluster_lookup_init(hammer2_cluster_t *cparent, int flags) 611 { 612 hammer2_cluster_t *cluster; 613 int i; 614 615 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 616 cluster->pmp = cparent->pmp; /* can be NULL */ 617 /* cluster->focus = NULL; already null */ 618 619 for (i = 0; i < cparent->nchains; ++i) { 620 cluster->array[i] = cparent->array[i]; 621 if (cluster->focus == NULL) 622 cluster->focus = cluster->array[i]; 623 } 624 cluster->nchains = cparent->nchains; 625 626 /* 627 * Independently lock (this will also give cluster 1 ref) 628 */ 629 if 
(flags & HAMMER2_LOOKUP_SHARED) { 630 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS | 631 HAMMER2_RESOLVE_SHARED); 632 } else { 633 hammer2_cluster_lock(cluster, HAMMER2_RESOLVE_ALWAYS); 634 } 635 return (cluster); 636 } 637 638 void 639 hammer2_cluster_lookup_done(hammer2_cluster_t *cparent) 640 { 641 if (cparent) 642 hammer2_cluster_unlock(cparent); 643 } 644 645 /* 646 * Locate first match or overlap under parent, return a new cluster 647 */ 648 hammer2_cluster_t * 649 hammer2_cluster_lookup(hammer2_cluster_t *cparent, hammer2_key_t *key_nextp, 650 hammer2_key_t key_beg, hammer2_key_t key_end, 651 int flags, int *ddflagp) 652 { 653 hammer2_pfsmount_t *pmp; 654 hammer2_cluster_t *cluster; 655 hammer2_chain_t *chain; 656 hammer2_key_t key_accum; 657 hammer2_key_t key_next; 658 hammer2_key_t bref_key; 659 int bref_keybits; 660 int null_count; 661 int ddflag; 662 int i; 663 uint8_t bref_type; 664 u_int bytes; 665 666 pmp = cparent->pmp; /* can be NULL */ 667 key_accum = *key_nextp; 668 null_count = 0; 669 bref_type = 0; 670 bref_key = 0; 671 bref_keybits = 0; 672 bytes = 0; 673 674 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, M_WAITOK | M_ZERO); 675 cluster->pmp = pmp; /* can be NULL */ 676 cluster->refs = 1; 677 /* cluster->focus = NULL; already null */ 678 cparent->focus = NULL; 679 *ddflagp = 0; 680 681 for (i = 0; i < cparent->nchains; ++i) { 682 key_next = *key_nextp; 683 if (cparent->array[i] == NULL) { 684 ++null_count; 685 continue; 686 } 687 chain = hammer2_chain_lookup(&cparent->array[i], &key_next, 688 key_beg, key_end, 689 &cparent->cache_index[i], 690 flags, &ddflag); 691 if (cparent->focus == NULL) 692 cparent->focus = cparent->array[i]; 693 cluster->array[i] = chain; 694 if (chain == NULL) { 695 ++null_count; 696 } else { 697 if (cluster->focus == NULL) { 698 bref_type = chain->bref.type; 699 bref_key = chain->bref.key; 700 bref_keybits = chain->bref.keybits; 701 bytes = chain->bytes; 702 *ddflagp = ddflag; 703 cluster->focus = chain; 704 } 
705 KKASSERT(bref_type == chain->bref.type); 706 KKASSERT(bref_key == chain->bref.key); 707 KKASSERT(bref_keybits == chain->bref.keybits); 708 KKASSERT(bytes == chain->bytes); 709 KKASSERT(*ddflagp == ddflag); 710 } 711 if (key_accum > key_next) 712 key_accum = key_next; 713 } 714 *key_nextp = key_accum; 715 cluster->nchains = i; 716 717 if (null_count == i) { 718 hammer2_cluster_drop(cluster); 719 cluster = NULL; 720 } 721 722 return (cluster); 723 } 724 725 /* 726 * Locate next match or overlap under parent, replace cluster 727 */ 728 hammer2_cluster_t * 729 hammer2_cluster_next(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 730 hammer2_key_t *key_nextp, 731 hammer2_key_t key_beg, hammer2_key_t key_end, int flags) 732 { 733 hammer2_chain_t *chain; 734 hammer2_key_t key_accum; 735 hammer2_key_t key_next; 736 int null_count; 737 int i; 738 739 key_accum = *key_nextp; 740 null_count = 0; 741 cluster->focus = NULL; 742 cparent->focus = NULL; 743 744 for (i = 0; i < cparent->nchains; ++i) { 745 key_next = *key_nextp; 746 chain = cluster->array[i]; 747 if (chain == NULL) { 748 if (cparent->focus == NULL) 749 cparent->focus = cparent->array[i]; 750 ++null_count; 751 continue; 752 } 753 if (cparent->array[i] == NULL) { 754 if (flags & HAMMER2_LOOKUP_NOLOCK) 755 hammer2_chain_drop(chain); 756 else 757 hammer2_chain_unlock(chain); 758 ++null_count; 759 continue; 760 } 761 chain = hammer2_chain_next(&cparent->array[i], chain, 762 &key_next, key_beg, key_end, 763 &cparent->cache_index[i], flags); 764 if (cparent->focus == NULL) 765 cparent->focus = cparent->array[i]; 766 cluster->array[i] = chain; 767 if (chain == NULL) { 768 ++null_count; 769 } else if (cluster->focus == NULL) { 770 cluster->focus = chain; 771 } 772 if (key_accum > key_next) 773 key_accum = key_next; 774 } 775 776 if (null_count == i) { 777 hammer2_cluster_drop(cluster); 778 cluster = NULL; 779 } 780 return(cluster); 781 } 782 783 #if 0 784 /* 785 * XXX initial NULL cluster needs reworking (pass 
**clusterp ?) 786 * 787 * The raw scan function is similar to lookup/next but does not seek to a key. 788 * Blockrefs are iterated via first_chain = (parent, NULL) and 789 * next_chain = (parent, chain). 790 * 791 * The passed-in parent must be locked and its data resolved. The returned 792 * chain will be locked. Pass chain == NULL to acquire the first sub-chain 793 * under parent and then iterate with the passed-in chain (which this 794 * function will unlock). 795 */ 796 hammer2_cluster_t * 797 hammer2_cluster_scan(hammer2_cluster_t *cparent, hammer2_cluster_t *cluster, 798 int flags) 799 { 800 hammer2_chain_t *chain; 801 int null_count; 802 int i; 803 804 null_count = 0; 805 806 for (i = 0; i < cparent->nchains; ++i) { 807 chain = cluster->array[i]; 808 if (chain == NULL) { 809 ++null_count; 810 continue; 811 } 812 if (cparent->array[i] == NULL) { 813 if (flags & HAMMER2_LOOKUP_NOLOCK) 814 hammer2_chain_drop(chain); 815 else 816 hammer2_chain_unlock(chain); 817 ++null_count; 818 continue; 819 } 820 821 chain = hammer2_chain_scan(cparent->array[i], chain, 822 &cparent->cache_index[i], flags); 823 cluster->array[i] = chain; 824 if (chain == NULL) 825 ++null_count; 826 } 827 828 if (null_count == i) { 829 hammer2_cluster_drop(cluster); 830 cluster = NULL; 831 } 832 return(cluster); 833 } 834 835 #endif 836 837 /* 838 * Create a new cluster using the specified key 839 */ 840 int 841 hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent, 842 hammer2_cluster_t **clusterp, 843 hammer2_key_t key, int keybits, 844 int type, size_t bytes, int flags) 845 { 846 hammer2_cluster_t *cluster; 847 hammer2_pfsmount_t *pmp; 848 int error; 849 int i; 850 851 pmp = trans->pmp; /* can be NULL */ 852 853 if ((cluster = *clusterp) == NULL) { 854 cluster = kmalloc(sizeof(*cluster), M_HAMMER2, 855 M_WAITOK | M_ZERO); 856 cluster->pmp = pmp; /* can be NULL */ 857 cluster->refs = 1; 858 } 859 cluster->focus = NULL; 860 cparent->focus = NULL; 861 862 /* 863 * NOTE: 
cluster->array[] entries can initially be NULL. If 864 * *clusterp is supplied, skip NULL entries, otherwise 865 * create new chains. 866 */ 867 for (i = 0; i < cparent->nchains; ++i) { 868 if (*clusterp && cluster->array[i] == NULL) { 869 if (cparent->focus == NULL) 870 cparent->focus = cparent->array[i]; 871 continue; 872 } 873 error = hammer2_chain_create(trans, &cparent->array[i], 874 &cluster->array[i], pmp, 875 key, keybits, 876 type, bytes, flags); 877 KKASSERT(error == 0); 878 if (cparent->focus == NULL) 879 cparent->focus = cparent->array[i]; 880 if (cluster->focus == NULL) 881 cluster->focus = cluster->array[i]; 882 } 883 cluster->nchains = i; 884 *clusterp = cluster; 885 886 return error; 887 } 888 889 /* 890 * Rename a cluster to a new parent. 891 * 892 * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields 893 * are used from a passed-in non-NULL bref pointer. All other fields 894 * are extracted from the original chain for each chain in the 895 * iteration. 
 */
void
hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
		       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
		       int flags)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t xbref;
	int i;

	cluster->focus = NULL;
	cparent->focus = NULL;

	for (i = 0; i < cluster->nchains; ++i) {
		chain = cluster->array[i];
		if (chain) {
			if (bref) {
				/*
				 * Merge only key/keybits from the caller's
				 * bref into a copy of the chain's own bref.
				 */
				xbref = chain->bref;
				xbref.key = bref->key;
				xbref.keybits = bref->keybits;
				hammer2_chain_rename(trans, &xbref,
						     &cparent->array[i],
						     chain, flags);
			} else {
				hammer2_chain_rename(trans, NULL,
						     &cparent->array[i],
						     chain, flags);
			}
			cluster->array[i] = chain;
			if (cluster->focus == NULL)
				cluster->focus = chain;
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		} else {
			if (cparent->focus == NULL)
				cparent->focus = cparent->array[i];
		}
	}
}

/*
 * Mark a cluster deleted
 */
void
hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
		       hammer2_cluster_t *cluster, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	int i;

	if (cparent == NULL) {
		kprintf("cparent is NULL\n");
		return;
	}

	for (i = 0; i < cluster->nchains; ++i) {
		parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
		chain = cluster->array[i];
		if (chain == NULL)
			continue;
		/*
		 * Sanity check: the chain's recorded parent must match the
		 * slot in cparent; complain and skip the delete otherwise.
		 */
		if (chain->parent != parent) {
			kprintf("hammer2_cluster_delete: parent "
				"mismatch chain=%p parent=%p against=%p\n",
				chain, chain->parent, parent);
		} else {
			hammer2_chain_delete(trans, parent, chain, flags);
		}
	}
}

/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
 */
int
hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_mount_t *hmp;
	hammer2_cluster_t *ncluster;
	const hammer2_inode_data_t *ipdata;
	hammer2_inode_data_t *wipdata;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
	uuid_t opfs_clid;
	int error;
	int i;

	kprintf("snapshot %s\n", pfs->name);

	name_len = strlen(pfs->name);
	/* NOTE(review): lhc is computed but not used below - confirm */
	lhc = hammer2_dirhash(pfs->name, name_len);

	ipdata = &hammer2_cluster_data(ocluster)->ipdata;
	opfs_clid = ipdata->pfs_clid;
	hmp = ocluster->focus->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	ncluster = NULL;
	nip = hammer2_inode_create(trans, hmp->spmp->iroot, &vat,
				   proc0.p_ucred, pfs->name, name_len,
				   &ncluster, &error);

	if (nip) {
		wipdata = hammer2_cluster_modify_ip(trans, nip, ncluster, 0);
		wipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
		kern_uuidgen(&wipdata->pfs_fsid, 1);
		if (ocluster->focus->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			wipdata->pfs_clid = opfs_clid;
		else
			kern_uuidgen(&wipdata->pfs_clid, 1);

		/* Mark every chain of the new cluster as a PFS root. */
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i]) {
				ncluster->array[i]->bref.flags |=
				    HAMMER2_BREF_FLAG_PFSROOT;
			}
		}
#if 0
		/* XXX can't set this unless we do an explicit flush, which
		   we also need a pmp assigned to do, else the flush code
		   won't flush ncluster because it thinks it is crossing a
		   flush boundary */
		hammer2_cluster_set_chainflags(ncluster,
					       HAMMER2_CHAIN_PFSBOUNDARY);
#endif

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		KKASSERT(ocluster->nchains == 1);
		wipdata->u.blockset = ocluster->focus->data->ipdata.u.blockset;
		hammer2_cluster_modsync(ncluster);
		for (i = 0; i < ncluster->nchains; ++i) {
			if (ncluster->array[i])
				hammer2_flush(trans, ncluster->array[i]);
		}
		hammer2_inode_unlock_ex(nip, ncluster);
	}
	return (error);
}

/*
 * Return locked parent cluster given a locked child.  The child remains
 * locked on return.
 */
hammer2_cluster_t *
hammer2_cluster_parent(hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *cparent;
	int i;

	cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
	for (i = 0; i < cluster->nchains; ++i) {
		hammer2_chain_t *chain;
		hammer2_chain_t *rchain;

		chain = cluster->array[i];
		if (chain == NULL)
			continue;
		hammer2_chain_ref(chain);
		/*
		 * Lock ordering: the child must be unlocked before locking
		 * the parent, then relocked.  If the chain's parent changed
		 * while the child was unlocked, drop the stale parent lock
		 * and retry against the new parent.
		 */
		while ((rchain = chain->parent) != NULL) {
			hammer2_chain_ref(rchain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
			hammer2_chain_drop(rchain);
			if (chain->parent == rchain)
				break;
			hammer2_chain_unlock(rchain);
		}
		hammer2_chain_drop(chain);
		cparent->array[i] = rchain;
	}
	return cparent;
}

/************************************************************************
 *			      NODE FAILURES				*
 ************************************************************************
 *
 * A node failure can occur for numerous reasons.
 *
 *	- A read I/O may fail
 *	- A write I/O may fail
 *	- An unexpected chain might be found (or be missing)
 *	- A node might disconnect temporarily and reconnect later
 *	  (for example, a USB stick could get pulled, or a node might
 *	  be programmatically disconnected).
 *	- A node might run out of space during a modifying operation.
 *
 * When a read failure or an unexpected chain state is found, the chain and
 * parent chain at the failure point for the nodes involved (the nodes
 * which we determine to be in error) are flagged as failed and removed
 * from the cluster.  The node itself is allowed to remain active.  The
 * highest common point (usually a parent chain) is queued to the
 * resynchronization thread for action.
1112 * 1113 * When a write I/O fails or a node runs out of space, we first adjust 1114 * as if a read failure occurs but we further disable flushes on the 1115 * ENTIRE node. Concurrent modifying transactions are allowed to complete 1116 * but any new modifying transactions will automatically remove the node 1117 * from consideration in all related cluster structures and not generate 1118 * any new modified chains. The ROOT chain for the failed node(s) is queued 1119 * to the resynchronization thread for action. 1120 * 1121 * A temporary disconnect is handled as if a write failure occurred. 1122 * 1123 * Any of these failures might or might not stall related high level VNOPS, 1124 * depending on what has failed, what nodes remain, the type of cluster, 1125 * and the operating state of the cluster. 1126 * 1127 * FLUSH ON WRITE-DISABLED NODES 1128 * 1129 * A flush on a write-disabled node is not allowed to write anything because 1130 * we cannot safely update the mirror_tid anywhere on the failed node. The 1131 * synchronization thread uses mirror_tid to calculate incremental resyncs. 1132 * Dirty meta-data related to the failed node is thrown away. 1133 * 1134 * Dirty buffer cache buffers and inodes are only thrown away if they can be 1135 * retired... that is, if the filesystem still has enough nodes to complete 1136 * the operation. 1137 */ 1138 1139 /************************************************************************ 1140 * SYNCHRONIZATION THREAD * 1141 ************************************************************************ 1142 * 1143 * This thread is responsible for [re]synchronizing the cluster representing 1144 * a PFS. Any out-of-sync or failed node starts this thread on a 1145 * node-by-node basis when the failure is detected. 1146 * 1147 * Clusters needing resynchronization are queued at the highest point 1148 * where the parent on the failed node is still valid, or a special 1149 * incremental scan from the ROOT is queued if no parent exists. 
This
 * thread is also responsible for waiting for reconnections of the failed
 * node if the cause was due to a disconnect, and waiting for space to be
 * freed up if the cause was due to running out of space.
 *
 * If the cause is due to a node running out of space, this thread will also
 * remove older (unlocked) snapshots to make new space, recover space, and
 * then start resynchronization.
 *
 * Each resynchronization pass virtually snapshots the PFS on the good nodes
 * and synchronizes using that snapshot against the target node.  This
 * ensures a consistent chain topology and also avoids interference between
 * the resynchronization thread and frontend operations.
 *
 * Since these are per-node threads it is possible to resynchronize several
 * nodes at once.
 */