/*
 * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_mount_t *hmp;
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int             depth;
        int             diddeferral;
        struct flush_deferral_list flush_list;
        hammer2_tid_t   sync_tid;       /* flush synchronization point */
        hammer2_tid_t   mirror_tid;     /* collect mirror TID updates */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  We
 * don't bother marking the volume header MODIFIED.  Instead, the volume
 * will be synchronized at a later time as part of a larger flush sequence.
 *
 * Non-flush transactions can typically run concurrently.  However, if
 * there are non-flush transactions both before AND after a flush trans,
 * the transactions after stall until the ones before finish.
 *
 * Non-flush transactions occurring after a flush trans can run concurrently
 * with that flush.  They only have to wait for transactions prior to the
 * flush trans to complete before they unstall.
 *
 * WARNING! Modifications to the root volume cannot dup the root volume
 *          header to handle synchronization points, so alloc_tid can
 *          wind up (harmlessly) more advanced on flush.
 *
 * WARNING! Operations which might call inode_duplicate()/chain_duplicate()
 *          depend heavily on having a unique sync_tid to avoid duplication
 *          collisions (which key off of delete_tid).
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                   hammer2_inode_t *ip, int flags)
{
        hammer2_trans_t *scan;

        bzero(trans, sizeof(*trans));
        trans->hmp = hmp;

        hammer2_voldata_lock(hmp);
        trans->sync_tid = hmp->voldata.alloc_tid++;
        trans->flags = flags;
        trans->td = curthread;
        trans->tmp_ip = ip;
        trans->tmp_bpref = 0;
        TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If we are a flush we have to wait for all transactions
                 * prior to our flush synchronization point to complete
                 * before we can start our flush.
                 */
                ++hmp->flushcnt;
                if (hmp->curflush == NULL) {
                        hmp->curflush = trans;
                        hmp->topo_flush_tid = trans->sync_tid;
                }
                while (TAILQ_FIRST(&hmp->transq) != trans) {
                        lksleep(&trans->sync_tid, &hmp->voldatalk,
                                0, "h2syncw", hz);
                }

                /*
                 * Once we become the running flush we can wakeup anyone
                 * who blocked on us.
                 */
                scan = trans;
                while ((scan = TAILQ_NEXT(scan, entry)) != NULL) {
                        if (scan->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                        if (scan->blocked == 0)
                                break;
                        scan->blocked = 0;
                        wakeup(&scan->blocked);
                }
        } else {
                /*
                 * If we are not a flush but our sync_tid is after a
                 * stalled flush, we have to wait until that flush unstalls
                 * (that is, all transactions prior to that flush complete),
                 * but then we can run concurrently with that flush.
                 *
                 * (The flushcnt check is only good as a pre-condition;
                 *  otherwise it may represent elements queued after us
                 *  after we block.)
                 */
                if (hmp->flushcnt > 1 ||
                    (hmp->curflush &&
                     TAILQ_FIRST(&hmp->transq) != hmp->curflush)) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->blocked, &hmp->voldatalk,
                                        0, "h2trans", hz);
                        }
                }
        }
        hammer2_voldata_unlock(hmp, 0);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_mount_t *hmp = trans->hmp;
        hammer2_trans_t *scan;

        hammer2_voldata_lock(hmp);
        TAILQ_REMOVE(&hmp->transq, trans, entry);
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If we were a flush we have to adjust curflush to the
                 * next flush.
                 *
                 * flush_tid is used to partition copy-on-write operations
                 * (mostly duplicate-on-modify ops), which is what allows
                 * us to execute a flush concurrent with modifying operations
                 * with higher TIDs.
                 */
                --hmp->flushcnt;
                if (hmp->flushcnt) {
                        TAILQ_FOREACH(scan, &hmp->transq, entry) {
                                if (scan->flags & HAMMER2_TRANS_ISFLUSH)
                                        break;
                        }
                        KKASSERT(scan);
                        hmp->curflush = scan;
                        hmp->topo_flush_tid = scan->sync_tid;
                } else {
                        /*
                         * Theoretically we don't have to clear flush_tid
                         * here since the flush will have synchronized
                         * all operations <= flush_tid already.  But for
                         * now zero it.
                         */
                        hmp->curflush = NULL;
                        hmp->topo_flush_tid = 0;
                }
        } else {
                /*
                 * If we are not a flush but a flush is now at the head
                 * of the queue and we were previously blocking it,
                 * we can now unblock it.
                 */
                if (hmp->flushcnt &&
                    (scan = TAILQ_FIRST(&hmp->transq)) != NULL &&
                    trans->sync_tid < scan->sync_tid &&
                    (scan->flags & HAMMER2_TRANS_ISFLUSH)) {
                        wakeup(&scan->sync_tid);
                }
        }
        hammer2_voldata_unlock(hmp, 0);

        trans->hmp = NULL;
}
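
/*
 * Illustrative sketch (not compiled): the transaction bracket a writer
 * is expected to place around modifying operations, per the ordering
 * rules described above.  A front-end operation runs as a non-flush
 * transaction (flags == 0); the syncer would pass HAMMER2_TRANS_ISFLUSH.
 * The function name is a hypothetical stand-in for a real caller.
 */
#if 0
static void
example_modifying_op(hammer2_mount_t *hmp, hammer2_inode_t *ip)
{
        hammer2_trans_t trans;

        /* allocates trans.sync_tid and may stall per the rules above */
        hammer2_trans_init(&trans, hmp, ip, 0);

        /* ... perform chain modifications keyed to trans.sync_tid ... */

        /* dequeues the transaction and unblocks any waiting flush */
        hammer2_trans_done(&trans);
}
#endif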

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point (sync_tid), propagating parent chain modifications
 * and mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_tid values are less than or equal
 * to the passed sync_tid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.  However, it is not actually
 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
 * capability.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's MOVED bit will be set indicating that its parent
 * (which is not part of the flush) should be updated.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        hammer2_chain_t *scan;
        hammer2_chain_core_t *core;
        hammer2_flush_info_t info;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flush_list);
        info.hmp = trans->hmp;
        info.trans = trans;
        info.sync_tid = trans->sync_tid;
        info.mirror_tid = 0;

        core = chain->core;

        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave MOVED set for these chains, which will be
                 * handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flush_list, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_chain_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * Flush pass1 on root.
                 */
                info.diddeferral = 0;
                hammer2_chain_flush_core(&info, chain);
#if FLUSH_DEBUG
                kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
                        chain, chain->bref.type, chain->flags);
#endif

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flush_list))
                        break;
        }
}
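
/*
 * Illustrative sketch (not compiled): the depth-limited recursion with a
 * deferral list used above, reduced to its skeleton.  All names here are
 * hypothetical; the real code defers on HAMMER2_CHAIN_DEFERRED and replays
 * from info.flush_list as shown in hammer2_chain_flush().
 */
#if 0
#define EX_DEPTH_LIMIT  10

struct ex_node {
        TAILQ_ENTRY(ex_node) entry;
        struct ex_node *child;          /* simplified: single child */
        int dirty;
};
TAILQ_HEAD(ex_list, ex_node);

static void
ex_flush_core(struct ex_node *node, struct ex_list *defer, int depth)
{
        if (depth == EX_DEPTH_LIMIT) {
                /* too deep: park the subtree for a later top-level pass */
                TAILQ_INSERT_TAIL(defer, node, entry);
                return;
        }
        if (node->child)
                ex_flush_core(node->child, defer, depth + 1);
        node->dirty = 0;
}

static void
ex_flush(struct ex_node *root)
{
        struct ex_list defer;
        struct ex_node *scan;

        TAILQ_INIT(&defer);
        for (;;) {
                /* replay deferred subtrees with a fresh stack first */
                while ((scan = TAILQ_FIRST(&defer)) != NULL) {
                        TAILQ_REMOVE(&defer, scan, entry);
                        ex_flush(scan);
                }
                ex_flush_core(root, &defer, 0);
                if (TAILQ_EMPTY(&defer))
                        break;
        }
}
#endif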

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and remains locked on return.  This function is keyed off of
 * the SUBMODIFIED bit but must make fine-grained choices based on the
 * synchronization point we are flushing to.
 *
 * If the flush accomplished any work chain will be flagged MOVED
 * indicating a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *       only when a chain is specifically modified, and not updated
 *       for copy-on-write propagations.  MODIFIED is set on any
 *       modification including copy-on-write propagations.
 */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
        hammer2_tid_t saved_sync;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_core_t *core;
        size_t bbytes;
        size_t boff;
        char *bdata;
        struct buf *bp;
        int error;
        int wasmodified;
        int diddeferral = 0;

        hmp = info->hmp;

#if FLUSH_DEBUG
        if (info->parent)
                kprintf("flush_core %p->%p.%d %08x (%s)\n",
                        info->parent, chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
        else
                kprintf("flush_core NULL->%p.%d %08x (%s)\n",
                        chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
#endif
        /*
         * Ignore chains modified beyond the current flush point.  These
         * will be treated as if they did not exist.
         */
        if (chain->modify_tid > info->sync_tid)
                return;

        /*
         * Deleted chains which have not been destroyed must be retained,
         * and we probably have to recurse to clean-up any sub-trees.
         * However, restricted flushes can stop processing here because
         * the chain cleanup will be handled by a later normal flush.
         *
         * The MODIFIED bit can likely be cleared in this situation and we
         * will do so later on in this procedure.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (trans->flags & HAMMER2_TRANS_RESTRICTED)
                        return;
        }

        saved_sync = info->sync_tid;
        core = chain->core;

        /*
         * If SUBMODIFIED is set we recurse the flush and adjust the
         * blockrefs accordingly.
         *
         * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
         *       finishing in the face of filesystem activity.
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                hammer2_chain_t *saved_parent;
                hammer2_tid_t saved_mirror;

                /*
                 * Clear SUBMODIFIED to catch races.  Note that any child
                 * with MODIFIED, DELETED, or MOVED set during Scan2, after
                 * it processes the child, will cause SUBMODIFIED to be
                 * re-set.  If a deferred child still has to be flushed,
                 * SUBMODIFIED will wind up being set again (for next time),
                 * but this does not stop us from synchronizing block
                 * updates which occurred.
                 *
                 * We don't want to set our chain to MODIFIED gratuitously.
                 *
                 * We need an extra ref on chain because we are going to
                 * release its lock temporarily in our child loop.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
                hammer2_chain_ref(chain);

                /*
                 * Run two passes.  The first pass handles MODIFIED and
                 * SUBMODIFIED chains and recurses while the second pass
                 * handles MOVED chains on the way back up.
                 *
                 * If the stack gets too deep we defer scan1, but must
                 * be sure to still run scan2 if on the next loop the
                 * deferred chain has been flushed and now needs MOVED
                 * handling on the way back up.
                 *
                 * Scan1 is recursive.
                 *
                 * NOTE: The act of handling a modified/submodified chain can
                 *       cause the MOVED flag to be set.  It can also be set
                 *       via hammer2_chain_delete() and in other situations.
                 *
                 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
                 *       because children can be physically removed during
                 *       the scan.
                 */
                saved_parent = info->parent;
                saved_mirror = info->mirror_tid;
                info->parent = chain;
                info->mirror_tid = chain->bref.mirror_tid;

                if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                        if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
                                hammer2_chain_ref(chain);
                                TAILQ_INSERT_TAIL(&info->flush_list,
                                                  chain, flush_node);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_DEFERRED);
                        }
                        diddeferral = 1;
                } else {
                        info->diddeferral = 0;
                        spin_lock(&core->cst.spin);
                        RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
                                NULL, hammer2_chain_flush_scan1, info);
                        spin_unlock(&core->cst.spin);
                        diddeferral += info->diddeferral;
                }

                /*
                 * Handle successfully flushed children which are in the
                 * MOVED state on the way back up the recursion.  This can
                 * have the side-effect of clearing MOVED.
                 *
                 * We execute this even if there were deferrals to try to
                 * keep the chain topology cleaner.
                 *
                 * Scan2 is non-recursive.
                 */
                if (diddeferral) {
                        atomic_set_int(&chain->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                } else {
#if FLUSH_DEBUG
                        kprintf("scan2_start parent %p %08x\n",
                                chain, chain->flags);
#endif
                        spin_lock(&core->cst.spin);
                        RB_SCAN(hammer2_chain_tree, &core->rbtree,
                                NULL, hammer2_chain_flush_scan2, info);
                        spin_unlock(&core->cst.spin);
#if FLUSH_DEBUG
                        kprintf("scan2_stop  parent %p %08x\n",
                                chain, chain->flags);
#endif
                }
                chain->bref.mirror_tid = info->mirror_tid;
                info->mirror_tid = saved_mirror;
                info->parent = saved_parent;
                hammer2_chain_drop(chain);
        }

        /*
         * Restore sync_tid in case it was restricted by a delete/duplicate.
         */
        info->sync_tid = saved_sync;

        /*
         * Rollup diddeferral for caller.  Note direct assignment, not +=.
         */
        info->diddeferral = diddeferral;

        /*
         * Do not flush chain if there were any deferrals.  It will be
         * retried later after the deferrals are independently handled.
         */
        if (diddeferral) {
                if (hammer2_debug & 0x0008) {
                        kprintf("%*.*s} %p/%d %04x (deferred)",
                                info->depth, info->depth, "",
                                chain, chain->refs, chain->flags);
                }
                return;
        }

        /*
         * If we encounter a deleted chain within our flush we can clear
         * the MODIFIED bit and avoid flushing it whether it has been
         * destroyed or not.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#if 0
        if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
            (chain->flags & HAMMER2_CHAIN_DELETED) &&
            (trans->flags & HAMMER2_TRANS_RESTRICTED) == 0) {
                /*
                 * Throw-away the MODIFIED flag
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#endif

        /*
         * A degenerate flush might not have flushed anything and thus not
         * processed modified blocks on the way back up.  Detect the case.
         *
         * Note that MOVED can be set without MODIFIED being set due to
         * a deletion, in which case it is handled by Scan2 later on.
         *
         * Both bits can be set along with DELETED if the chain modified
         * data within the synchronization zone and was then deleted
         * beyond the zone, in which case we still have to flush for
         * synchronization point consistency.  Otherwise DELETED and
         * MODIFIED are treated as separate flags.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                return;

        /*
         * Issue flush.
         *
         * A DESTROYED node that reaches this point must be flushed for
         * synchronization point consistency.
         */

        /*
         * Update mirror_tid, clear MODIFIED, and set MOVED.
         *
         * The caller will update the parent's reference to this chain
         * by testing MOVED as long as the modification was in-bounds.
         *
         * MOVED is never set on the volume root as there is no parent
         * to adjust.
         */
        if (chain->bref.mirror_tid < info->sync_tid)
                chain->bref.mirror_tid = info->sync_tid;
        wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
        if (chain == &hmp->vchain)
                kprintf("(FLUSHED VOLUME HEADER)\n");
        if (chain == &hmp->fchain)
                kprintf("(FLUSHED FREEMAP HEADER)\n");

        if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
            chain == &hmp->vchain ||
            chain == &hmp->fchain) {
                /*
                 * Drop the ref from the MODIFIED bit we cleared.
                 */
                if (wasmodified)
                        hammer2_chain_drop(chain);
        } else {
                /*
                 * If we were MODIFIED we inherit the ref from clearing
                 * that bit, otherwise we need another ref.
                 */
                if (wasmodified == 0)
                        hammer2_chain_ref(chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }

        /*
         * If this is part of a recursive flush we can go ahead and write
         * out the buffer cache buffer and pass a new bref back up the chain
         * via the MOVED bit.
         *
         * Volume headers are NOT flushed here as they require special
         * processing.
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_FREEMAP:
                hammer2_modify_volume(hmp);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                /*
                 * We should flush the free block table before we calculate
                 * CRCs and copy voldata -> volsync.
                 */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                         HAMMER2_CHAIN_SUBMODIFIED)) {
                        /* this will modify vchain as a side effect */
                        hammer2_chain_flush(info->trans, &hmp->fchain);
                }
                hammer2_chain_unlock(&hmp->fchain);

                /*
                 * The volume header is flushed manually by the syncer, not
                 * here.  All we do is adjust the CRCs.
                 */
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                kprintf("volume header mirror_tid %jd\n",
                        hmp->voldata.mirror_tid);

                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC1_OFF,
                                HAMMER2_VOLUME_ICRC1_SIZE);
                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC0_OFF,
                                HAMMER2_VOLUME_ICRC0_SIZE);
                hmp->voldata.icrc_volheader =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRCVH_OFF,
                                HAMMER2_VOLUME_ICRCVH_SIZE);
                hmp->volsync = hmp->voldata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
                 * Data elements have already been flushed via the logical
                 * file buffer cache.  Their hash was set in the bref by
                 * the vop_write code.
                 *
                 * Make sure any device buffer(s) have been flushed out here
                 * (there aren't usually any to flush).
                 */
                bbytes = chain->bytes;
                pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
                boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

                bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
                if (bp) {
                        if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
                            (B_CACHE | B_DIRTY)) {
                                cluster_awrite(bp);
                        } else {
                                bp->b_flags |= B_RELBUF;
                                brelse(bp);
                        }
                }
                break;
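
        /*
         * Worked example of the alignment arithmetic above (illustrative
         * numbers only): with a 64KB device buffer (bbytes == 0x10000)
         * and bref.data_off == 0x12345000,
         *
         *      pbase = 0x12345000 & ~(hammer2_off_t)0xffff = 0x12340000
         *      boff  = 0x12345000 & HAMMER2_OFF_MASK & 0xffff = 0x5000
         *
         * i.e. data_off decomposes into the device buffer base address
         * plus the element's byte offset within that buffer, after
         * HAMMER2_OFF_MASK strips the low encoding bits of data_off.
         */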
#if 0
        case HAMMER2_BREF_TYPE_INDIRECT:
                /*
                 * Indirect blocks may be in an INITIAL state.  Use the
                 * chain_lock() call to ensure that the buffer has been
                 * instantiated (even though it is already locked the buffer
                 * might not have been instantiated).
                 *
                 * Only write the buffer out if it is dirty; it is possible
                 * the operating system had already written out the buffer.
                 */
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                KKASSERT(chain->bp != NULL);

                bp = chain->bp;
                if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
                    (bp->b_flags & B_DIRTY)) {
                        bdwrite(chain->bp);
                } else {
                        brelse(chain->bp);
                }
                chain->bp = NULL;
                chain->data = NULL;
                hammer2_chain_unlock(chain);
                break;
#endif
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                /*
                 * Device-backed.  Buffer will be flushed by the sync
                 * code XXX.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        default:
                /*
                 * Embedded elements have to be flushed out.
                 * (Basically just BREF_TYPE_INODE).
                 */
                KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                bref = &chain->bref;

                KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
                KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_ISCSI32 ||
                         HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_FREEMAP);

                /*
                 * The data is embedded, we have to acquire the
                 * buffer cache buffer and copy the data into it.
                 */
                if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
                        bbytes = HAMMER2_MINIOSIZE;
                pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
                boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);

                /*
                 * The getblk() optimization can only be used if the
                 * physical block size matches the request.
                 */
                if (chain->bytes == bbytes) {
                        bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
                        error = 0;
                } else {
                        error = bread(hmp->devvp, pbase, bbytes, &bp);
                        KKASSERT(error == 0);
                }
                bdata = (char *)bp->b_data + boff;

                /*
                 * Copy the data to the buffer, mark the buffer
                 * dirty, and convert the chain to unmodified.
                 */
                bcopy(chain->data, bdata, chain->bytes);
                bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
                bp = NULL;

                switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
                case HAMMER2_CHECK_FREEMAP:
                        chain->bref.check.freemap.icrc32 =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                case HAMMER2_CHECK_ISCSI32:
                        chain->bref.check.iscsi32.value =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                default:
                        panic("hammer2_flush_core: bad crc type");
                        break;  /* NOT REACHED */
                }
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
                        ++hammer2_iod_meta_write;
                else
                        ++hammer2_iod_indr_write;
        }
}
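
/*
 * Illustrative sketch (not compiled): the read-modify-write pattern used
 * by the embedded-element case above, factored into a helper.  The helper
 * name is hypothetical; the real code performs this inline.  getblk() can
 * be used directly only when the element exactly fills the device buffer,
 * since the contents are about to be completely overwritten; a partial
 * overwrite must bread() the buffer first to preserve the surrounding
 * data.
 */
#if 0
static int
example_write_embedded(hammer2_mount_t *hmp, hammer2_off_t data_off,
                       void *elm, size_t elm_bytes)
{
        hammer2_off_t pbase;
        size_t bbytes;
        size_t boff;
        struct buf *bp;
        int error;

        if ((bbytes = elm_bytes) < HAMMER2_MINIOSIZE)
                bbytes = HAMMER2_MINIOSIZE;
        pbase = data_off & ~(hammer2_off_t)(bbytes - 1);
        boff = data_off & HAMMER2_OFF_MASK & (bbytes - 1);

        if (elm_bytes == bbytes) {
                /* whole buffer overwritten, no need to read it */
                bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
                error = 0;
        } else {
                /* partial overwrite, must preserve the rest of the buffer */
                error = bread(hmp->devvp, pbase, bbytes, &bp);
                if (error)
                        return (error);
        }
        bcopy(elm, (char *)bp->b_data + boff, elm_bytes);
        bp->b_flags |= B_CLUSTEROK;
        bdwrite(bp);            /* delayed write, buffer released */
        return (0);
}
#endif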

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_t *parent = info->parent;
        /*hammer2_mount_t *hmp = info->hmp;*/
        int diddeferral;

        /*
         * We should only need to recurse if SUBMODIFIED is set, but as
         * a safety we also recurse if MODIFIED is set.
         *
         * Return early if neither bit is set.  We must re-assert the
         * SUBMODIFIED flag in the parent if any child covered by the
         * parent (via delete_tid) is skipped.
         */
        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        hammer2_chain_ref(child);
        spin_unlock(&parent->core->cst.spin);

        /*
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.  Re-check the flags before
         * continuing.
         */
        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        /*
         * The DESTROYED flag can only be initially set on an unreferenced
         * deleted inode and will propagate downward via the mechanic below.
         * Such inode chains have been deleted for good and should no longer
         * be subject to delete/duplication.
         *
         * This optimization allows the inode reclaim (destroy unlinked file
         * on vnode reclamation after last close) to be flagged by just
         * setting HAMMER2_CHAIN_DESTROYED at the top level, which will then
         * cause the chains to be terminated and related buffers to be
         * invalidated and not flushed out.
         *
         * We have to be careful not to propagate the DESTROYED flag if
         * the destruction occurred after our flush sync_tid.
         */
        if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
            (child->flags & HAMMER2_CHAIN_DELETED) &&
            (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
                atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }

        /*
         * Recurse and collect deferral data.
         */
        diddeferral = info->diddeferral;
        ++info->depth;
        hammer2_chain_flush_core(info, child);
#if FLUSH_DEBUG
        kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
                parent, parent->flags, child, child->bref.type, child->flags);
#endif
        --info->depth;
        info->diddeferral += diddeferral;

        if (child->flags & HAMMER2_CHAIN_SUBMODIFIED)
                atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);

        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);

        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);

        spin_lock(&parent->core->cst.spin);
        return (0);
}
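
/*
 * Illustrative sketch (not compiled): the lock-juggling protocol scan1
 * uses to safely visit a child found under the parent's spinlock.  The
 * child is referenced before any locks are dropped so it cannot be freed,
 * the parent's full lock is released to avoid lock-order reversals, and
 * the flags/TIDs that justified the visit are re-checked after relocking
 * since they may have changed while unlocked.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_visit_child(hammer2_chain_t *parent, hammer2_chain_t *child)
{
        /* entered with parent locked and parent->core->cst.spin held */
        hammer2_chain_ref(child);               /* pin the child */
        spin_unlock(&parent->core->cst.spin);
        hammer2_chain_unlock(parent);           /* avoid lock-order issues */

        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
        /* ... re-check child->flags / TIDs, then do the work ... */
        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);

        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
        spin_lock(&parent->core->cst.spin);     /* restore entry state */
        return (0);
}
#endif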

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * NOTE! We must re-set SUBMODIFIED on the parent(s) as appropriate, and
 *       due to the above conditions it is possible to do this and still
 *       have some children flagged MOVED depending on the synchronization.
 *
 * NOTE! A deletion is a visibility issue; there can still be references to
 *       deleted elements (for example, to an unlinked file which is still
 *       open), and there can also be multiple chains pointing to the same
 *       bref where some are deleted and some are not (for example due to
 *       a rename).  So a chain marked for deletion is basically considered
 *       to be live until it is explicitly destroyed or until its ref-count
 *       reaches zero (also implying that MOVED and MODIFIED are clear).
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_chain_t *parent = info->parent;
        hammer2_chain_core_t *above = child->above;
        hammer2_mount_t *hmp = info->hmp;
        hammer2_trans_t *trans = info->trans;
        hammer2_blockref_t *base;
        int count;

        /*
         * Inodes with stale children that have been converted to DIRECTDATA
         * mode (file extension or hardlink conversion typically) need to
         * be skipped right now, before we start messing with a nonexistent
         * block table.
         */
#if 0
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
            (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
#if FLUSH_DEBUG
                kprintf("B");
#endif
                goto finalize;
        }
#endif

        /*
         * Ignore children created after our flush point, treating them as
         * if they did not exist.  These children will not cause the parent
         * to be updated.
         *
         * When we encounter such children and the parent chain has not been
         * deleted, delete/duplicated, or delete/duplicated-for-move, then
         * the parent may be used to funnel through several flush points.
         * We must re-set the SUBMODIFIED flag in the parent to ensure that
         * those flushes have visibility.  A simple test of delete_tid
         * suffices to determine if the parent spans beyond our current
         * flush.
         */
        if (child->modify_tid > trans->sync_tid) {
#if FLUSH_DEBUG
                kprintf("E");
#endif
                goto finalize;
        }

        /*
         * Ignore children which have not changed.  The parent's block table
         * is already correct.
         */
        if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
#if FLUSH_DEBUG
                kprintf("D");
#endif
                goto finalize;
        }

        hammer2_chain_ref(child);
        spin_unlock(&above->cst.spin);

        /*
         * The MOVED bit implies an additional reference which prevents
         * the child from being destroyed out from under our operation,
         * so we can lock the child safely without worrying about it
         * getting ripped up (?).
         *
         * We can only update parents where child->parent matches.  The
         * child->parent link will migrate along the chain but the flush
         * order must be enforced absolutely.  Parents reflushed after the
         * child has passed them by should skip due to the modify_tid test.
         */
        hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);

        /*
         * The parent's blockref to the child must be deleted or updated.
         *
         * This point is not reached on successful DESTROYED optimizations
         * but can be reached on recursive deletions and restricted flushes.
         *
         * Because flushes are ordered we do not have to make a
         * modify/duplicate of indirect blocks.  That is, the flush
         * code does not have to kmalloc or duplicate anything.  We
         * can adjust the indirect block table in-place and reuse the
         * chain.  It IS possible that the chain has already been duplicated
         * or may wind up being duplicated on-the-fly by modifying code
         * on the frontend.  We simply use the original and ignore such
         * chains.  However, it does mean we can't clear the MOVED bit.
         *
         * XXX recursive deletions not optimized.
         */
        hammer2_chain_modify(trans, &parent,
                             HAMMER2_MODIFY_NO_MODIFY_TID |
                             HAMMER2_MODIFY_ASSERTNOCOPY);

        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
                /*
                 * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we
                 * properly duplicate the inode headers and do proper flush
                 * range checks (all the children should be beyond the flush
                 * point).  For now just don't sync the non-applicable
                 * children.
                 *
                 * XXX Can also occur due to hardlink consolidation.  We
                 * set OPFLAG_DIRECTDATA to prevent the indirect and data
                 * blocks from syncing to the hardlink pointer.
                 */
#if 0
                KKASSERT((parent->data->ipdata.op_flags &
                          HAMMER2_OPFLAG_DIRECTDATA) == 0);
#endif
#if 0
                if (parent->data->ipdata.op_flags &
                    HAMMER2_OPFLAG_DIRECTDATA) {
                        base = NULL;
                } else
#endif
                {
                        base = &parent->data->ipdata.u.blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                if (parent->data) {
                        base = &parent->data->npdata.blockref[0];
                } else {
                        base = NULL;
                        KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP:
                base = &parent->data->npdata.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
                base = NULL;
                count = 0;
                panic("hammer2_chain_flush_scan2: "
                      "unrecognized blockref type: %d",
                      parent->bref.type);
        }

        /*
         * Update the parent's blockref table and propagate mirror_tid.
         *
         * NOTE! Children with modify_tid's beyond our flush point are
         *       considered to not exist for the purposes of updating the
         *       parent's blockref array.
         *
         * NOTE! Updates to a parent's blockref table do not adjust the
         *       parent's bref.modify_tid, only its bref.mirror_tid.
         */
        KKASSERT(child->index >= 0);
        if (child->delete_tid <= trans->sync_tid) {
                if (base) {
                        KKASSERT(child->index < count);
                        bzero(&base[child->index], sizeof(child->bref));
                        if (info->mirror_tid < child->delete_tid)
                                info->mirror_tid = child->delete_tid;
                }
        } else {
                if (base) {
                        KKASSERT(child->index < count);
                        base[child->index] = child->bref;
                        if (info->mirror_tid < child->modify_tid)
                                info->mirror_tid = child->modify_tid;
                }
        }

        if (info->mirror_tid < child->bref.mirror_tid) {
                info->mirror_tid = child->bref.mirror_tid;
        }
        if ((parent->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
             parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP) &&
            hmp->voldata.mirror_tid < child->bref.mirror_tid) {
                hmp->voldata.mirror_tid = child->bref.mirror_tid;
        }
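
        /*
         * Worked example of the propagation above (illustrative TIDs
         * only): flushing with sync_tid 60, a live child with
         * modify_tid 52 whose own flush left bref.mirror_tid at 58
         * yields
         *
         *      info->mirror_tid = max(info->mirror_tid, 52, 58)
         *
         * so the parent's bref.mirror_tid ratchets monotonically upward
         * over everything synchronized beneath it, while its
         * bref.modify_tid is left untouched.
         */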

        /*
         * When can we safely clear the MOVED flag?  Flushes down duplicate
         * paths can occur out of order, for example if an inode is moved
         * as part of a hardlink consolidation or if an inode is moved into
         * an indirect block indexed before the inode.
         *
         * Only clear MOVED once all possible parents have been flushed.
         */
        if (child->flags & HAMMER2_CHAIN_MOVED) {
                hammer2_chain_t *scan;
                int ok = 1;

                spin_lock(&above->cst.spin);
                for (scan = above->first_parent;
                     scan;
                     scan = scan->next_parent) {
                        /*
                         * XXX weird code also checked at the top of scan2,
                         * I would like to fix this by detaching the core
                         * on initial hardlink consolidation (1->2 nlinks).
                         */
#if 0
                        if (scan->bref.type == HAMMER2_BREF_TYPE_INODE &&
                            (scan->data->ipdata.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA)) {
                                continue;
                        }
#endif
                        if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                                ok = 0;
                                break;
                        }
                }
                spin_unlock(&above->cst.spin);
                if (ok) {
                        atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
                        hammer2_chain_drop(child);      /* flag */
                }
        }

        /*
         * Unlock the child.  This can wind up dropping the child's
         * last ref, removing it from the parent's RB tree, and deallocating
         * the structure.  The RB_SCAN() our caller is doing handles the
         * situation.
         */
        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);
        spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
        kprintf("F");
#endif

        /*
         * The parent cleared SUBMODIFIED prior to the scan.  If the child
         * still requires a flush (possibly due to being outside the current
         * synchronization zone), we must re-set SUBMODIFIED on the way back
         * up.
         */
finalize:
#if FLUSH_DEBUG
        kprintf("G child %p %08x\n", child, child->flags);
#endif
        return (0);
}