/*
 * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int             depth;
        int             diddeferral;
        struct flush_deferral_list flush_list;
        hammer2_tid_t   sync_tid;       /* flush synchronization point */
        hammer2_tid_t   mirror_tid;     /* collect mirror TID updates */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
                                hammer2_chain_t *child, int how);

#if 0
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
                    int how)
{
        hammer2_key_t bytes;

        if (bref->type != 0) {
                bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
                if (bref->type == HAMMER2_BREF_TYPE_INODE)
                        info->inode_count += how;
                if (how < 0)
                        info->data_count -= bytes;
                else
                        info->data_count += bytes;
        }
}
#endif
/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  We
 * don't bother marking the volume header MODIFIED.  Instead, the volume
 * header will be synchronized at a later time as part of a larger flush
 * sequence.
 *
 * Non-flush transactions can typically run concurrently.  However, if
 * there are non-flush transactions both before AND after a flush trans,
 * the transactions after the flush stall until the ones before it finish.
 *
 * Non-flush transactions occurring after the flush point can run
 * concurrently with that flush.  They only have to wait for transactions
 * prior to the flush trans to complete before they unstall.
 *
 * WARNING! Modifications to the root volume cannot dup the root volume
 *          header to handle synchronization points, so alloc_tid can
 *          wind up (harmlessly) more advanced on flush.
 *
 * WARNING! Operations which might call inode_duplicate()/chain_duplicate()
 *          depend heavily on having a unique sync_tid to avoid duplication
 *          collisions (which key off of delete_tid).
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
        hammer2_cluster_t *cluster;
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        cluster = pmp->cluster;
        hmp = cluster->hmp;

        hammer2_voldata_lock(hmp);
        trans->sync_tid = hmp->voldata.alloc_tid++;
        trans->flags = flags;
        trans->td = curthread;
        TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If we are a flush we have to wait for all transactions
                 * prior to our flush synchronization point to complete
                 * before we can start our flush.
                 */
                ++hmp->flushcnt;
                if (hmp->curflush == NULL) {
                        hmp->curflush = trans;
                        hmp->topo_flush_tid = trans->sync_tid;
                }
                while (TAILQ_FIRST(&hmp->transq) != trans) {
                        lksleep(&trans->sync_tid, &hmp->voldatalk,
                                0, "h2syncw", hz);
                }

                /*
                 * Once we become the running flush we can wakeup anyone
                 * who blocked on us.
                 */
                scan = trans;
                while ((scan = TAILQ_NEXT(scan, entry)) != NULL) {
                        if (scan->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                        if (scan->blocked == 0)
                                break;
                        scan->blocked = 0;
                        wakeup(&scan->blocked);
                }
        } else {
                /*
                 * If we are not a flush but our sync_tid is after a
                 * stalled flush, we have to wait until that flush unstalls
                 * (that is, all transactions prior to that flush complete),
                 * but then we can run concurrently with that flush.
                 *
                 * (The flushcnt check is only good as a pre-condition;
                 * otherwise it may count elements queued after us, after
                 * we blocked.)
                 */
                if (hmp->flushcnt > 1 ||
                    (hmp->curflush &&
                     TAILQ_FIRST(&hmp->transq) != hmp->curflush)) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->blocked, &hmp->voldatalk,
                                        0, "h2trans", hz);
                        }
                }
        }
        hammer2_voldata_unlock(hmp, 0);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_cluster_t *cluster;
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        cluster = trans->pmp->cluster;
        hmp = cluster->hmp;

        hammer2_voldata_lock(hmp);
        TAILQ_REMOVE(&hmp->transq, trans, entry);
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If we were a flush we have to adjust curflush to the
                 * next flush.
                 *
                 * flush_tid is used to partition copy-on-write operations
                 * (mostly duplicate-on-modify ops), which is what allows
                 * us to execute a flush concurrently with modifying
                 * operations that have higher TIDs.
                 */
                --hmp->flushcnt;
                if (hmp->flushcnt) {
                        TAILQ_FOREACH(scan, &hmp->transq, entry) {
                                if (scan->flags & HAMMER2_TRANS_ISFLUSH)
                                        break;
                        }
                        KKASSERT(scan);
                        hmp->curflush = scan;
                        hmp->topo_flush_tid = scan->sync_tid;
                } else {
                        /*
                         * Theoretically we don't have to clear flush_tid
                         * here since the flush will have synchronized
                         * all operations <= flush_tid already.  But for
                         * now, zero it.
                         */
                        hmp->curflush = NULL;
                        hmp->topo_flush_tid = 0;
                }
        } else {
                /*
                 * If we are not a flush but a flush is now at the head
                 * of the queue and we were previously blocking it,
                 * we can now unblock it.
                 */
                if (hmp->flushcnt &&
                    (scan = TAILQ_FIRST(&hmp->transq)) != NULL &&
                    trans->sync_tid < scan->sync_tid &&
                    (scan->flags & HAMMER2_TRANS_ISFLUSH)) {
                        wakeup(&scan->sync_tid);
                }
        }
        hammer2_voldata_unlock(hmp, 0);
}
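
#if 0
/*
 * Illustrative sketch (not part of the build): how a caller typically
 * brackets work with a transaction.  A normal modifying operation would
 * pass 0 for flags; the syncer passes HAMMER2_TRANS_ISFLUSH.  The chain
 * flush call stands in for whatever modifying work the transaction
 * actually brackets; the "example_*" name is hypothetical.
 */
static void
example_flush_pfs(hammer2_pfsmount_t *pmp, hammer2_chain_t *chain)
{
        hammer2_trans_t trans;          /* lives on the caller's stack */

        /* allocates trans.sync_tid and queues us on hmp->transq */
        hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);

        /* flush everything modified at or before trans.sync_tid */
        hammer2_chain_flush(&trans, chain);

        /* dequeue ourselves and unblock anyone we were stalling */
        hammer2_trans_done(&trans);
}
#endif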

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point (sync_tid), propagating parent chain modifications
 * and mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_tid values are less than or equal
 * to the passed sync_tid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.  However, it is not actually
 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
 * capability.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's MOVED bit will be set indicating that its parent
 * (which is not part of the flush) should be updated.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        hammer2_chain_t *scan;
        hammer2_chain_core_t *core;
        hammer2_flush_info_t info;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flush_list);
        info.trans = trans;
        info.sync_tid = trans->sync_tid;
        info.mirror_tid = 0;

        core = chain->core;

        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave MOVED set for these chains, which will be
                 * handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flush_list, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_chain_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * Flush pass1 on root.
                 */
                info.diddeferral = 0;
                hammer2_chain_flush_core(&info, chain);
#if FLUSH_DEBUG
                kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
                        chain, chain->bref.type, chain->flags);
#endif

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flush_list))
                        break;
        }
}
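
#if 0
/*
 * Illustrative sketch (not part of the build): the depth-limit/deferral
 * pattern used above, reduced to a self-contained form.  Nodes deeper
 * than the limit are queued instead of recursed into, and the caller
 * keeps re-running the walk from a fresh stack until nothing is
 * deferred.  The "exnode"/"ex*" names are hypothetical.
 */
#include <sys/queue.h>

#define EX_DEPTH_LIMIT  10

struct exnode {
        struct exnode *child;
        TAILQ_ENTRY(exnode) dnode;
};

TAILQ_HEAD(exdefer_list, exnode);

static void
exwalk(struct exdefer_list *deferq, struct exnode *node, int depth)
{
        if (depth == EX_DEPTH_LIMIT) {
                /* too deep, park the node for a later pass from depth 0 */
                TAILQ_INSERT_TAIL(deferq, node, dnode);
                return;
        }
        /* ... visit node here ... */
        if (node->child)
                exwalk(deferq, node->child, depth + 1);
}

static void
exflush(struct exnode *root)
{
        struct exdefer_list deferq;
        struct exnode *scan;

        TAILQ_INIT(&deferq);
        exwalk(&deferq, root, 0);
        while ((scan = TAILQ_FIRST(&deferq)) != NULL) {
                TAILQ_REMOVE(&deferq, scan, dnode);
                exwalk(&deferq, scan, 0);       /* restart, fresh stack */
        }
}
#endif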

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and remains locked on return.  This function is keyed off of
 * the SUBMODIFIED bit but must make fine-grained choices based on the
 * synchronization point we are flushing to.
 *
 * If the flush accomplished any work chain will be flagged MOVED
 * indicating a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *       only when a chain is specifically modified, and not updated
 *       for copy-on-write propagations.  MODIFIED is set on any
 *       modification including copy-on-write propagations.
 */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
        hammer2_off_t pmask;
        hammer2_tid_t saved_sync;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_core_t *core;
        size_t psize;
        size_t boff;
        char *bdata;
        struct buf *bp;
        int error;
        int wasmodified;
        int diddeferral = 0;

        hmp = chain->hmp;

#if FLUSH_DEBUG
        if (info->parent)
                kprintf("flush_core %p->%p.%d %08x (%s)\n",
                        info->parent, chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
        else
                kprintf("flush_core NULL->%p.%d %08x (%s)\n",
                        chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
#endif
        /*
         * Ignore chains modified beyond the current flush point.  These
         * will be treated as if they did not exist.
         */
        if (chain->modify_tid > info->sync_tid)
                return;

        /*
         * Deleted chains which have not been destroyed must be retained,
         * and we probably have to recurse to clean up any sub-trees.
         * However, restricted flushes can stop processing here because
         * the chain cleanup will be handled by a later normal flush.
         *
         * The MODIFIED bit can likely be cleared in this situation and we
         * will do so later on in this procedure.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (trans->flags & HAMMER2_TRANS_RESTRICTED)
                        return;
        }

        saved_sync = info->sync_tid;
        core = chain->core;

        /*
         * If SUBMODIFIED is set we recurse the flush and adjust the
         * blockrefs accordingly.
         *
         * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
         *       finishing in the face of filesystem activity.
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                hammer2_chain_t *saved_parent;
                hammer2_tid_t saved_mirror;

                /*
                 * Clear SUBMODIFIED to catch races.  Note that any child
                 * with MODIFIED, DELETED, or MOVED set during Scan2, after
                 * it processes the child, will cause SUBMODIFIED to be
                 * re-set.  If a deferred child still has to be flushed,
                 * SUBMODIFIED will also wind up being set again (for next
                 * time), but this does not stop us from synchronizing the
                 * block updates which did occur.
                 *
                 * We don't want to set our chain to MODIFIED gratuitously.
                 *
                 * We need an extra ref on chain because we are going to
                 * release its lock temporarily in our child loop.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
                hammer2_chain_ref(chain);

                /*
                 * Run two passes.  The first pass handles MODIFIED and
                 * SUBMODIFIED chains and recurses while the second pass
                 * handles MOVED chains on the way back up.
                 *
                 * If the stack gets too deep we defer scan1, but must
                 * be sure to still run scan2 if on the next loop the
                 * deferred chain has been flushed and now needs MOVED
                 * handling on the way back up.
                 *
                 * Scan1 is recursive.
                 *
                 * NOTE: The act of handling a modified/submodified chain can
                 *       cause the MOVED flag to be set.  It can also be set
                 *       via hammer2_chain_delete() and in other situations.
                 *
                 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
                 *       because children can be physically removed during
                 *       the scan.
                 */
                saved_parent = info->parent;
                saved_mirror = info->mirror_tid;
                info->parent = chain;
                info->mirror_tid = chain->bref.mirror_tid;

                if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                        if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
                                hammer2_chain_ref(chain);
                                TAILQ_INSERT_TAIL(&info->flush_list,
                                                  chain, flush_node);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_DEFERRED);
                        }
                        diddeferral = 1;
                } else {
                        info->diddeferral = 0;
                        spin_lock(&core->cst.spin);
                        RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
                                NULL, hammer2_chain_flush_scan1, info);
                        spin_unlock(&core->cst.spin);
                        diddeferral += info->diddeferral;
                }

                /*
                 * Handle successfully flushed children who are in the MOVED
                 * state on the way back up the recursion.  This can have
                 * the side-effect of clearing MOVED.
                 *
                 * We execute this even if there were deferrals to try to
                 * keep the chain topology cleaner.
                 *
                 * Scan2 is non-recursive.
                 */
                if (diddeferral) {
                        atomic_set_int(&chain->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                } else {
#if FLUSH_DEBUG
                        kprintf("scan2_start parent %p %08x\n",
                                chain, chain->flags);
#endif
                        spin_lock(&core->cst.spin);
                        RB_SCAN(hammer2_chain_tree, &core->rbtree,
                                NULL, hammer2_chain_flush_scan2, info);
                        spin_unlock(&core->cst.spin);
#if FLUSH_DEBUG
                        kprintf("scan2_stop parent %p %08x\n",
                                chain, chain->flags);
#endif
                }
                chain->bref.mirror_tid = info->mirror_tid;
                info->mirror_tid = saved_mirror;
                info->parent = saved_parent;
                hammer2_chain_drop(chain);
        }

        /*
         * Restore sync_tid in case it was restricted by a delete/duplicate.
         */
        info->sync_tid = saved_sync;

        /*
         * Rollup diddeferral for caller.  Note direct assignment, not +=.
         */
        info->diddeferral = diddeferral;

        /*
         * Do not flush chain if there were any deferrals.  It will be
         * retried later after the deferrals are independently handled.
         */
        if (diddeferral) {
                if (hammer2_debug & 0x0008) {
                        kprintf("%*.*s} %p/%d %04x (deferred)\n",
                                info->depth, info->depth, "",
                                chain, chain->refs, chain->flags);
                }
                return;
        }

        /*
         * If we encounter a deleted chain within our flush we can clear
         * the MODIFIED bit and avoid flushing it whether it has been
         * destroyed or not.  We must make sure that the chain is flagged
         * MOVED in this situation so the parent picks up the deletion.
         *
         * Note that scan2 has already executed above so statistics have
         * already been rolled up.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                                hammer2_chain_ref(chain);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_MOVED);
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#if 0
        if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
            (chain->flags & HAMMER2_CHAIN_DELETED) &&
            (trans->flags & HAMMER2_TRANS_RESTRICTED) == 0) {
                /*
                 * Throw away the MODIFIED flag
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#endif

        /*
         * A degenerate flush might not have flushed anything and thus not
         * processed modified blocks on the way back up.  Detect the case.
         *
         * Note that MOVED can be set without MODIFIED being set due to
         * a deletion, in which case it is handled by Scan2 later on.
         *
         * Both bits can be set along with DELETED if the chain was modified
         * within the synchronization zone and then deleted beyond the zone,
         * in which case we still have to flush for synchronization point
         * consistency.  Otherwise, though, DELETED and MODIFIED are treated
         * as separate flags.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                return;

        /*
         * Issue flush.
         *
         * A DESTROYED node that reaches this point must be flushed for
         * synchronization point consistency.
         */

        /*
         * Update mirror_tid, clear MODIFIED, and set MOVED.
         *
         * The caller will update the parent's reference to this chain
         * by testing MOVED as long as the modification was in-bounds.
         *
         * MOVED is never set on the volume root as there is no parent
         * to adjust.
         */
        if (chain->bref.mirror_tid < info->sync_tid)
                chain->bref.mirror_tid = info->sync_tid;
        wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
        if (chain == &hmp->vchain)
                kprintf("(FLUSHED VOLUME HEADER)\n");
        if (chain == &hmp->fchain)
                kprintf("(FLUSHED FREEMAP HEADER)\n");

        if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
            chain == &hmp->vchain ||
            chain == &hmp->fchain) {
                /*
                 * Drop the ref from the MODIFIED bit we cleared.
                 */
                if (wasmodified)
                        hammer2_chain_drop(chain);
        } else {
                /*
                 * If we were MODIFIED we inherit the ref from clearing
                 * that bit, otherwise we need another ref.
                 */
                if (wasmodified == 0)
                        hammer2_chain_ref(chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }

        /*
         * If this is part of a recursive flush we can go ahead and write
         * out the buffer cache buffer and pass a new bref back up the chain
         * via the MOVED bit.
         *
         * Volume headers are NOT flushed here as they require special
         * processing.
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_FREEMAP:
                hammer2_modify_volume(hmp);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                /*
                 * We should flush the free block table before we calculate
                 * CRCs and copy voldata -> volsync.
                 *
                 * To prevent SMP races, fchain must remain locked until
                 * voldata is copied to volsync.
                 */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                         HAMMER2_CHAIN_SUBMODIFIED)) {
                        /* this will modify vchain as a side effect */
                        hammer2_chain_flush(info->trans, &hmp->fchain);
                }

                /*
                 * The volume header is flushed manually by the syncer, not
                 * here.  All we do is adjust the crc's.
                 */
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                kprintf("volume header mirror_tid %jd\n",
                        hmp->voldata.mirror_tid);

                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC1_OFF,
                                HAMMER2_VOLUME_ICRC1_SIZE);
                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC0_OFF,
                                HAMMER2_VOLUME_ICRC0_SIZE);
                hmp->voldata.icrc_volheader =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRCVH_OFF,
                                HAMMER2_VOLUME_ICRCVH_SIZE);
                hmp->volsync = hmp->voldata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                hammer2_chain_unlock(&hmp->fchain);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
                 * Data elements have already been flushed via the logical
                 * file buffer cache.  Their hash was set in the bref by
                 * the vop_write code.
                 *
                 * Make sure any device buffer(s) have been flushed out here
                 * (there usually aren't any to flush).
                 */
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = chain->bref.data_off & ~pmask;
                boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);

                bp = getblk(hmp->devvp, pbase, psize, GETBLK_NOWAIT, 0);
                if (bp) {
                        if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
                            (B_CACHE | B_DIRTY)) {
                                cluster_awrite(bp);
                        } else {
                                bp->b_flags |= B_RELBUF;
                                brelse(bp);
                        }
                }
                break;
#if 0
        case HAMMER2_BREF_TYPE_INDIRECT:
                /*
                 * Indirect blocks may be in an INITIAL state.  Use the
                 * chain_lock() call to ensure that the buffer has been
                 * instantiated (even though the chain is already locked
                 * the buffer might not have been instantiated).
                 *
                 * Only write the buffer out if it is dirty; it is possible
                 * the operating system had already written out the buffer.
                 */
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                KKASSERT(chain->bp != NULL);

                bp = chain->bp;
                if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
                    (bp->b_flags & B_DIRTY)) {
                        bdwrite(chain->bp);
                } else {
                        brelse(chain->bp);
                }
                chain->bp = NULL;
                chain->data = NULL;
                hammer2_chain_unlock(chain);
                break;
#endif
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                /*
                 * Device-backed.  Buffer will be flushed by the sync
                 * code XXX.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        default:
                /*
                 * Embedded elements have to be flushed out.
                 * (Basically just BREF_TYPE_INODE).
                 */
                KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                bref = &chain->bref;

                KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
                KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_ISCSI32 ||
                         HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_FREEMAP);

                /*
                 * The data is embedded, we have to acquire the
                 * buffer cache buffer and copy the data into it.
                 */
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = bref->data_off & ~pmask;
                boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);

                /*
                 * The getblk() optimization can only be used if the
                 * physical block size matches the request.
                 */
                error = bread(hmp->devvp, pbase, psize, &bp);
                KKASSERT(error == 0);

                bdata = (char *)bp->b_data + boff;

                /*
                 * Copy the data to the buffer, mark the buffer
                 * dirty, and convert the chain to unmodified.
                 */
                bcopy(chain->data, bdata, chain->bytes);
                bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
                bp = NULL;

                switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
                case HAMMER2_CHECK_FREEMAP:
                        chain->bref.check.freemap.icrc32 =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                case HAMMER2_CHECK_ISCSI32:
                        chain->bref.check.iscsi32.value =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                default:
                        panic("hammer2_flush_core: bad crc type");
                        break;  /* NOT REACHED */
                }
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
                        ++hammer2_iod_meta_write;
                else
                        ++hammer2_iod_indr_write;
        }
}
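
#if 0
/*
 * Illustrative sketch (not part of the build): the pbase/boff masking
 * arithmetic used twice above.  bref->data_off encodes a media offset
 * in its upper bits and a size radix in its low 6 bits; HAMMER2_OFF_MASK
 * strips the radix bits.  The values below are hypothetical.
 */
static void
example_devbuf_offsets(void)
{
        hammer2_off_t data_off = 0x12345A000ULL | 10;   /* radix 10 = 1KB */
        hammer2_off_t psize = 65536;                    /* 64KB device buf */
        hammer2_off_t pmask = psize - 1;
        hammer2_off_t pbase;
        size_t boff;

        pbase = data_off & ~pmask;
        /* pbase == 0x123450000: base offset of the device buffer */

        boff = (size_t)(data_off & (HAMMER2_OFF_MASK & pmask));
        /* boff == 0xA000: byte offset of the chain data in the buffer */
}
#endif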

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_t *parent = info->parent;
        int diddeferral;

        /*
         * We should only need to recurse if SUBMODIFIED is set, but as
         * a safety we also recurse if MODIFIED is set.
         *
         * Return early if neither bit is set.  We must re-assert the
         * SUBMODIFIED flag in the parent if any child covered by the
         * parent (via delete_tid) is skipped.
         */
        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        hammer2_chain_ref(child);
        spin_unlock(&parent->core->cst.spin);

        /*
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.  Re-check the flags before
         * continuing.
         */
        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        /*
         * The DESTROYED flag can only be initially set on an unreferenced
         * deleted inode and will propagate downward via the mechanism below.
         * Such inode chains have been deleted for good and should no longer
         * be subject to delete/duplication.
         *
         * This optimization allows the inode reclaim (destroy unlinked file
         * on vnode reclamation after last close) to be flagged by just
         * setting HAMMER2_CHAIN_DESTROYED at the top level, which will then
         * cause the chains to be terminated and related buffers to be
         * invalidated and not flushed out.
         *
         * We have to be careful not to propagate the DESTROYED flag if
         * the destruction occurred after our flush sync_tid.
         */
        if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
            (child->flags & HAMMER2_CHAIN_DELETED) &&
            (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
                atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }

        /*
         * Recurse and collect deferral data.
         */
        diddeferral = info->diddeferral;
        ++info->depth;
        hammer2_chain_flush_core(info, child);
#if FLUSH_DEBUG
        kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
                parent, parent->flags, child, child->bref.type, child->flags);
#endif
        --info->depth;
        info->diddeferral += diddeferral;

        if (child->flags & HAMMER2_CHAIN_SUBMODIFIED)
                atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);

        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);

        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);

        spin_lock(&parent->core->cst.spin);
        return (0);
}

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * This function also rolls up storage statistics.
 *
 * NOTE! We must re-set SUBMODIFIED on the parent(s) as appropriate, and
 *       due to the above conditions it is possible to do this and still
 *       have some children flagged MOVED depending on the synchronization.
 *
 * NOTE! A deletion is a visibility issue: there can still be references to
 *       deleted elements (for example, to an unlinked file which is still
 *       open), and there can also be multiple chains pointing to the same
 *       bref where some are deleted and some are not (for example due to
 *       a rename).  So a chain marked for deletion is basically considered
 *       to be live until it is explicitly destroyed or until its ref-count
 *       reaches zero (also implying that MOVED and MODIFIED are clear).
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_chain_t *parent = info->parent;
        hammer2_chain_core_t *above = child->above;
        hammer2_mount_t *hmp = child->hmp;
        hammer2_trans_t *trans = info->trans;
        hammer2_blockref_t *base;
        int count;

        /*
         * Inodes with stale children that have been converted to DIRECTDATA
         * mode (file extension or hardlink conversion typically) need to be
         * skipped right now before we start messing with a non-existent
         * block table.
         */
#if 0
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
            (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
#if FLUSH_DEBUG
                kprintf("B");
#endif
                goto finalize;
        }
#endif

        /*
         * Ignore children created after our flush point, treating them as
         * if they did not exist.  These children will not cause the parent
         * to be updated.
         *
         * When we encounter such children and the parent chain has not been
         * deleted, delete/duplicated, or delete/duplicated-for-move, then
         * the parent may be used to funnel through several flush points.
         * We must re-set the SUBMODIFIED flag in the parent to ensure that
         * those flushes have visibility.  A simple test of delete_tid
         * suffices to determine if the parent spans beyond our current
         * flush.
         */
        if (child->modify_tid > trans->sync_tid) {
#if FLUSH_DEBUG
                kprintf("E");
#endif
                goto finalize;
        }

        /*
         * Ignore children which have not changed.  The parent's block table
         * is already correct.
         */
        if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
#if FLUSH_DEBUG
                kprintf("D");
#endif
                goto finalize;
        }

        hammer2_chain_ref(child);
        spin_unlock(&above->cst.spin);

        /*
         * The MOVED bit implies an additional reference which prevents
         * the child from being destroyed out from under our operation
         * so we can lock the child safely without worrying about it
         * getting ripped up (?).
         *
         * We can only update parents where child->parent matches.  The
         * child->parent link will migrate along the chain but the flush
         * order must be enforced absolutely.  Parents reflushed after the
         * child has passed them by should skip due to the modify_tid test.
         */
        hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);

        /*
         * The parent's blockref to the child must be deleted or updated.
         *
         * This point is not reached on successful DESTROYED optimizations
         * but can be reached on recursive deletions and restricted flushes.
         *
         * Because flushes are ordered we do not have to make a
         * modify/duplicate of indirect blocks.  That is, the flush
         * code does not have to kmalloc or duplicate anything.  We
         * can adjust the indirect block table in-place and reuse the
         * chain.  It IS possible that the chain has already been duplicated
         * or may wind up being duplicated on-the-fly by modifying code
         * on the frontend.  We simply use the original and ignore such
         * chains.  However, it does mean we can't clear the MOVED bit.
         *
         * XXX recursive deletions not optimized.
         */
        hammer2_chain_modify(trans, &parent,
                             HAMMER2_MODIFY_NO_MODIFY_TID |
                             HAMMER2_MODIFY_ASSERTNOCOPY);

        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
                /*
                 * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we
                 * properly duplicate the inode headers and do proper flush
                 * range checks (all the children should be beyond the flush
                 * point).  For now just don't sync the non-applicable
                 * children.
                 *
                 * XXX Can also occur due to hardlink consolidation.  We
                 * set OPFLAG_DIRECTDATA to prevent the indirect and data
                 * blocks from syncing to the hardlink pointer.
                 */
#if 0
                KKASSERT((parent->data->ipdata.op_flags &
                          HAMMER2_OPFLAG_DIRECTDATA) == 0);
#endif
#if 0
                if (parent->data->ipdata.op_flags &
                    HAMMER2_OPFLAG_DIRECTDATA) {
                        base = NULL;
                } else
#endif
                {
                        base = &parent->data->ipdata.u.blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                if (parent->data) {
                        base = &parent->data->npdata[0];
                } else {
                        base = NULL;
                        KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP:
                base = &parent->data->npdata[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
                base = NULL;
                count = 0;
                panic("hammer2_chain_flush_scan2: "
                      "unrecognized blockref type: %d",
                      parent->bref.type);
        }

        /*
         * Update the parent's blockref table and propagate mirror_tid.
         *
         * NOTE! Children with modify_tid's beyond our flush point are
         *       considered to not exist for the purposes of updating the
         *       parent's blockref array.
         *
         * NOTE! Updates to a parent's blockref table do not adjust the
         *       parent's bref.modify_tid, only its bref.mirror_tid.
         */
        KKASSERT(child->index >= 0);
        if (child->delete_tid <= trans->sync_tid) {
                if (base) {
                        hammer2_rollup_stats(parent, child, -1);
                        KKASSERT(child->index < count);
                        bzero(&base[child->index], sizeof(child->bref));
                }
                if (info->mirror_tid < child->delete_tid)
                        info->mirror_tid = child->delete_tid;
        } else {
                if (base) {
                        KKASSERT(child->index < count);
                        if (base[child->index].type == 0)
                                hammer2_rollup_stats(parent, child, 1);
                        else
                                hammer2_rollup_stats(parent, child, 0);
                        base[child->index] = child->bref;
                }
                if (info->mirror_tid < child->modify_tid)
                        info->mirror_tid = child->modify_tid;
        }

        if (info->mirror_tid < child->bref.mirror_tid) {
                info->mirror_tid = child->bref.mirror_tid;
        }
        if ((parent->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
             parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP) &&
            hmp->voldata.mirror_tid < child->bref.mirror_tid) {
                hmp->voldata.mirror_tid = child->bref.mirror_tid;
        }

        /*
         * When can we safely clear the MOVED flag?  Flushes down duplicate
         * paths can occur out of order, for example if an inode is moved
         * as part of a hardlink consolidation or if an inode is moved into
         * an indirect block indexed before the inode.
         *
         * Only clear MOVED once all possible parents have been flushed.
         */
        if (child->flags & HAMMER2_CHAIN_MOVED) {
                hammer2_chain_t *scan;
                int ok = 1;

                spin_lock(&above->cst.spin);
                for (scan = above->first_parent;
                     scan;
                     scan = scan->next_parent) {
                        /*
                         * XXX weird code also checked at the top of scan2,
                         * I would like to fix this by detaching the core
                         * on initial hardlink consolidation (1->2 nlinks).
                         */
#if 0
                        if (scan->bref.type == HAMMER2_BREF_TYPE_INODE &&
                            (scan->data->ipdata.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA)) {
                                continue;
                        }
#endif
                        if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                                ok = 0;
                                break;
                        }
                }
                spin_unlock(&above->cst.spin);
                if (ok) {
                        atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
                        hammer2_chain_drop(child);      /* flag */
                }
        }

        /*
         * Unlock the child.  This can wind up dropping the child's
         * last ref, removing it from the parent's RB tree, and deallocating
         * the structure.  The RB_SCAN() our caller is doing handles the
         * situation.
         */
        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);
        spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
        kprintf("F");
#endif

        /*
         * The parent cleared SUBMODIFIED prior to the scan.  If the child
         * still requires a flush (possibly due to being outside the current
         * synchronization zone), we must re-set SUBMODIFIED on the way back
         * up.
         */
finalize:
#if FLUSH_DEBUG
        kprintf("G child %p %08x\n", child, child->flags);
#endif
        return (0);
}

static
void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
        hammer2_chain_t *grandp;

        parent->data_count += child->data_count;
        parent->inode_count += child->inode_count;
        child->data_count = 0;
        child->inode_count = 0;
        if (how < 0) {
                parent->data_count -= child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count -= 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count -= child->data->ipdata.data_count;
                        parent->inode_count -= child->data->ipdata.inode_count;
#endif
                }
        } else if (how > 0) {
                parent->data_count += child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count += 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count += child->data->ipdata.data_count;
                        parent->inode_count += child->data->ipdata.inode_count;
#endif
                }
        }
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
                parent->data->ipdata.data_count += parent->data_count;
                parent->data->ipdata.inode_count += parent->inode_count;
                for (grandp = parent->above->first_parent;
                     grandp;
                     grandp = grandp->next_parent) {
                        grandp->data_count += parent->data_count;
                        grandp->inode_count += parent->inode_count;
                }
                parent->data_count = 0;
                parent->inode_count = 0;
        }
}