/*
 * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;
	hammer2_trans_t	*trans;
	int		depth;
	int		diddeferral;
	struct flush_deferral_list flush_list;
	hammer2_tid_t	sync_tid;	/* flush synchronization point */
	hammer2_tid_t	mirror_tid;	/* collect mirror TID updates */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);

#if 0
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
		    int how)
{
	hammer2_key_t bytes;

	if (bref->type != 0) {
		bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
		if (bref->type == HAMMER2_BREF_TYPE_INODE)
			info->inode_count += how;
		if (how < 0)
			info->data_count -= bytes;
		else
			info->data_count += bytes;
	}
}
#endif

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  We
 * don't bother marking the volume header MODIFIED.  Instead, the volume
 * will be synchronized at a later time as part of a larger flush sequence.
 *
 * Non-flush transactions can typically run concurrently.  However, if
 * there are non-flush transactions both before AND after a flush
 * transaction, the transactions after it stall until the ones before
 * it finish.
 *
 * Non-flush transactions occurring after a flush transaction can run
 * concurrently with that flush.  They only have to wait for transactions
 * prior to the flush transaction to complete before they unstall.
 *
 * WARNING! Modifications to the root volume cannot dup the root volume
 *	    header to handle synchronization points, so alloc_tid can
 *	    wind up (harmlessly) more advanced on flush.
 *
 * WARNING! Operations which might call inode_duplicate()/chain_duplicate()
 *	    depend heavily on having a unique sync_tid to avoid duplication
 *	    collisions (which key off of delete_tid).
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
	hammer2_cluster_t *cluster;
	hammer2_mount_t *hmp;
	hammer2_trans_t *scan;

	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	cluster = pmp->cluster;
	hmp = cluster->hmp;

	hammer2_voldata_lock(hmp);
	trans->sync_tid = hmp->voldata.alloc_tid++;
	trans->flags = flags;
	trans->td = curthread;
	TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If we are a flush we have to wait for all transactions
		 * prior to our flush synchronization point to complete
		 * before we can start our flush.
		 */
		++hmp->flushcnt;
		if (hmp->curflush == NULL) {
			hmp->curflush = trans;
			hmp->topo_flush_tid = trans->sync_tid;
		}
		while (TAILQ_FIRST(&hmp->transq) != trans) {
			lksleep(&trans->sync_tid, &hmp->voldatalk,
				0, "h2syncw", hz);
		}

		/*
		 * Once we become the running flush we can wakeup anyone
		 * who blocked on us.
		 */
		scan = trans;
		while ((scan = TAILQ_NEXT(scan, entry)) != NULL) {
			if (scan->flags & HAMMER2_TRANS_ISFLUSH)
				break;
			if (scan->blocked == 0)
				break;
			scan->blocked = 0;
			wakeup(&scan->blocked);
		}
	} else {
		/*
		 * If we are not a flush but our sync_tid is after a
		 * stalled flush, we have to wait until that flush unstalls
		 * (that is, until all transactions prior to that flush
		 * complete), but then we can run concurrently with that
		 * flush.
		 *
		 * (The flushcnt check is only good as a pre-condition;
		 * otherwise it may represent elements queued after us,
		 * after we blocked).
		 */
		if (hmp->flushcnt > 1 ||
		    (hmp->curflush &&
		     TAILQ_FIRST(&hmp->transq) != hmp->curflush)) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->blocked, &hmp->voldatalk,
					0, "h2trans", hz);
			}
		}
	}
	hammer2_voldata_unlock(hmp, 0);
}
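
/*
 * Illustrative timeline for the ordering rules above (editor's sketch,
 * derived from the code; the transaction names are hypothetical).
 * Suppose transactions are initiated in the order T1 (modify),
 * T2 (flush), T3 (modify), T4 (modify):
 *
 *	transq: T1 -> T2(FLUSH) -> T3 -> T4
 *
 *	- T2 sleeps in hammer2_trans_init() until T1 completes, because a
 *	  flush must wait for all transactions with earlier sync_tids.
 *	- T3 and T4 block at init time (trans->blocked = 1) while T2 is
 *	  stalled behind T1, but are woken as soon as T2 becomes the
 *	  running flush; they then run concurrently with T2.
 */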

void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_cluster_t *cluster;
	hammer2_mount_t *hmp;
	hammer2_trans_t *scan;

	cluster = trans->pmp->cluster;
	hmp = cluster->hmp;

	hammer2_voldata_lock(hmp);
	TAILQ_REMOVE(&hmp->transq, trans, entry);
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If we were a flush we have to adjust curflush to the
		 * next flush.
		 *
		 * flush_tid is used to partition copy-on-write operations
		 * (mostly duplicate-on-modify ops), which is what allows
		 * us to execute a flush concurrently with modifying
		 * operations that have higher TIDs.
		 */
		--hmp->flushcnt;
		if (hmp->flushcnt) {
			TAILQ_FOREACH(scan, &hmp->transq, entry) {
				if (scan->flags & HAMMER2_TRANS_ISFLUSH)
					break;
			}
			KKASSERT(scan);
			hmp->curflush = scan;
			hmp->topo_flush_tid = scan->sync_tid;
		} else {
			/*
			 * Theoretically we don't have to clear flush_tid
			 * here since the flush will have synchronized
			 * all operations <= flush_tid already.  But for
			 * now we zero it.
			 */
			hmp->curflush = NULL;
			hmp->topo_flush_tid = 0;
		}
	} else {
		/*
		 * If we are not a flush but a flush is now at the head
		 * of the queue and we were previously blocking it,
		 * we can now unblock it.
		 */
		if (hmp->flushcnt &&
		    (scan = TAILQ_FIRST(&hmp->transq)) != NULL &&
		    trans->sync_tid < scan->sync_tid &&
		    (scan->flags & HAMMER2_TRANS_ISFLUSH)) {
			wakeup(&scan->sync_tid);
		}
	}
	hammer2_voldata_unlock(hmp, 0);
}
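
/*
 * Minimal usage sketch for the transaction API above (editor's
 * illustration; this exact sequence does not appear verbatim in this
 * file and the pmp variable is assumed to be supplied by the caller):
 *
 *	hammer2_trans_t trans;
 *
 *	hammer2_trans_init(&trans, pmp, 0);	// non-flush modify trans
 *	// ... perform modifications keyed to trans.sync_tid ...
 *	hammer2_trans_done(&trans);
 */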

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point (sync_tid), propagating parent chain modifications
 * and mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * The caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_tid values are less than or equal
 * to the passed sync_tid.
 *
 * The caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.  However, it is not actually
 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
 * capability.
 *
 * The chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's MOVED bit will be set indicating that its parent
 * (which is not part of the flush) should be updated.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_chain_core_t *core;
	hammer2_flush_info_t info;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);
	info.trans = trans;
	info.sync_tid = trans->sync_tid;
	info.mirror_tid = 0;

	core = chain->core;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave MOVED set for these chains, which will be
		 * handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_flush(trans, scan);
			hammer2_chain_unlock(scan);
			hammer2_chain_drop(scan);	/* ref from deferral */
		}

		/*
		 * Flush pass1 on root.
		 */
		info.diddeferral = 0;
		hammer2_chain_flush_core(&info, chain);
#if FLUSH_DEBUG
		kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
			chain, chain->bref.type, chain->flags);
#endif

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flush_list))
			break;
	}
}
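
/*
 * Illustrative caller sequence (editor's sketch, not a verbatim call
 * site): a flush transaction wrapping hammer2_chain_flush(), with the
 * chain locked by the caller as the comments above require:
 *
 *	hammer2_trans_t trans;
 *
 *	hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);
 *	hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
 *	hammer2_chain_flush(&trans, chain);
 *	hammer2_chain_unlock(chain);
 *	hammer2_trans_done(&trans);
 *
 * On return the chain's MOVED bit tells the caller whether its parent
 * (outside the flush) still needs to be updated.
 */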

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and remains locked on return.  This function is keyed off of
 * the SUBMODIFIED bit but must make fine-grained choices based on the
 * synchronization point we are flushing to.
 *
 * If the flush accomplished any work the chain will be flagged MOVED,
 * indicating that a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *	 only when a chain is specifically modified, and not updated
 *	 for copy-on-write propagations.  MODIFIED is set on any modification
 *	 including copy-on-write propagations.
 */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	hammer2_off_t pmask;
	hammer2_tid_t saved_sync;
	hammer2_trans_t *trans = info->trans;
	hammer2_chain_core_t *core;
	size_t psize;
	size_t boff;
	char *bdata;
	struct buf *bp;
	int error;
	int wasmodified;
	int diddeferral = 0;

	hmp = chain->hmp;

#if FLUSH_DEBUG
	if (info->parent)
		kprintf("flush_core %p->%p.%d %08x (%s)\n",
			info->parent, chain, chain->bref.type,
			chain->flags,
			((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
				chain->data->ipdata.filename : "?"));
	else
		kprintf("flush_core NULL->%p.%d %08x (%s)\n",
			chain, chain->bref.type,
			chain->flags,
			((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
				chain->data->ipdata.filename : "?"));
#endif
	/*
	 * Ignore chains modified beyond the current flush point.  These
	 * will be treated as if they did not exist.
	 */
	if (chain->modify_tid > info->sync_tid)
		return;

	/*
	 * Deleted chains which have not been destroyed must be retained,
	 * and we probably have to recurse to clean up any sub-trees.
	 * However, restricted flushes can stop processing here because
	 * the chain cleanup will be handled by a later normal flush.
	 *
	 * The MODIFIED bit can likely be cleared in this situation and we
	 * will do so later on in this procedure.
	 */
	if (chain->delete_tid <= info->sync_tid) {
		if (trans->flags & HAMMER2_TRANS_RESTRICTED)
			return;
	}

	saved_sync = info->sync_tid;
	core = chain->core;

	/*
	 * If SUBMODIFIED is set we recurse the flush and adjust the
	 * blockrefs accordingly.
	 *
	 * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
	 *	 finishing in the face of filesystem activity.
	 */
	if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
		hammer2_chain_t *saved_parent;
		hammer2_tid_t saved_mirror;

		/*
		 * Clear SUBMODIFIED to catch races.  Note that any child
		 * with MODIFIED, DELETED, or MOVED set during Scan2, after
		 * Scan2 processes the child, will cause SUBMODIFIED to be
		 * re-set.  If a child still has to be flushed, SUBMODIFIED
		 * thus winds up being set again (for next time), but this
		 * does not stop us from synchronizing the block updates
		 * which have already occurred.
		 *
		 * We don't want to set our chain to MODIFIED gratuitously.
		 *
		 * We need an extra ref on chain because we are going to
		 * release its lock temporarily in our child loop.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
		hammer2_chain_ref(chain);

		/*
		 * Run two passes.  The first pass handles MODIFIED and
		 * SUBMODIFIED chains and recurses while the second pass
		 * handles MOVED chains on the way back up.
		 *
		 * If the stack gets too deep we defer scan1, but must
		 * be sure to still run scan2 if on the next loop the
		 * deferred chain has been flushed and now needs MOVED
		 * handling on the way back up.
		 *
		 * Scan1 is recursive.
		 *
		 * NOTE: The act of handling a modified/submodified chain can
		 *	 cause the MOVED flag to be set.  It can also be set
		 *	 via hammer2_chain_delete() and in other situations.
		 *
		 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
		 *	 because children can be physically removed during
		 *	 the scan.
		 */
		saved_parent = info->parent;
		saved_mirror = info->mirror_tid;
		info->parent = chain;
		info->mirror_tid = chain->bref.mirror_tid;

		if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
			if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
				hammer2_chain_ref(chain);
				TAILQ_INSERT_TAIL(&info->flush_list,
						  chain, flush_node);
				atomic_set_int(&chain->flags,
					       HAMMER2_CHAIN_DEFERRED);
			}
			diddeferral = 1;
		} else {
			info->diddeferral = 0;
			spin_lock(&core->cst.spin);
			RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
				NULL, hammer2_chain_flush_scan1, info);
			spin_unlock(&core->cst.spin);
			diddeferral += info->diddeferral;
		}

		/*
		 * Handle successfully flushed children who are in the MOVED
		 * state on the way back up the recursion.  This can have
		 * the side-effect of clearing MOVED.
		 *
		 * We execute this even if there were deferrals, to try to
		 * keep the chain topology cleaner.
		 *
		 * Scan2 is non-recursive.
		 */
		if (diddeferral) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_SUBMODIFIED);
		} else {
#if FLUSH_DEBUG
			kprintf("scan2_start parent %p %08x\n",
				chain, chain->flags);
#endif
			spin_lock(&core->cst.spin);
			RB_SCAN(hammer2_chain_tree, &core->rbtree,
				NULL, hammer2_chain_flush_scan2, info);
			spin_unlock(&core->cst.spin);
#if FLUSH_DEBUG
			kprintf("scan2_stop parent %p %08x\n",
				chain, chain->flags);
#endif
		}
		chain->bref.mirror_tid = info->mirror_tid;
		info->mirror_tid = saved_mirror;
		info->parent = saved_parent;
		hammer2_chain_drop(chain);
	}

	/*
	 * Restore sync_tid in case it was restricted by a delete/duplicate.
	 */
	info->sync_tid = saved_sync;

	/*
	 * Rollup diddeferral for caller.  Note direct assignment, not +=.
	 */
	info->diddeferral = diddeferral;

	/*
	 * Do not flush the chain if there were any deferrals.  It will be
	 * retried later after the deferrals are independently handled.
	 */
	if (diddeferral) {
		if (hammer2_debug & 0x0008) {
			kprintf("%*.*s} %p/%d %04x (deferred)\n",
				info->depth, info->depth, "",
				chain, chain->refs, chain->flags);
		}
		return;
	}

	/*
	 * If we encounter a deleted chain within our flush we can clear
	 * the MODIFIED bit and avoid flushing it whether it has been
	 * destroyed or not.  We must make sure that the chain is flagged
	 * MOVED in this situation so the parent picks up the deletion.
	 */
	if (chain->delete_tid <= info->sync_tid) {
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			if (chain->bp) {
				if (chain->bytes == chain->bp->b_bufsize)
					chain->bp->b_flags |= B_INVAL|B_RELBUF;
			}
			if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
				hammer2_chain_ref(chain);
				atomic_set_int(&chain->flags,
					       HAMMER2_CHAIN_MOVED);
			}
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			hammer2_chain_drop(chain);
		}
		return;
	}
#if 0
	if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
	    (chain->flags & HAMMER2_CHAIN_DELETED) &&
	    (trans->flags & HAMMER2_TRANS_RESTRICTED) == 0) {
		/*
		 * Throw-away the MODIFIED flag
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			if (chain->bp) {
				if (chain->bytes == chain->bp->b_bufsize)
					chain->bp->b_flags |= B_INVAL|B_RELBUF;
			}
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			hammer2_chain_drop(chain);
		}
		return;
	}
#endif

	/*
	 * A degenerate flush might not have flushed anything and thus not
	 * processed modified blocks on the way back up.  Detect the case.
	 *
	 * Note that MOVED can be set without MODIFIED being set due to
	 * a deletion, in which case it is handled by Scan2 later on.
	 *
	 * Both bits can be set along with DELETED if the chain was modified
	 * within the synchronization zone and then deleted beyond the zone,
	 * in which case we still have to flush it for synchronization point
	 * consistency.  Otherwise DELETED and MODIFIED are treated as
	 * separate flags.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
		return;

	/*
	 * Issue the flush.
	 *
	 * A DESTROYED node that reaches this point must be flushed for
	 * synchronization point consistency.
	 */

	/*
	 * Update mirror_tid, clear MODIFIED, and set MOVED.
	 *
	 * The caller will update the parent's reference to this chain
	 * by testing MOVED as long as the modification was in-bounds.
	 *
	 * MOVED is never set on the volume root as there is no parent
	 * to adjust.
	 */
	if (chain->bref.mirror_tid < info->sync_tid)
		chain->bref.mirror_tid = info->sync_tid;
	wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	if (chain == &hmp->vchain)
		kprintf("(FLUSHED VOLUME HEADER)\n");
	if (chain == &hmp->fchain)
		kprintf("(FLUSHED FREEMAP HEADER)\n");

	if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
	    chain == &hmp->vchain ||
	    chain == &hmp->fchain) {
		/*
		 * Drop the ref from the MODIFIED bit we cleared.
		 */
		if (wasmodified)
			hammer2_chain_drop(chain);
	} else {
		/*
		 * If we were MODIFIED we inherit the ref from clearing
		 * that bit, otherwise we need another ref.
		 */
		if (wasmodified == 0)
			hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
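
	/*
	 * Editor's note on the reference accounting above: the MODIFIED
	 * and MOVED flags each hold one implied chain ref, so the two
	 * branches preserve the ref count invariant:
	 *
	 *	MOVED already set (or vchain/fchain): the ref backing
	 *	    MODIFIED is dropped; MOVED keeps its pre-existing ref.
	 *	MOVED newly set, was MODIFIED: the MODIFIED ref is simply
	 *	    inherited by MOVED (no net change).
	 *	MOVED newly set, not MODIFIED: a new ref is taken.  Given
	 *	    the earlier MODIFIED early-return, wasmodified should
	 *	    always be 1 here; this path appears to be defensive.
	 */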

	/*
	 * If this is part of a recursive flush we can go ahead and write
	 * out the buffer cache buffer and pass a new bref back up the chain
	 * via the MOVED bit.
	 *
	 * Volume headers are NOT flushed here as they require special
	 * processing.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP:
		hammer2_modify_volume(hmp);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * We should flush the free block table before we calculate
		 * CRCs and copy voldata -> volsync.
		 *
		 * To prevent SMP races, fchain must remain locked until
		 * voldata is copied to volsync.
		 */
		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
		if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
					 HAMMER2_CHAIN_SUBMODIFIED)) {
			/* this will modify vchain as a side effect */
			hammer2_chain_flush(info->trans, &hmp->fchain);
		}

		/*
		 * The volume header is flushed manually by the syncer, not
		 * here.  All we do is adjust the CRCs.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);
		kprintf("volume header mirror_tid %jd\n",
			hmp->voldata.mirror_tid);

		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC1_OFF,
				HAMMER2_VOLUME_ICRC1_SIZE);
		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC0_OFF,
				HAMMER2_VOLUME_ICRC0_SIZE);
		hmp->voldata.icrc_volheader =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRCVH_OFF,
				HAMMER2_VOLUME_ICRCVH_SIZE);
		hmp->volsync = hmp->voldata;
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
		hammer2_chain_unlock(&hmp->fchain);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * Data elements have already been flushed via the logical
		 * file buffer cache.  Their hash was set in the bref by
		 * the vop_write code.
		 *
		 * Make sure any device buffer(s) have been flushed out here
		 * (there aren't usually any to flush).
		 */
		psize = hammer2_devblksize(chain->bytes);
		pmask = (hammer2_off_t)psize - 1;
		pbase = chain->bref.data_off & ~pmask;
		boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);

		bp = getblk(hmp->devvp, pbase, psize, GETBLK_NOWAIT, 0);
		if (bp) {
			if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
			    (B_CACHE | B_DIRTY)) {
				cluster_awrite(bp);
			} else {
				bp->b_flags |= B_RELBUF;
				brelse(bp);
			}
		}
		break;
#if 0
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect blocks may be in an INITIAL state.  Use the
		 * chain_lock() call to ensure that the buffer has been
		 * instantiated (even though the chain is already locked,
		 * its buffer might not have been instantiated).
		 *
		 * Only write the buffer out if it is dirty; the operating
		 * system may have already written it out.
		 */
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		KKASSERT(chain->bp != NULL);

		bp = chain->bp;
		if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
		    (bp->b_flags & B_DIRTY)) {
			bdwrite(chain->bp);
		} else {
			brelse(chain->bp);
		}
		chain->bp = NULL;
		chain->data = NULL;
		hammer2_chain_unlock(chain);
		break;
#endif
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Device-backed.  Buffer will be flushed by the sync
		 * code XXX.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
		break;
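
	/*
	 * Editor's worked example of the pbase/boff arithmetic used in
	 * the DATA case above and the embedded case below (values are
	 * hypothetical, assuming the usual HAMMER2 encoding where the
	 * low 6 bits of data_off hold the size radix, per the
	 * HAMMER2_OFF_MASK_RADIX usage earlier in this file).  With a
	 * 64KB device block (psize = 0x10000) and
	 * data_off = 0x012345678a (offset 0x0123456780, radix 0x0a):
	 *
	 *	pmask = psize - 1		 = 0xffff
	 *	pbase = data_off & ~pmask	 = 0x0123450000
	 *	boff  = data_off & (HAMMER2_OFF_MASK & pmask)
	 *					 = 0x6780
	 *
	 * i.e. pbase is the device buffer's base offset and boff is the
	 * byte offset of the element within that buffer.
	 */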
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	default:
		/*
		 * Embedded elements have to be flushed out.
		 * (Basically just BREF_TYPE_INODE).
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);
		bref = &chain->bref;

		KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
		KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
			 HAMMER2_CHECK_ISCSI32 ||
			 HAMMER2_DEC_CHECK(chain->bref.methods) ==
			 HAMMER2_CHECK_FREEMAP);

		/*
		 * The data is embedded, we have to acquire the
		 * buffer cache buffer and copy the data into it.
		 */
		psize = hammer2_devblksize(chain->bytes);
		pmask = (hammer2_off_t)psize - 1;
		pbase = bref->data_off & ~pmask;
		boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.
		 */
		error = bread(hmp->devvp, pbase, psize, &bp);
		KKASSERT(error == 0);

		bdata = (char *)bp->b_data + boff;

		/*
		 * Copy the data to the buffer, mark the buffer
		 * dirty, and convert the chain to unmodified.
		 */
		bcopy(chain->data, bdata, chain->bytes);
		bp->b_flags |= B_CLUSTEROK;
		bdwrite(bp);
		bp = NULL;

		switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
		case HAMMER2_CHECK_FREEMAP:
			chain->bref.check.freemap.icrc32 =
				hammer2_icrc32(chain->data, chain->bytes);
			break;
		case HAMMER2_CHECK_ISCSI32:
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
			break;
		default:
			panic("hammer2_flush_core: bad crc type");
			break;	/* NOT REACHED */
		}
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
			++hammer2_iod_meta_write;
		else
			++hammer2_iod_indr_write;
	}
}
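
/*
 * Editor's summary of the early-out logic in hammer2_chain_flush_core()
 * above, derived from the code for readability:
 *
 *	modify_tid  > sync_tid			-> ignore chain entirely
 *	delete_tid <= sync_tid && RESTRICTED	-> return (a later normal
 *						   flush handles cleanup)
 *	deferrals occurred			-> re-set SUBMODIFIED and
 *						   return (retried later)
 *	delete_tid <= sync_tid			-> clear MODIFIED, set
 *						   MOVED, return (no media
 *						   write needed)
 *	!MODIFIED				-> return (nothing to write)
 *	otherwise				-> update mirror_tid, clear
 *						   MODIFIED, set MOVED, and
 *						   issue the media write per
 *						   bref type
 */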

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_trans_t *trans = info->trans;
	hammer2_chain_t *parent = info->parent;
	int diddeferral;

	/*
	 * We should only need to recurse if SUBMODIFIED is set, but as
	 * a safety we also recurse if MODIFIED is set.
	 *
	 * Return early if neither bit is set.  We must re-assert the
	 * SUBMODIFIED flag in the parent if any child covered by the
	 * parent (via delete_tid) is skipped.
	 */
	if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
			     HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
		return (0);
	}
	if (child->modify_tid > trans->sync_tid) {
		if (parent->delete_tid > trans->sync_tid) {
			atomic_set_int(&parent->flags,
				       HAMMER2_CHAIN_SUBMODIFIED);
		}
		return (0);
	}

	hammer2_chain_ref(child);
	spin_unlock(&parent->core->cst.spin);

	/*
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.  Re-check the flags before
	 * continuing.
	 */
	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
			     HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
		hammer2_chain_unlock(child);
		hammer2_chain_drop(child);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
		spin_lock(&parent->core->cst.spin);
		return (0);
	}
	if (child->modify_tid > trans->sync_tid) {
		hammer2_chain_unlock(child);
		hammer2_chain_drop(child);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
		spin_lock(&parent->core->cst.spin);
		if (parent->delete_tid > trans->sync_tid) {
			atomic_set_int(&parent->flags,
				       HAMMER2_CHAIN_SUBMODIFIED);
		}
		return (0);
	}

	/*
	 * The DESTROYED flag can only be initially set on an unreferenced
	 * deleted inode and will propagate downward via the mechanic below.
	 * Such inode chains have been deleted for good and should no longer
	 * be subject to delete/duplication.
	 *
	 * This optimization allows the inode reclaim (destroy unlinked file
	 * on vnode reclamation after last close) to be flagged by just
	 * setting HAMMER2_CHAIN_DESTROYED at the top level, which then
	 * causes the chains to be terminated and related buffers to be
	 * invalidated rather than flushed out.
	 *
	 * We have to be careful not to propagate the DESTROYED flag if
	 * the destruction occurred after our flush sync_tid.
	 */
	if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
	    (child->flags & HAMMER2_CHAIN_DELETED) &&
	    (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
		atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED |
					      HAMMER2_CHAIN_SUBMODIFIED);
	}

	/*
	 * Recurse and collect deferral data.
	 */
	diddeferral = info->diddeferral;
	++info->depth;
	hammer2_chain_flush_core(info, child);
#if FLUSH_DEBUG
	kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
		parent, parent->flags, child, child->bref.type, child->flags);
#endif
	--info->depth;
	info->diddeferral += diddeferral;

	if (child->flags & HAMMER2_CHAIN_SUBMODIFIED)
		atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);

	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);

	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);

	spin_lock(&parent->core->cst.spin);
	return (0);
}
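
/*
 * Editor's note: scan1 above uses a lock-ordering dance worth calling
 * out.  Because the parent's spinlock and chain lock cannot be held
 * across a blocking child lock, the callback does:
 *
 *	hammer2_chain_ref(child);		// pin child
 *	spin_unlock(&parent->core->cst.spin);
 *	hammer2_chain_unlock(parent);		// avoid lock inversion
 *	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
 *	// ... re-validate child flags/modify_tid, then recurse ...
 *	hammer2_chain_unlock(child);
 *	hammer2_chain_drop(child);
 *	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
 *	spin_lock(&parent->core->cst.spin);	// resume the RB_SCAN
 *
 * The flag re-checks after relocking are mandatory since the child may
 * have been flushed or modified while the locks were released.
 */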

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * NOTE! We must re-set SUBMODIFIED on the parent(s) as appropriate, and
 *	 due to the above conditions it is possible to do this and still
 *	 have some children flagged MOVED depending on the synchronization.
 *
 * NOTE! A deletion is a visibility issue; there can still be references to
 *	 deleted elements (for example, to an unlinked file which is still
 *	 open), and there can also be multiple chains pointing to the same
 *	 bref where some are deleted and some are not (for example, due to
 *	 a rename).  So a chain marked for deletion is basically considered
 *	 to be live until it is explicitly destroyed or until its ref-count
 *	 reaches zero (also implying that MOVED and MODIFIED are clear).
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_chain_core_t *above = child->above;
	hammer2_mount_t *hmp = child->hmp;
	hammer2_trans_t *trans = info->trans;
	hammer2_blockref_t *base;
	int count;

	/*
	 * Inodes with stale children that have been converted to DIRECTDATA
	 * mode (typically file extension or hardlink conversion) need to be
	 * skipped right now, before we start messing with a non-existent
	 * block table.
	 */
#if 0
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
#if FLUSH_DEBUG
		kprintf("B");
#endif
		goto finalize;
	}
#endif

	/*
	 * Ignore children created after our flush point, treating them as
	 * if they did not exist.  These children will not cause the parent
	 * to be updated.
	 *
	 * When we encounter such children and the parent chain has not been
	 * deleted, delete/duplicated, or delete/duplicated-for-move, then
	 * the parent may be used to funnel through several flush points.
	 * We must re-set the SUBMODIFIED flag in the parent to ensure that
	 * those flushes have visibility.  A simple test of delete_tid
	 * suffices to determine if the parent spans beyond our current
	 * flush.
	 */
	if (child->modify_tid > trans->sync_tid) {
#if FLUSH_DEBUG
		kprintf("E");
#endif
		goto finalize;
	}

	/*
	 * Ignore children which have not changed.  The parent's block table
	 * is already correct.
	 */
	if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
#if FLUSH_DEBUG
		kprintf("D");
#endif
		goto finalize;
	}

	hammer2_chain_ref(child);
	spin_unlock(&above->cst.spin);

	/*
	 * The MOVED bit implies an additional reference which prevents
	 * the child from being destroyed out from under our operation,
	 * so we can lock the child safely.
	 *
	 * We can only update parents where child->parent matches.  The
	 * child->parent link will migrate along the chain but the flush
	 * order must be enforced absolutely.  Parents reflushed after the
	 * child has passed them by will be skipped due to the modify_tid
	 * test.
	 */
	hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);

	/*
	 * The parent's blockref to the child must be deleted or updated.
	 *
	 * This point is not reached on successful DESTROYED optimizations
	 * but can be reached on recursive deletions and restricted flushes.
	 *
	 * Because flushes are ordered we do not have to make a
	 * modify/duplicate of indirect blocks.  That is, the flush
	 * code does not have to kmalloc or duplicate anything.  We
	 * can adjust the indirect block table in-place and reuse the
	 * chain.  It IS possible that the chain has already been duplicated
	 * or may wind up being duplicated on-the-fly by modifying code
	 * on the frontend.  We simply use the original and ignore such
	 * chains.  However, it does mean we can't clear the MOVED bit.
	 *
	 * XXX recursive deletions not optimized.
	 */
	hammer2_chain_modify(trans, &parent,
			     HAMMER2_MODIFY_NO_MODIFY_TID |
			     HAMMER2_MODIFY_ASSERTNOCOPY);

	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we
		 * properly duplicate the inode headers and do proper flush
		 * range checks (all the children should be beyond the flush
		 * point).  For now just don't sync the non-applicable
		 * children.
		 *
		 * XXX Can also occur due to hardlink consolidation.  We
		 * set OPFLAG_DIRECTDATA to prevent the indirect and data
		 * blocks from syncing to the hardlink pointer.
		 */
#if 0
		KKASSERT((parent->data->ipdata.op_flags &
			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
#endif
#if 0
		if (parent->data->ipdata.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			base = NULL;
		} else
#endif
		{
			base = &parent->data->ipdata.u.blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
		}
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->data) {
			base = &parent->data->npdata[0];
		} else {
			base = NULL;
			KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->npdata[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		base = NULL;
		count = 0;
		panic("hammer2_chain_flush_scan2: "
		      "unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Update the parent's blockref table and propagate mirror_tid.
	 *
	 * NOTE! Children with modify_tid's beyond our flush point are
	 *	 considered to not exist for the purposes of updating the
	 *	 parent's blockref array.
	 *
	 * NOTE! Updates to a parent's blockref table do not adjust the
	 *	 parent's bref.modify_tid, only its bref.mirror_tid.
	 */
	KKASSERT(child->index >= 0);
	if (child->delete_tid <= trans->sync_tid) {
		if (base) {
			KKASSERT(child->index < count);
			bzero(&base[child->index], sizeof(child->bref));
		}
		if (info->mirror_tid < child->delete_tid)
			info->mirror_tid = child->delete_tid;
	} else {
		if (base) {
			KKASSERT(child->index < count);
			base[child->index] = child->bref;
		}
		if (info->mirror_tid < child->modify_tid)
			info->mirror_tid = child->modify_tid;
	}

	if (info->mirror_tid < child->bref.mirror_tid) {
		info->mirror_tid = child->bref.mirror_tid;
	}
	if ((parent->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
	     parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP) &&
	    hmp->voldata.mirror_tid < child->bref.mirror_tid) {
		hmp->voldata.mirror_tid = child->bref.mirror_tid;
	}

	/*
	 * When can we safely clear the MOVED flag?  Flushes down duplicate
	 * paths can occur out of order, for example if an inode is moved
	 * as part of a hardlink consolidation or if an inode is moved into
	 * an indirect block indexed before the inode.
	 *
	 * Only clear MOVED once all possible parents have been flushed.
	 */
	if (child->flags & HAMMER2_CHAIN_MOVED) {
		hammer2_chain_t *scan;
		int ok = 1;

		spin_lock(&above->cst.spin);
		for (scan = above->first_parent;
		     scan;
		     scan = scan->next_parent) {
			/*
			 * XXX weird code also checked at the top of scan2,
			 * I would like to fix this by detaching the core
			 * on initial hardlink consolidation (1->2 nlinks).
			 */
#if 0
			if (scan->bref.type == HAMMER2_BREF_TYPE_INODE &&
			    (scan->data->ipdata.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA)) {
				continue;
			}
#endif
			if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED) {
				ok = 0;
				break;
			}
		}
		spin_unlock(&above->cst.spin);
		if (ok) {
			atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(child);	/* flag */
		}
	}
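
	/*
	 * Editor's note: the mirror_tid updates above implement a simple
	 * monotonic maximum.  For example (hypothetical TIDs), a parent
	 * whose children were modified at TIDs 100 and 104 and had one
	 * child deleted at TID 102 collects info->mirror_tid = 104, which
	 * flush_core then folds into the parent's bref.mirror_tid on the
	 * way back up the recursion.
	 */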

	/*
	 * Unlock the child.  This can wind up dropping the child's
	 * last ref, removing it from the parent's RB tree, and deallocating
	 * the structure.  The RB_SCAN() our caller is doing handles the
	 * situation.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);
	spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
	kprintf("F");
#endif

	/*
	 * The parent cleared SUBMODIFIED prior to the scan.  If the child
	 * still requires a flush (possibly due to being outside the current
	 * synchronization zone), we must re-set SUBMODIFIED on the way back
	 * up.
	 */
finalize:
#if FLUSH_DEBUG
	kprintf("G child %p %08x\n", child, child->flags);
#endif
	return (0);
}