1 /* 2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <sys/cdefs.h> 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/types.h> 40 #include <sys/lock.h> 41 #include <sys/uuid.h> 42 43 #include "hammer2.h" 44 45 #define FLUSH_DEBUG 0 46 47 /* 48 * Recursively flush the specified chain. The chain is locked and 49 * referenced by the caller and will remain so on return. The chain 50 * will remain referenced throughout but can temporarily lose its 51 * lock during the recursion to avoid unnecessarily stalling user 52 * processes. 53 */ 54 struct hammer2_flush_info { 55 hammer2_chain_t *parent; 56 hammer2_trans_t *trans; 57 int depth; 58 int diddeferral; 59 int pass; 60 int cache_index; 61 int domodify; 62 struct h2_flush_deferral_list flush_list; 63 hammer2_tid_t sync_tid; /* flush synchronization point */ 64 }; 65 66 typedef struct hammer2_flush_info hammer2_flush_info_t; 67 68 static void hammer2_chain_flush_core(hammer2_flush_info_t *info, 69 hammer2_chain_t **chainp); 70 static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data); 71 static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data); 72 static void hammer2_rollup_stats(hammer2_chain_t *parent, 73 hammer2_chain_t *child, int how); 74 75 /* 76 * Can we ignore a chain for the purposes of flushing modifications 77 * to the media? 78 */ 79 static __inline 80 int 81 h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain) 82 { 83 return (chain->delete_tid <= info->sync_tid && 84 (chain->bref.type != HAMMER2_BREF_TYPE_INODE || 85 (chain->flags & HAMMER2_CHAIN_DESTROYED))); 86 } 87 88 #if 0 89 static __inline 90 void 91 hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref, 92 int how) 93 { 94 hammer2_key_t bytes; 95 96 if (bref->type != 0) { 97 bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX); 98 if (bref->type == HAMMER2_BREF_TYPE_INODE) 99 info->inode_count += how; 100 if (how < 0) 101 info->data_count -= bytes; 102 else 103 info->data_count += bytes; 104 } 105 } 106 #endif 107 108 /* 109 * Transaction support functions for writing to the filesystem. 110 * 111 * Initializing a new transaction allocates a transaction ID. Typically 112 * passed a pmp (hmp passed as NULL), indicating a cluster transaction. Can 113 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single 114 * media target. The latter mode is used by the recovery code. 115 * 116 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the 117 * other is a set of any number of concurrent filesystem operations. We 118 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops> 119 * or we can have <running_flush> + <concurrent_fs_ops>. 120 * 121 * During a flush, new fs_ops are only blocked until the fs_ops prior to 122 * the flush complete. The new fs_ops can then run concurrent with the flush. 123 * 124 * Buffer-cache transactions operate as fs_ops but never block. A 125 * buffer-cache flush will run either before or after the current pending 126 * flush depending on its state. 127 * 128 * sync_tid vs real_tid. For flush transactions ONLY, the flush operation 129 * actually uses two transaction ids, one for the flush operation itself, 130 * and <N+1> for any freemap allocations made as a side-effect. real_tid 131 * is fixed at <N>, sync_tid is adjusted dynamically as-needed. 132 * 133 * NOTE: The sync_tid for a flush's freemap allocation will match the 134 * sync_tid of the following <concurrent_fs_ops> transaction(s). 135 * The freemap topology will be out-of-step by one transaction id 136 * in order to give the flusher a stable freemap topology to flush 137 * out. This is fixed up at mount-time using a quick incremental 138 * scan. 139 */ 140 void 141 hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, 142 hammer2_mount_t *hmp, int flags) 143 { 144 hammer2_trans_t *head; 145 146 bzero(trans, sizeof(*trans)); 147 if (pmp) { 148 trans->pmp = pmp; 149 KKASSERT(hmp == NULL); 150 hmp = pmp->cluster.chains[0]->hmp; /* XXX */ 151 } else { 152 trans->hmp_single = hmp; 153 KKASSERT(hmp); 154 } 155 156 hammer2_voldata_lock(hmp); 157 trans->flags = flags; 158 trans->td = curthread; 159 /*trans->delete_gen = 0;*/ /* multiple deletions within trans */ 160 161 if (flags & HAMMER2_TRANS_ISFLUSH) { 162 /* 163 * If multiple flushes are trying to run we have to 164 * wait until it is our turn. All flushes are serialized. 165 * 166 * We queue ourselves and then wait to become the head 167 * of the queue, allowing all prior flushes to complete. 168 * 169 * A unique transaction id is required to avoid confusion 170 * when updating the block tables. 171 */ 172 ++hmp->flushcnt; 173 ++hmp->voldata.alloc_tid; 174 trans->sync_tid = hmp->voldata.alloc_tid; 175 trans->real_tid = trans->sync_tid; 176 ++hmp->voldata.alloc_tid; 177 TAILQ_INSERT_TAIL(&hmp->transq, trans, entry); 178 if (TAILQ_FIRST(&hmp->transq) != trans) { 179 trans->blocked = 1; 180 while (trans->blocked) { 181 lksleep(&trans->sync_tid, &hmp->voldatalk, 182 0, "h2multf", hz); 183 } 184 } 185 } else if (hmp->flushcnt == 0) { 186 /* 187 * No flushes are pending, we can go. 188 */ 189 TAILQ_INSERT_TAIL(&hmp->transq, trans, entry); 190 trans->sync_tid = hmp->voldata.alloc_tid; 191 trans->real_tid = trans->sync_tid; 192 193 /* XXX improve/optimize inode allocation */ 194 } else { 195 /* 196 * One or more flushes are pending. We insert after 197 * the current flush and may block. We have priority 198 * over any flushes that are not the current flush. 199 * 200 * TRANS_BUFCACHE transactions cannot block. 201 */ 202 TAILQ_FOREACH(head, &hmp->transq, entry) { 203 if (head->flags & HAMMER2_TRANS_ISFLUSH) 204 break; 205 } 206 KKASSERT(head); 207 TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry); 208 trans->sync_tid = head->real_tid + 1; 209 trans->real_tid = trans->sync_tid; 210 211 if ((trans->flags & HAMMER2_TRANS_BUFCACHE) == 0) { 212 if (TAILQ_FIRST(&hmp->transq) != head) { 213 trans->blocked = 1; 214 while (trans->blocked) { 215 lksleep(&trans->sync_tid, 216 &hmp->voldatalk, 0, 217 "h2multf", hz); 218 } 219 } 220 } 221 } 222 if (flags & HAMMER2_TRANS_NEWINODE) 223 trans->inode_tid = hmp->voldata.inode_tid++; 224 hammer2_voldata_unlock(hmp, 0); 225 } 226 227 void 228 hammer2_trans_done(hammer2_trans_t *trans) 229 { 230 hammer2_mount_t *hmp; 231 hammer2_trans_t *head; 232 hammer2_trans_t *scan; 233 234 if (trans->pmp) 235 hmp = trans->pmp->cluster.chains[0]->hmp; 236 else 237 hmp = trans->hmp_single; 238 239 /* 240 * Remove and adjust flushcnt 241 */ 242 hammer2_voldata_lock(hmp); 243 TAILQ_REMOVE(&hmp->transq, trans, entry); 244 if (trans->flags & HAMMER2_TRANS_ISFLUSH) 245 --hmp->flushcnt; 246 247 /* 248 * Unblock the head of the queue and any additional transactions 249 * up to the next flush. 250 */ 251 head = TAILQ_FIRST(&hmp->transq); 252 if (head && head->blocked) { 253 head->blocked = 0; 254 wakeup(&head->sync_tid); 255 256 scan = TAILQ_NEXT(head, entry); 257 while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) { 258 if (scan->blocked) { 259 scan->blocked = 0; 260 wakeup(&scan->sync_tid); 261 } 262 scan = TAILQ_NEXT(scan, entry); 263 } 264 } 265 hammer2_voldata_unlock(hmp, 0); 266 } 267 268 /* 269 * Flush the chain and all modified sub-chains through the specified 270 * synchronization point (sync_tid), propagating parent chain modifications 271 * and mirror_tid updates back up as needed. Since we are recursing downward 272 * we do not have to deal with the complexities of multi-homed chains (chains 273 * with multiple parents). 274 * 275 * Caller must have interlocked against any non-flush-related modifying 276 * operations in progress whos modify_tid values are less than or equal 277 * to the passed sync_tid. 278 * 279 * Caller must have already vetted synchronization points to ensure they 280 * are properly flushed. Only snapshots and cluster flushes can create 281 * these sorts of synchronization points. 282 * 283 * This routine can be called from several places but the most important 284 * is from the hammer2_vop_reclaim() function. We want to try to completely 285 * clean out the inode structure to prevent disconnected inodes from 286 * building up and blowing out the kmalloc pool. However, it is not actually 287 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery 288 * capability. 289 * 290 * chain is locked on call and will remain locked on return. If a flush 291 * occured, the chain's MOVED bit will be set indicating that its parent 292 * (which is not part of the flush) should be updated. The chain may be 293 * replaced by the call. 294 */ 295 void 296 hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp) 297 { 298 hammer2_chain_t *chain = *chainp; 299 hammer2_chain_t *scan; 300 hammer2_chain_core_t *core; 301 hammer2_flush_info_t info; 302 int loops; 303 304 /* 305 * Execute the recursive flush and handle deferrals. 306 * 307 * Chains can be ridiculously long (thousands deep), so to 308 * avoid blowing out the kernel stack the recursive flush has a 309 * depth limit. Elements at the limit are placed on a list 310 * for re-execution after the stack has been popped. 311 */ 312 bzero(&info, sizeof(info)); 313 TAILQ_INIT(&info.flush_list); 314 info.trans = trans; 315 info.sync_tid = trans->sync_tid; 316 info.cache_index = -1; 317 318 core = chain->core; 319 #if FLUSH_DEBUG 320 kprintf("CHAIN FLUSH trans %p.%016jx chain %p.%d mod %016jx upd %016jx\n", trans, trans->sync_tid, chain, chain->bref.type, chain->modify_tid, core->update_lo); 321 #endif 322 323 /* 324 * Extra ref needed because flush_core expects it when replacing 325 * chain. 326 */ 327 hammer2_chain_ref(chain); 328 loops = 0; 329 330 for (;;) { 331 /* 332 * Unwind deep recursions which had been deferred. This 333 * can leave MOVED set for these chains, which will be 334 * handled when we [re]flush chain after the unwind. 335 */ 336 while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) { 337 KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED); 338 TAILQ_REMOVE(&info.flush_list, scan, flush_node); 339 atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED); 340 341 /* 342 * Now that we've popped back up we can do a secondary 343 * recursion on the deferred elements. 344 * 345 * NOTE: hammer2_chain_flush() may replace scan. 346 */ 347 if (hammer2_debug & 0x0040) 348 kprintf("deferred flush %p\n", scan); 349 hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE); 350 hammer2_chain_drop(scan); /* ref from deferral */ 351 hammer2_chain_flush(trans, &scan); 352 hammer2_chain_unlock(scan); 353 } 354 355 /* 356 * [re]flush chain. 357 */ 358 info.diddeferral = 0; 359 hammer2_chain_flush_core(&info, &chain); 360 #if FLUSH_DEBUG 361 kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n", 362 chain, chain->bref.type, chain->flags); 363 #endif 364 365 /* 366 * Only loop if deep recursions have been deferred. 367 */ 368 if (TAILQ_EMPTY(&info.flush_list)) 369 break; 370 371 if (++loops % 1000 == 0) { 372 kprintf("hammer2_chain_flush: excessive loops on %p\n", 373 chain); 374 if (hammer2_debug & 0x100000) 375 Debugger("hell4"); 376 } 377 } 378 hammer2_chain_drop(chain); 379 *chainp = chain; 380 } 381 382 /* 383 * This is the core of the chain flushing code. The chain is locked by the 384 * caller and must also have an extra ref on it by the caller, and remains 385 * locked and will have an extra ref on return. 386 * 387 * If the flush accomplished any work chain will be flagged MOVED 388 * indicating a copy-on-write propagation back up is required. 389 * Deep sub-nodes may also have been entered onto the deferral list. 390 * MOVED is never set on the volume root. 391 * 392 * NOTE: modify_tid is different from MODIFIED. modify_tid is updated 393 * only when a chain is specifically modified, and not updated 394 * for copy-on-write propagations. MODIFIED is set on any modification 395 * including copy-on-write propagations. 396 * 397 * NOTE: We are responsible for updating chain->bref.mirror_tid and 398 * core->update_lo The caller is responsible for processing us into 399 * our parent (if any). 400 * 401 * We are also responsible for updating chain->core->update_lo to 402 * prevent repeated recursions due to deferrals. 403 */ 404 static void 405 hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp) 406 { 407 hammer2_chain_t *chain = *chainp; 408 hammer2_mount_t *hmp; 409 hammer2_blockref_t *bref; 410 hammer2_chain_core_t *core; 411 char *bdata; 412 hammer2_io_t *dio; 413 int error; 414 int diddeferral; 415 416 hmp = chain->hmp; 417 core = chain->core; 418 diddeferral = info->diddeferral; 419 420 #if FLUSH_DEBUG 421 if (info->parent) 422 kprintf("flush_core %p->%p.%d %08x (%s)\n", 423 info->parent, chain, chain->bref.type, 424 chain->flags, 425 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ? 426 (char *)chain->data->ipdata.filename : "?")); 427 else 428 kprintf("flush_core NULL->%p.%d %08x (%s)\n", 429 chain, chain->bref.type, 430 chain->flags, 431 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ? 432 (char *)chain->data->ipdata.filename : "?")); 433 kprintf("PUSH %p.%d %08x mod=%016jx del=%016jx mirror=%016jx (sync %016jx, update_lo %016jx)\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid, chain->bref.mirror_tid, info->sync_tid, core->update_lo); 434 #endif 435 436 /* 437 * Check if we even have any work to do. 438 * 439 * We do not update core->update_lo because there might be other 440 * paths to the core and we haven't actually checked it. 441 * 442 * This bit of code is capable of short-cutting entire sub-trees 443 * if they have not been touched. 444 */ 445 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 && 446 (core->update_lo >= info->sync_tid || 447 chain->bref.mirror_tid >= info->sync_tid || 448 chain->bref.mirror_tid >= core->update_hi)) { 449 KKASSERT(chain->modify_tid <= info->sync_tid); 450 /* don't update update_lo, there may be other paths to core */ 451 /* don't update bref.mirror_tid, scan2 is not called */ 452 return; 453 } 454 455 /* 456 * Ignore chains modified beyond the current flush point. These 457 * will be treated as if they did not exist. Subchains with lower 458 * modify_tid's will still be accessible via other parents. 459 * 460 * Do not update bref.mirror_tid here, it will interfere with 461 * synchronization. e.g. inode flush tid 1, concurrent D-D tid 2, 462 * then later on inode flush tid 2. If we were to set mirror_tid 463 * to 1 during inode flush tid 1 the blockrefs would only be partially 464 * updated (and likely panic). 465 * 466 * Do not update core->update_lo here, there might be other paths 467 * to the core and we haven't actually flushed it. 468 * 469 * (vchain and fchain are exceptions since they cannot be duplicated) 470 */ 471 if (chain->modify_tid > info->sync_tid && 472 chain != &hmp->fchain && chain != &hmp->vchain) { 473 chain->debug_reason = (chain->debug_reason & ~255) | 5; 474 /* do not update bref.mirror_tid, scan2 ignores chain */ 475 /* do not update core->update_lo, there may be another path */ 476 return; 477 } 478 479 retry: 480 /* 481 * Early handling of deleted chains is required to avoid double 482 * recursions. If the deleted chain has been duplicated then the 483 * flush will have visibility into chain->core via some other chain 484 * and we can safely terminate the operation right here. 485 * 486 * If the deleted chain has not been duplicated then the deletion 487 * is terminal and we must recurse to deal with any dirty chains 488 * under the deletion, including possibly flushing them out (e.g. 489 * open descriptor on an unlinked file). 490 */ 491 if (chain->delete_tid <= info->sync_tid && 492 (chain->flags & HAMMER2_CHAIN_DUPLICATED)) { 493 chain->debug_reason = (chain->debug_reason & ~255) | 9; 494 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 495 #if 0 496 /* 497 * XXX should be able to invalidate the buffer here. 498 * XXX problem if reused, snapshotted, or reactivated. 499 */ 500 if (chain->dio) { 501 hammer2_io_setinval(chain->dio, chain->bytes); 502 } 503 #endif 504 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 505 hammer2_chain_ref(chain); 506 atomic_set_int(&chain->flags, 507 HAMMER2_CHAIN_MOVED); 508 } 509 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 510 hammer2_chain_drop(chain); 511 } 512 513 /* 514 * Update mirror_tid, indicating that chain is synchronized 515 * on its modification and block table. This probably isn't 516 * needed since scan2 should ignore deleted chains anyway. 517 */ 518 if (chain->bref.mirror_tid < info->sync_tid) 519 chain->bref.mirror_tid = info->sync_tid; 520 /* do not update core->update_lo, there may be another path */ 521 return; 522 } 523 524 /* 525 * Recurse if we are not up-to-date. Once we are done we will 526 * update update_lo if there were no deferrals. update_lo can become 527 * higher than update_hi and is used to prevent re-recursions during 528 * the same flush cycle. 529 * 530 * update_hi was already checked and prevents initial recursions on 531 * subtrees which have not been modified. 532 * 533 * NOTE: We must recurse whether chain is flagged DELETED or not. 534 * However, if it is flagged DELETED we limit sync_tid to 535 * delete_tid to ensure that the chain's bref.mirror_tid is 536 * not fully updated and causes it to miss the non-DELETED 537 * path. 538 * 539 * NOTE: If a deferral occurs hammer2_chain_flush() will flush the 540 * deferred chain independently which will update it's 541 * bref.mirror_tid and prevent it from deferred again. 542 */ 543 if (chain->bref.mirror_tid < info->sync_tid && 544 chain->bref.mirror_tid < core->update_hi) { 545 hammer2_chain_t *saved_parent; 546 hammer2_chain_layer_t *layer; 547 int saved_domodify; 548 int save_gen; 549 550 /* 551 * Races will bump update_hi above trans->sync_tid causing 552 * us to catch the issue in a later flush. 553 * 554 * We don't want to set our chain to MODIFIED gratuitously. 555 * 556 * We need an extra ref on chain because we are going to 557 * release its lock temporarily in our child loop. 558 */ 559 560 /* 561 * Run two passes. The first pass handles MODIFIED and 562 * update_lo recursions while the second pass handles 563 * MOVED chains on the way back up. 564 * 565 * If the stack gets too deep we defer the chain. Since 566 * hammer2_chain_core's can be shared at multiple levels 567 * in the tree, we may encounter a chain that we had already 568 * deferred. We could undefer it but it will probably just 569 * defer again so it is best to leave it deferred. 570 * 571 * Scan1 is recursive. 572 * 573 * NOTE: The act of handling a modified/submodified chain can 574 * cause the MOVED Flag to be set. It can also be set 575 * via hammer2_chain_delete() and in other situations. 576 * 577 * NOTE: RB_SCAN() must be used instead of RB_FOREACH() 578 * because children can be physically removed during 579 * the scan. 580 * 581 * NOTE: We would normally not care about insertions except 582 * that some insertions might occur from the flush 583 * itself, so loop on generation number changes. 584 */ 585 saved_parent = info->parent; 586 saved_domodify = info->domodify; 587 info->parent = chain; 588 info->domodify = 0; 589 chain->debug_reason = (chain->debug_reason & ~255) | 6; 590 591 if (chain->flags & HAMMER2_CHAIN_DEFERRED) { 592 ++info->diddeferral; 593 } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) { 594 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) { 595 hammer2_chain_ref(chain); 596 TAILQ_INSERT_TAIL(&info->flush_list, 597 chain, flush_node); 598 atomic_set_int(&chain->flags, 599 HAMMER2_CHAIN_DEFERRED); 600 } 601 ++info->diddeferral; 602 } else { 603 spin_lock(&core->cst.spin); 604 KKASSERT(core->good == 0x1234 && core->sharecnt > 0); 605 do { 606 save_gen = core->generation; 607 TAILQ_FOREACH_REVERSE(layer, &core->layerq, 608 h2_layer_list, entry) { 609 ++layer->refs; 610 KKASSERT(layer->good == 0xABCD); 611 RB_SCAN(hammer2_chain_tree, 612 &layer->rbtree, 613 NULL, hammer2_chain_flush_scan1, 614 info); 615 --layer->refs; 616 } 617 } while (core->generation != save_gen); 618 spin_unlock(&core->cst.spin); 619 } 620 621 if (info->parent != chain) { 622 kprintf("ZZZ\n"); 623 hammer2_chain_drop(chain); 624 hammer2_chain_ref(info->parent); 625 } 626 chain = info->parent; 627 628 /* 629 * We unlock the parent during the scan1 recursion, parent 630 * may have been deleted out from under us. 631 * 632 * parent may have been destroyed out from under us 633 * 634 * parent may have been synchronously flushed due to aliasing 635 * via core (is this possible?). 636 */ 637 if (chain->delete_tid <= info->sync_tid && 638 (chain->flags & HAMMER2_CHAIN_DUPLICATED)) { 639 kprintf("xxx\n"); 640 goto retry; 641 } 642 if (chain->bref.mirror_tid >= info->sync_tid || 643 chain->bref.mirror_tid >= core->update_hi) { 644 kprintf("yyy\n"); 645 goto retry; 646 } 647 648 /* 649 * If any deferral occurred we must set domodify to 0 to avoid 650 * potentially modifying the parent twice (now and when we run 651 * the deferral list), as doing so could cause the blockref 652 * update to run on a block array which has already been 653 * updated. 654 */ 655 if (info->domodify && diddeferral != info->diddeferral) 656 info->domodify = 0; 657 658 /* 659 * We are responsible for setting the parent into a modified 660 * state before we scan the children to update the parent's 661 * block table. This must essentially be done as an atomic 662 * operation (the parent must remain locked throughout the 663 * operation), otherwise other transactions can squeeze a 664 * delete-duplicate in and create block table havoc. 665 * 666 * Care must be taken to not try to update the parent twice 667 * during the current flush cycle, which would cause more 668 * havoc. It's so important that we assert that we haven't 669 * double-flushed a parent below by testing modify_tid. 670 * 671 * NOTE: Blockrefs are only updated on live chains. 672 * 673 * NOTE: Modifying the parent generally causes a 674 * delete-duplicate to occur from within the flush 675 * itself, with an allocation from the freemap occuring 676 * as an additional side-effect. 677 * 678 * NOTE: If the parent was deleted our modified chain will 679 * also be marked deleted, but since it inherits the 680 * parent's delete_tid it will still appear to be 681 * 'live' for the purposes of the flush. 682 */ 683 if (info->domodify && !h2ignore_deleted(info, chain)) { 684 KKASSERT(chain->modify_tid < info->sync_tid); 685 686 /* 687 * The scan1 loop and/or flush_core is reentrant, 688 * particularly when core->generation changes. To 689 * avoid havoc we have to prevent repetitive 690 * delete-duplicates of the same chain. 691 * 692 * After executing the modify set the original chain's 693 * bref.mirror_tid to prevent any reentrancy during 694 * the current flush cycle. 695 */ 696 hammer2_chain_modify(info->trans, &info->parent, 697 HAMMER2_MODIFY_NO_MODIFY_TID); 698 if (info->parent != chain) { 699 if (chain->bref.mirror_tid < info->sync_tid) 700 chain->bref.mirror_tid = info->sync_tid; 701 hammer2_chain_drop(chain); 702 hammer2_chain_ref(info->parent); 703 } 704 chain = info->parent; 705 } 706 chain->debug_reason = (chain->debug_reason & ~255) | 7; 707 708 KKASSERT(chain == info->parent); 709 710 /* 711 * Handle successfully flushed children who are in the MOVED 712 * state on the way back up the recursion. This can have 713 * the side-effect of clearing MOVED. 714 * 715 * Scan2 may replace info->parent. If it does it will also 716 * replace the extra ref we made. 717 * 718 * Scan2 is non-recursive. 719 */ 720 if (diddeferral != info->diddeferral) { 721 spin_lock(&core->cst.spin); 722 } else { 723 KKASSERT(chain == info->parent); 724 KKASSERT(info->domodify == 0 || 725 (chain->flags & HAMMER2_CHAIN_FLUSHED) == 0); 726 atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSHED); 727 spin_lock(&core->cst.spin); 728 KKASSERT(core->good == 0x1234 && core->sharecnt > 0); 729 KKASSERT(info->parent->core == core); 730 TAILQ_FOREACH_REVERSE(layer, &core->layerq, 731 h2_layer_list, entry) { 732 info->pass = 1; 733 ++layer->refs; 734 KKASSERT(layer->good == 0xABCD); 735 RB_SCAN(hammer2_chain_tree, &layer->rbtree, 736 NULL, hammer2_chain_flush_scan2, info); 737 info->pass = 2; 738 RB_SCAN(hammer2_chain_tree, &layer->rbtree, 739 NULL, hammer2_chain_flush_scan2, info); 740 --layer->refs; 741 } 742 743 /* 744 * chain is now considered up-to-date, adjust 745 * bref.mirror_tid and update_lo before running 746 * pass3. 747 * 748 * (no deferral in this path) 749 */ 750 if (core->update_lo < info->sync_tid) 751 core->update_lo = info->sync_tid; 752 753 TAILQ_FOREACH_REVERSE(layer, &core->layerq, 754 h2_layer_list, entry) { 755 info->pass = 3; 756 ++layer->refs; 757 KKASSERT(layer->good == 0xABCD); 758 RB_SCAN(hammer2_chain_tree, &layer->rbtree, 759 NULL, hammer2_chain_flush_scan2, info); 760 --layer->refs; 761 KKASSERT(info->parent->core == core); 762 } 763 } 764 765 /* 766 * info->parent must not have been replaced again 767 */ 768 KKASSERT(info->parent == chain); 769 770 chain->debug_reason = (chain->debug_reason & ~255) | 8; 771 *chainp = chain; 772 773 hammer2_chain_layer_check_locked(chain->hmp, core); 774 spin_unlock(&core->cst.spin); 775 776 info->parent = saved_parent; 777 info->domodify = saved_domodify; 778 KKASSERT(chain->refs > 1); 779 } else { 780 /* 781 * There is no deferral in this path. Chain is now 782 * considered up-to-date. 783 * 784 * Adjust update_lo now and bref.mirror_tid will be 785 * updated a bit later on the fall-through. 786 */ 787 if (core->update_lo < info->sync_tid) 788 core->update_lo = info->sync_tid; 789 } 790 791 #if FLUSH_DEBUG 792 kprintf("POP %p.%d defer=%d\n", chain, chain->bref.type, diddeferral); 793 #endif 794 795 /* 796 * Do not flush chain if there were any deferrals. It will be 797 * retried later after the deferrals are independently handled. 798 * Do not update update_lo or bref.mirror_tid. 799 */ 800 if (diddeferral != info->diddeferral) { 801 chain->debug_reason = (chain->debug_reason & ~255) | 99; 802 if (hammer2_debug & 0x0008) { 803 kprintf("%*.*s} %p/%d %04x (deferred)", 804 info->depth, info->depth, "", 805 chain, chain->refs, chain->flags); 806 } 807 /* do not update core->update_lo */ 808 /* do not update bref.mirror_tid */ 809 return; 810 } 811 812 /* 813 * Non-deferral path, chain is now deterministically being flushed. 814 * We've finished running the recursion and the blockref update. 815 * 816 * update bref.mirror_tid. update_lo has already been updated. 817 */ 818 if (chain->bref.mirror_tid < info->sync_tid) 819 chain->bref.mirror_tid = info->sync_tid; 820 821 /* 822 * Deal with deleted and destroyed chains on the way back up. 823 * 824 * Deleted inodes may still be active due to open descriptors so 825 * test whether the inode has been DESTROYED (aka deactivated after 826 * being unlinked) or not. 827 * 828 * Otherwise a delted chain can be optimized by clearing MODIFIED 829 * without bothering to write it out. 830 * 831 * NOTE: We optimize this by noting that only 'inode' chains require 832 * this treatment. When a file with an open descriptor is 833 * deleted only its inode is marked deleted. Other deletions, 834 * such as indirect block deletions, will no longer be visible 835 * to the live filesystem and do not need to be updated. 836 */ 837 if (h2ignore_deleted(info, chain)) { 838 /* 839 * At the moment we unconditionally set the MOVED bit because 840 * there are situations where it might not have been set due 841 * to similar delete-destroyed optimizations, and the parent 842 * of the parent still may need to be notified of the deletion. 843 */ 844 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 845 hammer2_chain_ref(chain); 846 atomic_set_int(&chain->flags, 847 HAMMER2_CHAIN_MOVED); 848 } 849 chain->debug_reason = (chain->debug_reason & ~255) | 9; 850 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 851 #if 0 852 /* 853 * XXX should be able to invalidate the buffer here. 854 * XXX problem if reused, snapshotted, or reactivated. 855 */ 856 if (chain->dio) { 857 hammer2_io_setinval(chain->dio, chain->bytes); 858 } 859 #endif 860 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 861 hammer2_chain_drop(chain); 862 } 863 return; 864 } 865 866 /* 867 * A degenerate flush might not have flushed anything and thus not 868 * processed modified blocks on the way back up. Detect the case. 869 * 870 * This case can occur when modifications cross flush boundaries 871 * and cause the submodified recursion to run up multiple parents (?). 872 */ 873 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 874 kprintf("chain %p.%d %08x recursed but wasn't " 875 "modified mirr=%016jx " 876 "update_lo=%016jx synctid=%016jx\n", 877 chain, chain->bref.type, chain->flags, 878 chain->bref.mirror_tid, 879 core->update_lo, info->sync_tid); 880 #if 0 881 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 882 hammer2_chain_ref(chain); 883 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 884 } 885 #endif 886 chain->debug_reason = (chain->debug_reason & ~255) | 10; 887 return; 888 } 889 890 chain->debug_reason = (chain->debug_reason & ~255) | 11; 891 892 /* 893 * Issue flush. 894 * 895 * A DESTROYED node that reaches this point must be flushed for 896 * synchronization point consistency. 897 * 898 * Update bref.mirror_tid, clear MODIFIED, and set MOVED. 899 * 900 * The caller will update the parent's reference to this chain 901 * by testing MOVED as long as the modification was in-bounds. 902 * 903 * MOVED is never set on the volume root as there is no parent 904 * to adjust. 905 */ 906 if (hammer2_debug & 0x1000) { 907 kprintf("Flush %p.%d %016jx/%d sync_tid=%016jx data=%016jx\n", 908 chain, chain->bref.type, 909 chain->bref.key, chain->bref.keybits, 910 info->sync_tid, chain->bref.data_off); 911 } 912 if (hammer2_debug & 0x2000) { 913 Debugger("Flush hell"); 914 } 915 916 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 917 918 if ((chain->flags & HAMMER2_CHAIN_MOVED) || 919 chain == &hmp->vchain || 920 chain == &hmp->fchain) { 921 /* 922 * Drop the ref from the MODIFIED bit we cleared, 923 * net -1 ref. 924 */ 925 hammer2_chain_drop(chain); 926 } else { 927 /* 928 * Drop the ref from the MODIFIED bit we cleared and 929 * set a ref for the MOVED bit we are setting. Net 0 refs. 930 */ 931 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 932 } 933 934 /* 935 * If this is part of a recursive flush we can go ahead and write 936 * out the buffer cache buffer and pass a new bref back up the chain 937 * via the MOVED bit. 938 * 939 * Volume headers are NOT flushed here as they require special 940 * processing. 941 */ 942 switch(chain->bref.type) { 943 case HAMMER2_BREF_TYPE_FREEMAP: 944 hammer2_modify_volume(hmp); 945 hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid; 946 break; 947 case HAMMER2_BREF_TYPE_VOLUME: 948 /* 949 * The free block table is flushed by hammer2_vfs_sync() 950 * before it flushes vchain. We must still hold fchain 951 * locked while copying voldata to volsync, however. 952 */ 953 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); 954 #if 0 955 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) || 956 hmp->voldata.freemap_tid < info->trans->sync_tid) { 957 /* this will modify vchain as a side effect */ 958 hammer2_chain_t *tmp = &hmp->fchain; 959 hammer2_chain_flush(info->trans, &tmp); 960 KKASSERT(tmp == &hmp->fchain); 961 } 962 #endif 963 964 /* 965 * There is no parent to our root vchain and fchain to 966 * synchronize the bref to, their updated mirror_tid's 967 * must be synchronized to the volume header. 968 */ 969 hmp->voldata.mirror_tid = chain->bref.mirror_tid; 970 /*hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;*/ 971 972 /* 973 * The volume header is flushed manually by the syncer, not 974 * here. All we do here is adjust the crc's. 975 */ 976 KKASSERT(chain->data != NULL); 977 KKASSERT(chain->dio == NULL); 978 979 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]= 980 hammer2_icrc32( 981 (char *)&hmp->voldata + 982 HAMMER2_VOLUME_ICRC1_OFF, 983 HAMMER2_VOLUME_ICRC1_SIZE); 984 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]= 985 hammer2_icrc32( 986 (char *)&hmp->voldata + 987 HAMMER2_VOLUME_ICRC0_OFF, 988 HAMMER2_VOLUME_ICRC0_SIZE); 989 hmp->voldata.icrc_volheader = 990 hammer2_icrc32( 991 (char *)&hmp->voldata + 992 HAMMER2_VOLUME_ICRCVH_OFF, 993 HAMMER2_VOLUME_ICRCVH_SIZE); 994 hmp->volsync = hmp->voldata; 995 atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC); 996 hammer2_chain_unlock(&hmp->fchain); 997 break; 998 case HAMMER2_BREF_TYPE_DATA: 999 /* 1000 * Data elements have already been flushed via the logical 1001 * file buffer cache. Their hash was set in the bref by 1002 * the vop_write code. 1003 * 1004 * Make sure any device buffer(s) have been flushed out here. 1005 * (there aren't usually any to flush). 1006 */ 1007 #if 0 1008 /* XXX */ 1009 /* chain and chain->bref, NOWAIT operation */ 1010 #endif 1011 break; 1012 #if 0 1013 case HAMMER2_BREF_TYPE_INDIRECT: 1014 /* 1015 * Indirect blocks may be in an INITIAL state. Use the 1016 * chain_lock() call to ensure that the buffer has been 1017 * instantiated (even though it is already locked the buffer 1018 * might not have been instantiated). 1019 * 1020 * Only write the buffer out if it is dirty, it is possible 1021 * the operating system had already written out the buffer. 1022 */ 1023 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 1024 KKASSERT(chain->dio != NULL); 1025 1026 chain->data = NULL; 1027 hammer2_io_bqrelse(&chain->dio); 1028 hammer2_chain_unlock(chain); 1029 break; 1030 #endif 1031 case HAMMER2_BREF_TYPE_INDIRECT: 1032 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1033 /* 1034 * Device-backed. Buffer will be flushed by the sync 1035 * code XXX. 1036 */ 1037 KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 1038 break; 1039 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1040 default: 1041 /* 1042 * Embedded elements have to be flushed out. 1043 * (Basically just BREF_TYPE_INODE). 1044 */ 1045 KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED); 1046 KKASSERT(chain->data != NULL); 1047 KKASSERT(chain->dio == NULL); 1048 bref = &chain->bref; 1049 1050 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0); 1051 KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) == 1052 HAMMER2_CHECK_ISCSI32 || 1053 HAMMER2_DEC_CHECK(chain->bref.methods) == 1054 HAMMER2_CHECK_FREEMAP); 1055 1056 /* 1057 * The data is embedded, we have to acquire the 1058 * buffer cache buffer and copy the data into it. 1059 */ 1060 error = hammer2_io_bread(hmp, bref->data_off, chain->bytes, 1061 &dio); 1062 KKASSERT(error == 0); 1063 bdata = hammer2_io_data(dio, bref->data_off); 1064 1065 /* 1066 * Copy the data to the buffer, mark the buffer 1067 * dirty, and convert the chain to unmodified. 1068 */ 1069 bcopy(chain->data, bdata, chain->bytes); 1070 hammer2_io_bdwrite(&dio); 1071 1072 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 1073 case HAMMER2_CHECK_FREEMAP: 1074 chain->bref.check.freemap.icrc32 = 1075 hammer2_icrc32(chain->data, chain->bytes); 1076 break; 1077 case HAMMER2_CHECK_ISCSI32: 1078 chain->bref.check.iscsi32.value = 1079 hammer2_icrc32(chain->data, chain->bytes); 1080 break; 1081 default: 1082 panic("hammer2_flush_core: bad crc type"); 1083 break; /* NOT REACHED */ 1084 } 1085 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) 1086 ++hammer2_iod_meta_write; 1087 else 1088 ++hammer2_iod_indr_write; 1089 } 1090 } 1091 1092 /* 1093 * Flush helper scan1 (recursive) 1094 * 1095 * Flushes the children of the caller's chain (parent) and updates 1096 * the blockref, restricted by sync_tid. 1097 * 1098 * Ripouts during the loop should not cause any problems. Because we are 1099 * flushing to a synchronization point, modification races will occur after 1100 * sync_tid and do not have to be flushed anyway. 1101 * 1102 * It is also ok if the parent is chain_duplicate()'d while unlocked because 1103 * the delete/duplication will install a delete_tid that is still larger than 1104 * our current sync_tid. 1105 * 1106 * WARNING! If we do not call chain_flush_core we must update bref.mirror_tid 1107 * ourselves. 1108 */ 1109 static int 1110 hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data) 1111 { 1112 hammer2_flush_info_t *info = data; 1113 hammer2_trans_t *trans = info->trans; 1114 hammer2_chain_t *parent = info->parent; 1115 int diddeferral; 1116 1117 if (hammer2_debug & 0x80000) 1118 Debugger("hell3"); 1119 diddeferral = info->diddeferral; 1120 1121 /* 1122 * Child is beyond the flush synchronization zone, don't persue. 1123 * Remember that modifications generally delete-duplicate so if the 1124 * sub-tree is dirty another child will get us there. But not this 1125 * one. 1126 * 1127 * Or MODIFIED is not set and child is already fully synchronized 1128 * with its sub-tree. Don't persue. 1129 * 1130 * (child can never be fchain or vchain so a special check isn't 1131 * needed). 1132 */ 1133 if (child->modify_tid > trans->sync_tid) { 1134 KKASSERT(child->delete_tid >= child->modify_tid); 1135 child->debug_reason = (child->debug_reason & ~255) | 1; 1136 /* do not update child->core->update_lo, core not flushed */ 1137 /* do not update core->update_lo, there may be another path */ 1138 /* do not update mirror_tid, scan2 will ignore chain */ 1139 return (0); 1140 } 1141 1142 /* 1143 * We must ref the child before unlocking the spinlock. 1144 * 1145 * The caller has added a ref to the parent so we can temporarily 1146 * unlock it in order to lock the child. 1147 */ 1148 hammer2_chain_ref(child); 1149 spin_unlock(&parent->core->cst.spin); 1150 1151 hammer2_chain_unlock(parent); 1152 hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE); 1153 1154 #if 0 1155 /* 1156 * This isn't working atm, it seems to be causing necessary 1157 * updates to be thrown away, probably due to aliasing, resulting 1158 * base_insert/base_delete panics. 1159 */ 1160 /* 1161 * The DESTROYED flag can only be initially set on an unreferenced 1162 * deleted inode and will propagate downward via the mechanic below. 1163 * Such inode chains have been deleted for good and should no longer 1164 * be subject to delete/duplication. 1165 * 1166 * This optimization allows the inode reclaim (destroy unlinked file 1167 * on vnode reclamation after last close) to be flagged by just 1168 * setting HAMMER2_CHAIN_DESTROYED at the top level and then will 1169 * cause the chains to be terminated and related buffers to be 1170 * invalidated and not flushed out. 1171 * 1172 * We have to be careful not to propagate the DESTROYED flag if 1173 * the destruction occurred after our flush sync_tid. 1174 */ 1175 if (parent->delete_tid <= trans->sync_tid && 1176 (parent->flags & HAMMER2_CHAIN_DESTROYED) && 1177 (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) { 1178 /* 1179 * Force downward recursion by bringing update_hi up to 1180 * at least sync_tid, and setting the DESTROYED flag. 1181 * Parent's mirror_tid has not yet been updated. 1182 * 1183 * We do not mark the child DELETED because this would 1184 * cause unnecessary modifications/updates. Instead, the 1185 * DESTROYED flag propagates downward and causes the flush 1186 * to ignore any pre-existing modified chains. 1187 * 1188 * Vnode reclamation may have forced update_hi to MAX_TID 1189 * (we do this because there was no transaction at the time). 1190 * In this situation bring it down to something reasonable 1191 * so the elements being destroyed can be retired. 1192 */ 1193 atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED); 1194 spin_lock(&child->core->cst.spin); 1195 if (child->core->update_hi < trans->sync_tid) 1196 child->core->update_hi = trans->sync_tid; 1197 spin_unlock(&child->core->cst.spin); 1198 } 1199 #endif 1200 1201 /* 1202 * No recursion needed if neither the child or anything under it 1203 * was messed with. 1204 */ 1205 if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 && 1206 child->core->update_lo >= info->sync_tid) { 1207 child->debug_reason = (child->debug_reason & ~255) | 2; 1208 if (child->bref.mirror_tid < info->sync_tid) 1209 child->bref.mirror_tid = info->sync_tid; 1210 goto skip; 1211 } 1212 1213 /* 1214 * Re-check original pre-lock conditions after locking. 1215 */ 1216 if (child->modify_tid > trans->sync_tid) { 1217 child->debug_reason = (child->debug_reason & ~255) | 3; 1218 hammer2_chain_unlock(child); 1219 hammer2_chain_drop(child); 1220 hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE); 1221 spin_lock(&parent->core->cst.spin); 1222 return (0); 1223 } 1224 1225 if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 && 1226 child->core->update_lo >= info->sync_tid) { 1227 child->debug_reason = (child->debug_reason & ~255) | 4; 1228 if (child->bref.mirror_tid < info->sync_tid) 1229 child->bref.mirror_tid = info->sync_tid; 1230 goto skip; 1231 } 1232 1233 /* 1234 * Recurse and collect deferral data. 1235 */ 1236 ++info->depth; 1237 hammer2_chain_flush_core(info, &child); 1238 --info->depth; 1239 1240 skip: 1241 /* 1242 * Check the conditions that could cause SCAN2 to modify the parent. 1243 * Modify the parent here instead of in SCAN2, which would cause 1244 * rollup chicken-and-egg races. 1245 * 1246 * Scan2 is expected to update bref.mirror_tid in the domodify case, 1247 * but will skip the child otherwise giving us the responsibility to 1248 * update bref.mirror_tid. 1249 * 1250 * WARNING! Do NOT update the child's bref.mirror_tid right here, 1251 * even if there was no deferral. Doing so would cause 1252 * confusion with the child's block array state in a 1253 * future flush. 1254 */ 1255 if (h2ignore_deleted(info, parent)) { 1256 /* 1257 * Special optimization matching similar tests done in 1258 * flush_core, scan1, and scan2. Avoid updating the block 1259 * table in the parent if the parent is no longer visible. 1260 * A deleted parent is no longer visible unless it's an 1261 * inode (in which case it might have an open fd).. the 1262 * DESTROYED flag must also be checked for inodes. 1263 */ 1264 ; 1265 } else if (child->delete_tid <= trans->sync_tid && 1266 child->delete_tid > parent->bref.mirror_tid && 1267 child->modify_tid <= parent->bref.mirror_tid) { 1268 info->domodify = 1; 1269 } else if (child->delete_tid > trans->sync_tid && 1270 child->modify_tid > parent->bref.mirror_tid) { 1271 info->domodify = 1; /* base insertion */ 1272 } 1273 1274 /* 1275 * Relock to continue the loop 1276 */ 1277 hammer2_chain_unlock(child); 1278 hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE); 1279 hammer2_chain_drop(child); 1280 KKASSERT(info->parent == parent); 1281 1282 spin_lock(&parent->core->cst.spin); 1283 return (0); 1284 } 1285 1286 /* 1287 * Flush helper scan2 (non-recursive) 1288 * 1289 * This pass on a chain's children propagates any MOVED or DELETED 1290 * elements back up the chain towards the root after those elements have 1291 * been fully flushed. Unlike scan1, this function is NOT recursive and 1292 * the parent remains locked across the entire scan. 1293 * 1294 * SCAN2 is called twice, once with pass set to 1 and once with it set to 2. 1295 * We have to do this so base[] elements can be deleted in pass 1 to make 1296 * room for adding new elements in pass 2. 1297 * 1298 * This function also rolls up storage statistics. 1299 * 1300 * NOTE! A deletion is a visbility issue, there can still be references to 1301 * deleted elements (for example, to an unlinked file which is still 1302 * open), and there can also be multiple chains pointing to the same 1303 * bref where some are deleted and some are not (for example due to 1304 * a rename). So a chain marked for deletion is basically considered 1305 * to be live until it is explicitly destroyed or until its ref-count 1306 * reaches zero (also implying that MOVED and MODIFIED are clear). 1307 * 1308 * NOTE! Info->parent will be locked but will only be instantiated/modified 1309 * if it is either MODIFIED or if scan1 determined that block table 1310 * updates will occur. 1311 * 1312 * NOTE! SCAN2 is responsible for updating child->bref.mirror_tid only in 1313 * the case where it modifies the parent (does a base insertion 1314 * or deletion). SCAN1 handled all other cases. 1315 */ 1316 static int 1317 hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data) 1318 { 1319 hammer2_flush_info_t *info = data; 1320 hammer2_chain_t *parent = info->parent; 1321 hammer2_chain_core_t *above = child->above; 1322 hammer2_mount_t *hmp = child->hmp; 1323 hammer2_trans_t *trans = info->trans; 1324 hammer2_blockref_t *base; 1325 int count; 1326 int ok; 1327 1328 #if FLUSH_DEBUG 1329 kprintf("SCAN2 %p.%d %08x mod=%016jx del=%016jx trans=%016jx\n", child, child->bref.type, child->flags, child->modify_tid, child->delete_tid, info->trans->sync_tid); 1330 #endif 1331 /* 1332 * Ignore children created after our flush point, treating them as 1333 * if they did not exist). These children will not cause the parent 1334 * to be updated. 1335 * 1336 * Children deleted after our flush point are treated as having been 1337 * created for the purposes of the flush. The parent's update_hi 1338 * will already be higher than our trans->sync_tid so the path for 1339 * the next flush is left intact. 1340 * 1341 * When we encounter such children and the parent chain has not been 1342 * deleted, delete/duplicated, or delete/duplicated-for-move, then 1343 * the parent may be used to funnel through several flush points. 1344 * These chains will still be visible to later flushes due to having 1345 * a higher update_hi than we can set in the current flush. 1346 */ 1347 if (child->modify_tid > trans->sync_tid) { 1348 KKASSERT(child->delete_tid >= child->modify_tid); 1349 goto finalize; 1350 } 1351 1352 #if 0 1353 /* 1354 * Ignore children which have not changed. The parent's block table 1355 * is already correct. 1356 * 1357 * XXX The MOVED bit is only cleared when all multi-homed parents 1358 * have flushed, creating a situation where a re-flush can occur 1359 * via a parent which has already flushed. The hammer2_base_*() 1360 * functions currently have a hack to deal with this case but 1361 * we need something better. 1362 */ 1363 if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) { 1364 KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0); 1365 goto finalize; 1366 } 1367 #endif 1368 1369 /* 1370 * Make sure child is referenced before we unlock. 1371 */ 1372 hammer2_chain_ref(child); 1373 spin_unlock(&above->cst.spin); 1374 1375 /* 1376 * Parent reflushed after the child has passed them by should skip 1377 * due to the modify_tid test. XXX 1378 */ 1379 hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER); 1380 KKASSERT(child->above == above); 1381 KKASSERT(parent->core == above); 1382 1383 /* 1384 * The parent's blockref to the child must be deleted or updated. 1385 * 1386 * This point is not reached on successful DESTROYED optimizations 1387 * but can be reached on recursive deletions and restricted flushes. 1388 * 1389 * The chain_modify here may delete-duplicate the block. This can 1390 * cause a multitude of issues if the block was already modified 1391 * by a later (post-flush) transaction. Primarily blockrefs in 1392 * the later block can be out-of-date, so if the situation occurs 1393 * we can't throw away the MOVED bit on the current blocks until 1394 * the later blocks are flushed (so as to be able to regenerate all 1395 * the changes that were made). 1396 * 1397 * Because flushes are ordered we do not have to make a 1398 * modify/duplicate of indirect blocks. That is, the flush 1399 * code does not have to kmalloc or duplicate anything. We 1400 * can adjust the indirect block table in-place and reuse the 1401 * chain. It IS possible that the chain has already been duplicated 1402 * or may wind up being duplicated on-the-fly by modifying code 1403 * on the frontend. We simply use the original and ignore such 1404 * chains. However, it does mean we can't clear the MOVED bit. 1405 * 1406 * XXX recursive deletions not optimized. 1407 */ 1408 1409 switch(parent->bref.type) { 1410 case HAMMER2_BREF_TYPE_INODE: 1411 /* 1412 * Access the inode's block array. However, there is no 1413 * block array if the inode is flagged DIRECTDATA. The 1414 * DIRECTDATA case typicaly only occurs when a hardlink has 1415 * been shifted up the tree and the original inode gets 1416 * replaced with an OBJTYPE_HARDLINK placeholding inode. 1417 */ 1418 if (parent->data && 1419 (parent->data->ipdata.op_flags & 1420 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1421 base = &parent->data->ipdata.u.blockset.blockref[0]; 1422 } else { 1423 base = NULL; 1424 } 1425 count = HAMMER2_SET_COUNT; 1426 break; 1427 case HAMMER2_BREF_TYPE_INDIRECT: 1428 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1429 if (parent->data) 1430 base = &parent->data->npdata[0]; 1431 else 1432 base = NULL; 1433 count = parent->bytes / sizeof(hammer2_blockref_t); 1434 break; 1435 case HAMMER2_BREF_TYPE_VOLUME: 1436 base = &hmp->voldata.sroot_blockset.blockref[0]; 1437 count = HAMMER2_SET_COUNT; 1438 break; 1439 case HAMMER2_BREF_TYPE_FREEMAP: 1440 base = &parent->data->npdata[0]; 1441 count = HAMMER2_SET_COUNT; 1442 break; 1443 default: 1444 base = NULL; 1445 count = 0; 1446 panic("hammer2_chain_flush_scan2: " 1447 "unrecognized blockref type: %d", 1448 parent->bref.type); 1449 } 1450 1451 /* 1452 * Don't bother updating a deleted + destroyed parent's blockrefs. 1453 * We MUST update deleted + non-destroyed parent's blockrefs since 1454 * they could represent an open file. 1455 * 1456 * Otherwise, we need to be COUNTEDBREFS synchronized for the 1457 * hammer2_base_*() functions. 1458 * 1459 * NOTE: We optimize this by noting that only 'inode' chains require 1460 * this treatment. When a file with an open descriptor is 1461 * deleted only its inode is marked deleted. Other deletions, 1462 * such as indirect block deletions, will no longer be visible 1463 * to the live filesystem and do not need to be updated. 1464 * 1465 * rm -rf's generally wind up setting DESTROYED on the way down 1466 * and the result is typically that no disk I/O is needed at all 1467 * when rm -rf'ing an entire directory topology. 1468 * 1469 * This test must match the similar one in flush_core. 1470 */ 1471 #if FLUSH_DEBUG 1472 kprintf("SCAN2 base=%p pass=%d PARENT %p.%d DTID=%016jx SYNC=%016jx\n", 1473 base, 1474 info->pass, parent, parent->bref.type, parent->delete_tid, trans->sync_tid); 1475 #endif 1476 if (h2ignore_deleted(info, parent)) 1477 base = NULL; 1478 1479 /* 1480 * Update the parent's blockref table and propagate mirror_tid. 1481 * 1482 * NOTE! Children with modify_tid's beyond our flush point are 1483 * considered to not exist for the purposes of updating the 1484 * parent's blockref array. 1485 * 1486 * NOTE! SCAN1 has already put the parent in a modified state 1487 * so if it isn't we panic. 1488 * 1489 * NOTE! chain->modify_tid vs chain->bref.modify_tid. The chain's 1490 * internal modify_tid is always updated based on creation 1491 * or delete-duplicate. However, the bref.modify_tid is NOT 1492 * updated due to simple blockref updates. 1493 */ 1494 #if FLUSH_DEBUG 1495 kprintf("chain %p->%p pass %d trans %016jx sync %p.%d %016jx/%d C=%016jx D=%016jx PMIRROR %016jx\n", 1496 parent, child, 1497 info->pass, trans->sync_tid, 1498 child, child->bref.type, 1499 child->bref.key, child->bref.keybits, 1500 child->modify_tid, child->delete_tid, parent->bref.mirror_tid); 1501 #endif 1502 1503 if (info->pass == 1 && child->delete_tid <= trans->sync_tid) { 1504 /* 1505 * Deleting. The block array is expected to contain the 1506 * child's entry if: 1507 * 1508 * (1) The deletion occurred after the parent's block table 1509 * was last synchronized (delete_tid), and 1510 * 1511 * (2) The creation occurred before or during the parent's 1512 * last block table synchronization. 1513 */ 1514 #if FLUSH_DEBUG 1515 kprintf("S2A %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n", 1516 child, child->bref.type, 1517 base, child->delete_tid, parent->bref.mirror_tid, 1518 child->modify_tid, parent->bref.mirror_tid); 1519 #endif 1520 if (base && 1521 child->delete_tid > parent->bref.mirror_tid && 1522 child->modify_tid <= parent->bref.mirror_tid) { 1523 KKASSERT(child->flags & HAMMER2_CHAIN_MOVED); 1524 KKASSERT(parent->modify_tid == trans->sync_tid || 1525 (parent == &hmp->vchain || 1526 parent == &hmp->fchain)); 1527 hammer2_rollup_stats(parent, child, -1); 1528 spin_lock(&above->cst.spin); 1529 #if FLUSH_DEBUG 1530 kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx " 1531 "flg=%08x %016jx/%d delete\n", 1532 trans->sync_tid, 1533 parent, parent->bref.type, 1534 child, child->bref.type, 1535 child->modify_tid, child->delete_tid, 1536 child->flags, 1537 child->bref.key, child->bref.keybits); 1538 #endif 1539 hammer2_base_delete(trans, parent, base, count, 1540 &info->cache_index, child); 1541 spin_unlock(&above->cst.spin); 1542 } 1543 } else if (info->pass == 2 && child->delete_tid > trans->sync_tid) { 1544 /* 1545 * Inserting. The block array is expected to NOT contain 1546 * the child's entry if: 1547 * 1548 * (1) The creation occurred after the parent's block table 1549 * was last synchronized (modify_tid), and 1550 * 1551 * (2) The child is not being deleted in the same 1552 * transaction. 1553 */ 1554 #if FLUSH_DEBUG 1555 kprintf("S2B %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n", 1556 child, child->bref.type, 1557 base, child->delete_tid, parent->bref.mirror_tid, 1558 child->modify_tid, parent->bref.mirror_tid); 1559 #endif 1560 if (base && 1561 child->modify_tid > parent->bref.mirror_tid) { 1562 KKASSERT(child->flags & HAMMER2_CHAIN_MOVED); 1563 KKASSERT(parent->modify_tid == trans->sync_tid || 1564 (parent == &hmp->vchain || 1565 parent == &hmp->fchain)); 1566 hammer2_rollup_stats(parent, child, 1); 1567 spin_lock(&above->cst.spin); 1568 #if FLUSH_DEBUG 1569 kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx " 1570 "flg=%08x %016jx/%d insert\n", 1571 trans->sync_tid, 1572 parent, parent->bref.type, 1573 child, child->bref.type, 1574 child->modify_tid, child->delete_tid, 1575 child->flags, 1576 child->bref.key, child->bref.keybits); 1577 #endif 1578 hammer2_base_insert(trans, parent, base, count, 1579 &info->cache_index, child); 1580 spin_unlock(&above->cst.spin); 1581 } 1582 } else if (info->pass == 3 && 1583 (child->delete_tid == HAMMER2_MAX_TID || 1584 child->delete_tid <= trans->sync_tid) && 1585 (child->flags & HAMMER2_CHAIN_MOVED)) { 1586 /* 1587 * We can't clear the MOVED bit on children whos modify_tid 1588 * is beyond our current trans (was tested at top of scan2), 1589 * or on deleted children which have not yet been flushed 1590 * (handled above). 1591 * 1592 * Scan all parents of this child and determine if any of 1593 * them still need the child's MOVED bit. 1594 */ 1595 hammer2_chain_t *scan; 1596 1597 if (hammer2_debug & 0x4000) 1598 kprintf("CHECKMOVED %p (parent=%p)", child, parent); 1599 1600 ok = 1; 1601 spin_lock(&above->cst.spin); 1602 TAILQ_FOREACH(scan, &above->ownerq, core_entry) { 1603 /* 1604 * Can't clear child's MOVED until all parent's have 1605 * synchronized with it. 1606 * 1607 * Ignore deleted parents as-of this flush TID. 1608 * Ignore the current parent being flushed. 1609 */ 1610 if (h2ignore_deleted(info, scan)) 1611 continue; 1612 if (scan == parent) 1613 continue; 1614 1615 /* 1616 * For parents not already synchronized check to see 1617 * if the flush has gotten past them yet or not. 1618 */ 1619 if (scan->bref.mirror_tid >= trans->sync_tid) 1620 continue; 1621 1622 if (hammer2_debug & 0x4000) { 1623 kprintf("(fail scan %p %016jx/%016jx)", 1624 scan, scan->bref.mirror_tid, 1625 child->modify_tid); 1626 } 1627 ok = 0; 1628 break; 1629 } 1630 if (hammer2_debug & 0x4000) 1631 kprintf("\n"); 1632 spin_unlock(&above->cst.spin); 1633 1634 /* 1635 * Can we finally clear MOVED? 1636 */ 1637 if (ok) { 1638 if (hammer2_debug & 0x4000) 1639 kprintf("clear moved %p.%d %016jx/%d\n", 1640 child, child->bref.type, 1641 child->bref.key, child->bref.keybits); 1642 atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED); 1643 hammer2_chain_drop(child); /* moved cleared */ 1644 KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0); 1645 } else { 1646 if (hammer2_debug & 0x4000) 1647 kprintf("keep moved %p.%d %016jx/%d\n", 1648 child, child->bref.type, 1649 child->bref.key, child->bref.keybits); 1650 } 1651 } 1652 1653 /* 1654 * Unlock the child. This can wind up dropping the child's 1655 * last ref, removing it from the parent's RB tree, and deallocating 1656 * the structure. The RB_SCAN() our caller is doing handles the 1657 * situation. 1658 */ 1659 hammer2_chain_unlock(child); 1660 hammer2_chain_drop(child); 1661 spin_lock(&above->cst.spin); 1662 1663 /* 1664 * The parent may have been delete-duplicated. 1665 */ 1666 info->parent = parent; 1667 finalize: 1668 return (0); 1669 } 1670 1671 static 1672 void 1673 hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how) 1674 { 1675 #if 0 1676 hammer2_chain_t *grandp; 1677 #endif 1678 1679 parent->data_count += child->data_count; 1680 parent->inode_count += child->inode_count; 1681 child->data_count = 0; 1682 child->inode_count = 0; 1683 if (how < 0) { 1684 parent->data_count -= child->bytes; 1685 if (child->bref.type == HAMMER2_BREF_TYPE_INODE) { 1686 parent->inode_count -= 1; 1687 #if 0 1688 /* XXX child->data may be NULL atm */ 1689 parent->data_count -= child->data->ipdata.data_count; 1690 parent->inode_count -= child->data->ipdata.inode_count; 1691 #endif 1692 } 1693 } else if (how > 0) { 1694 parent->data_count += child->bytes; 1695 if (child->bref.type == HAMMER2_BREF_TYPE_INODE) { 1696 parent->inode_count += 1; 1697 #if 0 1698 /* XXX child->data may be NULL atm */ 1699 parent->data_count += child->data->ipdata.data_count; 1700 parent->inode_count += child->data->ipdata.inode_count; 1701 #endif 1702 } 1703 } 1704 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 1705 parent->data->ipdata.data_count += parent->data_count; 1706 parent->data->ipdata.inode_count += parent->inode_count; 1707 #if 0 1708 for (grandp = parent->above->first_parent; 1709 grandp; 1710 grandp = grandp->next_parent) { 1711 grandp->data_count += parent->data_count; 1712 grandp->inode_count += parent->inode_count; 1713 } 1714 #endif 1715 parent->data_count = 0; 1716 parent->inode_count = 0; 1717 } 1718 } 1719