/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * The biggest issue is that each PFS may belong to a cluster so its media
 * modify_tid and mirror_tid fields are in a completely different domain
 * than the topology related to the super-root.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT	10	/* stack recursion limit */


/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;
	hammer2_trans_t	*trans;
	int		depth;
	int		diddeferral;
	int		cache_index;
	struct h2_flush_list flushq;
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
	hammer2_chain_t	*debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int deleting);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_mount or
 * hammer2_pfsmount.
 */
static hammer2_trans_manage_t	tmanage;

void
hammer2_trans_manage_init(void)
{
	lockinit(&tmanage.translk, "h2trans", 0, 0);
	TAILQ_INIT(&tmanage.transq);
	tmanage.flush_xid = 1;
	tmanage.alloc_xid = tmanage.flush_xid + 1;
}

hammer2_xid_t
hammer2_trans_newxid(hammer2_pfsmount_t *pmp __unused)
{
	hammer2_xid_t xid;

	for (;;) {
		xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);
		if (xid)
			break;
	}
	return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a
 * single media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &tmanage;

	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 */
		++tman->flushcnt;
		++pmp->alloc_tid;
		pmp->flush_tid = pmp->alloc_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		++pmp->alloc_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * This may only be called while in a flush transaction.  It's a bit of a
 * hack but after flushing a PFS we need to flush each volume root as part
 * of the same transaction.
 */
void
hammer2_trans_spmp(hammer2_trans_t *trans, hammer2_pfsmount_t *spmp)
{
	++spmp->alloc_tid;
	spmp->flush_tid = spmp->alloc_tid;
	++spmp->alloc_tid;
	trans->pmp = spmp;
}


void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &tmanage;

	/*
	 * Remove.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}
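
/*
 * Illustrative sketch (not compiled): one plausible way a caller could
 * drive the transaction API above to flush a single chain.  The helper
 * name, the chain argument, and the lock mode are assumptions made for
 * this example only; see hammer2_vfs_sync() for the authoritative
 * sequencing used by the filesystem syncer.
 */
#if 0
static void
example_flush_one_chain(hammer2_pfsmount_t *pmp, hammer2_chain_t *chain)
{
	hammer2_trans_t trans;

	/* Serializes against other flushes and assigns the flush TID/XID */
	hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);

	/* hammer2_flush() expects the chain locked and referenced */
	hammer2_chain_ref(chain);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
	hammer2_flush(&trans, chain);
	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);

	/* Wakes up any fs_ops queued behind this flush */
	hammer2_trans_done(&trans);
}
#endif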

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications and
 * mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flushq);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flushq, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_flush(trans, scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, chain, 0);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flushq))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
}
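
/*
 * Worked example of the deferral handling above (illustrative numbers):
 * with HAMMER2_FLUSH_DEPTH_LIMIT at 10, a chain whose deepest modified
 * descendant lies 25 levels down is flushed in multiple passes.  The
 * first descent stops at depth 10 and queues those chains on flushq;
 * the loop then runs hammer2_flush() on each deferred chain, which may
 * itself defer another 10 levels down, and so on until flushq drains
 * and the top-level chain can complete its own bottom-up update.
 */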

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 * WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid and blockref.mirror_tid are consistent only within a
 * PFS.  This is why we cannot cache sync_tid in the transaction structure.
 * Instead we access it from the pmp.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
		   int deleting)
{
	hammer2_chain_t *parent;
	hammer2_mount_t *hmp;
	hammer2_pfsmount_t *pmp;
	int diddeferral;

	/*
	 * (1) Optimize downward recursion to locate nodes needing action.
	 *     Nothing to do if none of these flags are set.
	 */
	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
		if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = chain;
		} else {
			return;
		}
	}

	hmp = chain->hmp;
	pmp = chain->pmp;		/* can be NULL */
	diddeferral = info->diddeferral;
	parent = info->parent;		/* can be NULL */

	/*
	 * mirror_tid should not be forward-indexed
	 */
	KKASSERT(pmp == NULL || chain->bref.mirror_tid <= pmp->flush_tid);

	/*
	 * Downward search recursion
	 */
	if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
		/*
		 * Already deferred.
		 */
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		/*
		 * Recursion depth reached.
		 */
		hammer2_chain_ref(chain);
		TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
		++info->diddeferral;
	} else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
		/*
		 * Downward recursion search (actual flush occurs bottom-up).
		 * pre-clear ONFLUSH.  It can get set again due to races,
		 * which we want so the scan finds us again in the next flush.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
		info->parent = chain;
		hammer2_spin_ex(&chain->core.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
			NULL, hammer2_flush_recurse, info);
		hammer2_spin_unex(&chain->core.spin);
		info->parent = parent;
		if (info->diddeferral)
			hammer2_chain_setflush(info->trans, chain);
	}

	/*
	 * Now we are in the bottom-up part of the recursion.
	 *
	 * Do not update chain if lower layers were deferred.
	 */
	if (info->diddeferral)
		goto done;

	/*
	 * Propagate the DESTROY flag downwards.  This dummies up the flush
	 * code and tries to invalidate related buffer cache buffers to
	 * avoid the disk write.
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

	/*
	 * Chain was already modified or has become modified, flush it out.
	 */
again:
	if ((hammer2_debug & 0x200) &&
	    info->debug &&
	    (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
		hammer2_chain_t *scan = chain;

		kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
		while (scan) {
			kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
				scan, scan->flags,
				scan->bref.key, scan->bref.type);
			if (scan == info->debug)
				break;
			scan = scan->parent;
		}
	}

	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * Dispose of the modified bit.  UPDATE should already be
		 * set.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
			 chain == &hmp->vchain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		if (pmp) {
			hammer2_pfs_memory_wakeup(pmp);
			chain->bref.mirror_tid = pmp->flush_tid;
		}

		if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
		    chain == &hmp->vchain ||
		    chain == &hmp->fchain) {
			/*
			 * Drop the ref from the MODIFIED bit we cleared,
			 * net -1 ref.
			 */
			hammer2_chain_drop(chain);
		} else {
			/*
			 * Drop the ref from the MODIFIED bit we cleared and
			 * set a ref for the UPDATE bit we are setting.  Net
			 * 0 refs.
			 */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}

		/*
		 * Issue the flush.  This is indirect via the DIO.
		 *
		 * NOTE: A DELETED node that reaches this point must be
		 *	 flushed for synchronization point consistency.
		 *
		 * NOTE: Even though MODIFIED was already set, the related DIO
		 *	 might not be dirty due to a system buffer cache
		 *	 flush and must be set dirty if we are going to make
		 *	 further modifications to the buffer.  Chains with
		 *	 embedded data don't need this.
		 *
		 * Update bref.mirror_tid, clear MODIFIED, and set UPDATE.
		 */
		if (hammer2_debug & 0x1000) {
			kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
				"data=%016jx\n",
				chain, chain->bref.type,
				chain->bref.key, chain->bref.keybits,
				info->sync_xid,
				chain->bref.data_off);
		}
		if (hammer2_debug & 0x2000) {
			Debugger("Flush hell");
		}

		/*
		 * Update chain CRCs for flush.
		 *
		 * NOTE: Volume headers are NOT flushed here as they require
		 *	 special processing.
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * (note: embedded data, do not call setdirty)
			 */
			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			/*
			 * The free block table is flushed by
			 * hammer2_vfs_sync() before it flushes vchain.
			 * We must still hold fchain locked while copying
			 * voldata to volsync, however.
			 *
			 * (note: embedded data, do not call setdirty)
			 */
			hammer2_voldata_lock(hmp);
			hammer2_chain_lock(&hmp->fchain,
					   HAMMER2_RESOLVE_ALWAYS);
			/*
			 * There is no parent to our root vchain and fchain to
			 * synchronize the bref to, their updated mirror_tid's
			 * must be synchronized to the volume header.
			 */
			hmp->voldata.mirror_tid = chain->bref.mirror_tid;
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			kprintf("mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * The volume header is flushed manually by the
			 * syncer, not here.  All we do here is adjust the
			 * crc's.
			 */
			KKASSERT(chain->data != NULL);
			KKASSERT(chain->dio == NULL);

			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC1_OFF,
					HAMMER2_VOLUME_ICRC1_SIZE);
			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC0_OFF,
					HAMMER2_VOLUME_ICRC0_SIZE);
			hmp->voldata.icrc_volheader =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRCVH_OFF,
					HAMMER2_VOLUME_ICRCVH_SIZE);
			hmp->volsync = hmp->voldata;
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_VOLUMESYNC);
			hammer2_chain_unlock(&hmp->fchain);
			hammer2_voldata_unlock(hmp);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Data elements have already been flushed via the
			 * logical file buffer cache.  Their hash was set in
			 * the bref by the vop_write code.  Do not re-dirty.
			 *
			 * Make sure any device buffer(s) have been flushed
			 * out here (there aren't usually any to flush) XXX.
			 */
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			/*
			 * Buffer I/O will be cleaned up when the volume is
			 * flushed (but the kernel is free to flush it before
			 * then, as well).
			 */
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * NOTE: We must call io_setdirty() to make any late
			 *	 changes to the inode data, the system might
			 *	 have already flushed the buffer.
			 */
			if (chain->data->ipdata.op_flags &
			    HAMMER2_OPFLAG_PFSROOT) {
				/*
				 * non-NULL pmp if mounted as a PFS.  We must
				 * sync fields cached in the pmp? XXX
				 */
				hammer2_inode_data_t *ipdata;

				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				if (pmp)
					ipdata->pfs_inum = pmp->inode_tid;
			} else {
				/* can't be mounted as a PFS */
			}

			/*
			 * Update inode statistics.  Pending stats in chain
			 * are cleared out on UPDATE so expect that bit to
			 * be set here too or the statistics will not be
			 * rolled-up properly.
			 */
			if (chain->data_count || chain->inode_count) {
				hammer2_inode_data_t *ipdata;

				KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				ipdata->data_count += chain->data_count;
				ipdata->inode_count += chain->inode_count;
			}
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		default:
			KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
			panic("hammer2_flush_core: unsupported "
			      "embedded bref %d",
			      chain->bref.type);
			/* NOT REACHED */
		}

		/*
		 * If the chain was destroyed try to avoid unnecessary I/O.
		 * (this only really works if the DIO system buffer is the
		 * same size as chain->bytes).
		 */
		if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
			hammer2_io_setinval(chain->dio, chain->bytes);
		}
	}

	/*
	 * If UPDATE is set the parent block table may need to be updated.
	 *
	 * NOTE: UPDATE may be set on vchain or fchain in which case
	 *	 parent could be NULL.  It's easiest to allow the case
	 *	 and test for NULL.  parent can also wind up being NULL
	 *	 due to a deletion so we need to handle the case anyway.
	 *
	 * If no parent exists we can just clear the UPDATE bit.  If the
	 * chain gets reattached later on the bit will simply get set
	 * again.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		hammer2_chain_drop(chain);
	}

	/*
	 * The chain may need its blockrefs updated in the parent.  This
	 * requires some fancy footwork.
	 */
	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
		hammer2_blockref_t *base;
		int count;

		/*
		 * Both parent and chain must be locked.  This requires
		 * temporarily unlocking the chain.  We have to deal with
		 * the case where the chain might be reparented or modified
		 * while it was unlocked.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
		if (chain->parent != parent) {
			kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
				chain, chain->parent, parent);
			hammer2_chain_unlock(parent);
			goto done;
		}

		/*
		 * Check race condition.  If someone got in and modified
		 * it again while it was unlocked, we have to loop up.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			hammer2_chain_unlock(parent);
			kprintf("hammer2_flush: chain %p flush-mod race\n",
				chain);
			goto again;
		}

		/*
		 * Clear UPDATE flag
		 */
		if (chain->flags & HAMMER2_CHAIN_UPDATE) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
			hammer2_chain_drop(chain);
		}
		hammer2_chain_modify(info->trans, parent, 0);

		/*
		 * Calculate blockmap pointer
		 */
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there is
			 * no block array if the inode is flagged DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base = &parent->data->
					ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->npdata[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("hammer2_flush_core: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * Blocktable updates
		 *
		 * We synchronize pending statistics at this time.  Delta
		 * adjustments designated for the current and upper level
		 * are synchronized.
		 */
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
			if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
				hammer2_base_delete(info->trans, parent,
						    base, count,
						    &info->cache_index, chain);
				/* base_delete clears both bits */
			} else {
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_BMAPUPD);
			}
		}
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
			parent->data_count += chain->data_count +
					      chain->data_count_up;
			parent->inode_count += chain->inode_count +
					       chain->inode_count_up;
			chain->data_count = 0;
			chain->inode_count = 0;
			chain->data_count_up = 0;
			chain->inode_count_up = 0;
			hammer2_base_insert(info->trans, parent,
					    base, count,
					    &info->cache_index, chain);
			/* base_insert sets BMAPPED */
		}
		hammer2_chain_unlock(parent);
	}

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 1);
	KKASSERT(pmp == NULL ||
		 chain->bref.mirror_tid <= chain->pmp->flush_tid);
	if (hammer2_debug & 0x200) {
		if (info->debug == chain)
			info->debug = NULL;
	}
}

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *	    bref.mirror_tid ourselves to indicate that the flush has
 *	    processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *	    not cross a pfs-root boundary.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	/*hammer2_trans_t *trans = info->trans;*/
	hammer2_chain_t *parent = info->parent;

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 * needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * Never recurse across a mounted PFS boundary.
	 *
	 * Recurse and collect deferral data.
	 */
	if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 ||
	    child->pmp == NULL) {
		if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
		} else if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = child;
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
			if (info->debug == child)
				info->debug = NULL;
		}
	}

	/*
	 * Relock to continue the loop
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	hammer2_spin_ex(&parent->core.spin);

	return (0);
}