/*
 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set. The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT 10 /* stack recursion limit */


/*
 * Recursively flush the specified chain. The chain is locked and
 * referenced by the caller and will remain so on return. The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int depth;
        int diddeferral;
        int cache_index;
        struct h2_flush_list flushq;
        hammer2_xid_t sync_xid; /* memory synchronization point */
        hammer2_chain_t *debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain, int istop);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager. What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's. So we can't just stuff tmanage into hammer2_dev or
 * hammer2_pfs.
 */
void
hammer2_trans_manage_init(hammer2_trans_manage_t *tman)
{
        lockinit(&tman->translk, "h2trans", 0, 0);
        TAILQ_INIT(&tman->transq);
        tman->flush_xid = 1;
        tman->alloc_xid = tman->flush_xid + 1;
}

hammer2_xid_t
hammer2_trans_newxid(hammer2_pfs_t *pmp)
{
        hammer2_xid_t xid;

        for (;;) {
                xid = atomic_fetchadd_int(&pmp->tmanage.alloc_xid, 1);
                if (xid)
                        break;
        }
        return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID. Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction. Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target. The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations. We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete. The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block. A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfs_t *pmp, int flags)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;

        tman = &pmp->tmanage;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        trans->flags = flags;
        trans->td = curthread;

        lockmgr(&tman->translk, LK_EXCLUSIVE);

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
                 * wait until it is our turn. All flushes are serialized.
                 *
                 * We queue ourselves and then wait to become the head
                 * of the queue, allowing all prior flushes to complete.
                 *
                 * Multiple normal transactions can share the current
                 * transaction id but a flush transaction needs its own
                 * unique TID for proper block table update accounting.
                 */
                ++tman->flushcnt;
                ++pmp->modify_tid;
                tman->flush_xid = hammer2_trans_newxid(pmp);
                trans->sync_xid = tman->flush_xid;
                trans->modify_tid = pmp->modify_tid;
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                if (TAILQ_FIRST(&tman->transq) != trans) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        } else if (tman->flushcnt == 0) {
                /*
                 * No flushes are pending, we can go. Use prior flush_xid + 1.
                 *
                 * WARNING! Also see hammer2_chain_setflush()
                 */
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                trans->sync_xid = tman->flush_xid + 1;

                /* XXX improve/optimize inode allocation */
        } else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
                /*
                 * A buffer cache transaction is requested while a flush
                 * is in progress. The flush's PREFLUSH flag must be set
                 * in this situation.
                 *
                 * The buffer cache flush takes on the main flush's
                 * transaction id.
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
                trans->flags |= HAMMER2_TRANS_PREFLUSH;
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid;
                trans->modify_tid = head->modify_tid;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;
                /* not allowed to block */
        } else {
                /*
                 * A normal transaction is requested while a flush is in
                 * progress. We insert after the current flush and may
                 * block.
                 *
                 * WARNING! Also see hammer2_chain_setflush()
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid + 1;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;

                /*
                 * XXX for now we must block new transactions, synchronous
                 * flush mode is on by default.
                 *
                 * If synchronous flush mode is enabled concurrent
                 * frontend transactions during the flush are not
                 * allowed (except we don't have a choice for buffer
                 * cache ops).
                 */
                if (hammer2_synchronous_flush > 0 ||
                    TAILQ_FIRST(&tman->transq) != head) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        }
        if (flags & HAMMER2_TRANS_NEWINODE) {
                if (pmp->spmp_hmp) {
                        /*
                         * Super-root transaction, all new inodes have an
                         * inode number of 1. Normal pfs inode cache
                         * semantics are not used.
                         */
                        trans->inode_tid = 1;
                } else {
                        /*
                         * Normal transaction
                         */
                        if (pmp->inode_tid < HAMMER2_INODE_START)
                                pmp->inode_tid = HAMMER2_INODE_START;
                        trans->inode_tid = pmp->inode_tid++;
                }
        }

        lockmgr(&tman->translk, LK_RELEASE);
}

void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;

        tman = &pmp->tmanage;
        lockmgr(&tman->translk, LK_EXCLUSIVE);
        if (tman->flushcnt) {
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
        }
        lockmgr(&tman->translk, LK_RELEASE);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;
        hammer2_trans_t *scan;

        tman = &trans->pmp->tmanage;

        /*
         * Remove.
         */
        lockmgr(&tman->translk, LK_EXCLUSIVE);
        TAILQ_REMOVE(&tman->transq, trans, entry);
        head = TAILQ_FIRST(&tman->transq);

        /*
         * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
         * up through the next flush. (If the head is a flush then we
         * stop there, unlike the unblock code following this section).
         */
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --tman->flushcnt;
                scan = head;
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        atomic_clear_int(&scan->flags,
                                         HAMMER2_TRANS_CONCURRENT);
                        scan = TAILQ_NEXT(scan, entry);
                }
        }

        /*
         * Unblock the head of the queue and any additional transactions
         * up to the next flush. The head can be a flush and it will be
         * unblocked along with the non-flush transactions following it
         * (which are allowed to run concurrently with it).
         *
         * In synchronous flush mode we stop if the head transaction is
         * a flush.
         */
        if (head && head->blocked) {
                head->blocked = 0;
                wakeup(&head->sync_xid);

                if (hammer2_synchronous_flush > 0)
                        scan = head;
                else
                        scan = TAILQ_NEXT(head, entry);
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        if (scan->blocked) {
                                scan->blocked = 0;
                                wakeup(&scan->sync_xid);
                        }
                        scan = TAILQ_NEXT(scan, entry);
                }
        }
        lockmgr(&tman->translk, LK_RELEASE);
}
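
/*
 * Illustrative sketch (excluded from the build): how a caller might bracket
 * work with the transaction API above. The surrounding names (my_pmp,
 * modify_some_chains) are hypothetical and exist only for this example;
 * real callers live in the VNOPS and VFS sync paths.
 */
#if 0
static void
example_fs_op(hammer2_pfs_t *my_pmp)
{
        hammer2_trans_t trans;

        /*
         * A normal (non-flush) transaction. This may block briefly if a
         * flush is pending and synchronous flush mode is enabled.
         */
        hammer2_trans_init(&trans, my_pmp, 0);
        modify_some_chains(&trans);     /* hypothetical helper */
        hammer2_trans_done(&trans);
}

static void
example_flush_op(hammer2_pfs_t *my_pmp)
{
        hammer2_trans_t trans;

        /*
         * A flush transaction. Flushes are serialized against each other
         * and receive their own modify_tid / sync_xid.
         */
        hammer2_trans_init(&trans, my_pmp, HAMMER2_TRANS_ISFLUSH);
        /* ... call hammer2_flush() on the chains to be synchronized ... */
        hammer2_trans_done(&trans);
}
#endif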

/*
 * Chains undergoing destruction are removed from the in-memory topology.
 * To avoid getting lost these chains are placed on the delayed flush
 * queue which will properly dispose of them.
 *
 * We do this instead of issuing an immediate flush in order to give
 * recursive deletions (rm -rf, etc) a chance to remove more of the
 * hierarchy, potentially allowing an enormous amount of write I/O to
 * be avoided.
 */
void
hammer2_delayed_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) {
                hammer2_spin_ex(&chain->hmp->list_spin);
                if ((chain->flags & (HAMMER2_CHAIN_DELAYED |
                                     HAMMER2_CHAIN_DEFERRED)) == 0) {
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELAYED |
                                                      HAMMER2_CHAIN_DEFERRED);
                        TAILQ_INSERT_TAIL(&chain->hmp->flushq,
                                          chain, flush_node);
                        hammer2_chain_ref(chain);
                }
                hammer2_spin_unex(&chain->hmp->list_spin);
        }
}

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications, modify_tid,
 * and mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed. Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return. The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated. The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain, int istop)
{
        hammer2_chain_t *scan;
        hammer2_flush_info_t info;
        hammer2_dev_t *hmp;
        int loops;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit. Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flushq);
        info.trans = trans;
        info.sync_xid = trans->sync_xid;
        info.cache_index = -1;

        /*
         * Calculate parent (can be NULL), if not NULL the flush core
         * expects the parent to be referenced so it can easily lock/unlock
         * it without it getting ripped up.
         */
        if ((info.parent = chain->parent) != NULL)
                hammer2_chain_ref(info.parent);

        /*
         * Extra ref needed because flush_core expects it when replacing
         * chain.
         */
        hammer2_chain_ref(chain);
        hmp = chain->hmp;
        loops = 0;

        for (;;) {
                /*
                 * Move hmp->flushq to info.flushq if non-empty so it can
                 * be processed.
                 */
                if (TAILQ_FIRST(&hmp->flushq) != NULL) {
                        hammer2_spin_ex(&chain->hmp->list_spin);
                        TAILQ_CONCAT(&info.flushq, &hmp->flushq, flush_node);
                        hammer2_spin_unex(&chain->hmp->list_spin);
                }

                /*
                 * Unwind deep recursions which had been deferred. This
                 * can leave the FLUSH_* bits set for these chains, which
                 * will be handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flushq, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED |
                                                       HAMMER2_CHAIN_DELAYED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         *
                         * NOTE: hammer2_flush() may replace scan.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_flush(trans, scan, 0);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * [re]flush chain.
                 */
                info.diddeferral = 0;
                hammer2_flush_core(&info, chain, istop);

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flushq))
                        break;

                if (++loops % 1000 == 0) {
                        kprintf("hammer2_flush: excessive loops on %p\n",
                                chain);
                        if (hammer2_debug & 0x100000)
                                Debugger("hell4");
                }
        }
        hammer2_chain_drop(chain);
        if (info.parent)
                hammer2_chain_drop(info.parent);
}
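
/*
 * Illustrative sketch (excluded from the build): the depth-limit/deferral
 * pattern used by hammer2_flush() above, reduced to a self-contained toy.
 * Nodes deeper than the limit are queued instead of recursed into, and the
 * caller keeps re-running the scan until nothing is deferred. The "toy_*"
 * names are hypothetical and exist only for this example.
 */
#if 0
struct toy_node {
        struct toy_node         *child;        /* single-child chain for brevity */
        int                     dirty;
        TAILQ_ENTRY(toy_node)   entry;
};
TAILQ_HEAD(toy_defer_list, toy_node);

#define TOY_DEPTH_LIMIT         10

static int
toy_flush_core(struct toy_node *node, int depth, struct toy_defer_list *defq)
{
        int deferred = 0;

        if (!node->dirty)
                return 0;                       /* nothing to do below here */
        if (depth == TOY_DEPTH_LIMIT) {
                /* too deep; queue for a later pass and tell the caller */
                TAILQ_INSERT_TAIL(defq, node, entry);
                return 1;
        }
        if (node->child)
                deferred = toy_flush_core(node->child, depth + 1, defq);
        if (deferred == 0)
                node->dirty = 0;                /* "flush" bottom-up */
        return deferred;
}

static void
toy_flush(struct toy_node *top)
{
        struct toy_defer_list defq;
        struct toy_node *scan;

        TAILQ_INIT(&defq);
        for (;;) {
                /* run deferred subtrees first, then (re)flush from the top */
                while ((scan = TAILQ_FIRST(&defq)) != NULL) {
                        TAILQ_REMOVE(&defq, scan, entry);
                        toy_flush(scan);
                }
                if (toy_flush_core(top, 0, &defq) == 0)
                        break;
        }
}
#endif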

/*
 * This is the core of the chain flushing code. The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return. Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not. fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree). This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table. A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move). The child will have UPDATE set
 *     in this case.
 *
 * WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid is consistent only within a PFS, and will not be
 * consistent during synchronization. mirror_tid is consistent across the
 * block device regardless of the PFS.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                   int istop)
{
        hammer2_chain_t *parent;
        hammer2_dev_t *hmp;
        int diddeferral;

        /*
         * (1) Optimize downward recursion to locate nodes needing action.
         *     Nothing to do if none of these flags are set.
         */
        if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
                if (hammer2_debug & 0x200) {
                        if (info->debug == NULL)
                                info->debug = chain;
                } else {
                        return;
                }
        }

        hmp = chain->hmp;
        diddeferral = info->diddeferral;
        parent = info->parent;          /* can be NULL */

        /*
         * Downward search recursion
         */
        if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) {
                /*
                 * Already deferred.
                 */
                ++info->diddeferral;
        } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                /*
                 * Recursion depth reached.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_DELAYED) == 0);
                hammer2_chain_ref(chain);
                TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
                ++info->diddeferral;
        } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && istop == 0) {
                /*
                 * We do not recurse through PFSROOTs. PFSROOT flushes are
                 * handled by the related pmp's (whether mounted or not,
                 * including during recovery).
                 *
                 * But we must still process the PFSROOT chains for block
                 * table updates in their parent (which IS part of our flush).
                 *
                 * Note that the volume root, vchain, does not set this flag.
                 */
                ;
        } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
                /*
                 * Downward recursion search (actual flush occurs bottom-up).
                 * pre-clear ONFLUSH. It can get set again due to races,
                 * which we want so the scan finds us again in the next flush.
                 * These races can also include
                 *
                 * Flush recursions stop at PFSROOT boundaries. Each PFS
                 * must be individually flushed and then the root must
                 * be flushed.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
                info->parent = chain;
                hammer2_spin_ex(&chain->core.spin);
                RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
                        NULL, hammer2_flush_recurse, info);
                hammer2_spin_unex(&chain->core.spin);
                info->parent = parent;
                if (info->diddeferral)
                        hammer2_chain_setflush(info->trans, chain);
        }

        /*
         * Now we are in the bottom-up part of the recursion.
         *
         * Do not update chain if lower layers were deferred.
         */
        if (info->diddeferral)
                goto done;

        /*
         * Propagate the DESTROY flag downwards. This dummies up the flush
         * code and tries to invalidate related buffer cache buffers to
         * avoid the disk write.
         */
        if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

        /*
         * Chain was already modified or has become modified, flush it out.
         */
again:
        if ((hammer2_debug & 0x200) &&
            info->debug &&
            (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
                hammer2_chain_t *scan = chain;

                kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
                while (scan) {
                        kprintf(" chain %p [%08x] bref=%016jx:%02x\n",
                                scan, scan->flags,
                                scan->bref.key, scan->bref.type);
                        if (scan == info->debug)
                                break;
                        scan = scan->parent;
                }
        }

        if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                /*
                 * Dispose of the modified bit.
                 *
                 * UPDATE should already be set.
                 * bref.mirror_tid should already be set.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                         chain == &hmp->vchain);
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);

                /*
                 * Manage threads waiting for excessive dirty memory to
                 * be retired.
                 */
                if (chain->pmp)
                        hammer2_pfs_memory_wakeup(chain->pmp);

                if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                    chain == &hmp->vchain ||
                    chain == &hmp->fchain) {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared,
                         * net -1 ref.
                         */
                        hammer2_chain_drop(chain);
                } else {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared and
                         * set a ref for the UPDATE bit we are setting. Net
                         * 0 refs.
                         */
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                }

                /*
                 * Issue the flush. This is indirect via the DIO.
                 *
                 * NOTE: A DELETED node that reaches this point must be
                 *       flushed for synchronization point consistency.
                 *
                 * NOTE: Even though MODIFIED was already set, the related DIO
                 *       might not be dirty due to a system buffer cache
                 *       flush and must be set dirty if we are going to make
                 *       further modifications to the buffer. Chains with
                 *       embedded data don't need this.
                 */
                if (hammer2_debug & 0x1000) {
                        kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
                                "data=%016jx\n",
                                chain, chain->bref.type,
                                chain->bref.key, chain->bref.keybits,
                                info->sync_xid,
                                chain->bref.data_off);
                }
                if (hammer2_debug & 0x2000) {
                        Debugger("Flush hell");
                }

                /*
                 * Update chain CRCs for flush.
                 *
                 * NOTE: Volume headers are NOT flushed here as they require
                 *       special processing.
                 */
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_FREEMAP:
                        /*
                         * Update the volume header's freemap_tid to the
                         * freemap's flushing mirror_tid.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
                        KKASSERT(chain == &hmp->fchain);
                        hmp->voldata.freemap_tid = chain->bref.mirror_tid;
                        kprintf("sync freemap mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * The freemap can be flushed independently of the
                         * main topology, but for the case where it is
                         * flushed in the same transaction, and flushed
                         * before vchain (a case we want to allow for
                         * performance reasons), make sure modifications
                         * made during the flush under vchain use a new
                         * transaction id.
                         *
                         * Otherwise the mount recovery code will get confused.
                         */
                        ++hmp->voldata.mirror_tid;
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        /*
                         * The free block table is flushed by
                         * hammer2_vfs_sync() before it flushes vchain.
                         * We must still hold fchain locked while copying
                         * voldata to volsync, however.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        hammer2_chain_lock(&hmp->fchain,
                                           HAMMER2_RESOLVE_ALWAYS);
                        hammer2_voldata_lock(hmp);
                        kprintf("sync volume mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * Update the volume header's mirror_tid to the
                         * main topology's flushing mirror_tid. It is
                         * possible that voldata.mirror_tid is already
                         * beyond bref.mirror_tid due to the bump we made
                         * above in BREF_TYPE_FREEMAP.
                         */
                        if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
                                hmp->voldata.mirror_tid =
                                        chain->bref.mirror_tid;
                        }

                        /*
                         * The volume header is flushed manually by the
                         * syncer, not here. All we do here is adjust the
                         * crc's.
                         */
                        KKASSERT(chain->data != NULL);
                        KKASSERT(chain->dio == NULL);

                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC1_OFF,
                                        HAMMER2_VOLUME_ICRC1_SIZE);
                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC0_OFF,
                                        HAMMER2_VOLUME_ICRC0_SIZE);
                        hmp->voldata.icrc_volheader =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRCVH_OFF,
                                        HAMMER2_VOLUME_ICRCVH_SIZE);

                        kprintf("syncvolhdr %016jx %016jx\n",
                                hmp->voldata.mirror_tid,
                                hmp->vchain.bref.mirror_tid);
                        hmp->volsync = hmp->voldata;
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                        hammer2_voldata_unlock(hmp);
                        hammer2_chain_unlock(&hmp->fchain);
                        break;
                case HAMMER2_BREF_TYPE_DATA:
                        /*
                         * Data elements have already been flushed via the
                         * logical file buffer cache. Their hash was set in
                         * the bref by the vop_write code. Do not re-dirty.
                         *
                         * Make sure any device buffer(s) have been flushed
                         * out here (there aren't usually any to flush) XXX.
                         */
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                        /*
                         * Buffer I/O will be cleaned up when the volume is
                         * flushed (but the kernel is free to flush it before
                         * then, as well).
                         */
                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * NOTE: We must call io_setdirty() to make any late
                         *       changes to the inode data, the system might
                         *       have already flushed the buffer.
                         */
                        if (chain->data->ipdata.meta.op_flags &
                            HAMMER2_OPFLAG_PFSROOT) {
                                /*
                                 * non-NULL pmp if mounted as a PFS. We must
                                 * sync fields cached in the pmp? XXX
                                 */
                                hammer2_inode_data_t *ipdata;

                                hammer2_io_setdirty(chain->dio);
                                ipdata = &chain->data->ipdata;
                                if (chain->pmp) {
                                        ipdata->meta.pfs_inum =
                                                chain->pmp->inode_tid;
                                }
                        } else {
                                /* can't be mounted as a PFS */
                        }

                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                default:
                        KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                        panic("hammer2_flush_core: unsupported "
                              "embedded bref %d",
                              chain->bref.type);
                        /* NOT REACHED */
                }

                /*
                 * If the chain was destroyed try to avoid unnecessary I/O.
                 * (this only really works if the DIO system buffer is the
                 * same size as chain->bytes).
                 */
                if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
                        hammer2_io_setinval(chain->dio, chain->bytes);
                }
        }

        /*
         * If UPDATE is set the parent block table may need to be updated.
         *
         * NOTE: UPDATE may be set on vchain or fchain in which case
         *       parent could be NULL. It's easiest to allow the case
         *       and test for NULL. parent can also wind up being NULL
         *       due to a deletion so we need to handle the case anyway.
         *
         * If no parent exists we can just clear the UPDATE bit. If the
         * chain gets reattached later on the bit will simply get set
         * again.
         */
        if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                hammer2_chain_drop(chain);
        }

        /*
         * The chain may need its blockrefs updated in the parent. This
         * requires some fancy footwork.
         */
        if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                hammer2_blockref_t *base;
                int count;

                /*
                 * Both parent and chain must be locked. This requires
                 * temporarily unlocking the chain. We have to deal with
                 * the case where the chain might be reparented or modified
                 * while it was unlocked.
                 */
                hammer2_chain_unlock(chain);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
                if (chain->parent != parent) {
                        kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
                                chain, chain->parent, parent);
                        hammer2_chain_unlock(parent);
                        goto done;
                }

                /*
                 * Check race condition. If someone got in and modified
                 * it again while it was unlocked, we have to loop up.
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        hammer2_chain_unlock(parent);
                        kprintf("hammer2_flush: chain %p flush-mod race\n",
                                chain);
                        goto again;
                }

                /*
                 * Clear UPDATE flag, mark parent modified, update its
                 * modify_tid if necessary, and adjust the parent blockmap.
                 */
                if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                        hammer2_chain_drop(chain);
                }

                /*
                 * (optional code)
                 *
                 * Avoid actually modifying and updating the parent if it
                 * was flagged for destruction. This can greatly reduce
                 * disk I/O in large tree removals because the
                 * hammer2_io_setinval() call in the upward recursion
                 * (see MODIFIED code above) can only handle a few cases.
                 */
                if (parent->flags & HAMMER2_CHAIN_DESTROY) {
                        if (parent->bref.modify_tid < chain->bref.modify_tid) {
                                parent->bref.modify_tid =
                                        chain->bref.modify_tid;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
                                                        HAMMER2_CHAIN_BMAPUPD);
                        hammer2_chain_unlock(parent);
                        goto skipupdate;
                }

                /*
                 * We are updating the parent's blockmap, the parent must
                 * be set modified.
                 */
                hammer2_chain_modify(info->trans, parent, 0);
                if (parent->bref.modify_tid < chain->bref.modify_tid)
                        parent->bref.modify_tid = chain->bref.modify_tid;

                /*
                 * Calculate blockmap pointer
                 */
                switch(parent->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * Access the inode's block array. However, there is
                         * no block array if the inode is flagged DIRECTDATA.
                         */
                        if (parent->data &&
                            (parent->data->ipdata.meta.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
                                base = &parent->data->
                                        ipdata.u.blockset.blockref[0];
                        } else {
                                base = NULL;
                        }
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                        if (parent->data)
                                base = &parent->data->npdata[0];
                        else
                                base = NULL;
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        base = &chain->hmp->voldata.sroot_blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP:
                        base = &parent->data->npdata[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                default:
                        base = NULL;
                        count = 0;
                        panic("hammer2_flush_core: "
                              "unrecognized blockref type: %d",
                              parent->bref.type);
                }

                /*
                 * Blocktable updates
                 *
                 * We synchronize pending statistics at this time. Delta
                 * adjustments designated for the current and upper level
                 * are synchronized.
                 */
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
                        if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
                                hammer2_spin_ex(&parent->core.spin);
                                hammer2_base_delete(info->trans, parent,
                                                    base, count,
                                                    &info->cache_index, chain);
                                hammer2_spin_unex(&parent->core.spin);
                                /* base_delete clears both bits */
                        } else {
                                atomic_clear_int(&chain->flags,
                                                 HAMMER2_CHAIN_BMAPUPD);
                        }
                }
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
                        hammer2_spin_ex(&parent->core.spin);
                        hammer2_base_insert(info->trans, parent,
                                            base, count,
                                            &info->cache_index, chain);
                        hammer2_spin_unex(&parent->core.spin);
                        /* base_insert sets BMAPPED */
                }
                hammer2_chain_unlock(parent);
        }
skipupdate:
        ;

        /*
         * Final cleanup after flush
         */
done:
        KKASSERT(chain->refs > 0);
        if (hammer2_debug & 0x200) {
                if (info->debug == chain)
                        info->debug = NULL;
        }
}

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid. Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost. A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *          bref.mirror_tid ourselves to indicate that the flush has
 *          processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        /*hammer2_trans_t *trans = info->trans;*/
        hammer2_chain_t *parent = info->parent;

        /*
         * (child can never be fchain or vchain so a special check isn't
         * needed).
         *
         * We must ref the child before unlocking the spinlock.
         *
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.
         */
        hammer2_chain_ref(child);
        hammer2_spin_unex(&parent->core.spin);

        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        /*
         * Recurse and collect deferral data. We're in the media flush,
         * this can cross PFS boundaries.
         */
        if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
                ++info->depth;
                hammer2_flush_core(info, child, 0);
                --info->depth;
        } else if (hammer2_debug & 0x200) {
                if (info->debug == NULL)
                        info->debug = child;
                ++info->depth;
                hammer2_flush_core(info, child, 0);
                --info->depth;
                if (info->debug == child)
                        info->debug = NULL;
        }

        /*
         * Relock to continue the loop
         */
        hammer2_chain_unlock(child);
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
        hammer2_chain_drop(child);
        KKASSERT(info->parent == parent);
        hammer2_spin_ex(&parent->core.spin);

        return (0);
}
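
/*
 * Illustrative sketch (excluded from the build): the media-flush ordering
 * described by the comments in hammer2_flush_core() above. The freemap
 * topology (fchain) is flushed before the main topology root (vchain),
 * both under a flush transaction, and the volume header itself is written
 * separately by the syncer. This is a heavily simplified outline for
 * orientation only, not the actual hammer2_vfs_sync() implementation, and
 * error handling is omitted.
 */
#if 0
static void
example_media_sync(hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
{
        hammer2_trans_t trans;

        hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);

        /* Flush the freemap topology first. */
        hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
        if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK)
                hammer2_flush(&trans, &hmp->fchain, 1);
        hammer2_chain_unlock(&hmp->fchain);

        /* Then flush the main topology rooted at the volume root. */
        hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
        if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK)
                hammer2_flush(&trans, &hmp->vchain, 1);
        hammer2_chain_unlock(&hmp->vchain);

        /*
         * The volume header (hmp->volsync, snapshotted under the locks in
         * the BREF_TYPE_VOLUME case of hammer2_flush_core()) is written
         * out by the syncer after this point.
         */
        hammer2_trans_done(&trans);
}
#endif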