1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <vm/vm_page2.h> 36 37 #include "hammer.h" 38 39 static int hammer_unload_inode(struct hammer_inode *ip); 40 static void hammer_free_inode(hammer_inode_t ip); 41 static void hammer_flush_inode_core(hammer_inode_t ip, 42 hammer_flush_group_t flg, int flags); 43 static int hammer_setup_child_callback(hammer_record_t rec, void *data); 44 #if 0 45 static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data); 46 #endif 47 static int hammer_setup_parent_inodes(hammer_inode_t ip, int depth, 48 hammer_flush_group_t flg); 49 static int hammer_setup_parent_inodes_helper(hammer_record_t record, 50 int depth, hammer_flush_group_t flg); 51 static void hammer_inode_wakereclaims(hammer_inode_t ip); 52 static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp, 53 pid_t pid); 54 55 struct krate hammer_gen_krate = { 1 }; 56 57 /* 58 * RB-Tree support for inode structures 59 */ 60 int 61 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2) 62 { 63 if (ip1->obj_localization < ip2->obj_localization) 64 return(-1); 65 if (ip1->obj_localization > ip2->obj_localization) 66 return(1); 67 if (ip1->obj_id < ip2->obj_id) 68 return(-1); 69 if (ip1->obj_id > ip2->obj_id) 70 return(1); 71 if (ip1->obj_asof < ip2->obj_asof) 72 return(-1); 73 if (ip1->obj_asof > ip2->obj_asof) 74 return(1); 75 return(0); 76 } 77 78 int 79 hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2) 80 { 81 if (ip1->redo_fifo_start < ip2->redo_fifo_start) 82 return(-1); 83 if (ip1->redo_fifo_start > ip2->redo_fifo_start) 84 return(1); 85 return(0); 86 } 87 88 /* 89 * RB-Tree support for inode structures / special LOOKUP_INFO 90 */ 91 static int 92 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip) 93 { 94 if (info->obj_localization < 
ip->obj_localization) 95 return(-1); 96 if (info->obj_localization > ip->obj_localization) 97 return(1); 98 if (info->obj_id < ip->obj_id) 99 return(-1); 100 if (info->obj_id > ip->obj_id) 101 return(1); 102 if (info->obj_asof < ip->obj_asof) 103 return(-1); 104 if (info->obj_asof > ip->obj_asof) 105 return(1); 106 return(0); 107 } 108 109 /* 110 * Used by hammer_scan_inode_snapshots() to locate all of an object's 111 * snapshots. Note that the asof field is not tested, which we can get 112 * away with because it is the lowest-priority field. 113 */ 114 static int 115 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data) 116 { 117 hammer_inode_info_t info = data; 118 119 if (ip->obj_localization > info->obj_localization) 120 return(1); 121 if (ip->obj_localization < info->obj_localization) 122 return(-1); 123 if (ip->obj_id > info->obj_id) 124 return(1); 125 if (ip->obj_id < info->obj_id) 126 return(-1); 127 return(0); 128 } 129 130 /* 131 * Used by hammer_unload_pseudofs() to locate all inodes associated with 132 * a particular PFS. 133 */ 134 static int 135 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data) 136 { 137 uint32_t localization = *(uint32_t *)data; 138 if (ip->obj_localization > localization) 139 return(1); 140 if (ip->obj_localization < localization) 141 return(-1); 142 return(0); 143 } 144 145 /* 146 * RB-Tree support for pseudofs structures 147 */ 148 static int 149 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2) 150 { 151 if (p1->localization < p2->localization) 152 return(-1); 153 if (p1->localization > p2->localization) 154 return(1); 155 return(0); 156 } 157 158 159 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare); 160 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node, 161 hammer_inode_info_cmp, hammer_inode_info_t); 162 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node, 163 hammer_pfs_rb_compare, uint32_t, localization); 164 165 /* 166 * The kernel is not actively referencing this vnode but is still holding 167 * it cached. 168 * 169 * This is called from the frontend. 170 * 171 * MPALMOSTSAFE 172 */ 173 int 174 hammer_vop_inactive(struct vop_inactive_args *ap) 175 { 176 struct hammer_inode *ip = VTOI(ap->a_vp); 177 hammer_mount_t hmp; 178 179 /* 180 * Degenerate case 181 */ 182 if (ip == NULL) { 183 vrecycle(ap->a_vp); 184 return(0); 185 } 186 187 /* 188 * If the inode no longer has visibility in the filesystem try to 189 * recycle it immediately, even if the inode is dirty. Recycling 190 * it quickly allows the system to reclaim buffer cache and VM 191 * resources which can matter a lot in a heavily loaded system. 192 * 193 * This can deadlock in vfsync() if we aren't careful. 194 * 195 * Do not queue the inode to the flusher if we still have visibility, 196 * otherwise namespace calls such as chmod will unnecessarily generate 197 * multiple inode updates. 198 */ 199 if (ip->ino_data.nlinks == 0) { 200 hmp = ip->hmp; 201 lwkt_gettoken(&hmp->fs_token); 202 hammer_inode_unloadable_check(ip, 0); 203 if (ip->flags & HAMMER_INODE_MODMASK) 204 hammer_flush_inode(ip, 0); 205 lwkt_reltoken(&hmp->fs_token); 206 vrecycle(ap->a_vp); 207 } 208 return(0); 209 } 210 211 /* 212 * Release the vnode association. This is typically (but not always) 213 * the last reference on the inode. 214 * 215 * Once the association is lost we are on our own with regards to 216 * flushing the inode. 217 * 218 * We must interlock ip->vp so hammer_get_vnode() can avoid races. 
219 */ 220 int 221 hammer_vop_reclaim(struct vop_reclaim_args *ap) 222 { 223 struct hammer_inode *ip; 224 hammer_mount_t hmp; 225 struct vnode *vp; 226 227 vp = ap->a_vp; 228 229 if ((ip = vp->v_data) != NULL) { 230 hmp = ip->hmp; 231 lwkt_gettoken(&hmp->fs_token); 232 hammer_lock_ex(&ip->lock); 233 vp->v_data = NULL; 234 ip->vp = NULL; 235 236 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) { 237 ++hammer_count_reclaims; 238 ++hmp->count_reclaims; 239 ip->flags |= HAMMER_INODE_RECLAIM; 240 } 241 hammer_unlock(&ip->lock); 242 vclrisdirty(vp); 243 hammer_rel_inode(ip, 1); 244 lwkt_reltoken(&hmp->fs_token); 245 } 246 return(0); 247 } 248 249 /* 250 * Inform the kernel that the inode is dirty. This will be checked 251 * by vn_unlock(). 252 * 253 * Theoretically in order to reclaim a vnode the hammer_vop_reclaim() 254 * must be called which will interlock against our inode lock, so 255 * if VRECLAIMED is not set vp->v_mount (as used by vsetisdirty()) 256 * should be stable without having to acquire any new locks. 257 */ 258 void 259 hammer_inode_dirty(struct hammer_inode *ip) 260 { 261 struct vnode *vp; 262 263 if ((ip->flags & HAMMER_INODE_MODMASK) && 264 (vp = ip->vp) != NULL && 265 (vp->v_flag & (VRECLAIMED | VISDIRTY)) == 0) { 266 vsetisdirty(vp); 267 } 268 } 269 270 /* 271 * Return a locked vnode for the specified inode. The inode must be 272 * referenced but NOT LOCKED on entry and will remain referenced on 273 * return. 274 * 275 * Called from the frontend. 276 */ 277 int 278 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp) 279 { 280 hammer_mount_t hmp; 281 struct vnode *vp; 282 int error = 0; 283 uint8_t obj_type; 284 285 hmp = ip->hmp; 286 287 for (;;) { 288 if ((vp = ip->vp) == NULL) { 289 error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0); 290 if (error) 291 break; 292 hammer_lock_ex(&ip->lock); 293 if (ip->vp != NULL) { 294 hammer_unlock(&ip->lock); 295 vp = *vpp; 296 vp->v_type = VBAD; 297 vx_put(vp); 298 continue; 299 } 300 hammer_ref(&ip->lock); 301 vp = *vpp; 302 ip->vp = vp; 303 304 obj_type = ip->ino_data.obj_type; 305 vp->v_type = hammer_get_vnode_type(obj_type); 306 307 hammer_inode_wakereclaims(ip); 308 309 switch(ip->ino_data.obj_type) { 310 case HAMMER_OBJTYPE_CDEV: 311 case HAMMER_OBJTYPE_BDEV: 312 vp->v_ops = &hmp->mp->mnt_vn_spec_ops; 313 addaliasu(vp, ip->ino_data.rmajor, 314 ip->ino_data.rminor); 315 break; 316 case HAMMER_OBJTYPE_FIFO: 317 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops; 318 break; 319 case HAMMER_OBJTYPE_REGFILE: 320 break; 321 default: 322 break; 323 } 324 325 /* 326 * Only mark as the root vnode if the ip is not 327 * historical, otherwise the VFS cache will get 328 * confused. The other half of the special handling 329 * is in hammer_vop_nlookupdotdot(). 330 * 331 * Pseudo-filesystem roots can be accessed via 332 * non-root filesystem paths and setting VROOT may 333 * confuse the namecache. Set VPFSROOT instead. 334 */ 335 if (ip->obj_id == HAMMER_OBJID_ROOT) { 336 if (ip->obj_asof == hmp->asof) { 337 if (ip->obj_localization == 0) 338 vsetflags(vp, VROOT); 339 else 340 vsetflags(vp, VPFSROOT); 341 } else { 342 vsetflags(vp, VPFSROOT); 343 } 344 } 345 346 vp->v_data = (void *)ip; 347 /* vnode locked by getnewvnode() */ 348 /* make related vnode dirty if inode dirty? */ 349 hammer_unlock(&ip->lock); 350 if (vp->v_type == VREG) { 351 vinitvmio(vp, ip->ino_data.size, 352 hammer_blocksize(ip->ino_data.size), 353 hammer_blockoff(ip->ino_data.size)); 354 } 355 break; 356 } 357 358 /* 359 * Interlock vnode clearing. 
This does not prevent the
		 * vnode from going into a reclaimed state but it does
		 * prevent it from being destroyed or reused so the vget()
		 * will properly fail.
		 */
		hammer_lock_ex(&ip->lock);
		if ((vp = ip->vp) == NULL) {
			hammer_unlock(&ip->lock);
			continue;
		}
		vhold(vp);
		hammer_unlock(&ip->lock);

		/*
		 * loop if the vget fails (aka races), or if the vp
		 * no longer matches ip->vp.
		 */
		if (vget(vp, LK_EXCLUSIVE) == 0) {
			if (vp == ip->vp) {
				vdrop(vp);
				break;
			}
			vput(vp);
		}
		vdrop(vp);
	}
	*vpp = vp;
	return(error);
}

/*
 * Locate all copies of the inode for obj_id compatible with the specified
 * asof, reference, and issue the related call-back.  This routine is used
 * for direct-io invalidation and does not create any new inodes.
 */
void
hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
			    int (*callback)(hammer_inode_t ip, void *data),
			    void *data)
{
	hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
				   hammer_inode_info_cmp_all_history,
				   callback, iinfo);
}

/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 * do not attach or detach the related vnode (use hammer_get_vnode() for
 * that).
 *
 * The flags argument is only applied for newly created inodes, and only
 * certain flags are inherited.
 *
 * Called from the frontend.
 */
struct hammer_inode *
hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
		 int64_t obj_id, hammer_tid_t asof, uint32_t localization,
		 int flags, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	struct hammer_node_cache *cachep;
	struct hammer_inode_info iinfo;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;

	/*
	 * Determine if we already have an inode cached.  If we do then
	 * we are golden.
	 *
	 * If we find an inode with no vnode we have to mark the
	 * transaction such that hammer_inode_waitreclaims() is
	 * called later on to avoid building up an infinite number
	 * of inodes.  Otherwise we can continue to add new inodes
	 * faster than they can be disposed of, even with the tsleep
	 * delay.
	 *
	 * If we find a dummy inode we return a failure so dounlink
	 * (which does another lookup) doesn't try to mess with the
	 * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
	 * to ref dummy inodes.
	 */
	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	iinfo.obj_localization = localization;
loop:
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	if (ip) {
		if (ip->flags & HAMMER_INODE_DUMMY) {
			*errorp = ENOENT;
			return(NULL);
		}
		hammer_ref(&ip->lock);
		*errorp = 0;
		return(ip);
	}

	/*
	 * Allocate a new inode structure and deal with races later.
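	 * (The race is resolved further below: if another thread instantiated
	 * the same inode while we were loading it from media, the RB_INSERT
	 * into hmp->rb_inos_root fails and we simply discard our copy and
	 * redo the lookup, roughly:
	 *
	 *	if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
	 *		hammer_free_inode(ip);
	 *		goto loop;
	 *	}
	 *
	 * The real error path also releases its B-Tree cursor first.)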
459 */ 460 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO); 461 ++hammer_count_inodes; 462 ++hmp->count_inodes; 463 ip->obj_id = obj_id; 464 ip->obj_asof = iinfo.obj_asof; 465 ip->obj_localization = localization; 466 ip->hmp = hmp; 467 ip->flags = flags & HAMMER_INODE_RO; 468 ip->cache[0].ip = ip; 469 ip->cache[1].ip = ip; 470 ip->cache[2].ip = ip; 471 ip->cache[3].ip = ip; 472 if (hmp->ronly) 473 ip->flags |= HAMMER_INODE_RO; 474 ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off = 475 0x7FFFFFFFFFFFFFFFLL; 476 RB_INIT(&ip->rec_tree); 477 TAILQ_INIT(&ip->target_list); 478 hammer_ref(&ip->lock); 479 480 /* 481 * Locate the on-disk inode. If this is a PFS root we always 482 * access the current version of the root inode and (if it is not 483 * a master) always access information under it with a snapshot 484 * TID. 485 * 486 * We cache recent inode lookups in this directory in dip->cache[2]. 487 * If we can't find it we assume the inode we are looking for is 488 * close to the directory inode. 489 */ 490 retry: 491 cachep = NULL; 492 if (dip) { 493 if (dip->cache[2].node) 494 cachep = &dip->cache[2]; 495 else 496 cachep = &dip->cache[0]; 497 } 498 hammer_init_cursor(trans, &cursor, cachep, NULL); 499 cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE; 500 cursor.key_beg.obj_id = ip->obj_id; 501 cursor.key_beg.key = 0; 502 cursor.key_beg.create_tid = 0; 503 cursor.key_beg.delete_tid = 0; 504 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE; 505 cursor.key_beg.obj_type = 0; 506 507 cursor.asof = iinfo.obj_asof; 508 cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA | 509 HAMMER_CURSOR_ASOF; 510 511 *errorp = hammer_btree_lookup(&cursor); 512 if (*errorp == EDEADLK) { 513 hammer_done_cursor(&cursor); 514 goto retry; 515 } 516 517 /* 518 * On success the B-Tree lookup will hold the appropriate 519 * buffer cache buffers and provide a pointer to the requested 520 * information. Copy the information to the in-memory inode 521 * and cache the B-Tree node to improve future operations. 522 */ 523 if (*errorp == 0) { 524 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf; 525 ip->ino_data = cursor.data->inode; 526 527 /* 528 * cache[0] tries to cache the location of the object inode. 529 * The assumption is that it is near the directory inode. 530 * 531 * cache[1] tries to cache the location of the object data. 532 * We might have something in the governing directory from 533 * scan optimizations (see the strategy code in 534 * hammer_vnops.c). 535 * 536 * We update dip->cache[2], if possible, with the location 537 * of the object inode for future directory shortcuts. 538 */ 539 hammer_cache_node(&ip->cache[0], cursor.node); 540 if (dip) { 541 if (dip->cache[3].node) { 542 hammer_cache_node(&ip->cache[1], 543 dip->cache[3].node); 544 } 545 hammer_cache_node(&dip->cache[2], cursor.node); 546 } 547 548 /* 549 * The file should not contain any data past the file size 550 * stored in the inode. Setting save_trunc_off to the 551 * file size instead of max reduces B-Tree lookup overheads 552 * on append by allowing the flusher to avoid checking for 553 * record overwrites. 554 */ 555 ip->save_trunc_off = ip->ino_data.size; 556 557 /* 558 * Locate and assign the pseudofs management structure to 559 * the inode. 
		 */
		if (dip && dip->obj_localization == ip->obj_localization) {
			ip->pfsm = dip->pfsm;
			hammer_ref(&ip->pfsm->lock);
		} else {
			ip->pfsm = hammer_load_pseudofs(trans,
							ip->obj_localization,
							errorp);
			*errorp = 0;	/* ignore ENOENT */
		}
	}

	/*
	 * The inode is placed on the red-black tree and will be synced to
	 * the media when flushed or by the filesystem sync.  If this races
	 * another instantiation/lookup the insertion will fail.
	 */
	if (*errorp == 0) {
		if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
			hammer_free_inode(ip);
			hammer_done_cursor(&cursor);
			goto loop;
		}
		ip->flags |= HAMMER_INODE_ONDISK;
	} else {
		if (ip->flags & HAMMER_INODE_RSV_INODES) {
			ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
			--hmp->rsv_inodes;
		}

		hammer_free_inode(ip);
		ip = NULL;
	}
	hammer_done_cursor(&cursor);

	/*
	 * NEWINODE is only set if the inode becomes dirty later,
	 * setting it here just leads to unnecessary stalls.
	 *
	 * trans->flags |= HAMMER_TRANSF_NEWINODE;
	 */
	return (ip);
}

/*
 * Get a dummy inode to placemark a broken directory entry.
 */
struct hammer_inode *
hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
		       int64_t obj_id, hammer_tid_t asof, uint32_t localization,
		       int flags, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	struct hammer_inode_info iinfo;
	struct hammer_inode *ip;

	/*
	 * Determine if we already have an inode cached.  If we do then
	 * we are golden.
	 *
	 * If we find an inode with no vnode we have to mark the
	 * transaction such that hammer_inode_waitreclaims() is
	 * called later on to avoid building up an infinite number
	 * of inodes.  Otherwise we can continue to add new inodes
	 * faster than they can be disposed of, even with the tsleep
	 * delay.
	 *
	 * If we find a non-fake inode we return an error.  Only fake
	 * inodes can be returned by this routine.
	 */
	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	iinfo.obj_localization = localization;
loop:
	*errorp = 0;
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	if (ip) {
		if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
			*errorp = ENOENT;
			return(NULL);
		}
		hammer_ref(&ip->lock);
		return(ip);
	}

	/*
	 * Allocate a new inode structure and deal with races later.
	 */
	ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	++hmp->count_inodes;
	ip->obj_id = obj_id;
	ip->obj_asof = iinfo.obj_asof;
	ip->obj_localization = localization;
	ip->hmp = hmp;
	ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
	ip->cache[0].ip = ip;
	ip->cache[1].ip = ip;
	ip->cache[2].ip = ip;
	ip->cache[3].ip = ip;
	ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
		0x7FFFFFFFFFFFFFFFLL;
	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->target_list);
	hammer_ref(&ip->lock);

	/*
	 * Populate the dummy inode.  Leave everything zero'd out.
	 *
	 * (ip->ino_leaf and ip->ino_data)
	 *
	 * Make the dummy inode a FIFO object which most copy programs
	 * will properly ignore.
	 */
	ip->save_trunc_off = ip->ino_data.size;
	ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;

	/*
	 * Locate and assign the pseudofs management structure to
	 * the inode.
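	 * (If the directory inode shares our localization we simply borrow
	 * dip->pfsm and take a reference; otherwise the PFS record is loaded
	 * via hammer_load_pseudofs() and a missing record (ENOENT) is
	 * deliberately ignored so the default in-memory template is used.)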
680 */ 681 if (dip && dip->obj_localization == ip->obj_localization) { 682 ip->pfsm = dip->pfsm; 683 hammer_ref(&ip->pfsm->lock); 684 } else { 685 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization, 686 errorp); 687 *errorp = 0; /* ignore ENOENT */ 688 } 689 690 /* 691 * The inode is placed on the red-black tree and will be synced to 692 * the media when flushed or by the filesystem sync. If this races 693 * another instantiation/lookup the insertion will fail. 694 * 695 * NOTE: Do not set HAMMER_INODE_ONDISK. The inode is a fake. 696 */ 697 if (*errorp == 0) { 698 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) { 699 hammer_free_inode(ip); 700 goto loop; 701 } 702 } else { 703 if (ip->flags & HAMMER_INODE_RSV_INODES) { 704 ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */ 705 --hmp->rsv_inodes; 706 } 707 hammer_free_inode(ip); 708 ip = NULL; 709 } 710 trans->flags |= HAMMER_TRANSF_NEWINODE; 711 return (ip); 712 } 713 714 /* 715 * Return a referenced inode only if it is in our inode cache. 716 * 717 * Dummy inodes do not count. 718 */ 719 struct hammer_inode * 720 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id, 721 hammer_tid_t asof, uint32_t localization) 722 { 723 hammer_mount_t hmp = trans->hmp; 724 struct hammer_inode_info iinfo; 725 struct hammer_inode *ip; 726 727 iinfo.obj_id = obj_id; 728 iinfo.obj_asof = asof; 729 iinfo.obj_localization = localization; 730 731 ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo); 732 if (ip) { 733 if (ip->flags & HAMMER_INODE_DUMMY) 734 ip = NULL; 735 else 736 hammer_ref(&ip->lock); 737 } 738 return(ip); 739 } 740 741 /* 742 * Create a new filesystem object, returning the inode in *ipp. The 743 * returned inode will be referenced. The inode is created in-memory. 744 * 745 * If pfsm is non-NULL the caller wishes to create the root inode for 746 * a master PFS. 747 */ 748 int 749 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap, 750 struct ucred *cred, 751 hammer_inode_t dip, const char *name, int namelen, 752 hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp) 753 { 754 hammer_mount_t hmp; 755 hammer_inode_t ip; 756 uid_t xuid; 757 int error; 758 int64_t namekey; 759 uint32_t dummy; 760 761 hmp = trans->hmp; 762 763 /* 764 * Disallow the creation of new inodes in directories which 765 * have been deleted. In HAMMER, this will cause a record 766 * syncing assertion later on in the flush code. 
767 */ 768 if (dip && dip->ino_data.nlinks == 0) { 769 *ipp = NULL; 770 return (EINVAL); 771 } 772 773 /* 774 * Allocate inode 775 */ 776 ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO); 777 ++hammer_count_inodes; 778 ++hmp->count_inodes; 779 trans->flags |= HAMMER_TRANSF_NEWINODE; 780 781 if (pfsm) { 782 KKASSERT(pfsm->localization != 0); 783 ip->obj_id = HAMMER_OBJID_ROOT; 784 ip->obj_localization = pfsm->localization; 785 } else { 786 KKASSERT(dip != NULL); 787 namekey = hammer_directory_namekey(dip, name, namelen, &dummy); 788 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey); 789 ip->obj_localization = dip->obj_localization; 790 } 791 792 KKASSERT(ip->obj_id != 0); 793 ip->obj_asof = hmp->asof; 794 ip->hmp = hmp; 795 ip->flush_state = HAMMER_FST_IDLE; 796 ip->flags = HAMMER_INODE_DDIRTY | 797 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME; 798 ip->cache[0].ip = ip; 799 ip->cache[1].ip = ip; 800 ip->cache[2].ip = ip; 801 ip->cache[3].ip = ip; 802 803 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; 804 /* ip->save_trunc_off = 0; (already zero) */ 805 RB_INIT(&ip->rec_tree); 806 TAILQ_INIT(&ip->target_list); 807 808 ip->ino_data.atime = trans->time; 809 ip->ino_data.mtime = trans->time; 810 ip->ino_data.size = 0; 811 ip->ino_data.nlinks = 0; 812 813 /* 814 * A nohistory designator on the parent directory is inherited by 815 * the child. We will do this even for pseudo-fs creation... the 816 * sysad can turn it off. 817 */ 818 if (dip) { 819 ip->ino_data.uflags = dip->ino_data.uflags & 820 (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP); 821 } 822 823 ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD; 824 ip->ino_leaf.base.localization = ip->obj_localization + 825 HAMMER_LOCALIZE_INODE; 826 ip->ino_leaf.base.obj_id = ip->obj_id; 827 ip->ino_leaf.base.key = 0; 828 ip->ino_leaf.base.create_tid = 0; 829 ip->ino_leaf.base.delete_tid = 0; 830 ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE; 831 ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type); 832 833 ip->ino_data.obj_type = ip->ino_leaf.base.obj_type; 834 ip->ino_data.version = HAMMER_INODE_DATA_VERSION; 835 ip->ino_data.mode = vap->va_mode; 836 ip->ino_data.ctime = trans->time; 837 838 /* 839 * If we are running version 2 or greater directory entries are 840 * inode-localized instead of data-localized. 841 */ 842 if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) { 843 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 844 ip->ino_data.cap_flags |= 845 HAMMER_INODE_CAP_DIR_LOCAL_INO; 846 } 847 } 848 if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) { 849 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) { 850 ip->ino_data.cap_flags |= 851 HAMMER_INODE_CAP_DIRHASH_ALG1; 852 } 853 } 854 855 /* 856 * Setup the ".." pointer. This only needs to be done for directories 857 * but we do it for all objects as a recovery aid if dip exists. 858 * The inode is probably a PFS root if dip is NULL. 859 */ 860 if (dip) 861 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id; 862 #if 0 863 /* 864 * The parent_obj_localization field only applies to pseudo-fs roots. 865 * XXX this is no longer applicable, PFSs are no longer directly 866 * tied into the parent's directory structure. 
867 */ 868 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY && 869 ip->obj_id == HAMMER_OBJID_ROOT) { 870 ip->ino_data.ext.obj.parent_obj_localization = 871 dip->obj_localization; 872 } 873 #endif 874 875 switch(ip->ino_leaf.base.obj_type) { 876 case HAMMER_OBJTYPE_CDEV: 877 case HAMMER_OBJTYPE_BDEV: 878 ip->ino_data.rmajor = vap->va_rmajor; 879 ip->ino_data.rminor = vap->va_rminor; 880 break; 881 default: 882 break; 883 } 884 885 /* 886 * Calculate default uid/gid and overwrite with information from 887 * the vap. 888 */ 889 if (dip) { 890 xuid = hammer_to_unix_xid(&dip->ino_data.uid); 891 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, 892 xuid, cred, &vap->va_mode); 893 } else { 894 xuid = 0; 895 } 896 ip->ino_data.mode = vap->va_mode; 897 898 if (vap->va_vaflags & VA_UID_UUID_VALID) 899 ip->ino_data.uid = vap->va_uid_uuid; 900 else if (vap->va_uid != (uid_t)VNOVAL) 901 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid); 902 else 903 hammer_guid_to_uuid(&ip->ino_data.uid, xuid); 904 905 if (vap->va_vaflags & VA_GID_UUID_VALID) 906 ip->ino_data.gid = vap->va_gid_uuid; 907 else if (vap->va_gid != (gid_t)VNOVAL) 908 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid); 909 else if (dip) 910 ip->ino_data.gid = dip->ino_data.gid; 911 912 hammer_ref(&ip->lock); 913 914 if (pfsm) { 915 ip->pfsm = pfsm; 916 hammer_ref(&pfsm->lock); 917 error = 0; 918 } else if (dip->obj_localization == ip->obj_localization) { 919 ip->pfsm = dip->pfsm; 920 hammer_ref(&ip->pfsm->lock); 921 error = 0; 922 } else { 923 ip->pfsm = hammer_load_pseudofs(trans, 924 ip->obj_localization, 925 &error); 926 error = 0; /* ignore ENOENT */ 927 } 928 929 if (error) { 930 hammer_free_inode(ip); 931 ip = NULL; 932 } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) { 933 hpanic("duplicate obj_id %llx", (long long)ip->obj_id); 934 /* not reached */ 935 hammer_free_inode(ip); 936 } 937 *ipp = ip; 938 return(error); 939 } 940 941 /* 942 * Final cleanup / freeing of an inode structure 943 */ 944 static void 945 hammer_free_inode(hammer_inode_t ip) 946 { 947 struct hammer_mount *hmp; 948 949 hmp = ip->hmp; 950 KKASSERT(hammer_oneref(&ip->lock)); 951 hammer_uncache_node(&ip->cache[0]); 952 hammer_uncache_node(&ip->cache[1]); 953 hammer_uncache_node(&ip->cache[2]); 954 hammer_uncache_node(&ip->cache[3]); 955 hammer_inode_wakereclaims(ip); 956 if (ip->objid_cache) 957 hammer_clear_objid(ip); 958 --hammer_count_inodes; 959 --hmp->count_inodes; 960 if (ip->pfsm) { 961 hammer_rel_pseudofs(hmp, ip->pfsm); 962 ip->pfsm = NULL; 963 } 964 kfree(ip, hmp->m_inodes); 965 } 966 967 /* 968 * Retrieve pseudo-fs data. NULL will never be returned. 969 * 970 * If an error occurs *errorp will be set and a default template is returned, 971 * otherwise *errorp is set to 0. Typically when an error occurs it will 972 * be ENOENT. 973 */ 974 hammer_pseudofs_inmem_t 975 hammer_load_pseudofs(hammer_transaction_t trans, 976 uint32_t localization, int *errorp) 977 { 978 hammer_mount_t hmp = trans->hmp; 979 hammer_inode_t ip; 980 hammer_pseudofs_inmem_t pfsm; 981 struct hammer_cursor cursor; 982 int bytes; 983 984 retry: 985 pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization); 986 if (pfsm) { 987 hammer_ref(&pfsm->lock); 988 *errorp = 0; 989 return(pfsm); 990 } 991 992 /* 993 * PFS records are associated with the root inode (not the PFS root 994 * inode, but the real root). Avoid an infinite recursion if loading 995 * the PFS for the real root. 
996 */ 997 if (localization) { 998 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, 999 HAMMER_MAX_TID, 1000 HAMMER_DEF_LOCALIZATION, 0, errorp); 1001 } else { 1002 ip = NULL; 1003 } 1004 1005 pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO); 1006 pfsm->localization = localization; 1007 pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid; 1008 pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid; 1009 1010 hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip); 1011 cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION + 1012 HAMMER_LOCALIZE_MISC; 1013 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT; 1014 cursor.key_beg.create_tid = 0; 1015 cursor.key_beg.delete_tid = 0; 1016 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS; 1017 cursor.key_beg.obj_type = 0; 1018 cursor.key_beg.key = localization; 1019 cursor.asof = HAMMER_MAX_TID; 1020 cursor.flags |= HAMMER_CURSOR_ASOF; 1021 1022 if (ip) 1023 *errorp = hammer_ip_lookup(&cursor); 1024 else 1025 *errorp = hammer_btree_lookup(&cursor); 1026 if (*errorp == 0) { 1027 *errorp = hammer_ip_resolve_data(&cursor); 1028 if (*errorp == 0) { 1029 if (cursor.data->pfsd.mirror_flags & 1030 HAMMER_PFSD_DELETED) { 1031 *errorp = ENOENT; 1032 } else { 1033 bytes = cursor.leaf->data_len; 1034 if (bytes > sizeof(pfsm->pfsd)) 1035 bytes = sizeof(pfsm->pfsd); 1036 bcopy(cursor.data, &pfsm->pfsd, bytes); 1037 } 1038 } 1039 } 1040 hammer_done_cursor(&cursor); 1041 1042 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid); 1043 hammer_ref(&pfsm->lock); 1044 if (ip) 1045 hammer_rel_inode(ip, 0); 1046 if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) { 1047 kfree(pfsm, hmp->m_misc); 1048 goto retry; 1049 } 1050 return(pfsm); 1051 } 1052 1053 /* 1054 * Store pseudo-fs data. The backend will automatically delete any prior 1055 * on-disk pseudo-fs data but we have to delete in-memory versions. 1056 */ 1057 int 1058 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm) 1059 { 1060 struct hammer_cursor cursor; 1061 hammer_record_t record; 1062 hammer_inode_t ip; 1063 int error; 1064 1065 /* 1066 * PFS records are associated with the root inode (not the PFS root 1067 * inode, but the real root). 1068 */ 1069 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID, 1070 HAMMER_DEF_LOCALIZATION, 0, &error); 1071 retry: 1072 pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid); 1073 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip); 1074 cursor.key_beg.localization = ip->obj_localization + 1075 HAMMER_LOCALIZE_MISC; 1076 cursor.key_beg.obj_id = HAMMER_OBJID_ROOT; 1077 cursor.key_beg.create_tid = 0; 1078 cursor.key_beg.delete_tid = 0; 1079 cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS; 1080 cursor.key_beg.obj_type = 0; 1081 cursor.key_beg.key = pfsm->localization; 1082 cursor.asof = HAMMER_MAX_TID; 1083 cursor.flags |= HAMMER_CURSOR_ASOF; 1084 1085 /* 1086 * Replace any in-memory version of the record. 1087 */ 1088 error = hammer_ip_lookup(&cursor); 1089 if (error == 0 && hammer_cursor_inmem(&cursor)) { 1090 record = cursor.iprec; 1091 if (record->flags & HAMMER_RECF_INTERLOCK_BE) { 1092 KKASSERT(cursor.deadlk_rec == NULL); 1093 hammer_ref(&record->lock); 1094 cursor.deadlk_rec = record; 1095 error = EDEADLK; 1096 } else { 1097 record->flags |= HAMMER_RECF_DELETED_FE; 1098 error = 0; 1099 } 1100 } 1101 1102 /* 1103 * Allocate replacement general record. The backend flush will 1104 * delete any on-disk version of the record. 
	 */
	if (error == 0 || error == ENOENT) {
		record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
		record->type = HAMMER_MEM_RECORD_GENERAL;

		record->leaf.base.localization = ip->obj_localization +
						 HAMMER_LOCALIZE_MISC;
		record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
		record->leaf.base.key = pfsm->localization;
		record->leaf.data_len = sizeof(pfsm->pfsd);
		bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
		error = hammer_ip_add_record(trans, record);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	hammer_rel_inode(ip, 0);
	return(error);
}

/*
 * Create a root directory for a PFS if one does not already exist.
 *
 * The PFS root stands alone so we must also bump the nlinks count
 * to prevent it from being destroyed on release.
 */
int
hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
		       hammer_pseudofs_inmem_t pfsm)
{
	hammer_inode_t ip;
	struct vattr vap;
	int error;

	ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
			      pfsm->localization, 0, &error);
	if (ip == NULL) {
		vattr_null(&vap);
		vap.va_mode = 0755;
		vap.va_type = VDIR;
		error = hammer_create_inode(trans, &vap, cred,
					    NULL, NULL, 0,
					    pfsm, &ip);
		if (error == 0) {
			++ip->ino_data.nlinks;
			hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (ip)
		hammer_rel_inode(ip, 0);
	return(error);
}

/*
 * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
 * if we are unable to disassociate all the inodes.
 */
static
int
hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
{
	int res;

	hammer_ref(&ip->lock);
	if (ip->vp && (ip->vp->v_flag & VPFSROOT)) {
		/*
		 * The hammer pfs-upgrade directive itself might have the
		 * root of the pfs open.  Just allow it.
		 */
		res = 0;
	} else {
		/*
		 * Don't allow any subdirectories or files to be open.
		 */
		if (hammer_isactive(&ip->lock) == 2 && ip->vp)
			vclean_unlocked(ip->vp);
		if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
			res = 0;
		else
			res = -1;	/* stop, someone is using the inode */
	}
	hammer_rel_inode(ip, 0);
	return(res);
}

int
hammer_unload_pseudofs(hammer_transaction_t trans, uint32_t localization)
{
	int res;
	int try;

	for (try = res = 0; try < 4; ++try) {
		res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
						 hammer_inode_pfs_cmp,
						 hammer_unload_pseudofs_callback,
						 &localization);
		if (res == 0 && try > 1)
			break;
		hammer_flusher_sync(trans->hmp);
	}
	if (res != 0)
		res = ENOTEMPTY;
	return(res);
}


/*
 * Release a reference on a PFS
 */
void
hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
{
	hammer_rel(&pfsm->lock);
	if (hammer_norefs(&pfsm->lock)) {
		RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
		kfree(pfsm, hmp->m_misc);
	}
}

/*
 * Called by hammer_sync_inode().
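 * This writes the inode's meta-data to the media: any existing on-disk
 * inode record is first marked deleted (DELONDISK), then a fresh
 * HAMMER_RECTYPE_INODE record built from sync_ino_leaf/sync_ino_data is
 * laid down with the current transaction's TID.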
1226 */ 1227 static int 1228 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip) 1229 { 1230 hammer_transaction_t trans = cursor->trans; 1231 hammer_record_t record; 1232 int error; 1233 int redirty; 1234 1235 retry: 1236 error = 0; 1237 1238 /* 1239 * If the inode has a presence on-disk then locate it and mark 1240 * it deleted, setting DELONDISK. 1241 * 1242 * The record may or may not be physically deleted, depending on 1243 * the retention policy. 1244 */ 1245 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) == 1246 HAMMER_INODE_ONDISK) { 1247 hammer_normalize_cursor(cursor); 1248 cursor->key_beg.localization = ip->obj_localization + 1249 HAMMER_LOCALIZE_INODE; 1250 cursor->key_beg.obj_id = ip->obj_id; 1251 cursor->key_beg.key = 0; 1252 cursor->key_beg.create_tid = 0; 1253 cursor->key_beg.delete_tid = 0; 1254 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE; 1255 cursor->key_beg.obj_type = 0; 1256 cursor->asof = ip->obj_asof; 1257 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 1258 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF; 1259 cursor->flags |= HAMMER_CURSOR_BACKEND; 1260 1261 error = hammer_btree_lookup(cursor); 1262 if (hammer_debug_inode) 1263 hdkprintf("IPDEL %p %08x %d", ip, ip->flags, error); 1264 1265 if (error == 0) { 1266 error = hammer_ip_delete_record(cursor, ip, trans->tid); 1267 if (hammer_debug_inode) 1268 hdkprintf("error %d\n", error); 1269 if (error == 0) { 1270 ip->flags |= HAMMER_INODE_DELONDISK; 1271 } 1272 if (cursor->node) 1273 hammer_cache_node(&ip->cache[0], cursor->node); 1274 } 1275 if (error == EDEADLK) { 1276 hammer_done_cursor(cursor); 1277 error = hammer_init_cursor(trans, cursor, 1278 &ip->cache[0], ip); 1279 if (hammer_debug_inode) 1280 hdkprintf("IPDED %p %d\n", ip, error); 1281 if (error == 0) 1282 goto retry; 1283 } 1284 } 1285 1286 /* 1287 * Ok, write out the initial record or a new record (after deleting 1288 * the old one), unless the DELETED flag is set. This routine will 1289 * clear DELONDISK if it writes out a record. 1290 * 1291 * Update our inode statistics if this is the first application of 1292 * the inode on-disk. 1293 */ 1294 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) { 1295 /* 1296 * Generate a record and write it to the media. We clean-up 1297 * the state before releasing so we do not have to set-up 1298 * a flush_group. 1299 */ 1300 record = hammer_alloc_mem_record(ip, 0); 1301 record->type = HAMMER_MEM_RECORD_INODE; 1302 record->flush_state = HAMMER_FST_FLUSH; 1303 record->leaf = ip->sync_ino_leaf; 1304 record->leaf.base.create_tid = trans->tid; 1305 record->leaf.data_len = sizeof(ip->sync_ino_data); 1306 record->leaf.create_ts = trans->time32; 1307 record->data = (void *)&ip->sync_ino_data; 1308 record->flags |= HAMMER_RECF_INTERLOCK_BE; 1309 1310 /* 1311 * If this flag is set we cannot sync the new file size 1312 * because we haven't finished related truncations. The 1313 * inode will be flushed in another flush group to finish 1314 * the job. 
1315 */ 1316 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) && 1317 ip->sync_ino_data.size != ip->ino_data.size) { 1318 redirty = 1; 1319 ip->sync_ino_data.size = ip->ino_data.size; 1320 } else { 1321 redirty = 0; 1322 } 1323 1324 for (;;) { 1325 error = hammer_ip_sync_record_cursor(cursor, record); 1326 if (hammer_debug_inode) 1327 hdkprintf("GENREC %p rec %08x %d\n", 1328 ip, record->flags, error); 1329 if (error != EDEADLK) 1330 break; 1331 hammer_done_cursor(cursor); 1332 error = hammer_init_cursor(trans, cursor, 1333 &ip->cache[0], ip); 1334 if (hammer_debug_inode) 1335 hdkprintf("GENREC reinit %d\n", error); 1336 if (error) 1337 break; 1338 } 1339 1340 /* 1341 * Note: The record was never on the inode's record tree 1342 * so just wave our hands importantly and destroy it. 1343 */ 1344 record->flags |= HAMMER_RECF_COMMITTED; 1345 record->flags &= ~HAMMER_RECF_INTERLOCK_BE; 1346 record->flush_state = HAMMER_FST_IDLE; 1347 ++ip->rec_generation; 1348 hammer_rel_mem_record(record); 1349 1350 /* 1351 * Finish up. 1352 */ 1353 if (error == 0) { 1354 if (hammer_debug_inode) 1355 hdkprintf("CLEANDELOND %p %08x\n", ip, ip->flags); 1356 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | 1357 HAMMER_INODE_SDIRTY | 1358 HAMMER_INODE_ATIME | 1359 HAMMER_INODE_MTIME); 1360 ip->flags &= ~HAMMER_INODE_DELONDISK; 1361 if (redirty) 1362 ip->sync_flags |= HAMMER_INODE_DDIRTY; 1363 1364 /* 1365 * Root volume count of inodes 1366 */ 1367 hammer_sync_lock_sh(trans); 1368 if ((ip->flags & HAMMER_INODE_ONDISK) == 0) { 1369 hammer_modify_volume_field(trans, 1370 trans->rootvol, 1371 vol0_stat_inodes); 1372 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes; 1373 hammer_modify_volume_done(trans->rootvol); 1374 ip->flags |= HAMMER_INODE_ONDISK; 1375 if (hammer_debug_inode) 1376 hdkprintf("NOWONDISK %p\n", ip); 1377 } 1378 hammer_sync_unlock(trans); 1379 } 1380 } 1381 1382 /* 1383 * If the inode has been destroyed, clean out any left-over flags 1384 * that may have been set by the frontend. 1385 */ 1386 if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 1387 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | 1388 HAMMER_INODE_SDIRTY | 1389 HAMMER_INODE_ATIME | 1390 HAMMER_INODE_MTIME); 1391 } 1392 return(error); 1393 } 1394 1395 /* 1396 * Update only the itimes fields. 1397 * 1398 * ATIME can be updated without generating any UNDO. MTIME is updated 1399 * with UNDO so it is guaranteed to be synchronized properly in case of 1400 * a crash. 1401 * 1402 * Neither field is included in the B-Tree leaf element's CRC, which is how 1403 * we can get away with updating ATIME the way we do. 
1404 */ 1405 static int 1406 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip) 1407 { 1408 hammer_transaction_t trans = cursor->trans; 1409 int error; 1410 1411 retry: 1412 if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) != 1413 HAMMER_INODE_ONDISK) { 1414 return(0); 1415 } 1416 1417 hammer_normalize_cursor(cursor); 1418 cursor->key_beg.localization = ip->obj_localization + 1419 HAMMER_LOCALIZE_INODE; 1420 cursor->key_beg.obj_id = ip->obj_id; 1421 cursor->key_beg.key = 0; 1422 cursor->key_beg.create_tid = 0; 1423 cursor->key_beg.delete_tid = 0; 1424 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE; 1425 cursor->key_beg.obj_type = 0; 1426 cursor->asof = ip->obj_asof; 1427 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 1428 cursor->flags |= HAMMER_CURSOR_ASOF; 1429 cursor->flags |= HAMMER_CURSOR_GET_LEAF; 1430 cursor->flags |= HAMMER_CURSOR_GET_DATA; 1431 cursor->flags |= HAMMER_CURSOR_BACKEND; 1432 1433 error = hammer_btree_lookup(cursor); 1434 if (error == 0) { 1435 hammer_cache_node(&ip->cache[0], cursor->node); 1436 if (ip->sync_flags & HAMMER_INODE_MTIME) { 1437 /* 1438 * Updating MTIME requires an UNDO. Just cover 1439 * both atime and mtime. 1440 */ 1441 hammer_sync_lock_sh(trans); 1442 hammer_modify_buffer(trans, cursor->data_buffer, 1443 &cursor->data->inode.mtime, 1444 sizeof(cursor->data->inode.atime) + 1445 sizeof(cursor->data->inode.mtime)); 1446 cursor->data->inode.atime = ip->sync_ino_data.atime; 1447 cursor->data->inode.mtime = ip->sync_ino_data.mtime; 1448 hammer_modify_buffer_done(cursor->data_buffer); 1449 hammer_sync_unlock(trans); 1450 } else if (ip->sync_flags & HAMMER_INODE_ATIME) { 1451 /* 1452 * Updating atime only can be done in-place with 1453 * no UNDO. 1454 */ 1455 hammer_sync_lock_sh(trans); 1456 hammer_modify_buffer_noundo(trans, cursor->data_buffer); 1457 cursor->data->inode.atime = ip->sync_ino_data.atime; 1458 hammer_modify_buffer_done(cursor->data_buffer); 1459 hammer_sync_unlock(trans); 1460 } 1461 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME); 1462 } 1463 if (error == EDEADLK) { 1464 hammer_done_cursor(cursor); 1465 error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip); 1466 if (error == 0) 1467 goto retry; 1468 } 1469 return(error); 1470 } 1471 1472 /* 1473 * Release a reference on an inode, flush as requested. 1474 * 1475 * On the last reference we queue the inode to the flusher for its final 1476 * disposition. 1477 */ 1478 void 1479 hammer_rel_inode(struct hammer_inode *ip, int flush) 1480 { 1481 /* 1482 * Handle disposition when dropping the last ref. 1483 */ 1484 for (;;) { 1485 if (hammer_oneref(&ip->lock)) { 1486 /* 1487 * Determine whether on-disk action is needed for 1488 * the inode's final disposition. 1489 */ 1490 KKASSERT(ip->vp == NULL); 1491 hammer_inode_unloadable_check(ip, 0); 1492 if (ip->flags & HAMMER_INODE_MODMASK) { 1493 hammer_flush_inode(ip, 0); 1494 } else if (hammer_oneref(&ip->lock)) { 1495 hammer_unload_inode(ip); 1496 break; 1497 } 1498 } else { 1499 if (flush) 1500 hammer_flush_inode(ip, 0); 1501 1502 /* 1503 * The inode still has multiple refs, try to drop 1504 * one ref. 1505 */ 1506 KKASSERT(hammer_isactive(&ip->lock) >= 1); 1507 if (hammer_isactive(&ip->lock) > 1) { 1508 hammer_rel(&ip->lock); 1509 break; 1510 } 1511 } 1512 } 1513 } 1514 1515 /* 1516 * Unload and destroy the specified inode. Must be called with one remaining 1517 * reference. The reference is disposed of. 1518 * 1519 * The inode must be completely clean. 
 */
static int
hammer_unload_inode(struct hammer_inode *ip)
{
	hammer_mount_t hmp = ip->hmp;

	KASSERT(hammer_oneref(&ip->lock),
		("hammer_unload_inode: %d refs", hammer_isactive(&ip->lock)));
	KKASSERT(ip->vp == NULL);
	KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
	KKASSERT(ip->cursor_ip_refs == 0);
	KKASSERT(hammer_notlocked(&ip->lock));
	KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);

	KKASSERT(RB_EMPTY(&ip->rec_tree));
	KKASSERT(TAILQ_EMPTY(&ip->target_list));

	if (ip->flags & HAMMER_INODE_RDIRTY) {
		RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
		ip->flags &= ~HAMMER_INODE_RDIRTY;
	}
	RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);

	hammer_free_inode(ip);
	return(0);
}

/*
 * Called during unmounting if a critical error occurred.  The in-memory
 * inode and all related structures are destroyed.
 *
 * If a critical error did not occur the unmount code calls the standard
 * release and asserts that the inode is gone.
 */
int
hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
{
	hammer_record_t rec;

	/*
	 * Get rid of the inode's in-memory records, regardless of their
	 * state, and clear the mod-mask.
	 */
	while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
		TAILQ_REMOVE(&ip->target_list, rec, target_entry);
		rec->target_ip = NULL;
		if (rec->flush_state == HAMMER_FST_SETUP)
			rec->flush_state = HAMMER_FST_IDLE;
	}
	while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
		if (rec->flush_state == HAMMER_FST_FLUSH)
			--rec->flush_group->refs;
		else
			hammer_ref(&rec->lock);
		KKASSERT(hammer_oneref(&rec->lock));
		rec->flush_state = HAMMER_FST_IDLE;
		rec->flush_group = NULL;
		rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
		rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
		++ip->rec_generation;
		hammer_rel_mem_record(rec);
	}
	ip->flags &= ~HAMMER_INODE_MODMASK;
	ip->sync_flags &= ~HAMMER_INODE_MODMASK;
	KKASSERT(ip->vp == NULL);

	/*
	 * Remove the inode from any flush group, force it idle.  FLUSH
	 * and SETUP states have an inode ref.
	 */
	switch(ip->flush_state) {
	case HAMMER_FST_FLUSH:
		RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
		--ip->flush_group->refs;
		ip->flush_group = NULL;
		/* fall through */
	case HAMMER_FST_SETUP:
		hammer_rel(&ip->lock);
		ip->flush_state = HAMMER_FST_IDLE;
		/* fall through */
	case HAMMER_FST_IDLE:
		break;
	}

	/*
	 * There shouldn't be any associated vnode.  The unload needs at
	 * least one ref, if we do have a vp steal its ip ref.
	 */
	if (ip->vp) {
		hdkprintf("Unexpected vnode association ip %p vp %p\n",
			  ip, ip->vp);
		ip->vp->v_data = NULL;
		ip->vp = NULL;
	} else {
		hammer_ref(&ip->lock);
	}
	hammer_unload_inode(ip);
	return(0);
}

/*
 * Called on mount -u when switching from RW to RO or vice versa.  Adjust
 * the read-only flag for cached inodes.
 *
 * This routine is called from a RB_SCAN().
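 *
 * (Illustrative only: the mount-update path in hammer_vfsops.c invokes
 * this roughly as
 *
 *	RB_SCAN(hammer_ino_rb_tree, &hmp->rb_inos_root, NULL,
 *		hammer_reload_inode, NULL);
 *
 * after recomputing hmp->ronly.)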
1625 */ 1626 int 1627 hammer_reload_inode(hammer_inode_t ip, void *arg __unused) 1628 { 1629 hammer_mount_t hmp = ip->hmp; 1630 1631 if (hmp->ronly || hmp->asof != HAMMER_MAX_TID) 1632 ip->flags |= HAMMER_INODE_RO; 1633 else 1634 ip->flags &= ~HAMMER_INODE_RO; 1635 return(0); 1636 } 1637 1638 /* 1639 * A transaction has modified an inode, requiring updates as specified by 1640 * the passed flags. 1641 * 1642 * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime, 1643 * and not including size changes due to write-append 1644 * (but other size changes are included). 1645 * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to 1646 * write-append. 1647 * HAMMER_INODE_XDIRTY: Dirty in-memory records 1648 * HAMMER_INODE_BUFS: Dirty buffer cache buffers 1649 * HAMMER_INODE_DELETED: Inode record/data must be deleted 1650 * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated 1651 */ 1652 void 1653 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags) 1654 { 1655 /* 1656 * ronly of 0 or 2 does not trigger assertion. 1657 * 2 is a special error state 1658 */ 1659 KKASSERT(ip->hmp->ronly != 1 || 1660 (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 1661 HAMMER_INODE_SDIRTY | 1662 HAMMER_INODE_BUFS | HAMMER_INODE_DELETED | 1663 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0); 1664 if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) { 1665 ip->flags |= HAMMER_INODE_RSV_INODES; 1666 ++ip->hmp->rsv_inodes; 1667 } 1668 1669 /* 1670 * Set the NEWINODE flag in the transaction if the inode 1671 * transitions to a dirty state. This is used to track 1672 * the load on the inode cache. 1673 */ 1674 if (trans && 1675 (ip->flags & HAMMER_INODE_MODMASK) == 0 && 1676 (flags & HAMMER_INODE_MODMASK)) { 1677 trans->flags |= HAMMER_TRANSF_NEWINODE; 1678 } 1679 if (flags & HAMMER_INODE_MODMASK) 1680 hammer_inode_dirty(ip); 1681 ip->flags |= flags; 1682 } 1683 1684 /* 1685 * Attempt to quickly update the atime for a hammer inode. Return 0 on 1686 * success, -1 on failure. 1687 * 1688 * We attempt to update the atime with only the ip lock and not the 1689 * whole filesystem lock in order to improve concurrency. We can only 1690 * do this safely if the ATIME flag is already pending on the inode. 1691 * 1692 * This function is called via a vnops path (ip pointer is stable) without 1693 * fs_token held. 1694 */ 1695 int 1696 hammer_update_atime_quick(hammer_inode_t ip) 1697 { 1698 struct timeval tv; 1699 int res = -1; 1700 1701 if ((ip->flags & HAMMER_INODE_RO) || 1702 (ip->hmp->mp->mnt_flag & MNT_NOATIME)) { 1703 /* 1704 * Silently indicate success on read-only mount/snap 1705 */ 1706 res = 0; 1707 } else if (ip->flags & HAMMER_INODE_ATIME) { 1708 /* 1709 * Double check with inode lock held against backend. This 1710 * is only safe if all we need to do is update 1711 * ino_data.atime. 1712 */ 1713 getmicrotime(&tv); 1714 hammer_lock_ex(&ip->lock); 1715 if (ip->flags & HAMMER_INODE_ATIME) { 1716 ip->ino_data.atime = 1717 (unsigned long)tv.tv_sec * 1000000ULL + tv.tv_usec; 1718 res = 0; 1719 } 1720 hammer_unlock(&ip->lock); 1721 } 1722 return res; 1723 } 1724 1725 /* 1726 * Request that an inode be flushed. This whole mess cannot block and may 1727 * recurse (if not synchronous). Once requested HAMMER will attempt to 1728 * actively flush the inode until the flush can be done. 1729 * 1730 * The inode may already be flushing, or may be in a setup state. 
We can 1731 * place the inode in a flushing state if it is currently idle and flag it 1732 * to reflush if it is currently flushing. 1733 * 1734 * Upon return if the inode could not be flushed due to a setup 1735 * dependancy, then it will be automatically flushed when the dependancy 1736 * is satisfied. 1737 */ 1738 void 1739 hammer_flush_inode(hammer_inode_t ip, int flags) 1740 { 1741 hammer_mount_t hmp; 1742 hammer_flush_group_t flg; 1743 int good; 1744 1745 /* 1746 * fill_flush_group is the first flush group we may be able to 1747 * continue filling, it may be open or closed but it will always 1748 * be past the currently flushing (running) flg. 1749 * 1750 * next_flush_group is the next open flush group. 1751 */ 1752 hmp = ip->hmp; 1753 while ((flg = hmp->fill_flush_group) != NULL) { 1754 KKASSERT(flg->running == 0); 1755 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit && 1756 flg->total_count <= hammer_autoflush) { 1757 break; 1758 } 1759 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry); 1760 hammer_flusher_async(ip->hmp, flg); 1761 } 1762 if (flg == NULL) { 1763 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO); 1764 flg->seq = hmp->flusher.next++; 1765 if (hmp->next_flush_group == NULL) 1766 hmp->next_flush_group = flg; 1767 if (hmp->fill_flush_group == NULL) 1768 hmp->fill_flush_group = flg; 1769 RB_INIT(&flg->flush_tree); 1770 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry); 1771 } 1772 1773 /* 1774 * Trivial 'nothing to flush' case. If the inode is in a SETUP 1775 * state we have to put it back into an IDLE state so we can 1776 * drop the extra ref. 1777 * 1778 * If we have a parent dependancy we must still fall through 1779 * so we can run it. 1780 */ 1781 if ((ip->flags & HAMMER_INODE_MODMASK) == 0) { 1782 if (ip->flush_state == HAMMER_FST_SETUP && 1783 TAILQ_EMPTY(&ip->target_list)) { 1784 ip->flush_state = HAMMER_FST_IDLE; 1785 hammer_rel_inode(ip, 0); 1786 } 1787 if (ip->flush_state == HAMMER_FST_IDLE) 1788 return; 1789 } 1790 1791 /* 1792 * Our flush action will depend on the current state. 1793 */ 1794 switch(ip->flush_state) { 1795 case HAMMER_FST_IDLE: 1796 /* 1797 * We have no dependancies and can flush immediately. Some 1798 * our children may not be flushable so we have to re-test 1799 * with that additional knowledge. 1800 */ 1801 hammer_flush_inode_core(ip, flg, flags); 1802 break; 1803 case HAMMER_FST_SETUP: 1804 /* 1805 * Recurse upwards through dependancies via target_list 1806 * and start their flusher actions going if possible. 1807 * 1808 * 'good' is our connectivity. -1 means we have none and 1809 * can't flush, 0 means there weren't any dependancies, and 1810 * 1 means we have good connectivity. 1811 */ 1812 good = hammer_setup_parent_inodes(ip, 0, flg); 1813 1814 if (good >= 0) { 1815 /* 1816 * We can continue if good >= 0. Determine how 1817 * many records under our inode can be flushed (and 1818 * mark them). 1819 */ 1820 hammer_flush_inode_core(ip, flg, flags); 1821 } else { 1822 /* 1823 * Parent has no connectivity, tell it to flush 1824 * us as soon as it does. 1825 * 1826 * The REFLUSH flag is also needed to trigger 1827 * dependancy wakeups. 1828 */ 1829 ip->flags |= HAMMER_INODE_CONN_DOWN | 1830 HAMMER_INODE_REFLUSH; 1831 if (flags & HAMMER_FLUSH_SIGNAL) { 1832 ip->flags |= HAMMER_INODE_RESIGNAL; 1833 hammer_flusher_async(ip->hmp, flg); 1834 } 1835 } 1836 break; 1837 case HAMMER_FST_FLUSH: 1838 /* 1839 * We are already flushing, flag the inode to reflush 1840 * if needed after it completes its current flush. 
1841 * 1842 * The REFLUSH flag is also needed to trigger 1843 * dependancy wakeups. 1844 */ 1845 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0) 1846 ip->flags |= HAMMER_INODE_REFLUSH; 1847 if (flags & HAMMER_FLUSH_SIGNAL) { 1848 ip->flags |= HAMMER_INODE_RESIGNAL; 1849 hammer_flusher_async(ip->hmp, flg); 1850 } 1851 break; 1852 } 1853 } 1854 1855 /* 1856 * Scan ip->target_list, which is a list of records owned by PARENTS to our 1857 * ip which reference our ip. 1858 * 1859 * XXX This is a huge mess of recursive code, but not one bit of it blocks 1860 * so for now do not ref/deref the structures. Note that if we use the 1861 * ref/rel code later, the rel CAN block. 1862 */ 1863 static int 1864 hammer_setup_parent_inodes(hammer_inode_t ip, int depth, 1865 hammer_flush_group_t flg) 1866 { 1867 hammer_record_t depend; 1868 int good; 1869 int r; 1870 1871 /* 1872 * If we hit our recursion limit and we have parent dependencies 1873 * We cannot continue. Returning < 0 will cause us to be flagged 1874 * for reflush. Returning -2 cuts off additional dependency checks 1875 * because they are likely to also hit the depth limit. 1876 * 1877 * We cannot return < 0 if there are no dependencies or there might 1878 * not be anything to wakeup (ip). 1879 */ 1880 if (depth == 20 && TAILQ_FIRST(&ip->target_list)) { 1881 if (hammer_debug_general & 0x10000) 1882 hkrateprintf(&hammer_gen_krate, 1883 "Warning: depth limit reached on " 1884 "setup recursion, inode %p %016llx\n", 1885 ip, (long long)ip->obj_id); 1886 return(-2); 1887 } 1888 1889 /* 1890 * Scan dependencies 1891 */ 1892 good = 0; 1893 TAILQ_FOREACH(depend, &ip->target_list, target_entry) { 1894 r = hammer_setup_parent_inodes_helper(depend, depth, flg); 1895 KKASSERT(depend->target_ip == ip); 1896 if (r < 0 && good == 0) 1897 good = -1; 1898 if (r > 0) 1899 good = 1; 1900 1901 /* 1902 * If we failed due to the recursion depth limit then stop 1903 * now. 1904 */ 1905 if (r == -2) 1906 break; 1907 } 1908 return(good); 1909 } 1910 1911 /* 1912 * This helper function takes a record representing the dependancy between 1913 * the parent inode and child inode. 1914 * 1915 * record = record in question (*rec in below) 1916 * record->ip = parent inode (*pip in below) 1917 * record->target_ip = child inode (*ip in below) 1918 * 1919 * *pip--------------\ 1920 * ^ \rec_tree 1921 * \ \ 1922 * \ip /\\\\\ rbtree of recs from parent inode's view 1923 * \ //\\\\\\ 1924 * \ / ........ 1925 * \ / 1926 * \------*rec------target_ip------>*ip 1927 * ...target_entry<----...----->target_list<---... 1928 * list of recs from inode's view 1929 * 1930 * We are asked to recurse upwards and convert the record from SETUP 1931 * to FLUSH if possible. 1932 * 1933 * Return 1 if the record gives us connectivity 1934 * 1935 * Return 0 if the record is not relevant 1936 * 1937 * Return -1 if we can't resolve the dependancy and there is no connectivity. 1938 */ 1939 static int 1940 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth, 1941 hammer_flush_group_t flg) 1942 { 1943 hammer_inode_t pip; 1944 int good; 1945 1946 KKASSERT(record->flush_state != HAMMER_FST_IDLE); 1947 pip = record->ip; 1948 1949 /* 1950 * If the record is already flushing, is it in our flush group? 1951 * 1952 * If it is in our flush group but it is a general record or a 1953 * delete-on-disk, it does not improve our connectivity (return 0), 1954 * and if the target inode is not trying to destroy itself we can't 1955 * allow the operation yet anyway (the second return -1). 
1956 */ 1957 if (record->flush_state == HAMMER_FST_FLUSH) { 1958 /* 1959 * If not in our flush group ask the parent to reflush 1960 * us as soon as possible. 1961 */ 1962 if (record->flush_group != flg) { 1963 pip->flags |= HAMMER_INODE_REFLUSH; 1964 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN; 1965 return(-1); 1966 } 1967 1968 /* 1969 * If in our flush group everything is already set up, 1970 * just return whether the record will improve our 1971 * visibility or not. 1972 */ 1973 if (record->type == HAMMER_MEM_RECORD_ADD) 1974 return(1); 1975 return(0); 1976 } 1977 1978 /* 1979 * It must be a setup record. Try to resolve the setup dependancies 1980 * by recursing upwards so we can place ip on the flush list. 1981 * 1982 * Limit ourselves to 20 levels of recursion to avoid blowing out 1983 * the kernel stack. If we hit the recursion limit we can't flush 1984 * until the parent flushes. The parent will flush independantly 1985 * on its own and ultimately a deep recursion will be resolved. 1986 */ 1987 KKASSERT(record->flush_state == HAMMER_FST_SETUP); 1988 1989 good = hammer_setup_parent_inodes(pip, depth + 1, flg); 1990 1991 /* 1992 * If good < 0 the parent has no connectivity and we cannot safely 1993 * flush the directory entry, which also means we can't flush our 1994 * ip. Flag us for downward recursion once the parent's 1995 * connectivity is resolved. Flag the parent for [re]flush or it 1996 * may not check for downward recursions. 1997 */ 1998 if (good < 0) { 1999 pip->flags |= HAMMER_INODE_REFLUSH; 2000 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN; 2001 return(good); 2002 } 2003 2004 /* 2005 * We are go, place the parent inode in a flushing state so we can 2006 * place its record in a flushing state. Note that the parent 2007 * may already be flushing. The record must be in the same flush 2008 * group as the parent. 2009 */ 2010 if (pip->flush_state != HAMMER_FST_FLUSH) 2011 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION); 2012 KKASSERT(pip->flush_state == HAMMER_FST_FLUSH); 2013 2014 /* 2015 * It is possible for a rename to create a loop in the recursion 2016 * and revisit a record. This will result in the record being 2017 * placed in a flush state unexpectedly. This check deals with 2018 * the case. 2019 */ 2020 if (record->flush_state == HAMMER_FST_FLUSH) { 2021 if (record->type == HAMMER_MEM_RECORD_ADD) 2022 return(1); 2023 return(0); 2024 } 2025 2026 KKASSERT(record->flush_state == HAMMER_FST_SETUP); 2027 2028 #if 0 2029 if (record->type == HAMMER_MEM_RECORD_DEL && 2030 (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) { 2031 /* 2032 * Regardless of flushing state we cannot sync this path if the 2033 * record represents a delete-on-disk but the target inode 2034 * is not ready to sync its own deletion. 2035 * 2036 * XXX need to count effective nlinks to determine whether 2037 * the flush is ok, otherwise removing a hardlink will 2038 * just leave the DEL record to rot. 2039 */ 2040 record->target_ip->flags |= HAMMER_INODE_REFLUSH; 2041 return(-1); 2042 } else 2043 #endif 2044 if (pip->flush_group == flg) { 2045 /* 2046 * Because we have not calculated nlinks yet we can just 2047 * set records to the flush state if the parent is in 2048 * the same flush group as we are. 2049 */ 2050 record->flush_state = HAMMER_FST_FLUSH; 2051 record->flush_group = flg; 2052 ++record->flush_group->refs; 2053 hammer_ref(&record->lock); 2054 2055 /* 2056 * A general directory-add contributes to our visibility. 
2057 * 2058 * Otherwise it is probably a directory-delete or 2059 * delete-on-disk record and does not contribute to our 2060 * visbility (but we can still flush it). 2061 */ 2062 if (record->type == HAMMER_MEM_RECORD_ADD) 2063 return(1); 2064 return(0); 2065 } else { 2066 /* 2067 * If the parent is not in our flush group we cannot 2068 * flush this record yet, there is no visibility. 2069 * We tell the parent to reflush and mark ourselves 2070 * so the parent knows it should flush us too. 2071 */ 2072 pip->flags |= HAMMER_INODE_REFLUSH; 2073 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN; 2074 return(-1); 2075 } 2076 } 2077 2078 /* 2079 * This is the core routine placing an inode into the FST_FLUSH state. 2080 */ 2081 static void 2082 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags) 2083 { 2084 hammer_mount_t hmp = ip->hmp; 2085 int go_count; 2086 2087 /* 2088 * Set flush state and prevent the flusher from cycling into 2089 * the next flush group. Do not place the ip on the list yet. 2090 * Inodes not in the idle state get an extra reference. 2091 */ 2092 KKASSERT(ip->flush_state != HAMMER_FST_FLUSH); 2093 if (ip->flush_state == HAMMER_FST_IDLE) 2094 hammer_ref(&ip->lock); 2095 ip->flush_state = HAMMER_FST_FLUSH; 2096 ip->flush_group = flg; 2097 ++hmp->flusher.group_lock; 2098 ++hmp->count_iqueued; 2099 ++hammer_count_iqueued; 2100 ++flg->total_count; 2101 hammer_redo_fifo_start_flush(ip); 2102 2103 #if 0 2104 /* 2105 * We need to be able to vfsync/truncate from the backend. 2106 * 2107 * XXX Any truncation from the backend will acquire the vnode 2108 * independently. 2109 */ 2110 KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0); 2111 if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) { 2112 ip->flags |= HAMMER_INODE_VHELD; 2113 vref(ip->vp); 2114 } 2115 #endif 2116 2117 /* 2118 * Figure out how many in-memory records we can actually flush 2119 * (not including inode meta-data, buffers, etc). 2120 */ 2121 KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0); 2122 if (flags & HAMMER_FLUSH_RECURSION) { 2123 /* 2124 * If this is a upwards recursion we do not want to 2125 * recurse down again! 2126 */ 2127 go_count = 1; 2128 #if 0 2129 } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) { 2130 /* 2131 * No new records are added if we must complete a flush 2132 * from a previous cycle, but we do have to move the records 2133 * from the previous cycle to the current one. 2134 */ 2135 #if 0 2136 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, 2137 hammer_syncgrp_child_callback, NULL); 2138 #endif 2139 go_count = 1; 2140 #endif 2141 } else { 2142 /* 2143 * Normal flush, scan records and bring them into the flush. 2144 * Directory adds and deletes are usually skipped (they are 2145 * grouped with the related inode rather then with the 2146 * directory). 2147 * 2148 * go_count can be negative, which means the scan aborted 2149 * due to the flush group being over-full and we should 2150 * flush what we have. 2151 */ 2152 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, 2153 hammer_setup_child_callback, NULL); 2154 } 2155 2156 /* 2157 * This is a more involved test that includes go_count. If we 2158 * can't flush, flag the inode and return. If go_count is 0 we 2159 * were are unable to flush any records in our rec_tree and 2160 * must ignore the XDIRTY flag. 
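	 * In that case the inode is knocked back to SETUP, the iqueued and
	 * flush group counters are rolled back, and REFLUSH (plus RESIGNAL
	 * for signaled callers) is left set so the inode gets picked up
	 * again later.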
2161 */ 2162 if (go_count == 0) { 2163 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) { 2164 --hmp->count_iqueued; 2165 --hammer_count_iqueued; 2166 2167 --flg->total_count; 2168 ip->flush_state = HAMMER_FST_SETUP; 2169 ip->flush_group = NULL; 2170 if (flags & HAMMER_FLUSH_SIGNAL) { 2171 ip->flags |= HAMMER_INODE_REFLUSH | 2172 HAMMER_INODE_RESIGNAL; 2173 } else { 2174 ip->flags |= HAMMER_INODE_REFLUSH; 2175 } 2176 #if 0 2177 if (ip->flags & HAMMER_INODE_VHELD) { 2178 ip->flags &= ~HAMMER_INODE_VHELD; 2179 vrele(ip->vp); 2180 } 2181 #endif 2182 2183 /* 2184 * REFLUSH is needed to trigger dependancy wakeups 2185 * when an inode is in SETUP. 2186 */ 2187 ip->flags |= HAMMER_INODE_REFLUSH; 2188 if (--hmp->flusher.group_lock == 0) 2189 wakeup(&hmp->flusher.group_lock); 2190 return; 2191 } 2192 } 2193 2194 /* 2195 * Snapshot the state of the inode for the backend flusher. 2196 * 2197 * We continue to retain save_trunc_off even when all truncations 2198 * have been resolved as an optimization to determine if we can 2199 * skip the B-Tree lookup for overwrite deletions. 2200 * 2201 * NOTE: The DELETING flag is a mod flag, but it is also sticky, 2202 * and stays in ip->flags. Once set, it stays set until the 2203 * inode is destroyed. 2204 */ 2205 if (ip->flags & HAMMER_INODE_TRUNCATED) { 2206 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0); 2207 ip->sync_trunc_off = ip->trunc_off; 2208 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; 2209 ip->flags &= ~HAMMER_INODE_TRUNCATED; 2210 ip->sync_flags |= HAMMER_INODE_TRUNCATED; 2211 2212 /* 2213 * The save_trunc_off used to cache whether the B-Tree 2214 * holds any records past that point is not used until 2215 * after the truncation has succeeded, so we can safely 2216 * set it now. 2217 */ 2218 if (ip->save_trunc_off > ip->sync_trunc_off) 2219 ip->save_trunc_off = ip->sync_trunc_off; 2220 } 2221 ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK & 2222 ~HAMMER_INODE_TRUNCATED); 2223 ip->sync_ino_leaf = ip->ino_leaf; 2224 ip->sync_ino_data = ip->ino_data; 2225 ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED; 2226 2227 /* 2228 * The flusher list inherits our inode and reference. 2229 */ 2230 KKASSERT(flg->running == 0); 2231 RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip); 2232 if (--hmp->flusher.group_lock == 0) 2233 wakeup(&hmp->flusher.group_lock); 2234 2235 /* 2236 * Auto-flush the group if it grows too large. Make sure the 2237 * inode reclaim wait pipeline continues to work. 2238 */ 2239 if (flg->total_count >= hammer_autoflush || 2240 flg->total_count >= hammer_limit_reclaims / 4) { 2241 if (hmp->fill_flush_group == flg) 2242 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry); 2243 hammer_flusher_async(hmp, flg); 2244 } 2245 } 2246 2247 /* 2248 * Callback for scan of ip->rec_tree. Try to include each record in our 2249 * flush. ip->flush_group has been set but the inode has not yet been 2250 * moved into a flushing state. 2251 * 2252 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on 2253 * both inodes. 2254 * 2255 * We return 1 for any record placed or found in FST_FLUSH, which prevents 2256 * the caller from shortcutting the flush. 2257 */ 2258 static int 2259 hammer_setup_child_callback(hammer_record_t rec, void *data) 2260 { 2261 hammer_flush_group_t flg; 2262 hammer_inode_t target_ip; 2263 hammer_inode_t ip; 2264 int r; 2265 2266 /* 2267 * Records deleted or committed by the backend are ignored. 2268 * Note that the flush detects deleted frontend records at 2269 * multiple points to deal with races. 
This is just the first 2270 * line of defense. The only time HAMMER_RECF_DELETED_FE cannot 2271 * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it 2272 * messes up link-count calculations. 2273 * 2274 * NOTE: Don't get confused between record deletion and, say, 2275 * directory entry deletion. The deletion of a directory entry 2276 * which is on-media has nothing to do with the record deletion 2277 * flags. 2278 */ 2279 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 2280 HAMMER_RECF_COMMITTED)) { 2281 if (rec->flush_state == HAMMER_FST_FLUSH) { 2282 KKASSERT(rec->flush_group == rec->ip->flush_group); 2283 r = 1; 2284 } else { 2285 r = 0; 2286 } 2287 return(r); 2288 } 2289 2290 /* 2291 * If the record is in an idle state it has no dependancies and 2292 * can be flushed. 2293 */ 2294 ip = rec->ip; 2295 flg = ip->flush_group; 2296 r = 0; 2297 2298 switch(rec->flush_state) { 2299 case HAMMER_FST_IDLE: 2300 /* 2301 * The record has no setup dependancy, we can flush it. 2302 */ 2303 KKASSERT(rec->target_ip == NULL); 2304 rec->flush_state = HAMMER_FST_FLUSH; 2305 rec->flush_group = flg; 2306 ++flg->refs; 2307 hammer_ref(&rec->lock); 2308 r = 1; 2309 break; 2310 case HAMMER_FST_SETUP: 2311 /* 2312 * The record has a setup dependancy. These are typically 2313 * directory entry adds and deletes. Such entries will be 2314 * flushed when their inodes are flushed so we do not 2315 * usually have to add them to the flush here. However, 2316 * if the target_ip has set HAMMER_INODE_CONN_DOWN then 2317 * it is asking us to flush this record (and it). 2318 */ 2319 target_ip = rec->target_ip; 2320 KKASSERT(target_ip != NULL); 2321 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE); 2322 2323 /* 2324 * If the target IP is already flushing in our group 2325 * we could associate the record, but target_ip has 2326 * already synced ino_data to sync_ino_data and we 2327 * would also have to adjust nlinks. Plus there are 2328 * ordering issues for adds and deletes. 2329 * 2330 * Reflush downward if this is an ADD, and upward if 2331 * this is a DEL. 2332 */ 2333 if (target_ip->flush_state == HAMMER_FST_FLUSH) { 2334 if (rec->type == HAMMER_MEM_RECORD_ADD) 2335 ip->flags |= HAMMER_INODE_REFLUSH; 2336 else 2337 target_ip->flags |= HAMMER_INODE_REFLUSH; 2338 break; 2339 } 2340 2341 /* 2342 * Target IP is not yet flushing. This can get complex 2343 * because we have to be careful about the recursion. 2344 * 2345 * Directories create an issue for us in that if a flush 2346 * of a directory is requested the expectation is to flush 2347 * any pending directory entries, but this will cause the 2348 * related inodes to recursively flush as well. We can't 2349 * really defer the operation so just get as many as we 2350 * can and 2351 */ 2352 #if 0 2353 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 && 2354 (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) { 2355 /* 2356 * We aren't reclaiming and the target ip was not 2357 * previously prevented from flushing due to this 2358 * record dependancy. Do not flush this record. 2359 */ 2360 /*r = 0;*/ 2361 } else 2362 #endif 2363 if (flg->total_count + flg->refs > 2364 ip->hmp->undo_rec_limit) { 2365 /* 2366 * Our flush group is over-full and we risk blowing 2367 * out the UNDO FIFO. Stop the scan, flush what we 2368 * have, then reflush the directory. 2369 * 2370 * The directory may be forced through multiple 2371 * flush groups before it can be completely 2372 * flushed. 
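		 * Returning -1 aborts the RB_SCAN, leaving go_count
		 * negative so hammer_flush_inode_core() flushes whatever
		 * was collected before the abort.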
2373 */ 2374 ip->flags |= HAMMER_INODE_RESIGNAL | 2375 HAMMER_INODE_REFLUSH; 2376 r = -1; 2377 } else if (rec->type == HAMMER_MEM_RECORD_ADD) { 2378 /* 2379 * If the target IP is not flushing we can force 2380 * it to flush, even if it is unable to write out 2381 * any of its own records we have at least one in 2382 * hand that we CAN deal with. 2383 */ 2384 rec->flush_state = HAMMER_FST_FLUSH; 2385 rec->flush_group = flg; 2386 ++flg->refs; 2387 hammer_ref(&rec->lock); 2388 hammer_flush_inode_core(target_ip, flg, 2389 HAMMER_FLUSH_RECURSION); 2390 r = 1; 2391 } else { 2392 /* 2393 * General or delete-on-disk record. 2394 * 2395 * XXX this needs help. If a delete-on-disk we could 2396 * disconnect the target. If the target has its own 2397 * dependancies they really need to be flushed. 2398 * 2399 * XXX 2400 */ 2401 rec->flush_state = HAMMER_FST_FLUSH; 2402 rec->flush_group = flg; 2403 ++flg->refs; 2404 hammer_ref(&rec->lock); 2405 hammer_flush_inode_core(target_ip, flg, 2406 HAMMER_FLUSH_RECURSION); 2407 r = 1; 2408 } 2409 break; 2410 case HAMMER_FST_FLUSH: 2411 /* 2412 * The record could be part of a previous flush group if the 2413 * inode is a directory (the record being a directory entry). 2414 * Once the flush group was closed a hammer_test_inode() 2415 * function can cause a new flush group to be setup, placing 2416 * the directory inode itself in a new flush group. 2417 * 2418 * When associated with a previous flush group we count it 2419 * as if it were in our current flush group, since it will 2420 * effectively be flushed by the time we flush our current 2421 * flush group. 2422 */ 2423 KKASSERT( 2424 rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY || 2425 rec->flush_group == flg); 2426 r = 1; 2427 break; 2428 } 2429 return(r); 2430 } 2431 2432 #if 0 2433 /* 2434 * This version just moves records already in a flush state to the new 2435 * flush group and that is it. 2436 */ 2437 static int 2438 hammer_syncgrp_child_callback(hammer_record_t rec, void *data) 2439 { 2440 hammer_inode_t ip = rec->ip; 2441 2442 switch(rec->flush_state) { 2443 case HAMMER_FST_FLUSH: 2444 KKASSERT(rec->flush_group == ip->flush_group); 2445 break; 2446 default: 2447 break; 2448 } 2449 return(0); 2450 } 2451 #endif 2452 2453 /* 2454 * Wait for a previously queued flush to complete. 2455 * 2456 * If a critical error occured we don't try to wait. 2457 */ 2458 void 2459 hammer_wait_inode(hammer_inode_t ip) 2460 { 2461 /* 2462 * The inode can be in a SETUP state in which case RESIGNAL 2463 * should be set. If RESIGNAL is not set then the previous 2464 * flush completed and a later operation placed the inode 2465 * in a passive setup state again, so we're done. 2466 * 2467 * The inode can be in a FLUSH state in which case we 2468 * can just wait for completion. 2469 */ 2470 while (ip->flush_state == HAMMER_FST_FLUSH || 2471 (ip->flush_state == HAMMER_FST_SETUP && 2472 (ip->flags & HAMMER_INODE_RESIGNAL))) { 2473 /* 2474 * Don't try to flush on a critical error 2475 */ 2476 if (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) 2477 break; 2478 2479 /* 2480 * If the inode was already being flushed its flg 2481 * may not have been queued to the backend. We 2482 * have to make sure it gets queued or we can wind 2483 * up blocked or deadlocked (particularly if we are 2484 * the vnlru thread). 
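		 * If the group is still open we kick the flusher and
		 * retest, otherwise we set FLUSHW and sleep on ip->flags
		 * until hammer_flush_inode_done() wakes us up.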
2485 */ 2486 if (ip->flush_state == HAMMER_FST_FLUSH) { 2487 KKASSERT(ip->flush_group); 2488 if (ip->flush_group->closed == 0) { 2489 if (hammer_debug_inode) { 2490 hkprintf("debug: forcing " 2491 "async flush ip %016jx\n", 2492 (intmax_t)ip->obj_id); 2493 } 2494 hammer_flusher_async(ip->hmp, ip->flush_group); 2495 continue; /* retest */ 2496 } 2497 } 2498 2499 /* 2500 * In a flush state with the flg queued to the backend 2501 * or in a setup state with RESIGNAL set, we can safely 2502 * wait. 2503 */ 2504 ip->flags |= HAMMER_INODE_FLUSHW; 2505 tsleep(&ip->flags, 0, "hmrwin", 0); 2506 } 2507 2508 #if 0 2509 /* 2510 * The inode may have been in a passive setup state, 2511 * call flush to make sure we get signaled. 2512 */ 2513 if (ip->flush_state == HAMMER_FST_SETUP) 2514 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 2515 #endif 2516 2517 } 2518 2519 /* 2520 * Called by the backend code when a flush has been completed. 2521 * The inode has already been removed from the flush list. 2522 * 2523 * A pipelined flush can occur, in which case we must re-enter the 2524 * inode on the list and re-copy its fields. 2525 */ 2526 void 2527 hammer_flush_inode_done(hammer_inode_t ip, int error) 2528 { 2529 hammer_mount_t hmp; 2530 int dorel; 2531 2532 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH); 2533 2534 hmp = ip->hmp; 2535 2536 /* 2537 * Auto-reflush if the backend could not completely flush 2538 * the inode. This fixes a case where a deferred buffer flush 2539 * could cause fsync to return early. 2540 */ 2541 if (ip->sync_flags & HAMMER_INODE_MODMASK) 2542 ip->flags |= HAMMER_INODE_REFLUSH; 2543 2544 /* 2545 * Merge left-over flags back into the frontend and fix the state. 2546 * Incomplete truncations are retained by the backend. 2547 */ 2548 ip->error = error; 2549 ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED; 2550 ip->sync_flags &= HAMMER_INODE_TRUNCATED; 2551 2552 /* 2553 * The backend may have adjusted nlinks, so if the adjusted nlinks 2554 * does not match the fronttend set the frontend's DDIRTY flag again. 2555 */ 2556 if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks) 2557 ip->flags |= HAMMER_INODE_DDIRTY; 2558 2559 /* 2560 * Fix up the dirty buffer status. 2561 */ 2562 if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) { 2563 ip->flags |= HAMMER_INODE_BUFS; 2564 } 2565 hammer_redo_fifo_end_flush(ip); 2566 2567 /* 2568 * Re-set the XDIRTY flag if some of the inode's in-memory records 2569 * could not be flushed. 2570 */ 2571 KKASSERT((RB_EMPTY(&ip->rec_tree) && 2572 (ip->flags & HAMMER_INODE_XDIRTY) == 0) || 2573 (!RB_EMPTY(&ip->rec_tree) && 2574 (ip->flags & HAMMER_INODE_XDIRTY) != 0)); 2575 2576 /* 2577 * Do not lose track of inodes which no longer have vnode 2578 * assocations, otherwise they may never get flushed again. 2579 * 2580 * The reflush flag can be set superfluously, causing extra pain 2581 * for no reason. If the inode is no longer modified it no longer 2582 * needs to be flushed. 2583 */ 2584 if (ip->flags & HAMMER_INODE_MODMASK) { 2585 if (ip->vp == NULL) 2586 ip->flags |= HAMMER_INODE_REFLUSH; 2587 } else { 2588 ip->flags &= ~HAMMER_INODE_REFLUSH; 2589 } 2590 2591 /* 2592 * The fs token is held but the inode lock is not held. Because this 2593 * is a backend flush it is possible that the vnode has no references 2594 * and cause a reclaim race inside vsetisdirty() if/when it blocks. 2595 * 2596 * Therefore, we must lock the inode around this particular dirtying 2597 * operation. 
We don't have to around other dirtying operations 2598 * where the vnode is implicitly or explicitly held. 2599 */ 2600 if (ip->flags & HAMMER_INODE_MODMASK) { 2601 hammer_lock_ex(&ip->lock); 2602 hammer_inode_dirty(ip); 2603 hammer_unlock(&ip->lock); 2604 } 2605 2606 /* 2607 * Adjust the flush state. 2608 */ 2609 if (ip->flags & HAMMER_INODE_WOULDBLOCK) { 2610 /* 2611 * We were unable to flush out all our records, leave the 2612 * inode in a flush state and in the current flush group. 2613 * The flush group will be re-run. 2614 * 2615 * This occurs if the UNDO block gets too full or there is 2616 * too much dirty meta-data and allows the flusher to 2617 * finalize the UNDO block and then re-flush. 2618 */ 2619 ip->flags &= ~HAMMER_INODE_WOULDBLOCK; 2620 dorel = 0; 2621 } else { 2622 /* 2623 * Remove from the flush_group 2624 */ 2625 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip); 2626 ip->flush_group = NULL; 2627 2628 #if 0 2629 /* 2630 * Clean up the vnode ref and tracking counts. 2631 */ 2632 if (ip->flags & HAMMER_INODE_VHELD) { 2633 ip->flags &= ~HAMMER_INODE_VHELD; 2634 vrele(ip->vp); 2635 } 2636 #endif 2637 --hmp->count_iqueued; 2638 --hammer_count_iqueued; 2639 2640 /* 2641 * And adjust the state. 2642 */ 2643 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) { 2644 ip->flush_state = HAMMER_FST_IDLE; 2645 dorel = 1; 2646 } else { 2647 ip->flush_state = HAMMER_FST_SETUP; 2648 dorel = 0; 2649 } 2650 2651 /* 2652 * If the frontend is waiting for a flush to complete, 2653 * wake it up. 2654 */ 2655 if (ip->flags & HAMMER_INODE_FLUSHW) { 2656 ip->flags &= ~HAMMER_INODE_FLUSHW; 2657 wakeup(&ip->flags); 2658 } 2659 2660 /* 2661 * If the frontend made more changes and requested another 2662 * flush, then try to get it running. 2663 * 2664 * Reflushes are aborted when the inode is errored out. 2665 */ 2666 if (ip->flags & HAMMER_INODE_REFLUSH) { 2667 ip->flags &= ~HAMMER_INODE_REFLUSH; 2668 if (ip->flags & HAMMER_INODE_RESIGNAL) { 2669 ip->flags &= ~HAMMER_INODE_RESIGNAL; 2670 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 2671 } else { 2672 hammer_flush_inode(ip, 0); 2673 } 2674 } 2675 } 2676 2677 /* 2678 * If we have no parent dependancies we can clear CONN_DOWN 2679 */ 2680 if (TAILQ_EMPTY(&ip->target_list)) 2681 ip->flags &= ~HAMMER_INODE_CONN_DOWN; 2682 2683 /* 2684 * If the inode is now clean drop the space reservation. 2685 */ 2686 if ((ip->flags & HAMMER_INODE_MODMASK) == 0 && 2687 (ip->flags & HAMMER_INODE_RSV_INODES)) { 2688 ip->flags &= ~HAMMER_INODE_RSV_INODES; 2689 --hmp->rsv_inodes; 2690 } 2691 2692 ip->flags &= ~HAMMER_INODE_SLAVEFLUSH; 2693 2694 if (dorel) 2695 hammer_rel_inode(ip, 0); 2696 } 2697 2698 /* 2699 * Called from hammer_sync_inode() to synchronize in-memory records 2700 * to the media. 2701 */ 2702 static int 2703 hammer_sync_record_callback(hammer_record_t record, void *data) 2704 { 2705 hammer_cursor_t cursor = data; 2706 hammer_transaction_t trans = cursor->trans; 2707 hammer_mount_t hmp = trans->hmp; 2708 int error; 2709 2710 /* 2711 * Skip records that do not belong to the current flush. 
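	 * A record in FST_FLUSH whose flush group does not match its
	 * inode's flush group indicates a bookkeeping problem; the check
	 * below just complains and skips the record rather than syncing
	 * it into the wrong group.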
2712 */ 2713 ++hammer_stats_record_iterations; 2714 if (record->flush_state != HAMMER_FST_FLUSH) 2715 return(0); 2716 2717 if (record->flush_group != record->ip->flush_group) { 2718 hdkprintf("rec %p ip %p bad flush group %p %p\n", 2719 record, 2720 record->ip, 2721 record->flush_group, 2722 record->ip->flush_group); 2723 if (hammer_debug_critical) 2724 Debugger("blah2"); 2725 return(0); 2726 } 2727 KKASSERT(record->flush_group == record->ip->flush_group); 2728 2729 /* 2730 * Interlock the record using the BE flag. Once BE is set the 2731 * frontend cannot change the state of FE. 2732 * 2733 * NOTE: If FE is set prior to us setting BE we still sync the 2734 * record out, but the flush completion code converts it to 2735 * a delete-on-disk record instead of destroying it. 2736 */ 2737 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0); 2738 record->flags |= HAMMER_RECF_INTERLOCK_BE; 2739 2740 /* 2741 * The backend has already disposed of the record. 2742 */ 2743 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { 2744 error = 0; 2745 goto done; 2746 } 2747 2748 /* 2749 * If the whole inode is being deleted and all on-disk records will 2750 * be deleted very soon, we can't sync any new records to disk 2751 * because they will be deleted in the same transaction they were 2752 * created in (delete_tid == create_tid), which will assert. 2753 * 2754 * XXX There may be a case with RECORD_ADD with DELETED_FE set 2755 * that we currently panic on. 2756 */ 2757 if (record->ip->sync_flags & HAMMER_INODE_DELETING) { 2758 switch(record->type) { 2759 case HAMMER_MEM_RECORD_DATA: 2760 /* 2761 * We don't have to do anything, if the record was 2762 * committed the space will have been accounted for 2763 * in the blockmap. 2764 */ 2765 /* fall through */ 2766 case HAMMER_MEM_RECORD_GENERAL: 2767 /* 2768 * Set deleted-by-backend flag. Do not set the 2769 * backend committed flag, because we are throwing 2770 * the record away. 2771 */ 2772 record->flags |= HAMMER_RECF_DELETED_BE; 2773 ++record->ip->rec_generation; 2774 error = 0; 2775 goto done; 2776 case HAMMER_MEM_RECORD_ADD: 2777 hpanic("illegal add during inode deletion record %p", 2778 record); 2779 break; /* NOT REACHED */ 2780 case HAMMER_MEM_RECORD_INODE: 2781 hpanic("attempt to sync inode record %p?", record); 2782 break; /* NOT REACHED */ 2783 case HAMMER_MEM_RECORD_DEL: 2784 /* 2785 * Follow through and issue the on-disk deletion 2786 */ 2787 break; 2788 } 2789 } 2790 2791 /* 2792 * If DELETED_FE is set special handling is needed for directory 2793 * entries. Dependant pieces related to the directory entry may 2794 * have already been synced to disk. If this occurs we have to 2795 * sync the directory entry and then change the in-memory record 2796 * from an ADD to a DELETE to cover the fact that it's been 2797 * deleted by the frontend. 2798 * 2799 * A directory delete covering record (MEM_RECORD_DEL) can never 2800 * be deleted by the frontend. 2801 * 2802 * Any other record type (aka DATA) can be deleted by the frontend. 2803 * XXX At the moment the flusher must skip it because there may 2804 * be another data record in the flush group for the same block, 2805 * meaning that some frontend data changes can leak into the backend's 2806 * synchronization point. 2807 */ 2808 if (record->flags & HAMMER_RECF_DELETED_FE) { 2809 if (record->type == HAMMER_MEM_RECORD_ADD) { 2810 /* 2811 * Convert a front-end deleted directory-add to 2812 * a directory-delete entry later. 
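			 * (CONVERT_DELETE tells the flush completion path
			 * to turn this front-end-deleted ADD into a
			 * delete-on-disk record instead of destroying it,
			 * per the FE/BE interlock NOTE above, since the
			 * directory entry is still synced to the media
			 * here.)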
2813 */ 2814 record->flags |= HAMMER_RECF_CONVERT_DELETE; 2815 } else { 2816 /* 2817 * Dispose of the record (race case). Mark as 2818 * deleted by backend (and not committed). 2819 */ 2820 KKASSERT(record->type != HAMMER_MEM_RECORD_DEL); 2821 record->flags |= HAMMER_RECF_DELETED_BE; 2822 ++record->ip->rec_generation; 2823 error = 0; 2824 goto done; 2825 } 2826 } 2827 2828 /* 2829 * Assign the create_tid for new records. Deletions already 2830 * have the record's entire key properly set up. 2831 */ 2832 if (record->type != HAMMER_MEM_RECORD_DEL) { 2833 record->leaf.base.create_tid = trans->tid; 2834 record->leaf.create_ts = trans->time32; 2835 } 2836 2837 /* 2838 * This actually moves the record to the on-media B-Tree. We 2839 * must also generate REDO_TERM entries in the UNDO/REDO FIFO 2840 * indicating that the related REDO_WRITE(s) have been committed. 2841 * 2842 * During recovery any REDO_TERM's within the nominal recovery span 2843 * are ignored since the related meta-data is being undone, causing 2844 * any matching REDO_WRITEs to execute. The REDO_TERMs outside 2845 * the nominal recovery span will match against REDO_WRITEs and 2846 * prevent them from being executed (because the meta-data has 2847 * already been synchronized). 2848 */ 2849 if (record->flags & HAMMER_RECF_REDO) { 2850 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA); 2851 hammer_generate_redo(trans, record->ip, 2852 record->leaf.base.key - 2853 record->leaf.data_len, 2854 HAMMER_REDO_TERM_WRITE, 2855 NULL, 2856 record->leaf.data_len); 2857 } 2858 2859 for (;;) { 2860 error = hammer_ip_sync_record_cursor(cursor, record); 2861 if (error != EDEADLK) 2862 break; 2863 hammer_done_cursor(cursor); 2864 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0], 2865 record->ip); 2866 if (error) 2867 break; 2868 } 2869 record->flags &= ~HAMMER_RECF_CONVERT_DELETE; 2870 2871 if (error) 2872 error = -error; 2873 done: 2874 hammer_flush_record_done(record, error); 2875 2876 /* 2877 * Do partial finalization if we have built up too many dirty 2878 * buffers. Otherwise a buffer cache deadlock can occur when 2879 * doing things like creating tens of thousands of tiny files. 2880 * 2881 * We must release our cursor lock to avoid a 3-way deadlock 2882 * due to the exclusive sync lock the finalizer must get. 2883 * 2884 * WARNING: See warnings in hammer_unlock_cursor() function. 2885 */ 2886 if (hammer_flusher_meta_limit(hmp) || 2887 vm_page_count_severe()) { 2888 hammer_unlock_cursor(cursor); 2889 hammer_flusher_finalize(trans, 0); 2890 hammer_lock_cursor(cursor); 2891 } 2892 return(error); 2893 } 2894 2895 /* 2896 * Backend function called by the flusher to sync an inode to media. 2897 */ 2898 int 2899 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip) 2900 { 2901 struct hammer_cursor cursor; 2902 hammer_node_t tmp_node; 2903 hammer_record_t depend; 2904 hammer_record_t next; 2905 int error, tmp_error; 2906 uint64_t nlinks; 2907 2908 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0) 2909 return(0); 2910 2911 error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip); 2912 if (error) 2913 goto done; 2914 2915 /* 2916 * Any directory records referencing this inode which are not in 2917 * our current flush group must adjust our nlink count for the 2918 * purposes of synchronizating to disk. 2919 * 2920 * Records which are in our flush group can be unlinked from our 2921 * inode now, potentially allowing the inode to be physically 2922 * deleted. 2923 * 2924 * This cannot block. 
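	 * Starting from the frontend's nlinks: an ADD in our flush group
	 * which the frontend already deleted gets its link counted back
	 * in (++nlinks), while live records outside our flush group have
	 * the frontend's adjustment undone (ADD --nlinks, DEL ++nlinks)
	 * so the count we sync matches what this flush actually writes
	 * out.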
2925 */ 2926 nlinks = ip->ino_data.nlinks; 2927 next = TAILQ_FIRST(&ip->target_list); 2928 while ((depend = next) != NULL) { 2929 next = TAILQ_NEXT(depend, target_entry); 2930 if (depend->flush_state == HAMMER_FST_FLUSH && 2931 depend->flush_group == ip->flush_group) { 2932 /* 2933 * If this is an ADD that was deleted by the frontend 2934 * the frontend nlinks count will have already been 2935 * decremented, but the backend is going to sync its 2936 * directory entry and must account for it. The 2937 * record will be converted to a delete-on-disk when 2938 * it gets synced. 2939 * 2940 * If the ADD was not deleted by the frontend we 2941 * can remove the dependancy from our target_list. 2942 */ 2943 if (depend->flags & HAMMER_RECF_DELETED_FE) { 2944 ++nlinks; 2945 } else { 2946 TAILQ_REMOVE(&ip->target_list, depend, 2947 target_entry); 2948 depend->target_ip = NULL; 2949 } 2950 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) { 2951 /* 2952 * Not part of our flush group and not deleted by 2953 * the front-end, adjust the link count synced to 2954 * the media (undo what the frontend did when it 2955 * queued the record). 2956 */ 2957 KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0); 2958 switch(depend->type) { 2959 case HAMMER_MEM_RECORD_ADD: 2960 --nlinks; 2961 break; 2962 case HAMMER_MEM_RECORD_DEL: 2963 ++nlinks; 2964 break; 2965 default: 2966 break; 2967 } 2968 } 2969 } 2970 2971 /* 2972 * Set dirty if we had to modify the link count. 2973 */ 2974 if (ip->sync_ino_data.nlinks != nlinks) { 2975 KKASSERT((int64_t)nlinks >= 0); 2976 ip->sync_ino_data.nlinks = nlinks; 2977 ip->sync_flags |= HAMMER_INODE_DDIRTY; 2978 } 2979 2980 /* 2981 * If there is a trunction queued destroy any data past the (aligned) 2982 * truncation point. Userland will have dealt with the buffer 2983 * containing the truncation point for us. 2984 * 2985 * We don't flush pending frontend data buffers until after we've 2986 * dealt with the truncation. 2987 */ 2988 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2989 /* 2990 * Interlock trunc_off. The VOP front-end may continue to 2991 * make adjustments to it while we are blocked. 2992 */ 2993 off_t trunc_off; 2994 off_t aligned_trunc_off; 2995 int blkmask; 2996 2997 trunc_off = ip->sync_trunc_off; 2998 blkmask = hammer_blocksize(trunc_off) - 1; 2999 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask; 3000 3001 /* 3002 * Delete any whole blocks on-media. The front-end has 3003 * already cleaned out any partial block and made it 3004 * pending. The front-end may have updated trunc_off 3005 * while we were blocked so we only use sync_trunc_off. 3006 * 3007 * This operation can blow out the buffer cache, EWOULDBLOCK 3008 * means we were unable to complete the deletion. The 3009 * deletion will update sync_trunc_off in that case. 3010 */ 3011 error = hammer_ip_delete_range(&cursor, ip, 3012 aligned_trunc_off, 3013 0x7FFFFFFFFFFFFFFFLL, 2); 3014 if (error == EWOULDBLOCK) { 3015 ip->flags |= HAMMER_INODE_WOULDBLOCK; 3016 error = 0; 3017 goto defer_buffer_flush; 3018 } 3019 3020 if (error) 3021 goto done; 3022 3023 /* 3024 * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO. 3025 * 3026 * XXX we do this even if we did not previously generate 3027 * a REDO_TRUNC record. This operation may enclosed the 3028 * range for multiple prior truncation entries in the REDO 3029 * log. 
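		 * The entry is only generated on version-4+ volumes and
		 * only when the inode is flagged RDIRTY. During recovery
		 * it presumably terminates matching REDO_TRUNC entries the
		 * same way REDO_TERM_WRITE terminates REDO_WRITEs (see the
		 * comment in hammer_sync_record_callback()).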
3030 */ 3031 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR && 3032 (ip->flags & HAMMER_INODE_RDIRTY)) { 3033 hammer_generate_redo(trans, ip, aligned_trunc_off, 3034 HAMMER_REDO_TERM_TRUNC, 3035 NULL, 0); 3036 } 3037 3038 /* 3039 * Clear the truncation flag on the backend after we have 3040 * completed the deletions. Backend data is now good again 3041 * (including new records we are about to sync, below). 3042 * 3043 * Leave sync_trunc_off intact. As we write additional 3044 * records the backend will update sync_trunc_off. This 3045 * tells the backend whether it can skip the overwrite 3046 * test. This should work properly even when the backend 3047 * writes full blocks where the truncation point straddles 3048 * the block because the comparison is against the base 3049 * offset of the record. 3050 */ 3051 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED; 3052 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */ 3053 } else { 3054 error = 0; 3055 } 3056 3057 /* 3058 * Now sync related records. These will typically be directory 3059 * entries, records tracking direct-writes, or delete-on-disk records. 3060 */ 3061 if (error == 0) { 3062 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL, 3063 hammer_sync_record_callback, &cursor); 3064 if (tmp_error < 0) 3065 tmp_error = -error; 3066 if (tmp_error) 3067 error = tmp_error; 3068 } 3069 hammer_cache_node(&ip->cache[1], cursor.node); 3070 3071 /* 3072 * Re-seek for inode update, assuming our cache hasn't been ripped 3073 * out from under us. 3074 */ 3075 if (error == 0) { 3076 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error); 3077 if (tmp_node) { 3078 hammer_cursor_downgrade(&cursor); 3079 hammer_lock_sh(&tmp_node->lock); 3080 if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0) 3081 hammer_cursor_seek(&cursor, tmp_node, 0); 3082 hammer_unlock(&tmp_node->lock); 3083 hammer_rel_node(tmp_node); 3084 } 3085 error = 0; 3086 } 3087 3088 /* 3089 * If we are deleting the inode the frontend had better not have 3090 * any active references on elements making up the inode. 3091 * 3092 * The call to hammer_ip_delete_clean() cleans up auxillary records 3093 * but not DB or DATA records. Those must have already been deleted 3094 * by the normal truncation mechanic. 3095 */ 3096 if (error == 0 && ip->sync_ino_data.nlinks == 0 && 3097 RB_EMPTY(&ip->rec_tree) && 3098 (ip->sync_flags & HAMMER_INODE_DELETING) && 3099 (ip->flags & HAMMER_INODE_DELETED) == 0) { 3100 int count1 = 0; 3101 3102 error = hammer_ip_delete_clean(&cursor, ip, &count1); 3103 if (error == 0) { 3104 ip->flags |= HAMMER_INODE_DELETED; 3105 ip->sync_flags &= ~HAMMER_INODE_DELETING; 3106 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED; 3107 KKASSERT(RB_EMPTY(&ip->rec_tree)); 3108 3109 /* 3110 * Set delete_tid in both the frontend and backend 3111 * copy of the inode record. The DELETED flag handles 3112 * this, do not set DDIRTY. 
3113 */ 3114 ip->ino_leaf.base.delete_tid = trans->tid; 3115 ip->sync_ino_leaf.base.delete_tid = trans->tid; 3116 ip->ino_leaf.delete_ts = trans->time32; 3117 ip->sync_ino_leaf.delete_ts = trans->time32; 3118 3119 3120 /* 3121 * Adjust the inode count in the volume header 3122 */ 3123 hammer_sync_lock_sh(trans); 3124 if (ip->flags & HAMMER_INODE_ONDISK) { 3125 hammer_modify_volume_field(trans, 3126 trans->rootvol, 3127 vol0_stat_inodes); 3128 --ip->hmp->rootvol->ondisk->vol0_stat_inodes; 3129 hammer_modify_volume_done(trans->rootvol); 3130 } 3131 hammer_sync_unlock(trans); 3132 } 3133 } 3134 3135 if (error) 3136 goto done; 3137 ip->sync_flags &= ~HAMMER_INODE_BUFS; 3138 3139 defer_buffer_flush: 3140 /* 3141 * Now update the inode's on-disk inode-data and/or on-disk record. 3142 * DELETED and ONDISK are managed only in ip->flags. 3143 * 3144 * In the case of a defered buffer flush we still update the on-disk 3145 * inode to satisfy visibility requirements if there happen to be 3146 * directory dependancies. 3147 */ 3148 switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) { 3149 case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK: 3150 /* 3151 * If deleted and on-disk, don't set any additional flags. 3152 * the delete flag takes care of things. 3153 * 3154 * Clear flags which may have been set by the frontend. 3155 */ 3156 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 3157 HAMMER_INODE_SDIRTY | 3158 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME | 3159 HAMMER_INODE_DELETING); 3160 break; 3161 case HAMMER_INODE_DELETED: 3162 /* 3163 * Take care of the case where a deleted inode was never 3164 * flushed to the disk in the first place. 3165 * 3166 * Clear flags which may have been set by the frontend. 3167 */ 3168 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 3169 HAMMER_INODE_SDIRTY | 3170 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME | 3171 HAMMER_INODE_DELETING); 3172 while (RB_ROOT(&ip->rec_tree)) { 3173 hammer_record_t record = RB_ROOT(&ip->rec_tree); 3174 hammer_ref(&record->lock); 3175 KKASSERT(hammer_oneref(&record->lock)); 3176 record->flags |= HAMMER_RECF_DELETED_BE; 3177 ++record->ip->rec_generation; 3178 hammer_rel_mem_record(record); 3179 } 3180 break; 3181 case HAMMER_INODE_ONDISK: 3182 /* 3183 * If already on-disk, do not set any additional flags. 3184 */ 3185 break; 3186 default: 3187 /* 3188 * If not on-disk and not deleted, set DDIRTY to force 3189 * an initial record to be written. 3190 * 3191 * Also set the create_tid in both the frontend and backend 3192 * copy of the inode record. 3193 */ 3194 ip->ino_leaf.base.create_tid = trans->tid; 3195 ip->ino_leaf.create_ts = trans->time32; 3196 ip->sync_ino_leaf.base.create_tid = trans->tid; 3197 ip->sync_ino_leaf.create_ts = trans->time32; 3198 ip->sync_flags |= HAMMER_INODE_DDIRTY; 3199 break; 3200 } 3201 3202 /* 3203 * If DDIRTY or SDIRTY is set, write out a new record. 3204 * If the inode is already on-disk the old record is marked as 3205 * deleted. 3206 * 3207 * If DELETED is set hammer_update_inode() will delete the existing 3208 * record without writing out a new one. 
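	 * The dispatch below: DELETED inodes go straight to
	 * hammer_update_inode(), inodes with only ATIME/MTIME dirty take
	 * the timestamp-only hammer_update_itimes() path, and any other
	 * dirty state falls through to a full hammer_update_inode().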
3209 */ 3210 if (ip->flags & HAMMER_INODE_DELETED) { 3211 error = hammer_update_inode(&cursor, ip); 3212 } else 3213 if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) && 3214 (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) { 3215 error = hammer_update_itimes(&cursor, ip); 3216 } else 3217 if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY | 3218 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) { 3219 error = hammer_update_inode(&cursor, ip); 3220 } 3221 done: 3222 if (ip->flags & HAMMER_INODE_MODMASK) 3223 hammer_inode_dirty(ip); 3224 if (error) { 3225 hammer_critical_error(ip->hmp, ip, error, 3226 "while syncing inode"); 3227 } 3228 hammer_done_cursor(&cursor); 3229 return(error); 3230 } 3231 3232 /* 3233 * This routine is called when the OS is no longer actively referencing 3234 * the inode (but might still be keeping it cached), or when releasing 3235 * the last reference to an inode. 3236 * 3237 * At this point if the inode's nlinks count is zero we want to destroy 3238 * it, which may mean destroying it on-media too. 3239 */ 3240 void 3241 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp) 3242 { 3243 struct vnode *vp; 3244 3245 /* 3246 * Set the DELETING flag when the link count drops to 0 and the 3247 * OS no longer has any opens on the inode. 3248 * 3249 * The backend will clear DELETING (a mod flag) and set DELETED 3250 * (a state flag) when it is actually able to perform the 3251 * operation. 3252 * 3253 * Don't reflag the deletion if the flusher is currently syncing 3254 * one that was already flagged. A previously set DELETING flag 3255 * may bounce around flags and sync_flags until the operation is 3256 * completely done. 3257 * 3258 * Do not attempt to modify a snapshot inode (one set to read-only). 3259 */ 3260 if (ip->ino_data.nlinks == 0 && 3261 ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) { 3262 ip->flags |= HAMMER_INODE_DELETING; 3263 ip->flags |= HAMMER_INODE_TRUNCATED; 3264 ip->trunc_off = 0; 3265 vp = NULL; 3266 if (getvp) { 3267 if (hammer_get_vnode(ip, &vp) != 0) 3268 return; 3269 } 3270 3271 /* 3272 * Final cleanup 3273 */ 3274 if (ip->vp) 3275 nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0, 0); 3276 if (ip->flags & HAMMER_INODE_MODMASK) 3277 hammer_inode_dirty(ip); 3278 if (getvp) 3279 vput(vp); 3280 } 3281 } 3282 3283 /* 3284 * After potentially resolving a dependancy the inode is tested 3285 * to determine whether it needs to be reflushed. 3286 */ 3287 void 3288 hammer_test_inode(hammer_inode_t ip) 3289 { 3290 if (ip->flags & HAMMER_INODE_REFLUSH) { 3291 ip->flags &= ~HAMMER_INODE_REFLUSH; 3292 hammer_ref(&ip->lock); 3293 if (ip->flags & HAMMER_INODE_RESIGNAL) { 3294 ip->flags &= ~HAMMER_INODE_RESIGNAL; 3295 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 3296 } else { 3297 hammer_flush_inode(ip, 0); 3298 } 3299 hammer_rel_inode(ip, 0); 3300 } 3301 } 3302 3303 /* 3304 * Clear the RECLAIM flag on an inode. This occurs when the inode is 3305 * reassociated with a vp or just before it gets freed. 3306 * 3307 * Pipeline wakeups to threads blocked due to an excessive number of 3308 * detached inodes. This typically occurs when atime updates accumulate 3309 * while scanning a directory tree. 
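 * Each hammer_reclaim entry queued on hmp->reclaim_list represents a
 * thread sleeping in hammer_inode_waitreclaims(). The entry at the
 * head has its count decremented and the thread is woken once the
 * count reaches zero.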
3310 */ 3311 static void 3312 hammer_inode_wakereclaims(hammer_inode_t ip) 3313 { 3314 struct hammer_reclaim *reclaim; 3315 hammer_mount_t hmp = ip->hmp; 3316 3317 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) 3318 return; 3319 3320 --hammer_count_reclaims; 3321 --hmp->count_reclaims; 3322 ip->flags &= ~HAMMER_INODE_RECLAIM; 3323 3324 if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) { 3325 KKASSERT(reclaim->count > 0); 3326 if (--reclaim->count == 0) { 3327 TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry); 3328 wakeup(reclaim); 3329 } 3330 } 3331 } 3332 3333 /* 3334 * Setup our reclaim pipeline. We only let so many detached (and dirty) 3335 * inodes build up before we start blocking. This routine is called 3336 * if a new inode is created or an inode is loaded from media. 3337 * 3338 * When we block we don't care *which* inode has finished reclaiming, 3339 * as long as one does. 3340 * 3341 * The reclaim pipeline is primarily governed by the auto-flush which is 3342 * 1/4 hammer_limit_reclaims. We don't want to block if the count is 3343 * less than 1/2 hammer_limit_reclaims. From 1/2 to full count is 3344 * dynamically governed. 3345 */ 3346 void 3347 hammer_inode_waitreclaims(hammer_transaction_t trans) 3348 { 3349 hammer_mount_t hmp = trans->hmp; 3350 struct hammer_reclaim reclaim; 3351 int lower_limit; 3352 3353 /* 3354 * Track inode load, delay if the number of reclaiming inodes is 3355 * between 2/4 and 4/4 hammer_limit_reclaims, depending. 3356 */ 3357 if (curthread->td_proc) { 3358 struct hammer_inostats *stats; 3359 3360 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid); 3361 ++stats->count; 3362 3363 if (stats->count > hammer_limit_reclaims / 2) 3364 stats->count = hammer_limit_reclaims / 2; 3365 lower_limit = hammer_limit_reclaims - stats->count; 3366 if (hammer_debug_general & 0x10000) { 3367 hdkprintf("pid %5d limit %d\n", 3368 (int)curthread->td_proc->p_pid, lower_limit); 3369 } 3370 } else { 3371 lower_limit = hammer_limit_reclaims * 3 / 4; 3372 } 3373 if (hmp->count_reclaims >= lower_limit) { 3374 reclaim.count = 1; 3375 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry); 3376 tsleep(&reclaim, 0, "hmrrcm", hz); 3377 if (reclaim.count > 0) 3378 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry); 3379 } 3380 } 3381 3382 /* 3383 * Keep track of reclaim statistics on a per-pid basis using a loose 3384 * 4-way set associative hash table. Collisions inherit the count of 3385 * the previous entry. 3386 * 3387 * NOTE: We want to be careful here to limit the chain size. If the chain 3388 * size is too large a pid will spread its stats out over too many 3389 * entries under certain types of heavy filesystem activity and 3390 * wind up not delaying long enough. 3391 */ 3392 static 3393 struct hammer_inostats * 3394 hammer_inode_inostats(hammer_mount_t hmp, pid_t pid) 3395 { 3396 struct hammer_inostats *stats; 3397 int delta; 3398 int chain; 3399 static volatile int iterator; /* we don't care about MP races */ 3400 3401 /* 3402 * Chain up to 4 times to find our entry. 3403 */ 3404 for (chain = 0; chain < 4; ++chain) { 3405 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK]; 3406 if (stats->pid == pid) 3407 break; 3408 } 3409 3410 /* 3411 * Replace one of the four chaining entries with our new entry. 
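	 * The (iterator++ & 3) term rotates through the four
	 * set-associative slots when picking a victim; the evicted slot
	 * keeps its old count, which is the "collisions inherit the
	 * count" behavior noted in the function comment.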
3412 */ 3413 if (chain == 4) { 3414 stats = &hmp->inostats[(pid + (iterator++ & 3)) & 3415 HAMMER_INOSTATS_HMASK]; 3416 stats->pid = pid; 3417 } 3418 3419 /* 3420 * Decay the entry 3421 */ 3422 if (stats->count && stats->ltick != ticks) { 3423 delta = ticks - stats->ltick; 3424 stats->ltick = ticks; 3425 if (delta <= 0 || delta > hz * 60) 3426 stats->count = 0; 3427 else 3428 stats->count = stats->count * hz / (hz + delta); 3429 } 3430 if (hammer_debug_general & 0x10000) 3431 hdkprintf("pid %5d stats %d\n", (int)pid, stats->count); 3432 return (stats); 3433 } 3434 3435 #if 0 3436 3437 /* 3438 * XXX not used, doesn't work very well due to the large batching nature 3439 * of flushes. 3440 * 3441 * A larger than normal backlog of inodes is sitting in the flusher, 3442 * enforce a general slowdown to let it catch up. This routine is only 3443 * called on completion of a non-flusher-related transaction which 3444 * performed B-Tree node I/O. 3445 * 3446 * It is possible for the flusher to stall in a continuous load. 3447 * blogbench -i1000 -o seems to do a good job generating this sort of load. 3448 * If the flusher is unable to catch up the inode count can bloat until 3449 * we run out of kvm. 3450 * 3451 * This is a bit of a hack. 3452 */ 3453 void 3454 hammer_inode_waithard(hammer_mount_t hmp) 3455 { 3456 /* 3457 * Hysteresis. 3458 */ 3459 if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) { 3460 if (hmp->count_reclaims < hammer_limit_reclaims / 2 && 3461 hmp->count_iqueued < hmp->count_inodes / 20) { 3462 hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY; 3463 return; 3464 } 3465 } else { 3466 if (hmp->count_reclaims < hammer_limit_reclaims || 3467 hmp->count_iqueued < hmp->count_inodes / 10) { 3468 return; 3469 } 3470 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY; 3471 } 3472 3473 /* 3474 * Block for one flush cycle. 3475 */ 3476 hammer_flusher_wait_next(hmp); 3477 } 3478 3479 #endif 3480