/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
 */

#include "hammer.h"
#include <vm/vm_extern.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static int	hammer_unload_inode(struct hammer_inode *ip);
static void	hammer_free_inode(hammer_inode_t ip);
static void	hammer_flush_inode_core(hammer_inode_t ip,
					hammer_flush_group_t flg, int flags);
static int	hammer_setup_child_callback(hammer_record_t rec, void *data);
#if 0
static int	hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
#endif
static int	hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
					hammer_flush_group_t flg);
static int	hammer_setup_parent_inodes_helper(hammer_record_t record,
					int depth, hammer_flush_group_t flg);
static void	hammer_inode_wakereclaims(hammer_inode_t ip, int dowake);

#ifdef DEBUG_TRUNCATE
extern struct hammer_inode *HammerTruncIp;
#endif

/*
 * RB-Tree support for inode structures
 */
int
hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
	if (ip1->obj_localization < ip2->obj_localization)
		return(-1);
	if (ip1->obj_localization > ip2->obj_localization)
		return(1);
	if (ip1->obj_id < ip2->obj_id)
		return(-1);
	if (ip1->obj_id > ip2->obj_id)
		return(1);
	if (ip1->obj_asof < ip2->obj_asof)
		return(-1);
	if (ip1->obj_asof > ip2->obj_asof)
		return(1);
	return(0);
}

/*
 * RB-Tree support for inode structures / special LOOKUP_INFO
 */
static int
hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
{
	if (info->obj_localization < ip->obj_localization)
		return(-1);
	if (info->obj_localization > ip->obj_localization)
		return(1);
	if (info->obj_id < ip->obj_id)
		return(-1);
	if (info->obj_id > ip->obj_id)
		return(1);
	if (info->obj_asof < ip->obj_asof)
		return(-1);
	if (info->obj_asof > ip->obj_asof)
		return(1);
	return(0);
}
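
/*
 * Illustrative sketch (not part of the original source, kept inside
 * #if 0): how the LOOKUP_INFO comparator above is typically driven.
 * A caller fills in a struct hammer_inode_info key and performs an
 * RB lookup, exactly as hammer_get_inode() does further below.  The
 * function name is hypothetical.
 */
#if 0
static hammer_inode_t
example_ino_lookup(hammer_mount_t hmp, int64_t obj_id, hammer_tid_t asof,
		   u_int32_t localization)
{
	struct hammer_inode_info iinfo;

	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	iinfo.obj_localization = localization;
	return(hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo));
}
#endif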

/*
 * Used by hammer_scan_inode_snapshots() to locate all of an object's
 * snapshots.  Note that the asof field is not tested, which we can get
 * away with because it is the lowest-priority field.
 */
static int
hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
{
	hammer_inode_info_t info = data;

	if (ip->obj_localization > info->obj_localization)
		return(1);
	if (ip->obj_localization < info->obj_localization)
		return(-1);
	if (ip->obj_id > info->obj_id)
		return(1);
	if (ip->obj_id < info->obj_id)
		return(-1);
	return(0);
}

/*
 * Used by hammer_unload_pseudofs() to locate all inodes associated with
 * a particular PFS.
 */
static int
hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
{
	u_int32_t localization = *(u_int32_t *)data;
	if (ip->obj_localization > localization)
		return(1);
	if (ip->obj_localization < localization)
		return(-1);
	return(0);
}

/*
 * RB-Tree support for pseudofs structures
 */
static int
hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
{
	if (p1->localization < p2->localization)
		return(-1);
	if (p1->localization > p2->localization)
		return(1);
	return(0);
}


RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
		hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
	     hammer_pfs_rb_compare, u_int32_t, localization);

/*
 * The kernel is not actively referencing this vnode but is still holding
 * it cached.
 *
 * This is called from the frontend.
 */
int
hammer_vop_inactive(struct vop_inactive_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);

	/*
	 * Degenerate case
	 */
	if (ip == NULL) {
		vrecycle(ap->a_vp);
		return(0);
	}

	/*
	 * If the inode no longer has visibility in the filesystem try to
	 * recycle it immediately, even if the inode is dirty.  Recycling
	 * it quickly allows the system to reclaim buffer cache and VM
	 * resources which can matter a lot in a heavily loaded system.
	 *
	 * This can deadlock in vfsync() if we aren't careful.
	 *
	 * Do not queue the inode to the flusher if we still have visibility,
	 * otherwise namespace calls such as chmod will unnecessarily generate
	 * multiple inode updates.
	 */
	hammer_inode_unloadable_check(ip, 0);
	if (ip->ino_data.nlinks == 0) {
		if (ip->flags & HAMMER_INODE_MODMASK)
			hammer_flush_inode(ip, 0);
		vrecycle(ap->a_vp);
	}
	return(0);
}

/*
 * Release the vnode association.  This is typically (but not always)
 * the last reference on the inode.
 *
 * Once the association is lost we are on our own with regards to
 * flushing the inode.
 */
int
hammer_vop_reclaim(struct vop_reclaim_args *ap)
{
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct vnode *vp;

	vp = ap->a_vp;

	if ((ip = vp->v_data) != NULL) {
		hmp = ip->hmp;
		vp->v_data = NULL;
		ip->vp = NULL;

		if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
			++hammer_count_reclaiming;
			++hmp->inode_reclaims;
			ip->flags |= HAMMER_INODE_RECLAIM;
		}
		hammer_rel_inode(ip, 1);
	}
	return(0);
}

/*
 * Return a locked vnode for the specified inode.  The inode must be
 * referenced but NOT LOCKED on entry and will remain referenced on
 * return.
 *
 * Called from the frontend.
 */
int
hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
{
	hammer_mount_t hmp;
	struct vnode *vp;
	int error = 0;
	u_int8_t obj_type;

	hmp = ip->hmp;

	for (;;) {
		if ((vp = ip->vp) == NULL) {
			error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
			if (error)
				break;
			hammer_lock_ex(&ip->lock);
			if (ip->vp != NULL) {
				hammer_unlock(&ip->lock);
				vp = *vpp;
				vp->v_type = VBAD;
				vx_put(vp);
				continue;
			}
			hammer_ref(&ip->lock);
			vp = *vpp;
			ip->vp = vp;

			obj_type = ip->ino_data.obj_type;
			vp->v_type = hammer_get_vnode_type(obj_type);

			hammer_inode_wakereclaims(ip, 0);

			switch(ip->ino_data.obj_type) {
			case HAMMER_OBJTYPE_CDEV:
			case HAMMER_OBJTYPE_BDEV:
				vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
				addaliasu(vp, ip->ino_data.rmajor,
					  ip->ino_data.rminor);
				break;
			case HAMMER_OBJTYPE_FIFO:
				vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
				break;
			default:
				break;
			}

			/*
			 * Only mark as the root vnode if the ip is not
			 * historical, otherwise the VFS cache will get
			 * confused.  The other half of the special handling
			 * is in hammer_vop_nlookupdotdot().
			 *
			 * Pseudo-filesystem roots can be accessed via
			 * non-root filesystem paths and setting VROOT may
			 * confuse the namecache.  Set VPFSROOT instead.
			 */
			if (ip->obj_id == HAMMER_OBJID_ROOT &&
			    ip->obj_asof == hmp->asof) {
				if (ip->obj_localization == 0)
					vp->v_flag |= VROOT;
				else
					vp->v_flag |= VPFSROOT;
			}

			vp->v_data = (void *)ip;
			/* vnode locked by getnewvnode() */
			/* make related vnode dirty if inode dirty? */
			hammer_unlock(&ip->lock);
			if (vp->v_type == VREG)
				vinitvmio(vp, ip->ino_data.size);
			break;
		}

		/*
		 * loop if the vget fails (aka races), or if the vp
		 * no longer matches ip->vp.
		 */
		if (vget(vp, LK_EXCLUSIVE) == 0) {
			if (vp == ip->vp)
				break;
			vput(vp);
		}
	}
	*vpp = vp;
	return(error);
}

/*
 * Locate all copies of the inode for obj_id compatible with the specified
 * asof, reference, and issue the related call-back.  This routine is used
 * for direct-io invalidation and does not create any new inodes.
 */
void
hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
			    int (*callback)(hammer_inode_t ip, void *data),
			    void *data)
{
	hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
				   hammer_inode_info_cmp_all_history,
				   callback, iinfo);
}
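
/*
 * Illustrative sketch (not part of the original source): the shape of a
 * callback handed to hammer_scan_inode_snapshots().  The function name
 * and body are hypothetical; as with the other RB_SCAN callbacks in this
 * file, a negative return value aborts the scan.
 */
#if 0
static int
example_snapshot_callback(hammer_inode_t ip, void *data __unused)
{
	/* per-snapshot work (e.g. invalidation) goes here */
	return(0);
}
#endif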

/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 * do not attach or detach the related vnode (use hammer_get_vnode() for
 * that).
 *
 * The flags argument is only applied for newly created inodes, and only
 * certain flags are inherited.
 *
 * Called from the frontend.
 */
struct hammer_inode *
hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
		 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
		 int flags, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	struct hammer_inode_info iinfo;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;


	/*
	 * Determine if we already have an inode cached.  If we do then
	 * we are golden.
	 *
	 * If we find an inode with no vnode we have to mark the
	 * transaction such that hammer_inode_waitreclaims() is
	 * called later on to avoid building up an infinite number
	 * of inodes.  Otherwise we can continue to add new inodes
	 * faster than they can be disposed of, even with the tsleep
	 * delay.
	 *
	 * If we find a dummy inode we return a failure so dounlink
	 * (which does another lookup) doesn't try to mess with the
	 * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
	 * to ref dummy inodes.
	 */
	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	iinfo.obj_localization = localization;
loop:
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	if (ip) {
		if (ip->flags & HAMMER_INODE_DUMMY) {
			*errorp = ENOENT;
			return(NULL);
		}
		hammer_ref(&ip->lock);
		*errorp = 0;
		return(ip);
	}

	/*
	 * Allocate a new inode structure and deal with races later.
	 */
	ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	++hmp->count_inodes;
	ip->obj_id = obj_id;
	ip->obj_asof = iinfo.obj_asof;
	ip->obj_localization = localization;
	ip->hmp = hmp;
	ip->flags = flags & HAMMER_INODE_RO;
	ip->cache[0].ip = ip;
	ip->cache[1].ip = ip;
	if (hmp->ronly)
		ip->flags |= HAMMER_INODE_RO;
	ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
		0x7FFFFFFFFFFFFFFFLL;
	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->target_list);
	hammer_ref(&ip->lock);

	/*
	 * Locate the on-disk inode.  If this is a PFS root we always
	 * access the current version of the root inode and (if it is not
	 * a master) always access information under it with a snapshot
	 * TID.
	 */
retry:
	hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL);
	cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = 0;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor.key_beg.obj_type = 0;

	cursor.asof = iinfo.obj_asof;
	cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
		       HAMMER_CURSOR_ASOF;

	*errorp = hammer_btree_lookup(&cursor);
	if (*errorp == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * On success the B-Tree lookup will hold the appropriate
	 * buffer cache buffers and provide a pointer to the requested
	 * information.  Copy the information to the in-memory inode
	 * and cache the B-Tree node to improve future operations.
	 */
	if (*errorp == 0) {
		ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
		ip->ino_data = cursor.data->inode;

		/*
		 * cache[0] tries to cache the location of the object inode.
		 * The assumption is that it is near the directory inode.
		 *
		 * cache[1] tries to cache the location of the object data.
		 * The assumption is that it is near the directory data.
		 */
		hammer_cache_node(&ip->cache[0], cursor.node);
		if (dip && dip->cache[1].node)
			hammer_cache_node(&ip->cache[1], dip->cache[1].node);

		/*
		 * The file should not contain any data past the file size
		 * stored in the inode.  Setting save_trunc_off to the
		 * file size instead of max reduces B-Tree lookup overheads
		 * on append by allowing the flusher to avoid checking for
		 * record overwrites.
		 */
		ip->save_trunc_off = ip->ino_data.size;

		/*
		 * Locate and assign the pseudofs management structure to
		 * the inode.
		 */
		if (dip && dip->obj_localization == ip->obj_localization) {
			ip->pfsm = dip->pfsm;
			hammer_ref(&ip->pfsm->lock);
		} else {
			ip->pfsm = hammer_load_pseudofs(trans,
							ip->obj_localization,
							errorp);
			*errorp = 0;	/* ignore ENOENT */
		}
	}

	/*
	 * The inode is placed on the red-black tree and will be synced to
	 * the media when flushed or by the filesystem sync.  If this races
	 * another instantiation/lookup the insertion will fail.
	 */
	if (*errorp == 0) {
		if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
			hammer_free_inode(ip);
			hammer_done_cursor(&cursor);
			goto loop;
		}
		ip->flags |= HAMMER_INODE_ONDISK;
	} else {
		if (ip->flags & HAMMER_INODE_RSV_INODES) {
			ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
			--hmp->rsv_inodes;
		}

		hammer_free_inode(ip);
		ip = NULL;
	}
	hammer_done_cursor(&cursor);
	trans->flags |= HAMMER_TRANSF_NEWINODE;
	return (ip);
}

/*
 * Get a dummy inode to placemark a broken directory entry.
 */
struct hammer_inode *
hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
		 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
		 int flags, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	struct hammer_inode_info iinfo;
	struct hammer_inode *ip;

	/*
	 * Determine if we already have an inode cached.  If we do then
	 * we are golden.
	 *
	 * If we find an inode with no vnode we have to mark the
	 * transaction such that hammer_inode_waitreclaims() is
	 * called later on to avoid building up an infinite number
	 * of inodes.  Otherwise we can continue to add new inodes
	 * faster than they can be disposed of, even with the tsleep
	 * delay.
	 *
	 * If we find a non-fake inode we return an error.  Only fake
	 * inodes can be returned by this routine.
	 */
	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	iinfo.obj_localization = localization;
loop:
	*errorp = 0;
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	if (ip) {
		if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
			*errorp = ENOENT;
			return(NULL);
		}
		hammer_ref(&ip->lock);
		return(ip);
	}

	/*
	 * Allocate a new inode structure and deal with races later.
	 */
	ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	++hmp->count_inodes;
	ip->obj_id = obj_id;
	ip->obj_asof = iinfo.obj_asof;
	ip->obj_localization = localization;
	ip->hmp = hmp;
	ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
	ip->cache[0].ip = ip;
	ip->cache[1].ip = ip;
	ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
		0x7FFFFFFFFFFFFFFFLL;
	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->target_list);
	hammer_ref(&ip->lock);

	/*
	 * Populate the dummy inode.  Leave everything zero'd out.
	 *
	 * (ip->ino_leaf and ip->ino_data)
	 *
	 * Make the dummy inode a FIFO object which most copy programs
	 * will properly ignore.
	 */
	ip->save_trunc_off = ip->ino_data.size;
	ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;

	/*
	 * Locate and assign the pseudofs management structure to
	 * the inode.
	 */
	if (dip && dip->obj_localization == ip->obj_localization) {
		ip->pfsm = dip->pfsm;
		hammer_ref(&ip->pfsm->lock);
	} else {
		ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
						errorp);
		*errorp = 0;	/* ignore ENOENT */
	}

	/*
	 * The inode is placed on the red-black tree and will be synced to
	 * the media when flushed or by the filesystem sync.  If this races
	 * another instantiation/lookup the insertion will fail.
	 *
	 * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
	 */
	if (*errorp == 0) {
		if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
			hammer_free_inode(ip);
			goto loop;
		}
	} else {
		if (ip->flags & HAMMER_INODE_RSV_INODES) {
			ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
			--hmp->rsv_inodes;
		}
		hammer_free_inode(ip);
		ip = NULL;
	}
	trans->flags |= HAMMER_TRANSF_NEWINODE;
	return (ip);
}

/*
 * Create a new filesystem object, returning the inode in *ipp.  The
 * returned inode will be referenced.  The inode is created in-memory.
 *
 * If pfsm is non-NULL the caller wishes to create the root inode for
 * a master PFS.
 */
int
hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
		    struct ucred *cred, hammer_inode_t dip,
		    hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
{
	hammer_mount_t hmp;
	hammer_inode_t ip;
	uid_t xuid;
	int error;

	hmp = trans->hmp;

	ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	++hmp->count_inodes;
	trans->flags |= HAMMER_TRANSF_NEWINODE;

	if (pfsm) {
		KKASSERT(pfsm->localization != 0);
		ip->obj_id = HAMMER_OBJID_ROOT;
		ip->obj_localization = pfsm->localization;
	} else {
		KKASSERT(dip != NULL);
		ip->obj_id = hammer_alloc_objid(hmp, dip);
		ip->obj_localization = dip->obj_localization;
	}

	KKASSERT(ip->obj_id != 0);
	ip->obj_asof = hmp->asof;
	ip->hmp = hmp;
	ip->flush_state = HAMMER_FST_IDLE;
	ip->flags = HAMMER_INODE_DDIRTY |
		    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
	ip->cache[0].ip = ip;
	ip->cache[1].ip = ip;

	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	/* ip->save_trunc_off = 0; (already zero) */
	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->target_list);

	ip->ino_data.atime = trans->time;
	ip->ino_data.mtime = trans->time;
	ip->ino_data.size = 0;
	ip->ino_data.nlinks = 0;

	/*
	 * A nohistory designator on the parent directory is inherited by
	 * the child.  We will do this even for pseudo-fs creation... the
	 * sysad can turn it off.
	 */
	if (dip) {
		ip->ino_data.uflags = dip->ino_data.uflags &
				      (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
	}

	ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
	ip->ino_leaf.base.localization = ip->obj_localization +
					 HAMMER_LOCALIZE_INODE;
	ip->ino_leaf.base.obj_id = ip->obj_id;
	ip->ino_leaf.base.key = 0;
	ip->ino_leaf.base.create_tid = 0;
	ip->ino_leaf.base.delete_tid = 0;
	ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
	ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);

	ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
	ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
	ip->ino_data.mode = vap->va_mode;
	ip->ino_data.ctime = trans->time;

	/*
	 * If we are running version 2 or greater we use dirhash algorithm #1
	 * which is semi-sorted.  Algorithm #0 was just a pure crc.
	 */
	if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
		if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
			ip->ino_data.cap_flags |= HAMMER_INODE_CAP_DIRHASH_ALG1;
		}
	}

	/*
	 * Setup the ".." pointer.  This only needs to be done for directories
	 * but we do it for all objects as a recovery aid.
	 */
	if (dip)
		ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
#if 0
	/*
	 * The parent_obj_localization field only applies to pseudo-fs roots.
	 * XXX this is no longer applicable, PFSs are no longer directly
	 * tied into the parent's directory structure.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
	    ip->obj_id == HAMMER_OBJID_ROOT) {
		ip->ino_data.ext.obj.parent_obj_localization =
						dip->obj_localization;
	}
#endif

	switch(ip->ino_leaf.base.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		ip->ino_data.rmajor = vap->va_rmajor;
		ip->ino_data.rminor = vap->va_rminor;
		break;
	default:
		break;
	}

	/*
	 * Calculate default uid/gid and overwrite with information from
	 * the vap.
	 */
	if (dip) {
		xuid = hammer_to_unix_xid(&dip->ino_data.uid);
		xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
					     xuid, cred, &vap->va_mode);
	} else {
		xuid = 0;
	}
	ip->ino_data.mode = vap->va_mode;

	if (vap->va_vaflags & VA_UID_UUID_VALID)
		ip->ino_data.uid = vap->va_uid_uuid;
	else if (vap->va_uid != (uid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
	else
		hammer_guid_to_uuid(&ip->ino_data.uid, xuid);

	if (vap->va_vaflags & VA_GID_UUID_VALID)
		ip->ino_data.gid = vap->va_gid_uuid;
	else if (vap->va_gid != (gid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
	else if (dip)
		ip->ino_data.gid = dip->ino_data.gid;

	hammer_ref(&ip->lock);

	if (pfsm) {
		ip->pfsm = pfsm;
		hammer_ref(&pfsm->lock);
		error = 0;
	} else if (dip->obj_localization == ip->obj_localization) {
		ip->pfsm = dip->pfsm;
		hammer_ref(&ip->pfsm->lock);
		error = 0;
	} else {
		ip->pfsm = hammer_load_pseudofs(trans,
						ip->obj_localization,
						&error);
		error = 0;	/* ignore ENOENT */
	}

	if (error) {
		hammer_free_inode(ip);
		ip = NULL;
	} else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
		panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
		/* not reached */
		hammer_free_inode(ip);
	}
	*ipp = ip;
	return(error);
}

/*
 * Final cleanup / freeing of an inode structure
 */
static void
hammer_free_inode(hammer_inode_t ip)
{
	struct hammer_mount *hmp;

	hmp = ip->hmp;
	KKASSERT(ip->lock.refs == 1);
	hammer_uncache_node(&ip->cache[0]);
	hammer_uncache_node(&ip->cache[1]);
	hammer_inode_wakereclaims(ip, 1);
	if (ip->objid_cache)
		hammer_clear_objid(ip);
	--hammer_count_inodes;
	--hmp->count_inodes;
	if (ip->pfsm) {
		hammer_rel_pseudofs(hmp, ip->pfsm);
		ip->pfsm = NULL;
	}
	kfree(ip, hmp->m_inodes);
	ip = NULL;
}

/*
 * Retrieve pseudo-fs data.  NULL will never be returned.
 *
 * If an error occurs *errorp will be set and a default template is returned,
 * otherwise *errorp is set to 0.  Typically when an error occurs it will
 * be ENOENT.
 */
hammer_pseudofs_inmem_t
hammer_load_pseudofs(hammer_transaction_t trans,
		     u_int32_t localization, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	hammer_inode_t ip;
	hammer_pseudofs_inmem_t pfsm;
	struct hammer_cursor cursor;
	int bytes;

retry:
	pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
	if (pfsm) {
		hammer_ref(&pfsm->lock);
		*errorp = 0;
		return(pfsm);
	}

	/*
	 * PFS records are stored in the root inode (not the PFS root inode,
	 * but the real root).  Avoid an infinite recursion if loading
	 * the PFS for the real root.
	 */
	if (localization) {
		ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
				      HAMMER_MAX_TID,
				      HAMMER_DEF_LOCALIZATION, 0, errorp);
	} else {
		ip = NULL;
	}

	pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
	pfsm->localization = localization;
	pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
	pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;

	hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
	cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = localization;
	cursor.asof = HAMMER_MAX_TID;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	if (ip)
		*errorp = hammer_ip_lookup(&cursor);
	else
		*errorp = hammer_btree_lookup(&cursor);
	if (*errorp == 0) {
		*errorp = hammer_ip_resolve_data(&cursor);
		if (*errorp == 0) {
			if (cursor.data->pfsd.mirror_flags &
			    HAMMER_PFSD_DELETED) {
				*errorp = ENOENT;
			} else {
				bytes = cursor.leaf->data_len;
				if (bytes > sizeof(pfsm->pfsd))
					bytes = sizeof(pfsm->pfsd);
				bcopy(cursor.data, &pfsm->pfsd, bytes);
			}
		}
	}
	hammer_done_cursor(&cursor);

	pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
	hammer_ref(&pfsm->lock);
	if (ip)
		hammer_rel_inode(ip, 0);
	if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
		kfree(pfsm, hmp->m_misc);
		goto retry;
	}
	return(pfsm);
}

/*
 * Store pseudo-fs data.  The backend will automatically delete any prior
 * on-disk pseudo-fs data but we have to delete in-memory versions.
 */
int
hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
{
	struct hammer_cursor cursor;
	hammer_record_t record;
	hammer_inode_t ip;
	int error;

	ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
			      HAMMER_DEF_LOCALIZATION, 0, &error);
retry:
	pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = pfsm->localization;
	cursor.asof = HAMMER_MAX_TID;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	/*
	 * Replace any in-memory version of the record.
	 */
	error = hammer_ip_lookup(&cursor);
	if (error == 0 && hammer_cursor_inmem(&cursor)) {
		record = cursor.iprec;
		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
			KKASSERT(cursor.deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor.deadlk_rec = record;
			error = EDEADLK;
		} else {
			record->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	}

	/*
	 * Allocate replacement general record.  The backend flush will
	 * delete any on-disk version of the record.
	 */
	if (error == 0 || error == ENOENT) {
		record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
		record->type = HAMMER_MEM_RECORD_GENERAL;

		record->leaf.base.localization = ip->obj_localization +
						 HAMMER_LOCALIZE_MISC;
		record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
		record->leaf.base.key = pfsm->localization;
		record->leaf.data_len = sizeof(pfsm->pfsd);
		bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
		error = hammer_ip_add_record(trans, record);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	hammer_rel_inode(ip, 0);
	return(error);
}
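
/*
 * Illustrative sketch (not part of the original source): the typical
 * load/modify/save round trip for the two routines above.  The caller
 * name, the localization value and the pfsd field adjustments are
 * hypothetical.
 */
#if 0
static int
example_update_pfs(hammer_transaction_t trans, u_int32_t localization)
{
	hammer_pseudofs_inmem_t pfsm;
	int error;

	pfsm = hammer_load_pseudofs(trans, localization, &error);
	if (error == 0) {
		/* ... adjust pfsm->pfsd here ... */
		error = hammer_save_pseudofs(trans, pfsm);
	}
	hammer_rel_pseudofs(trans->hmp, pfsm);
	return(error);
}
#endif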

/*
 * Create a root directory for a PFS if one does not already exist.
 *
 * The PFS root stands alone so we must also bump the nlinks count
 * to prevent it from being destroyed on release.
 */
int
hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
		       hammer_pseudofs_inmem_t pfsm)
{
	hammer_inode_t ip;
	struct vattr vap;
	int error;

	ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
			      pfsm->localization, 0, &error);
	if (ip == NULL) {
		vattr_null(&vap);
		vap.va_mode = 0755;
		vap.va_type = VDIR;
		error = hammer_create_inode(trans, &vap, cred, NULL, pfsm, &ip);
		if (error == 0) {
			++ip->ino_data.nlinks;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (ip)
		hammer_rel_inode(ip, 0);
	return(error);
}

/*
 * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
 * if we are unable to disassociate all the inodes.
 */
static
int
hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
{
	int res;

	hammer_ref(&ip->lock);
	if (ip->lock.refs == 2 && ip->vp)
		vclean_unlocked(ip->vp);
	if (ip->lock.refs == 1 && ip->vp == NULL)
		res = 0;
	else
		res = -1;	/* stop, someone is using the inode */
	hammer_rel_inode(ip, 0);
	return(res);
}

int
hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
{
	int res;
	int try;

	for (try = res = 0; try < 4; ++try) {
		res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
						 hammer_inode_pfs_cmp,
						 hammer_unload_pseudofs_callback,
						 &localization);
		if (res == 0 && try > 1)
			break;
		hammer_flusher_sync(trans->hmp);
	}
	if (res != 0)
		res = ENOTEMPTY;
	return(res);
}


/*
 * Release a reference on a PFS
 */
void
hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
{
	hammer_unref(&pfsm->lock);
	if (pfsm->lock.refs == 0) {
		RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
		kfree(pfsm, hmp->m_misc);
	}
}

/*
 * Called by hammer_sync_inode().
 */
static int
hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
{
	hammer_transaction_t trans = cursor->trans;
	hammer_record_t record;
	int error;
	int redirty;

retry:
	error = 0;

	/*
	 * If the inode has a presence on-disk then locate it and mark
	 * it deleted, setting DELONDISK.
	 *
	 * The record may or may not be physically deleted, depending on
	 * the retention policy.
	 */
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
	    HAMMER_INODE_ONDISK) {
		hammer_normalize_cursor(cursor);
		cursor->key_beg.localization = ip->obj_localization +
					       HAMMER_LOCALIZE_INODE;
		cursor->key_beg.obj_id = ip->obj_id;
		cursor->key_beg.key = 0;
		cursor->key_beg.create_tid = 0;
		cursor->key_beg.delete_tid = 0;
		cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
		cursor->key_beg.obj_type = 0;
		cursor->asof = ip->obj_asof;
		cursor->flags &= ~HAMMER_CURSOR_INITMASK;
		cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
		cursor->flags |= HAMMER_CURSOR_BACKEND;

		error = hammer_btree_lookup(cursor);
		if (hammer_debug_inode)
			kprintf("IPDEL %p %08x %d", ip, ip->flags, error);

		if (error == 0) {
			error = hammer_ip_delete_record(cursor, ip, trans->tid);
			if (hammer_debug_inode)
				kprintf(" error %d\n", error);
			if (error == 0) {
				ip->flags |= HAMMER_INODE_DELONDISK;
			}
			if (cursor->node)
				hammer_cache_node(&ip->cache[0], cursor->node);
		}
		if (error == EDEADLK) {
			hammer_done_cursor(cursor);
			error = hammer_init_cursor(trans, cursor,
						   &ip->cache[0], ip);
			if (hammer_debug_inode)
				kprintf("IPDED %p %d\n", ip, error);
			if (error == 0)
				goto retry;
		}
	}

	/*
	 * Ok, write out the initial record or a new record (after deleting
	 * the old one), unless the DELETED flag is set.  This routine will
	 * clear DELONDISK if it writes out a record.
	 *
	 * Update our inode statistics if this is the first application of
	 * the inode on-disk.
	 */
	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
		/*
		 * Generate a record and write it to the media.  We clean-up
		 * the state before releasing so we do not have to set-up
		 * a flush_group.
		 */
		record = hammer_alloc_mem_record(ip, 0);
		record->type = HAMMER_MEM_RECORD_INODE;
		record->flush_state = HAMMER_FST_FLUSH;
		record->leaf = ip->sync_ino_leaf;
		record->leaf.base.create_tid = trans->tid;
		record->leaf.data_len = sizeof(ip->sync_ino_data);
		record->leaf.create_ts = trans->time32;
		record->data = (void *)&ip->sync_ino_data;
		record->flags |= HAMMER_RECF_INTERLOCK_BE;

		/*
		 * If this flag is set we cannot sync the new file size
		 * because we haven't finished related truncations.  The
		 * inode will be flushed in another flush group to finish
		 * the job.
		 */
		if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
		    ip->sync_ino_data.size != ip->ino_data.size) {
			redirty = 1;
			ip->sync_ino_data.size = ip->ino_data.size;
		} else {
			redirty = 0;
		}

		for (;;) {
			error = hammer_ip_sync_record_cursor(cursor, record);
			if (hammer_debug_inode)
				kprintf("GENREC %p rec %08x %d\n",
					ip, record->flags, error);
			if (error != EDEADLK)
				break;
			hammer_done_cursor(cursor);
			error = hammer_init_cursor(trans, cursor,
						   &ip->cache[0], ip);
			if (hammer_debug_inode)
				kprintf("GENREC reinit %d\n", error);
			if (error)
				break;
		}

		/*
		 * Note:  The record was never on the inode's record tree
		 * so just wave our hands importantly and destroy it.
		 */
		record->flags |= HAMMER_RECF_COMMITTED;
		record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
		record->flush_state = HAMMER_FST_IDLE;
		++ip->rec_generation;
		hammer_rel_mem_record(record);

		/*
		 * Finish up.
		 */
		if (error == 0) {
			if (hammer_debug_inode)
				kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
			ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
					    HAMMER_INODE_ATIME |
					    HAMMER_INODE_MTIME);
			ip->flags &= ~HAMMER_INODE_DELONDISK;
			if (redirty)
				ip->sync_flags |= HAMMER_INODE_DDIRTY;

			/*
			 * Root volume count of inodes
			 */
			hammer_sync_lock_sh(trans);
			if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
				hammer_modify_volume_field(trans,
							   trans->rootvol,
							   vol0_stat_inodes);
				++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
				hammer_modify_volume_done(trans->rootvol);
				ip->flags |= HAMMER_INODE_ONDISK;
				if (hammer_debug_inode)
					kprintf("NOWONDISK %p\n", ip);
			}
			hammer_sync_unlock(trans);
		}
	}

	/*
	 * If the inode has been destroyed, clean out any left-over flags
	 * that may have been set by the frontend.
	 */
	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
		ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
				    HAMMER_INODE_ATIME |
				    HAMMER_INODE_MTIME);
	}
	return(error);
}

/*
 * Update only the itimes fields.
 *
 * ATIME can be updated without generating any UNDO.  MTIME is updated
 * with UNDO so it is guaranteed to be synchronized properly in case of
 * a crash.
 *
 * Neither field is included in the B-Tree leaf element's CRC, which is how
 * we can get away with updating ATIME the way we do.
 */
static int
hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
{
	hammer_transaction_t trans = cursor->trans;
	int error;

retry:
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
	    HAMMER_INODE_ONDISK) {
		return(0);
	}

	hammer_normalize_cursor(cursor);
	cursor->key_beg.localization = ip->obj_localization +
				       HAMMER_LOCALIZE_INODE;
	cursor->key_beg.obj_id = ip->obj_id;
	cursor->key_beg.key = 0;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor->key_beg.obj_type = 0;
	cursor->asof = ip->obj_asof;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_GET_LEAF;
	cursor->flags |= HAMMER_CURSOR_GET_DATA;
	cursor->flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(cursor);
	if (error == 0) {
		hammer_cache_node(&ip->cache[0], cursor->node);
		if (ip->sync_flags & HAMMER_INODE_MTIME) {
			/*
			 * Updating MTIME requires an UNDO.  Just cover
			 * both atime and mtime.
			 */
			hammer_sync_lock_sh(trans);
			hammer_modify_buffer(trans, cursor->data_buffer,
				     HAMMER_ITIMES_BASE(&cursor->data->inode),
				     HAMMER_ITIMES_BYTES);
			cursor->data->inode.atime = ip->sync_ino_data.atime;
			cursor->data->inode.mtime = ip->sync_ino_data.mtime;
			hammer_modify_buffer_done(cursor->data_buffer);
			hammer_sync_unlock(trans);
		} else if (ip->sync_flags & HAMMER_INODE_ATIME) {
			/*
			 * Updating atime only can be done in-place with
			 * no UNDO.
			 */
			hammer_sync_lock_sh(trans);
			hammer_modify_buffer(trans, cursor->data_buffer,
					     NULL, 0);
			cursor->data->inode.atime = ip->sync_ino_data.atime;
			hammer_modify_buffer_done(cursor->data_buffer);
			hammer_sync_unlock(trans);
		}
		ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
	}
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor,
					   &ip->cache[0], ip);
		if (error == 0)
			goto retry;
	}
	return(error);
}

/*
 * Release a reference on an inode, flush as requested.
 *
 * On the last reference we queue the inode to the flusher for its final
 * disposition.
 */
void
hammer_rel_inode(struct hammer_inode *ip, int flush)
{
	/*hammer_mount_t hmp = ip->hmp;*/

	/*
	 * Handle disposition when dropping the last ref.
	 */
	for (;;) {
		if (ip->lock.refs == 1) {
			/*
			 * Determine whether on-disk action is needed for
			 * the inode's final disposition.
			 */
			KKASSERT(ip->vp == NULL);
			hammer_inode_unloadable_check(ip, 0);
			if (ip->flags & HAMMER_INODE_MODMASK) {
				hammer_flush_inode(ip, 0);
			} else if (ip->lock.refs == 1) {
				hammer_unload_inode(ip);
				break;
			}
		} else {
			if (flush)
				hammer_flush_inode(ip, 0);

			/*
			 * The inode still has multiple refs, try to drop
			 * one ref.
			 */
			KKASSERT(ip->lock.refs >= 1);
			if (ip->lock.refs > 1) {
				hammer_unref(&ip->lock);
				break;
			}
		}
	}
}

/*
 * Unload and destroy the specified inode.  Must be called with one remaining
 * reference.  The reference is disposed of.
 *
 * The inode must be completely clean.
 */
static int
hammer_unload_inode(struct hammer_inode *ip)
{
	hammer_mount_t hmp = ip->hmp;

	KASSERT(ip->lock.refs == 1,
		("hammer_unload_inode: %d refs\n", ip->lock.refs));
	KKASSERT(ip->vp == NULL);
	KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
	KKASSERT(ip->cursor_ip_refs == 0);
	KKASSERT(ip->lock.lockcount == 0);
	KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);

	KKASSERT(RB_EMPTY(&ip->rec_tree));
	KKASSERT(TAILQ_EMPTY(&ip->target_list));

	RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);

	hammer_free_inode(ip);
	return(0);
}

/*
 * Called during unmounting if a critical error occurred.  The in-memory
 * inode and all related structures are destroyed.
 *
 * If a critical error did not occur the unmount code calls the standard
 * release and asserts that the inode is gone.
 */
int
hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
{
	hammer_record_t rec;

	/*
	 * Get rid of the inode's in-memory records, regardless of their
	 * state, and clear the mod-mask.
	 */
	while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
		TAILQ_REMOVE(&ip->target_list, rec, target_entry);
		rec->target_ip = NULL;
		if (rec->flush_state == HAMMER_FST_SETUP)
			rec->flush_state = HAMMER_FST_IDLE;
	}
	while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
		if (rec->flush_state == HAMMER_FST_FLUSH)
			--rec->flush_group->refs;
		else
			hammer_ref(&rec->lock);
		KKASSERT(rec->lock.refs == 1);
		rec->flush_state = HAMMER_FST_IDLE;
		rec->flush_group = NULL;
		rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
		rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
		++ip->rec_generation;
		hammer_rel_mem_record(rec);
	}
	ip->flags &= ~HAMMER_INODE_MODMASK;
	ip->sync_flags &= ~HAMMER_INODE_MODMASK;
	KKASSERT(ip->vp == NULL);

	/*
	 * Remove the inode from any flush group, force it idle.  FLUSH
	 * and SETUP states have an inode ref.
	 */
	switch(ip->flush_state) {
	case HAMMER_FST_FLUSH:
		TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
		--ip->flush_group->refs;
		ip->flush_group = NULL;
		/* fall through */
	case HAMMER_FST_SETUP:
		hammer_unref(&ip->lock);
		ip->flush_state = HAMMER_FST_IDLE;
		/* fall through */
	case HAMMER_FST_IDLE:
		break;
	}

	/*
	 * There shouldn't be any associated vnode.  The unload needs at
	 * least one ref, if we do have a vp steal its ip ref.
	 */
	if (ip->vp) {
		kprintf("hammer_destroy_inode_callback: Unexpected "
			"vnode association ip %p vp %p\n", ip, ip->vp);
		ip->vp->v_data = NULL;
		ip->vp = NULL;
	} else {
		hammer_ref(&ip->lock);
	}
	hammer_unload_inode(ip);
	return(0);
}

/*
 * Called on mount -u when switching from RW to RO or vice-versa.  Adjust
 * the read-only flag for cached inodes.
 *
 * This routine is called from a RB_SCAN().
 */
int
hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
{
	hammer_mount_t hmp = ip->hmp;

	if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
		ip->flags |= HAMMER_INODE_RO;
	else
		ip->flags &= ~HAMMER_INODE_RO;
	return(0);
}

/*
 * A transaction has modified an inode, requiring updates as specified by
 * the passed flags.
 *
 * HAMMER_INODE_DDIRTY: Inode data has been updated
 * HAMMER_INODE_XDIRTY: Dirty in-memory records
 * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
 * HAMMER_INODE_DELETED: Inode record/data must be deleted
 * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
 */
void
hammer_modify_inode(hammer_inode_t ip, int flags)
{
	/*
	 * ronly of 0 or 2 does not trigger assertion.
	 * 2 is a special error state
	 */
	KKASSERT(ip->hmp->ronly != 1 ||
		  (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
			    HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
			    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
	if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
		ip->flags |= HAMMER_INODE_RSV_INODES;
		++ip->hmp->rsv_inodes;
	}

	ip->flags |= flags;
}
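
/*
 * Illustrative sketch (not part of the original source): the usual
 * frontend pattern is to flag the in-memory inode dirty with
 * hammer_modify_inode() and, when the caller wants the change pushed out
 * promptly, to request a flush as well.  The wrapper below is
 * hypothetical; hammer_mkroot_pseudofs() above shows the real pattern.
 */
#if 0
static void
example_dirty_and_flush(hammer_inode_t ip)
{
	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
}
#endif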

/*
 * Request that an inode be flushed.  This whole mess cannot block and may
 * recurse (if not synchronous).  Once requested HAMMER will attempt to
 * actively flush the inode until the flush can be done.
 *
 * The inode may already be flushing, or may be in a setup state.  We can
 * place the inode in a flushing state if it is currently idle and flag it
 * to reflush if it is currently flushing.
 *
 * Upon return if the inode could not be flushed due to a setup
 * dependency, then it will be automatically flushed when the dependency
 * is satisfied.
 */
void
hammer_flush_inode(hammer_inode_t ip, int flags)
{
	hammer_mount_t hmp;
	hammer_flush_group_t flg;
	int good;

	/*
	 * next_flush_group is the first flush group we can place the inode
	 * in.  It may be NULL.  If it becomes full we append a new flush
	 * group and make that the next_flush_group.
	 */
	hmp = ip->hmp;
	while ((flg = hmp->next_flush_group) != NULL) {
		KKASSERT(flg->running == 0);
		if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
			break;
		hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
		hammer_flusher_async(ip->hmp, flg);
	}
	if (flg == NULL) {
		flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
		hmp->next_flush_group = flg;
		TAILQ_INIT(&flg->flush_list);
		TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
	}

	/*
	 * Trivial 'nothing to flush' case.  If the inode is in a SETUP
	 * state we have to put it back into an IDLE state so we can
	 * drop the extra ref.
	 *
	 * If we have a parent dependency we must still fall through
	 * so we can run it.
	 */
	if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
		if (ip->flush_state == HAMMER_FST_SETUP &&
		    TAILQ_EMPTY(&ip->target_list)) {
			ip->flush_state = HAMMER_FST_IDLE;
			hammer_rel_inode(ip, 0);
		}
		if (ip->flush_state == HAMMER_FST_IDLE)
			return;
	}

	/*
	 * Our flush action will depend on the current state.
	 */
	switch(ip->flush_state) {
	case HAMMER_FST_IDLE:
		/*
		 * We have no dependencies and can flush immediately.  Some
		 * of our children may not be flushable so we have to re-test
		 * with that additional knowledge.
		 */
		hammer_flush_inode_core(ip, flg, flags);
		break;
	case HAMMER_FST_SETUP:
		/*
		 * Recurse upwards through dependencies via target_list
		 * and start their flusher actions going if possible.
		 *
		 * 'good' is our connectivity.  -1 means we have none and
		 * can't flush, 0 means there weren't any dependencies, and
		 * 1 means we have good connectivity.
		 */
		good = hammer_setup_parent_inodes(ip, 0, flg);

		if (good >= 0) {
			/*
			 * We can continue if good >= 0.  Determine how
			 * many records under our inode can be flushed (and
			 * mark them).
			 */
			hammer_flush_inode_core(ip, flg, flags);
		} else {
			/*
			 * Parent has no connectivity, tell it to flush
			 * us as soon as it does.
			 *
			 * The REFLUSH flag is also needed to trigger
			 * dependency wakeups.
			 */
			ip->flags |= HAMMER_INODE_CONN_DOWN |
				     HAMMER_INODE_REFLUSH;
			if (flags & HAMMER_FLUSH_SIGNAL) {
				ip->flags |= HAMMER_INODE_RESIGNAL;
				hammer_flusher_async(ip->hmp, flg);
			}
		}
		break;
	case HAMMER_FST_FLUSH:
		/*
		 * We are already flushing, flag the inode to reflush
		 * if needed after it completes its current flush.
		 *
		 * The REFLUSH flag is also needed to trigger
		 * dependency wakeups.
		 */
		if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
			ip->flags |= HAMMER_INODE_REFLUSH;
		if (flags & HAMMER_FLUSH_SIGNAL) {
			ip->flags |= HAMMER_INODE_RESIGNAL;
			hammer_flusher_async(ip->hmp, flg);
		}
		break;
	}
}

/*
 * Scan ip->target_list, which is a list of records owned by PARENTS to our
 * ip which reference our ip.
 *
 * XXX This is a huge mess of recursive code, but not one bit of it blocks
 *     so for now do not ref/deref the structures.  Note that if we use the
 *     ref/rel code later, the rel CAN block.
 */
static int
hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
			   hammer_flush_group_t flg)
{
	hammer_record_t depend;
	int good;
	int r;

	/*
	 * If we hit our recursion limit and we have parent dependencies
	 * we cannot continue.  Returning < 0 will cause us to be flagged
	 * for reflush.  Returning -2 cuts off additional dependency checks
	 * because they are likely to also hit the depth limit.
	 *
	 * We cannot return < 0 if there are no dependencies or there might
	 * not be anything to wakeup (ip).
	 */
	if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
		kprintf("HAMMER Warning: depth limit reached on "
			"setup recursion, inode %p %016llx\n",
			ip, (long long)ip->obj_id);
		return(-2);
	}

	/*
	 * Scan dependencies
	 */
	good = 0;
	TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
		r = hammer_setup_parent_inodes_helper(depend, depth, flg);
		KKASSERT(depend->target_ip == ip);
		if (r < 0 && good == 0)
			good = -1;
		if (r > 0)
			good = 1;

		/*
		 * If we failed due to the recursion depth limit then stop
		 * now.
		 */
		if (r == -2)
			break;
	}
	return(good);
}
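
/*
 * For example: creating "a/b" leaves a directory-add record on a's
 * rec_tree whose target_ip points at b.  That record appears on b's
 * target_list, so flushing b requires the record (and hence a) to be
 * pulled into the same flush group, which is what the upward recursion
 * performed by the helper below accomplishes.
 */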

/*
 * This helper function takes a record representing the dependency between
 * the parent inode and child inode.
 *
 * record->ip		= parent inode
 * record->target_ip	= child inode
 *
 * We are asked to recurse upwards and convert the record from SETUP
 * to FLUSH if possible.
 *
 * Return 1 if the record gives us connectivity
 *
 * Return 0 if the record is not relevant
 *
 * Return -1 if we can't resolve the dependency and there is no connectivity.
 */
static int
hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
				  hammer_flush_group_t flg)
{
	hammer_mount_t hmp;
	hammer_inode_t pip;
	int good;

	KKASSERT(record->flush_state != HAMMER_FST_IDLE);
	pip = record->ip;
	hmp = pip->hmp;

	/*
	 * If the record is already flushing, is it in our flush group?
	 *
	 * If it is in our flush group but it is a general record or a
	 * delete-on-disk, it does not improve our connectivity (return 0),
	 * and if the target inode is not trying to destroy itself we can't
	 * allow the operation yet anyway (the second return -1).
	 */
	if (record->flush_state == HAMMER_FST_FLUSH) {
		/*
		 * If not in our flush group ask the parent to reflush
		 * us as soon as possible.
		 */
		if (record->flush_group != flg) {
			pip->flags |= HAMMER_INODE_REFLUSH;
			record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
			return(-1);
		}

		/*
		 * If in our flush group everything is already set up,
		 * just return whether the record will improve our
		 * visibility or not.
		 */
		if (record->type == HAMMER_MEM_RECORD_ADD)
			return(1);
		return(0);
	}

	/*
	 * It must be a setup record.  Try to resolve the setup dependencies
	 * by recursing upwards so we can place ip on the flush list.
	 *
	 * Limit ourselves to 20 levels of recursion to avoid blowing out
	 * the kernel stack.  If we hit the recursion limit we can't flush
	 * until the parent flushes.  The parent will flush independently
	 * on its own and ultimately a deep recursion will be resolved.
	 */
	KKASSERT(record->flush_state == HAMMER_FST_SETUP);

	good = hammer_setup_parent_inodes(pip, depth + 1, flg);

	/*
	 * If good < 0 the parent has no connectivity and we cannot safely
	 * flush the directory entry, which also means we can't flush our
	 * ip.  Flag us for downward recursion once the parent's
	 * connectivity is resolved.  Flag the parent for [re]flush or it
	 * may not check for downward recursions.
	 */
	if (good < 0) {
		pip->flags |= HAMMER_INODE_REFLUSH;
		record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
		return(good);
	}

	/*
	 * We are go, place the parent inode in a flushing state so we can
	 * place its record in a flushing state.  Note that the parent
	 * may already be flushing.  The record must be in the same flush
	 * group as the parent.
	 */
	if (pip->flush_state != HAMMER_FST_FLUSH)
		hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
	KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flush_state == HAMMER_FST_SETUP);

#if 0
	if (record->type == HAMMER_MEM_RECORD_DEL &&
	    (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
		/*
		 * Regardless of flushing state we cannot sync this path if the
		 * record represents a delete-on-disk but the target inode
		 * is not ready to sync its own deletion.
		 *
		 * XXX need to count effective nlinks to determine whether
		 * the flush is ok, otherwise removing a hardlink will
		 * just leave the DEL record to rot.
		 */
		record->target_ip->flags |= HAMMER_INODE_REFLUSH;
		return(-1);
	} else
#endif
	if (pip->flush_group == flg) {
		/*
		 * Because we have not calculated nlinks yet we can just
		 * set records to the flush state if the parent is in
		 * the same flush group as we are.
		 */
		record->flush_state = HAMMER_FST_FLUSH;
		record->flush_group = flg;
		++record->flush_group->refs;
		hammer_ref(&record->lock);

		/*
		 * A general directory-add contributes to our visibility.
		 *
		 * Otherwise it is probably a directory-delete or
		 * delete-on-disk record and does not contribute to our
		 * visibility (but we can still flush it).
		 */
		if (record->type == HAMMER_MEM_RECORD_ADD)
			return(1);
		return(0);
	} else {
		/*
		 * If the parent is not in our flush group we cannot
		 * flush this record yet, there is no visibility.
		 * We tell the parent to reflush and mark ourselves
		 * so the parent knows it should flush us too.
		 */
		pip->flags |= HAMMER_INODE_REFLUSH;
		record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
		return(-1);
	}
}

/*
 * This is the core routine placing an inode into the FST_FLUSH state.
 */
static void
hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
{
	int go_count;

	/*
	 * Set flush state and prevent the flusher from cycling into
	 * the next flush group.  Do not place the ip on the list yet.
	 * Inodes not in the idle state get an extra reference.
	 */
	KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
	if (ip->flush_state == HAMMER_FST_IDLE)
		hammer_ref(&ip->lock);
	ip->flush_state = HAMMER_FST_FLUSH;
	ip->flush_group = flg;
	++ip->hmp->flusher.group_lock;
	++ip->hmp->count_iqueued;
	++hammer_count_iqueued;
	++flg->total_count;

	/*
	 * If the flush group reaches the autoflush limit we want to signal
	 * the flusher.  This is particularly important for remove()s.
	 */
	if (flg->total_count == hammer_autoflush)
		flags |= HAMMER_FLUSH_SIGNAL;

	/*
	 * We need to be able to vfsync/truncate from the backend.
	 */
	KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
	if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
		ip->flags |= HAMMER_INODE_VHELD;
		vref(ip->vp);
	}

	/*
	 * Figure out how many in-memory records we can actually flush
	 * (not including inode meta-data, buffers, etc).
	 */
	KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
	if (flags & HAMMER_FLUSH_RECURSION) {
		/*
		 * If this is an upwards recursion we do not want to
		 * recurse down again!
		 */
		go_count = 1;
#if 0
	} else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
		/*
		 * No new records are added if we must complete a flush
		 * from a previous cycle, but we do have to move the records
		 * from the previous cycle to the current one.
		 */
#if 0
		go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				   hammer_syncgrp_child_callback, NULL);
#endif
		go_count = 1;
#endif
	} else {
		/*
		 * Normal flush, scan records and bring them into the flush.
		 * Directory adds and deletes are usually skipped (they are
		 * grouped with the related inode rather than with the
		 * directory).
		 *
		 * go_count can be negative, which means the scan aborted
		 * due to the flush group being over-full and we should
		 * flush what we have.
		 */
		go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				   hammer_setup_child_callback, NULL);
	}

	/*
	 * This is a more involved test that includes go_count.  If we
	 * can't flush, flag the inode and return.  If go_count is 0 we
	 * are unable to flush any records in our rec_tree and
	 * must ignore the XDIRTY flag.
	 */
	if (go_count == 0) {
		if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
			--ip->hmp->count_iqueued;
			--hammer_count_iqueued;

			--flg->total_count;
			ip->flush_state = HAMMER_FST_SETUP;
			ip->flush_group = NULL;
			if (ip->flags & HAMMER_INODE_VHELD) {
				ip->flags &= ~HAMMER_INODE_VHELD;
				vrele(ip->vp);
			}

			/*
			 * REFLUSH is needed to trigger dependency wakeups
			 * when an inode is in SETUP.
			 */
			ip->flags |= HAMMER_INODE_REFLUSH;
			if (flags & HAMMER_FLUSH_SIGNAL) {
				ip->flags |= HAMMER_INODE_RESIGNAL;
				hammer_flusher_async(ip->hmp, flg);
			}
			if (--ip->hmp->flusher.group_lock == 0)
				wakeup(&ip->hmp->flusher.group_lock);
			return;
		}
	}

	/*
	 * Snapshot the state of the inode for the backend flusher.
1928 * 1929 * We continue to retain save_trunc_off even when all truncations 1930 * have been resolved as an optimization to determine if we can 1931 * skip the B-Tree lookup for overwrite deletions. 1932 * 1933 * NOTE: The DELETING flag is a mod flag, but it is also sticky, 1934 * and stays in ip->flags. Once set, it stays set until the 1935 * inode is destroyed. 1936 */ 1937 if (ip->flags & HAMMER_INODE_TRUNCATED) { 1938 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0); 1939 ip->sync_trunc_off = ip->trunc_off; 1940 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL; 1941 ip->flags &= ~HAMMER_INODE_TRUNCATED; 1942 ip->sync_flags |= HAMMER_INODE_TRUNCATED; 1943 1944 /* 1945 * The save_trunc_off used to cache whether the B-Tree 1946 * holds any records past that point is not used until 1947 * after the truncation has succeeded, so we can safely 1948 * set it now. 1949 */ 1950 if (ip->save_trunc_off > ip->sync_trunc_off) 1951 ip->save_trunc_off = ip->sync_trunc_off; 1952 } 1953 ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK & 1954 ~HAMMER_INODE_TRUNCATED); 1955 ip->sync_ino_leaf = ip->ino_leaf; 1956 ip->sync_ino_data = ip->ino_data; 1957 ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED; 1958 #ifdef DEBUG_TRUNCATE 1959 if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp) 1960 kprintf("truncateS %016llx\n", ip->sync_trunc_off); 1961 #endif 1962 1963 /* 1964 * The flusher list inherits our inode and reference. 1965 */ 1966 KKASSERT(flg->running == 0); 1967 TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry); 1968 if (--ip->hmp->flusher.group_lock == 0) 1969 wakeup(&ip->hmp->flusher.group_lock); 1970 1971 if (flags & HAMMER_FLUSH_SIGNAL) { 1972 hammer_flusher_async(ip->hmp, flg); 1973 } 1974 } 1975 1976 /* 1977 * Callback for scan of ip->rec_tree. Try to include each record in our 1978 * flush. ip->flush_group has been set but the inode has not yet been 1979 * moved into a flushing state. 1980 * 1981 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on 1982 * both inodes. 1983 * 1984 * We return 1 for any record placed or found in FST_FLUSH, which prevents 1985 * the caller from shortcutting the flush. 1986 */ 1987 static int 1988 hammer_setup_child_callback(hammer_record_t rec, void *data) 1989 { 1990 hammer_flush_group_t flg; 1991 hammer_inode_t target_ip; 1992 hammer_inode_t ip; 1993 int r; 1994 1995 /* 1996 * Records deleted or committed by the backend are ignored. 1997 * Note that the flush detects deleted frontend records at 1998 * multiple points to deal with races. This is just the first 1999 * line of defense. The only time HAMMER_RECF_DELETED_FE cannot 2000 * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it 2001 * messes up link-count calculations. 2002 * 2003 * NOTE: Don't get confused between record deletion and, say, 2004 * directory entry deletion. The deletion of a directory entry 2005 * which is on-media has nothing to do with the record deletion 2006 * flags. 2007 */ 2008 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 2009 HAMMER_RECF_COMMITTED)) { 2010 if (rec->flush_state == HAMMER_FST_FLUSH) { 2011 KKASSERT(rec->flush_group == rec->ip->flush_group); 2012 r = 1; 2013 } else { 2014 r = 0; 2015 } 2016 return(r); 2017 } 2018 2019 /* 2020 * If the record is in an idle state it has no dependancies and 2021 * can be flushed. 
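	 *
	 * Return convention for this callback: 1 if the record was placed in
	 * (or already was in) HAMMER_FST_FLUSH, 0 if it was skipped, and -1
	 * to abort the RB_SCAN because the flush group has become over-full.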
	 */
	ip = rec->ip;
	flg = ip->flush_group;
	r = 0;

	switch(rec->flush_state) {
	case HAMMER_FST_IDLE:
		/*
		 * The record has no setup dependency, we can flush it.
		 */
		KKASSERT(rec->target_ip == NULL);
		rec->flush_state = HAMMER_FST_FLUSH;
		rec->flush_group = flg;
		++flg->refs;
		hammer_ref(&rec->lock);
		r = 1;
		break;
	case HAMMER_FST_SETUP:
		/*
		 * The record has a setup dependency.  These are typically
		 * directory entry adds and deletes.  Such entries will be
		 * flushed when their inodes are flushed so we do not
		 * usually have to add them to the flush here.  However,
		 * if the target_ip has set HAMMER_INODE_CONN_DOWN then
		 * it is asking us to flush this record (and it).
		 */
		target_ip = rec->target_ip;
		KKASSERT(target_ip != NULL);
		KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);

		/*
		 * If the target IP is already flushing in our group
		 * we could associate the record, but target_ip has
		 * already synced ino_data to sync_ino_data and we
		 * would also have to adjust nlinks.  Plus there are
		 * ordering issues for adds and deletes.
		 *
		 * Reflush downward if this is an ADD, and upward if
		 * this is a DEL.
		 */
		if (target_ip->flush_state == HAMMER_FST_FLUSH) {
			if (rec->type == HAMMER_MEM_RECORD_ADD)
				ip->flags |= HAMMER_INODE_REFLUSH;
			else
				target_ip->flags |= HAMMER_INODE_REFLUSH;
			break;
		}

		/*
		 * Target IP is not yet flushing.  This can get complex
		 * because we have to be careful about the recursion.
		 *
		 * Directories create an issue for us in that if a flush
		 * of a directory is requested the expectation is to flush
		 * any pending directory entries, but this will cause the
		 * related inodes to recursively flush as well.  We can't
		 * really defer the operation so just get as many as we
		 * can and flush what we have.
		 */
#if 0
		if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
		    (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
			/*
			 * We aren't reclaiming and the target ip was not
			 * previously prevented from flushing due to this
			 * record dependency.  Do not flush this record.
			 */
			/*r = 0;*/
		} else
#endif
		if (flg->total_count + flg->refs >
		    ip->hmp->undo_rec_limit) {
			/*
			 * Our flush group is over-full and we risk blowing
			 * out the UNDO FIFO.  Stop the scan, flush what we
			 * have, then reflush the directory.
			 *
			 * The directory may be forced through multiple
			 * flush groups before it can be completely
			 * flushed.
			 */
			ip->flags |= HAMMER_INODE_RESIGNAL |
				     HAMMER_INODE_REFLUSH;
			r = -1;
		} else if (rec->type == HAMMER_MEM_RECORD_ADD) {
			/*
			 * If the target IP is not flushing we can force
			 * it to flush, even if it is unable to write out
			 * any of its own records we have at least one in
			 * hand that we CAN deal with.
			 */
			rec->flush_state = HAMMER_FST_FLUSH;
			rec->flush_group = flg;
			++flg->refs;
			hammer_ref(&rec->lock);
			hammer_flush_inode_core(target_ip, flg,
						HAMMER_FLUSH_RECURSION);
			r = 1;
		} else {
			/*
			 * General or delete-on-disk record.
			 *
			 * XXX this needs help.  For a delete-on-disk record
			 * we could disconnect the target.  If the target has
			 * its own dependencies they really need to be
			 * flushed.
2127 * 2128 * XXX 2129 */ 2130 rec->flush_state = HAMMER_FST_FLUSH; 2131 rec->flush_group = flg; 2132 ++flg->refs; 2133 hammer_ref(&rec->lock); 2134 hammer_flush_inode_core(target_ip, flg, 2135 HAMMER_FLUSH_RECURSION); 2136 r = 1; 2137 } 2138 break; 2139 case HAMMER_FST_FLUSH: 2140 /* 2141 * The flush_group should already match. 2142 */ 2143 KKASSERT(rec->flush_group == flg); 2144 r = 1; 2145 break; 2146 } 2147 return(r); 2148 } 2149 2150 #if 0 2151 /* 2152 * This version just moves records already in a flush state to the new 2153 * flush group and that is it. 2154 */ 2155 static int 2156 hammer_syncgrp_child_callback(hammer_record_t rec, void *data) 2157 { 2158 hammer_inode_t ip = rec->ip; 2159 2160 switch(rec->flush_state) { 2161 case HAMMER_FST_FLUSH: 2162 KKASSERT(rec->flush_group == ip->flush_group); 2163 break; 2164 default: 2165 break; 2166 } 2167 return(0); 2168 } 2169 #endif 2170 2171 /* 2172 * Wait for a previously queued flush to complete. 2173 * 2174 * If a critical error occured we don't try to wait. 2175 */ 2176 void 2177 hammer_wait_inode(hammer_inode_t ip) 2178 { 2179 hammer_flush_group_t flg; 2180 2181 flg = NULL; 2182 if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) { 2183 while (ip->flush_state != HAMMER_FST_IDLE && 2184 (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) { 2185 if (ip->flush_state == HAMMER_FST_SETUP) 2186 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 2187 if (ip->flush_state != HAMMER_FST_IDLE) { 2188 ip->flags |= HAMMER_INODE_FLUSHW; 2189 tsleep(&ip->flags, 0, "hmrwin", 0); 2190 } 2191 } 2192 } 2193 } 2194 2195 /* 2196 * Called by the backend code when a flush has been completed. 2197 * The inode has already been removed from the flush list. 2198 * 2199 * A pipelined flush can occur, in which case we must re-enter the 2200 * inode on the list and re-copy its fields. 2201 */ 2202 void 2203 hammer_flush_inode_done(hammer_inode_t ip, int error) 2204 { 2205 hammer_mount_t hmp; 2206 int dorel; 2207 2208 KKASSERT(ip->flush_state == HAMMER_FST_FLUSH); 2209 2210 hmp = ip->hmp; 2211 2212 /* 2213 * Auto-reflush if the backend could not completely flush 2214 * the inode. This fixes a case where a deferred buffer flush 2215 * could cause fsync to return early. 2216 */ 2217 if (ip->sync_flags & HAMMER_INODE_MODMASK) 2218 ip->flags |= HAMMER_INODE_REFLUSH; 2219 2220 /* 2221 * Merge left-over flags back into the frontend and fix the state. 2222 * Incomplete truncations are retained by the backend. 2223 */ 2224 ip->error = error; 2225 ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED; 2226 ip->sync_flags &= HAMMER_INODE_TRUNCATED; 2227 2228 /* 2229 * The backend may have adjusted nlinks, so if the adjusted nlinks 2230 * does not match the fronttend set the frontend's RDIRTY flag again. 2231 */ 2232 if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks) 2233 ip->flags |= HAMMER_INODE_DDIRTY; 2234 2235 /* 2236 * Fix up the dirty buffer status. 2237 */ 2238 if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) { 2239 ip->flags |= HAMMER_INODE_BUFS; 2240 } 2241 2242 /* 2243 * Re-set the XDIRTY flag if some of the inode's in-memory records 2244 * could not be flushed. 2245 */ 2246 KKASSERT((RB_EMPTY(&ip->rec_tree) && 2247 (ip->flags & HAMMER_INODE_XDIRTY) == 0) || 2248 (!RB_EMPTY(&ip->rec_tree) && 2249 (ip->flags & HAMMER_INODE_XDIRTY) != 0)); 2250 2251 /* 2252 * Do not lose track of inodes which no longer have vnode 2253 * assocations, otherwise they may never get flushed again. 
2254 * 2255 * The reflush flag can be set superfluously, causing extra pain 2256 * for no reason. If the inode is no longer modified it no longer 2257 * needs to be flushed. 2258 */ 2259 if (ip->flags & HAMMER_INODE_MODMASK) { 2260 if (ip->vp == NULL) 2261 ip->flags |= HAMMER_INODE_REFLUSH; 2262 } else { 2263 ip->flags &= ~HAMMER_INODE_REFLUSH; 2264 } 2265 2266 /* 2267 * Adjust the flush state. 2268 */ 2269 if (ip->flags & HAMMER_INODE_WOULDBLOCK) { 2270 /* 2271 * We were unable to flush out all our records, leave the 2272 * inode in a flush state and in the current flush group. 2273 * The flush group will be re-run. 2274 * 2275 * This occurs if the UNDO block gets too full or there is 2276 * too much dirty meta-data and allows the flusher to 2277 * finalize the UNDO block and then re-flush. 2278 */ 2279 ip->flags &= ~HAMMER_INODE_WOULDBLOCK; 2280 dorel = 0; 2281 } else { 2282 /* 2283 * Remove from the flush_group 2284 */ 2285 TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry); 2286 ip->flush_group = NULL; 2287 2288 /* 2289 * Clean up the vnode ref and tracking counts. 2290 */ 2291 if (ip->flags & HAMMER_INODE_VHELD) { 2292 ip->flags &= ~HAMMER_INODE_VHELD; 2293 vrele(ip->vp); 2294 } 2295 --hmp->count_iqueued; 2296 --hammer_count_iqueued; 2297 2298 /* 2299 * And adjust the state. 2300 */ 2301 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) { 2302 ip->flush_state = HAMMER_FST_IDLE; 2303 dorel = 1; 2304 } else { 2305 ip->flush_state = HAMMER_FST_SETUP; 2306 dorel = 0; 2307 } 2308 2309 /* 2310 * If the frontend is waiting for a flush to complete, 2311 * wake it up. 2312 */ 2313 if (ip->flags & HAMMER_INODE_FLUSHW) { 2314 ip->flags &= ~HAMMER_INODE_FLUSHW; 2315 wakeup(&ip->flags); 2316 } 2317 2318 /* 2319 * If the frontend made more changes and requested another 2320 * flush, then try to get it running. 2321 * 2322 * Reflushes are aborted when the inode is errored out. 2323 */ 2324 if (ip->flags & HAMMER_INODE_REFLUSH) { 2325 ip->flags &= ~HAMMER_INODE_REFLUSH; 2326 if (ip->flags & HAMMER_INODE_RESIGNAL) { 2327 ip->flags &= ~HAMMER_INODE_RESIGNAL; 2328 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL); 2329 } else { 2330 hammer_flush_inode(ip, 0); 2331 } 2332 } 2333 } 2334 2335 /* 2336 * If we have no parent dependancies we can clear CONN_DOWN 2337 */ 2338 if (TAILQ_EMPTY(&ip->target_list)) 2339 ip->flags &= ~HAMMER_INODE_CONN_DOWN; 2340 2341 /* 2342 * If the inode is now clean drop the space reservation. 2343 */ 2344 if ((ip->flags & HAMMER_INODE_MODMASK) == 0 && 2345 (ip->flags & HAMMER_INODE_RSV_INODES)) { 2346 ip->flags &= ~HAMMER_INODE_RSV_INODES; 2347 --hmp->rsv_inodes; 2348 } 2349 2350 if (dorel) 2351 hammer_rel_inode(ip, 0); 2352 } 2353 2354 /* 2355 * Called from hammer_sync_inode() to synchronize in-memory records 2356 * to the media. 2357 */ 2358 static int 2359 hammer_sync_record_callback(hammer_record_t record, void *data) 2360 { 2361 hammer_cursor_t cursor = data; 2362 hammer_transaction_t trans = cursor->trans; 2363 hammer_mount_t hmp = trans->hmp; 2364 int error; 2365 2366 /* 2367 * Skip records that do not belong to the current flush. 
2368 */ 2369 ++hammer_stats_record_iterations; 2370 if (record->flush_state != HAMMER_FST_FLUSH) 2371 return(0); 2372 2373 #if 1 2374 if (record->flush_group != record->ip->flush_group) { 2375 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group ,record->ip->flush_group); 2376 Debugger("blah2"); 2377 return(0); 2378 } 2379 #endif 2380 KKASSERT(record->flush_group == record->ip->flush_group); 2381 2382 /* 2383 * Interlock the record using the BE flag. Once BE is set the 2384 * frontend cannot change the state of FE. 2385 * 2386 * NOTE: If FE is set prior to us setting BE we still sync the 2387 * record out, but the flush completion code converts it to 2388 * a delete-on-disk record instead of destroying it. 2389 */ 2390 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0); 2391 record->flags |= HAMMER_RECF_INTERLOCK_BE; 2392 2393 /* 2394 * The backend has already disposed of the record. 2395 */ 2396 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { 2397 error = 0; 2398 goto done; 2399 } 2400 2401 /* 2402 * If the whole inode is being deleting all on-disk records will 2403 * be deleted very soon, we can't sync any new records to disk 2404 * because they will be deleted in the same transaction they were 2405 * created in (delete_tid == create_tid), which will assert. 2406 * 2407 * XXX There may be a case with RECORD_ADD with DELETED_FE set 2408 * that we currently panic on. 2409 */ 2410 if (record->ip->sync_flags & HAMMER_INODE_DELETING) { 2411 switch(record->type) { 2412 case HAMMER_MEM_RECORD_DATA: 2413 /* 2414 * We don't have to do anything, if the record was 2415 * committed the space will have been accounted for 2416 * in the blockmap. 2417 */ 2418 /* fall through */ 2419 case HAMMER_MEM_RECORD_GENERAL: 2420 /* 2421 * Set deleted-by-backend flag. Do not set the 2422 * backend committed flag, because we are throwing 2423 * the record away. 2424 */ 2425 record->flags |= HAMMER_RECF_DELETED_BE; 2426 ++record->ip->rec_generation; 2427 error = 0; 2428 goto done; 2429 case HAMMER_MEM_RECORD_ADD: 2430 panic("hammer_sync_record_callback: illegal add " 2431 "during inode deletion record %p", record); 2432 break; /* NOT REACHED */ 2433 case HAMMER_MEM_RECORD_INODE: 2434 panic("hammer_sync_record_callback: attempt to " 2435 "sync inode record %p?", record); 2436 break; /* NOT REACHED */ 2437 case HAMMER_MEM_RECORD_DEL: 2438 /* 2439 * Follow through and issue the on-disk deletion 2440 */ 2441 break; 2442 } 2443 } 2444 2445 /* 2446 * If DELETED_FE is set special handling is needed for directory 2447 * entries. Dependant pieces related to the directory entry may 2448 * have already been synced to disk. If this occurs we have to 2449 * sync the directory entry and then change the in-memory record 2450 * from an ADD to a DELETE to cover the fact that it's been 2451 * deleted by the frontend. 2452 * 2453 * A directory delete covering record (MEM_RECORD_DEL) can never 2454 * be deleted by the frontend. 2455 * 2456 * Any other record type (aka DATA) can be deleted by the frontend. 2457 * XXX At the moment the flusher must skip it because there may 2458 * be another data record in the flush group for the same block, 2459 * meaning that some frontend data changes can leak into the backend's 2460 * synchronization point. 2461 */ 2462 if (record->flags & HAMMER_RECF_DELETED_FE) { 2463 if (record->type == HAMMER_MEM_RECORD_ADD) { 2464 /* 2465 * Convert a front-end deleted directory-add to 2466 * a directory-delete entry later. 
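			 * Because dependent pieces may already be on media,
			 * the record is synced normally and the flush
			 * completion code later converts it into a
			 * delete-on-disk record rather than simply
			 * discarding it.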
2467 */ 2468 record->flags |= HAMMER_RECF_CONVERT_DELETE; 2469 } else { 2470 /* 2471 * Dispose of the record (race case). Mark as 2472 * deleted by backend (and not committed). 2473 */ 2474 KKASSERT(record->type != HAMMER_MEM_RECORD_DEL); 2475 record->flags |= HAMMER_RECF_DELETED_BE; 2476 ++record->ip->rec_generation; 2477 error = 0; 2478 goto done; 2479 } 2480 } 2481 2482 /* 2483 * Assign the create_tid for new records. Deletions already 2484 * have the record's entire key properly set up. 2485 */ 2486 if (record->type != HAMMER_MEM_RECORD_DEL) { 2487 record->leaf.base.create_tid = trans->tid; 2488 record->leaf.create_ts = trans->time32; 2489 } 2490 for (;;) { 2491 error = hammer_ip_sync_record_cursor(cursor, record); 2492 if (error != EDEADLK) 2493 break; 2494 hammer_done_cursor(cursor); 2495 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0], 2496 record->ip); 2497 if (error) 2498 break; 2499 } 2500 record->flags &= ~HAMMER_RECF_CONVERT_DELETE; 2501 2502 if (error) 2503 error = -error; 2504 done: 2505 hammer_flush_record_done(record, error); 2506 2507 /* 2508 * Do partial finalization if we have built up too many dirty 2509 * buffers. Otherwise a buffer cache deadlock can occur when 2510 * doing things like creating tens of thousands of tiny files. 2511 * 2512 * We must release our cursor lock to avoid a 3-way deadlock 2513 * due to the exclusive sync lock the finalizer must get. 2514 */ 2515 if (hammer_flusher_meta_limit(hmp)) { 2516 hammer_unlock_cursor(cursor); 2517 hammer_flusher_finalize(trans, 0); 2518 hammer_lock_cursor(cursor); 2519 } 2520 2521 return(error); 2522 } 2523 2524 /* 2525 * Backend function called by the flusher to sync an inode to media. 2526 */ 2527 int 2528 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip) 2529 { 2530 struct hammer_cursor cursor; 2531 hammer_node_t tmp_node; 2532 hammer_record_t depend; 2533 hammer_record_t next; 2534 int error, tmp_error; 2535 u_int64_t nlinks; 2536 2537 if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0) 2538 return(0); 2539 2540 error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip); 2541 if (error) 2542 goto done; 2543 2544 /* 2545 * Any directory records referencing this inode which are not in 2546 * our current flush group must adjust our nlink count for the 2547 * purposes of synchronization to disk. 2548 * 2549 * Records which are in our flush group can be unlinked from our 2550 * inode now, potentially allowing the inode to be physically 2551 * deleted. 2552 * 2553 * This cannot block. 2554 */ 2555 nlinks = ip->ino_data.nlinks; 2556 next = TAILQ_FIRST(&ip->target_list); 2557 while ((depend = next) != NULL) { 2558 next = TAILQ_NEXT(depend, target_entry); 2559 if (depend->flush_state == HAMMER_FST_FLUSH && 2560 depend->flush_group == ip->flush_group) { 2561 /* 2562 * If this is an ADD that was deleted by the frontend 2563 * the frontend nlinks count will have already been 2564 * decremented, but the backend is going to sync its 2565 * directory entry and must account for it. The 2566 * record will be converted to a delete-on-disk when 2567 * it gets synced. 2568 * 2569 * If the ADD was not deleted by the frontend we 2570 * can remove the dependancy from our target_list. 
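			 *
			 * Worked example (illustrative): a newly created file
			 * carries a frontend nlinks of 1 from its directory
			 * ADD record.  If that ADD is not part of this flush
			 * group the adjustment below subtracts one, so the
			 * inode is synced with nlinks 0, matching the number
			 * of directory entries actually on media; the count
			 * catches up when the ADD itself syncs in a later
			 * group.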
2571 */ 2572 if (depend->flags & HAMMER_RECF_DELETED_FE) { 2573 ++nlinks; 2574 } else { 2575 TAILQ_REMOVE(&ip->target_list, depend, 2576 target_entry); 2577 depend->target_ip = NULL; 2578 } 2579 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) { 2580 /* 2581 * Not part of our flush group and not deleted by 2582 * the front-end, adjust the link count synced to 2583 * the media (undo what the frontend did when it 2584 * queued the record). 2585 */ 2586 KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0); 2587 switch(depend->type) { 2588 case HAMMER_MEM_RECORD_ADD: 2589 --nlinks; 2590 break; 2591 case HAMMER_MEM_RECORD_DEL: 2592 ++nlinks; 2593 break; 2594 default: 2595 break; 2596 } 2597 } 2598 } 2599 2600 /* 2601 * Set dirty if we had to modify the link count. 2602 */ 2603 if (ip->sync_ino_data.nlinks != nlinks) { 2604 KKASSERT((int64_t)nlinks >= 0); 2605 ip->sync_ino_data.nlinks = nlinks; 2606 ip->sync_flags |= HAMMER_INODE_DDIRTY; 2607 } 2608 2609 /* 2610 * If there is a trunction queued destroy any data past the (aligned) 2611 * truncation point. Userland will have dealt with the buffer 2612 * containing the truncation point for us. 2613 * 2614 * We don't flush pending frontend data buffers until after we've 2615 * dealt with the truncation. 2616 */ 2617 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) { 2618 /* 2619 * Interlock trunc_off. The VOP front-end may continue to 2620 * make adjustments to it while we are blocked. 2621 */ 2622 off_t trunc_off; 2623 off_t aligned_trunc_off; 2624 int blkmask; 2625 2626 trunc_off = ip->sync_trunc_off; 2627 blkmask = hammer_blocksize(trunc_off) - 1; 2628 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask; 2629 2630 /* 2631 * Delete any whole blocks on-media. The front-end has 2632 * already cleaned out any partial block and made it 2633 * pending. The front-end may have updated trunc_off 2634 * while we were blocked so we only use sync_trunc_off. 2635 * 2636 * This operation can blow out the buffer cache, EWOULDBLOCK 2637 * means we were unable to complete the deletion. The 2638 * deletion will update sync_trunc_off in that case. 2639 */ 2640 error = hammer_ip_delete_range(&cursor, ip, 2641 aligned_trunc_off, 2642 0x7FFFFFFFFFFFFFFFLL, 2); 2643 if (error == EWOULDBLOCK) { 2644 ip->flags |= HAMMER_INODE_WOULDBLOCK; 2645 error = 0; 2646 goto defer_buffer_flush; 2647 } 2648 2649 if (error) 2650 goto done; 2651 2652 /* 2653 * Clear the truncation flag on the backend after we have 2654 * complete the deletions. Backend data is now good again 2655 * (including new records we are about to sync, below). 2656 * 2657 * Leave sync_trunc_off intact. As we write additional 2658 * records the backend will update sync_trunc_off. This 2659 * tells the backend whether it can skip the overwrite 2660 * test. This should work properly even when the backend 2661 * writes full blocks where the truncation point straddles 2662 * the block because the comparison is against the base 2663 * offset of the record. 2664 */ 2665 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED; 2666 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */ 2667 } else { 2668 error = 0; 2669 } 2670 2671 /* 2672 * Now sync related records. These will typically be directory 2673 * entries, records tracking direct-writes, or delete-on-disk records. 
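	 *
	 * hammer_sync_record_callback() returns a negative errno to abort
	 * the RB_SCAN, so a negative scan result is converted back into a
	 * positive error below.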
	 */
	if (error == 0) {
		tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				    hammer_sync_record_callback, &cursor);
		if (tmp_error < 0)
			tmp_error = -tmp_error;
		if (tmp_error)
			error = tmp_error;
	}
	hammer_cache_node(&ip->cache[1], cursor.node);

	/*
	 * Re-seek for inode update, assuming our cache hasn't been ripped
	 * out from under us.
	 */
	if (error == 0) {
		tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
		if (tmp_node) {
			hammer_cursor_downgrade(&cursor);
			hammer_lock_sh(&tmp_node->lock);
			if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
				hammer_cursor_seek(&cursor, tmp_node, 0);
			hammer_unlock(&tmp_node->lock);
			hammer_rel_node(tmp_node);
		}
		error = 0;
	}

	/*
	 * If we are deleting the inode the frontend had better not have
	 * any active references on elements making up the inode.
	 *
	 * The call to hammer_ip_delete_clean() cleans up auxiliary records
	 * but not DB or DATA records.  Those must have already been deleted
	 * by the normal truncation mechanic.
	 */
	if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
	    RB_EMPTY(&ip->rec_tree) &&
	    (ip->sync_flags & HAMMER_INODE_DELETING) &&
	    (ip->flags & HAMMER_INODE_DELETED) == 0) {
		int count1 = 0;

		error = hammer_ip_delete_clean(&cursor, ip, &count1);
		if (error == 0) {
			ip->flags |= HAMMER_INODE_DELETED;
			ip->sync_flags &= ~HAMMER_INODE_DELETING;
			ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
			KKASSERT(RB_EMPTY(&ip->rec_tree));

			/*
			 * Set delete_tid in both the frontend and backend
			 * copy of the inode record.  The DELETED flag handles
			 * this, do not set RDIRTY.
			 */
			ip->ino_leaf.base.delete_tid = trans->tid;
			ip->sync_ino_leaf.base.delete_tid = trans->tid;
			ip->ino_leaf.delete_ts = trans->time32;
			ip->sync_ino_leaf.delete_ts = trans->time32;

			/*
			 * Adjust the inode count in the volume header
			 */
			hammer_sync_lock_sh(trans);
			if (ip->flags & HAMMER_INODE_ONDISK) {
				hammer_modify_volume_field(trans,
							   trans->rootvol,
							   vol0_stat_inodes);
				--ip->hmp->rootvol->ondisk->vol0_stat_inodes;
				hammer_modify_volume_done(trans->rootvol);
			}
			hammer_sync_unlock(trans);
		}
	}

	if (error)
		goto done;
	ip->sync_flags &= ~HAMMER_INODE_BUFS;

defer_buffer_flush:
	/*
	 * Now update the inode's on-disk inode-data and/or on-disk record.
	 * DELETED and ONDISK are managed only in ip->flags.
	 *
	 * In the case of a deferred buffer flush we still update the on-disk
	 * inode to satisfy visibility requirements if there happen to be
	 * directory dependencies.
	 */
	switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
	case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
		/*
		 * If deleted and on-disk, don't set any additional flags.
		 * The delete flag takes care of things.
		 *
		 * Clear flags which may have been set by the frontend.
		 */
		ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
				    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
				    HAMMER_INODE_DELETING);
		break;
	case HAMMER_INODE_DELETED:
		/*
		 * Take care of the case where a deleted inode was never
		 * flushed to the disk in the first place.
		 *
		 * Clear flags which may have been set by the frontend.
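		 *
		 * A file created and then unlinked before it was ever flushed
		 * lands here: its in-memory records are simply marked as
		 * deleted by the backend below and never reach the media.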
2780 */ 2781 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 2782 HAMMER_INODE_ATIME | HAMMER_INODE_MTIME | 2783 HAMMER_INODE_DELETING); 2784 while (RB_ROOT(&ip->rec_tree)) { 2785 hammer_record_t record = RB_ROOT(&ip->rec_tree); 2786 hammer_ref(&record->lock); 2787 KKASSERT(record->lock.refs == 1); 2788 record->flags |= HAMMER_RECF_DELETED_BE; 2789 ++record->ip->rec_generation; 2790 hammer_rel_mem_record(record); 2791 } 2792 break; 2793 case HAMMER_INODE_ONDISK: 2794 /* 2795 * If already on-disk, do not set any additional flags. 2796 */ 2797 break; 2798 default: 2799 /* 2800 * If not on-disk and not deleted, set DDIRTY to force 2801 * an initial record to be written. 2802 * 2803 * Also set the create_tid in both the frontend and backend 2804 * copy of the inode record. 2805 */ 2806 ip->ino_leaf.base.create_tid = trans->tid; 2807 ip->ino_leaf.create_ts = trans->time32; 2808 ip->sync_ino_leaf.base.create_tid = trans->tid; 2809 ip->sync_ino_leaf.create_ts = trans->time32; 2810 ip->sync_flags |= HAMMER_INODE_DDIRTY; 2811 break; 2812 } 2813 2814 /* 2815 * If RDIRTY or DDIRTY is set, write out a new record. If the inode 2816 * is already on-disk the old record is marked as deleted. 2817 * 2818 * If DELETED is set hammer_update_inode() will delete the existing 2819 * record without writing out a new one. 2820 * 2821 * If *ONLY* the ITIMES flag is set we can update the record in-place. 2822 */ 2823 if (ip->flags & HAMMER_INODE_DELETED) { 2824 error = hammer_update_inode(&cursor, ip); 2825 } else 2826 if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 && 2827 (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) { 2828 error = hammer_update_itimes(&cursor, ip); 2829 } else 2830 if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) { 2831 error = hammer_update_inode(&cursor, ip); 2832 } 2833 done: 2834 if (error) { 2835 hammer_critical_error(ip->hmp, ip, error, 2836 "while syncing inode"); 2837 } 2838 hammer_done_cursor(&cursor); 2839 return(error); 2840 } 2841 2842 /* 2843 * This routine is called when the OS is no longer actively referencing 2844 * the inode (but might still be keeping it cached), or when releasing 2845 * the last reference to an inode. 2846 * 2847 * At this point if the inode's nlinks count is zero we want to destroy 2848 * it, which may mean destroying it on-media too. 2849 */ 2850 void 2851 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp) 2852 { 2853 struct vnode *vp; 2854 2855 /* 2856 * Set the DELETING flag when the link count drops to 0 and the 2857 * OS no longer has any opens on the inode. 2858 * 2859 * The backend will clear DELETING (a mod flag) and set DELETED 2860 * (a state flag) when it is actually able to perform the 2861 * operation. 2862 * 2863 * Don't reflag the deletion if the flusher is currently syncing 2864 * one that was already flagged. A previously set DELETING flag 2865 * may bounce around flags and sync_flags until the operation is 2866 * completely done. 
	 */
	if (ip->ino_data.nlinks == 0 &&
	    ((ip->flags | ip->sync_flags) &
	     (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
		ip->flags |= HAMMER_INODE_DELETING;
		ip->flags |= HAMMER_INODE_TRUNCATED;
		ip->trunc_off = 0;
		vp = NULL;
		if (getvp) {
			if (hammer_get_vnode(ip, &vp) != 0)
				return;
		}

		/*
		 * Final cleanup
		 */
		if (ip->vp) {
			vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
			vnode_pager_setsize(ip->vp, 0);
		}
		if (getvp) {
			vput(vp);
		}
	}
}

/*
 * After potentially resolving a dependency the inode is tested
 * to determine whether it needs to be reflushed.
 */
void
hammer_test_inode(hammer_inode_t ip)
{
	if (ip->flags & HAMMER_INODE_REFLUSH) {
		ip->flags &= ~HAMMER_INODE_REFLUSH;
		hammer_ref(&ip->lock);
		if (ip->flags & HAMMER_INODE_RESIGNAL) {
			ip->flags &= ~HAMMER_INODE_RESIGNAL;
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		} else {
			hammer_flush_inode(ip, 0);
		}
		hammer_rel_inode(ip, 0);
	}
}

/*
 * Clear the RECLAIM flag on an inode.  This occurs when the inode is
 * reassociated with a vp or just before it gets freed.
 *
 * Pipeline wakeups to threads blocked due to an excessive number of
 * detached inodes.  The reclaim count generates a bit of negative
 * feedback.
 */
static void
hammer_inode_wakereclaims(hammer_inode_t ip, int dowake)
{
	struct hammer_reclaim *reclaim;
	hammer_mount_t hmp = ip->hmp;

	if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
		return;

	--hammer_count_reclaiming;
	--hmp->inode_reclaims;
	ip->flags &= ~HAMMER_INODE_RECLAIM;

	if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT || dowake) {
		reclaim = TAILQ_FIRST(&hmp->reclaim_list);
		if (reclaim && reclaim->count > 0 && --reclaim->count == 0) {
			TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
			wakeup(reclaim);
		}
	}
}

/*
 * Set up our reclaim pipeline.  We only let so many detached (and dirty)
 * inodes build up before we start blocking.
 *
 * When we block we don't care *which* inode has finished reclaiming,
 * as long as one does.  This is somewhat heuristic... we also put a
 * cap on how long we are willing to wait.
 */
void
hammer_inode_waitreclaims(hammer_mount_t hmp)
{
	struct hammer_reclaim reclaim;
	int delay;

	if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
		return;
	delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
		(HAMMER_RECLAIM_WAIT * 3) + 1;
	if (delay > 0) {
		reclaim.count = 2;
		TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
		tsleep(&reclaim, 0, "hmrrcm", delay);
		if (reclaim.count > 0)
			TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
	}
}

/*
 * A larger than normal backlog of inodes is sitting in the flusher;
 * enforce a general slowdown to let it catch up.  This routine is only
 * called on completion of a non-flusher-related transaction which
 * performed B-Tree node I/O.
 *
 * It is possible for the flusher to stall in a continuous load.
 * blogbench -i1000 -o seems to do a good job generating this sort of load.
 * If the flusher is unable to catch up the inode count can bloat until
 * we run out of kvm.
 *
 * This is a bit of a hack.
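 *
 * Illustrative numbers (hypothetical): with count_inodes at 100000 the
 * slowdown engages only once inode_reclaims reaches HAMMER_RECLAIM_WAIT
 * and count_iqueued reaches 10000 (1/10 of count_inodes), and it
 * disengages only after inode_reclaims drops below half of
 * HAMMER_RECLAIM_WAIT and count_iqueued drops below 5000 (1/20), so the
 * recovery mode does not oscillate under a steady load.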
2981 */ 2982 void 2983 hammer_inode_waithard(hammer_mount_t hmp) 2984 { 2985 /* 2986 * Hysteresis. 2987 */ 2988 if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) { 2989 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 && 2990 hmp->count_iqueued < hmp->count_inodes / 20) { 2991 hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY; 2992 return; 2993 } 2994 } else { 2995 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT || 2996 hmp->count_iqueued < hmp->count_inodes / 10) { 2997 return; 2998 } 2999 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY; 3000 } 3001 3002 /* 3003 * Block for one flush cycle. 3004 */ 3005 hammer_flusher_wait_next(hmp); 3006 } 3007 3008
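#if 0
/*
 * Illustrative sketch only -- a hypothetical convenience wrapper, not
 * part of the original source and never compiled.  It shows the typical
 * frontend sequence for forcing an inode out using the primitives
 * defined above: hammer_flush_inode() queues the inode and signals the
 * flusher, hammer_wait_inode() blocks until the inode returns to
 * HAMMER_FST_IDLE, and ip->error holds the result recorded by
 * hammer_flush_inode_done().
 */
static int
hammer_flush_inode_and_wait(hammer_inode_t ip)
{
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	hammer_wait_inode(ip);
	return(ip->error);
}
#endif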