/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.95 2008/07/07 03:49:50 dillon Exp $
 */

#include "hammer.h"
#include <vm/vm_extern.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static int hammer_unload_inode(struct hammer_inode *ip);
static void hammer_free_inode(hammer_inode_t ip);
static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
static int hammer_setup_child_callback(hammer_record_t rec, void *data);
static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
static int hammer_setup_parent_inodes(hammer_inode_t ip);
static int hammer_setup_parent_inodes_helper(hammer_record_t record);
static void hammer_inode_wakereclaims(hammer_inode_t ip);

#ifdef DEBUG_TRUNCATE
extern struct hammer_inode *HammerTruncIp;
#endif

/*
 * RB-Tree support for inode structures
 */
int
hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
	if (ip1->obj_localization < ip2->obj_localization)
		return(-1);
	if (ip1->obj_localization > ip2->obj_localization)
		return(1);
	if (ip1->obj_id < ip2->obj_id)
		return(-1);
	if (ip1->obj_id > ip2->obj_id)
		return(1);
	if (ip1->obj_asof < ip2->obj_asof)
		return(-1);
	if (ip1->obj_asof > ip2->obj_asof)
		return(1);
	return(0);
}

/*
 * RB-Tree support for inode structures / special LOOKUP_INFO
 */
static int
hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
{
	if (info->obj_localization < ip->obj_localization)
		return(-1);
	if (info->obj_localization > ip->obj_localization)
		return(1);
	if (info->obj_id < ip->obj_id)
		return(-1);
	if (info->obj_id > ip->obj_id)
		return(1);
	if (info->obj_asof < ip->obj_asof)
		return(-1);
	if (info->obj_asof > ip->obj_asof)
		return(1);
	return(0);
}

/*
 * Used by hammer_scan_inode_snapshots() to locate all of an object's
 * snapshots.  Note that the asof field is not tested, which we can get
 * away with because it is the lowest-priority field.
 */
static int
hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
{
	hammer_inode_info_t info = data;

	if (ip->obj_localization > info->obj_localization)
		return(1);
	if (ip->obj_localization < info->obj_localization)
		return(-1);
	if (ip->obj_id > info->obj_id)
		return(1);
	if (ip->obj_id < info->obj_id)
		return(-1);
	return(0);
}

/*
 * RB-Tree support for pseudofs structures
 */
static int
hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
{
	if (p1->localization < p2->localization)
		return(-1);
	if (p1->localization > p2->localization)
		return(1);
	return(0);
}


RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
		hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
	     hammer_pfs_rb_compare, u_int32_t, localization);

/*
 * The kernel is not actively referencing this vnode but is still holding
 * it cached.
 *
 * This is called from the frontend.
 */
int
hammer_vop_inactive(struct vop_inactive_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);

	/*
	 * Degenerate case
	 */
	if (ip == NULL) {
		vrecycle(ap->a_vp);
		return(0);
	}

	/*
	 * If the inode no longer has visibility in the filesystem try to
	 * recycle it immediately, even if the inode is dirty.  Recycling
	 * it quickly allows the system to reclaim buffer cache and VM
	 * resources which can matter a lot in a heavily loaded system.
	 *
	 * This can deadlock in vfsync() if we aren't careful.
	 *
	 * Do not queue the inode to the flusher if we still have visibility,
	 * otherwise namespace calls such as chmod will unnecessarily generate
	 * multiple inode updates.
	 */
	hammer_inode_unloadable_check(ip, 0);
	if (ip->ino_data.nlinks == 0) {
		if (ip->flags & HAMMER_INODE_MODMASK)
			hammer_flush_inode(ip, 0);
		vrecycle(ap->a_vp);
	}
	return(0);
}

/*
 * Release the vnode association.  This is typically (but not always)
 * the last reference on the inode.
 *
 * Once the association is lost we are on our own with regards to
 * flushing the inode.
 */
int
hammer_vop_reclaim(struct vop_reclaim_args *ap)
{
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct vnode *vp;

	vp = ap->a_vp;

	if ((ip = vp->v_data) != NULL) {
		hmp = ip->hmp;
		vp->v_data = NULL;
		ip->vp = NULL;

		if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
			++hammer_count_reclaiming;
			++hmp->inode_reclaims;
			ip->flags |= HAMMER_INODE_RECLAIM;
			if (hmp->inode_reclaims > HAMMER_RECLAIM_FLUSH &&
			    (hmp->inode_reclaims & 255) == 0) {
				hammer_flusher_async(hmp);
			}
		}
		hammer_rel_inode(ip, 1);
	}
	return(0);
}

/*
 * Return a locked vnode for the specified inode.  The inode must be
 * referenced but NOT LOCKED on entry and will remain referenced on
 * return.
 *
 * Called from the frontend.
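 *
 * NOTE: A freshly acquired vnode can race a concurrent reclaim, so the
 *	 code below loops until ip->vp is stable.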
 */
int
hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
{
	hammer_mount_t hmp;
	struct vnode *vp;
	int error = 0;

	hmp = ip->hmp;

	for (;;) {
		if ((vp = ip->vp) == NULL) {
			error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
			if (error)
				break;
			hammer_lock_ex(&ip->lock);
			if (ip->vp != NULL) {
				hammer_unlock(&ip->lock);
				vp->v_type = VBAD;
				vx_put(vp);
				continue;
			}
			hammer_ref(&ip->lock);
			vp = *vpp;
			ip->vp = vp;
			vp->v_type =
				hammer_get_vnode_type(ip->ino_data.obj_type);

			hammer_inode_wakereclaims(ip);

			switch(ip->ino_data.obj_type) {
			case HAMMER_OBJTYPE_CDEV:
			case HAMMER_OBJTYPE_BDEV:
				vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
				addaliasu(vp, ip->ino_data.rmajor,
					  ip->ino_data.rminor);
				break;
			case HAMMER_OBJTYPE_FIFO:
				vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
				break;
			default:
				break;
			}

			/*
			 * Only mark as the root vnode if the ip is not
			 * historical, otherwise the VFS cache will get
			 * confused.  The other half of the special handling
			 * is in hammer_vop_nlookupdotdot().
			 *
			 * Pseudo-filesystem roots also do not count.
			 */
			if (ip->obj_id == HAMMER_OBJID_ROOT &&
			    ip->obj_asof == hmp->asof &&
			    ip->obj_localization == 0) {
				vp->v_flag |= VROOT;
			}

			vp->v_data = (void *)ip;
			/* vnode locked by getnewvnode() */
			/* make related vnode dirty if inode dirty? */
			hammer_unlock(&ip->lock);
			if (vp->v_type == VREG)
				vinitvmio(vp, ip->ino_data.size);
			break;
		}

		/*
		 * loop if the vget fails (aka races), or if the vp
		 * no longer matches ip->vp.
		 */
		if (vget(vp, LK_EXCLUSIVE) == 0) {
			if (vp == ip->vp)
				break;
			vput(vp);
		}
	}
	*vpp = vp;
	return(error);
}

/*
 * Locate all copies of the inode for obj_id compatible with the specified
 * asof, reference, and issue the related call-back.  This routine is used
 * for direct-io invalidation and does not create any new inodes.
 */
void
hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
			    int (*callback)(hammer_inode_t ip, void *data),
			    void *data)
{
	hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
				   hammer_inode_info_cmp_all_history,
				   callback, iinfo);
}

/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 * do not attach or detach the related vnode (use hammer_get_vnode() for
 * that).
 *
 * The flags argument is only applied for newly created inodes, and only
 * certain flags are inherited.
 *
 * Called from the frontend.
 */
struct hammer_inode *
hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
		 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
		 int flags, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	struct hammer_inode_info iinfo;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;


	/*
	 * Determine if we already have an inode cached.  If we do then
	 * we are golden.
	 */
	iinfo.obj_id = obj_id;
	iinfo.obj_asof = asof;
	iinfo.obj_localization = localization;
loop:
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
	if (ip) {
		hammer_ref(&ip->lock);
		*errorp = 0;
		return(ip);
	}

	/*
	 * Allocate a new inode structure and deal with races later.
	 */
	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	++hmp->count_inodes;
	ip->obj_id = obj_id;
	ip->obj_asof = iinfo.obj_asof;
	ip->obj_localization = localization;
	ip->hmp = hmp;
	ip->flags = flags & HAMMER_INODE_RO;
	ip->cache[0].ip = ip;
	ip->cache[1].ip = ip;
	if (hmp->ronly)
		ip->flags |= HAMMER_INODE_RO;
	ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
		0x7FFFFFFFFFFFFFFFLL;
	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->target_list);
	hammer_ref(&ip->lock);

	/*
	 * Locate the on-disk inode.
	 */
retry:
	hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL);
	cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = 0;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor.key_beg.obj_type = 0;
	cursor.asof = iinfo.obj_asof;
	cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
		       HAMMER_CURSOR_ASOF;

	*errorp = hammer_btree_lookup(&cursor);
	if (*errorp == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * On success the B-Tree lookup will hold the appropriate
	 * buffer cache buffers and provide a pointer to the requested
	 * information.  Copy the information to the in-memory inode
	 * and cache the B-Tree node to improve future operations.
	 */
	if (*errorp == 0) {
		ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
		ip->ino_data = cursor.data->inode;

		/*
		 * cache[0] tries to cache the location of the object inode.
		 * The assumption is that it is near the directory inode.
		 *
		 * cache[1] tries to cache the location of the object data.
		 * The assumption is that it is near the directory data.
		 */
		hammer_cache_node(&ip->cache[0], cursor.node);
		if (dip && dip->cache[1].node)
			hammer_cache_node(&ip->cache[1], dip->cache[1].node);

		/*
		 * The file should not contain any data past the file size
		 * stored in the inode.  Setting save_trunc_off to the
		 * file size instead of max reduces B-Tree lookup overheads
		 * on append by allowing the flusher to avoid checking for
		 * record overwrites.
		 */
		ip->save_trunc_off = ip->ino_data.size;

		/*
		 * Locate and assign the pseudofs management structure to
		 * the inode.
		 */
		if (dip && dip->obj_localization == ip->obj_localization) {
			ip->pfsm = dip->pfsm;
			hammer_ref(&ip->pfsm->lock);
		} else {
			*errorp = hammer_load_pseudofs(trans, ip);
		}
	}

	/*
	 * The inode is placed on the red-black tree and will be synced to
	 * the media when flushed or by the filesystem sync.  If this races
	 * another instantiation/lookup the insertion will fail.
	 */
	if (*errorp == 0) {
		if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
			hammer_free_inode(ip);
			hammer_done_cursor(&cursor);
			goto loop;
		}
		ip->flags |= HAMMER_INODE_ONDISK;
	} else {
		if (ip->flags & HAMMER_INODE_RSV_INODES) {
			ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
			--hmp->rsv_inodes;
		}

		hammer_free_inode(ip);
		ip = NULL;
	}
	hammer_done_cursor(&cursor);
	return (ip);
}

/*
 * Create a new filesystem object, returning the inode in *ipp.  The
 * returned inode will be referenced.
 *
 * The inode is created in-memory.
 */
int
hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
		    struct ucred *cred, hammer_inode_t dip,
		    int pseudofs, struct hammer_inode **ipp)
{
	hammer_mount_t hmp;
	hammer_inode_t ip;
	uid_t xuid;
	u_int32_t localization;
	int error;

	hmp = trans->hmp;

	/*
	 * Assign the localization domain.  If dip is NULL we are creating
	 * a pseudo-fs and must locate an unused localization domain.
	 */
	if (pseudofs) {
		for (localization = HAMMER_DEF_LOCALIZATION;
		     localization < HAMMER_LOCALIZE_PSEUDOFS_MASK;
		     localization += HAMMER_LOCALIZE_PSEUDOFS_INC) {
			ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
					      hmp->asof, localization,
					      0, &error);
			if (ip == NULL) {
				if (error != ENOENT)
					return(error);
				break;
			}
			if (ip)
				hammer_rel_inode(ip, 0);
		}
	} else {
		localization = dip->obj_localization;
	}

	ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
	++hammer_count_inodes;
	++hmp->count_inodes;

	/*
	 * Allocate a new object id.  If creating a new pseudo-fs the
	 * obj_id is 1.
	 */
	if (pseudofs)
		ip->obj_id = HAMMER_OBJID_ROOT;
	else
		ip->obj_id = hammer_alloc_objid(hmp, dip);
	ip->obj_localization = localization;

	KKASSERT(ip->obj_id != 0);
	ip->obj_asof = hmp->asof;
	ip->hmp = hmp;
	ip->flush_state = HAMMER_FST_IDLE;
	ip->flags = HAMMER_INODE_DDIRTY |
		    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
	ip->cache[0].ip = ip;
	ip->cache[1].ip = ip;

	ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
	/* ip->save_trunc_off = 0; (already zero) */
	RB_INIT(&ip->rec_tree);
	TAILQ_INIT(&ip->target_list);

	ip->ino_data.atime = trans->time;
	ip->ino_data.mtime = trans->time;
	ip->ino_data.size = 0;
	ip->ino_data.nlinks = 0;

	/*
	 * A nohistory designator on the parent directory is inherited by
	 * the child.  We will do this even for pseudo-fs creation... the
	 * sysad can turn it off.
	 */
	ip->ino_data.uflags = dip->ino_data.uflags &
			      (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);

	ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
	ip->ino_leaf.base.localization = ip->obj_localization +
					 HAMMER_LOCALIZE_INODE;
	ip->ino_leaf.base.obj_id = ip->obj_id;
	ip->ino_leaf.base.key = 0;
	ip->ino_leaf.base.create_tid = 0;
	ip->ino_leaf.base.delete_tid = 0;
	ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
	ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);

	ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
	ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
	ip->ino_data.mode = vap->va_mode;
	ip->ino_data.ctime = trans->time;

	/*
	 * Setup the ".." pointer.  This only needs to be done for directories
	 * but we do it for all objects as a recovery aid.
	 *
	 * The parent_obj_localization field only applies to pseudo-fs roots.
	 */
	ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
	    ip->obj_id == HAMMER_OBJID_ROOT) {
		ip->ino_data.ext.obj.parent_obj_localization =
						dip->obj_localization;
	}

	switch(ip->ino_leaf.base.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		ip->ino_data.rmajor = vap->va_rmajor;
		ip->ino_data.rminor = vap->va_rminor;
		break;
	default:
		break;
	}

	/*
	 * Calculate default uid/gid and overwrite with information from
	 * the vap.
	 */
	xuid = hammer_to_unix_xid(&dip->ino_data.uid);
	xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
				     &vap->va_mode);
	ip->ino_data.mode = vap->va_mode;

	if (vap->va_vaflags & VA_UID_UUID_VALID)
		ip->ino_data.uid = vap->va_uid_uuid;
	else if (vap->va_uid != (uid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
	else
		hammer_guid_to_uuid(&ip->ino_data.uid, xuid);

	if (vap->va_vaflags & VA_GID_UUID_VALID)
		ip->ino_data.gid = vap->va_gid_uuid;
	else if (vap->va_gid != (gid_t)VNOVAL)
		hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
	else
		ip->ino_data.gid = dip->ino_data.gid;

	hammer_ref(&ip->lock);

	if (dip->obj_localization == ip->obj_localization) {
		ip->pfsm = dip->pfsm;
		hammer_ref(&ip->pfsm->lock);
		error = 0;
	} else {
		error = hammer_load_pseudofs(trans, ip);
	}

	if (error) {
		hammer_free_inode(ip);
		ip = NULL;
	} else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
		panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
		/* not reached */
		hammer_free_inode(ip);
	}
	*ipp = ip;
	return(error);
}

/*
 * Final cleanup / freeing of an inode structure
 */
static void
hammer_free_inode(hammer_inode_t ip)
{
	KKASSERT(ip->lock.refs == 1);
	hammer_uncache_node(&ip->cache[0]);
	hammer_uncache_node(&ip->cache[1]);
	hammer_inode_wakereclaims(ip);
	if (ip->objid_cache)
		hammer_clear_objid(ip);
	--hammer_count_inodes;
	--ip->hmp->count_inodes;
	if (ip->pfsm) {
		hammer_rel_pseudofs(ip->hmp, ip->pfsm);
		ip->pfsm = NULL;
	}
	kfree(ip, M_HAMMER);
	ip = NULL;
}

/*
 * Retrieve pseudo-fs data.
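 *
 * The in-memory pfsm structure is cached in hmp->rb_pfsm_root, keyed by
 * localization, and is shared by all inodes in the same pseudo-fs.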
 */
int
hammer_load_pseudofs(hammer_transaction_t trans, hammer_inode_t ip)
{
	hammer_mount_t hmp = trans->hmp;
	hammer_pseudofs_inmem_t pfsm;
	struct hammer_cursor cursor;
	int error;
	int bytes;

retry:
	pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root,
			 ip->obj_localization);
	if (pfsm) {
		KKASSERT(ip->pfsm == NULL);
		ip->pfsm = pfsm;
		hammer_ref(&pfsm->lock);
		return(0);
	}

	pfsm = kmalloc(sizeof(*pfsm), M_HAMMER, M_WAITOK | M_ZERO);
	pfsm->localization = ip->obj_localization;
	pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
	pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;

	hammer_init_cursor(trans, &cursor, NULL, NULL);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_PSEUDOFS;
	cursor.asof = HAMMER_MAX_TID;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		error = hammer_btree_extract(&cursor, HAMMER_CURSOR_GET_DATA);
		if (error == 0) {
			bytes = cursor.leaf->data_len;
			if (bytes > sizeof(pfsm->pfsd))
				bytes = sizeof(pfsm->pfsd);
			bcopy(cursor.data, &pfsm->pfsd, bytes);
		}
	} else if (error == ENOENT) {
		error = 0;
	}

	hammer_done_cursor(&cursor);

	if (error == 0) {
		pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
		hammer_ref(&pfsm->lock);
		if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
			kfree(pfsm, M_HAMMER);
			goto retry;
		}
		ip->pfsm = pfsm;

		/*
		 * Certain aspects of the pseudofs configuration are reflected
		 * in the inode.
		 */
		if (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) {
			ip->flags |= HAMMER_INODE_RO;
			ip->flags |= HAMMER_INODE_PFSD;
		} else if (pfsm->pfsd.master_id >= 0) {
			ip->flags |= HAMMER_INODE_PFSD;
		}
	} else {
		kprintf("cannot load pfsm error %d\n", error);
		kfree(pfsm, M_HAMMER);
	}
	return(error);
}

/*
 * Store pseudo-fs data.  The backend will automatically delete any prior
 * on-disk pseudo-fs data but we have to delete in-memory versions.
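 *
 * If the cursor deadlocks against the backend the operation is retried
 * from scratch.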
 */
int
hammer_save_pseudofs(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	hammer_pseudofs_inmem_t pfsm;
	hammer_record_t record;
	int error;

retry:
	pfsm = ip->pfsm;
	pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_PSEUDOFS;
	cursor.asof = HAMMER_MAX_TID;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0 && hammer_cursor_inmem(&cursor)) {
		record = cursor.iprec;
		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
			KKASSERT(cursor.deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor.deadlk_rec = record;
			error = EDEADLK;
		} else {
			record->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	}
	if (error == 0 || error == ENOENT) {
		record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
		record->type = HAMMER_MEM_RECORD_GENERAL;

		record->leaf.base.localization = ip->obj_localization +
						 HAMMER_LOCALIZE_MISC;
		record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
		record->leaf.base.key = HAMMER_FIXKEY_PSEUDOFS;
		record->leaf.data_len = sizeof(pfsm->pfsd);
		bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
		error = hammer_ip_add_record(trans, record);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == 0) {
		/*
		 * Certain aspects of the pseudofs configuration are reflected
		 * in the inode.  Note that we cannot mess with the as-of or
		 * clear the read-only state.
		 *
		 * If this inode represented a slave snapshot its asof will
		 * be set to a snapshot tid.  When clearing slave mode any
		 * re-access of the inode via the parent directory will
		 * wind up using a different asof and thus will instantiate
		 * a new inode.
		 */
		if (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) {
			ip->flags |= HAMMER_INODE_RO;
			ip->flags |= HAMMER_INODE_PFSD;
		} else if (pfsm->pfsd.master_id >= 0) {
			ip->flags |= HAMMER_INODE_PFSD;
		} else {
			ip->flags &= ~HAMMER_INODE_PFSD;
		}
	}
	return(error);
}

void
hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
{
	hammer_unref(&pfsm->lock);
	if (pfsm->lock.refs == 0) {
		RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
		kfree(pfsm, M_HAMMER);
	}
}

/*
 * Called by hammer_sync_inode().
 */
static int
hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
{
	hammer_transaction_t trans = cursor->trans;
	hammer_record_t record;
	int error;
	int redirty;

retry:
	error = 0;

	/*
	 * If the inode has a presence on-disk then locate it and mark
	 * it deleted, setting DELONDISK.
	 *
	 * The record may or may not be physically deleted, depending on
	 * the retention policy.
	 */
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
	    HAMMER_INODE_ONDISK) {
		hammer_normalize_cursor(cursor);
		cursor->key_beg.localization = ip->obj_localization +
					       HAMMER_LOCALIZE_INODE;
		cursor->key_beg.obj_id = ip->obj_id;
		cursor->key_beg.key = 0;
		cursor->key_beg.create_tid = 0;
		cursor->key_beg.delete_tid = 0;
		cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
		cursor->key_beg.obj_type = 0;
		cursor->asof = ip->obj_asof;
		cursor->flags &= ~HAMMER_CURSOR_INITMASK;
		cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
		cursor->flags |= HAMMER_CURSOR_BACKEND;

		error = hammer_btree_lookup(cursor);
		if (hammer_debug_inode)
			kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
		if (error) {
			kprintf("error %d\n", error);
			Debugger("hammer_update_inode");
		}

		if (error == 0) {
			error = hammer_ip_delete_record(cursor, ip, trans->tid);
			if (hammer_debug_inode)
				kprintf(" error %d\n", error);
			if (error && error != EDEADLK) {
				kprintf("error %d\n", error);
				Debugger("hammer_update_inode2");
			}
			if (error == 0) {
				ip->flags |= HAMMER_INODE_DELONDISK;
			}
			if (cursor->node)
				hammer_cache_node(&ip->cache[0], cursor->node);
		}
		if (error == EDEADLK) {
			hammer_done_cursor(cursor);
			error = hammer_init_cursor(trans, cursor,
						   &ip->cache[0], ip);
			if (hammer_debug_inode)
				kprintf("IPDED %p %d\n", ip, error);
			if (error == 0)
				goto retry;
		}
	}

	/*
	 * Ok, write out the initial record or a new record (after deleting
	 * the old one), unless the DELETED flag is set.  This routine will
	 * clear DELONDISK if it writes out a record.
	 *
	 * Update our inode statistics if this is the first application of
	 * the inode on-disk.
	 */
	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
		/*
		 * Generate a record and write it to the media
		 */
		record = hammer_alloc_mem_record(ip, 0);
		record->type = HAMMER_MEM_RECORD_INODE;
		record->flush_state = HAMMER_FST_FLUSH;
		record->leaf = ip->sync_ino_leaf;
		record->leaf.base.create_tid = trans->tid;
		record->leaf.data_len = sizeof(ip->sync_ino_data);
		record->leaf.create_ts = trans->time32;
		record->data = (void *)&ip->sync_ino_data;
		record->flags |= HAMMER_RECF_INTERLOCK_BE;

		/*
		 * If this flag is set we cannot sync the new file size
		 * because we haven't finished related truncations.  The
		 * inode will be flushed in another flush group to finish
		 * the job.
		 */
		if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
		    ip->sync_ino_data.size != ip->ino_data.size) {
			redirty = 1;
			ip->sync_ino_data.size = ip->ino_data.size;
		} else {
			redirty = 0;
		}

		for (;;) {
			error = hammer_ip_sync_record_cursor(cursor, record);
			if (hammer_debug_inode)
				kprintf("GENREC %p rec %08x %d\n",
					ip, record->flags, error);
			if (error != EDEADLK)
				break;
			hammer_done_cursor(cursor);
			error = hammer_init_cursor(trans, cursor,
						   &ip->cache[0], ip);
			if (hammer_debug_inode)
				kprintf("GENREC reinit %d\n", error);
			if (error)
				break;
		}
		if (error) {
			kprintf("error %d\n", error);
			Debugger("hammer_update_inode3");
		}

		/*
		 * The record isn't managed by the inode's record tree,
		 * destroy it whether we succeed or fail.
		 */
		record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
		record->flags |= HAMMER_RECF_DELETED_FE;
		record->flush_state = HAMMER_FST_IDLE;
		hammer_rel_mem_record(record);

		/*
		 * Finish up.
		 */
		if (error == 0) {
			if (hammer_debug_inode)
				kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
			ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
					    HAMMER_INODE_ATIME |
					    HAMMER_INODE_MTIME);
			ip->flags &= ~HAMMER_INODE_DELONDISK;
			if (redirty)
				ip->sync_flags |= HAMMER_INODE_DDIRTY;

			/*
			 * Root volume count of inodes
			 */
			if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
				hammer_modify_volume_field(trans,
							   trans->rootvol,
							   vol0_stat_inodes);
				++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
				hammer_modify_volume_done(trans->rootvol);
				ip->flags |= HAMMER_INODE_ONDISK;
				if (hammer_debug_inode)
					kprintf("NOWONDISK %p\n", ip);
			}
		}
	}

	/*
	 * If the inode has been destroyed, clean out any left-over flags
	 * that may have been set by the frontend.
	 */
	if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
		ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
				    HAMMER_INODE_ATIME |
				    HAMMER_INODE_MTIME);
	}
	return(error);
}

/*
 * Update only the itimes fields.
 *
 * ATIME can be updated without generating any UNDO.  MTIME is updated
 * with UNDO so it is guaranteed to be synchronized properly in case of
 * a crash.
 *
 * Neither field is included in the B-Tree leaf element's CRC, which is how
 * we can get away with updating ATIME the way we do.
 */
static int
hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
{
	hammer_transaction_t trans = cursor->trans;
	int error;

retry:
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
	    HAMMER_INODE_ONDISK) {
		return(0);
	}

	hammer_normalize_cursor(cursor);
	cursor->key_beg.localization = ip->obj_localization +
				       HAMMER_LOCALIZE_INODE;
	cursor->key_beg.obj_id = ip->obj_id;
	cursor->key_beg.key = 0;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor->key_beg.obj_type = 0;
	cursor->asof = ip->obj_asof;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_GET_LEAF;
	cursor->flags |= HAMMER_CURSOR_GET_DATA;
	cursor->flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_btree_lookup(cursor);
	if (error) {
		kprintf("error %d\n", error);
		Debugger("hammer_update_itimes1");
	}
	if (error == 0) {
		hammer_cache_node(&ip->cache[0], cursor->node);
		if (ip->sync_flags & HAMMER_INODE_MTIME) {
			/*
			 * Updating MTIME requires an UNDO.  Just cover
			 * both atime and mtime.
			 */
			hammer_modify_buffer(trans, cursor->data_buffer,
				     HAMMER_ITIMES_BASE(&cursor->data->inode),
				     HAMMER_ITIMES_BYTES);
			cursor->data->inode.atime = ip->sync_ino_data.atime;
			cursor->data->inode.mtime = ip->sync_ino_data.mtime;
			hammer_modify_buffer_done(cursor->data_buffer);
		} else if (ip->sync_flags & HAMMER_INODE_ATIME) {
			/*
			 * Updating atime only can be done in-place with
			 * no UNDO.
			 */
			hammer_modify_buffer(trans, cursor->data_buffer,
					     NULL, 0);
			cursor->data->inode.atime = ip->sync_ino_data.atime;
			hammer_modify_buffer_done(cursor->data_buffer);
		}
		ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
	}
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor,
					   &ip->cache[0], ip);
		if (error == 0)
			goto retry;
	}
	return(error);
}

/*
 * Release a reference on an inode, flush as requested.
 *
 * On the last reference we queue the inode to the flusher for its final
 * disposition.
 */
void
hammer_rel_inode(struct hammer_inode *ip, int flush)
{
	hammer_mount_t hmp = ip->hmp;

	/*
	 * Handle disposition when dropping the last ref.
	 */
	for (;;) {
		if (ip->lock.refs == 1) {
			/*
			 * Determine whether on-disk action is needed for
			 * the inode's final disposition.
			 */
			KKASSERT(ip->vp == NULL);
			hammer_inode_unloadable_check(ip, 0);
			if (ip->flags & HAMMER_INODE_MODMASK) {
				if (hmp->rsv_inodes > desiredvnodes) {
					hammer_flush_inode(ip,
							   HAMMER_FLUSH_SIGNAL);
				} else {
					hammer_flush_inode(ip, 0);
				}
			} else if (ip->lock.refs == 1) {
				hammer_unload_inode(ip);
				break;
			}
		} else {
			if (flush)
				hammer_flush_inode(ip, 0);

			/*
			 * The inode still has multiple refs, try to drop
			 * one ref.
			 */
			KKASSERT(ip->lock.refs >= 1);
			if (ip->lock.refs > 1) {
				hammer_unref(&ip->lock);
				break;
			}
		}
	}
}

/*
 * Unload and destroy the specified inode.  Must be called with one remaining
 * reference.  The reference is disposed of.
 *
 * This can only be called in the context of the flusher.
 */
static int
hammer_unload_inode(struct hammer_inode *ip)
{
	hammer_mount_t hmp = ip->hmp;

	KASSERT(ip->lock.refs == 1,
		("hammer_unload_inode: %d refs\n", ip->lock.refs));
	KKASSERT(ip->vp == NULL);
	KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
	KKASSERT(ip->cursor_ip_refs == 0);
	KKASSERT(ip->lock.lockcount == 0);
	KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);

	KKASSERT(RB_EMPTY(&ip->rec_tree));
	KKASSERT(TAILQ_EMPTY(&ip->target_list));

	RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);

	hammer_free_inode(ip);
	return(0);
}

/*
 * Called on mount -u when switching from RW to RO or vice-versa.  Adjust
 * the read-only flag for cached inodes.
 *
 * This routine is called from a RB_SCAN().
 */
int
hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
{
	hammer_mount_t hmp = ip->hmp;

	if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
		ip->flags |= HAMMER_INODE_RO;
	else
		ip->flags &= ~HAMMER_INODE_RO;
	return(0);
}

/*
 * A transaction has modified an inode, requiring updates as specified by
 * the passed flags.
 *
 * HAMMER_INODE_DDIRTY:		Inode data has been updated
 * HAMMER_INODE_XDIRTY:		Dirty in-memory records
 * HAMMER_INODE_BUFS:		Dirty buffer cache buffers
 * HAMMER_INODE_DELETED:	Inode record/data must be deleted
 * HAMMER_INODE_ATIME/MTIME:	mtime/atime has been updated
 */
void
hammer_modify_inode(hammer_inode_t ip, int flags)
{
	KKASSERT(ip->hmp->ronly == 0 ||
		  (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
			    HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
			    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
	if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
		ip->flags |= HAMMER_INODE_RSV_INODES;
		++ip->hmp->rsv_inodes;
	}

	ip->flags |= flags;
}

/*
 * Request that an inode be flushed.  This whole mess cannot block and may
 * recurse (if not synchronous).  Once requested HAMMER will attempt to
 * actively flush the inode until the flush can be done.
 *
 * The inode may already be flushing, or may be in a setup state.  We can
 * place the inode in a flushing state if it is currently idle and flag it
 * to reflush if it is currently flushing.
 *
 * If the HAMMER_FLUSH_SYNCHRONOUS flag is specified we will attempt to
 * flush the inode synchronously using the caller's context.
 */
void
hammer_flush_inode(hammer_inode_t ip, int flags)
{
	int good;

	/*
	 * Trivial 'nothing to flush' case.  If the inode is in a SETUP
	 * state we have to put it back into an IDLE state so we can
	 * drop the extra ref.
	 */
	if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
		if (ip->flush_state == HAMMER_FST_SETUP) {
			ip->flush_state = HAMMER_FST_IDLE;
			hammer_rel_inode(ip, 0);
		}
		return;
	}

	/*
	 * Our flush action will depend on the current state.
	 */
	switch(ip->flush_state) {
	case HAMMER_FST_IDLE:
		/*
		 * We have no dependancies and can flush immediately.  Some
		 * of our children may not be flushable so we have to re-test
		 * with that additional knowledge.
		 */
		hammer_flush_inode_core(ip, flags);
		break;
	case HAMMER_FST_SETUP:
		/*
		 * Recurse upwards through dependancies via target_list
		 * and start their flusher actions going if possible.
		 *
		 * 'good' is our connectivity.  -1 means we have none and
		 * can't flush, 0 means there weren't any dependancies, and
		 * 1 means we have good connectivity.
		 */
		good = hammer_setup_parent_inodes(ip);

		/*
		 * We can continue if good >= 0.  Determine how many records
		 * under our inode can be flushed (and mark them).
		 */
		if (good >= 0) {
			hammer_flush_inode_core(ip, flags);
		} else {
			ip->flags |= HAMMER_INODE_REFLUSH;
			if (flags & HAMMER_FLUSH_SIGNAL) {
				ip->flags |= HAMMER_INODE_RESIGNAL;
				hammer_flusher_async(ip->hmp);
			}
		}
		break;
	default:
		/*
		 * We are already flushing, flag the inode to reflush
		 * if needed after it completes its current flush.
		 */
		if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
			ip->flags |= HAMMER_INODE_REFLUSH;
		if (flags & HAMMER_FLUSH_SIGNAL) {
			ip->flags |= HAMMER_INODE_RESIGNAL;
			hammer_flusher_async(ip->hmp);
		}
		break;
	}
}

/*
 * Scan ip->target_list, which is a list of records owned by PARENTS to our
 * ip which reference our ip.
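 *
 * The return value follows the 'good' convention used by
 * hammer_flush_inode(): -1 if a dependancy could not be resolved, 0 if
 * there were no dependancies, and 1 if we have connectivity.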
 *
 * XXX This is a huge mess of recursive code, but not one bit of it blocks
 *     so for now do not ref/deref the structures.  Note that if we use the
 *     ref/rel code later, the rel CAN block.
 */
static int
hammer_setup_parent_inodes(hammer_inode_t ip)
{
	hammer_record_t depend;
#if 0
	hammer_record_t next;
	hammer_inode_t pip;
#endif
	int good;
	int r;

	good = 0;
	TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
		r = hammer_setup_parent_inodes_helper(depend);
		KKASSERT(depend->target_ip == ip);
		if (r < 0 && good == 0)
			good = -1;
		if (r > 0)
			good = 1;
	}
	return(good);

#if 0
retry:
	good = 0;
	next = TAILQ_FIRST(&ip->target_list);
	if (next) {
		hammer_ref(&next->lock);
		hammer_ref(&next->ip->lock);
	}
	while ((depend = next) != NULL) {
		if (depend->target_ip == NULL) {
			pip = depend->ip;
			hammer_rel_mem_record(depend);
			hammer_rel_inode(pip, 0);
			goto retry;
		}
		KKASSERT(depend->target_ip == ip);
		next = TAILQ_NEXT(depend, target_entry);
		if (next) {
			hammer_ref(&next->lock);
			hammer_ref(&next->ip->lock);
		}
		r = hammer_setup_parent_inodes_helper(depend);
		if (r < 0 && good == 0)
			good = -1;
		if (r > 0)
			good = 1;
		pip = depend->ip;
		hammer_rel_mem_record(depend);
		hammer_rel_inode(pip, 0);
	}
	return(good);
#endif
}

/*
 * This helper function takes a record representing the dependancy between
 * the parent inode and child inode.
 *
 * record->ip		= parent inode
 * record->target_ip	= child inode
 *
 * We are asked to recurse upwards and convert the record from SETUP
 * to FLUSH if possible.
 *
 * Return 1 if the record gives us connectivity
 *
 * Return 0 if the record is not relevant
 *
 * Return -1 if we can't resolve the dependancy and there is no connectivity.
 */
static int
hammer_setup_parent_inodes_helper(hammer_record_t record)
{
	hammer_mount_t hmp;
	hammer_inode_t pip;
	int good;

	KKASSERT(record->flush_state != HAMMER_FST_IDLE);
	pip = record->ip;
	hmp = pip->hmp;

	/*
	 * If the record is already flushing, is it in our flush group?
	 *
	 * If it is in our flush group but it is a general record or a
	 * delete-on-disk, it does not improve our connectivity (return 0),
	 * and if the target inode is not trying to destroy itself we can't
	 * allow the operation yet anyway (the second return -1).
	 */
	if (record->flush_state == HAMMER_FST_FLUSH) {
		if (record->flush_group != hmp->flusher.next) {
			pip->flags |= HAMMER_INODE_REFLUSH;
			return(-1);
		}
		if (record->type == HAMMER_MEM_RECORD_ADD)
			return(1);
		/* GENERAL or DEL */
		return(0);
	}

	/*
	 * It must be a setup record.  Try to resolve the setup dependancies
	 * by recursing upwards so we can place ip on the flush list.
	 */
	KKASSERT(record->flush_state == HAMMER_FST_SETUP);

	good = hammer_setup_parent_inodes(pip);

	/*
	 * We can't flush ip because it has no connectivity (XXX also check
	 * nlinks for pre-existing connectivity!).  Flag it so any resolution
	 * recurses back down.
	 */
	if (good < 0) {
		pip->flags |= HAMMER_INODE_REFLUSH;
		return(good);
	}

	/*
	 * We are go, place the parent inode in a flushing state so we can
	 * place its record in a flushing state.  Note that the parent
	 * may already be flushing.  The record must be in the same flush
	 * group as the parent.
	 */
	if (pip->flush_state != HAMMER_FST_FLUSH)
		hammer_flush_inode_core(pip, HAMMER_FLUSH_RECURSION);
	KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flush_state == HAMMER_FST_SETUP);

#if 0
	if (record->type == HAMMER_MEM_RECORD_DEL &&
	    (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
		/*
		 * Regardless of flushing state we cannot sync this path if the
		 * record represents a delete-on-disk but the target inode
		 * is not ready to sync its own deletion.
		 *
		 * XXX need to count effective nlinks to determine whether
		 * the flush is ok, otherwise removing a hardlink will
		 * just leave the DEL record to rot.
		 */
		record->target_ip->flags |= HAMMER_INODE_REFLUSH;
		return(-1);
	} else
#endif
	if (pip->flush_group == pip->hmp->flusher.next) {
		/*
		 * This is the record we wanted to synchronize.  If the
		 * record went into a flush state while we blocked it
		 * had better be in the correct flush group.
		 */
		if (record->flush_state != HAMMER_FST_FLUSH) {
			record->flush_state = HAMMER_FST_FLUSH;
			record->flush_group = pip->flush_group;
			hammer_ref(&record->lock);
		} else {
			KKASSERT(record->flush_group == pip->flush_group);
		}
		if (record->type == HAMMER_MEM_RECORD_ADD)
			return(1);

		/*
		 * A general or delete-on-disk record does not contribute
		 * to our visibility.  We can still flush it, however.
		 */
		return(0);
	} else {
		/*
		 * We couldn't resolve the dependancies, request that the
		 * inode be flushed when the dependancies can be resolved.
		 */
		pip->flags |= HAMMER_INODE_REFLUSH;
		return(-1);
	}
}

/*
 * This is the core routine placing an inode into the FST_FLUSH state.
 */
static void
hammer_flush_inode_core(hammer_inode_t ip, int flags)
{
	int go_count;

	/*
	 * Set flush state and prevent the flusher from cycling into
	 * the next flush group.  Do not place the ip on the list yet.
	 * Inodes not in the idle state get an extra reference.
	 */
	KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
	if (ip->flush_state == HAMMER_FST_IDLE)
		hammer_ref(&ip->lock);
	ip->flush_state = HAMMER_FST_FLUSH;
	ip->flush_group = ip->hmp->flusher.next;
	++ip->hmp->flusher.group_lock;
	++ip->hmp->count_iqueued;
	++hammer_count_iqueued;

	/*
	 * We need to be able to vfsync/truncate from the backend.
	 */
	KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
	if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
		ip->flags |= HAMMER_INODE_VHELD;
		vref(ip->vp);
	}

	/*
	 * Figure out how many in-memory records we can actually flush
	 * (not including inode meta-data, buffers, etc).
	 *
	 * Do not add new records to the flush if this is a recursion or
	 * if we must still complete a flush from the previous flush cycle.
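	 *
	 * go_count reflects the records placed in (or already in) a flush
	 * state by the scan below; a WOULDBLOCK flush only moves records
	 * already flushing into the new group.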
	 */
	if (flags & HAMMER_FLUSH_RECURSION) {
		go_count = 1;
	} else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
		go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				   hammer_syncgrp_child_callback, NULL);
		go_count = 1;
	} else {
		go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				   hammer_setup_child_callback, NULL);
	}

	/*
	 * This is a more involved test that includes go_count.  If we
	 * can't flush, flag the inode and return.  If go_count is 0 we
	 * are unable to flush any records in our rec_tree and must
	 * ignore the XDIRTY flag.
	 */
	if (go_count == 0) {
		if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
			ip->flags |= HAMMER_INODE_REFLUSH;

			--ip->hmp->count_iqueued;
			--hammer_count_iqueued;

			ip->flush_state = HAMMER_FST_SETUP;
			if (ip->flags & HAMMER_INODE_VHELD) {
				ip->flags &= ~HAMMER_INODE_VHELD;
				vrele(ip->vp);
			}
			if (flags & HAMMER_FLUSH_SIGNAL) {
				ip->flags |= HAMMER_INODE_RESIGNAL;
				hammer_flusher_async(ip->hmp);
			}
			if (--ip->hmp->flusher.group_lock == 0)
				wakeup(&ip->hmp->flusher.group_lock);
			return;
		}
	}

	/*
	 * Snapshot the state of the inode for the backend flusher.
	 *
	 * We continue to retain save_trunc_off even when all truncations
	 * have been resolved as an optimization to determine if we can
	 * skip the B-Tree lookup for overwrite deletions.
	 *
	 * NOTE: The DELETING flag is a mod flag, but it is also sticky,
	 *	 and stays in ip->flags.  Once set, it stays set until the
	 *	 inode is destroyed.
	 *
	 * NOTE: If a truncation from a previous flush cycle had to be
	 *	 continued into this one, the TRUNCATED flag will still be
	 *	 set in sync_flags as will WOULDBLOCK.  When this occurs
	 *	 we CANNOT safely integrate a new truncation from the front-end
	 *	 because there may be data records in-memory assigned a flush
	 *	 state from the previous cycle that are supposed to be flushed
	 *	 before the next frontend truncation.
	 */
	if ((ip->flags & (HAMMER_INODE_TRUNCATED | HAMMER_INODE_WOULDBLOCK)) ==
	    HAMMER_INODE_TRUNCATED) {
		KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
		ip->sync_trunc_off = ip->trunc_off;
		ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
		ip->flags &= ~HAMMER_INODE_TRUNCATED;
		ip->sync_flags |= HAMMER_INODE_TRUNCATED;

		/*
		 * The save_trunc_off used to cache whether the B-Tree
		 * holds any records past that point is not used until
		 * after the truncation has succeeded, so we can safely
		 * set it now.
		 */
		if (ip->save_trunc_off > ip->sync_trunc_off)
			ip->save_trunc_off = ip->sync_trunc_off;
	}
	ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
			   ~HAMMER_INODE_TRUNCATED);
	ip->sync_ino_leaf = ip->ino_leaf;
	ip->sync_ino_data = ip->ino_data;
	ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
#ifdef DEBUG_TRUNCATE
	if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
		kprintf("truncateS %016llx\n", ip->sync_trunc_off);
#endif

	/*
	 * The flusher list inherits our inode and reference.
	 */
	TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
	if (--ip->hmp->flusher.group_lock == 0)
		wakeup(&ip->hmp->flusher.group_lock);

	if (flags & HAMMER_FLUSH_SIGNAL) {
		hammer_flusher_async(ip->hmp);
	}
}

/*
 * Callback for scan of ip->rec_tree.  Try to include each record in our
 * flush.  ip->flush_group has been set but the inode has not yet been
 * moved into a flushing state.
 *
 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
 * both inodes.
 *
 * We return 1 for any record placed or found in FST_FLUSH, which prevents
 * the caller from shortcutting the flush.
 */
static int
hammer_setup_child_callback(hammer_record_t rec, void *data)
{
	hammer_inode_t target_ip;
	hammer_inode_t ip;
	int r;

	/*
	 * Deleted records are ignored.  Note that the flush detects deleted
	 * front-end records at multiple points to deal with races.  This is
	 * just the first line of defense.  The only time DELETED_FE cannot
	 * be set is when HAMMER_RECF_INTERLOCK_BE is set.
	 *
	 * Don't get confused between record deletion and, say, directory
	 * entry deletion.  The deletion of a directory entry that is on
	 * the media has nothing to do with the record deletion flags.
	 *
	 * The flush_group for a record already in a flush state must
	 * be updated.  This case can only occur if the inode deleting
	 * too many records had to be moved to the next flush group.
	 */
	if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE)) {
		if (rec->flush_state == HAMMER_FST_FLUSH) {
			KKASSERT(rec->ip->flags & HAMMER_INODE_WOULDBLOCK);
			rec->flush_group = rec->ip->flush_group;
			r = 1;
		} else {
			r = 0;
		}
		return(r);
	}

	/*
	 * If the record is in an idle state it has no dependancies and
	 * can be flushed.
	 */
	ip = rec->ip;
	r = 0;

	switch(rec->flush_state) {
	case HAMMER_FST_IDLE:
		/*
		 * Record has no setup dependancy, we can flush it.
		 */
		KKASSERT(rec->target_ip == NULL);
		rec->flush_state = HAMMER_FST_FLUSH;
		rec->flush_group = ip->flush_group;
		hammer_ref(&rec->lock);
		r = 1;
		break;
	case HAMMER_FST_SETUP:
		/*
		 * Record has a setup dependancy.  Try to include the
		 * target ip in the flush.
		 *
		 * We have to be careful here, if we do not do the right
		 * thing we can lose track of dirty inodes and the system
		 * will lockup trying to allocate buffers.
		 */
		target_ip = rec->target_ip;
		KKASSERT(target_ip != NULL);
		KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
		if (target_ip->flush_state == HAMMER_FST_FLUSH) {
			/*
			 * If the target IP is already flushing in our group
			 * we are golden, otherwise make sure the target
			 * reflushes.
			 */
			if (target_ip->flush_group == ip->flush_group) {
				rec->flush_state = HAMMER_FST_FLUSH;
				rec->flush_group = ip->flush_group;
				hammer_ref(&rec->lock);
				r = 1;
			} else {
				target_ip->flags |= HAMMER_INODE_REFLUSH;
			}
		} else if (rec->type == HAMMER_MEM_RECORD_ADD) {
			/*
			 * If the target IP is not flushing we can force
			 * it to flush, even if it is unable to write out
			 * any of its own records we have at least one in
			 * hand that we CAN deal with.
			 */
			rec->flush_state = HAMMER_FST_FLUSH;
			rec->flush_group = ip->flush_group;
			hammer_ref(&rec->lock);
			hammer_flush_inode_core(target_ip,
						HAMMER_FLUSH_RECURSION);
			r = 1;
		} else {
			/*
			 * General or delete-on-disk record.
			 *
			 * XXX this needs help.  If a delete-on-disk we could
			 * disconnect the target.  If the target has its own
			 * dependancies they really need to be flushed.
			 *
			 * XXX
			 */
			rec->flush_state = HAMMER_FST_FLUSH;
			rec->flush_group = ip->flush_group;
			hammer_ref(&rec->lock);
			hammer_flush_inode_core(target_ip,
						HAMMER_FLUSH_RECURSION);
			r = 1;
		}
		break;
	case HAMMER_FST_FLUSH:
		/*
		 * If the WOULDBLOCK flag is set records may have been left
		 * over from a previous flush attempt and should be moved
		 * to the current flush group.  If it is not set then all
		 * such records had better have been flushed already or
		 * already associated with the current flush group.
		 */
		if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
			rec->flush_group = ip->flush_group;
		} else {
			KKASSERT(rec->flush_group == ip->flush_group);
		}
		r = 1;
		break;
	}
	return(r);
}

/*
 * This version just moves records already in a flush state to the new
 * flush group and that is it.
 */
static int
hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
{
	hammer_inode_t ip = rec->ip;

	switch(rec->flush_state) {
	case HAMMER_FST_FLUSH:
		if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
			rec->flush_group = ip->flush_group;
		} else {
			KKASSERT(rec->flush_group == ip->flush_group);
		}
		break;
	default:
		break;
	}
	return(0);
}

/*
 * Wait for a previously queued flush to complete.  Not only do we need to
 * wait for the inode to sync out, we also may have to run the flusher again
 * to get it past the UNDO position pertaining to the flush so a crash does
 * not 'undo' our flush.
 */
void
hammer_wait_inode(hammer_inode_t ip)
{
	hammer_mount_t hmp = ip->hmp;
	int sync_group;
	int waitcount;

	sync_group = ip->flush_group;
	waitcount = (ip->flags & HAMMER_INODE_REFLUSH) ? 2 : 1;

	if (ip->flush_state == HAMMER_FST_SETUP) {
		hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	}
	/* XXX can we make this != FST_IDLE ? check SETUP depends */
	while (ip->flush_state == HAMMER_FST_FLUSH &&
	       (ip->flush_group - sync_group) < waitcount) {
		ip->flags |= HAMMER_INODE_FLUSHW;
		tsleep(&ip->flags, 0, "hmrwin", 0);
	}
	while (hmp->flusher.done - sync_group < waitcount) {
		kprintf("Y");
		hammer_flusher_sync(hmp);
	}
}

/*
 * Called by the backend code when a flush has been completed.
 * The inode has already been removed from the flush list.
 *
 * A pipelined flush can occur, in which case we must re-enter the
 * inode on the list and re-copy its fields.
 */
void
hammer_flush_inode_done(hammer_inode_t ip)
{
	hammer_mount_t hmp;
	int dorel;

	KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);

	hmp = ip->hmp;

	/*
	 * Merge left-over flags back into the frontend and fix the state.
	 * Incomplete truncations are retained by the backend.
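	 * Only the TRUNCATED bit remains in sync_flags after this.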
	 */
	ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
	ip->sync_flags &= HAMMER_INODE_TRUNCATED;

	/*
	 * The backend may have adjusted nlinks, so if the adjusted nlinks
	 * does not match the frontend's copy, set the frontend's DDIRTY
	 * flag again.
	 */
	if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
		ip->flags |= HAMMER_INODE_DDIRTY;

	/*
	 * Fix up the dirty buffer status.
	 */
	if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
		ip->flags |= HAMMER_INODE_BUFS;
	}

	/*
	 * Re-set the XDIRTY flag if some of the inode's in-memory records
	 * could not be flushed.
	 */
	KKASSERT((RB_EMPTY(&ip->rec_tree) &&
		  (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
		 (!RB_EMPTY(&ip->rec_tree) &&
		  (ip->flags & HAMMER_INODE_XDIRTY) != 0));

	/*
	 * Do not lose track of inodes which no longer have vnode
	 * associations, otherwise they may never get flushed again.
	 */
	if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
		ip->flags |= HAMMER_INODE_REFLUSH;

	/*
	 * Clean up the vnode ref
	 */
	if (ip->flags & HAMMER_INODE_VHELD) {
		ip->flags &= ~HAMMER_INODE_VHELD;
		vrele(ip->vp);
	}

	/*
	 * Adjust flush_state.  The target state (idle or setup) shouldn't
	 * be terribly important since we will reflush if we really need
	 * to do anything.
	 *
	 * If the WOULDBLOCK flag is set we must re-flush immediately
	 * to continue a potentially large deletion.  The flag also causes
	 * the hammer_setup_child_callback() to move records in the old
	 * flush group to the new one.
	 */
	if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
		ip->flush_state = HAMMER_FST_IDLE;
		hammer_flush_inode_core(ip, HAMMER_FLUSH_SIGNAL);
		ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
		dorel = 1;
	} else if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
		ip->flush_state = HAMMER_FST_IDLE;
		dorel = 1;
	} else {
		ip->flush_state = HAMMER_FST_SETUP;
		dorel = 0;
	}

	--hmp->count_iqueued;
	--hammer_count_iqueued;

	/*
	 * If the frontend made more changes and requested another flush,
	 * then try to get it running.
	 */
	if (ip->flags & HAMMER_INODE_REFLUSH) {
		ip->flags &= ~HAMMER_INODE_REFLUSH;
		if (ip->flags & HAMMER_INODE_RESIGNAL) {
			ip->flags &= ~HAMMER_INODE_RESIGNAL;
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		} else {
			hammer_flush_inode(ip, 0);
		}
	}

	/*
	 * If the inode is now clean drop the space reservation.
	 */
	if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	    (ip->flags & HAMMER_INODE_RSV_INODES)) {
		ip->flags &= ~HAMMER_INODE_RSV_INODES;
		--hmp->rsv_inodes;
	}

	/*
	 * Finally, if the frontend is waiting for a flush to complete,
	 * wake it up.
	 */
	if (ip->flush_state != HAMMER_FST_FLUSH) {
		if (ip->flags & HAMMER_INODE_FLUSHW) {
			ip->flags &= ~HAMMER_INODE_FLUSHW;
			wakeup(&ip->flags);
		}
	}
	if (dorel)
		hammer_rel_inode(ip, 0);
}

/*
 * Called from hammer_sync_inode() to synchronize in-memory records
 * to the media.
 */
static int
hammer_sync_record_callback(hammer_record_t record, void *data)
{
	hammer_cursor_t cursor = data;
	hammer_transaction_t trans = cursor->trans;
	int error;

	/*
	 * Skip records that do not belong to the current flush.
	 */
	++hammer_stats_record_iterations;
	if (record->flush_state != HAMMER_FST_FLUSH)
		return(0);

#if 1
	if (record->flush_group != record->ip->flush_group) {
		kprintf("sync_record %p ip %p bad flush group %d %d\n",
			record, record->ip, record->flush_group,
			record->ip->flush_group);
		Debugger("blah2");
		return(0);
	}
#endif
	KKASSERT(record->flush_group == record->ip->flush_group);

	/*
	 * Interlock the record using the BE flag.  Once BE is set the
	 * frontend cannot change the state of FE.
	 *
	 * NOTE: If FE is set prior to us setting BE we still sync the
	 * record out, but the flush completion code converts it to
	 * a delete-on-disk record instead of destroying it.
	 */
	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
	record->flags |= HAMMER_RECF_INTERLOCK_BE;

	/*
	 * The backend may have already disposed of the record.
	 */
	if (record->flags & HAMMER_RECF_DELETED_BE) {
		error = 0;
		goto done;
	}

	/*
	 * If the whole inode is being deleted all on-disk records will
	 * be deleted very soon.  We cannot sync any new records to disk
	 * because they would be deleted in the same transaction they were
	 * created in (delete_tid == create_tid), which will assert.
	 *
	 * XXX There may be a case with RECORD_ADD with DELETED_FE set
	 * that we currently panic on.
	 */
	if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
		switch(record->type) {
		case HAMMER_MEM_RECORD_DATA:
			/*
			 * We don't have to do anything, if the record was
			 * committed the space will have been accounted for
			 * in the blockmap.
			 */
			/* fall through */
		case HAMMER_MEM_RECORD_GENERAL:
			record->flags |= HAMMER_RECF_DELETED_FE;
			record->flags |= HAMMER_RECF_DELETED_BE;
			error = 0;
			goto done;
		case HAMMER_MEM_RECORD_ADD:
			panic("hammer_sync_record_callback: illegal add "
			      "during inode deletion record %p", record);
			break; /* NOT REACHED */
		case HAMMER_MEM_RECORD_INODE:
			panic("hammer_sync_record_callback: attempt to "
			      "sync inode record %p?", record);
			break; /* NOT REACHED */
		case HAMMER_MEM_RECORD_DEL:
			/*
			 * Follow through and issue the on-disk deletion.
			 */
			break;
		}
	}

	/*
	 * If DELETED_FE is set special handling is needed for directory
	 * entries.  Dependent pieces related to the directory entry may
	 * have already been synced to disk.  If this occurs we have to
	 * sync the directory entry and then change the in-memory record
	 * from an ADD to a DELETE to cover the fact that it's been
	 * deleted by the frontend.
	 *
	 * A directory delete covering record (MEM_RECORD_DEL) can never
	 * be deleted by the frontend.
	 *
	 * Any other record type (aka DATA) can be deleted by the frontend.
	 * XXX At the moment the flusher must skip it because there may
	 * be another data record in the flush group for the same block,
	 * meaning that some frontend data changes can leak into the backend's
	 * synchronization point.
	 */
	if (record->flags & HAMMER_RECF_DELETED_FE) {
		if (record->type == HAMMER_MEM_RECORD_ADD) {
			record->flags |= HAMMER_RECF_CONVERT_DELETE;
		} else {
			KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
			record->flags |= HAMMER_RECF_DELETED_BE;
			error = 0;
			goto done;
		}
	}

	/*
	 * Assign the create_tid for new records.  Deletions already
	 * have the record's entire key properly set up.
	 */
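	/*
	 * Note on the retry loop below: hammer_ip_sync_record_cursor()
	 * can return EDEADLK when it collides with another cursor.  The
	 * recovery is to tear the cursor down completely, re-acquire it
	 * from the inode's B-Tree cache hint, and retry the insertion.
	 */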
	if (record->type != HAMMER_MEM_RECORD_DEL)
		record->leaf.base.create_tid = trans->tid;
	record->leaf.create_ts = trans->time32;
	for (;;) {
		error = hammer_ip_sync_record_cursor(cursor, record);
		if (error != EDEADLK)
			break;
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
					   record->ip);
		if (error)
			break;
	}
	record->flags &= ~HAMMER_RECF_CONVERT_DELETE;

	if (error) {
		error = -error;
		if (error != -ENOSPC) {
			kprintf("hammer_sync_record_callback: sync failed rec "
				"%p, error %d\n", record, error);
			Debugger("sync failed rec");
		}
	}
done:
	hammer_flush_record_done(record, error);
	return(error);
}

/*
 * XXX error handling
 */
int
hammer_sync_inode(hammer_inode_t ip)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	hammer_node_t tmp_node;
	hammer_record_t depend;
	hammer_record_t next;
	int error, tmp_error;
	u_int64_t nlinks;

	if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
		return(0);

	hammer_start_transaction_fls(&trans, ip->hmp);
	error = hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	if (error)
		goto done;

	/*
	 * Any directory records referencing this inode which are not in
	 * our current flush group must adjust our nlink count for the
	 * purposes of synchronization to disk.
	 *
	 * Records which are in our flush group can be unlinked from our
	 * inode now, potentially allowing the inode to be physically
	 * deleted.
	 *
	 * This cannot block.
	 */
	nlinks = ip->ino_data.nlinks;
	next = TAILQ_FIRST(&ip->target_list);
	while ((depend = next) != NULL) {
		next = TAILQ_NEXT(depend, target_entry);
		if (depend->flush_state == HAMMER_FST_FLUSH &&
		    depend->flush_group == ip->hmp->flusher.act) {
			/*
			 * If this is an ADD that was deleted by the frontend,
			 * the frontend's nlinks count will have already been
			 * decremented, but the backend is going to sync its
			 * directory entry and must account for it.  The
			 * record will be converted to a delete-on-disk when
			 * it gets synced.
			 *
			 * If the ADD was not deleted by the frontend we
			 * can remove the dependency from our target_list.
			 */
			if (depend->flags & HAMMER_RECF_DELETED_FE) {
				++nlinks;
			} else {
				TAILQ_REMOVE(&ip->target_list, depend,
					     target_entry);
				depend->target_ip = NULL;
			}
		} else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
			/*
			 * Not part of our flush group.
			 */
			KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
			switch(depend->type) {
			case HAMMER_MEM_RECORD_ADD:
				--nlinks;
				break;
			case HAMMER_MEM_RECORD_DEL:
				++nlinks;
				break;
			default:
				break;
			}
		}
	}

	/*
	 * Set dirty if we had to modify the link count.
	 */
	if (ip->sync_ino_data.nlinks != nlinks) {
		KKASSERT((int64_t)nlinks >= 0);
		ip->sync_ino_data.nlinks = nlinks;
		ip->sync_flags |= HAMMER_INODE_DDIRTY;
	}

	/*
	 * If a truncation is queued, destroy any data past the (aligned)
	 * truncation point.  Userland will have dealt with the buffer
	 * containing the truncation point for us.
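	 *
	 * (Worked example, assuming a 16K data block at the truncation
	 * offset: truncating to 0x6000 rounds aligned_trunc_off up to
	 * 0x8000, so the partially covered buffer is left to the
	 * frontend and only whole blocks from 0x8000 onward are deleted
	 * from the media below.)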
	 *
	 * We don't flush pending frontend data buffers until after we've
	 * dealt with the truncation.
	 */
	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
		/*
		 * Interlock trunc_off.  The VOP front-end may continue to
		 * make adjustments to it while we are blocked.
		 */
		off_t trunc_off;
		off_t aligned_trunc_off;
		int blkmask;

		trunc_off = ip->sync_trunc_off;
		blkmask = hammer_blocksize(trunc_off) - 1;
		aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;

		/*
		 * Delete any whole blocks on-media.  The front-end has
		 * already cleaned out any partial block and made it
		 * pending.  The front-end may have updated trunc_off
		 * while we were blocked, so we only use sync_trunc_off.
		 *
		 * This operation can blow out the buffer cache; EWOULDBLOCK
		 * means we were unable to complete the deletion.  The
		 * deletion will update sync_trunc_off in that case.
		 */
		error = hammer_ip_delete_range(&cursor, ip,
					       aligned_trunc_off,
					       0x7FFFFFFFFFFFFFFFLL, 2);
		if (error == EWOULDBLOCK) {
			ip->flags |= HAMMER_INODE_WOULDBLOCK;
			error = 0;
			goto defer_buffer_flush;
		}

		if (error)
			Debugger("hammer_ip_delete_range errored");

		/*
		 * Clear the truncation flag on the backend after we have
		 * completed the deletions.  Backend data is now good again
		 * (including new records we are about to sync, below).
		 *
		 * Leave sync_trunc_off intact.  As we write additional
		 * records the backend will update sync_trunc_off.  This
		 * tells the backend whether it can skip the overwrite
		 * test.  This should work properly even when the backend
		 * writes full blocks where the truncation point straddles
		 * the block, because the comparison is against the base
		 * offset of the record.
		 */
		ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
		/* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
	} else {
		error = 0;
	}

	/*
	 * Now sync related records.  These will typically be directory
	 * entries or delete-on-disk records.
	 *
	 * Not all records will be flushed, but clear XDIRTY anyway.  We
	 * will set it again in the frontend hammer_flush_inode_done()
	 * if records remain.
	 */
	if (error == 0) {
		tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				    hammer_sync_record_callback, &cursor);
		if (tmp_error < 0)
			tmp_error = -error;
		if (tmp_error)
			error = tmp_error;
	}
	hammer_cache_node(&ip->cache[1], cursor.node);

	/*
	 * Re-seek for inode update, assuming our cache hasn't been ripped
	 * out from under us.
	 */
	if (error == 0) {
		tmp_node = hammer_ref_node_safe(ip->hmp, &ip->cache[0], &error);
		if (tmp_node) {
			hammer_cursor_downgrade(&cursor);
			hammer_lock_sh(&tmp_node->lock);
			if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
				hammer_cursor_seek(&cursor, tmp_node, 0);
			hammer_unlock(&tmp_node->lock);
			hammer_rel_node(tmp_node);
		}
		error = 0;
	}

	/*
	 * If we are deleting the inode the frontend had better not have
	 * any active references on elements making up the inode.
	 *
	 * The call to hammer_ip_delete_clean() cleans up auxiliary records
	 * but not DB or DATA records.  Those must have already been deleted
	 * by the normal truncation mechanic.
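	 *
	 * (The test below only fires once the link count has reached
	 * zero, no in-memory records remain, the frontend has queued the
	 * deletion via DELETING, and the on-media inode has not already
	 * been marked DELETED.)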
	 */
	if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
	    RB_EMPTY(&ip->rec_tree) &&
	    (ip->sync_flags & HAMMER_INODE_DELETING) &&
	    (ip->flags & HAMMER_INODE_DELETED) == 0) {
		int count1 = 0;

		error = hammer_ip_delete_clean(&cursor, ip, &count1);
		if (error == 0) {
			ip->flags |= HAMMER_INODE_DELETED;
			ip->sync_flags &= ~HAMMER_INODE_DELETING;
			ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
			KKASSERT(RB_EMPTY(&ip->rec_tree));

			/*
			 * Set delete_tid in both the frontend and backend
			 * copy of the inode record.  The DELETED flag handles
			 * this, do not set DDIRTY.
			 */
			ip->ino_leaf.base.delete_tid = trans.tid;
			ip->sync_ino_leaf.base.delete_tid = trans.tid;
			ip->ino_leaf.delete_ts = trans.time32;
			ip->sync_ino_leaf.delete_ts = trans.time32;

			/*
			 * Adjust the inode count in the volume header.
			 */
			if (ip->flags & HAMMER_INODE_ONDISK) {
				hammer_modify_volume_field(&trans,
							   trans.rootvol,
							   vol0_stat_inodes);
				--ip->hmp->rootvol->ondisk->vol0_stat_inodes;
				hammer_modify_volume_done(trans.rootvol);
			}
		} else {
			Debugger("hammer_ip_delete_clean errored");
		}
	}

	ip->sync_flags &= ~HAMMER_INODE_BUFS;

	if (error)
		Debugger("RB_SCAN errored");

defer_buffer_flush:
	/*
	 * Now update the inode's on-disk inode-data and/or on-disk record.
	 * DELETED and ONDISK are managed only in ip->flags.
	 *
	 * In the case of a deferred buffer flush we still update the on-disk
	 * inode to satisfy visibility requirements if there happen to be
	 * directory dependencies.
	 */
	switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
	case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
		/*
		 * If deleted and on-disk, don't set any additional flags.
		 * The delete flag takes care of things.
		 *
		 * Clear flags which may have been set by the frontend.
		 */
		ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
				    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
				    HAMMER_INODE_DELETING);
		break;
	case HAMMER_INODE_DELETED:
		/*
		 * Take care of the case where a deleted inode was never
		 * flushed to the disk in the first place.
		 *
		 * Clear flags which may have been set by the frontend.
		 */
		ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
				    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
				    HAMMER_INODE_DELETING);
		while (RB_ROOT(&ip->rec_tree)) {
			hammer_record_t record = RB_ROOT(&ip->rec_tree);
			hammer_ref(&record->lock);
			KKASSERT(record->lock.refs == 1);
			record->flags |= HAMMER_RECF_DELETED_FE;
			record->flags |= HAMMER_RECF_DELETED_BE;
			hammer_rel_mem_record(record);
		}
		break;
	case HAMMER_INODE_ONDISK:
		/*
		 * If already on-disk, do not set any additional flags.
		 */
		break;
	default:
		/*
		 * If not on-disk and not deleted, set DDIRTY to force
		 * an initial record to be written.
		 *
		 * Also set the create_tid in both the frontend and backend
		 * copy of the inode record.
		 */
		ip->ino_leaf.base.create_tid = trans.tid;
		ip->ino_leaf.create_ts = trans.time32;
		ip->sync_ino_leaf.base.create_tid = trans.tid;
		ip->sync_ino_leaf.create_ts = trans.time32;
		ip->sync_flags |= HAMMER_INODE_DDIRTY;
		break;
	}

	/*
	 * If DDIRTY is set, write out a new record.  If the inode
	 * is already on-disk the old record is marked as deleted.
	 *
	 * If DELETED is set hammer_update_inode() will delete the existing
	 * record without writing out a new one.
	 *
	 * If *ONLY* the inode times (ATIME/MTIME) are dirty we can update
	 * the record in-place.
	 */
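	/*
	 * Summary of the dispatch below (sketch):
	 *
	 *	DELETED			-> hammer_update_inode()
	 *	only ATIME/MTIME dirty	-> hammer_update_itimes()
	 *	DDIRTY (+/- times)	-> hammer_update_inode()
	 *	nothing dirty		-> no media update at all
	 */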
	if (ip->flags & HAMMER_INODE_DELETED) {
		error = hammer_update_inode(&cursor, ip);
	} else
	if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
	    (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
		error = hammer_update_itimes(&cursor, ip);
	} else
	if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME |
			      HAMMER_INODE_MTIME)) {
		error = hammer_update_inode(&cursor, ip);
	}
	if (error)
		Debugger("hammer_update_itimes/inode errored");
done:
	/*
	 * Save the TID we used to sync the inode with to make sure we
	 * do not improperly reuse it.
	 */
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}

/*
 * This routine is called when the OS is no longer actively referencing
 * the inode (but might still be keeping it cached), or when releasing
 * the last reference to an inode.
 *
 * At this point if the inode's nlinks count is zero we want to destroy
 * it, which may mean destroying it on-media too.
 */
void
hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
{
	struct vnode *vp;

	/*
	 * Set the DELETING flag when the link count drops to 0 and the
	 * OS no longer has any opens on the inode.
	 *
	 * The backend will clear DELETING (a mod flag) and set DELETED
	 * (a state flag) when it is actually able to perform the
	 * operation.
	 */
	if (ip->ino_data.nlinks == 0 &&
	    (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
		ip->flags |= HAMMER_INODE_DELETING;
		ip->flags |= HAMMER_INODE_TRUNCATED;
		ip->trunc_off = 0;
		vp = NULL;
		if (getvp) {
			if (hammer_get_vnode(ip, &vp) != 0)
				return;
		}

		/*
		 * Final cleanup
		 */
		if (ip->vp) {
			vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
			vnode_pager_setsize(ip->vp, 0);
		}
		if (getvp) {
			vput(vp);
		}
	}
}

/*
 * Re-test an inode when a dependency has gone away to see if we
 * can chain flush it.
 */
void
hammer_test_inode(hammer_inode_t ip)
{
	if (ip->flags & HAMMER_INODE_REFLUSH) {
		ip->flags &= ~HAMMER_INODE_REFLUSH;
		hammer_ref(&ip->lock);
		if (ip->flags & HAMMER_INODE_RESIGNAL) {
			ip->flags &= ~HAMMER_INODE_RESIGNAL;
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		} else {
			hammer_flush_inode(ip, 0);
		}
		hammer_rel_inode(ip, 0);
	}
}

/*
 * Clear the RECLAIM flag on an inode.  This occurs when the inode is
 * reassociated with a vp or just before it gets freed.
 *
 * Wake up one thread blocked waiting on reclaims to complete.  Note that
 * the inode the thread is waiting on behalf of is a different inode than
 * the inode we are called with.  This is to create a pipeline.
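 *
 * (The pipeline: a thread sleeping in hammer_inode_waitreclaims() is
 * typically blocked because the total number of reclaiming inodes is
 * too high, not because of this particular inode; waking exactly one
 * waiter per completed reclaim lets new work proceed at roughly the
 * same rate that old inodes are retired.)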
 */
static void
hammer_inode_wakereclaims(hammer_inode_t ip)
{
	struct hammer_reclaim *reclaim;
	hammer_mount_t hmp = ip->hmp;

	if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
		return;

	--hammer_count_reclaiming;
	--hmp->inode_reclaims;
	ip->flags &= ~HAMMER_INODE_RECLAIM;

	if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
		TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
		reclaim->okydoky = 1;
		wakeup(reclaim);
	}
}

/*
 * Set up our reclaim pipeline.  We only let so many detached (and dirty)
 * inodes build up before we start blocking.
 *
 * When we block we don't care *which* inode has finished reclaiming,
 * as long as one does.  This is somewhat heuristic... we also put a
 * cap on how long we are willing to wait.
 */
void
hammer_inode_waitreclaims(hammer_mount_t hmp)
{
	struct hammer_reclaim reclaim;
	int delay;

	if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT) {
		reclaim.okydoky = 0;
		TAILQ_INSERT_TAIL(&hmp->reclaim_list,
				  &reclaim, entry);
	} else {
		reclaim.okydoky = 1;
	}

	if (reclaim.okydoky == 0) {
		delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
			HAMMER_RECLAIM_WAIT;
		if (delay >= 0)
			tsleep(&reclaim, 0, "hmrrcm", delay + 1);
		if (reclaim.okydoky == 0)
			TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
	}
}
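
/*
 * Usage sketch for the reclaim pipeline above (illustrative only; the
 * call site shown here is hypothetical):
 *
 *	hammer_inode_waitreclaims(hmp);	// throttle if too many dirty,
 *					// detached inodes are pending
 *	ip = hammer_get_inode(...);	// then instantiate the inode
 *
 * On the other side, hammer_inode_wakereclaims(ip) clears RECLAIM and
 * releases exactly one blocked thread per reclaimed inode.
 */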