1 /* $NetBSD: lfs_vnops.c,v 1.242 2012/05/09 00:21:18 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 /* 32 * Copyright (c) 1986, 1989, 1991, 1993, 1995 33 * The Regents of the University of California. All rights reserved. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. Neither the name of the University nor the names of its contributors 44 * may be used to endorse or promote products derived from this software 45 * without specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 
58 * 59 * @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95 60 */ 61 62 #include <sys/cdefs.h> 63 __KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.242 2012/05/09 00:21:18 riastradh Exp $"); 64 65 #ifdef _KERNEL_OPT 66 #include "opt_compat_netbsd.h" 67 #include "opt_uvm_page_trkown.h" 68 #endif 69 70 #include <sys/param.h> 71 #include <sys/systm.h> 72 #include <sys/namei.h> 73 #include <sys/resourcevar.h> 74 #include <sys/kernel.h> 75 #include <sys/file.h> 76 #include <sys/stat.h> 77 #include <sys/buf.h> 78 #include <sys/proc.h> 79 #include <sys/mount.h> 80 #include <sys/vnode.h> 81 #include <sys/pool.h> 82 #include <sys/signalvar.h> 83 #include <sys/kauth.h> 84 #include <sys/syslog.h> 85 #include <sys/fstrans.h> 86 87 #include <miscfs/fifofs/fifo.h> 88 #include <miscfs/genfs/genfs.h> 89 #include <miscfs/specfs/specdev.h> 90 91 #include <ufs/ufs/inode.h> 92 #include <ufs/ufs/dir.h> 93 #include <ufs/ufs/ufsmount.h> 94 #include <ufs/ufs/ufs_bswap.h> 95 #include <ufs/ufs/ufs_extern.h> 96 97 #include <uvm/uvm.h> 98 #include <uvm/uvm_pmap.h> 99 #include <uvm/uvm_stat.h> 100 #include <uvm/uvm_pager.h> 101 102 #include <ufs/lfs/lfs.h> 103 #include <ufs/lfs/lfs_extern.h> 104 105 extern pid_t lfs_writer_daemon; 106 int lfs_ignore_lazy_sync = 1; 107 108 /* Global vfs data structures for lfs. */ 109 int (**lfs_vnodeop_p)(void *); 110 const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = { 111 { &vop_default_desc, vn_default_error }, 112 { &vop_lookup_desc, ufs_lookup }, /* lookup */ 113 { &vop_create_desc, lfs_create }, /* create */ 114 { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */ 115 { &vop_mknod_desc, lfs_mknod }, /* mknod */ 116 { &vop_open_desc, ufs_open }, /* open */ 117 { &vop_close_desc, lfs_close }, /* close */ 118 { &vop_access_desc, ufs_access }, /* access */ 119 { &vop_getattr_desc, lfs_getattr }, /* getattr */ 120 { &vop_setattr_desc, lfs_setattr }, /* setattr */ 121 { &vop_read_desc, lfs_read }, /* read */ 122 { &vop_write_desc, lfs_write }, /* write */ 123 { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ 124 { &vop_fcntl_desc, lfs_fcntl }, /* fcntl */ 125 { &vop_poll_desc, ufs_poll }, /* poll */ 126 { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ 127 { &vop_revoke_desc, ufs_revoke }, /* revoke */ 128 { &vop_mmap_desc, lfs_mmap }, /* mmap */ 129 { &vop_fsync_desc, lfs_fsync }, /* fsync */ 130 { &vop_seek_desc, ufs_seek }, /* seek */ 131 { &vop_remove_desc, lfs_remove }, /* remove */ 132 { &vop_link_desc, lfs_link }, /* link */ 133 { &vop_rename_desc, lfs_rename }, /* rename */ 134 { &vop_mkdir_desc, lfs_mkdir }, /* mkdir */ 135 { &vop_rmdir_desc, lfs_rmdir }, /* rmdir */ 136 { &vop_symlink_desc, lfs_symlink }, /* symlink */ 137 { &vop_readdir_desc, ufs_readdir }, /* readdir */ 138 { &vop_readlink_desc, ufs_readlink }, /* readlink */ 139 { &vop_abortop_desc, ufs_abortop }, /* abortop */ 140 { &vop_inactive_desc, lfs_inactive }, /* inactive */ 141 { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ 142 { &vop_lock_desc, ufs_lock }, /* lock */ 143 { &vop_unlock_desc, ufs_unlock }, /* unlock */ 144 { &vop_bmap_desc, ufs_bmap }, /* bmap */ 145 { &vop_strategy_desc, lfs_strategy }, /* strategy */ 146 { &vop_print_desc, ufs_print }, /* print */ 147 { &vop_islocked_desc, ufs_islocked }, /* islocked */ 148 { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ 149 { &vop_advlock_desc, ufs_advlock }, /* advlock */ 150 { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ 151 { &vop_getpages_desc, lfs_getpages }, /* getpages */ 152 { &vop_putpages_desc, lfs_putpages }, /* putpages */ 153 { NULL, NULL } 154 }; 
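/*
 * The vnodeopv_desc below pairs this table with lfs_vnodeop_p; the VFS
 * layer builds the actual operations vector from these descriptors when
 * the filesystem is attached.  The same pattern is repeated for the
 * special-device and fifo tables that follow.
 */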
155 const struct vnodeopv_desc lfs_vnodeop_opv_desc = 156 { &lfs_vnodeop_p, lfs_vnodeop_entries }; 157 158 int (**lfs_specop_p)(void *); 159 const struct vnodeopv_entry_desc lfs_specop_entries[] = { 160 { &vop_default_desc, vn_default_error }, 161 { &vop_lookup_desc, spec_lookup }, /* lookup */ 162 { &vop_create_desc, spec_create }, /* create */ 163 { &vop_mknod_desc, spec_mknod }, /* mknod */ 164 { &vop_open_desc, spec_open }, /* open */ 165 { &vop_close_desc, lfsspec_close }, /* close */ 166 { &vop_access_desc, ufs_access }, /* access */ 167 { &vop_getattr_desc, lfs_getattr }, /* getattr */ 168 { &vop_setattr_desc, lfs_setattr }, /* setattr */ 169 { &vop_read_desc, ufsspec_read }, /* read */ 170 { &vop_write_desc, ufsspec_write }, /* write */ 171 { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ 172 { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ 173 { &vop_poll_desc, spec_poll }, /* poll */ 174 { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ 175 { &vop_revoke_desc, spec_revoke }, /* revoke */ 176 { &vop_mmap_desc, spec_mmap }, /* mmap */ 177 { &vop_fsync_desc, spec_fsync }, /* fsync */ 178 { &vop_seek_desc, spec_seek }, /* seek */ 179 { &vop_remove_desc, spec_remove }, /* remove */ 180 { &vop_link_desc, spec_link }, /* link */ 181 { &vop_rename_desc, spec_rename }, /* rename */ 182 { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ 183 { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ 184 { &vop_symlink_desc, spec_symlink }, /* symlink */ 185 { &vop_readdir_desc, spec_readdir }, /* readdir */ 186 { &vop_readlink_desc, spec_readlink }, /* readlink */ 187 { &vop_abortop_desc, spec_abortop }, /* abortop */ 188 { &vop_inactive_desc, lfs_inactive }, /* inactive */ 189 { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */ 190 { &vop_lock_desc, ufs_lock }, /* lock */ 191 { &vop_unlock_desc, ufs_unlock }, /* unlock */ 192 { &vop_bmap_desc, spec_bmap }, /* bmap */ 193 { &vop_strategy_desc, spec_strategy }, /* strategy */ 194 { &vop_print_desc, ufs_print }, /* print */ 195 { &vop_islocked_desc, ufs_islocked }, /* islocked */ 196 { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ 197 { &vop_advlock_desc, spec_advlock }, /* advlock */ 198 { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ 199 { &vop_getpages_desc, spec_getpages }, /* getpages */ 200 { &vop_putpages_desc, spec_putpages }, /* putpages */ 201 { NULL, NULL } 202 }; 203 const struct vnodeopv_desc lfs_specop_opv_desc = 204 { &lfs_specop_p, lfs_specop_entries }; 205 206 int (**lfs_fifoop_p)(void *); 207 const struct vnodeopv_entry_desc lfs_fifoop_entries[] = { 208 { &vop_default_desc, vn_default_error }, 209 { &vop_lookup_desc, vn_fifo_bypass }, /* lookup */ 210 { &vop_create_desc, vn_fifo_bypass }, /* create */ 211 { &vop_mknod_desc, vn_fifo_bypass }, /* mknod */ 212 { &vop_open_desc, vn_fifo_bypass }, /* open */ 213 { &vop_close_desc, lfsfifo_close }, /* close */ 214 { &vop_access_desc, ufs_access }, /* access */ 215 { &vop_getattr_desc, lfs_getattr }, /* getattr */ 216 { &vop_setattr_desc, lfs_setattr }, /* setattr */ 217 { &vop_read_desc, ufsfifo_read }, /* read */ 218 { &vop_write_desc, ufsfifo_write }, /* write */ 219 { &vop_ioctl_desc, vn_fifo_bypass }, /* ioctl */ 220 { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */ 221 { &vop_poll_desc, vn_fifo_bypass }, /* poll */ 222 { &vop_kqfilter_desc, vn_fifo_bypass }, /* kqfilter */ 223 { &vop_revoke_desc, vn_fifo_bypass }, /* revoke */ 224 { &vop_mmap_desc, vn_fifo_bypass }, /* mmap */ 225 { &vop_fsync_desc, vn_fifo_bypass }, /* fsync */ 226 { &vop_seek_desc, vn_fifo_bypass }, /* seek */ 227 { &vop_remove_desc, 
vn_fifo_bypass },		/* remove */
	{ &vop_link_desc, vn_fifo_bypass },		/* link */
	{ &vop_rename_desc, vn_fifo_bypass },		/* rename */
	{ &vop_mkdir_desc, vn_fifo_bypass },		/* mkdir */
	{ &vop_rmdir_desc, vn_fifo_bypass },		/* rmdir */
	{ &vop_symlink_desc, vn_fifo_bypass },		/* symlink */
	{ &vop_readdir_desc, vn_fifo_bypass },		/* readdir */
	{ &vop_readlink_desc, vn_fifo_bypass },		/* readlink */
	{ &vop_abortop_desc, vn_fifo_bypass },		/* abortop */
	{ &vop_inactive_desc, lfs_inactive },		/* inactive */
	{ &vop_reclaim_desc, lfs_reclaim },		/* reclaim */
	{ &vop_lock_desc, ufs_lock },			/* lock */
	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
	{ &vop_bmap_desc, vn_fifo_bypass },		/* bmap */
	{ &vop_strategy_desc, vn_fifo_bypass },		/* strategy */
	{ &vop_print_desc, ufs_print },			/* print */
	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
	{ &vop_pathconf_desc, vn_fifo_bypass },		/* pathconf */
	{ &vop_advlock_desc, vn_fifo_bypass },		/* advlock */
	{ &vop_bwrite_desc, lfs_bwrite },		/* bwrite */
	{ &vop_putpages_desc, vn_fifo_bypass },		/* putpages */
	{ NULL, NULL }
};
const struct vnodeopv_desc lfs_fifoop_opv_desc =
	{ &lfs_fifoop_p, lfs_fifoop_entries };

static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t,
    int, int, struct vm_page **);

#define	LFS_READWRITE
#include <ufs/ufs/ufs_readwrite.c>
#undef	LFS_READWRITE

/*
 * Synch an open file.
 */
/* ARGSUSED */
int
lfs_fsync(void *v)
{
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		kauth_cred_t a_cred;
		int a_flags;
		off_t a_offlo;
		off_t a_offhi;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	int error, wait;
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;

	/* If we're mounted read-only, don't try to sync. */
	if (fs->lfs_ronly)
		return 0;

	/* If a removed vnode is being cleaned, no need to sync here. */
	if ((ap->a_flags & FSYNC_RECLAIM) != 0 && ip->i_mode == 0)
		return 0;

	/*
	 * Trickle sync simply adds this vnode to the pager list, as if
	 * the pagedaemon had requested a pageout.
	 */
	if (ap->a_flags & FSYNC_LAZY) {
		if (lfs_ignore_lazy_sync == 0) {
			mutex_enter(&lfs_lock);
			if (!(ip->i_flags & IN_PAGING)) {
				ip->i_flags |= IN_PAGING;
				TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip,
				    i_lfs_pchain);
			}
			wakeup(&lfs_writer_daemon);
			mutex_exit(&lfs_lock);
		}
		return 0;
	}

	/*
	 * If a vnode is being cleaned, flush it out before we try to
	 * reuse it.  This prevents the cleaner from writing files twice
	 * in the same partial segment, causing an accounting underflow.
	 */
	if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) {
		lfs_vflush(vp);
	}

	wait = (ap->a_flags & FSYNC_WAIT);
	do {
		mutex_enter(vp->v_interlock);
		error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
		    round_page(ap->a_offhi),
		    PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
		if (error == EAGAIN) {
			mutex_enter(&lfs_lock);
			mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync",
			    hz / 100 + 1, &lfs_lock);
			mutex_exit(&lfs_lock);
		}
	} while (error == EAGAIN);
	if (error)
		return error;

	if ((ap->a_flags & FSYNC_DATAONLY) == 0)
		error = lfs_update(vp, NULL, NULL, wait ?
UPDATE_WAIT : 0); 331 332 if (error == 0 && ap->a_flags & FSYNC_CACHE) { 333 int l = 0; 334 error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE, 335 curlwp->l_cred); 336 } 337 if (wait && !VPISEMPTY(vp)) 338 LFS_SET_UINO(ip, IN_MODIFIED); 339 340 return error; 341 } 342 343 /* 344 * Take IN_ADIROP off, then call ufs_inactive. 345 */ 346 int 347 lfs_inactive(void *v) 348 { 349 struct vop_inactive_args /* { 350 struct vnode *a_vp; 351 } */ *ap = v; 352 353 lfs_unmark_vnode(ap->a_vp); 354 355 /* 356 * The Ifile is only ever inactivated on unmount. 357 * Streamline this process by not giving it more dirty blocks. 358 */ 359 if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) { 360 mutex_enter(&lfs_lock); 361 LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD); 362 mutex_exit(&lfs_lock); 363 VOP_UNLOCK(ap->a_vp); 364 return 0; 365 } 366 367 #ifdef DEBUG 368 /* 369 * This might happen on unmount. 370 * XXX If it happens at any other time, it should be a panic. 371 */ 372 if (ap->a_vp->v_uflag & VU_DIROP) { 373 struct inode *ip = VTOI(ap->a_vp); 374 printf("lfs_inactive: inactivating VU_DIROP? ino = %d\n", (int)ip->i_number); 375 } 376 #endif /* DIAGNOSTIC */ 377 378 return ufs_inactive(v); 379 } 380 381 /* 382 * These macros are used to bracket UFS directory ops, so that we can 383 * identify all the pages touched during directory ops which need to 384 * be ordered and flushed atomically, so that they may be recovered. 385 * 386 * Because we have to mark nodes VU_DIROP in order to prevent 387 * the cache from reclaiming them while a dirop is in progress, we must 388 * also manage the number of nodes so marked (otherwise we can run out). 389 * We do this by setting lfs_dirvcount to the number of marked vnodes; it 390 * is decremented during segment write, when VU_DIROP is taken off. 391 */ 392 #define MARK_VNODE(vp) lfs_mark_vnode(vp) 393 #define UNMARK_VNODE(vp) lfs_unmark_vnode(vp) 394 #define SET_DIROP_CREATE(dvp, vpp) lfs_set_dirop_create((dvp), (vpp)) 395 #define SET_DIROP_REMOVE(dvp, vp) lfs_set_dirop((dvp), (vp)) 396 static int lfs_set_dirop_create(struct vnode *, struct vnode **); 397 static int lfs_set_dirop(struct vnode *, struct vnode *); 398 399 static int 400 lfs_set_dirop(struct vnode *dvp, struct vnode *vp) 401 { 402 struct lfs *fs; 403 int error; 404 405 KASSERT(VOP_ISLOCKED(dvp)); 406 KASSERT(vp == NULL || VOP_ISLOCKED(vp)); 407 408 fs = VTOI(dvp)->i_lfs; 409 410 ASSERT_NO_SEGLOCK(fs); 411 /* 412 * LFS_NRESERVE calculates direct and indirect blocks as well 413 * as an inode block; an overestimate in most cases. 
414 */ 415 if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0) 416 return (error); 417 418 restart: 419 mutex_enter(&lfs_lock); 420 if (fs->lfs_dirops == 0) { 421 mutex_exit(&lfs_lock); 422 lfs_check(dvp, LFS_UNUSED_LBN, 0); 423 mutex_enter(&lfs_lock); 424 } 425 while (fs->lfs_writer) { 426 error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH, 427 "lfs_sdirop", 0, &lfs_lock); 428 if (error == EINTR) { 429 mutex_exit(&lfs_lock); 430 goto unreserve; 431 } 432 } 433 if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) { 434 wakeup(&lfs_writer_daemon); 435 mutex_exit(&lfs_lock); 436 preempt(); 437 goto restart; 438 } 439 440 if (lfs_dirvcount > LFS_MAX_DIROP) { 441 DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, " 442 "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount)); 443 if ((error = mtsleep(&lfs_dirvcount, 444 PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0, 445 &lfs_lock)) != 0) { 446 goto unreserve; 447 } 448 goto restart; 449 } 450 451 ++fs->lfs_dirops; 452 /* fs->lfs_doifile = 1; */ /* XXX why? --ks */ 453 mutex_exit(&lfs_lock); 454 455 /* Hold a reference so SET_ENDOP will be happy */ 456 vref(dvp); 457 if (vp) { 458 vref(vp); 459 MARK_VNODE(vp); 460 } 461 462 MARK_VNODE(dvp); 463 return 0; 464 465 unreserve: 466 lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs)); 467 return error; 468 } 469 470 /* 471 * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock 472 * in getnewvnode(), if we have a stacked filesystem mounted on top 473 * of us. 474 * 475 * NB: this means we have to clear the new vnodes on error. Fortunately 476 * SET_ENDOP is there to do that for us. 477 */ 478 static int 479 lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp) 480 { 481 int error; 482 struct lfs *fs; 483 484 fs = VFSTOUFS(dvp->v_mount)->um_lfs; 485 ASSERT_NO_SEGLOCK(fs); 486 if (fs->lfs_ronly) 487 return EROFS; 488 if (vpp == NULL) { 489 return lfs_set_dirop(dvp, NULL); 490 } 491 error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, NULL, vpp); 492 if (error) { 493 DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n", 494 dvp, error)); 495 return error; 496 } 497 if ((error = lfs_set_dirop(dvp, NULL)) != 0) { 498 ungetnewvnode(*vpp); 499 *vpp = NULL; 500 return error; 501 } 502 return 0; 503 } 504 505 #define SET_ENDOP_BASE(fs, dvp, str) \ 506 do { \ 507 mutex_enter(&lfs_lock); \ 508 --(fs)->lfs_dirops; \ 509 if (!(fs)->lfs_dirops) { \ 510 if ((fs)->lfs_nadirop) { \ 511 panic("SET_ENDOP: %s: no dirops but " \ 512 " nadirop=%d", (str), \ 513 (fs)->lfs_nadirop); \ 514 } \ 515 wakeup(&(fs)->lfs_writer); \ 516 mutex_exit(&lfs_lock); \ 517 lfs_check((dvp), LFS_UNUSED_LBN, 0); \ 518 } else \ 519 mutex_exit(&lfs_lock); \ 520 } while(0) 521 #define SET_ENDOP_CREATE(fs, dvp, nvpp, str) \ 522 do { \ 523 UNMARK_VNODE(dvp); \ 524 if (nvpp && *nvpp) \ 525 UNMARK_VNODE(*nvpp); \ 526 /* Check for error return to stem vnode leakage */ \ 527 if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP)) \ 528 ungetnewvnode(*(nvpp)); \ 529 SET_ENDOP_BASE((fs), (dvp), (str)); \ 530 lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs)); \ 531 vrele(dvp); \ 532 } while(0) 533 #define SET_ENDOP_CREATE_AP(ap, str) \ 534 SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp, \ 535 (ap)->a_vpp, (str)) 536 #define SET_ENDOP_REMOVE(fs, dvp, ovp, str) \ 537 do { \ 538 UNMARK_VNODE(dvp); \ 539 if (ovp) \ 540 UNMARK_VNODE(ovp); \ 541 SET_ENDOP_BASE((fs), (dvp), (str)); \ 542 lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs)); \ 543 vrele(dvp); \ 544 if (ovp) \ 545 vrele(ovp); \ 546 } while(0) 
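/*
 * Illustrative sketch (not part of the compiled code): the directory
 * operations below all bracket the underlying ufs_*() call with the
 * macros above; lfs_create(), for example, is essentially
 *
 *	if ((error = SET_DIROP_CREATE(dvp, vpp)) != 0) {
 *		vput(dvp);
 *		return error;
 *	}
 *	error = ufs_create(ap);
 *	SET_ENDOP_CREATE_AP(ap, "create");
 *
 * SET_DIROP_* reserves segment space and marks the vnodes VU_DIROP;
 * SET_ENDOP_* drops the dirop count, releases the reservation and the
 * references taken in lfs_set_dirop(), and wakes the writer when the
 * last active dirop completes.
 */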
547 548 void 549 lfs_mark_vnode(struct vnode *vp) 550 { 551 struct inode *ip = VTOI(vp); 552 struct lfs *fs = ip->i_lfs; 553 554 mutex_enter(&lfs_lock); 555 if (!(ip->i_flag & IN_ADIROP)) { 556 if (!(vp->v_uflag & VU_DIROP)) { 557 mutex_exit(&lfs_lock); 558 mutex_enter(vp->v_interlock); 559 if (lfs_vref(vp) != 0) 560 panic("lfs_mark_vnode: could not vref"); 561 mutex_enter(&lfs_lock); 562 ++lfs_dirvcount; 563 ++fs->lfs_dirvcount; 564 TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain); 565 vp->v_uflag |= VU_DIROP; 566 } 567 ++fs->lfs_nadirop; 568 ip->i_flag &= ~IN_CDIROP; 569 ip->i_flag |= IN_ADIROP; 570 } else 571 KASSERT(vp->v_uflag & VU_DIROP); 572 mutex_exit(&lfs_lock); 573 } 574 575 void 576 lfs_unmark_vnode(struct vnode *vp) 577 { 578 struct inode *ip = VTOI(vp); 579 580 mutex_enter(&lfs_lock); 581 if (ip && (ip->i_flag & IN_ADIROP)) { 582 KASSERT(vp->v_uflag & VU_DIROP); 583 --ip->i_lfs->lfs_nadirop; 584 ip->i_flag &= ~IN_ADIROP; 585 } 586 mutex_exit(&lfs_lock); 587 } 588 589 int 590 lfs_symlink(void *v) 591 { 592 struct vop_symlink_args /* { 593 struct vnode *a_dvp; 594 struct vnode **a_vpp; 595 struct componentname *a_cnp; 596 struct vattr *a_vap; 597 char *a_target; 598 } */ *ap = v; 599 int error; 600 601 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { 602 vput(ap->a_dvp); 603 return error; 604 } 605 error = ufs_symlink(ap); 606 SET_ENDOP_CREATE_AP(ap, "symlink"); 607 return (error); 608 } 609 610 int 611 lfs_mknod(void *v) 612 { 613 struct vop_mknod_args /* { 614 struct vnode *a_dvp; 615 struct vnode **a_vpp; 616 struct componentname *a_cnp; 617 struct vattr *a_vap; 618 } */ *ap = v; 619 struct vattr *vap = ap->a_vap; 620 struct vnode **vpp = ap->a_vpp; 621 struct inode *ip; 622 int error; 623 struct mount *mp; 624 ino_t ino; 625 struct ufs_lookup_results *ulr; 626 627 /* XXX should handle this material another way */ 628 ulr = &VTOI(ap->a_dvp)->i_crap; 629 UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp)); 630 631 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { 632 vput(ap->a_dvp); 633 return error; 634 } 635 error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), 636 ap->a_dvp, ulr, vpp, ap->a_cnp); 637 638 /* Either way we're done with the dirop at this point */ 639 SET_ENDOP_CREATE_AP(ap, "mknod"); 640 641 if (error) 642 return (error); 643 644 ip = VTOI(*vpp); 645 mp = (*vpp)->v_mount; 646 ino = ip->i_number; 647 ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; 648 if (vap->va_rdev != VNOVAL) { 649 /* 650 * Want to be able to use this to make badblock 651 * inodes, so don't truncate the dev number. 652 */ 653 #if 0 654 ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev, 655 UFS_MPNEEDSWAP((*vpp)->v_mount)); 656 #else 657 ip->i_ffs1_rdev = vap->va_rdev; 658 #endif 659 } 660 661 /* 662 * Call fsync to write the vnode so that we don't have to deal with 663 * flushing it when it's marked VU_DIROP|VI_XLOCK. 664 * 665 * XXX KS - If we can't flush we also can't call vgone(), so must 666 * return. But, that leaves this vnode in limbo, also not good. 667 * Can this ever happen (barring hardware failure)? 668 */ 669 if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) { 670 panic("lfs_mknod: couldn't fsync (ino %llu)", 671 (unsigned long long)ino); 672 /* return (error); */ 673 } 674 /* 675 * Remove vnode so that it will be reloaded by VFS_VGET and 676 * checked to see if it is an alias of an existing entry in 677 * the inode cache. 678 */ 679 /* Used to be vput, but that causes us to call VOP_INACTIVE twice. 
*/ 680 681 VOP_UNLOCK(*vpp); 682 (*vpp)->v_type = VNON; 683 vgone(*vpp); 684 error = VFS_VGET(mp, ino, vpp); 685 686 if (error != 0) { 687 *vpp = NULL; 688 return (error); 689 } 690 return (0); 691 } 692 693 int 694 lfs_create(void *v) 695 { 696 struct vop_create_args /* { 697 struct vnode *a_dvp; 698 struct vnode **a_vpp; 699 struct componentname *a_cnp; 700 struct vattr *a_vap; 701 } */ *ap = v; 702 int error; 703 704 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { 705 vput(ap->a_dvp); 706 return error; 707 } 708 error = ufs_create(ap); 709 SET_ENDOP_CREATE_AP(ap, "create"); 710 return (error); 711 } 712 713 int 714 lfs_mkdir(void *v) 715 { 716 struct vop_mkdir_args /* { 717 struct vnode *a_dvp; 718 struct vnode **a_vpp; 719 struct componentname *a_cnp; 720 struct vattr *a_vap; 721 } */ *ap = v; 722 int error; 723 724 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) { 725 vput(ap->a_dvp); 726 return error; 727 } 728 error = ufs_mkdir(ap); 729 SET_ENDOP_CREATE_AP(ap, "mkdir"); 730 return (error); 731 } 732 733 int 734 lfs_remove(void *v) 735 { 736 struct vop_remove_args /* { 737 struct vnode *a_dvp; 738 struct vnode *a_vp; 739 struct componentname *a_cnp; 740 } */ *ap = v; 741 struct vnode *dvp, *vp; 742 struct inode *ip; 743 int error; 744 745 dvp = ap->a_dvp; 746 vp = ap->a_vp; 747 ip = VTOI(vp); 748 if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) { 749 if (dvp == vp) 750 vrele(vp); 751 else 752 vput(vp); 753 vput(dvp); 754 return error; 755 } 756 error = ufs_remove(ap); 757 if (ip->i_nlink == 0) 758 lfs_orphan(ip->i_lfs, ip->i_number); 759 SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove"); 760 return (error); 761 } 762 763 int 764 lfs_rmdir(void *v) 765 { 766 struct vop_rmdir_args /* { 767 struct vnodeop_desc *a_desc; 768 struct vnode *a_dvp; 769 struct vnode *a_vp; 770 struct componentname *a_cnp; 771 } */ *ap = v; 772 struct vnode *vp; 773 struct inode *ip; 774 int error; 775 776 vp = ap->a_vp; 777 ip = VTOI(vp); 778 if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) { 779 if (ap->a_dvp == vp) 780 vrele(ap->a_dvp); 781 else 782 vput(ap->a_dvp); 783 vput(vp); 784 return error; 785 } 786 error = ufs_rmdir(ap); 787 if (ip->i_nlink == 0) 788 lfs_orphan(ip->i_lfs, ip->i_number); 789 SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir"); 790 return (error); 791 } 792 793 int 794 lfs_link(void *v) 795 { 796 struct vop_link_args /* { 797 struct vnode *a_dvp; 798 struct vnode *a_vp; 799 struct componentname *a_cnp; 800 } */ *ap = v; 801 int error; 802 struct vnode **vpp = NULL; 803 804 if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) { 805 vput(ap->a_dvp); 806 return error; 807 } 808 error = ufs_link(ap); 809 SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link"); 810 return (error); 811 } 812 813 static const struct genfs_rename_ops lfs_genfs_rename_ops; 814 815 /* 816 * lfs_sane_rename: The hairiest vop, with the saner API. 817 * 818 * Arguments: 819 * 820 * . fdvp (from directory vnode), 821 * . fcnp (from component name), 822 * . tdvp (to directory vnode), 823 * . tcnp (to component name), 824 * . cred (credentials structure), and 825 * . posixly_correct (flag for behaviour if target & source link same file). 826 * 827 * fdvp and tdvp may be the same, and must be referenced and unlocked. 
828 */ 829 static int 830 lfs_sane_rename( 831 struct vnode *fdvp, struct componentname *fcnp, 832 struct vnode *tdvp, struct componentname *tcnp, 833 kauth_cred_t cred, bool posixly_correct) 834 { 835 struct ufs_lookup_results fulr, tulr; 836 837 /* 838 * XXX Provisional kludge -- ufs_lookup does not reject rename 839 * of . or .. (from or to), so we hack it here. This is not 840 * the right place: it should be caller's responsibility to 841 * reject this case. 842 */ 843 KASSERT(fcnp != NULL); 844 KASSERT(tcnp != NULL); 845 KASSERT(fcnp != tcnp); 846 KASSERT(fcnp->cn_nameptr != NULL); 847 KASSERT(tcnp->cn_nameptr != NULL); 848 849 if ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) 850 return EINVAL; /* XXX EISDIR? */ 851 if ((fcnp->cn_namelen == 1) && (fcnp->cn_nameptr[0] == '.')) 852 return EINVAL; 853 if ((tcnp->cn_namelen == 1) && (tcnp->cn_nameptr[0] == '.')) 854 return EINVAL; 855 856 return genfs_sane_rename(&lfs_genfs_rename_ops, 857 fdvp, fcnp, &fulr, tdvp, tcnp, &tulr, 858 cred, posixly_correct); 859 } 860 861 /* 862 * lfs_rename: The hairiest vop, with the insanest API. Defer to 863 * genfs_insane_rename immediately. 864 */ 865 int 866 lfs_rename(void *v) 867 { 868 869 return genfs_insane_rename(v, &lfs_sane_rename); 870 } 871 872 /* 873 * lfs_gro_rename: Actually perform the rename operation. Do a little 874 * LFS bookkeeping and then defer to ufs_gro_rename. 875 */ 876 static int 877 lfs_gro_rename(struct mount *mp, kauth_cred_t cred, 878 struct vnode *fdvp, struct componentname *fcnp, 879 void *fde, struct vnode *fvp, 880 struct vnode *tdvp, struct componentname *tcnp, 881 void *tde, struct vnode *tvp) 882 { 883 int error; 884 885 KASSERT(mp != NULL); 886 KASSERT(fdvp != NULL); 887 KASSERT(fcnp != NULL); 888 KASSERT(fde != NULL); 889 KASSERT(fvp != NULL); 890 KASSERT(tdvp != NULL); 891 KASSERT(tcnp != NULL); 892 KASSERT(tde != NULL); 893 KASSERT(fdvp != fvp); 894 KASSERT(fdvp != tvp); 895 KASSERT(tdvp != fvp); 896 KASSERT(tdvp != tvp); 897 KASSERT(fvp != tvp); 898 KASSERT(fdvp->v_mount == mp); 899 KASSERT(fvp->v_mount == mp); 900 KASSERT(tdvp->v_mount == mp); 901 KASSERT((tvp == NULL) || (tvp->v_mount == mp)); 902 KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE); 903 KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE); 904 KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); 905 KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE)); 906 907 error = SET_DIROP_REMOVE(tdvp, tvp); 908 if (error != 0) 909 return error; 910 911 MARK_VNODE(fdvp); 912 MARK_VNODE(fvp); 913 914 error = ufs_gro_rename(mp, cred, 915 fdvp, fcnp, fde, fvp, 916 tdvp, tcnp, tde, tvp); 917 918 UNMARK_VNODE(fdvp); 919 UNMARK_VNODE(fvp); 920 SET_ENDOP_REMOVE(VFSTOUFS(mp)->um_lfs, tdvp, tvp, "rename"); 921 922 return error; 923 } 924 925 static const struct genfs_rename_ops lfs_genfs_rename_ops = { 926 .gro_directory_empty_p = ufs_gro_directory_empty_p, 927 .gro_rename_check_possible = ufs_gro_rename_check_possible, 928 .gro_rename_check_permitted = ufs_gro_rename_check_permitted, 929 .gro_remove_check_possible = ufs_gro_remove_check_possible, 930 .gro_remove_check_permitted = ufs_gro_remove_check_permitted, 931 .gro_rename = lfs_gro_rename, 932 .gro_remove = ufs_gro_remove, 933 .gro_lookup = ufs_gro_lookup, 934 .gro_genealogy = ufs_gro_genealogy, 935 .gro_lock_directory = ufs_gro_lock_directory, 936 }; 937 938 /* XXX hack to avoid calling ITIMES in getattr */ 939 int 940 lfs_getattr(void *v) 941 { 942 struct vop_getattr_args /* { 943 struct vnode *a_vp; 944 struct vattr *a_vap; 945 kauth_cred_t a_cred; 946 } */ *ap = v; 947 
struct vnode *vp = ap->a_vp; 948 struct inode *ip = VTOI(vp); 949 struct vattr *vap = ap->a_vap; 950 struct lfs *fs = ip->i_lfs; 951 /* 952 * Copy from inode table 953 */ 954 vap->va_fsid = ip->i_dev; 955 vap->va_fileid = ip->i_number; 956 vap->va_mode = ip->i_mode & ~IFMT; 957 vap->va_nlink = ip->i_nlink; 958 vap->va_uid = ip->i_uid; 959 vap->va_gid = ip->i_gid; 960 vap->va_rdev = (dev_t)ip->i_ffs1_rdev; 961 vap->va_size = vp->v_size; 962 vap->va_atime.tv_sec = ip->i_ffs1_atime; 963 vap->va_atime.tv_nsec = ip->i_ffs1_atimensec; 964 vap->va_mtime.tv_sec = ip->i_ffs1_mtime; 965 vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec; 966 vap->va_ctime.tv_sec = ip->i_ffs1_ctime; 967 vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec; 968 vap->va_flags = ip->i_flags; 969 vap->va_gen = ip->i_gen; 970 /* this doesn't belong here */ 971 if (vp->v_type == VBLK) 972 vap->va_blocksize = BLKDEV_IOSIZE; 973 else if (vp->v_type == VCHR) 974 vap->va_blocksize = MAXBSIZE; 975 else 976 vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; 977 vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks); 978 vap->va_type = vp->v_type; 979 vap->va_filerev = ip->i_modrev; 980 return (0); 981 } 982 983 /* 984 * Check to make sure the inode blocks won't choke the buffer 985 * cache, then call ufs_setattr as usual. 986 */ 987 int 988 lfs_setattr(void *v) 989 { 990 struct vop_setattr_args /* { 991 struct vnode *a_vp; 992 struct vattr *a_vap; 993 kauth_cred_t a_cred; 994 } */ *ap = v; 995 struct vnode *vp = ap->a_vp; 996 997 lfs_check(vp, LFS_UNUSED_LBN, 0); 998 return ufs_setattr(v); 999 } 1000 1001 /* 1002 * Release the block we hold on lfs_newseg wrapping. Called on file close, 1003 * or explicitly from LFCNWRAPGO. Called with the interlock held. 1004 */ 1005 static int 1006 lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor) 1007 { 1008 if (fs->lfs_stoplwp != curlwp) 1009 return EBUSY; 1010 1011 fs->lfs_stoplwp = NULL; 1012 cv_signal(&fs->lfs_stopcv); 1013 1014 KASSERT(fs->lfs_nowrap > 0); 1015 if (fs->lfs_nowrap <= 0) { 1016 return 0; 1017 } 1018 1019 if (--fs->lfs_nowrap == 0) { 1020 log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt); 1021 wakeup(&fs->lfs_wrappass); 1022 lfs_wakeup_cleaner(fs); 1023 } 1024 if (waitfor) { 1025 mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment", 1026 0, &lfs_lock); 1027 } 1028 1029 return 0; 1030 } 1031 1032 /* 1033 * Close called 1034 */ 1035 /* ARGSUSED */ 1036 int 1037 lfs_close(void *v) 1038 { 1039 struct vop_close_args /* { 1040 struct vnode *a_vp; 1041 int a_fflag; 1042 kauth_cred_t a_cred; 1043 } */ *ap = v; 1044 struct vnode *vp = ap->a_vp; 1045 struct inode *ip = VTOI(vp); 1046 struct lfs *fs = ip->i_lfs; 1047 1048 if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) && 1049 fs->lfs_stoplwp == curlwp) { 1050 mutex_enter(&lfs_lock); 1051 log(LOG_NOTICE, "lfs_close: releasing log wrap control\n"); 1052 lfs_wrapgo(fs, ip, 0); 1053 mutex_exit(&lfs_lock); 1054 } 1055 1056 if (vp == ip->i_lfs->lfs_ivnode && 1057 vp->v_mount->mnt_iflag & IMNT_UNMOUNT) 1058 return 0; 1059 1060 if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) { 1061 LFS_ITIMES(ip, NULL, NULL, NULL); 1062 } 1063 return (0); 1064 } 1065 1066 /* 1067 * Close wrapper for special devices. 1068 * 1069 * Update the times on the inode then do device close. 
 */
int
lfsspec_close(void *v)
{
	struct vop_close_args /* {
		struct vnode *a_vp;
		int a_fflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct inode *ip;

	vp = ap->a_vp;
	ip = VTOI(vp);
	if (vp->v_usecount > 1) {
		LFS_ITIMES(ip, NULL, NULL, NULL);
	}
	return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
}

/*
 * Close wrapper for fifos.
 *
 * Update the times on the inode then do device close.
 */
int
lfsfifo_close(void *v)
{
	struct vop_close_args /* {
		struct vnode *a_vp;
		int a_fflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct inode *ip;

	vp = ap->a_vp;
	ip = VTOI(vp);
	if (ap->a_vp->v_usecount > 1) {
		LFS_ITIMES(ip, NULL, NULL, NULL);
	}
	return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
}

/*
 * Reclaim an inode so that it can be used for other purposes.
 */

int
lfs_reclaim(void *v)
{
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;
	int error;

	/*
	 * The inode must be freed and updated before being removed
	 * from its hash chain.  Other threads trying to gain a hold
	 * on the inode will be stalled because it is locked (VI_XLOCK).
	 */
	if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
		lfs_vfree(vp, ip->i_number, ip->i_omode);

	mutex_enter(&lfs_lock);
	LFS_CLR_UINO(ip, IN_ALLMOD);
	mutex_exit(&lfs_lock);
	if ((error = ufs_reclaim(vp)))
		return (error);

	/*
	 * Take us off the paging and/or dirop queues if we were on them.
	 * We shouldn't be on them.
	 */
	mutex_enter(&lfs_lock);
	if (ip->i_flags & IN_PAGING) {
		log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n",
		    fs->lfs_fsmnt);
		ip->i_flags &= ~IN_PAGING;
		TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
	}
	if (vp->v_uflag & VU_DIROP) {
		panic("reclaimed vnode is VU_DIROP");
		vp->v_uflag &= ~VU_DIROP;
		TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
	}
	mutex_exit(&lfs_lock);

	pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din);
	lfs_deregister_all(vp);
	pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
	ip->inode_ext.lfs = NULL;
	genfs_node_destroy(vp);
	pool_put(&lfs_inode_pool, vp->v_data);
	vp->v_data = NULL;
	return (0);
}

/*
 * Read a block from a storage device.
 * In order to avoid reading blocks that are in the process of being
 * written by the cleaner---and hence are not mutexed by the normal
 * buffer cache / page cache mechanisms---check for collisions before
 * reading.
 *
 * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
 * the active cleaner test.
 *
 * XXX This code assumes that lfs_markv makes synchronous checkpoints.
1182 */ 1183 int 1184 lfs_strategy(void *v) 1185 { 1186 struct vop_strategy_args /* { 1187 struct vnode *a_vp; 1188 struct buf *a_bp; 1189 } */ *ap = v; 1190 struct buf *bp; 1191 struct lfs *fs; 1192 struct vnode *vp; 1193 struct inode *ip; 1194 daddr_t tbn; 1195 #define MAXLOOP 25 1196 int i, sn, error, slept, loopcount; 1197 1198 bp = ap->a_bp; 1199 vp = ap->a_vp; 1200 ip = VTOI(vp); 1201 fs = ip->i_lfs; 1202 1203 /* lfs uses its strategy routine only for read */ 1204 KASSERT(bp->b_flags & B_READ); 1205 1206 if (vp->v_type == VBLK || vp->v_type == VCHR) 1207 panic("lfs_strategy: spec"); 1208 KASSERT(bp->b_bcount != 0); 1209 if (bp->b_blkno == bp->b_lblkno) { 1210 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, 1211 NULL); 1212 if (error) { 1213 bp->b_error = error; 1214 bp->b_resid = bp->b_bcount; 1215 biodone(bp); 1216 return (error); 1217 } 1218 if ((long)bp->b_blkno == -1) /* no valid data */ 1219 clrbuf(bp); 1220 } 1221 if ((long)bp->b_blkno < 0) { /* block is not on disk */ 1222 bp->b_resid = bp->b_bcount; 1223 biodone(bp); 1224 return (0); 1225 } 1226 1227 slept = 1; 1228 loopcount = 0; 1229 mutex_enter(&lfs_lock); 1230 while (slept && fs->lfs_seglock) { 1231 mutex_exit(&lfs_lock); 1232 /* 1233 * Look through list of intervals. 1234 * There will only be intervals to look through 1235 * if the cleaner holds the seglock. 1236 * Since the cleaner is synchronous, we can trust 1237 * the list of intervals to be current. 1238 */ 1239 tbn = dbtofsb(fs, bp->b_blkno); 1240 sn = dtosn(fs, tbn); 1241 slept = 0; 1242 for (i = 0; i < fs->lfs_cleanind; i++) { 1243 if (sn == dtosn(fs, fs->lfs_cleanint[i]) && 1244 tbn >= fs->lfs_cleanint[i]) { 1245 DLOG((DLOG_CLEAN, 1246 "lfs_strategy: ino %d lbn %" PRId64 1247 " ind %d sn %d fsb %" PRIx32 1248 " given sn %d fsb %" PRIx64 "\n", 1249 ip->i_number, bp->b_lblkno, i, 1250 dtosn(fs, fs->lfs_cleanint[i]), 1251 fs->lfs_cleanint[i], sn, tbn)); 1252 DLOG((DLOG_CLEAN, 1253 "lfs_strategy: sleeping on ino %d lbn %" 1254 PRId64 "\n", ip->i_number, bp->b_lblkno)); 1255 mutex_enter(&lfs_lock); 1256 if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) { 1257 /* 1258 * Cleaner can't wait for itself. 1259 * Instead, wait for the blocks 1260 * to be written to disk. 1261 * XXX we need pribio in the test 1262 * XXX here. 1263 */ 1264 mtsleep(&fs->lfs_iocount, 1265 (PRIBIO + 1) | PNORELOCK, 1266 "clean2", hz/10 + 1, 1267 &lfs_lock); 1268 slept = 1; 1269 ++loopcount; 1270 break; 1271 } else if (fs->lfs_seglock) { 1272 mtsleep(&fs->lfs_seglock, 1273 (PRIBIO + 1) | PNORELOCK, 1274 "clean1", 0, 1275 &lfs_lock); 1276 slept = 1; 1277 break; 1278 } 1279 mutex_exit(&lfs_lock); 1280 } 1281 } 1282 mutex_enter(&lfs_lock); 1283 if (loopcount > MAXLOOP) { 1284 printf("lfs_strategy: breaking out of clean2 loop\n"); 1285 break; 1286 } 1287 } 1288 mutex_exit(&lfs_lock); 1289 1290 vp = ip->i_devvp; 1291 VOP_STRATEGY(vp, bp); 1292 return (0); 1293 } 1294 1295 /* 1296 * Inline lfs_segwrite/lfs_writevnodes, but just for dirops. 1297 * Technically this is a checkpoint (the on-disk state is valid) 1298 * even though we are leaving out all the file data. 
1299 */ 1300 int 1301 lfs_flush_dirops(struct lfs *fs) 1302 { 1303 struct inode *ip, *nip; 1304 struct vnode *vp; 1305 extern int lfs_dostats; 1306 struct segment *sp; 1307 int flags = 0; 1308 int error = 0; 1309 1310 ASSERT_MAYBE_SEGLOCK(fs); 1311 KASSERT(fs->lfs_nadirop == 0); 1312 1313 if (fs->lfs_ronly) 1314 return EROFS; 1315 1316 mutex_enter(&lfs_lock); 1317 if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) { 1318 mutex_exit(&lfs_lock); 1319 return 0; 1320 } else 1321 mutex_exit(&lfs_lock); 1322 1323 if (lfs_dostats) 1324 ++lfs_stats.flush_invoked; 1325 1326 lfs_imtime(fs); 1327 lfs_seglock(fs, flags); 1328 sp = fs->lfs_sp; 1329 1330 /* 1331 * lfs_writevnodes, optimized to get dirops out of the way. 1332 * Only write dirops, and don't flush files' pages, only 1333 * blocks from the directories. 1334 * 1335 * We don't need to vref these files because they are 1336 * dirops and so hold an extra reference until the 1337 * segunlock clears them of that status. 1338 * 1339 * We don't need to check for IN_ADIROP because we know that 1340 * no dirops are active. 1341 * 1342 */ 1343 mutex_enter(&lfs_lock); 1344 for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { 1345 nip = TAILQ_NEXT(ip, i_lfs_dchain); 1346 mutex_exit(&lfs_lock); 1347 vp = ITOV(ip); 1348 1349 KASSERT((ip->i_flag & IN_ADIROP) == 0); 1350 KASSERT(vp->v_uflag & VU_DIROP); 1351 KASSERT(!(vp->v_iflag & VI_XLOCK)); 1352 1353 /* 1354 * All writes to directories come from dirops; all 1355 * writes to files' direct blocks go through the page 1356 * cache, which we're not touching. Reads to files 1357 * and/or directories will not be affected by writing 1358 * directory blocks inodes and file inodes. So we don't 1359 * really need to lock. 1360 */ 1361 if (vp->v_iflag & VI_XLOCK) { 1362 mutex_enter(&lfs_lock); 1363 continue; 1364 } 1365 /* XXX see below 1366 * waslocked = VOP_ISLOCKED(vp); 1367 */ 1368 if (vp->v_type != VREG && 1369 ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) { 1370 error = lfs_writefile(fs, sp, vp); 1371 if (!VPISEMPTY(vp) && !WRITEINPROG(vp) && 1372 !(ip->i_flag & IN_ALLMOD)) { 1373 mutex_enter(&lfs_lock); 1374 LFS_SET_UINO(ip, IN_MODIFIED); 1375 mutex_exit(&lfs_lock); 1376 } 1377 if (error && (sp->seg_flags & SEGM_SINGLE)) { 1378 mutex_enter(&lfs_lock); 1379 error = EAGAIN; 1380 break; 1381 } 1382 } 1383 KDASSERT(ip->i_number != LFS_IFILE_INUM); 1384 error = lfs_writeinode(fs, sp, ip); 1385 mutex_enter(&lfs_lock); 1386 if (error && (sp->seg_flags & SEGM_SINGLE)) { 1387 error = EAGAIN; 1388 break; 1389 } 1390 1391 /* 1392 * We might need to update these inodes again, 1393 * for example, if they have data blocks to write. 1394 * Make sure that after this flush, they are still 1395 * marked IN_MODIFIED so that we don't forget to 1396 * write them. 1397 */ 1398 /* XXX only for non-directories? --KS */ 1399 LFS_SET_UINO(ip, IN_MODIFIED); 1400 } 1401 mutex_exit(&lfs_lock); 1402 /* We've written all the dirops there are */ 1403 ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT); 1404 lfs_finalize_fs_seguse(fs); 1405 (void) lfs_writeseg(fs, sp); 1406 lfs_segunlock(fs); 1407 1408 return error; 1409 } 1410 1411 /* 1412 * Flush all vnodes for which the pagedaemon has requested pageouts. 1413 * Skip over any files that are marked VU_DIROP (since lfs_flush_dirop() 1414 * has just run, this would be an error). If we have to skip a vnode 1415 * for any reason, just skip it; if we have to wait for the cleaner, 1416 * abort. The writer daemon will call us again later. 
1417 */ 1418 int 1419 lfs_flush_pchain(struct lfs *fs) 1420 { 1421 struct inode *ip, *nip; 1422 struct vnode *vp; 1423 extern int lfs_dostats; 1424 struct segment *sp; 1425 int error, error2; 1426 1427 ASSERT_NO_SEGLOCK(fs); 1428 1429 if (fs->lfs_ronly) 1430 return EROFS; 1431 1432 mutex_enter(&lfs_lock); 1433 if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) { 1434 mutex_exit(&lfs_lock); 1435 return 0; 1436 } else 1437 mutex_exit(&lfs_lock); 1438 1439 /* Get dirops out of the way */ 1440 if ((error = lfs_flush_dirops(fs)) != 0) 1441 return error; 1442 1443 if (lfs_dostats) 1444 ++lfs_stats.flush_invoked; 1445 1446 /* 1447 * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts. 1448 */ 1449 lfs_imtime(fs); 1450 lfs_seglock(fs, 0); 1451 sp = fs->lfs_sp; 1452 1453 /* 1454 * lfs_writevnodes, optimized to clear pageout requests. 1455 * Only write non-dirop files that are in the pageout queue. 1456 * We're very conservative about what we write; we want to be 1457 * fast and async. 1458 */ 1459 mutex_enter(&lfs_lock); 1460 top: 1461 for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) { 1462 nip = TAILQ_NEXT(ip, i_lfs_pchain); 1463 vp = ITOV(ip); 1464 1465 if (!(ip->i_flags & IN_PAGING)) 1466 goto top; 1467 1468 mutex_enter(vp->v_interlock); 1469 if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) { 1470 mutex_exit(vp->v_interlock); 1471 continue; 1472 } 1473 if (vp->v_type != VREG) { 1474 mutex_exit(vp->v_interlock); 1475 continue; 1476 } 1477 if (lfs_vref(vp)) 1478 continue; 1479 mutex_exit(&lfs_lock); 1480 1481 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_RETRY) != 0) { 1482 lfs_vunref(vp); 1483 mutex_enter(&lfs_lock); 1484 continue; 1485 } 1486 1487 error = lfs_writefile(fs, sp, vp); 1488 if (!VPISEMPTY(vp) && !WRITEINPROG(vp) && 1489 !(ip->i_flag & IN_ALLMOD)) { 1490 mutex_enter(&lfs_lock); 1491 LFS_SET_UINO(ip, IN_MODIFIED); 1492 mutex_exit(&lfs_lock); 1493 } 1494 KDASSERT(ip->i_number != LFS_IFILE_INUM); 1495 error2 = lfs_writeinode(fs, sp, ip); 1496 1497 VOP_UNLOCK(vp); 1498 lfs_vunref(vp); 1499 1500 if (error == EAGAIN || error2 == EAGAIN) { 1501 lfs_writeseg(fs, sp); 1502 mutex_enter(&lfs_lock); 1503 break; 1504 } 1505 mutex_enter(&lfs_lock); 1506 } 1507 mutex_exit(&lfs_lock); 1508 (void) lfs_writeseg(fs, sp); 1509 lfs_segunlock(fs); 1510 1511 return 0; 1512 } 1513 1514 /* 1515 * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}. 
1516 */ 1517 int 1518 lfs_fcntl(void *v) 1519 { 1520 struct vop_fcntl_args /* { 1521 struct vnode *a_vp; 1522 u_int a_command; 1523 void * a_data; 1524 int a_fflag; 1525 kauth_cred_t a_cred; 1526 } */ *ap = v; 1527 struct timeval tv; 1528 struct timeval *tvp; 1529 BLOCK_INFO *blkiov; 1530 CLEANERINFO *cip; 1531 SEGUSE *sup; 1532 int blkcnt, error, oclean; 1533 size_t fh_size; 1534 struct lfs_fcntl_markv blkvp; 1535 struct lwp *l; 1536 fsid_t *fsidp; 1537 struct lfs *fs; 1538 struct buf *bp; 1539 fhandle_t *fhp; 1540 daddr_t off; 1541 1542 /* Only respect LFS fcntls on fs root or Ifile */ 1543 if (VTOI(ap->a_vp)->i_number != ROOTINO && 1544 VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) { 1545 return ufs_fcntl(v); 1546 } 1547 1548 /* Avoid locking a draining lock */ 1549 if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) { 1550 return ESHUTDOWN; 1551 } 1552 1553 /* LFS control and monitoring fcntls are available only to root */ 1554 l = curlwp; 1555 if (((ap->a_command & 0xff00) >> 8) == 'L' && 1556 (error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, 1557 KAUTH_REQ_SYSTEM_LFS_FCNTL, NULL, NULL, NULL)) != 0) 1558 return (error); 1559 1560 fs = VTOI(ap->a_vp)->i_lfs; 1561 fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx; 1562 1563 error = 0; 1564 switch ((int)ap->a_command) { 1565 case LFCNSEGWAITALL_COMPAT_50: 1566 case LFCNSEGWAITALL_COMPAT: 1567 fsidp = NULL; 1568 /* FALLSTHROUGH */ 1569 case LFCNSEGWAIT_COMPAT_50: 1570 case LFCNSEGWAIT_COMPAT: 1571 { 1572 struct timeval50 *tvp50 1573 = (struct timeval50 *)ap->a_data; 1574 timeval50_to_timeval(tvp50, &tv); 1575 tvp = &tv; 1576 } 1577 goto segwait_common; 1578 case LFCNSEGWAITALL: 1579 fsidp = NULL; 1580 /* FALLSTHROUGH */ 1581 case LFCNSEGWAIT: 1582 tvp = (struct timeval *)ap->a_data; 1583 segwait_common: 1584 mutex_enter(&lfs_lock); 1585 ++fs->lfs_sleepers; 1586 mutex_exit(&lfs_lock); 1587 1588 error = lfs_segwait(fsidp, tvp); 1589 1590 mutex_enter(&lfs_lock); 1591 if (--fs->lfs_sleepers == 0) 1592 wakeup(&fs->lfs_sleepers); 1593 mutex_exit(&lfs_lock); 1594 return error; 1595 1596 case LFCNBMAPV: 1597 case LFCNMARKV: 1598 blkvp = *(struct lfs_fcntl_markv *)ap->a_data; 1599 1600 blkcnt = blkvp.blkcnt; 1601 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) 1602 return (EINVAL); 1603 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 1604 if ((error = copyin(blkvp.blkiov, blkiov, 1605 blkcnt * sizeof(BLOCK_INFO))) != 0) { 1606 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 1607 return error; 1608 } 1609 1610 mutex_enter(&lfs_lock); 1611 ++fs->lfs_sleepers; 1612 mutex_exit(&lfs_lock); 1613 if (ap->a_command == LFCNBMAPV) 1614 error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt); 1615 else /* LFCNMARKV */ 1616 error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt); 1617 if (error == 0) 1618 error = copyout(blkiov, blkvp.blkiov, 1619 blkcnt * sizeof(BLOCK_INFO)); 1620 mutex_enter(&lfs_lock); 1621 if (--fs->lfs_sleepers == 0) 1622 wakeup(&fs->lfs_sleepers); 1623 mutex_exit(&lfs_lock); 1624 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 1625 return error; 1626 1627 case LFCNRECLAIM: 1628 /* 1629 * Flush dirops and write Ifile, allowing empty segments 1630 * to be immediately reclaimed. 
1631 */ 1632 lfs_writer_enter(fs, "pndirop"); 1633 off = fs->lfs_offset; 1634 lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP); 1635 lfs_flush_dirops(fs); 1636 LFS_CLEANERINFO(cip, fs, bp); 1637 oclean = cip->clean; 1638 LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); 1639 lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP); 1640 fs->lfs_sp->seg_flags |= SEGM_PROT; 1641 lfs_segunlock(fs); 1642 lfs_writer_leave(fs); 1643 1644 #ifdef DEBUG 1645 LFS_CLEANERINFO(cip, fs, bp); 1646 DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64 1647 " blocks, cleaned %" PRId32 " segments (activesb %d)\n", 1648 fs->lfs_offset - off, cip->clean - oclean, 1649 fs->lfs_activesb)); 1650 LFS_SYNC_CLEANERINFO(cip, fs, bp, 0); 1651 #endif 1652 1653 return 0; 1654 1655 case LFCNIFILEFH_COMPAT: 1656 /* Return the filehandle of the Ifile */ 1657 if ((error = kauth_authorize_system(l->l_cred, 1658 KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0) 1659 return (error); 1660 fhp = (struct fhandle *)ap->a_data; 1661 fhp->fh_fsid = *fsidp; 1662 fh_size = 16; /* former VFS_MAXFIDSIZ */ 1663 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size); 1664 1665 case LFCNIFILEFH_COMPAT2: 1666 case LFCNIFILEFH: 1667 /* Return the filehandle of the Ifile */ 1668 fhp = (struct fhandle *)ap->a_data; 1669 fhp->fh_fsid = *fsidp; 1670 fh_size = sizeof(struct lfs_fhandle) - 1671 offsetof(fhandle_t, fh_fid); 1672 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size); 1673 1674 case LFCNREWIND: 1675 /* Move lfs_offset to the lowest-numbered segment */ 1676 return lfs_rewind(fs, *(int *)ap->a_data); 1677 1678 case LFCNINVAL: 1679 /* Mark a segment SEGUSE_INVAL */ 1680 LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp); 1681 if (sup->su_nbytes > 0) { 1682 brelse(bp, 0); 1683 lfs_unset_inval_all(fs); 1684 return EBUSY; 1685 } 1686 sup->su_flags |= SEGUSE_INVAL; 1687 VOP_BWRITE(bp->b_vp, bp); 1688 return 0; 1689 1690 case LFCNRESIZE: 1691 /* Resize the filesystem */ 1692 return lfs_resize_fs(fs, *(int *)ap->a_data); 1693 1694 case LFCNWRAPSTOP: 1695 case LFCNWRAPSTOP_COMPAT: 1696 /* 1697 * Hold lfs_newseg at segment 0; if requested, sleep until 1698 * the filesystem wraps around. To support external agents 1699 * (dump, fsck-based regression test) that need to look at 1700 * a snapshot of the filesystem, without necessarily 1701 * requiring that all fs activity stops. 1702 */ 1703 if (fs->lfs_stoplwp == curlwp) 1704 return EALREADY; 1705 1706 mutex_enter(&lfs_lock); 1707 while (fs->lfs_stoplwp != NULL) 1708 cv_wait(&fs->lfs_stopcv, &lfs_lock); 1709 fs->lfs_stoplwp = curlwp; 1710 if (fs->lfs_nowrap == 0) 1711 log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt); 1712 ++fs->lfs_nowrap; 1713 if (*(int *)ap->a_data == 1 1714 || ap->a_command == LFCNWRAPSTOP_COMPAT) { 1715 log(LOG_NOTICE, "LFCNSTOPWRAP waiting for log wrap\n"); 1716 error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER, 1717 "segwrap", 0, &lfs_lock); 1718 log(LOG_NOTICE, "LFCNSTOPWRAP done waiting\n"); 1719 if (error) { 1720 lfs_wrapgo(fs, VTOI(ap->a_vp), 0); 1721 } 1722 } 1723 mutex_exit(&lfs_lock); 1724 return 0; 1725 1726 case LFCNWRAPGO: 1727 case LFCNWRAPGO_COMPAT: 1728 /* 1729 * Having done its work, the agent wakes up the writer. 1730 * If the argument is 1, it sleeps until a new segment 1731 * is selected. 1732 */ 1733 mutex_enter(&lfs_lock); 1734 error = lfs_wrapgo(fs, VTOI(ap->a_vp), 1735 ap->a_command == LFCNWRAPGO_COMPAT ? 
1 : 1736 *((int *)ap->a_data)); 1737 mutex_exit(&lfs_lock); 1738 return error; 1739 1740 case LFCNWRAPPASS: 1741 if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT)) 1742 return EALREADY; 1743 mutex_enter(&lfs_lock); 1744 if (fs->lfs_stoplwp != curlwp) { 1745 mutex_exit(&lfs_lock); 1746 return EALREADY; 1747 } 1748 if (fs->lfs_nowrap == 0) { 1749 mutex_exit(&lfs_lock); 1750 return EBUSY; 1751 } 1752 fs->lfs_wrappass = 1; 1753 wakeup(&fs->lfs_wrappass); 1754 /* Wait for the log to wrap, if asked */ 1755 if (*(int *)ap->a_data) { 1756 mutex_enter(ap->a_vp->v_interlock); 1757 if (lfs_vref(ap->a_vp) != 0) 1758 panic("LFCNWRAPPASS: lfs_vref failed"); 1759 VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT; 1760 log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n"); 1761 error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER, 1762 "segwrap", 0, &lfs_lock); 1763 log(LOG_NOTICE, "LFCNPASS done waiting\n"); 1764 VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT; 1765 lfs_vunref(ap->a_vp); 1766 } 1767 mutex_exit(&lfs_lock); 1768 return error; 1769 1770 case LFCNWRAPSTATUS: 1771 mutex_enter(&lfs_lock); 1772 *(int *)ap->a_data = fs->lfs_wrapstatus; 1773 mutex_exit(&lfs_lock); 1774 return 0; 1775 1776 default: 1777 return ufs_fcntl(v); 1778 } 1779 return 0; 1780 } 1781 1782 int 1783 lfs_getpages(void *v) 1784 { 1785 struct vop_getpages_args /* { 1786 struct vnode *a_vp; 1787 voff_t a_offset; 1788 struct vm_page **a_m; 1789 int *a_count; 1790 int a_centeridx; 1791 vm_prot_t a_access_type; 1792 int a_advice; 1793 int a_flags; 1794 } */ *ap = v; 1795 1796 if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM && 1797 (ap->a_access_type & VM_PROT_WRITE) != 0) { 1798 return EPERM; 1799 } 1800 if ((ap->a_access_type & VM_PROT_WRITE) != 0) { 1801 mutex_enter(&lfs_lock); 1802 LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED); 1803 mutex_exit(&lfs_lock); 1804 } 1805 1806 /* 1807 * we're relying on the fact that genfs_getpages() always read in 1808 * entire filesystem blocks. 1809 */ 1810 return genfs_getpages(v); 1811 } 1812 1813 /* 1814 * Wait for a page to become unbusy, possibly printing diagnostic messages 1815 * as well. 1816 * 1817 * Called with vp->v_interlock held; return with it held. 1818 */ 1819 static void 1820 wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label) 1821 { 1822 KASSERT(mutex_owned(vp->v_interlock)); 1823 if ((pg->flags & PG_BUSY) == 0) 1824 return; /* Nothing to wait for! */ 1825 1826 #if defined(DEBUG) && defined(UVM_PAGE_TRKOWN) 1827 static struct vm_page *lastpg; 1828 1829 if (label != NULL && pg != lastpg) { 1830 if (pg->owner_tag) { 1831 printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n", 1832 curproc->p_pid, curlwp->l_lid, label, 1833 pg, pg->owner, pg->lowner, pg->owner_tag); 1834 } else { 1835 printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n", 1836 curproc->p_pid, curlwp->l_lid, label, pg); 1837 } 1838 } 1839 lastpg = pg; 1840 #endif 1841 1842 pg->flags |= PG_WANTED; 1843 UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, "lfsput", 0); 1844 mutex_enter(vp->v_interlock); 1845 } 1846 1847 /* 1848 * This routine is called by lfs_putpages() when it can't complete the 1849 * write because a page is busy. This means that either (1) someone, 1850 * possibly the pagedaemon, is looking at this page, and will give it up 1851 * presently; or (2) we ourselves are holding the page busy in the 1852 * process of being written (either gathered or actually on its way to 1853 * disk). 
We don't need to give up the segment lock, but we might need 1854 * to call lfs_writeseg() to expedite the page's journey to disk. 1855 * 1856 * Called with vp->v_interlock held; return with it held. 1857 */ 1858 /* #define BUSYWAIT */ 1859 static void 1860 write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg, 1861 int seglocked, const char *label) 1862 { 1863 KASSERT(mutex_owned(vp->v_interlock)); 1864 #ifndef BUSYWAIT 1865 struct inode *ip = VTOI(vp); 1866 struct segment *sp = fs->lfs_sp; 1867 int count = 0; 1868 1869 if (pg == NULL) 1870 return; 1871 1872 while (pg->flags & PG_BUSY && 1873 pg->uobject == &vp->v_uobj) { 1874 mutex_exit(vp->v_interlock); 1875 if (sp->cbpp - sp->bpp > 1) { 1876 /* Write gathered pages */ 1877 lfs_updatemeta(sp); 1878 lfs_release_finfo(fs); 1879 (void) lfs_writeseg(fs, sp); 1880 1881 /* 1882 * Reinitialize FIP 1883 */ 1884 KASSERT(sp->vp == vp); 1885 lfs_acquire_finfo(fs, ip->i_number, 1886 ip->i_gen); 1887 } 1888 ++count; 1889 mutex_enter(vp->v_interlock); 1890 wait_for_page(vp, pg, label); 1891 } 1892 if (label != NULL && count > 1) { 1893 DLOG((DLOG_PAGE, "lfs_putpages[%d]: %s: %sn = %d\n", 1894 curproc->p_pid, label, (count > 0 ? "looping, " : ""), 1895 count)); 1896 } 1897 #else 1898 preempt(1); 1899 #endif 1900 KASSERT(mutex_owned(vp->v_interlock)); 1901 } 1902 1903 /* 1904 * Make sure that for all pages in every block in the given range, 1905 * either all are dirty or all are clean. If any of the pages 1906 * we've seen so far are dirty, put the vnode on the paging chain, 1907 * and mark it IN_PAGING. 1908 * 1909 * If checkfirst != 0, don't check all the pages but return at the 1910 * first dirty page. 1911 */ 1912 static int 1913 check_dirty(struct lfs *fs, struct vnode *vp, 1914 off_t startoffset, off_t endoffset, off_t blkeof, 1915 int flags, int checkfirst, struct vm_page **pgp) 1916 { 1917 int by_list; 1918 struct vm_page *curpg = NULL; /* XXX: gcc */ 1919 struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg; 1920 off_t soff = 0; /* XXX: gcc */ 1921 voff_t off; 1922 int i; 1923 int nonexistent; 1924 int any_dirty; /* number of dirty pages */ 1925 int dirty; /* number of dirty pages in a block */ 1926 int tdirty; 1927 int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT; 1928 int pagedaemon = (curlwp == uvm.pagedaemon_lwp); 1929 1930 KASSERT(mutex_owned(vp->v_interlock)); 1931 ASSERT_MAYBE_SEGLOCK(fs); 1932 top: 1933 by_list = (vp->v_uobj.uo_npages <= 1934 ((endoffset - startoffset) >> PAGE_SHIFT) * 1935 UVM_PAGE_TREE_PENALTY); 1936 any_dirty = 0; 1937 1938 if (by_list) { 1939 curpg = TAILQ_FIRST(&vp->v_uobj.memq); 1940 } else { 1941 soff = startoffset; 1942 } 1943 while (by_list || soff < MIN(blkeof, endoffset)) { 1944 if (by_list) { 1945 /* 1946 * Find the first page in a block. Skip 1947 * blocks outside our area of interest or beyond 1948 * the end of file. 1949 */ 1950 KASSERT(curpg == NULL 1951 || (curpg->flags & PG_MARKER) == 0); 1952 if (pages_per_block > 1) { 1953 while (curpg && 1954 ((curpg->offset & fs->lfs_bmask) || 1955 curpg->offset >= vp->v_size || 1956 curpg->offset >= endoffset)) { 1957 curpg = TAILQ_NEXT(curpg, listq.queue); 1958 KASSERT(curpg == NULL || 1959 (curpg->flags & PG_MARKER) == 0); 1960 } 1961 } 1962 if (curpg == NULL) 1963 break; 1964 soff = curpg->offset; 1965 } 1966 1967 /* 1968 * Mark all pages in extended range busy; find out if any 1969 * of them are dirty. 
1970 */ 1971 nonexistent = dirty = 0; 1972 for (i = 0; i == 0 || i < pages_per_block; i++) { 1973 KASSERT(mutex_owned(vp->v_interlock)); 1974 if (by_list && pages_per_block <= 1) { 1975 pgs[i] = pg = curpg; 1976 } else { 1977 off = soff + (i << PAGE_SHIFT); 1978 pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off); 1979 if (pg == NULL) { 1980 ++nonexistent; 1981 continue; 1982 } 1983 } 1984 KASSERT(pg != NULL); 1985 1986 /* 1987 * If we're holding the segment lock, we can deadlock 1988 * against a process that has our page and is waiting 1989 * for the cleaner, while the cleaner waits for the 1990 * segment lock. Just bail in that case. 1991 */ 1992 if ((pg->flags & PG_BUSY) && 1993 (pagedaemon || LFS_SEGLOCK_HELD(fs))) { 1994 if (i > 0) 1995 uvm_page_unbusy(pgs, i); 1996 DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n")); 1997 if (pgp) 1998 *pgp = pg; 1999 KASSERT(mutex_owned(vp->v_interlock)); 2000 return -1; 2001 } 2002 2003 while (pg->flags & PG_BUSY) { 2004 wait_for_page(vp, pg, NULL); 2005 KASSERT(mutex_owned(vp->v_interlock)); 2006 if (i > 0) 2007 uvm_page_unbusy(pgs, i); 2008 KASSERT(mutex_owned(vp->v_interlock)); 2009 goto top; 2010 } 2011 pg->flags |= PG_BUSY; 2012 UVM_PAGE_OWN(pg, "lfs_putpages"); 2013 2014 pmap_page_protect(pg, VM_PROT_NONE); 2015 tdirty = (pmap_clear_modify(pg) || 2016 (pg->flags & PG_CLEAN) == 0); 2017 dirty += tdirty; 2018 } 2019 if (pages_per_block > 0 && nonexistent >= pages_per_block) { 2020 if (by_list) { 2021 curpg = TAILQ_NEXT(curpg, listq.queue); 2022 } else { 2023 soff += fs->lfs_bsize; 2024 } 2025 continue; 2026 } 2027 2028 any_dirty += dirty; 2029 KASSERT(nonexistent == 0); 2030 KASSERT(mutex_owned(vp->v_interlock)); 2031 2032 /* 2033 * If any are dirty make all dirty; unbusy them, 2034 * but if we were asked to clean, wire them so that 2035 * the pagedaemon doesn't bother us about them while 2036 * they're on their way to disk. 2037 */ 2038 for (i = 0; i == 0 || i < pages_per_block; i++) { 2039 KASSERT(mutex_owned(vp->v_interlock)); 2040 pg = pgs[i]; 2041 KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI))); 2042 KASSERT(pg->flags & PG_BUSY); 2043 if (dirty) { 2044 pg->flags &= ~PG_CLEAN; 2045 if (flags & PGO_FREE) { 2046 /* 2047 * Wire the page so that 2048 * pdaemon doesn't see it again. 2049 */ 2050 mutex_enter(&uvm_pageqlock); 2051 uvm_pagewire(pg); 2052 mutex_exit(&uvm_pageqlock); 2053 2054 /* Suspended write flag */ 2055 pg->flags |= PG_DELWRI; 2056 } 2057 } 2058 if (pg->flags & PG_WANTED) 2059 wakeup(pg); 2060 pg->flags &= ~(PG_WANTED|PG_BUSY); 2061 UVM_PAGE_OWN(pg, NULL); 2062 } 2063 2064 if (checkfirst && any_dirty) 2065 break; 2066 2067 if (by_list) { 2068 curpg = TAILQ_NEXT(curpg, listq.queue); 2069 } else { 2070 soff += MAX(PAGE_SIZE, fs->lfs_bsize); 2071 } 2072 } 2073 2074 KASSERT(mutex_owned(vp->v_interlock)); 2075 return any_dirty; 2076 } 2077 2078 /* 2079 * lfs_putpages functions like genfs_putpages except that 2080 * 2081 * (1) It needs to bounds-check the incoming requests to ensure that 2082 * they are block-aligned; if they are not, expand the range and 2083 * do the right thing in case, e.g., the requested range is clean 2084 * but the expanded range is dirty. 2085 * 2086 * (2) It needs to explicitly send blocks to be written when it is done. 2087 * If VOP_PUTPAGES is called without the seglock held, we simply take 2088 * the seglock and let lfs_segunlock wait for us. 2089 * XXX There might be a bad situation if we have to flush a vnode while 2090 * XXX lfs_markv is in operation. 
As of this writing we panic in this 2091 * XXX case. 2092 * 2093 * Assumptions: 2094 * 2095 * (1) The caller does not hold any pages in this vnode busy. If it does, 2096 * there is a danger that when we expand the page range and busy the 2097 * pages we will deadlock. 2098 * 2099 * (2) We are called with vp->v_interlock held; we must return with it 2100 * released. 2101 * 2102 * (3) We don't absolutely have to free pages right away, provided that 2103 * the request does not have PGO_SYNCIO. When the pagedaemon gives 2104 * us a request with PGO_FREE, we take the pages out of the paging 2105 * queue and wake up the writer, which will handle freeing them for us. 2106 * 2107 * We ensure that for any filesystem block, all pages for that 2108 * block are either resident or not, even if those pages are higher 2109 * than EOF; that means that we will be getting requests to free 2110 * "unused" pages above EOF all the time, and should ignore them. 2111 * 2112 * (4) If we are called with PGO_LOCKED, the finfo array we are to write 2113 * into has been set up for us by lfs_writefile. If not, we will 2114 * have to handle allocating and/or freeing an finfo entry. 2115 * 2116 * XXX note that we're (ab)using PGO_LOCKED as "seglock held". 2117 */ 2118 2119 /* How many times to loop before we should start to worry */ 2120 #define TOOMANY 4 2121 2122 int 2123 lfs_putpages(void *v) 2124 { 2125 int error; 2126 struct vop_putpages_args /* { 2127 struct vnode *a_vp; 2128 voff_t a_offlo; 2129 voff_t a_offhi; 2130 int a_flags; 2131 } */ *ap = v; 2132 struct vnode *vp; 2133 struct inode *ip; 2134 struct lfs *fs; 2135 struct segment *sp; 2136 off_t origoffset, startoffset, endoffset, origendoffset, blkeof; 2137 off_t off, max_endoffset; 2138 bool seglocked, sync, pagedaemon, reclaim; 2139 struct vm_page *pg, *busypg; 2140 UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist); 2141 int oreclaim = 0; 2142 int donewriting = 0; 2143 #ifdef DEBUG 2144 int debug_n_again, debug_n_dirtyclean; 2145 #endif 2146 2147 vp = ap->a_vp; 2148 ip = VTOI(vp); 2149 fs = ip->i_lfs; 2150 sync = (ap->a_flags & PGO_SYNCIO) != 0; 2151 reclaim = (ap->a_flags & PGO_RECLAIM) != 0; 2152 pagedaemon = (curlwp == uvm.pagedaemon_lwp); 2153 2154 KASSERT(mutex_owned(vp->v_interlock)); 2155 2156 /* Putpages does nothing for metadata. */ 2157 if (vp == fs->lfs_ivnode || vp->v_type != VREG) { 2158 mutex_exit(vp->v_interlock); 2159 return 0; 2160 } 2161 2162 /* 2163 * If there are no pages, don't do anything. 2164 */ 2165 if (vp->v_uobj.uo_npages == 0) { 2166 if (TAILQ_EMPTY(&vp->v_uobj.memq) && 2167 (vp->v_iflag & VI_ONWORKLST) && 2168 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 2169 vp->v_iflag &= ~VI_WRMAPDIRTY; 2170 vn_syncer_remove_from_worklist(vp); 2171 } 2172 mutex_exit(vp->v_interlock); 2173 2174 /* Remove us from paging queue, if we were on it */ 2175 mutex_enter(&lfs_lock); 2176 if (ip->i_flags & IN_PAGING) { 2177 ip->i_flags &= ~IN_PAGING; 2178 TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); 2179 } 2180 mutex_exit(&lfs_lock); 2181 2182 KASSERT(!mutex_owned(vp->v_interlock)); 2183 return 0; 2184 } 2185 2186 blkeof = blkroundup(fs, ip->i_size); 2187 2188 /* 2189 * Ignore requests to free pages past EOF but in the same block 2190 * as EOF, unless the vnode is being reclaimed or the request 2191 * is synchronous. (If the request is sync, it comes from 2192 * lfs_truncate.) 2193 * 2194 * To avoid being flooded with this request, make these pages 2195 * look "active". 
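	 * (A minimal worked example, with assumed sizes: lfs_bsize == 8K,
	 * PAGE_SIZE == 4K and i_size == 3000, so blkeof == 8192.  A
	 * non-sync, non-reclaim request with a_offlo == 4096 names only the
	 * page beyond EOF that still lies in EOF's block; the code below
	 * activates that page and advances a_offlo to blkeof, returning
	 * early if nothing remains to be done.)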
2196 */ 2197 if (!sync && !reclaim && 2198 ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) { 2199 origoffset = ap->a_offlo; 2200 for (off = origoffset; off < blkeof; off += fs->lfs_bsize) { 2201 pg = uvm_pagelookup(&vp->v_uobj, off); 2202 KASSERT(pg != NULL); 2203 while (pg->flags & PG_BUSY) { 2204 pg->flags |= PG_WANTED; 2205 UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, 2206 "lfsput2", 0); 2207 mutex_enter(vp->v_interlock); 2208 } 2209 mutex_enter(&uvm_pageqlock); 2210 uvm_pageactivate(pg); 2211 mutex_exit(&uvm_pageqlock); 2212 } 2213 ap->a_offlo = blkeof; 2214 if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) { 2215 mutex_exit(vp->v_interlock); 2216 return 0; 2217 } 2218 } 2219 2220 /* 2221 * Extend page range to start and end at block boundaries. 2222 * (For the purposes of VOP_PUTPAGES, fragments don't exist.) 2223 */ 2224 origoffset = ap->a_offlo; 2225 origendoffset = ap->a_offhi; 2226 startoffset = origoffset & ~(fs->lfs_bmask); 2227 max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift) 2228 << fs->lfs_bshift; 2229 2230 if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) { 2231 endoffset = max_endoffset; 2232 origendoffset = endoffset; 2233 } else { 2234 origendoffset = round_page(ap->a_offhi); 2235 endoffset = round_page(blkroundup(fs, origendoffset)); 2236 } 2237 2238 KASSERT(startoffset > 0 || endoffset >= startoffset); 2239 if (startoffset == endoffset) { 2240 /* Nothing to do, why were we called? */ 2241 mutex_exit(vp->v_interlock); 2242 DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %" 2243 PRId64 "\n", startoffset)); 2244 return 0; 2245 } 2246 2247 ap->a_offlo = startoffset; 2248 ap->a_offhi = endoffset; 2249 2250 /* 2251 * If not cleaning, just send the pages through genfs_putpages 2252 * to be returned to the pool. 2253 */ 2254 if (!(ap->a_flags & PGO_CLEANIT)) { 2255 DLOG((DLOG_PAGE, "lfs_putpages: no cleanit vn %p ino %d (flags %x)\n", 2256 vp, (int)ip->i_number, ap->a_flags)); 2257 int r = genfs_putpages(v); 2258 KASSERT(!mutex_owned(vp->v_interlock)); 2259 return r; 2260 } 2261 2262 /* Set PGO_BUSYFAIL to avoid deadlocks */ 2263 ap->a_flags |= PGO_BUSYFAIL; 2264 2265 /* 2266 * Likewise, if we are asked to clean but the pages are not 2267 * dirty, we can just free them using genfs_putpages. 2268 */ 2269 #ifdef DEBUG 2270 debug_n_dirtyclean = 0; 2271 #endif 2272 do { 2273 int r; 2274 KASSERT(mutex_owned(vp->v_interlock)); 2275 2276 /* Count the number of dirty pages */ 2277 r = check_dirty(fs, vp, startoffset, endoffset, blkeof, 2278 ap->a_flags, 1, NULL); 2279 if (r < 0) { 2280 /* Pages are busy with another process */ 2281 mutex_exit(vp->v_interlock); 2282 return EDEADLK; 2283 } 2284 if (r > 0) /* Some pages are dirty */ 2285 break; 2286 2287 /* 2288 * Sometimes pages are dirtied between the time that 2289 * we check and the time we try to clean them. 2290 * Instruct lfs_gop_write to return EDEADLK in this case 2291 * so we can write them properly. 2292 */ 2293 ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE; 2294 r = genfs_do_putpages(vp, startoffset, endoffset, 2295 ap->a_flags & ~PGO_SYNCIO, &busypg); 2296 ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE; 2297 if (r != EDEADLK) { 2298 KASSERT(!mutex_owned(vp->v_interlock)); 2299 return r; 2300 } 2301 2302 /* One of the pages was busy. Start over. 
*/ 2303 mutex_enter(vp->v_interlock); 2304 wait_for_page(vp, busypg, "dirtyclean"); 2305 #ifdef DEBUG 2306 ++debug_n_dirtyclean; 2307 #endif 2308 } while(1); 2309 2310 #ifdef DEBUG 2311 if (debug_n_dirtyclean > TOOMANY) 2312 DLOG((DLOG_PAGE, "lfs_putpages: dirtyclean: looping, n = %d\n", 2313 debug_n_dirtyclean)); 2314 #endif 2315 2316 /* 2317 * Dirty and asked to clean. 2318 * 2319 * Pagedaemon can't actually write LFS pages; wake up 2320 * the writer to take care of that. The writer will 2321 * notice the pager inode queue and act on that. 2322 * 2323 * XXX We must drop the vp->interlock before taking the lfs_lock or we 2324 * get a nasty deadlock with lfs_flush_pchain(). 2325 */ 2326 if (pagedaemon) { 2327 mutex_exit(vp->v_interlock); 2328 mutex_enter(&lfs_lock); 2329 if (!(ip->i_flags & IN_PAGING)) { 2330 ip->i_flags |= IN_PAGING; 2331 TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain); 2332 } 2333 wakeup(&lfs_writer_daemon); 2334 mutex_exit(&lfs_lock); 2335 preempt(); 2336 KASSERT(!mutex_owned(vp->v_interlock)); 2337 return EWOULDBLOCK; 2338 } 2339 2340 /* 2341 * If this is a file created in a recent dirop, we can't flush its 2342 * inode until the dirop is complete. Drain dirops, then flush the 2343 * filesystem (taking care of any other pending dirops while we're 2344 * at it). 2345 */ 2346 if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT && 2347 (vp->v_uflag & VU_DIROP)) { 2348 DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n")); 2349 2350 lfs_writer_enter(fs, "ppdirop"); 2351 2352 /* Note if we hold the vnode locked */ 2353 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 2354 { 2355 DLOG((DLOG_PAGE, "lfs_putpages: dirop inode already locked\n")); 2356 } else { 2357 DLOG((DLOG_PAGE, "lfs_putpages: dirop inode not locked\n")); 2358 } 2359 mutex_exit(vp->v_interlock); 2360 2361 mutex_enter(&lfs_lock); 2362 lfs_flush_fs(fs, sync ? SEGM_SYNC : 0); 2363 mutex_exit(&lfs_lock); 2364 2365 mutex_enter(vp->v_interlock); 2366 lfs_writer_leave(fs); 2367 2368 /* The flush will have cleaned out this vnode as well, 2369 no need to do more to it. */ 2370 } 2371 2372 /* 2373 * This is it. We are going to write some pages. From here on 2374 * down it's all just mechanics. 2375 * 2376 * Don't let genfs_putpages wait; lfs_segunlock will wait for us. 2377 */ 2378 ap->a_flags &= ~PGO_SYNCIO; 2379 2380 /* 2381 * If we've already got the seglock, flush the node and return. 2382 * The FIP has already been set up for us by lfs_writefile, 2383 * and FIP cleanup and lfs_updatemeta will also be done there, 2384 * unless genfs_putpages returns EDEADLK; then we must flush 2385 * what we have, and correct FIP and segment header accounting. 2386 */ 2387 get_seglock: 2388 /* 2389 * If we are not called with the segment locked, lock it. 2390 * Account for a new FIP in the segment header, and set sp->vp. 2391 * (This should duplicate the setup at the top of lfs_writefile().) 2392 */ 2393 seglocked = (ap->a_flags & PGO_LOCKED) != 0; 2394 if (!seglocked) { 2395 mutex_exit(vp->v_interlock); 2396 error = lfs_seglock(fs, SEGM_PROT | (sync ? 
SEGM_SYNC : 0)); 2397 if (error != 0) { 2398 KASSERT(!mutex_owned(vp->v_interlock)); 2399 return error; 2400 } 2401 mutex_enter(vp->v_interlock); 2402 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); 2403 } 2404 sp = fs->lfs_sp; 2405 KASSERT(sp->vp == NULL); 2406 sp->vp = vp; 2407 2408 /* Note segments written by reclaim; only for debugging */ 2409 if ((vp->v_iflag & VI_XLOCK) != 0) { 2410 sp->seg_flags |= SEGM_RECLAIM; 2411 fs->lfs_reclino = ip->i_number; 2412 } 2413 2414 /* 2415 * Ensure that the partial segment is marked SS_DIROP if this 2416 * vnode is a DIROP. 2417 */ 2418 if (!seglocked && vp->v_uflag & VU_DIROP) 2419 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); 2420 2421 /* 2422 * Loop over genfs_putpages until all pages are gathered. 2423 * genfs_putpages() drops the interlock, so reacquire it if necessary. 2424 * Whenever we lose the interlock we have to rerun check_dirty, as 2425 * well, since more pages might have been dirtied in our absence. 2426 */ 2427 #ifdef DEBUG 2428 debug_n_again = 0; 2429 #endif 2430 do { 2431 busypg = NULL; 2432 KASSERT(mutex_owned(vp->v_interlock)); 2433 if (check_dirty(fs, vp, startoffset, endoffset, blkeof, 2434 ap->a_flags, 0, &busypg) < 0) { 2435 mutex_exit(vp->v_interlock); 2436 /* XXX why? --ks */ 2437 mutex_enter(vp->v_interlock); 2438 write_and_wait(fs, vp, busypg, seglocked, NULL); 2439 if (!seglocked) { 2440 mutex_exit(vp->v_interlock); 2441 lfs_release_finfo(fs); 2442 lfs_segunlock(fs); 2443 mutex_enter(vp->v_interlock); 2444 } 2445 sp->vp = NULL; 2446 goto get_seglock; 2447 } 2448 2449 busypg = NULL; 2450 KASSERT(!mutex_owned(&uvm_pageqlock)); 2451 oreclaim = (ap->a_flags & PGO_RECLAIM); 2452 ap->a_flags &= ~PGO_RECLAIM; 2453 error = genfs_do_putpages(vp, startoffset, endoffset, 2454 ap->a_flags, &busypg); 2455 ap->a_flags |= oreclaim; 2456 2457 if (error == EDEADLK || error == EAGAIN) { 2458 DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned" 2459 " %d ino %d off %x (seg %d)\n", error, 2460 ip->i_number, fs->lfs_offset, 2461 dtosn(fs, fs->lfs_offset))); 2462 2463 if (oreclaim) { 2464 mutex_enter(vp->v_interlock); 2465 write_and_wait(fs, vp, busypg, seglocked, "again"); 2466 mutex_exit(vp->v_interlock); 2467 } else { 2468 if ((sp->seg_flags & SEGM_SINGLE) && 2469 fs->lfs_curseg != fs->lfs_startseg) 2470 donewriting = 1; 2471 } 2472 } else if (error) { 2473 DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned" 2474 " %d ino %d off %x (seg %d)\n", error, 2475 (int)ip->i_number, fs->lfs_offset, 2476 dtosn(fs, fs->lfs_offset))); 2477 } 2478 /* genfs_do_putpages loses the interlock */ 2479 #ifdef DEBUG 2480 ++debug_n_again; 2481 #endif 2482 if (oreclaim && error == EAGAIN) { 2483 DLOG((DLOG_PAGE, "vp %p ino %d vi_flags %x a_flags %x avoiding vclean panic\n", 2484 vp, (int)ip->i_number, vp->v_iflag, ap->a_flags)); 2485 mutex_enter(vp->v_interlock); 2486 } 2487 if (error == EDEADLK) 2488 mutex_enter(vp->v_interlock); 2489 } while (error == EDEADLK || (oreclaim && error == EAGAIN)); 2490 #ifdef DEBUG 2491 if (debug_n_again > TOOMANY) 2492 DLOG((DLOG_PAGE, "lfs_putpages: again: looping, n = %d\n", debug_n_again)); 2493 #endif 2494 2495 KASSERT(sp != NULL && sp->vp == vp); 2496 if (!seglocked && !donewriting) { 2497 sp->vp = NULL; 2498 2499 /* Write indirect blocks as well */ 2500 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir); 2501 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir); 2502 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir); 2503 2504 KASSERT(sp->vp == NULL); 2505 sp->vp = vp; 2506 } 2507 2508 /* 2509 * Blocks are now 
gathered into a segment waiting to be written. 2510 * All that's left to do is update metadata, and write them. 2511 */ 2512 lfs_updatemeta(sp); 2513 KASSERT(sp->vp == vp); 2514 sp->vp = NULL; 2515 2516 /* 2517 * If we were called from lfs_writefile, we don't need to clean up 2518 * the FIP or unlock the segment lock. We're done. 2519 */ 2520 if (seglocked) { 2521 KASSERT(!mutex_owned(vp->v_interlock)); 2522 return error; 2523 } 2524 2525 /* Clean up FIP and send it to disk. */ 2526 lfs_release_finfo(fs); 2527 lfs_writeseg(fs, fs->lfs_sp); 2528 2529 /* 2530 * Remove us from paging queue if we wrote all our pages. 2531 */ 2532 if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) { 2533 mutex_enter(&lfs_lock); 2534 if (ip->i_flags & IN_PAGING) { 2535 ip->i_flags &= ~IN_PAGING; 2536 TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain); 2537 } 2538 mutex_exit(&lfs_lock); 2539 } 2540 2541 /* 2542 * XXX - with the malloc/copy writeseg, the pages are freed by now 2543 * even if we don't wait (e.g. if we hold a nested lock). This 2544 * will not be true if we stop using malloc/copy. 2545 */ 2546 KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT); 2547 lfs_segunlock(fs); 2548 2549 /* 2550 * Wait for v_numoutput to drop to zero. The seglock should 2551 * take care of this, but there is a slight possibility that 2552 * aiodoned might not have got around to our buffers yet. 2553 */ 2554 if (sync) { 2555 mutex_enter(vp->v_interlock); 2556 while (vp->v_numoutput > 0) { 2557 DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on" 2558 " num %d\n", ip->i_number, vp->v_numoutput)); 2559 cv_wait(&vp->v_cv, vp->v_interlock); 2560 } 2561 mutex_exit(vp->v_interlock); 2562 } 2563 KASSERT(!mutex_owned(vp->v_interlock)); 2564 return error; 2565 } 2566 2567 /* 2568 * Return the last logical file offset that should be written for this file 2569 * if we're doing a write that ends at "size". If writing, we need to know 2570 * about sizes on disk, i.e. fragments if there are any; if reading, we need 2571 * to know about entire blocks. 2572 */ 2573 void 2574 lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) 2575 { 2576 struct inode *ip = VTOI(vp); 2577 struct lfs *fs = ip->i_lfs; 2578 daddr_t olbn, nlbn; 2579 2580 olbn = lblkno(fs, ip->i_size); 2581 nlbn = lblkno(fs, size); 2582 if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) { 2583 *eobp = fragroundup(fs, size); 2584 } else { 2585 *eobp = blkroundup(fs, size); 2586 } 2587 } 2588 2589 #ifdef DEBUG 2590 void lfs_dump_vop(void *); 2591 2592 void 2593 lfs_dump_vop(void *v) 2594 { 2595 struct vop_putpages_args /* { 2596 struct vnode *a_vp; 2597 voff_t a_offlo; 2598 voff_t a_offhi; 2599 int a_flags; 2600 } */ *ap = v; 2601 2602 #ifdef DDB 2603 vfs_vnode_print(ap->a_vp, 0, printf); 2604 #endif 2605 lfs_dump_dinode(VTOI(ap->a_vp)->i_din.ffs1_din); 2606 } 2607 #endif 2608 2609 int 2610 lfs_mmap(void *v) 2611 { 2612 struct vop_mmap_args /* { 2613 const struct vnodeop_desc *a_desc; 2614 struct vnode *a_vp; 2615 vm_prot_t a_prot; 2616 kauth_cred_t a_cred; 2617 } */ *ap = v; 2618 2619 if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) 2620 return EOPNOTSUPP; 2621 return ufs_mmap(v); 2622 } 2623
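/*
 * Illustrative userland sketch, not part of this file and only an
 * approximation: the wrap-control cases handled in lfs_fcntl() above are
 * normally driven through fcntl(2) on a descriptor for a file in the
 * mounted LFS, along these lines (error handling omitted):
 *
 *	int wait = 1, status;
 *	int fd = open("/mnt/lfs/.", O_RDONLY);
 *
 *	fcntl(fd, LFCNWRAPSTOP, &wait);		request a stop at the next wrap
 *	fcntl(fd, LFCNWRAPSTATUS, &status);	copies out fs->lfs_wrapstatus
 *	fcntl(fd, LFCNWRAPPASS, &wait);		allow one wrap; a nonzero
 *						argument sleeps until it happens
 *
 * As the code above shows, LFCNWRAPPASS fails with EALREADY unless issued
 * by the LWP that stopped the log, and with EBUSY if wrapping was never
 * disabled in the first place.
 */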