1 /* $NetBSD: lfs_vfsops.c,v 1.361 2017/10/28 00:37:13 pgoyette Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Konrad E. Schroder <perseant@hhhh.org>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 /*- 33 * Copyright (c) 1989, 1991, 1993, 1994 34 * The Regents of the University of California. All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. 
Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 
59 * 60 * @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95 61 */ 62 63 #include <sys/cdefs.h> 64 __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.361 2017/10/28 00:37:13 pgoyette Exp $"); 65 66 #if defined(_KERNEL_OPT) 67 #include "opt_lfs.h" 68 #include "opt_quota.h" 69 #endif 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/namei.h> 74 #include <sys/proc.h> 75 #include <sys/kernel.h> 76 #include <sys/vnode.h> 77 #include <sys/mount.h> 78 #include <sys/kthread.h> 79 #include <sys/buf.h> 80 #include <sys/device.h> 81 #include <sys/mbuf.h> 82 #include <sys/file.h> 83 #include <sys/disklabel.h> 84 #include <sys/ioctl.h> 85 #include <sys/errno.h> 86 #include <sys/malloc.h> 87 #include <sys/pool.h> 88 #include <sys/socket.h> 89 #include <sys/syslog.h> 90 #include <uvm/uvm_extern.h> 91 #include <sys/sysctl.h> 92 #include <sys/conf.h> 93 #include <sys/kauth.h> 94 #include <sys/module.h> 95 #include <sys/syscallvar.h> 96 #include <sys/syscall.h> 97 #include <sys/syscallargs.h> 98 99 #include <miscfs/specfs/specdev.h> 100 101 #include <ufs/lfs/ulfs_quotacommon.h> 102 #include <ufs/lfs/ulfs_inode.h> 103 #include <ufs/lfs/ulfsmount.h> 104 #include <ufs/lfs/ulfs_bswap.h> 105 #include <ufs/lfs/ulfs_extern.h> 106 107 #include <uvm/uvm.h> 108 #include <uvm/uvm_stat.h> 109 #include <uvm/uvm_pager.h> 110 #include <uvm/uvm_pdaemon.h> 111 112 #include <ufs/lfs/lfs.h> 113 #include <ufs/lfs/lfs_accessors.h> 114 #include <ufs/lfs/lfs_kernel.h> 115 #include <ufs/lfs/lfs_extern.h> 116 117 #include <miscfs/genfs/genfs.h> 118 #include <miscfs/genfs/genfs_node.h> 119 120 MODULE(MODULE_CLASS_VFS, lfs, NULL); 121 122 static int lfs_gop_write(struct vnode *, struct vm_page **, int, int); 123 static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *); 124 125 static struct sysctllog *lfs_sysctl_log; 126 127 extern const struct vnodeopv_desc lfs_vnodeop_opv_desc; 128 extern const struct vnodeopv_desc lfs_specop_opv_desc; 129 extern const struct vnodeopv_desc 
lfs_fifoop_opv_desc; 130 131 struct lwp * lfs_writer_daemon = NULL; 132 kcondvar_t lfs_writerd_cv; 133 134 int lfs_do_flush = 0; 135 #ifdef LFS_KERNEL_RFW 136 int lfs_do_rfw = 0; 137 #endif 138 139 const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = { 140 &lfs_vnodeop_opv_desc, 141 &lfs_specop_opv_desc, 142 &lfs_fifoop_opv_desc, 143 NULL, 144 }; 145 146 struct vfsops lfs_vfsops = { 147 .vfs_name = MOUNT_LFS, 148 .vfs_min_mount_data = sizeof (struct ulfs_args), 149 .vfs_mount = lfs_mount, 150 .vfs_start = ulfs_start, 151 .vfs_unmount = lfs_unmount, 152 .vfs_root = ulfs_root, 153 .vfs_quotactl = ulfs_quotactl, 154 .vfs_statvfs = lfs_statvfs, 155 .vfs_sync = lfs_sync, 156 .vfs_vget = lfs_vget, 157 .vfs_loadvnode = lfs_loadvnode, 158 .vfs_newvnode = lfs_newvnode, 159 .vfs_fhtovp = lfs_fhtovp, 160 .vfs_vptofh = lfs_vptofh, 161 .vfs_init = lfs_init, 162 .vfs_reinit = lfs_reinit, 163 .vfs_done = lfs_done, 164 .vfs_mountroot = lfs_mountroot, 165 .vfs_snapshot = (void *)eopnotsupp, 166 .vfs_extattrctl = lfs_extattrctl, 167 .vfs_suspendctl = genfs_suspendctl, 168 .vfs_renamelock_enter = genfs_renamelock_enter, 169 .vfs_renamelock_exit = genfs_renamelock_exit, 170 .vfs_fsync = (void *)eopnotsupp, 171 .vfs_opv_descs = lfs_vnodeopv_descs 172 }; 173 174 const struct genfs_ops lfs_genfsops = { 175 .gop_size = lfs_gop_size, 176 .gop_alloc = ulfs_gop_alloc, 177 .gop_write = lfs_gop_write, 178 .gop_markupdate = ulfs_gop_markupdate, 179 }; 180 181 struct shortlong { 182 const char *sname; 183 const char *lname; 184 }; 185 186 static int 187 sysctl_lfs_dostats(SYSCTLFN_ARGS) 188 { 189 extern struct lfs_stats lfs_stats; 190 extern int lfs_dostats; 191 int error; 192 193 error = sysctl_lookup(SYSCTLFN_CALL(rnode)); 194 if (error || newp == NULL) 195 return (error); 196 197 if (lfs_dostats == 0) 198 memset(&lfs_stats, 0, sizeof(lfs_stats)); 199 200 return (0); 201 } 202 203 static void 204 lfs_sysctl_setup(struct sysctllog **clog) 205 { 206 int i; 207 extern int lfs_writeindir, 
lfs_dostats, lfs_clean_vnhead, 208 lfs_fs_pagetrip, lfs_ignore_lazy_sync; 209 #ifdef DEBUG 210 extern int lfs_debug_log_subsys[DLOG_MAX]; 211 struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! */ 212 { "rollforward", "Debug roll-forward code" }, 213 { "alloc", "Debug inode allocation and free list" }, 214 { "avail", "Debug space-available-now accounting" }, 215 { "flush", "Debug flush triggers" }, 216 { "lockedlist", "Debug locked list accounting" }, 217 { "vnode_verbose", "Verbose per-vnode-written debugging" }, 218 { "vnode", "Debug vnode use during segment write" }, 219 { "segment", "Debug segment writing" }, 220 { "seguse", "Debug segment used-bytes accounting" }, 221 { "cleaner", "Debug cleaning routines" }, 222 { "mount", "Debug mount/unmount routines" }, 223 { "pagecache", "Debug UBC interactions" }, 224 { "dirop", "Debug directory-operation accounting" }, 225 { "malloc", "Debug private malloc accounting" }, 226 }; 227 #endif /* DEBUG */ 228 struct shortlong stat_names[] = { /* Must match lfs.h! 
*/ 229 { "segsused", "Number of new segments allocated" }, 230 { "psegwrites", "Number of partial-segment writes" }, 231 { "psyncwrites", "Number of synchronous partial-segment" 232 " writes" }, 233 { "pcleanwrites", "Number of partial-segment writes by the" 234 " cleaner" }, 235 { "blocktot", "Number of blocks written" }, 236 { "cleanblocks", "Number of blocks written by the cleaner" }, 237 { "ncheckpoints", "Number of checkpoints made" }, 238 { "nwrites", "Number of whole writes" }, 239 { "nsync_writes", "Number of synchronous writes" }, 240 { "wait_exceeded", "Number of times writer waited for" 241 " cleaner" }, 242 { "write_exceeded", "Number of times writer invoked flush" }, 243 { "flush_invoked", "Number of times flush was invoked" }, 244 { "vflush_invoked", "Number of time vflush was called" }, 245 { "clean_inlocked", "Number of vnodes skipped for being dead" }, 246 { "clean_vnlocked", "Number of vnodes skipped for vget failure" }, 247 { "segs_reclaimed", "Number of segments reclaimed" }, 248 }; 249 250 sysctl_createv(clog, 0, NULL, NULL, 251 CTLFLAG_PERMANENT, 252 CTLTYPE_NODE, "lfs", 253 SYSCTL_DESCR("Log-structured file system"), 254 NULL, 0, NULL, 0, 255 CTL_VFS, 5, CTL_EOL); 256 /* 257 * XXX the "5" above could be dynamic, thereby eliminating one 258 * more instance of the "number to vfs" mapping problem, but 259 * "5" is the order as taken from sys/mount.h 260 */ 261 262 sysctl_createv(clog, 0, NULL, NULL, 263 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 264 CTLTYPE_INT, "flushindir", NULL, 265 NULL, 0, &lfs_writeindir, 0, 266 CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL); 267 sysctl_createv(clog, 0, NULL, NULL, 268 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 269 CTLTYPE_INT, "clean_vnhead", NULL, 270 NULL, 0, &lfs_clean_vnhead, 0, 271 CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL); 272 sysctl_createv(clog, 0, NULL, NULL, 273 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 274 CTLTYPE_INT, "dostats", 275 SYSCTL_DESCR("Maintain statistics on LFS operations"), 276 sysctl_lfs_dostats, 0, 
&lfs_dostats, 0, 277 CTL_VFS, 5, LFS_DOSTATS, CTL_EOL); 278 sysctl_createv(clog, 0, NULL, NULL, 279 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 280 CTLTYPE_INT, "pagetrip", 281 SYSCTL_DESCR("How many dirty pages in fs triggers" 282 " a flush"), 283 NULL, 0, &lfs_fs_pagetrip, 0, 284 CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL); 285 sysctl_createv(clog, 0, NULL, NULL, 286 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 287 CTLTYPE_INT, "ignore_lazy_sync", 288 SYSCTL_DESCR("Lazy Sync is ignored entirely"), 289 NULL, 0, &lfs_ignore_lazy_sync, 0, 290 CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL); 291 #ifdef LFS_KERNEL_RFW 292 sysctl_createv(clog, 0, NULL, NULL, 293 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 294 CTLTYPE_INT, "rfw", 295 SYSCTL_DESCR("Use in-kernel roll-forward on mount"), 296 NULL, 0, &lfs_do_rfw, 0, 297 CTL_VFS, 5, LFS_DO_RFW, CTL_EOL); 298 #endif 299 300 sysctl_createv(clog, 0, NULL, NULL, 301 CTLFLAG_PERMANENT, 302 CTLTYPE_NODE, "stats", 303 SYSCTL_DESCR("Debugging options"), 304 NULL, 0, NULL, 0, 305 CTL_VFS, 5, LFS_STATS, CTL_EOL); 306 for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) { 307 sysctl_createv(clog, 0, NULL, NULL, 308 CTLFLAG_PERMANENT|CTLFLAG_READONLY, 309 CTLTYPE_INT, stat_names[i].sname, 310 SYSCTL_DESCR(stat_names[i].lname), 311 NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]), 312 0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL); 313 } 314 315 #ifdef DEBUG 316 sysctl_createv(clog, 0, NULL, NULL, 317 CTLFLAG_PERMANENT, 318 CTLTYPE_NODE, "debug", 319 SYSCTL_DESCR("Debugging options"), 320 NULL, 0, NULL, 0, 321 CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL); 322 for (i = 0; i < DLOG_MAX; i++) { 323 sysctl_createv(clog, 0, NULL, NULL, 324 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 325 CTLTYPE_INT, dlog_names[i].sname, 326 SYSCTL_DESCR(dlog_names[i].lname), 327 NULL, 0, &(lfs_debug_log_subsys[i]), 0, 328 CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL); 329 } 330 #endif 331 } 332 333 /* old cleaner syscall interface. 
   see VOP_FCNTL() */
static const struct syscall_package lfs_syscalls[] = {
	{ SYS_lfs_bmapv,	0, (sy_call_t *)sys_lfs_bmapv },
	{ SYS_lfs_markv,	0, (sy_call_t *)sys_lfs_markv },
	{ SYS___lfs_segwait50,	0, (sy_call_t *)sys___lfs_segwait50 },
	{ SYS_lfs_segclean,	0, (sy_call_t *)sys_lfs_segclean },
	{ 0, 0, NULL },
};

/*
 * Module control.  INIT registers the legacy cleaner syscalls, attaches
 * the vfsops and builds the sysctl tree; FINI undoes all of that, but
 * only if the vfs can actually be detached (i.e. nothing is mounted).
 */
static int
lfs_modcmd(modcmd_t cmd, void *arg)
{
	int error;

	switch (cmd) {
	case MODULE_CMD_INIT:
		error = syscall_establish(NULL, lfs_syscalls);
		if (error)
			return error;
		error = vfs_attach(&lfs_vfsops);
		if (error != 0) {
			/* Undo the syscall registration on failure. */
			syscall_disestablish(NULL, lfs_syscalls);
			break;
		}
		lfs_sysctl_setup(&lfs_sysctl_log);
		break;
	case MODULE_CMD_FINI:
		error = vfs_detach(&lfs_vfsops);
		if (error != 0)
			break;
		syscall_disestablish(NULL, lfs_syscalls);
		sysctl_teardown(&lfs_sysctl_log);
		break;
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}

/*
 * XXX Same structure as FFS inodes? Should we share a common pool?
 */
struct pool lfs_inode_pool;
struct pool lfs_dinode_pool;
struct pool lfs_inoext_pool;
struct pool lfs_lbnentry_pool;

/*
 * The writer daemon.  UVM keeps track of how many dirty pages we are holding
 * in lfs_subsys_pages; the daemon flushes the filesystem when this value
 * crosses the (user-defined) threshhold LFS_MAX_PAGES.
 *
 * The daemon loops holding lfs_lock except while iterating the mount
 * list or calling into the flush routines that need the lock dropped;
 * the KASSERTs below document which state each section expects.  It
 * exits on its own once no LFS remains mounted.
 */
static void
lfs_writerd(void *arg)
{
	mount_iterator_t *iter;
	struct mount *mp;
	struct lfs *fs;
	struct vfsops *vfs = NULL;
	int fsflags;
	int lfsc;
	int wrote_something = 0;

	/* Register ourselves as the single writer daemon. */
	mutex_enter(&lfs_lock);
	KASSERTMSG(lfs_writer_daemon == NULL, "more than one LFS writer daemon");
	lfs_writer_daemon = curlwp;
	mutex_exit(&lfs_lock);

	/* Take an extra reference to the LFS vfsops. */
	vfs = vfs_getopsbyname(MOUNT_LFS);

	mutex_enter(&lfs_lock);
	for (;;) {
		KASSERT(mutex_owned(&lfs_lock));
		/*
		 * If the last pass wrote something, poll again right
		 * away; otherwise nap briefly or until woken.
		 */
		if (wrote_something == 0)
			cv_timedwait(&lfs_writerd_cv, &lfs_lock, hz/10 + 1);
		KASSERT(mutex_owned(&lfs_lock));
		wrote_something = 0;

		/*
		 * If global state wants a flush, flush everything.
		 */
		if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
		    locked_queue_bytes > LFS_MAX_BYTES ||
		    lfs_subsys_pages > LFS_MAX_PAGES) {

			if (lfs_do_flush) {
				DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n"));
			}
			if (locked_queue_count > LFS_MAX_BUFS) {
				DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n",
				      locked_queue_count, LFS_MAX_BUFS));
			}
			if (locked_queue_bytes > LFS_MAX_BYTES) {
				DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n",
				      locked_queue_bytes, LFS_MAX_BYTES));
			}
			if (lfs_subsys_pages > LFS_MAX_PAGES) {
				DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n",
				      lfs_subsys_pages, LFS_MAX_PAGES));
			}

			lfs_flush(NULL, SEGM_WRITERD, 0);
			lfs_do_flush = 0;
			KASSERT(mutex_owned(&lfs_lock));
			continue;
		}
		KASSERT(mutex_owned(&lfs_lock));
		mutex_exit(&lfs_lock);

		/*
		 * Look through the list of LFSs to see if any of them
		 * have requested pageouts.
		 */
		mountlist_iterator_init(&iter);
		lfsc = 0;
		while ((mp = mountlist_iterator_next(iter)) != NULL) {
			KASSERT(!mutex_owned(&lfs_lock));
			if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
				++lfsc;
				fs = VFSTOULFS(mp)->um_lfs;
				daddr_t ooffset = 0;
				fsflags = SEGM_SINGLE;

				mutex_enter(&lfs_lock);
				/* Remember the log offset to detect progress. */
				ooffset = lfs_sb_getoffset(fs);

				if (lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs) && fs->lfs_nowrap) {
					/* Don't try to write if we're suspended */
					mutex_exit(&lfs_lock);
					continue;
				}
				if (LFS_STARVED_FOR_SEGS(fs)) {
					mutex_exit(&lfs_lock);

					DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n"));
					lfs_wakeup_cleaner(fs);
					continue;
				}

				/*
				 * Too many dirops pending: force a
				 * checkpoint rather than a single
				 * partial segment.
				 */
				if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
				     lfs_dirvcount > LFS_MAX_DIROP) &&
				    fs->lfs_dirops == 0) {
					fsflags &= ~SEGM_SINGLE;
					fsflags |= SEGM_CKP;
					DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n"));
					lfs_flush_fs(fs, fsflags);
				} else if (fs->lfs_pdflush) {
					DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
					lfs_flush_fs(fs, fsflags);
				} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
					DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
					mutex_exit(&lfs_lock);
					lfs_writer_enter(fs, "wrdirop");
					lfs_flush_pchain(fs);
					lfs_writer_leave(fs);
					mutex_enter(&lfs_lock);
				}
				/* Progress == the log offset moved. */
				if (lfs_sb_getoffset(fs) != ooffset)
					++wrote_something;
				mutex_exit(&lfs_lock);
			}
			KASSERT(!mutex_owned(&lfs_lock));
		}
		if (lfsc == 0) {
			/* No LFS mounts remain: deregister and exit. */
			mutex_enter(&lfs_lock);
			lfs_writer_daemon = NULL;
			mutex_exit(&lfs_lock);
			mountlist_iterator_destroy(iter);
			break;
		}
		mountlist_iterator_destroy(iter);

		mutex_enter(&lfs_lock);
	}
	KASSERT(!mutex_owned(&lfs_lock));

	/* Give up our extra reference so the module can be unloaded. */
	mutex_enter(&vfs_list_lock);
	if (vfs != NULL)
		vfs->vfs_refcount--;
	mutex_exit(&vfs_list_lock);

	/* Done! */
	kthread_exit(0);
}

/*
 * Initialize the filesystem, most work done by ulfs_init.
 */
void
lfs_init(void)
{

	/*
	 * XXX: should we use separate pools for 32-bit and 64-bit
	 * dinodes?
	 */
	malloc_type_attach(M_SEGMENT);
	pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
	    "lfsinopl", &pool_allocator_nointr, IPL_NONE);
	pool_init(&lfs_dinode_pool, sizeof(union lfs_dinode), 0, 0, 0,
	    "lfsdinopl", &pool_allocator_nointr, IPL_NONE);
	pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0,
	    "lfsinoextpl", &pool_allocator_nointr, IPL_NONE);
	pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0,
	    "lfslbnpool", &pool_allocator_nointr, IPL_NONE);
	ulfs_init();

#ifdef DEBUG
	memset(lfs_log, 0, sizeof(lfs_log));
#endif
	mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&lfs_writerd_cv, "lfswrite");
	cv_init(&locked_queue_cv, "lfsbuf");
	cv_init(&lfs_writing_cv, "lfsflush");
}

/* Reinitialize; just forwards to the ulfs layer. */
void
lfs_reinit(void)
{
	ulfs_reinit();
}

/* Tear down everything lfs_init() set up, in reverse order. */
void
lfs_done(void)
{
	ulfs_done();
	mutex_destroy(&lfs_lock);
	cv_destroy(&lfs_writerd_cv);
	cv_destroy(&locked_queue_cv);
	cv_destroy(&lfs_writing_cv);
	pool_destroy(&lfs_inode_pool);
	pool_destroy(&lfs_dinode_pool);
	pool_destroy(&lfs_inoext_pool);
	pool_destroy(&lfs_lbnentry_pool);
	malloc_type_detach(M_SEGMENT);
}

/*
 * Called by main() when ulfs is going to be mounted as root.
 */
int
lfs_mountroot(void)
{
	extern struct vnode *rootvp;
	struct lfs *fs = NULL;				/* LFS */
	struct mount *mp;
	struct lwp *l = curlwp;
	struct ulfsmount *ump;
	int error;

	if (device_class(root_device) != DV_DISK)
		return (ENODEV);

	if (rootdev == NODEV)
		return (ENODEV);
	if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) {
		vrele(rootvp);
		return (error);
	}
	if ((error = lfs_mountfs(rootvp, mp, l))) {
		vfs_unbusy(mp);
		vfs_rele(mp);
		return (error);
	}
	mountlist_append(mp);
	ump = VFSTOULFS(mp);
	fs = ump->um_lfs;
	lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
	(void)lfs_statvfs(mp, &mp->mnt_stat);
	vfs_unbusy(mp);
	/* Seed the system time from the root fs's last-write timestamp. */
	setrootfstime((time_t)lfs_sb_gettstamp(VFSTOULFS(mp)->um_lfs));
	return (0);
}

/*
 * VFS Operations.
 *
 * mount system call
 *
 * Handles new mounts, MNT_UPDATE (rw/ro transitions) and MNT_GETARGS.
 */
int
lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
	struct lwp *l = curlwp;
	struct vnode *devvp;
	struct ulfs_args *args = data;
	struct ulfsmount *ump = NULL;
	struct lfs *fs = NULL;				/* LFS */
	int error = 0, update;
	mode_t accessmode;

	if (args == NULL)
		return EINVAL;
	if (*data_len < sizeof *args)
		return EINVAL;

	if (mp->mnt_flag & MNT_GETARGS) {
		ump = VFSTOULFS(mp);
		if (ump == NULL)
			return EIO;
		args->fspec = NULL;
		*data_len = sizeof *args;
		return 0;
	}

	update = mp->mnt_flag & MNT_UPDATE;

	/* Check arguments */
	if (args->fspec != NULL) {
		/*
		 * Look up the name and verify that it's sane.
		 */
		error = namei_simple_user(args->fspec,
					NSM_FOLLOW_NOEMULROOT, &devvp);
		if (error != 0)
			return (error);

		if (!update) {
			/*
			 * Be sure this is a valid block device
			 */
			if (devvp->v_type != VBLK)
				error = ENOTBLK;
			else if (bdevsw_lookup(devvp->v_rdev) == NULL)
				error = ENXIO;
		} else {
			/*
			 * Be sure we're still naming the same device
			 * used for our initial mount
			 *
			 * XXX dholland 20151010: if namei gives us a
			 * different vnode for the same device,
			 * wouldn't it be better to use it going
			 * forward rather than ignore it in favor of
			 * the old one?
			 */
			ump = VFSTOULFS(mp);
			fs = ump->um_lfs;
			if (devvp != fs->lfs_devvp) {
				if (devvp->v_rdev != fs->lfs_devvp->v_rdev)
					error = EINVAL;
				else {
					/* Same device, different vnode:
					 * keep using the original. */
					vrele(devvp);
					devvp = fs->lfs_devvp;
					vref(devvp);
				}
			}
		}
	} else {
		if (!update) {
			/* New mounts must have a filename for the device */
			return (EINVAL);
		} else {
			/* Use the extant mount */
			ump = VFSTOULFS(mp);
			fs = ump->um_lfs;
			devvp = fs->lfs_devvp;
			vref(devvp);
		}
	}


	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (error == 0) {
		/* Write access is needed when mounting (or updating to)
		 * read/write. */
		accessmode = VREAD;
		if (update ?
		    (mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
		    (mp->mnt_flag & MNT_RDONLY) == 0)
			accessmode |= VWRITE;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
		    KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp,
		    KAUTH_ARG(accessmode));
		VOP_UNLOCK(devvp);
	}

	if (error) {
		vrele(devvp);
		return (error);
	}

	if (!update) {
		int flags;

		if (mp->mnt_flag & MNT_RDONLY)
			flags = FREAD;
		else
			flags = FREAD|FWRITE;
		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_OPEN(devvp, flags, FSCRED);
		VOP_UNLOCK(devvp);
		if (error)
			goto fail;
		error = lfs_mountfs(devvp, mp, l);		/* LFS */
		if (error) {
			/* Close the device we just opened. */
			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
			(void)VOP_CLOSE(devvp, flags, NOCRED);
			VOP_UNLOCK(devvp);
			goto fail;
		}

		ump = VFSTOULFS(mp);
		fs = ump->um_lfs;
	} else {
		/*
		 * Update the mount.
		 */

		/*
		 * The initial mount got a reference on this
		 * device, so drop the one obtained via
		 * namei(), above.
		 */
		vrele(devvp);

		ump = VFSTOULFS(mp);
		fs = ump->um_lfs;

		if (fs->lfs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
			/*
			 * Changing from read/write to read-only.
			 * XXX: shouldn't we sync here? or does vfs do that?
			 */
#ifdef LFS_QUOTA2
			/* XXX: quotas should remain on when readonly */
			if (fs->lfs_use_quota2) {
				error = lfsquota2_umount(mp, 0);
				if (error) {
					return error;
				}
			}
#endif
		}

		if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
			/*
			 * Changing from read-only to read/write.
			 * Note in the superblocks that we're writing.
			 */

			/* XXX: quotas should have been on even if readonly */
			if (fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
				error = lfs_quota2_mount(mp);
#else
				uprintf("%s: no kernel support for this "
					"filesystem's quotas\n",
					mp->mnt_stat.f_mntonname);
				if (mp->mnt_flag & MNT_FORCE) {
					uprintf("%s: mounting anyway; "
						"fsck afterwards\n",
						mp->mnt_stat.f_mntonname);
				} else {
					error = EINVAL;
				}
#endif
				if (error) {
					return error;
				}
			}

			fs->lfs_ronly = 0;
			/* Clear the CLEAN flag in both superblocks. */
			if (lfs_sb_getpflags(fs) & LFS_PF_CLEAN) {
				lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
				lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
				lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
			}
		}
		if (args->fspec == NULL)
			return EINVAL;
	}

	error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
	    UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
	if (error == 0)
		lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
	return error;

fail:
	vrele(devvp);
	return (error);
}

/*
 * Helper for mountfs. Note that the fs pointer may be a dummy one
 * pointing into a superblock buffer. (Which is gross; see below.)
 */
static int
lfs_checkmagic(struct lfs *fs)
{
	/*
	 * Inspect the raw (32-bit view) magic number and record in the
	 * in-memory superblock whether this is an LFS64 volume and
	 * whether the on-disk byte order differs from the host's.
	 * Returns EINVAL if the magic matches none of the known values.
	 */
	switch (fs->lfs_dlfs_u.u_32.dlfs_magic) {
	case LFS_MAGIC:
		fs->lfs_is64 = false;
		fs->lfs_dobyteswap = false;
		break;
	case LFS64_MAGIC:
		fs->lfs_is64 = true;
		fs->lfs_dobyteswap = false;
		break;
#ifdef LFS_EI
	/* Opposite-endian volumes are only accepted with LFS_EI. */
	case LFS_MAGIC_SWAPPED:
		fs->lfs_is64 = false;
		fs->lfs_dobyteswap = true;
		break;
	case LFS64_MAGIC_SWAPPED:
		fs->lfs_is64 = true;
		fs->lfs_dobyteswap = true;
		break;
#endif
	default:
		/* XXX needs translation */
		return EINVAL;
	}
	return 0;
}

/*
 * Common code for mount and mountroot
 * LFS specific
 */
int
lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
{
	static bool lfs_mounted_once = false;
	struct lfs *primarysb, *altsb, *thesb;
	struct buf *primarybuf, *altbuf;
	struct lfs *fs;
	struct ulfsmount *ump;
	struct vnode *vp;
	dev_t dev;
	int error, i, ronly, fsbsize;
	kauth_cred_t cred;
	CLEANERINFO *cip;
	SEGUSE *sup;
	daddr_t sb_addr;

	cred = l ? l->l_cred : NOCRED;

	/* The superblock is supposed to be 512 bytes. */
	__CTASSERT(sizeof(struct dlfs) == DEV_BSIZE);

	/*
	 * Flush out any old buffers remaining from a previous use.
	 */
	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
	error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
	VOP_UNLOCK(devvp);
	if (error)
		return (error);

	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;

	/* Don't free random space on error. */
	primarybuf = NULL;
	altbuf = NULL;
	ump = NULL;

	sb_addr = LFS_LABELPAD / DEV_BSIZE;
	while (1) {
		/*
		 * Read in the superblock.
901 * 902 * Note that because LFS_SBPAD is substantially larger 903 * (8K) than the actual on-disk superblock (512 bytes) 904 * the buffer contains enough space to be used as a 905 * whole struct lfs (in-memory superblock) - we do this 906 * only so we can set and use the is64 and dobyteswap 907 * members. XXX this is gross and the logic here should 908 * be reworked. 909 */ 910 error = bread(devvp, sb_addr, LFS_SBPAD, 0, &primarybuf); 911 if (error) 912 goto out; 913 primarysb = (struct lfs *)primarybuf->b_data; 914 915 /* Check the basics. */ 916 error = lfs_checkmagic(primarysb); 917 if (error) { 918 DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock wrong magic\n")); 919 goto out; 920 } 921 if (lfs_sb_getbsize(primarysb) > MAXBSIZE || 922 lfs_sb_getversion(primarysb) > LFS_VERSION || 923 lfs_sb_getbsize(primarysb) < sizeof(struct dlfs)) { 924 DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n")); 925 /* XXX needs translation */ 926 error = EINVAL; 927 goto out; 928 } 929 if (lfs_sb_getinodefmt(primarysb) > LFS_MAXINODEFMT) { 930 DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n", 931 lfs_sb_getinodefmt(primarysb))); 932 error = EINVAL; 933 goto out; 934 } 935 936 if (lfs_sb_getversion(primarysb) == 1) 937 fsbsize = DEV_BSIZE; 938 else { 939 fsbsize = 1 << lfs_sb_getffshift(primarysb); 940 /* 941 * Could be, if the frag size is large enough, that we 942 * don't have the "real" primary superblock. If that's 943 * the case, get the real one, and try again. 
944 */ 945 if (sb_addr != (lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))) { 946 DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr" 947 " 0x%llx is not right, trying 0x%llx\n", 948 (long long)sb_addr, 949 (long long)(lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT)))); 950 sb_addr = lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT); 951 brelse(primarybuf, BC_INVAL); 952 continue; 953 } 954 } 955 break; 956 } 957 958 /* 959 * Check the second superblock to see which is newer; then mount 960 * using the older of the two. This is necessary to ensure that 961 * the filesystem is valid if it was not unmounted cleanly. 962 */ 963 964 if (lfs_sb_getsboff(primarysb, 1) && 965 lfs_sb_getsboff(primarysb, 1) - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize) 966 { 967 error = bread(devvp, lfs_sb_getsboff(primarysb, 1) * (fsbsize / DEV_BSIZE), 968 LFS_SBPAD, 0, &altbuf); 969 if (error) 970 goto out; 971 altsb = (struct lfs *)altbuf->b_data; 972 973 /* 974 * Note: this used to do the sanity check only if the 975 * timestamp/serial comparison required use of altsb; 976 * this way is less tolerant, but if altsb is corrupted 977 * enough that the magic number, version, and blocksize 978 * are bogus, why would the timestamp or serial fields 979 * mean anything either? If this kind of thing happens, 980 * you need to fsck anyway. 981 */ 982 983 error = lfs_checkmagic(altsb); 984 if (error) 985 goto out; 986 987 /* Check the basics. 
 */
		/* Sanity-check the alternate superblock before trusting it. */
		if (lfs_sb_getbsize(altsb) > MAXBSIZE ||
		    lfs_sb_getversion(altsb) > LFS_VERSION ||
		    lfs_sb_getbsize(altsb) < sizeof(struct dlfs)) {
			DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock"
			      " sanity failed\n"));
			error = EINVAL;	/* XXX needs translation */
			goto out;
		}

		/*
		 * Of the two valid superblocks, keep the older one: it
		 * describes the last checkpoint that is known complete.
		 */
		if (lfs_sb_getversion(primarysb) == 1) {
			/* 1s resolution comparison */
			if (lfs_sb_gettstamp(altsb) < lfs_sb_gettstamp(primarysb))
				thesb = altsb;
			else
				thesb = primarysb;
		} else {
			/* monotonic infinite-resolution comparison */
			if (lfs_sb_getserial(altsb) < lfs_sb_getserial(primarysb))
				thesb = altsb;
			else
				thesb = primarysb;
		}
	} else {
		DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock location"
		      " daddr=0x%x\n", lfs_sb_getsboff(primarysb, 1)));
		error = EINVAL;
		goto out;
	}

	/*
	 * Allocate the mount structure, copy the superblock into it.
	 * Note that the 32-bit and 64-bit superblocks are the same size.
	 */
	fs = kmem_zalloc(sizeof(struct lfs), KM_SLEEP);
	memcpy(&fs->lfs_dlfs_u.u_32, &thesb->lfs_dlfs_u.u_32,
	    sizeof(struct dlfs));
	fs->lfs_is64 = thesb->lfs_is64;
	fs->lfs_dobyteswap = thesb->lfs_dobyteswap;
	fs->lfs_hasolddirfmt = false; /* set for real below */

	/* Compatibility: synthesize v2 fields for a version-1 filesystem. */
	if (lfs_sb_getversion(fs) < 2) {
		lfs_sb_setsumsize(fs, LFS_V1_SUMMARY_SIZE);
		lfs_sb_setibsize(fs, lfs_sb_getbsize(fs));
		lfs_sb_sets0addr(fs, lfs_sb_getsboff(fs, 0));
		lfs_sb_settstamp(fs, lfs_sb_getotstamp(fs));
		lfs_sb_setfsbtodb(fs, 0);
	}
	/* Older filesystems may not record a segment reservation; derive one. */
	if (lfs_sb_getresvseg(fs) == 0)
		lfs_sb_setresvseg(fs, MIN(lfs_sb_getminfreeseg(fs) - 1, \
			MAX(MIN_RESV_SEGS, lfs_sb_getminfreeseg(fs) / 2 + 1)));

	/*
	 * If we aren't going to be able to write meaningfully to this
	 * filesystem, and were not mounted readonly, bomb out now.
	 */
	if (lfs_fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) {
		DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write,"
		      " we need BUFPAGES >= %lld\n",
		      (long long)((bufmem_hiwater / bufmem_lowater) *
				  LFS_INVERSE_MAX_BYTES(
					  lfs_fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT)));
		kmem_free(fs, sizeof(struct lfs));
		error = EFBIG; /* XXX needs translation */
		goto out;
	}

	/* Before rolling forward, lock so vget will sleep for other procs */
	if (l != NULL) {
		fs->lfs_flags = LFS_NOTYET;
		fs->lfs_rfpid = l->l_proc->p_pid;
	}

	ump = kmem_zalloc(sizeof(*ump), KM_SLEEP);
	ump->um_lfs = fs;
	ump->um_fstype = fs->lfs_is64 ? ULFS2 : ULFS1;
	/* ump->um_cleaner_thread = NULL; */
	/* Done with the raw superblock buffers; NULL them for the out: path. */
	brelse(primarybuf, BC_INVAL);
	brelse(altbuf, BC_INVAL);
	primarybuf = NULL;
	altbuf = NULL;


	/* Set up the I/O information */
	fs->lfs_devbsize = DEV_BSIZE;
	fs->lfs_iocount = 0;
	fs->lfs_diropwait = 0;
	fs->lfs_activesb = 0;
	lfs_sb_setuinodes(fs, 0);
	fs->lfs_ravail = 0;
	fs->lfs_favail = 0;
	fs->lfs_sbactive = 0;

	/* Set up the ifile and lock aflags */
	fs->lfs_doifile = 0;
	fs->lfs_writer = 0;
	fs->lfs_dirops = 0;
	fs->lfs_nadirop = 0;
	fs->lfs_seglock = 0;
	fs->lfs_pdflush = 0;
	fs->lfs_sleepers = 0;
	fs->lfs_pages = 0;
	rw_init(&fs->lfs_fraglock);
	rw_init(&fs->lfs_iflock);
	cv_init(&fs->lfs_sleeperscv, "lfs_slp");
	cv_init(&fs->lfs_diropscv, "lfs_dirop");
	cv_init(&fs->lfs_stopcv, "lfsstop");
	cv_init(&fs->lfs_nextsegsleep, "segment");

	/* Initialize values for all LFS mounts (one-time global init). */
	if (!lfs_mounted_once) {
		cv_init(&lfs_allclean_wakeup, "segment");
		lfs_mounted_once = true;
	}

	/* Set the file system readonly/modify bits. */
	fs->lfs_ronly = ronly;
	if (ronly == 0)
		fs->lfs_fmod = 1;

	/* Device we're using */
	dev = devvp->v_rdev;
	fs->lfs_dev = dev;
	fs->lfs_devvp = devvp;

	/* ulfs-level information */
	fs->um_flags = 0;
	fs->um_bptrtodb = lfs_sb_getffshift(fs) - DEV_BSHIFT;
	fs->um_seqinc = lfs_sb_getfrag(fs);
	fs->um_nindir = lfs_sb_getnindir(fs);
	fs->um_lognindir = ffs(lfs_sb_getnindir(fs)) - 1;
	fs->um_maxsymlinklen = lfs_sb_getmaxsymlinklen(fs);
	fs->um_dirblksiz = LFS_DIRBLKSIZ;
	fs->um_maxfilesize = lfs_sb_getmaxfilesize(fs);

	/* quota stuff */
	/* XXX: these need to come from the on-disk superblock to be used */
	fs->lfs_use_quota2 = 0;
	fs->lfs_quota_magic = 0;
	fs->lfs_quota_flags = 0;
	fs->lfs_quotaino[0] = 0;
	fs->lfs_quotaino[1] = 0;

	/* Initialize the mount structure. */
	mp->mnt_data = ump;
	mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
	mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS);
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	mp->mnt_stat.f_namemax = LFS_MAXNAMLEN;
	mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs);
	mp->mnt_flag |= MNT_LOCAL;
	mp->mnt_fs_bshift = lfs_sb_getbshift(fs);
	if (fs->um_maxsymlinklen > 0)
		mp->mnt_iflag |= IMNT_DTYPE;
	else
		fs->lfs_hasolddirfmt = true;

	ump->um_mountp = mp;
	for (i = 0; i < ULFS_MAXQUOTAS; i++)
		ump->um_quotas[i] = NULLVP;
	spec_node_setmountedfs(devvp, mp);

	/* Set up reserved memory for pageout */
	lfs_setup_resblks(fs);
	/* Set up vdirop tailq */
	TAILQ_INIT(&fs->lfs_dchainhd);
	/* and paging tailq */
	TAILQ_INIT(&fs->lfs_pchainhd);
	/* and delayed segment accounting for truncation list */
	LIST_INIT(&fs->lfs_segdhd);

	/*
	 * We use the ifile vnode for almost every operation.  Instead of
	 * retrieving it from the hash table each time we retrieve it here,
	 * artificially increment the reference count and keep a pointer
	 * to it in the incore copy of the superblock.
	 */
	if ((error = VFS_VGET(mp, LFS_IFILE_INUM, &vp)) != 0) {
		DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error));
		goto out;
	}
	fs->lfs_ivnode = vp;
	vref(vp);

	/* Set up inode bitmap and order free list */
	lfs_order_freelist(fs);

	/* Set up segment usage flags for the autocleaner. */
	fs->lfs_nactive = 0;
	fs->lfs_suflags = malloc(2 * sizeof(u_int32_t *),
	    M_SEGMENT, M_WAITOK);
	fs->lfs_suflags[0] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
	    M_SEGMENT, M_WAITOK);
	fs->lfs_suflags[1] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
	    M_SEGMENT, M_WAITOK);
	memset(fs->lfs_suflags[1], 0, lfs_sb_getnseg(fs) * sizeof(u_int32_t));
	/*
	 * Walk the segment table: normalize EMPTY against the byte counts
	 * and clear stale ACTIVE/INVAL bits left by the previous mount.
	 * Only modified entries are written back.
	 */
	for (i = 0; i < lfs_sb_getnseg(fs); i++) {
		int changed;
		struct buf *bp;

		LFS_SEGENTRY(sup, fs, i, bp);
		changed = 0;
		if (!ronly) {
			if (sup->su_nbytes == 0 &&
			    !(sup->su_flags & SEGUSE_EMPTY)) {
				sup->su_flags |= SEGUSE_EMPTY;
				++changed;
			} else if (!(sup->su_nbytes == 0) &&
				   (sup->su_flags & SEGUSE_EMPTY)) {
				sup->su_flags &= ~SEGUSE_EMPTY;
				++changed;
			}
			if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) {
				sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL);
				++changed;
			}
		}
		fs->lfs_suflags[0][i] = sup->su_flags;
		if (changed)
			LFS_WRITESEGENTRY(sup, fs, i, bp);
		else
			brelse(bp, 0);
	}

	/*
	 * XXX: if the fs has quotas, quotas should be on even if
	 * readonly.  Otherwise you can't query the quota info!
	 * However, that's not how the quota2 code got written and I
	 * don't know if it'll behave itself if enabled while
	 * readonly, so for now use the same enable logic as ffs.
	 *
	 * XXX: also, if you use the -f behavior allowed here (and
	 * equivalently above for remount) it will corrupt the fs.  It
	 * ought not to allow that.  It should allow mounting readonly
	 * if there are quotas and the kernel doesn't have the quota
	 * code, but only readonly.
	 *
	 * XXX: and if you use the -f behavior allowed here it will
	 * likely crash at unmount time (or remount time) because we
	 * think quotas are active.
	 *
	 * Although none of this applies until there's a way to set
	 * lfs_use_quota2 and have quotas in the fs at all.
	 */
	if (!ronly && fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
		error = lfs_quota2_mount(mp);
#else
		uprintf("%s: no kernel support for this filesystem's quotas\n",
			mp->mnt_stat.f_mntonname);
		if (mp->mnt_flag & MNT_FORCE) {
			uprintf("%s: mounting anyway; fsck afterwards\n",
				mp->mnt_stat.f_mntonname);
		} else {
			error = EINVAL;
		}
#endif
		if (error) {
			/* XXX XXX must clean up the stuff immediately above */
			printf("lfs_mountfs: sorry, leaking some memory\n");
			goto out;
		}
	}

#ifdef LFS_KERNEL_RFW
	lfs_roll_forward(fs, mp, l);
#endif

	/* If writing, sb is not clean; record in case of immediate crash */
	if (!fs->lfs_ronly) {
		lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
		lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
		lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
	}

	/* Allow vget now that roll-forward is complete */
	fs->lfs_flags &= ~(LFS_NOTYET);
	wakeup(&fs->lfs_flags);

	/*
	 * Initialize the ifile cleaner info with information from
	 * the superblock.
	 */
	{
		struct buf *bp;

		LFS_CLEANERINFO(cip, fs, bp);
		lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
		lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
		lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs));
		lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs));
		(void) LFS_BWRITE_LOG(bp); /* Ifile */
	}

	/*
	 * Mark the current segment as ACTIVE, since we're going to
	 * be writing to it.
	 */
	{
		struct buf *bp;

		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp);
		sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
		fs->lfs_nactive++;
		LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp); /* Ifile */
	}

	/* Now that roll-forward is done, unlock the Ifile */
	vput(vp);

	/* Start the pagedaemon-anticipating daemon */
	mutex_enter(&lfs_lock);
	if (lfs_writer_daemon == NULL &&
	    kthread_create(PRI_BIO, 0, NULL,
	    lfs_writerd, NULL, NULL, "lfs_writer") != 0)
		panic("fork lfs_writer");
	mutex_exit(&lfs_lock);

	printf("WARNING: the log-structured file system is experimental\n"
	    "WARNING: it may cause system crashes and/or corrupt data\n");

	return (0);

out:
	/* Common error unwind: release superblock buffers and mount state. */
	if (primarybuf)
		brelse(primarybuf, BC_INVAL);
	if (altbuf)
		brelse(altbuf, BC_INVAL);
	if (ump) {
		kmem_free(ump->um_lfs, sizeof(struct lfs));
		kmem_free(ump, sizeof(*ump));
		mp->mnt_data = NULL;
	}

	return (error);
}

/*
 * unmount system call
 *
 * Checkpoints the log twice, waits for in-flight users and I/O to
 * drain, marks the superblocks clean, and tears down all per-mount
 * state.  Returns 0 or an errno; with MNT_FORCE, vnodes are flushed
 * with FORCECLOSE.
 */
int
lfs_unmount(struct mount *mp, int mntflags)
{
	struct lwp *l = curlwp;
	struct ulfsmount *ump;
	struct lfs *fs;
	int error, flags, ronly;
	vnode_t *vp;

	flags = 0;
	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;

	ump = VFSTOULFS(mp);
	fs = ump->um_lfs;

	/* Two checkpoints */
	lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
	lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);

	/* wake up the cleaner so it can die */
	/* XXX: shouldn't this be *after* the error cases below? */
	lfs_wakeup_cleaner(fs);
	mutex_enter(&lfs_lock);
	while (fs->lfs_sleepers)
		cv_wait(&fs->lfs_sleeperscv, &lfs_lock);
	mutex_exit(&lfs_lock);

#ifdef LFS_EXTATTR
	if (ump->um_fstype == ULFS1) {
		if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_STARTED) {
			ulfs_extattr_stop(mp, curlwp);
		}
		if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_INITIALIZED) {
			ulfs_extattr_uepm_destroy(&ump->um_extattr);
			mp->mnt_flag &= ~MNT_EXTATTR;
		}
	}
#endif
#ifdef LFS_QUOTA
	if ((error = lfsquota1_umount(mp, flags)) != 0)
		return (error);
#endif
#ifdef LFS_QUOTA2
	if ((error = lfsquota2_umount(mp, flags)) != 0)
		return (error);
#endif
	if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0)
		return (error);
	if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0)
		return (error);
	vp = fs->lfs_ivnode;
	mutex_enter(vp->v_interlock);
	if (LIST_FIRST(&vp->v_dirtyblkhd))
		panic("lfs_unmount: still dirty blocks on ifile vnode");
	mutex_exit(vp->v_interlock);

	/* Explicitly write the superblock, to update serial and pflags */
	lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) | LFS_PF_CLEAN);
	lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
	lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
	mutex_enter(&lfs_lock);
	while (fs->lfs_iocount)
		mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0,
			&lfs_lock);
	mutex_exit(&lfs_lock);

	/* Finish with the Ifile, now that we're done with it */
	vgone(fs->lfs_ivnode);

	/*
	 * NOTE(review): 'ronly' here is the INVERSE of the mount's
	 * read-only state, so the FREAD : FREAD|FWRITE selection below
	 * appears swapped (a read-only mount closes with FWRITE set).
	 * This matches the historical code but should be confirmed
	 * against the open flags used in the mount path.
	 */
	ronly = !fs->lfs_ronly;
	if (fs->lfs_devvp->v_type != VBAD)
		spec_node_setmountedfs(fs->lfs_devvp, NULL);
	vn_lock(fs->lfs_devvp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_CLOSE(fs->lfs_devvp,
	    ronly ? FREAD : FREAD|FWRITE, NOCRED);
	vput(fs->lfs_devvp);

	/* Complain about page leakage */
	if (fs->lfs_pages > 0)
		printf("lfs_unmount: still claim %d pages (%d in subsystem)\n",
			fs->lfs_pages, lfs_subsys_pages);

	/* Free per-mount data structures */
	free(fs->lfs_ino_bitmap, M_SEGMENT);
	free(fs->lfs_suflags[0], M_SEGMENT);
	free(fs->lfs_suflags[1], M_SEGMENT);
	free(fs->lfs_suflags, M_SEGMENT);
	lfs_free_resblks(fs);
	cv_destroy(&fs->lfs_sleeperscv);
	cv_destroy(&fs->lfs_diropscv);
	cv_destroy(&fs->lfs_stopcv);
	cv_destroy(&fs->lfs_nextsegsleep);

	rw_destroy(&fs->lfs_fraglock);
	rw_destroy(&fs->lfs_iflock);

	kmem_free(fs, sizeof(struct lfs));
	kmem_free(ump, sizeof(*ump));

	mp->mnt_data = NULL;
	mp->mnt_flag &= ~MNT_LOCAL;
	return (error);
}

/*
 * Get file system statistics.
 *
 * NB: We don't lock to access the superblock here, because it's not
 * really that important if we get it wrong.
 */
int
lfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
	struct lfs *fs;
	struct ulfsmount *ump;

	ump = VFSTOULFS(mp);
	fs = ump->um_lfs;

	sbp->f_bsize = lfs_sb_getbsize(fs);
	sbp->f_frsize = lfs_sb_getfsize(fs);
	sbp->f_iosize = lfs_sb_getbsize(fs);
	/* Deduct blocks pinned by the ifile itself from the usable total. */
	sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks;

	sbp->f_bfree = LFS_EST_BFREE(fs);
	/*
	 * XXX this should be lfs_sb_getsize (measured in frags)
	 * rather than dsize (measured in diskblocks). However,
	 * getsize needs a format version check (for version 1 it
	 * needs to be blockstofrags'd) so for the moment I'm going to
	 * leave this... it won't fire wrongly as frags are at least
	 * as big as diskblocks.
	 */
	KASSERT(sbp->f_bfree <= lfs_sb_getdsize(fs));
#if 0
	if (sbp->f_bfree < 0)
		sbp->f_bfree = 0;
#endif

	sbp->f_bresvd = LFS_EST_RSVD(fs);
	if (sbp->f_bfree > sbp->f_bresvd)
		sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
	else
		sbp->f_bavail = 0;

	/* XXX: huh? - dholland 20150728 */
	sbp->f_files = lfs_sb_getbfree(fs) / lfs_btofsb(fs, lfs_sb_getibsize(fs))
	    * LFS_INOPB(fs);
	sbp->f_ffree = sbp->f_files - lfs_sb_getnfiles(fs);
	sbp->f_favail = sbp->f_ffree;
	sbp->f_fresvd = 0;
	copy_statvfs_info(sbp, mp);
	return (0);
}

/*
 * Go through the disk queues to initiate sandbagged IO;
 * go through the inodes to write those that have been modified;
 * initiate the writing of the super block if it has been modified.
 *
 * Note: we are always called with the filesystem marked `MPBUSY'.
 */
int
lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
{
	int error;
	struct lfs *fs;

	fs = VFSTOULFS(mp)->um_lfs;
	/* Nothing to write back on a read-only mount. */
	if (fs->lfs_ronly)
		return 0;

	/* Snapshots should not hose the syncer */
	/*
	 * XXX Sync can block here anyway, since we don't have a very
	 * XXX good idea of how much data is pending.  If it's more
	 * XXX than a segment and lfs_nextseg is close to the end of
	 * XXX the log, we'll likely block.
	 */
	mutex_enter(&lfs_lock);
	if (fs->lfs_nowrap && lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs)) {
		mutex_exit(&lfs_lock);
		return 0;
	}
	mutex_exit(&lfs_lock);

	/* Exclude directory operations for the duration of the write. */
	lfs_writer_enter(fs, "lfs_dirops");

	/* All syncs must be checkpoints until roll-forward is implemented. */
	DLOG((DLOG_FLUSH, "lfs_sync at 0x%jx\n",
	      (uintmax_t)lfs_sb_getoffset(fs)));
	error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0));
	lfs_writer_leave(fs);
#ifdef LFS_QUOTA
	lfs_qsync(mp);
#endif
	return (error);
}

/*
 * Look up an LFS dinode number to find its incore vnode.  If not already
 * in core, read it in from the specified device.  Return the inode locked.
 * Detection and handling of mount points must be done by the calling routine.
 */
int
lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
{
	int error;

	/* vcache_get loads the vnode via lfs_loadvnode if not cached. */
	error = vcache_get(mp, &ino, sizeof(ino), vpp);
	if (error)
		return error;
	error = vn_lock(*vpp, LK_EXCLUSIVE);
	if (error) {
		vrele(*vpp);
		*vpp = NULL;
		return error;
	}

	return 0;
}

/*
 * Create a new vnode/inode pair and initialize what fields we can.
 *
 * Allocates the incore inode, its dinode, and the LFS inode extension
 * from their pools (PR_WAITOK: may sleep, never fails) and wires the
 * pair together.  Caller completes initialization and calls lfs_vinit.
 */
static void
lfs_init_vnode(struct ulfsmount *ump, ino_t ino, struct vnode *vp)
{
	struct lfs *fs = ump->um_lfs;
	struct inode *ip;
	union lfs_dinode *dp;

	ASSERT_NO_SEGLOCK(fs);

	/* Initialize the inode. */
	ip = pool_get(&lfs_inode_pool, PR_WAITOK);
	memset(ip, 0, sizeof(*ip));
	dp = pool_get(&lfs_dinode_pool, PR_WAITOK);
	memset(dp, 0, sizeof(*dp));
	ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK);
	memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs));
	ip->i_din = dp;
	ip->i_ump = ump;
	ip->i_vnode = vp;
	ip->i_dev = fs->lfs_dev;
	lfs_dino_setinumber(fs, dp, ino);
	ip->i_number = ino;
	ip->i_lfs = fs;
	ip->i_lfs_effnblks = 0;
	SPLAY_INIT(&ip->i_lfs_lbtree);
	ip->i_lfs_nbtree = 0;
	LIST_INIT(&ip->i_lfs_segdhd);

	vp->v_tag = VT_LFS;
	vp->v_op = lfs_vnodeop_p;
	vp->v_data = ip;
}

/*
 * Undo lfs_init_vnode().
 */
static void
lfs_deinit_vnode(struct ulfsmount *ump, struct vnode *vp)
{
	struct inode *ip = VTOI(vp);

	/* Return the three pool allocations made by lfs_init_vnode(). */
	pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
	pool_put(&lfs_dinode_pool, ip->i_din);
	pool_put(&lfs_inode_pool, ip);
	vp->v_data = NULL;
}

/*
 * Read an inode from disk and initialize this vnode / inode pair.
 * Caller assures no other thread will try to load this inode.
 *
 * The vcache key is the inode number; on success *new_key points at
 * the incore copy of it.  Returns 0, ENOENT for an unallocated inode,
 * or an I/O / copyin errno.
 */
int
lfs_loadvnode(struct mount *mp, struct vnode *vp,
    const void *key, size_t key_len, const void **new_key)
{
	struct lfs *fs;
	union lfs_dinode *dip;
	struct inode *ip;
	struct buf *bp;
	IFILE *ifp;
	struct ulfsmount *ump;
	ino_t ino;
	daddr_t daddr;
	int error, retries;
	struct timespec ts;

	KASSERT(key_len == sizeof(ino));
	memcpy(&ino, key, key_len);

	memset(&ts, 0, sizeof ts);	/* XXX gcc */

	ump = VFSTOULFS(mp);
	fs = ump->um_lfs;

	/*
	 * If the filesystem is not completely mounted yet, suspend
	 * any access requests (wait for roll-forward to complete).
	 */
	mutex_enter(&lfs_lock);
	while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid)
		mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0,
			&lfs_lock);
	mutex_exit(&lfs_lock);

	/* Translate the inode number to a disk address. */
	if (ino == LFS_IFILE_INUM)
		daddr = lfs_sb_getidaddr(fs);
	else {
		/* XXX bounds-check this too */
		LFS_IENTRY(ifp, fs, ino, bp);
		daddr = lfs_if_getdaddr(fs, ifp);
		if (lfs_sb_getversion(fs) > 1) {
			/* v2+ keeps atime in the ifile; applied at "out:". */
			ts.tv_sec = lfs_if_getatime_sec(fs, ifp);
			ts.tv_nsec = lfs_if_getatime_nsec(fs, ifp);
		}

		brelse(bp, 0);
		if (daddr == LFS_UNUSED_DADDR)
			return (ENOENT);
	}

	/* Allocate/init new vnode/inode. */
	lfs_init_vnode(ump, ino, vp);
	ip = VTOI(vp);

	/* If the cleaner supplied the inode, use it. */
	if (curlwp == fs->lfs_cleaner_thread && fs->lfs_cleaner_hint != NULL &&
	    fs->lfs_cleaner_hint->bi_lbn == LFS_UNUSED_LBN) {
		/* bi_bp is a userland pointer from the cleaner; copyin. */
		dip = fs->lfs_cleaner_hint->bi_bp;
		if (fs->lfs_is64) {
			error = copyin(dip, &ip->i_din->u_64,
				       sizeof(struct lfs64_dinode));
		} else {
			error = copyin(dip, &ip->i_din->u_32,
				       sizeof(struct lfs32_dinode));
		}
		if (error) {
			lfs_deinit_vnode(ump, vp);
			return error;
		}
		KASSERT(ip->i_number == ino);
		goto out;
	}

	/* Read in the disk contents for the inode, copy into the inode. */
	retries = 0;
again:
	error = bread(fs->lfs_devvp, LFS_FSBTODB(fs, daddr),
	    (lfs_sb_getversion(fs) == 1 ? lfs_sb_getbsize(fs) : lfs_sb_getibsize(fs)),
	    0, &bp);
	if (error) {
		lfs_deinit_vnode(ump, vp);
		return error;
	}

	dip = lfs_ifind(fs, ino, bp);
	if (dip == NULL) {
		/* Assume write has not completed yet; try again */
		brelse(bp, BC_INVAL);
		++retries;
		if (retries <= LFS_IFIND_RETRIES) {
			mutex_enter(&lfs_lock);
			if (fs->lfs_iocount) {
				/*
				 * NOTE(review): "%d" does not match ino_t
				 * here (debug-only message) -- verify the
				 * expected width of ino_t for this port.
				 */
				DLOG((DLOG_VNODE,
				      "%s: dinode %d not found, retrying...\n",
				      __func__, ino));
				(void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
					      "lfs ifind", 1, &lfs_lock);
			} else
				retries = LFS_IFIND_RETRIES;
			mutex_exit(&lfs_lock);
			goto again;
		}
#ifdef DEBUG
		/* If the seglock is held look at the bpp to see
		   what is there anyway */
		mutex_enter(&lfs_lock);
		if (fs->lfs_seglock > 0) {
			struct buf **bpp;
			union lfs_dinode *dp;
			int i;

			for (bpp = fs->lfs_sp->bpp;
			     bpp != fs->lfs_sp->cbpp; ++bpp) {
				if ((*bpp)->b_vp == fs->lfs_ivnode &&
				    bpp != fs->lfs_sp->bpp) {
					/* Inode block */
					printf("%s: block 0x%" PRIx64 ": ",
					       __func__, (*bpp)->b_blkno);
					for (i = 0; i < LFS_INOPB(fs); i++) {
						dp = DINO_IN_BLOCK(fs,
						    (*bpp)->b_data, i);
						if (lfs_dino_getinumber(fs, dp))
							printf("%ju ",
							    (uintmax_t)lfs_dino_getinumber(fs, dp));
					}
					printf("\n");
				}
			}
		}
		mutex_exit(&lfs_lock);
#endif /* DEBUG */
		panic("lfs_loadvnode: dinode not found");
	}
	lfs_copy_dinode(fs, ip->i_din, dip);
	brelse(bp, 0);

out:
	if (lfs_sb_getversion(fs) > 1) {
		lfs_dino_setatime(fs, ip->i_din, ts.tv_sec);
		lfs_dino_setatimensec(fs, ip->i_din, ts.tv_nsec);
	}

	lfs_vinit(mp, &vp);

	*new_key = &ip->i_number;
	return 0;
}

/*
 * Create a new inode and initialize this vnode / inode pair.
 */
int
lfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
    struct vattr *vap, kauth_cred_t cred,
    size_t *key_len, const void **new_key)
{
	ino_t ino;
	struct inode *ip;
	struct ulfsmount *ump;
	struct lfs *fs;
	int error, mode, gen;

	/*
	 * NOTE(review): the second KASSERT requires dvp != NULL
	 * unconditionally, which makes the "dvp == NULL" branch below
	 * (and the first KASSERT's alternative) unreachable under
	 * DIAGNOSTIC -- confirm whether it should be
	 * (dvp == NULL || dvp->v_mount == mp).
	 */
	KASSERT(dvp != NULL || vap->va_fileid > 0);
	KASSERT(dvp != NULL && dvp->v_mount == mp);
	KASSERT(vap->va_type != VNON);

	*key_len = sizeof(ino);
	ump = VFSTOULFS(mp);
	fs = ump->um_lfs;
	mode = MAKEIMODE(vap->va_type, vap->va_mode);

	/*
	 * Allocate fresh inode.  With "dvp == NULL" take the inode number
	 * and version from "vap".
	 */
	if (dvp == NULL) {
		ino = vap->va_fileid;
		gen = vap->va_gen;
		error = lfs_valloc_fixed(fs, ino, gen);
	} else {
		error = lfs_valloc(dvp, mode, cred, &ino, &gen);
	}
	if (error)
		return error;

	/* Attach inode to vnode. */
	lfs_init_vnode(ump, ino, vp);
	ip = VTOI(vp);

	mutex_enter(&lfs_lock);
	LFS_SET_UINO(ip, IN_CHANGE);
	mutex_exit(&lfs_lock);

	/* Note no blocks yet */
	ip->i_lfs_hiblk = -1;

	/* Set a new generation number for this inode. */
	ip->i_gen = gen;
	lfs_dino_setgen(fs, ip->i_din, gen);

	memset(ip->i_lfs_fragsize, 0,
	    ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize));

	/* Set uid / gid. */
	if (cred == NOCRED || cred == FSCRED) {
		ip->i_gid = 0;
		ip->i_uid = 0;
	} else {
		ip->i_gid = VTOI(dvp)->i_gid;
		ip->i_uid = kauth_cred_geteuid(cred);
	}
	DIP_ASSIGN(ip, gid, ip->i_gid);
	DIP_ASSIGN(ip, uid, ip->i_uid);

#if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
	error = lfs_chkiq(ip, 1, cred, 0);
	if (error) {
		/* Quota rejected the new inode: free it and unwind. */
		lfs_vfree(dvp, ino, mode);
		lfs_deinit_vnode(ump, vp);

		return error;
	}
#endif

	/* Set type and finalize. */
	ip->i_flags = 0;
	DIP_ASSIGN(ip, flags, 0);
	ip->i_mode = mode;
	DIP_ASSIGN(ip, mode, mode);
	if (vap->va_rdev != VNOVAL) {
		/*
		 * Want to be able to use this to make badblock
		 * inodes, so don't truncate the dev number.
		 */
		// XXX clean this up
		if (ump->um_fstype == ULFS1)
			ip->i_din->u_32.di_rdev = ulfs_rw32(vap->va_rdev,
			    ULFS_MPNEEDSWAP(fs));
		else
			ip->i_din->u_64.di_rdev = ulfs_rw64(vap->va_rdev,
			    ULFS_MPNEEDSWAP(fs));
	}
	lfs_vinit(mp, &vp);

	*new_key = &ip->i_number;
	return 0;
}

/*
 * File handle to vnode
 *
 * Validates the handle's length, identity stamp, and inode number
 * (which must fall within the ifile's inode-entry region) before
 * handing off to ulfs_fhtovp.  Returns EINVAL/ESTALE on bad handles.
 */
int
lfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
{
	struct lfid lfh;
	struct lfs *fs;

	if (fhp->fid_len != sizeof(struct lfid))
		return EINVAL;

	memcpy(&lfh, fhp, sizeof(lfh));
	if (lfh.lfid_ino < LFS_IFILE_INUM)
		return ESTALE;

	fs = VFSTOULFS(mp)->um_lfs;
	if (lfh.lfid_ident != lfs_sb_getident(fs))
		return ESTALE;

	/* Reject inode numbers beyond the ifile's current capacity. */
	if (lfh.lfid_ino >
	    ((lfs_dino_getsize(fs, VTOI(fs->lfs_ivnode)->i_din) >> lfs_sb_getbshift(fs)) -
	     lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs))
		return ESTALE;

	return (ulfs_fhtovp(mp, &lfh.lfid_ufid, vpp));
}

/*
 * Vnode pointer to File handle
 */
/* ARGSUSED */
int
lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
	struct inode *ip;
	struct lfid lfh;

	/* Report the needed size if the caller's buffer is too small. */
	if (*fh_size < sizeof(struct lfid)) {
		*fh_size = sizeof(struct lfid);
		return E2BIG;
	}
	*fh_size = sizeof(struct lfid);
	ip = VTOI(vp);
	memset(&lfh, 0, sizeof(lfh));
	lfh.lfid_len = sizeof(struct lfid);
	lfh.lfid_ino = ip->i_number;
	lfh.lfid_gen = ip->i_gen;
	lfh.lfid_ident = lfs_sb_getident(ip->i_lfs);
	memcpy(fhp, &lfh, sizeof(lfh));
	return (0);
}

/*
 * ulfs_bmaparray callback function for writing.
 *
 * Since blocks will be written to the new segment anyway,
 * we don't care about current daddr of them.
 */
static bool
lfs_issequential_hole(const struct lfs *fs,
    daddr_t daddr0, daddr_t daddr1)
{
	(void)fs; /* not used */

	KASSERT(daddr0 == UNWRITTEN ||
	    (0 <= daddr0 && daddr0 <= LFS_MAX_DADDR(fs)));
	KASSERT(daddr1 == UNWRITTEN ||
	    (0 <= daddr1 && daddr1 <= LFS_MAX_DADDR(fs)));

	/* NOTE: all we want to know here is 'hole or not'. */
	/* NOTE: UNASSIGNED is converted to 0 by ulfs_bmaparray. */

	/*
	 * treat UNWRITTENs and all resident blocks as 'contiguous'
	 */
	if (daddr0 != 0 && daddr1 != 0)
		return true;

	/*
	 * both are in hole?
	 */
	if (daddr0 == 0 && daddr1 == 0)
		return true; /* all holes are 'contiguous' for us. */

	return false;
}

/*
 * lfs_gop_write functions exactly like genfs_gop_write, except that
 * (1) it requires the seglock to be held by its caller, and sp->fip
 *     to be properly initialized (it will return without re-initializing
 *     sp->fip, and without calling lfs_writeseg).
 * (2) it uses the remaining space in the segment, rather than VOP_BMAP,
 *     to determine how large a block it can write at once (though it does
 *     still use VOP_BMAP to find holes in the file);
 * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
 *     (leaving lfs_writeseg to deal with the cluster blocks, so we might
 *     now have clusters of clusters, ick.)
 */
static int
lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
    int flags)
{
	int i, error, run, haveeof = 0;
	int fs_bshift;
	vaddr_t kva;
	off_t eof, offset, startoffset = 0;
	size_t bytes, iobytes, skipbytes;
	bool async = (flags & PGO_SYNCIO) == 0;
	daddr_t lbn, blkno;
	struct vm_page *pg;
	struct buf *mbp, *bp;
	struct vnode *devvp = VTOI(vp)->i_devvp;
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;
	struct segment *sp = fs->lfs_sp;
	SEGSUM *ssp;
	UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
	const char * failreason = NULL;

	ASSERT_SEGLOCK(fs);

	/* The Ifile lives in the buffer cache */
	KASSERT(vp != fs->lfs_ivnode);

	/*
	 * We don't want to fill the disk before the cleaner has a chance
	 * to make room for us.  If we're in danger of doing that, fail
	 * with EAGAIN.  The caller will have to notice this, unlock
	 * so the cleaner can run, relock and try again.
	 *
	 * We must write everything, however, if our vnode is being
	 * reclaimed.
	 */
	mutex_enter(vp->v_interlock);
	if (LFS_STARVED_FOR_SEGS(fs) && vdead_check(vp, VDEAD_NOWAIT) == 0) {
		mutex_exit(vp->v_interlock);
		failreason = "Starved for segs and not flushing vp";
		goto tryagain;
	}
	mutex_exit(vp->v_interlock);

	/*
	 * Sometimes things slip past the filters in lfs_putpages,
	 * and the pagedaemon tries to write pages---problem is
	 * that the pagedaemon never acquires the segment lock.
	 *
	 * Alternatively, pages that were clean when we called
	 * genfs_putpages may have become dirty in the meantime.  In this
	 * case the segment header is not properly set up for blocks
	 * to be added to it.
	 *
	 * Unbusy and unclean the pages, and put them on the ACTIVE
	 * queue under the hypothesis that they couldn't have got here
	 * unless they were modified *quite* recently.
	 *
	 * XXXUBC that last statement is an oversimplification of course.
	 */
	if (!LFS_SEGLOCK_HELD(fs)) {
		failreason = "Seglock not held";
		goto tryagain;
	}
	if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
		failreason = "Inode with no_gop_write";
		goto tryagain;
	}
	if ((pgs[0]->offset & lfs_sb_getbmask(fs)) != 0) {
		failreason = "Bad page offset";
		goto tryagain;
	}

	UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
	    (uintptr_t)vp, (uintptr_t)pgs, npages, flags);

	GOP_SIZE(vp, vp->v_size, &eof, 0);
	haveeof = 1;

	if (vp->v_type == VREG)
		fs_bshift = vp->v_mount->mnt_fs_bshift;
	else
		fs_bshift = DEV_BSHIFT;
	error = 0;
	pg = pgs[0];
	startoffset = pg->offset;
	KASSERT(eof >= 0);

	if (startoffset >= eof) {
		failreason = "Offset beyond EOF";
		goto tryagain;
	} else
		bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
	skipbytes = 0;

	KASSERT(bytes != 0);

	/* Swap PG_DELWRI for PG_PAGEOUT */
	for (i = 0; i < npages; i++) {
		if (pgs[i]->flags & PG_DELWRI) {
			KASSERT(!(pgs[i]->flags & PG_PAGEOUT));
			pgs[i]->flags &= ~PG_DELWRI;
			pgs[i]->flags |= PG_PAGEOUT;
			uvm_pageout_start(1);
			mutex_enter(vp->v_interlock);
			mutex_enter(&uvm_pageqlock);
			uvm_pageunwire(pgs[i]);
			mutex_exit(&uvm_pageqlock);
			mutex_exit(vp->v_interlock);
		}
	}

	/*
	 * Check to make sure we're starting on a block boundary.
	 * We'll check later to make sure we always write entire
	 * blocks (or fragments).
	 */
	if (startoffset & lfs_sb_getbmask(fs))
		printf("%" PRId64 " & %" PRIu64 " = %" PRId64 "\n",
		       startoffset, lfs_sb_getbmask(fs),
		       startoffset & lfs_sb_getbmask(fs));
	KASSERT((startoffset & lfs_sb_getbmask(fs)) == 0);
	if (bytes & lfs_sb_getffmask(fs)) {
		printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes);
		panic("lfs_gop_write: non-integer blocks");
	}

	/*
	 * We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK.
	 * If we would, write what we have and try again.  If we don't
	 * have anything to write, we'll have to sleep.
	 */
	ssp = (SEGSUM *)sp->segsum;
	if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
				  (lfs_ss_getnfinfo(fs, ssp) < 1 ?
				   UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) {
		DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n"));
#if 0
		      " with nfinfo=%d at offset 0x%jx\n",
		      (int)lfs_ss_getnfinfo(fs, ssp),
		      (uintmax_t)lfs_sb_getoffset(fs)));
#endif
		lfs_updatemeta(sp);
		lfs_release_finfo(fs);
		(void) lfs_writeseg(fs, sp);

		lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);

		/*
		 * Having given up all of the pager_map we were holding,
		 * we can now wait for aiodoned to reclaim it for us
		 * without fear of deadlock.
		 */
		kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
				     UVMPAGER_MAPIN_WAITOK);
	}

	/* Master buffer covering the whole mapped run. */
	mbp = getiobuf(NULL, true);
	UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
	    (uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
	mbp->b_bufsize = npages << PAGE_SHIFT;
	mbp->b_data = (void *)kva;
	mbp->b_resid = mbp->b_bcount = bytes;
	mbp->b_cflags = BC_BUSY|BC_AGE;
	mbp->b_iodone = uvm_aio_biodone;

	bp = NULL;
	for (offset = startoffset;
	     bytes > 0;
	     offset += iobytes, bytes -= iobytes) {
		lbn = offset >> fs_bshift;
		error = ulfs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run,
		    lfs_issequential_hole);
		if (error) {
			UVMHIST_LOG(ubchist, "ulfs_bmaparray() -> %jd",
			    error,0,0,0);
			skipbytes += bytes;
			bytes = 0;
			break;
		}

		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
		    bytes);
		if (blkno == (daddr_t)-1) {
			/* Hole in the file: nothing to write here. */
			skipbytes += iobytes;
			continue;
		}

		/*
		 * Discover how much we can really pack into this buffer.
		 */
		/* If no room in the current segment, finish it up */
		if (sp->sum_bytes_left < sizeof(int32_t) ||
		    sp->seg_bytes_left < (1 << lfs_sb_getbshift(fs))) {
			int vers;

			lfs_updatemeta(sp);
			vers = lfs_fi_getversion(fs, sp->fip);
			lfs_release_finfo(fs);
			(void) lfs_writeseg(fs, sp);

			lfs_acquire_finfo(fs, ip->i_number, vers);
		}
		/* Check both for space in segment and space in segsum */
		iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift)
		    << fs_bshift);
		iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t))
		    << fs_bshift);
		KASSERT(iobytes > 0);

		/* if it's really one i/o, don't make a second buf */
		if (offset == startoffset && iobytes == bytes) {
			bp = mbp;
			/*
			 * All the LFS output is done by the segwriter.  It
			 * will increment numoutput by one for all the bufs it
			 * recieves.  However this buffer needs one extra to
			 * account for aiodone.
			 */
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(vp->v_interlock);
		} else {
			bp = getiobuf(NULL, true);
			UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
			    (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
			nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
			/*
			 * LFS doesn't like async I/O here, dies with
			 * an assert in lfs_bwrite().  Is that assert
			 * valid?  I retained non-async behaviour when
			 * converted this to use nestiobuf --pooka
			 */
			bp->b_flags &= ~B_ASYNC;
		}

		/* XXX This is silly ... is this necessary? */
		mutex_enter(&bufcache_lock);
		mutex_enter(vp->v_interlock);
		bgetvp(vp, bp);
		mutex_exit(vp->v_interlock);
		mutex_exit(&bufcache_lock);

		bp->b_lblkno = lfs_lblkno(fs, offset);
		bp->b_private = mbp;
		if (devvp->v_type == VBLK) {
			bp->b_dev = devvp->v_rdev;
		}
		VOP_BWRITE(bp->b_vp, bp);
		while (lfs_gatherblock(sp, bp, NULL))
			continue;
	}

	/* Account the skipped (hole/error) bytes against the master buf. */
	nestiobuf_done(mbp, skipbytes, error);
	if (skipbytes) {
		UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
	}
	UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0);

	if (!async) {
		/* Start a segment write. */
		UVMHIST_LOG(ubchist, "flushing", 0,0,0,0);
		mutex_enter(&lfs_lock);
		lfs_flush(fs, 0, 1);
		mutex_exit(&lfs_lock);
	}

	if ((sp->seg_flags & SEGM_SINGLE) && lfs_sb_getcurseg(fs) != fs->lfs_startseg)
		return EAGAIN;

	return (0);

 tryagain:
	/*
	 * We can't write the pages, for whatever reason.
	 * Clean up after ourselves, and make the caller try again.
	 */
	mutex_enter(vp->v_interlock);

	/* Tell why we're here, if we know */
	if (failreason != NULL) {
		DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason));
	}
	if (haveeof && startoffset >= eof) {
		DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
		      " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
		      pgs[0]->offset, eof, npages));
	}

	/* Undo the PG_DELWRI -> PG_PAGEOUT swap and reactivate the pages. */
	mutex_enter(&uvm_pageqlock);
	for (i = 0; i < npages; i++) {
		pg = pgs[i];

		if (pg->flags & PG_PAGEOUT)
			uvm_pageout_done(1);
		if (pg->flags & PG_DELWRI) {
			uvm_pageunwire(pg);
		}
		uvm_pageactivate(pg);
		pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
		DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
		      vp, pg->offset));
		DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags));
		DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags));
		DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon));
		DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject));
		DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i,
		      pg->wire_count));
		DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i,
		      pg->loan_count));
	}
	/* uvm_pageunbusy takes care of PG_BUSY, PG_WANTED */
	uvm_page_unbusy(pgs, npages);
	mutex_exit(&uvm_pageqlock);
	mutex_exit(vp->v_interlock);
	return EAGAIN;
}

/*
 * finish vnode/inode initialization.
 * used by lfs_vget.
2273 */ 2274 void 2275 lfs_vinit(struct mount *mp, struct vnode **vpp) 2276 { 2277 struct vnode *vp = *vpp; 2278 struct inode *ip = VTOI(vp); 2279 struct ulfsmount *ump = VFSTOULFS(mp); 2280 struct lfs *fs = ump->um_lfs; 2281 int i; 2282 2283 ip->i_mode = lfs_dino_getmode(fs, ip->i_din); 2284 ip->i_nlink = lfs_dino_getnlink(fs, ip->i_din); 2285 ip->i_lfs_osize = ip->i_size = lfs_dino_getsize(fs, ip->i_din); 2286 ip->i_flags = lfs_dino_getflags(fs, ip->i_din); 2287 ip->i_gen = lfs_dino_getgen(fs, ip->i_din); 2288 ip->i_uid = lfs_dino_getuid(fs, ip->i_din); 2289 ip->i_gid = lfs_dino_getgid(fs, ip->i_din); 2290 2291 ip->i_lfs_effnblks = lfs_dino_getblocks(fs, ip->i_din); 2292 ip->i_lfs_odnlink = lfs_dino_getnlink(fs, ip->i_din); 2293 2294 /* 2295 * Initialize the vnode from the inode, check for aliases. In all 2296 * cases re-init ip, the underlying vnode/inode may have changed. 2297 */ 2298 ulfs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp); 2299 ip = VTOI(vp); 2300 2301 memset(ip->i_lfs_fragsize, 0, ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize)); 2302 if (vp->v_type != VLNK || ip->i_size >= ip->i_lfs->um_maxsymlinklen) { 2303 #ifdef DEBUG 2304 for (i = (ip->i_size + lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs); 2305 i < ULFS_NDADDR; i++) { 2306 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 2307 i == 0) 2308 continue; 2309 if (lfs_dino_getdb(fs, ip->i_din, i) != 0) { 2310 lfs_dump_dinode(fs, ip->i_din); 2311 panic("inconsistent inode (direct)"); 2312 } 2313 } 2314 for ( ; i < ULFS_NDADDR + ULFS_NIADDR; i++) { 2315 if (lfs_dino_getib(fs, ip->i_din, i - ULFS_NDADDR) != 0) { 2316 lfs_dump_dinode(fs, ip->i_din); 2317 panic("inconsistent inode (indirect)"); 2318 } 2319 } 2320 #endif /* DEBUG */ 2321 for (i = 0; i < ULFS_NDADDR; i++) 2322 if (lfs_dino_getdb(fs, ip->i_din, i) != 0) 2323 ip->i_lfs_fragsize[i] = lfs_blksize(fs, ip, i); 2324 } 2325 2326 KASSERTMSG((vp->v_type != VNON), 2327 "lfs_vinit: ino %llu is type VNON! 
(ifmt=%o)\n", 2328 (unsigned long long)ip->i_number, 2329 (ip->i_mode & LFS_IFMT) >> 12); 2330 2331 /* 2332 * Finish inode initialization now that aliasing has been resolved. 2333 */ 2334 2335 ip->i_devvp = fs->lfs_devvp; 2336 vref(ip->i_devvp); 2337 #if defined(LFS_QUOTA) || defined(LFS_QUOTA2) 2338 ulfsquota_init(ip); 2339 #endif 2340 genfs_node_init(vp, &lfs_genfsops); 2341 uvm_vnp_setsize(vp, ip->i_size); 2342 2343 /* Initialize hiblk from file size */ 2344 ip->i_lfs_hiblk = lfs_lblkno(ip->i_lfs, ip->i_size + lfs_sb_getbsize(ip->i_lfs) - 1) - 1; 2345 2346 *vpp = vp; 2347 } 2348 2349 /* 2350 * Resize the filesystem to contain the specified number of segments. 2351 */ 2352 int 2353 lfs_resize_fs(struct lfs *fs, int newnsegs) 2354 { 2355 SEGUSE *sup; 2356 CLEANERINFO *cip; 2357 struct buf *bp, *obp; 2358 daddr_t olast, nlast, ilast, noff, start, end; 2359 struct vnode *ivp; 2360 struct inode *ip; 2361 int error, badnews, inc, oldnsegs; 2362 int sbbytes, csbbytes, gain, cgain; 2363 int i; 2364 2365 /* Only support v2 and up */ 2366 if (lfs_sb_getversion(fs) < 2) 2367 return EOPNOTSUPP; 2368 2369 /* If we're doing nothing, do it fast */ 2370 oldnsegs = lfs_sb_getnseg(fs); 2371 if (newnsegs == oldnsegs) 2372 return 0; 2373 2374 /* We always have to have two superblocks */ 2375 if (newnsegs <= lfs_dtosn(fs, lfs_sb_getsboff(fs, 1))) 2376 /* XXX this error code is rather nonsense */ 2377 return EFBIG; 2378 2379 ivp = fs->lfs_ivnode; 2380 ip = VTOI(ivp); 2381 error = 0; 2382 2383 /* Take the segment lock so no one else calls lfs_newseg() */ 2384 lfs_seglock(fs, SEGM_PROT); 2385 2386 /* 2387 * Make sure the segments we're going to be losing, if any, 2388 * are in fact empty. We hold the seglock, so their status 2389 * cannot change underneath us. Count the superblocks we lose, 2390 * while we're at it. 
2391 */ 2392 sbbytes = csbbytes = 0; 2393 cgain = 0; 2394 for (i = newnsegs; i < oldnsegs; i++) { 2395 LFS_SEGENTRY(sup, fs, i, bp); 2396 badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL); 2397 if (sup->su_flags & SEGUSE_SUPERBLOCK) 2398 sbbytes += LFS_SBPAD; 2399 if (!(sup->su_flags & SEGUSE_DIRTY)) { 2400 ++cgain; 2401 if (sup->su_flags & SEGUSE_SUPERBLOCK) 2402 csbbytes += LFS_SBPAD; 2403 } 2404 brelse(bp, 0); 2405 if (badnews) { 2406 error = EBUSY; 2407 goto out; 2408 } 2409 } 2410 2411 /* Note old and new segment table endpoints, and old ifile size */ 2412 olast = lfs_sb_getcleansz(fs) + lfs_sb_getsegtabsz(fs); 2413 nlast = howmany(newnsegs, lfs_sb_getsepb(fs)) + lfs_sb_getcleansz(fs); 2414 ilast = ivp->v_size >> lfs_sb_getbshift(fs); 2415 noff = nlast - olast; 2416 2417 /* 2418 * Make sure no one can use the Ifile while we change it around. 2419 * Even after taking the iflock we need to make sure no one still 2420 * is holding Ifile buffers, so we get each one, to drain them. 2421 * (XXX this could be done better.) 2422 */ 2423 rw_enter(&fs->lfs_iflock, RW_WRITER); 2424 for (i = 0; i < ilast; i++) { 2425 /* XXX what to do if bread fails? 
*/ 2426 bread(ivp, i, lfs_sb_getbsize(fs), 0, &bp); 2427 brelse(bp, 0); 2428 } 2429 2430 /* Allocate new Ifile blocks */ 2431 for (i = ilast; i < ilast + noff; i++) { 2432 if (lfs_balloc(ivp, i * lfs_sb_getbsize(fs), lfs_sb_getbsize(fs), NOCRED, 0, 2433 &bp) != 0) 2434 panic("balloc extending ifile"); 2435 memset(bp->b_data, 0, lfs_sb_getbsize(fs)); 2436 VOP_BWRITE(bp->b_vp, bp); 2437 } 2438 2439 /* Register new ifile size */ 2440 ip->i_size += noff * lfs_sb_getbsize(fs); 2441 lfs_dino_setsize(fs, ip->i_din, ip->i_size); 2442 uvm_vnp_setsize(ivp, ip->i_size); 2443 2444 /* Copy the inode table to its new position */ 2445 if (noff != 0) { 2446 if (noff < 0) { 2447 start = nlast; 2448 end = ilast + noff; 2449 inc = 1; 2450 } else { 2451 start = ilast + noff - 1; 2452 end = nlast - 1; 2453 inc = -1; 2454 } 2455 for (i = start; i != end; i += inc) { 2456 if (bread(ivp, i, lfs_sb_getbsize(fs), 2457 B_MODIFY, &bp) != 0) 2458 panic("resize: bread dst blk failed"); 2459 if (bread(ivp, i - noff, lfs_sb_getbsize(fs), 2460 0, &obp)) 2461 panic("resize: bread src blk failed"); 2462 memcpy(bp->b_data, obp->b_data, lfs_sb_getbsize(fs)); 2463 VOP_BWRITE(bp->b_vp, bp); 2464 brelse(obp, 0); 2465 } 2466 } 2467 2468 /* If we are expanding, write the new empty SEGUSE entries */ 2469 if (newnsegs > oldnsegs) { 2470 for (i = oldnsegs; i < newnsegs; i++) { 2471 if ((error = bread(ivp, i / lfs_sb_getsepb(fs) + 2472 lfs_sb_getcleansz(fs), lfs_sb_getbsize(fs), 2473 B_MODIFY, &bp)) != 0) 2474 panic("lfs: ifile read: %d", error); 2475 while ((i + 1) % lfs_sb_getsepb(fs) && i < newnsegs) { 2476 sup = &((SEGUSE *)bp->b_data)[i % lfs_sb_getsepb(fs)]; 2477 memset(sup, 0, sizeof(*sup)); 2478 i++; 2479 } 2480 VOP_BWRITE(bp->b_vp, bp); 2481 } 2482 } 2483 2484 /* Zero out unused superblock offsets */ 2485 for (i = 2; i < LFS_MAXNUMSB; i++) 2486 if (lfs_dtosn(fs, lfs_sb_getsboff(fs, i)) >= newnsegs) 2487 lfs_sb_setsboff(fs, i, 0x0); 2488 2489 /* 2490 * Correct superblock entries that depend on fs size. 
2491 * The computations of these are as follows: 2492 * 2493 * size = lfs_segtod(fs, nseg) 2494 * dsize = lfs_segtod(fs, nseg - minfreeseg) - lfs_btofsb(#super * LFS_SBPAD) 2495 * bfree = dsize - lfs_btofsb(fs, bsize * nseg / 2) - blocks_actually_used 2496 * avail = lfs_segtod(fs, nclean) - lfs_btofsb(#clean_super * LFS_SBPAD) 2497 * + (lfs_segtod(fs, 1) - (offset - curseg)) 2498 * - lfs_segtod(fs, minfreeseg - (minfreeseg / 2)) 2499 * 2500 * XXX - we should probably adjust minfreeseg as well. 2501 */ 2502 gain = (newnsegs - oldnsegs); 2503 lfs_sb_setnseg(fs, newnsegs); 2504 lfs_sb_setsegtabsz(fs, nlast - lfs_sb_getcleansz(fs)); 2505 lfs_sb_addsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs))); 2506 lfs_sb_adddsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes)); 2507 lfs_sb_addbfree(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes) 2508 - gain * lfs_btofsb(fs, lfs_sb_getbsize(fs) / 2)); 2509 if (gain > 0) { 2510 lfs_sb_addnclean(fs, gain); 2511 lfs_sb_addavail(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs))); 2512 } else { 2513 lfs_sb_subnclean(fs, cgain); 2514 lfs_sb_subavail(fs, cgain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - 2515 lfs_btofsb(fs, csbbytes)); 2516 } 2517 2518 /* Resize segment flag cache */ 2519 fs->lfs_suflags[0] = realloc(fs->lfs_suflags[0], 2520 lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK); 2521 fs->lfs_suflags[1] = realloc(fs->lfs_suflags[1], 2522 lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK); 2523 for (i = oldnsegs; i < newnsegs; i++) 2524 fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0; 2525 2526 /* Truncate Ifile if necessary */ 2527 if (noff < 0) 2528 lfs_truncate(ivp, ivp->v_size + (noff << lfs_sb_getbshift(fs)), 0, 2529 NOCRED); 2530 2531 /* Update cleaner info so the cleaner can die */ 2532 /* XXX what to do if bread fails? 
 */
	bread(ivp, 0, lfs_sb_getbsize(fs), B_MODIFY, &bp);
	cip = bp->b_data;
	/* Publish the new clean/dirty segment counts in the Ifile. */
	lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
	lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
	VOP_BWRITE(bp->b_vp, bp);

	/* Let Ifile accesses proceed */
	rw_exit(&fs->lfs_iflock);

    out:
	lfs_segunlock(fs);
	return error;
}

/*
 * Extended attribute dispatch.
 *
 * When LFS_EXTATTR is configured, ULFS1 volumes use the native ulfs
 * extattr implementation; everything else (including ULFS2, and all
 * volumes when LFS_EXTATTR is not configured) falls through to the
 * generic vfs_stdextattrctl().
 */
int
lfs_extattrctl(struct mount *mp, int cmd, struct vnode *vp,
	       int attrnamespace, const char *attrname)
{
#ifdef LFS_EXTATTR
	struct ulfsmount *ump;

	ump = VFSTOULFS(mp);
	if (ump->um_fstype == ULFS1) {
		return ulfs_extattrctl(mp, cmd, vp, attrnamespace, attrname);
	}
#endif
	return vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname);
}