1 /* $NetBSD: lfs_vfsops.c,v 1.143 2004/01/28 20:57:15 he Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 /*- 39 * Copyright (c) 1989, 1991, 1993, 1994 40 * The Regents of the University of California. All rights reserved. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95 67 */ 68 69 #include <sys/cdefs.h> 70 __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.143 2004/01/28 20:57:15 he Exp $"); 71 72 #if defined(_KERNEL_OPT) 73 #include "opt_quota.h" 74 #endif 75 76 #include <sys/param.h> 77 #include <sys/systm.h> 78 #include <sys/namei.h> 79 #include <sys/proc.h> 80 #include <sys/kernel.h> 81 #include <sys/vnode.h> 82 #include <sys/mount.h> 83 #include <sys/kthread.h> 84 #include <sys/buf.h> 85 #include <sys/device.h> 86 #include <sys/mbuf.h> 87 #include <sys/file.h> 88 #include <sys/disklabel.h> 89 #include <sys/ioctl.h> 90 #include <sys/errno.h> 91 #include <sys/malloc.h> 92 #include <sys/pool.h> 93 #include <sys/socket.h> 94 #include <uvm/uvm_extern.h> 95 #include <sys/sysctl.h> 96 #include <sys/conf.h> 97 98 #include <miscfs/specfs/specdev.h> 99 100 #include <ufs/ufs/quota.h> 101 #include <ufs/ufs/inode.h> 102 #include <ufs/ufs/ufsmount.h> 103 #include <ufs/ufs/ufs_extern.h> 104 105 #include <uvm/uvm.h> 106 #include <uvm/uvm_stat.h> 107 #include <uvm/uvm_pager.h> 108 #include <uvm/uvm_pdaemon.h> 109 110 #include <ufs/lfs/lfs.h> 111 #include <ufs/lfs/lfs_extern.h> 112 113 #include <miscfs/genfs/genfs.h> 114 #include <miscfs/genfs/genfs_node.h> 115 116 static int lfs_gop_write(struct vnode *, struct vm_page **, int, int); 117 static boolean_t lfs_issequential_hole(const struct ufsmount *, 118 daddr_t, daddr_t); 119 120 static int lfs_mountfs(struct vnode *, struct mount *, struct proc *); 121 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t, 122 struct ucred *, int, int *, struct proc *); 123 124 extern const struct vnodeopv_desc lfs_vnodeop_opv_desc; 125 extern const struct vnodeopv_desc lfs_specop_opv_desc; 126 extern const struct vnodeopv_desc lfs_fifoop_opv_desc; 127 128 pid_t lfs_writer_daemon = 0; 129 int lfs_do_flush = 0; 130 131 const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = { 132 &lfs_vnodeop_opv_desc, 133 &lfs_specop_opv_desc, 134 &lfs_fifoop_opv_desc, 135 NULL, 136 }; 137 138 struct vfsops lfs_vfsops = { 139 MOUNT_LFS, 140 lfs_mount, 141 ufs_start, 142 lfs_unmount, 143 ufs_root, 144 ufs_quotactl, 145 lfs_statfs, 146 lfs_sync, 147 lfs_vget, 148 lfs_fhtovp, 149 lfs_vptofh, 150 lfs_init, 151 lfs_reinit, 152 lfs_done, 153 NULL, 154 lfs_mountroot, 155 ufs_check_export, 156 lfs_vnodeopv_descs, 157 }; 158 159 struct genfs_ops lfs_genfsops = { 160 lfs_gop_size, 161 ufs_gop_alloc, 162 lfs_gop_write, 163 }; 164 165 struct pool lfs_inode_pool; 166 struct pool lfs_dinode_pool; 167 struct pool lfs_inoext_pool; 168 169 /* 170 * The writer daemon. UVM keeps track of how many dirty pages we are holding 171 * in lfs_subsys_pages; the daemon flushes the filesystem when this value 172 * crosses the (user-defined) threshhold LFS_MAX_PAGES. 173 */ 174 static void 175 lfs_writerd(void *arg) 176 { 177 #ifdef LFS_PD 178 struct mount *mp, *nmp; 179 struct lfs *fs; 180 #endif 181 182 lfs_writer_daemon = curproc->p_pid; 183 184 simple_lock(&lfs_subsys_lock); 185 for (;;) { 186 ltsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", 0, 187 &lfs_subsys_lock); 188 189 #ifdef LFS_PD 190 /* 191 * Look through the list of LFSs to see if any of them 192 * have requested pageouts. 193 */ 194 simple_lock(&mountlist_slock); 195 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 196 mp = nmp) { 197 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { 198 nmp = CIRCLEQ_NEXT(mp, mnt_list); 199 continue; 200 } 201 if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS, 202 MFSNAMELEN) == 0) { 203 fs = VFSTOUFS(mp)->um_lfs; 204 if (fs->lfs_pdflush || 205 !TAILQ_EMPTY(&fs->lfs_pchainhd)) { 206 fs->lfs_pdflush = 0; 207 lfs_flush_fs(fs, 0); 208 } 209 } 210 211 simple_lock(&mountlist_slock); 212 nmp = CIRCLEQ_NEXT(mp, mnt_list); 213 vfs_unbusy(mp); 214 } 215 simple_unlock(&mountlist_slock); 216 #endif /* LFS_PD */ 217 218 /* 219 * If global state wants a flush, flush everything. 220 */ 221 simple_lock(&lfs_subsys_lock); 222 while (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS || 223 locked_queue_bytes > LFS_MAX_BYTES || 224 lfs_subsys_pages > LFS_MAX_PAGES) { 225 226 #ifdef DEBUG_LFS_FLUSH 227 if (lfs_do_flush) 228 printf("daemon: lfs_do_flush\n"); 229 if (locked_queue_count > LFS_MAX_BUFS) 230 printf("daemon: lqc = %d, max %d\n", 231 locked_queue_count, LFS_MAX_BUFS); 232 if (locked_queue_bytes > LFS_MAX_BYTES) 233 printf("daemon: lqb = %ld, max %ld\n", 234 locked_queue_bytes, LFS_MAX_BYTES); 235 if (lfs_subsys_pages > LFS_MAX_PAGES) 236 printf("daemon: lssp = %d, max %d\n", 237 lfs_subsys_pages, LFS_MAX_PAGES); 238 #endif /* DEBUG_LFS_FLUSH */ 239 lfs_flush(NULL, SEGM_WRITERD); 240 lfs_do_flush = 0; 241 } 242 } 243 /* NOTREACHED */ 244 } 245 246 /* 247 * Initialize the filesystem, most work done by ufs_init. 248 */ 249 void 250 lfs_init() 251 { 252 ufs_init(); 253 254 /* 255 * XXX Same structure as FFS inodes? Should we share a common pool? 256 */ 257 pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0, 258 "lfsinopl", &pool_allocator_nointr); 259 pool_init(&lfs_dinode_pool, sizeof(struct ufs1_dinode), 0, 0, 0, 260 "lfsdinopl", &pool_allocator_nointr); 261 pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0, 262 "lfsinoextpl", &pool_allocator_nointr); 263 #ifdef DEBUG 264 memset(lfs_log, 0, sizeof(lfs_log)); 265 #endif 266 simple_lock_init(&lfs_subsys_lock); 267 } 268 269 void 270 lfs_reinit() 271 { 272 ufs_reinit(); 273 } 274 275 void 276 lfs_done() 277 { 278 ufs_done(); 279 pool_destroy(&lfs_inode_pool); 280 pool_destroy(&lfs_inoext_pool); 281 } 282 283 /* 284 * Called by main() when ufs is going to be mounted as root. 285 */ 286 int 287 lfs_mountroot() 288 { 289 extern struct vnode *rootvp; 290 struct mount *mp; 291 struct proc *p = curproc; /* XXX */ 292 int error; 293 294 if (root_device->dv_class != DV_DISK) 295 return (ENODEV); 296 297 if (rootdev == NODEV) 298 return (ENODEV); 299 /* 300 * Get vnodes for swapdev and rootdev. 301 */ 302 if ((error = bdevvp(rootdev, &rootvp))) { 303 printf("lfs_mountroot: can't setup bdevvp's"); 304 return (error); 305 } 306 if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) { 307 vrele(rootvp); 308 return (error); 309 } 310 if ((error = lfs_mountfs(rootvp, mp, p))) { 311 mp->mnt_op->vfs_refcount--; 312 vfs_unbusy(mp); 313 free(mp, M_MOUNT); 314 vrele(rootvp); 315 return (error); 316 } 317 simple_lock(&mountlist_slock); 318 CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); 319 simple_unlock(&mountlist_slock); 320 (void)lfs_statfs(mp, &mp->mnt_stat, p); 321 vfs_unbusy(mp); 322 inittodr(VFSTOUFS(mp)->um_lfs->lfs_tstamp); 323 return (0); 324 } 325 326 /* 327 * VFS Operations. 328 * 329 * mount system call 330 */ 331 int 332 lfs_mount(struct mount *mp, const char *path, void *data, struct nameidata *ndp, struct proc *p) 333 { 334 struct vnode *devvp; 335 struct ufs_args args; 336 struct ufsmount *ump = NULL; 337 struct lfs *fs = NULL; /* LFS */ 338 int error; 339 mode_t accessmode; 340 341 if (mp->mnt_flag & MNT_GETARGS) { 342 ump = VFSTOUFS(mp); 343 if (ump == NULL) 344 return EIO; 345 args.fspec = NULL; 346 vfs_showexport(mp, &args.export, &ump->um_export); 347 return copyout(&args, data, sizeof(args)); 348 } 349 error = copyin(data, &args, sizeof (struct ufs_args)); 350 if (error) 351 return (error); 352 353 /* 354 * If updating, check whether changing from read-only to 355 * read/write; if there is no device name, that's all we do. 356 */ 357 if (mp->mnt_flag & MNT_UPDATE) { 358 ump = VFSTOUFS(mp); 359 fs = ump->um_lfs; 360 if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { 361 /* 362 * If upgrade to read-write by non-root, then verify 363 * that user has necessary permissions on the device. 364 */ 365 if (p->p_ucred->cr_uid != 0) { 366 vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); 367 error = VOP_ACCESS(ump->um_devvp, VREAD|VWRITE, 368 p->p_ucred, p); 369 VOP_UNLOCK(ump->um_devvp, 0); 370 if (error) 371 return (error); 372 } 373 fs->lfs_ronly = 0; 374 } 375 if (args.fspec == 0) { 376 /* 377 * Process export requests. 378 */ 379 return (vfs_export(mp, &ump->um_export, &args.export)); 380 } 381 } 382 /* 383 * Not an update, or updating the name: look up the name 384 * and verify that it refers to a sensible block device. 385 */ 386 NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); 387 if ((error = namei(ndp)) != 0) 388 return (error); 389 devvp = ndp->ni_vp; 390 if (devvp->v_type != VBLK) { 391 vrele(devvp); 392 return (ENOTBLK); 393 } 394 if (bdevsw_lookup(devvp->v_rdev) == NULL) { 395 vrele(devvp); 396 return (ENXIO); 397 } 398 /* 399 * If mount by non-root, then verify that user has necessary 400 * permissions on the device. 401 */ 402 if (p->p_ucred->cr_uid != 0) { 403 accessmode = VREAD; 404 if ((mp->mnt_flag & MNT_RDONLY) == 0) 405 accessmode |= VWRITE; 406 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 407 error = VOP_ACCESS(devvp, accessmode, p->p_ucred, p); 408 if (error) { 409 vput(devvp); 410 return (error); 411 } 412 VOP_UNLOCK(devvp, 0); 413 } 414 if ((mp->mnt_flag & MNT_UPDATE) == 0) 415 error = lfs_mountfs(devvp, mp, p); /* LFS */ 416 else { 417 if (devvp != ump->um_devvp) 418 error = EINVAL; /* needs translation */ 419 else 420 vrele(devvp); 421 } 422 if (error) { 423 vrele(devvp); 424 return (error); 425 } 426 ump = VFSTOUFS(mp); 427 fs = ump->um_lfs; /* LFS */ 428 return set_statfs_info(path, UIO_USERSPACE, args.fspec, 429 UIO_USERSPACE, mp, p); 430 } 431 432 /* 433 * Roll-forward code. 434 */ 435 436 /* 437 * Load the appropriate indirect block, and change the appropriate pointer. 438 * Mark the block dirty. Do segment and avail accounting. 439 */ 440 static int 441 update_meta(struct lfs *fs, ino_t ino, int version, daddr_t lbn, 442 daddr_t ndaddr, size_t size, struct proc *p) 443 { 444 int error; 445 struct vnode *vp; 446 struct inode *ip; 447 #ifdef DEBUG_LFS_RFW 448 daddr_t odaddr; 449 struct indir a[NIADDR]; 450 int num; 451 int i; 452 #endif /* DEBUG_LFS_RFW */ 453 struct buf *bp; 454 SEGUSE *sup; 455 456 KASSERT(lbn >= 0); /* no indirect blocks */ 457 458 if ((error = lfs_rf_valloc(fs, ino, version, p, &vp)) != 0) { 459 #ifdef DEBUG_LFS_RFW 460 printf("update_meta: ino %d: lfs_rf_valloc returned %d\n", ino, 461 error); 462 #endif /* DEBUG_LFS_RFW */ 463 return error; 464 } 465 466 if ((error = VOP_BALLOC(vp, (lbn << fs->lfs_bshift), size, 467 NOCRED, 0, &bp)) != 0) { 468 vput(vp); 469 return (error); 470 } 471 /* No need to write, the block is already on disk */ 472 if (bp->b_flags & B_DELWRI) { 473 LFS_UNLOCK_BUF(bp); 474 fs->lfs_avail += btofsb(fs, bp->b_bcount); 475 } 476 bp->b_flags |= B_INVAL; 477 brelse(bp); 478 479 /* 480 * Extend the file, if it is not large enough already. 481 * XXX this is not exactly right, we don't know how much of the 482 * XXX last block is actually used. We hope that an inode will 483 * XXX appear later to give the correct size. 484 */ 485 ip = VTOI(vp); 486 if (ip->i_size <= (lbn << fs->lfs_bshift)) { 487 u_int64_t newsize; 488 489 if (lbn < NDADDR) 490 newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + 491 (size - fs->lfs_fsize) + 1; 492 else 493 newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + 1; 494 495 if (ip->i_size < newsize) { 496 ip->i_size = newsize; 497 /* 498 * tell vm our new size for the case the inode won't 499 * appear later. 500 */ 501 uvm_vnp_setsize(vp, newsize); 502 } 503 } 504 505 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 506 507 LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); 508 sup->su_nbytes += size; 509 LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); 510 511 /* differences here should be due to UNWRITTEN indirect blocks. */ 512 KASSERT((lblkno(fs, ip->i_size) > NDADDR && 513 ip->i_lfs_effnblks == ip->i_ffs1_blocks) || 514 ip->i_lfs_effnblks >= ip->i_ffs1_blocks); 515 516 #ifdef DEBUG_LFS_RFW 517 /* Now look again to make sure it worked */ 518 ufs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 519 for (i = num; i > 0; i--) { 520 if (!a[i].in_exists) 521 panic("update_meta: absent %d lv indirect block", i); 522 } 523 if (dbtofsb(fs, odaddr) != ndaddr) 524 printf("update_meta: failed setting ino %d lbn %" PRId64 525 " to %" PRId64 "\n", ino, lbn, ndaddr); 526 #endif /* DEBUG_LFS_RFW */ 527 vput(vp); 528 return 0; 529 } 530 531 static int 532 update_inoblk(struct lfs *fs, daddr_t offset, struct ucred *cred, 533 struct proc *p) 534 { 535 struct vnode *devvp, *vp; 536 struct inode *ip; 537 struct ufs1_dinode *dip; 538 struct buf *dbp, *ibp; 539 int error; 540 daddr_t daddr; 541 IFILE *ifp; 542 SEGUSE *sup; 543 544 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 545 546 /* 547 * Get the inode, update times and perms. 548 * DO NOT update disk blocks, we do that separately. 549 */ 550 error = bread(devvp, fsbtodb(fs, offset), fs->lfs_ibsize, cred, &dbp); 551 if (error) { 552 #ifdef DEBUG_LFS_RFW 553 printf("update_inoblk: bread returned %d\n", error); 554 #endif 555 return error; 556 } 557 dip = ((struct ufs1_dinode *)(dbp->b_data)) + INOPB(fs); 558 while (--dip >= (struct ufs1_dinode *)dbp->b_data) { 559 if (dip->di_inumber > LFS_IFILE_INUM) { 560 /* printf("ino %d version %d\n", dip->di_inumber, 561 dip->di_gen); */ 562 error = lfs_rf_valloc(fs, dip->di_inumber, dip->di_gen, 563 p, &vp); 564 if (error) { 565 #ifdef DEBUG_LFS_RFW 566 printf("update_inoblk: lfs_rf_valloc returned %d\n", error); 567 #endif 568 continue; 569 } 570 ip = VTOI(vp); 571 if (dip->di_size != ip->i_size) 572 VOP_TRUNCATE(vp, dip->di_size, 0, NOCRED, p); 573 /* Get mode, link count, size, and times */ 574 memcpy(ip->i_din.ffs1_din, dip, 575 offsetof(struct ufs1_dinode, di_db[0])); 576 577 /* Then the rest, except di_blocks */ 578 ip->i_flags = ip->i_ffs1_flags = dip->di_flags; 579 ip->i_gen = ip->i_ffs1_gen = dip->di_gen; 580 ip->i_uid = ip->i_ffs1_uid = dip->di_uid; 581 ip->i_gid = ip->i_ffs1_gid = dip->di_gid; 582 583 ip->i_mode = ip->i_ffs1_mode; 584 ip->i_nlink = ip->i_ffs_effnlink = ip->i_ffs1_nlink; 585 ip->i_size = ip->i_ffs1_size; 586 587 LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED | IN_UPDATE); 588 589 /* Re-initialize to get type right */ 590 ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 591 &vp); 592 vput(vp); 593 594 /* Record change in location */ 595 LFS_IENTRY(ifp, fs, dip->di_inumber, ibp); 596 daddr = ifp->if_daddr; 597 ifp->if_daddr = dbtofsb(fs, dbp->b_blkno); 598 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 599 /* And do segment accounting */ 600 if (dtosn(fs, daddr) != dtosn(fs, dbtofsb(fs, dbp->b_blkno))) { 601 if (daddr > 0) { 602 LFS_SEGENTRY(sup, fs, dtosn(fs, daddr), 603 ibp); 604 sup->su_nbytes -= sizeof (struct ufs1_dinode); 605 LFS_WRITESEGENTRY(sup, fs, 606 dtosn(fs, daddr), 607 ibp); 608 } 609 LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)), 610 ibp); 611 sup->su_nbytes += sizeof (struct ufs1_dinode); 612 LFS_WRITESEGENTRY(sup, fs, 613 dtosn(fs, dbtofsb(fs, dbp->b_blkno)), 614 ibp); 615 } 616 } 617 } 618 dbp->b_flags |= B_AGE; 619 brelse(dbp); 620 621 return 0; 622 } 623 624 #define CHECK_CKSUM 0x0001 /* Check the checksum to make sure it's valid */ 625 #define CHECK_UPDATE 0x0002 /* Update Ifile for new data blocks / inodes */ 626 627 static daddr_t 628 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial, 629 struct ucred *cred, int flags, int *pseg_flags, struct proc *p) 630 { 631 struct vnode *devvp; 632 struct buf *bp, *dbp; 633 int error, nblocks = 0, ninos, i, j; /* XXX: gcc */ 634 SEGSUM *ssp; 635 u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */ 636 daddr_t oldoffset; 637 int32_t *iaddr; /* XXX ondisk32 */ 638 FINFO *fip; 639 SEGUSE *sup; 640 size_t size; 641 642 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 643 /* 644 * If the segment has a superblock and we're at the top 645 * of the segment, skip the superblock. 646 */ 647 if (sntod(fs, dtosn(fs, offset)) == offset) { 648 LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp); 649 if (sup->su_flags & SEGUSE_SUPERBLOCK) 650 offset += btofsb(fs, LFS_SBPAD); 651 brelse(bp); 652 } 653 654 /* Read in the segment summary */ 655 error = bread(devvp, fsbtodb(fs, offset), fs->lfs_sumsize, cred, &bp); 656 if (error) 657 return -1; 658 659 /* Check summary checksum */ 660 ssp = (SEGSUM *)bp->b_data; 661 if (flags & CHECK_CKSUM) { 662 if (ssp->ss_sumsum != cksum(&ssp->ss_datasum, 663 fs->lfs_sumsize - 664 sizeof(ssp->ss_sumsum))) { 665 #ifdef DEBUG_LFS_RFW 666 printf("Sumsum error at 0x%" PRIx64 "\n", offset); 667 #endif 668 offset = -1; 669 goto err1; 670 } 671 if (ssp->ss_nfinfo == 0 && ssp->ss_ninos == 0) { 672 #ifdef DEBUG_LFS_RFW 673 printf("Empty pseg at 0x%" PRIx64 "\n", offset); 674 #endif 675 offset = -1; 676 goto err1; 677 } 678 if (ssp->ss_create < fs->lfs_tstamp) { 679 #ifdef DEBUG_LFS_RFW 680 printf("Old data at 0x%" PRIx64 "\n", offset); 681 #endif 682 offset = -1; 683 goto err1; 684 } 685 } 686 if (fs->lfs_version > 1) { 687 if (ssp->ss_serial != nextserial) { 688 #ifdef DEBUG_LFS_RFW 689 printf("Unexpected serial number at 0x%" PRIx64 690 "\n", offset); 691 #endif 692 offset = -1; 693 goto err1; 694 } 695 if (ssp->ss_ident != fs->lfs_ident) { 696 #ifdef DEBUG_LFS_RFW 697 printf("Incorrect fsid (0x%x vs 0x%x) at 0x%" 698 PRIx64 "\n", ssp->ss_ident, fs->lfs_ident, offset); 699 #endif 700 offset = -1; 701 goto err1; 702 } 703 } 704 if (pseg_flags) 705 *pseg_flags = ssp->ss_flags; 706 oldoffset = offset; 707 offset += btofsb(fs, fs->lfs_sumsize); 708 709 ninos = howmany(ssp->ss_ninos, INOPB(fs)); 710 /* XXX ondisk32 */ 711 iaddr = (int32_t *)(bp->b_data + fs->lfs_sumsize - sizeof(int32_t)); 712 if (flags & CHECK_CKSUM) { 713 /* Count blocks */ 714 nblocks = 0; 715 fip = (FINFO *)(bp->b_data + SEGSUM_SIZE(fs)); 716 for (i = 0; i < ssp->ss_nfinfo; ++i) { 717 nblocks += fip->fi_nblocks; 718 if (fip->fi_nblocks <= 0) 719 break; 720 /* XXX ondisk32 */ 721 fip = (FINFO *)(((char *)fip) + FINFOSIZE + 722 (fip->fi_nblocks * sizeof(int32_t))); 723 } 724 nblocks += ninos; 725 /* Create the sum array */ 726 datap = dp = (u_long *)malloc(nblocks * sizeof(u_long), 727 M_SEGMENT, M_WAITOK); 728 } 729 730 /* Handle individual blocks */ 731 fip = (FINFO *)(bp->b_data + SEGSUM_SIZE(fs)); 732 for (i = 0; i < ssp->ss_nfinfo || ninos; ++i) { 733 /* Inode block? */ 734 if (ninos && *iaddr == offset) { 735 if (flags & CHECK_CKSUM) { 736 /* Read in the head and add to the buffer */ 737 error = bread(devvp, fsbtodb(fs, offset), fs->lfs_bsize, 738 cred, &dbp); 739 if (error) { 740 offset = -1; 741 goto err2; 742 } 743 (*dp++) = ((u_long *)(dbp->b_data))[0]; 744 dbp->b_flags |= B_AGE; 745 brelse(dbp); 746 } 747 if (flags & CHECK_UPDATE) { 748 if ((error = update_inoblk(fs, offset, cred, p)) 749 != 0) { 750 offset = -1; 751 goto err2; 752 } 753 } 754 offset += btofsb(fs, fs->lfs_ibsize); 755 --iaddr; 756 --ninos; 757 --i; /* compensate */ 758 continue; 759 } 760 /* printf("check: blocks from ino %d version %d\n", 761 fip->fi_ino, fip->fi_version); */ 762 size = fs->lfs_bsize; 763 for (j = 0; j < fip->fi_nblocks; ++j) { 764 if (j == fip->fi_nblocks - 1) 765 size = fip->fi_lastlength; 766 if (flags & CHECK_CKSUM) { 767 error = bread(devvp, fsbtodb(fs, offset), size, cred, &dbp); 768 if (error) { 769 offset = -1; 770 goto err2; 771 } 772 (*dp++) = ((u_long *)(dbp->b_data))[0]; 773 dbp->b_flags |= B_AGE; 774 brelse(dbp); 775 } 776 /* Account for and update any direct blocks */ 777 if ((flags & CHECK_UPDATE) && 778 fip->fi_ino > LFS_IFILE_INUM && 779 fip->fi_blocks[j] >= 0) { 780 update_meta(fs, fip->fi_ino, fip->fi_version, 781 fip->fi_blocks[j], offset, size, p); 782 } 783 offset += btofsb(fs, size); 784 } 785 /* XXX ondisk32 */ 786 fip = (FINFO *)(((char *)fip) + FINFOSIZE 787 + fip->fi_nblocks * sizeof(int32_t)); 788 } 789 /* Checksum the array, compare */ 790 if ((flags & CHECK_CKSUM) && 791 ssp->ss_datasum != cksum(datap, nblocks * sizeof(u_long))) 792 { 793 #ifdef DEBUG_LFS_RFW 794 printf("Datasum error at 0x%" PRIx64 " (wanted %x got %x)\n", 795 offset, ssp->ss_datasum, cksum(datap, nblocks * 796 sizeof(u_long))); 797 #endif 798 offset = -1; 799 goto err2; 800 } 801 802 /* If we're at the end of the segment, move to the next */ 803 if (dtosn(fs, offset + btofsb(fs, fs->lfs_sumsize + fs->lfs_bsize)) != 804 dtosn(fs, offset)) { 805 if (dtosn(fs, offset) == dtosn(fs, ssp->ss_next)) { 806 offset = -1; 807 goto err2; 808 } 809 offset = ssp->ss_next; 810 #ifdef DEBUG_LFS_RFW 811 printf("LFS roll forward: moving on to offset 0x%" PRIx64 812 " -> segment %d\n", offset, dtosn(fs,offset)); 813 #endif 814 } 815 816 if (flags & CHECK_UPDATE) { 817 fs->lfs_avail -= (offset - oldoffset); 818 /* Don't clog the buffer queue */ 819 simple_lock(&lfs_subsys_lock); 820 if (locked_queue_count > LFS_MAX_BUFS || 821 locked_queue_bytes > LFS_MAX_BYTES) { 822 lfs_flush(fs, SEGM_CKP); 823 } 824 simple_unlock(&lfs_subsys_lock); 825 } 826 827 err2: 828 if (flags & CHECK_CKSUM) 829 free(datap, M_SEGMENT); 830 err1: 831 bp->b_flags |= B_AGE; 832 brelse(bp); 833 834 /* XXX should we update the serial number even for bad psegs? */ 835 if ((flags & CHECK_UPDATE) && offset > 0 && fs->lfs_version > 1) 836 fs->lfs_serial = nextserial; 837 return offset; 838 } 839 840 /* 841 * Common code for mount and mountroot 842 * LFS specific 843 */ 844 int 845 lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) 846 { 847 extern struct vnode *rootvp; 848 struct dlfs *tdfs, *dfs, *adfs; 849 struct lfs *fs; 850 struct ufsmount *ump; 851 struct vnode *vp; 852 struct buf *bp, *abp; 853 struct partinfo dpart; 854 dev_t dev; 855 int error, i, ronly, secsize, fsbsize; 856 struct ucred *cred; 857 CLEANERINFO *cip; 858 SEGUSE *sup; 859 int flags, dirty, do_rollforward; 860 daddr_t offset, oldoffset, lastgoodpseg, sb_addr; 861 int sn, curseg; 862 863 cred = p ? p->p_ucred : NOCRED; 864 /* 865 * Disallow multiple mounts of the same device. 866 * Disallow mounting of a device that is currently in use 867 * (except for root, which might share swap device for miniroot). 868 * Flush out any old buffers remaining from a previous use. 869 */ 870 if ((error = vfs_mountedon(devvp)) != 0) 871 return (error); 872 if (vcount(devvp) > 1 && devvp != rootvp) 873 return (EBUSY); 874 if ((error = vinvalbuf(devvp, V_SAVE, cred, p, 0, 0)) != 0) 875 return (error); 876 877 ronly = (mp->mnt_flag & MNT_RDONLY) != 0; 878 error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); 879 if (error) 880 return (error); 881 if (VOP_IOCTL(devvp, DIOCGPART, &dpart, FREAD, cred, p) != 0) 882 secsize = DEV_BSIZE; 883 else 884 secsize = dpart.disklab->d_secsize; 885 886 /* Don't free random space on error. */ 887 bp = NULL; 888 abp = NULL; 889 ump = NULL; 890 891 sb_addr = LFS_LABELPAD / secsize; 892 while (1) { 893 /* Read in the superblock. */ 894 error = bread(devvp, sb_addr, LFS_SBPAD, cred, &bp); 895 if (error) 896 goto out; 897 dfs = (struct dlfs *)bp->b_data; 898 899 /* Check the basics. */ 900 if (dfs->dlfs_magic != LFS_MAGIC || dfs->dlfs_bsize >= MAXBSIZE || 901 dfs->dlfs_version > LFS_VERSION || 902 dfs->dlfs_bsize < sizeof(struct dlfs)) { 903 #ifdef DEBUG_LFS 904 printf("lfs_mountfs: primary superblock sanity failed\n"); 905 #endif 906 error = EINVAL; /* XXX needs translation */ 907 goto out; 908 } 909 if (dfs->dlfs_inodefmt > LFS_MAXINODEFMT) 910 printf("lfs_mountfs: warning: unknown inode format %d\n", 911 dfs->dlfs_inodefmt); 912 913 if (dfs->dlfs_version == 1) 914 fsbsize = secsize; 915 else { 916 fsbsize = 1 << (dfs->dlfs_bshift - dfs->dlfs_blktodb + 917 dfs->dlfs_fsbtodb); 918 /* 919 * Could be, if the frag size is large enough, that we 920 * don't have the "real" primary superblock. If that's 921 * the case, get the real one, and try again. 922 */ 923 if (sb_addr != dfs->dlfs_sboffs[0] << 924 dfs->dlfs_fsbtodb) { 925 /* #ifdef DEBUG_LFS */ 926 printf("lfs_mountfs: sb daddr 0x%llx is not right, trying 0x%llx\n", 927 (long long)sb_addr, (long long)(dfs->dlfs_sboffs[0] << 928 dfs->dlfs_fsbtodb)); 929 /* #endif */ 930 sb_addr = dfs->dlfs_sboffs[0] << 931 dfs->dlfs_fsbtodb; 932 brelse(bp); 933 continue; 934 } 935 } 936 break; 937 } 938 939 /* 940 * Check the second superblock to see which is newer; then mount 941 * using the older of the two. This is necessary to ensure that 942 * the filesystem is valid if it was not unmounted cleanly. 943 */ 944 945 if (dfs->dlfs_sboffs[1] && 946 dfs->dlfs_sboffs[1] - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize) 947 { 948 error = bread(devvp, dfs->dlfs_sboffs[1] * (fsbsize / secsize), 949 LFS_SBPAD, cred, &abp); 950 if (error) 951 goto out; 952 adfs = (struct dlfs *)abp->b_data; 953 954 if (dfs->dlfs_version == 1) { 955 /* 1s resolution comparison */ 956 if (adfs->dlfs_tstamp < dfs->dlfs_tstamp) 957 tdfs = adfs; 958 else 959 tdfs = dfs; 960 } else { 961 /* monotonic infinite-resolution comparison */ 962 if (adfs->dlfs_serial < dfs->dlfs_serial) 963 tdfs = adfs; 964 else 965 tdfs = dfs; 966 } 967 968 /* Check the basics. */ 969 if (tdfs->dlfs_magic != LFS_MAGIC || 970 tdfs->dlfs_bsize > MAXBSIZE || 971 tdfs->dlfs_version > LFS_VERSION || 972 tdfs->dlfs_bsize < sizeof(struct dlfs)) { 973 #ifdef DEBUG_LFS 974 printf("lfs_mountfs: alt superblock sanity failed\n"); 975 #endif 976 error = EINVAL; /* XXX needs translation */ 977 goto out; 978 } 979 } else { 980 #ifdef DEBUG_LFS 981 printf("lfs_mountfs: invalid alt superblock daddr=0x%x\n", 982 dfs->dlfs_sboffs[1]); 983 #endif 984 error = EINVAL; 985 goto out; 986 } 987 988 /* Allocate the mount structure, copy the superblock into it. */ 989 fs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK | M_ZERO); 990 memcpy(&fs->lfs_dlfs, tdfs, sizeof(struct dlfs)); 991 992 /* Compatibility */ 993 if (fs->lfs_version < 2) { 994 fs->lfs_sumsize = LFS_V1_SUMMARY_SIZE; 995 fs->lfs_ibsize = fs->lfs_bsize; 996 fs->lfs_start = fs->lfs_sboffs[0]; 997 fs->lfs_tstamp = fs->lfs_otstamp; 998 fs->lfs_fsbtodb = 0; 999 } 1000 1001 /* Before rolling forward, lock so vget will sleep for other procs */ 1002 fs->lfs_flags = LFS_NOTYET; 1003 fs->lfs_rfpid = p->p_pid; 1004 1005 ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); 1006 ump->um_lfs = fs; 1007 ump->um_fstype = UFS1; 1008 if (sizeof(struct lfs) < LFS_SBPAD) { /* XXX why? */ 1009 bp->b_flags |= B_INVAL; 1010 abp->b_flags |= B_INVAL; 1011 } 1012 brelse(bp); 1013 bp = NULL; 1014 brelse(abp); 1015 abp = NULL; 1016 1017 /* Set up the I/O information */ 1018 fs->lfs_devbsize = secsize; 1019 fs->lfs_iocount = 0; 1020 fs->lfs_diropwait = 0; 1021 fs->lfs_activesb = 0; 1022 fs->lfs_uinodes = 0; 1023 fs->lfs_ravail = 0; 1024 fs->lfs_sbactive = 0; 1025 1026 /* Set up the ifile and lock aflags */ 1027 fs->lfs_doifile = 0; 1028 fs->lfs_writer = 0; 1029 fs->lfs_dirops = 0; 1030 fs->lfs_nadirop = 0; 1031 fs->lfs_seglock = 0; 1032 fs->lfs_pdflush = 0; 1033 fs->lfs_sleepers = 0; 1034 simple_lock_init(&fs->lfs_interlock); 1035 lockinit(&fs->lfs_fraglock, PINOD, "lfs_fraglock", 0, 0); 1036 1037 /* Set the file system readonly/modify bits. */ 1038 fs->lfs_ronly = ronly; 1039 if (ronly == 0) 1040 fs->lfs_fmod = 1; 1041 1042 /* Initialize the mount structure. */ 1043 dev = devvp->v_rdev; 1044 mp->mnt_data = ump; 1045 mp->mnt_stat.f_fsid.val[0] = (long)dev; 1046 mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_LFS); 1047 mp->mnt_stat.f_iosize = fs->lfs_bsize; 1048 mp->mnt_maxsymlinklen = fs->lfs_maxsymlinklen; 1049 mp->mnt_flag |= MNT_LOCAL; 1050 mp->mnt_fs_bshift = fs->lfs_bshift; 1051 ump->um_flags = 0; 1052 ump->um_mountp = mp; 1053 ump->um_dev = dev; 1054 ump->um_devvp = devvp; 1055 ump->um_bptrtodb = fs->lfs_fsbtodb; 1056 ump->um_seqinc = fragstofsb(fs, fs->lfs_frag); 1057 ump->um_nindir = fs->lfs_nindir; 1058 ump->um_lognindir = ffs(fs->lfs_nindir) - 1; 1059 for (i = 0; i < MAXQUOTAS; i++) 1060 ump->um_quotas[i] = NULLVP; 1061 devvp->v_specmountpoint = mp; 1062 1063 /* Set up reserved memory for pageout */ 1064 lfs_setup_resblks(fs); 1065 /* Set up vdirop tailq */ 1066 TAILQ_INIT(&fs->lfs_dchainhd); 1067 /* and paging tailq */ 1068 TAILQ_INIT(&fs->lfs_pchainhd); 1069 1070 /* 1071 * We use the ifile vnode for almost every operation. Instead of 1072 * retrieving it from the hash table each time we retrieve it here, 1073 * artificially increment the reference count and keep a pointer 1074 * to it in the incore copy of the superblock. 1075 */ 1076 if ((error = VFS_VGET(mp, LFS_IFILE_INUM, &vp)) != 0) { 1077 #ifdef DEBUG 1078 printf("lfs_mountfs: ifile vget failed, error=%d\n", error); 1079 #endif 1080 goto out; 1081 } 1082 fs->lfs_ivnode = vp; 1083 VREF(vp); 1084 1085 /* Set up segment usage flags for the autocleaner. */ 1086 fs->lfs_nactive = 0; 1087 fs->lfs_suflags = (u_int32_t **)malloc(2 * sizeof(u_int32_t *), 1088 M_SEGMENT, M_WAITOK); 1089 fs->lfs_suflags[0] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), 1090 M_SEGMENT, M_WAITOK); 1091 fs->lfs_suflags[1] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), 1092 M_SEGMENT, M_WAITOK); 1093 memset(fs->lfs_suflags[1], 0, fs->lfs_nseg * sizeof(u_int32_t)); 1094 for (i = 0; i < fs->lfs_nseg; i++) { 1095 int changed; 1096 1097 LFS_SEGENTRY(sup, fs, i, bp); 1098 changed = 0; 1099 if (!ronly) { 1100 if (sup->su_nbytes == 0 && 1101 !(sup->su_flags & SEGUSE_EMPTY)) { 1102 sup->su_flags |= SEGUSE_EMPTY; 1103 ++changed; 1104 } else if (!(sup->su_nbytes == 0) && 1105 (sup->su_flags & SEGUSE_EMPTY)) { 1106 sup->su_flags &= ~SEGUSE_EMPTY; 1107 ++changed; 1108 } 1109 if (sup->su_flags & SEGUSE_ACTIVE) { 1110 sup->su_flags &= ~SEGUSE_ACTIVE; 1111 ++changed; 1112 } 1113 } 1114 fs->lfs_suflags[0][i] = sup->su_flags; 1115 if (changed) 1116 LFS_WRITESEGENTRY(sup, fs, i, bp); 1117 else 1118 brelse(bp); 1119 } 1120 1121 /* 1122 * Roll forward. 1123 * 1124 * We don't automatically roll forward for v1 filesystems, because 1125 * of the danger that the clock was turned back between the last 1126 * checkpoint and crash. This would roll forward garbage. 1127 * 1128 * v2 filesystems don't have this problem because they use a 1129 * monotonically increasing serial number instead of a timestamp. 1130 */ 1131 #ifdef LFS_DO_ROLLFORWARD 1132 do_rollforward = !fs->lfs_ronly; 1133 #else 1134 do_rollforward = (fs->lfs_version > 1 && !fs->lfs_ronly && 1135 !(fs->lfs_pflags & LFS_PF_CLEAN)); 1136 #endif 1137 if (do_rollforward) { 1138 u_int64_t nextserial; 1139 /* 1140 * Phase I: Find the address of the last good partial 1141 * segment that was written after the checkpoint. Mark 1142 * the segments in question dirty, so they won't be 1143 * reallocated. 1144 */ 1145 lastgoodpseg = oldoffset = offset = fs->lfs_offset; 1146 flags = 0x0; 1147 #ifdef DEBUG_LFS_RFW 1148 printf("LFS roll forward phase 1: starting at offset 0x%" 1149 PRIx64 "\n", offset); 1150 #endif 1151 LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp); 1152 if (!(sup->su_flags & SEGUSE_DIRTY)) 1153 --fs->lfs_nclean; 1154 sup->su_flags |= SEGUSE_DIRTY; 1155 LFS_WRITESEGENTRY(sup, fs, dtosn(fs, offset), bp); 1156 nextserial = fs->lfs_serial + 1; 1157 while ((offset = check_segsum(fs, offset, nextserial, 1158 cred, CHECK_CKSUM, &flags, p)) > 0) { 1159 nextserial++; 1160 if (sntod(fs, oldoffset) != sntod(fs, offset)) { 1161 LFS_SEGENTRY(sup, fs, dtosn(fs, oldoffset), 1162 bp); 1163 if (!(sup->su_flags & SEGUSE_DIRTY)) 1164 --fs->lfs_nclean; 1165 sup->su_flags |= SEGUSE_DIRTY; 1166 LFS_WRITESEGENTRY(sup, fs, dtosn(fs, oldoffset), 1167 bp); 1168 } 1169 1170 #ifdef DEBUG_LFS_RFW 1171 printf("LFS roll forward phase 1: offset=0x%" 1172 PRIx64 "\n", offset); 1173 if (flags & SS_DIROP) { 1174 printf("lfs_mountfs: dirops at 0x%" PRIx64 "\n", 1175 oldoffset); 1176 if (!(flags & SS_CONT)) 1177 printf("lfs_mountfs: dirops end " 1178 "at 0x%" PRIx64 "\n", oldoffset); 1179 } 1180 #endif 1181 if (!(flags & SS_CONT)) 1182 lastgoodpseg = offset; 1183 oldoffset = offset; 1184 } 1185 #ifdef DEBUG_LFS_RFW 1186 if (flags & SS_CONT) { 1187 printf("LFS roll forward: warning: incomplete " 1188 "dirops discarded\n"); 1189 } 1190 printf("LFS roll forward phase 1: completed: " 1191 "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg); 1192 #endif 1193 oldoffset = fs->lfs_offset; 1194 if (fs->lfs_offset != lastgoodpseg) { 1195 /* Don't overwrite what we're trying to preserve */ 1196 offset = fs->lfs_offset; 1197 fs->lfs_offset = lastgoodpseg; 1198 fs->lfs_curseg = sntod(fs, dtosn(fs, fs->lfs_offset)); 1199 for (sn = curseg = dtosn(fs, fs->lfs_curseg);;) { 1200 sn = (sn + 1) % fs->lfs_nseg; 1201 if (sn == curseg) 1202 panic("lfs_mountfs: no clean segments"); 1203 LFS_SEGENTRY(sup, fs, sn, bp); 1204 dirty = (sup->su_flags & SEGUSE_DIRTY); 1205 brelse(bp); 1206 if (!dirty) 1207 break; 1208 } 1209 fs->lfs_nextseg = sntod(fs, sn); 1210 1211 /* 1212 * Phase II: Roll forward from the first superblock. 1213 */ 1214 while (offset != lastgoodpseg) { 1215 #ifdef DEBUG_LFS_RFW 1216 printf("LFS roll forward phase 2: 0x%" 1217 PRIx64 "\n", offset); 1218 #endif 1219 offset = check_segsum(fs, offset, 1220 fs->lfs_serial + 1, cred, CHECK_UPDATE, 1221 NULL, p); 1222 } 1223 1224 /* 1225 * Finish: flush our changes to disk. 1226 */ 1227 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 1228 printf("lfs_mountfs: roll forward recovered %lld blocks\n", 1229 (long long)(lastgoodpseg - oldoffset)); 1230 } 1231 #ifdef DEBUG_LFS_RFW 1232 printf("LFS roll forward complete\n"); 1233 #endif 1234 } 1235 /* If writing, sb is not clean; record in case of immediate crash */ 1236 if (!fs->lfs_ronly) { 1237 fs->lfs_pflags &= ~LFS_PF_CLEAN; 1238 lfs_writesuper(fs, fs->lfs_sboffs[0]); 1239 lfs_writesuper(fs, fs->lfs_sboffs[1]); 1240 } 1241 1242 /* Allow vget now that roll-forward is complete */ 1243 fs->lfs_flags &= ~(LFS_NOTYET); 1244 wakeup(&fs->lfs_flags); 1245 1246 /* 1247 * Initialize the ifile cleaner info with information from 1248 * the superblock. 1249 */ 1250 LFS_CLEANERINFO(cip, fs, bp); 1251 cip->clean = fs->lfs_nclean; 1252 cip->dirty = fs->lfs_nseg - fs->lfs_nclean; 1253 cip->avail = fs->lfs_avail; 1254 cip->bfree = fs->lfs_bfree; 1255 (void) LFS_BWRITE_LOG(bp); /* Ifile */ 1256 1257 /* 1258 * Mark the current segment as ACTIVE, since we're going to 1259 * be writing to it. 1260 */ 1261 LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); 1262 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 1263 fs->lfs_nactive++; 1264 LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); /* Ifile */ 1265 1266 /* Now that roll-forward is done, unlock the Ifile */ 1267 vput(vp); 1268 1269 /* Comment on ifile size if it is too large */ 1270 if (fs->lfs_ivnode->v_size / fs->lfs_bsize > LFS_MAX_BUFS) { 1271 fs->lfs_flags |= LFS_WARNED; 1272 printf("lfs_mountfs: please consider increasing NBUF to at least %lld\n", 1273 (long long)(fs->lfs_ivnode->v_size / fs->lfs_bsize) * (nbuf / LFS_MAX_BUFS)); 1274 } 1275 if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES) { 1276 fs->lfs_flags |= LFS_WARNED; 1277 printf("lfs_mountfs: please consider increasing BUFPAGES to at least %lld\n", 1278 (long long)(fs->lfs_ivnode->v_size * bufpages / LFS_MAX_BYTES)); 1279 } 1280 1281 return (0); 1282 out: 1283 if (bp) 1284 brelse(bp); 1285 if (abp) 1286 brelse(abp); 1287 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1288 (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p); 1289 VOP_UNLOCK(devvp, 0); 1290 if (ump) { 1291 free(ump->um_lfs, M_UFSMNT); 1292 free(ump, M_UFSMNT); 1293 mp->mnt_data = NULL; 1294 } 1295 1296 /* Start the pagedaemon-anticipating daemon */ 1297 if (lfs_writer_daemon == 0 && 1298 kthread_create1(lfs_writerd, NULL, NULL, "lfs_writer") != 0) 1299 panic("fork lfs_writer"); 1300 1301 return (error); 1302 } 1303 1304 /* 1305 * unmount system call 1306 */ 1307 int 1308 lfs_unmount(struct mount *mp, int mntflags, struct proc *p) 1309 { 1310 struct ufsmount *ump; 1311 struct lfs *fs; 1312 int error, flags, ronly; 1313 int s; 1314 1315 flags = 0; 1316 if (mntflags & MNT_FORCE) 1317 flags |= FORCECLOSE; 1318 1319 ump = VFSTOUFS(mp); 1320 fs = ump->um_lfs; 1321 1322 /* wake up the cleaner so it can die */ 1323 wakeup(&fs->lfs_nextseg); 1324 wakeup(&lfs_allclean_wakeup); 1325 simple_lock(&fs->lfs_interlock); 1326 while (fs->lfs_sleepers) 1327 ltsleep(&fs->lfs_sleepers, PRIBIO + 1, "lfs_sleepers", 0, 1328 &fs->lfs_interlock); 1329 simple_unlock(&fs->lfs_interlock); 1330 1331 #ifdef QUOTA 1332 if (mp->mnt_flag & MNT_QUOTA) { 1333 int i; 1334 error = vflush(mp, fs->lfs_ivnode, SKIPSYSTEM|flags); 1335 if (error) 1336 return (error); 1337 for (i = 0; i < MAXQUOTAS; i++) { 1338 if (ump->um_quotas[i] == NULLVP) 1339 continue; 1340 quotaoff(p, mp, i); 1341 } 1342 /* 1343 * Here we fall through to vflush again to ensure 1344 * that we have gotten rid of all the system vnodes. 1345 */ 1346 } 1347 #endif 1348 if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0) 1349 return (error); 1350 if ((error = VFS_SYNC(mp, 1, p->p_ucred, p)) != 0) 1351 return (error); 1352 s = splbio(); 1353 if (LIST_FIRST(&fs->lfs_ivnode->v_dirtyblkhd)) 1354 panic("lfs_unmount: still dirty blocks on ifile vnode"); 1355 splx(s); 1356 1357 /* Comment on ifile size if it has become too large */ 1358 if (!(fs->lfs_flags & LFS_WARNED)) { 1359 if (fs->lfs_ivnode->v_size / fs->lfs_bsize > LFS_MAX_BUFS) 1360 printf("lfs_unmount: please consider increasing" 1361 " NBUF to at least %lld\n", 1362 (long long)(fs->lfs_ivnode->v_size / 1363 fs->lfs_bsize) * 1364 (long long)(nbuf / LFS_MAX_BUFS)); 1365 if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES) 1366 printf("lfs_unmount: please consider increasing" 1367 " BUFPAGES to at least %lld\n", 1368 (long long)(fs->lfs_ivnode->v_size * 1369 bufpages / LFS_MAX_BYTES)); 1370 } 1371 1372 /* Explicitly write the superblock, to update serial and pflags */ 1373 fs->lfs_pflags |= LFS_PF_CLEAN; 1374 lfs_writesuper(fs, fs->lfs_sboffs[0]); 1375 lfs_writesuper(fs, fs->lfs_sboffs[1]); 1376 while (fs->lfs_iocount) 1377 tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0); 1378 1379 /* Finish with the Ifile, now that we're done with it */ 1380 vrele(fs->lfs_ivnode); 1381 vgone(fs->lfs_ivnode); 1382 1383 ronly = !fs->lfs_ronly; 1384 if (ump->um_devvp->v_type != VBAD) 1385 ump->um_devvp->v_specmountpoint = NULL; 1386 vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); 1387 error = VOP_CLOSE(ump->um_devvp, 1388 ronly ? FREAD : FREAD|FWRITE, NOCRED, p); 1389 vput(ump->um_devvp); 1390 1391 /* Free per-mount data structures */ 1392 free(fs->lfs_suflags[0], M_SEGMENT); 1393 free(fs->lfs_suflags[1], M_SEGMENT); 1394 free(fs->lfs_suflags, M_SEGMENT); 1395 lfs_free_resblks(fs); 1396 free(fs, M_UFSMNT); 1397 free(ump, M_UFSMNT); 1398 1399 mp->mnt_data = NULL; 1400 mp->mnt_flag &= ~MNT_LOCAL; 1401 return (error); 1402 } 1403 1404 /* 1405 * Get file system statistics. 1406 */ 1407 int 1408 lfs_statfs(struct mount *mp, struct statfs *sbp, struct proc *p) 1409 { 1410 struct lfs *fs; 1411 struct ufsmount *ump; 1412 1413 ump = VFSTOUFS(mp); 1414 fs = ump->um_lfs; 1415 if (fs->lfs_magic != LFS_MAGIC) 1416 panic("lfs_statfs: magic"); 1417 1418 sbp->f_type = 0; 1419 sbp->f_bsize = fs->lfs_fsize; 1420 sbp->f_iosize = fs->lfs_bsize; 1421 sbp->f_blocks = fsbtofrags(fs, LFS_EST_NONMETA(fs)); 1422 sbp->f_bfree = fsbtofrags(fs, LFS_EST_BFREE(fs)); 1423 sbp->f_bavail = fsbtofrags(fs, (long)LFS_EST_BFREE(fs) - 1424 (long)LFS_EST_RSVD(fs)); 1425 1426 sbp->f_files = fs->lfs_bfree / btofsb(fs, fs->lfs_ibsize) * INOPB(fs); 1427 sbp->f_ffree = sbp->f_files - fs->lfs_nfiles; 1428 copy_statfs_info(sbp, mp); 1429 return (0); 1430 } 1431 1432 /* 1433 * Go through the disk queues to initiate sandbagged IO; 1434 * go through the inodes to write those that have been modified; 1435 * initiate the writing of the super block if it has been modified. 1436 * 1437 * Note: we are always called with the filesystem marked `MPBUSY'. 1438 */ 1439 int 1440 lfs_sync(struct mount *mp, int waitfor, struct ucred *cred, struct proc *p) 1441 { 1442 int error; 1443 struct lfs *fs; 1444 1445 fs = VFSTOUFS(mp)->um_lfs; 1446 if (fs->lfs_ronly) 1447 return 0; 1448 lfs_writer_enter(fs, "lfs_dirops"); 1449 1450 /* All syncs must be checkpoints until roll-forward is implemented. */ 1451 error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0)); 1452 lfs_writer_leave(fs); 1453 #ifdef QUOTA 1454 qsync(mp); 1455 #endif 1456 return (error); 1457 } 1458 1459 extern struct lock ufs_hashlock; 1460 1461 /* 1462 * Look up an LFS dinode number to find its incore vnode. If not already 1463 * in core, read it in from the specified device. Return the inode locked. 1464 * Detection and handling of mount points must be done by the calling routine. 1465 */ 1466 int 1467 lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) 1468 { 1469 struct lfs *fs; 1470 struct ufs1_dinode *dip; 1471 struct inode *ip; 1472 struct buf *bp; 1473 struct ifile *ifp; 1474 struct vnode *vp; 1475 struct ufsmount *ump; 1476 daddr_t daddr; 1477 dev_t dev; 1478 int error, retries; 1479 struct timespec ts; 1480 1481 ump = VFSTOUFS(mp); 1482 dev = ump->um_dev; 1483 fs = ump->um_lfs; 1484 1485 /* 1486 * If the filesystem is not completely mounted yet, suspend 1487 * any access requests (wait for roll-forward to complete). 1488 */ 1489 while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid) 1490 tsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0); 1491 1492 if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) 1493 return (0); 1494 1495 if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) { 1496 *vpp = NULL; 1497 return (error); 1498 } 1499 1500 do { 1501 if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) { 1502 ungetnewvnode(vp); 1503 return (0); 1504 } 1505 } while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0)); 1506 1507 /* Translate the inode number to a disk address. */ 1508 if (ino == LFS_IFILE_INUM) 1509 daddr = fs->lfs_idaddr; 1510 else { 1511 /* XXX bounds-check this too */ 1512 LFS_IENTRY(ifp, fs, ino, bp); 1513 daddr = ifp->if_daddr; 1514 if (fs->lfs_version > 1) { 1515 ts.tv_sec = ifp->if_atime_sec; 1516 ts.tv_nsec = ifp->if_atime_nsec; 1517 } 1518 1519 brelse(bp); 1520 if (daddr == LFS_UNUSED_DADDR) { 1521 *vpp = NULLVP; 1522 ungetnewvnode(vp); 1523 lockmgr(&ufs_hashlock, LK_RELEASE, 0); 1524 return (ENOENT); 1525 } 1526 } 1527 1528 /* Allocate/init new vnode/inode. */ 1529 lfs_vcreate(mp, ino, vp); 1530 1531 /* 1532 * Put it onto its hash chain and lock it so that other requests for 1533 * this inode will block if they arrive while we are sleeping waiting 1534 * for old data structures to be purged or for the contents of the 1535 * disk portion of this inode to be read. 1536 */ 1537 ip = VTOI(vp); 1538 ufs_ihashins(ip); 1539 lockmgr(&ufs_hashlock, LK_RELEASE, 0); 1540 1541 /* 1542 * XXX 1543 * This may not need to be here, logically it should go down with 1544 * the i_devvp initialization. 1545 * Ask Kirk. 1546 */ 1547 ip->i_lfs = ump->um_lfs; 1548 1549 /* Read in the disk contents for the inode, copy into the inode. */ 1550 retries = 0; 1551 again: 1552 error = bread(ump->um_devvp, fsbtodb(fs, daddr), 1553 (fs->lfs_version == 1 ? fs->lfs_bsize : fs->lfs_ibsize), 1554 NOCRED, &bp); 1555 if (error) { 1556 /* 1557 * The inode does not contain anything useful, so it would 1558 * be misleading to leave it on its hash chain. With mode 1559 * still zero, it will be unlinked and returned to the free 1560 * list by vput(). 1561 */ 1562 vput(vp); 1563 brelse(bp); 1564 *vpp = NULL; 1565 return (error); 1566 } 1567 1568 dip = lfs_ifind(fs, ino, bp); 1569 if (dip == NULL) { 1570 /* Assume write has not completed yet; try again */ 1571 bp->b_flags |= B_INVAL; 1572 brelse(bp); 1573 ++retries; 1574 if (retries > LFS_IFIND_RETRIES) { 1575 #ifdef DEBUG 1576 /* If the seglock is held look at the bpp to see 1577 what is there anyway */ 1578 if (fs->lfs_seglock > 0) { 1579 struct buf **bpp; 1580 struct ufs1_dinode *dp; 1581 int i; 1582 1583 for (bpp = fs->lfs_sp->bpp; 1584 bpp != fs->lfs_sp->cbpp; ++bpp) { 1585 if ((*bpp)->b_vp == fs->lfs_ivnode && 1586 bpp != fs->lfs_sp->bpp) { 1587 /* Inode block */ 1588 printf("block 0x%" PRIx64 ": ", 1589 (*bpp)->b_blkno); 1590 dp = (struct ufs1_dinode *)(*bpp)->b_data; 1591 for (i = 0; i < INOPB(fs); i++) 1592 if (dp[i].di_u.inumber) 1593 printf("%d ", dp[i].di_u.inumber); 1594 printf("\n"); 1595 } 1596 } 1597 } 1598 #endif 1599 panic("lfs_vget: dinode not found"); 1600 } 1601 printf("lfs_vget: dinode %d not found, retrying...\n", ino); 1602 (void)tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs ifind", 1); 1603 goto again; 1604 } 1605 *ip->i_din.ffs1_din = *dip; 1606 brelse(bp); 1607 1608 if (fs->lfs_version > 1) { 1609 ip->i_ffs1_atime = ts.tv_sec; 1610 ip->i_ffs1_atimensec = ts.tv_nsec; 1611 } 1612 1613 lfs_vinit(mp, &vp); 1614 1615 *vpp = vp; 1616 1617 KASSERT(VOP_ISLOCKED(vp)); 1618 1619 return (0); 1620 } 1621 1622 /* 1623 * File handle to vnode 1624 */ 1625 int 1626 lfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp) 1627 { 1628 struct lfid *lfhp; 1629 struct buf *bp; 1630 IFILE *ifp; 1631 int32_t daddr; 1632 struct lfs *fs; 1633 1634 lfhp = (struct lfid *)fhp; 1635 if (lfhp->lfid_ino < LFS_IFILE_INUM) 1636 return ESTALE; 1637 1638 fs = VFSTOUFS(mp)->um_lfs; 1639 if (lfhp->lfid_ident != fs->lfs_ident) 1640 return ESTALE; 1641 1642 if (lfhp->lfid_ino > 1643 ((VTOI(fs->lfs_ivnode)->i_ffs1_size >> fs->lfs_bshift) - 1644 fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb) 1645 return ESTALE; 1646 1647 if (ufs_ihashlookup(VFSTOUFS(mp)->um_dev, lfhp->lfid_ino) == NULLVP) { 1648 LFS_IENTRY(ifp, fs, lfhp->lfid_ino, bp); 1649 daddr = ifp->if_daddr; 1650 brelse(bp); 1651 if (daddr == LFS_UNUSED_DADDR) 1652 return ESTALE; 1653 } 1654 1655 return (ufs_fhtovp(mp, &lfhp->lfid_ufid, vpp)); 1656 } 1657 1658 /* 1659 * Vnode pointer to File handle 1660 */ 1661 /* ARGSUSED */ 1662 int 1663 lfs_vptofh(struct vnode *vp, struct fid *fhp) 1664 { 1665 struct inode *ip; 1666 struct lfid *lfhp; 1667 1668 ip = VTOI(vp); 1669 lfhp = (struct lfid *)fhp; 1670 lfhp->lfid_len = sizeof(struct lfid); 1671 lfhp->lfid_ino = ip->i_number; 1672 lfhp->lfid_gen = ip->i_gen; 1673 lfhp->lfid_ident = ip->i_lfs->lfs_ident; 1674 return (0); 1675 } 1676 1677 static int 1678 sysctl_lfs_dostats(SYSCTLFN_ARGS) 1679 { 1680 extern struct lfs_stats lfs_stats; 1681 extern int lfs_dostats; 1682 int error; 1683 1684 error = sysctl_lookup(SYSCTLFN_CALL(rnode)); 1685 if (error || newp == NULL) 1686 return (error); 1687 1688 if (lfs_dostats == 0) 1689 memset(&lfs_stats,0,sizeof(lfs_stats)); 1690 1691 return (0); 1692 } 1693 1694 SYSCTL_SETUP(sysctl_vfs_lfs_setup, "sysctl vfs.lfs setup") 1695 { 1696 extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead; 1697 1698 sysctl_createv(SYSCTL_PERMANENT, 1699 CTLTYPE_NODE, "vfs", NULL, 1700 NULL, 0, NULL, 0, 1701 CTL_VFS, CTL_EOL); 1702 sysctl_createv(SYSCTL_PERMANENT, 1703 CTLTYPE_NODE, "lfs", NULL, 1704 NULL, 0, NULL, 0, 1705 CTL_VFS, 5, CTL_EOL); 1706 /* 1707 * XXX the "5" above could be dynamic, thereby eliminating one 1708 * more instance of the "number to vfs" mapping problem, but 1709 * "2" is the order as taken from sys/mount.h 1710 */ 1711 1712 sysctl_createv(SYSCTL_PERMANENT|SYSCTL_READWRITE, 1713 CTLTYPE_INT, "flushindir", NULL, 1714 NULL, 0, &lfs_writeindir, 0, 1715 CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL); 1716 sysctl_createv(SYSCTL_PERMANENT|SYSCTL_READWRITE, 1717 CTLTYPE_INT, "clean_vnhead", NULL, 1718 NULL, 0, &lfs_clean_vnhead, 0, 1719 CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL); 1720 sysctl_createv(SYSCTL_PERMANENT|SYSCTL_READWRITE, 1721 CTLTYPE_INT, "dostats", NULL, 1722 sysctl_lfs_dostats, 0, &lfs_dostats, 0, 1723 CTL_VFS, 5, LFS_DOSTATS, CTL_EOL); 1724 } 1725 1726 /* 1727 * ufs_bmaparray callback function for writing. 1728 * 1729 * Since blocks will be written to the new segment anyway, 1730 * we don't care about current daddr of them. 1731 */ 1732 static boolean_t 1733 lfs_issequential_hole(const struct ufsmount *ump, 1734 daddr_t daddr0, daddr_t daddr1) 1735 { 1736 1737 KASSERT(daddr0 == UNWRITTEN || 1738 (0 <= daddr0 && daddr0 <= LFS_MAX_DADDR)); 1739 KASSERT(daddr1 == UNWRITTEN || 1740 (0 <= daddr1 && daddr1 <= LFS_MAX_DADDR)); 1741 1742 /* NOTE: all we want to know here is 'hole or not'. */ 1743 /* NOTE: UNASSIGNED is converted to 0 by ufs_bmaparray. */ 1744 1745 /* 1746 * treat UNWRITTENs and all resident blocks as 'contiguous' 1747 */ 1748 if (daddr0 != 0 && daddr1 != 0) 1749 return TRUE; 1750 1751 /* 1752 * both are in hole? 1753 */ 1754 if (daddr0 == 0 && daddr1 == 0) 1755 return TRUE; /* all holes are 'contiguous' for us. */ 1756 1757 return FALSE; 1758 } 1759 1760 /* 1761 * lfs_gop_write functions exactly like genfs_gop_write, except that 1762 * (1) it requires the seglock to be held by its caller, and sp->fip 1763 * to be properly initialized (it will return without re-initializing 1764 * sp->fip, and without calling lfs_writeseg). 1765 * (2) it uses the remaining space in the segment, rather than VOP_BMAP, 1766 * to determine how large a block it can write at once (though it does 1767 * still use VOP_BMAP to find holes in the file); 1768 * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks 1769 * (leaving lfs_writeseg to deal with the cluster blocks, so we might 1770 * now have clusters of clusters, ick.) 1771 */ 1772 static int 1773 lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) 1774 { 1775 int i, s, error, run; 1776 int fs_bshift; 1777 vaddr_t kva; 1778 off_t eof, offset, startoffset; 1779 size_t bytes, iobytes, skipbytes; 1780 daddr_t lbn, blkno; 1781 struct vm_page *pg; 1782 struct buf *mbp, *bp; 1783 struct vnode *devvp = VTOI(vp)->i_devvp; 1784 struct inode *ip = VTOI(vp); 1785 struct lfs *fs = ip->i_lfs; 1786 struct segment *sp = fs->lfs_sp; 1787 UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist); 1788 1789 /* The Ifile lives in the buffer cache */ 1790 if (vp == fs->lfs_ivnode) 1791 return genfs_compat_gop_write(vp, pgs, npages, flags); 1792 1793 /* 1794 * Sometimes things slip past the filters in lfs_putpages, 1795 * and the pagedaemon tries to write pages---problem is 1796 * that the pagedaemon never acquires the segment lock. 1797 * 1798 * Unbusy and unclean the pages, and put them on the ACTIVE 1799 * queue under the hypothesis that they couldn't have got here 1800 * unless they were modified *quite* recently. 1801 * 1802 * XXXUBC that last statement is an oversimplification of course. 1803 */ 1804 if (!(fs->lfs_seglock) || fs->lfs_lockpid != curproc->p_pid) { 1805 simple_lock(&vp->v_interlock); 1806 #ifdef DEBUG 1807 printf("lfs_gop_write: seglock not held\n"); 1808 #endif 1809 uvm_lock_pageq(); 1810 for (i = 0; i < npages; i++) { 1811 pg = pgs[i]; 1812 1813 if (pg->flags & PG_PAGEOUT) 1814 uvmexp.paging--; 1815 if (pg->flags & PG_DELWRI) { 1816 uvm_pageunwire(pg); 1817 } 1818 uvm_pageactivate(pg); 1819 pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED); 1820 #ifdef DEBUG_LFS 1821 printf("pg[%d]->flags = %x\n", i, pg->flags); 1822 printf("pg[%d]->pqflags = %x\n", i, pg->pqflags); 1823 printf("pg[%d]->uanon = %p\n", i, pg->uanon); 1824 printf("pg[%d]->uobject = %p\n", i, pg->uobject); 1825 printf("pg[%d]->wire_count = %d\n", i, pg->wire_count); 1826 printf("pg[%d]->loan_count = %d\n", i, pg->loan_count); 1827 #endif 1828 } 1829 /* uvm_pageunbusy takes care of PG_BUSY, PG_WANTED */ 1830 uvm_page_unbusy(pgs, npages); 1831 uvm_unlock_pageq(); 1832 simple_unlock(&vp->v_interlock); 1833 return EAGAIN; 1834 } 1835 1836 UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x", 1837 vp, pgs, npages, flags); 1838 1839 GOP_SIZE(vp, vp->v_size, &eof, GOP_SIZE_WRITE); 1840 1841 if (vp->v_type == VREG) 1842 fs_bshift = vp->v_mount->mnt_fs_bshift; 1843 else 1844 fs_bshift = DEV_BSHIFT; 1845 error = 0; 1846 pg = pgs[0]; 1847 startoffset = pg->offset; 1848 bytes = MIN(npages << PAGE_SHIFT, eof - startoffset); 1849 skipbytes = 0; 1850 1851 /* KASSERT(bytes != 0); */ 1852 if (bytes == 0) 1853 printf("ino %d bytes == 0 offset %" PRId64 "\n", 1854 VTOI(vp)->i_number, pgs[0]->offset); 1855 1856 /* Swap PG_DELWRI for PG_PAGEOUT */ 1857 for (i = 0; i < npages; i++) 1858 if (pgs[i]->flags & PG_DELWRI) { 1859 KASSERT(!(pgs[i]->flags & PG_PAGEOUT)); 1860 pgs[i]->flags &= ~PG_DELWRI; 1861 pgs[i]->flags |= PG_PAGEOUT; 1862 uvmexp.paging++; 1863 uvm_lock_pageq(); 1864 uvm_pageunwire(pgs[i]); 1865 uvm_unlock_pageq(); 1866 } 1867 1868 /* 1869 * Check to make sure we're starting on a block boundary. 1870 * We'll check later to make sure we always write entire 1871 * blocks (or fragments). 1872 */ 1873 if (startoffset & fs->lfs_bmask) 1874 printf("%" PRId64 " & %" PRId64 " = %" PRId64 "\n", 1875 startoffset, fs->lfs_bmask, 1876 startoffset & fs->lfs_bmask); 1877 KASSERT((startoffset & fs->lfs_bmask) == 0); 1878 if (bytes & fs->lfs_ffmask) { 1879 printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes); 1880 panic("lfs_gop_write: non-integer blocks"); 1881 } 1882 1883 kva = uvm_pagermapin(pgs, npages, 1884 UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); 1885 1886 s = splbio(); 1887 simple_lock(&global_v_numoutput_slock); 1888 vp->v_numoutput += 2; /* one for biodone, one for aiodone */ 1889 simple_unlock(&global_v_numoutput_slock); 1890 mbp = pool_get(&bufpool, PR_WAITOK); 1891 splx(s); 1892 1893 memset(mbp, 0, sizeof(*bp)); 1894 BUF_INIT(mbp); 1895 UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", 1896 vp, mbp, vp->v_numoutput, bytes); 1897 mbp->b_bufsize = npages << PAGE_SHIFT; 1898 mbp->b_data = (void *)kva; 1899 mbp->b_resid = mbp->b_bcount = bytes; 1900 mbp->b_flags = B_BUSY|B_WRITE|B_AGE|B_CALL; 1901 mbp->b_iodone = uvm_aio_biodone; 1902 mbp->b_vp = vp; 1903 1904 bp = NULL; 1905 for (offset = startoffset; 1906 bytes > 0; 1907 offset += iobytes, bytes -= iobytes) { 1908 lbn = offset >> fs_bshift; 1909 error = ufs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run, 1910 lfs_issequential_hole); 1911 if (error) { 1912 UVMHIST_LOG(ubchist, "ufs_bmaparray() -> %d", 1913 error,0,0,0); 1914 skipbytes += bytes; 1915 bytes = 0; 1916 break; 1917 } 1918 1919 iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, 1920 bytes); 1921 if (blkno == (daddr_t)-1) { 1922 skipbytes += iobytes; 1923 continue; 1924 } 1925 1926 /* 1927 * Discover how much we can really pack into this buffer. 1928 */ 1929 /* If no room in the current segment, finish it up */ 1930 if (sp->sum_bytes_left < sizeof(int32_t) || 1931 sp->seg_bytes_left < (1 << fs->lfs_bshift)) { 1932 int version; 1933 1934 lfs_updatemeta(sp); 1935 1936 version = sp->fip->fi_version; 1937 (void) lfs_writeseg(fs, sp); 1938 1939 sp->fip->fi_version = version; 1940 sp->fip->fi_ino = ip->i_number; 1941 /* Add the current file to the segment summary. */ 1942 ++((SEGSUM *)(sp->segsum))->ss_nfinfo; 1943 sp->sum_bytes_left -= FINFOSIZE; 1944 } 1945 /* Check both for space in segment and space in segsum */ 1946 iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift) 1947 << fs_bshift); 1948 iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t)) 1949 << fs_bshift); 1950 KASSERT(iobytes > 0); 1951 1952 /* if it's really one i/o, don't make a second buf */ 1953 if (offset == startoffset && iobytes == bytes) { 1954 bp = mbp; 1955 /* printf("bp is mbp\n"); */ 1956 /* correct overcount if there is no second buffer */ 1957 s = splbio(); 1958 simple_lock(&global_v_numoutput_slock); 1959 --vp->v_numoutput; 1960 simple_unlock(&global_v_numoutput_slock); 1961 splx(s); 1962 } else { 1963 /* printf("bp is not mbp\n"); */ 1964 s = splbio(); 1965 bp = pool_get(&bufpool, PR_WAITOK); 1966 UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", 1967 vp, bp, vp->v_numoutput, 0); 1968 splx(s); 1969 memset(bp, 0, sizeof(*bp)); 1970 BUF_INIT(bp); 1971 bp->b_data = (char *)kva + 1972 (vaddr_t)(offset - pg->offset); 1973 bp->b_resid = bp->b_bcount = iobytes; 1974 bp->b_flags = B_BUSY|B_WRITE|B_CALL; 1975 bp->b_iodone = uvm_aio_biodone1; 1976 } 1977 1978 /* XXX This is silly ... is this necessary? */ 1979 bp->b_vp = NULL; 1980 s = splbio(); 1981 bgetvp(vp, bp); 1982 splx(s); 1983 1984 bp->b_lblkno = lblkno(fs, offset); 1985 bp->b_private = mbp; 1986 if (devvp->v_type == VBLK) { 1987 bp->b_dev = devvp->v_rdev; 1988 } 1989 VOP_BWRITE(bp); 1990 while (lfs_gatherblock(sp, bp, NULL)) 1991 continue; 1992 } 1993 1994 if (skipbytes) { 1995 UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0); 1996 s = splbio(); 1997 if (error) { 1998 mbp->b_flags |= B_ERROR; 1999 mbp->b_error = error; 2000 } 2001 mbp->b_resid -= skipbytes; 2002 if (mbp->b_resid == 0) { 2003 biodone(mbp); 2004 } 2005 splx(s); 2006 } 2007 UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0); 2008 return (0); 2009 } 2010 2011 /* 2012 * finish vnode/inode initialization. 2013 * used by lfs_vget and lfs_fastvget. 2014 */ 2015 void 2016 lfs_vinit(struct mount *mp, struct vnode **vpp) 2017 { 2018 struct vnode *vp = *vpp; 2019 struct inode *ip = VTOI(vp); 2020 struct ufsmount *ump = VFSTOUFS(mp); 2021 int i; 2022 2023 ip->i_mode = ip->i_ffs1_mode; 2024 ip->i_ffs_effnlink = ip->i_nlink = ip->i_ffs1_nlink; 2025 ip->i_lfs_osize = ip->i_size = ip->i_ffs1_size; 2026 ip->i_flags = ip->i_ffs1_flags; 2027 ip->i_gen = ip->i_ffs1_gen; 2028 ip->i_uid = ip->i_ffs1_uid; 2029 ip->i_gid = ip->i_ffs1_gid; 2030 2031 ip->i_lfs_effnblks = ip->i_ffs1_blocks; 2032 2033 /* 2034 * Initialize the vnode from the inode, check for aliases. In all 2035 * cases re-init ip, the underlying vnode/inode may have changed. 2036 */ 2037 ufs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp); 2038 2039 memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize)); 2040 if (vp->v_type != VLNK || 2041 VTOI(vp)->i_size >= vp->v_mount->mnt_maxsymlinklen) { 2042 struct lfs *fs = ump->um_lfs; 2043 #ifdef DEBUG 2044 for (i = (ip->i_size + fs->lfs_bsize - 1) >> fs->lfs_bshift; 2045 i < NDADDR; i++) { 2046 if (ip->i_ffs1_db[i] != 0) { 2047 inconsistent: 2048 lfs_dump_dinode(ip->i_din.ffs1_din); 2049 panic("inconsistent inode"); 2050 } 2051 } 2052 for ( ; i < NDADDR + NIADDR; i++) { 2053 if (ip->i_ffs1_ib[i - NDADDR] != 0) { 2054 goto inconsistent; 2055 } 2056 } 2057 #endif /* DEBUG */ 2058 for (i = 0; i < NDADDR; i++) 2059 if (ip->i_ffs1_db[i] != 0) 2060 ip->i_lfs_fragsize[i] = blksize(fs, ip, i); 2061 } 2062 2063 #ifdef DEBUG 2064 if (vp->v_type == VNON) { 2065 printf("lfs_vinit: ino %d is type VNON! (ifmt=%o)\n", 2066 ip->i_number, (ip->i_mode & IFMT) >> 12); 2067 lfs_dump_dinode(ip->i_din.ffs1_din); 2068 #ifdef DDB 2069 Debugger(); 2070 #endif /* DDB */ 2071 } 2072 #endif /* DEBUG */ 2073 2074 /* 2075 * Finish inode initialization now that aliasing has been resolved. 2076 */ 2077 2078 ip->i_devvp = ump->um_devvp; 2079 VREF(ip->i_devvp); 2080 genfs_node_init(vp, &lfs_genfsops); 2081 uvm_vnp_setsize(vp, ip->i_size); 2082 2083 *vpp = vp; 2084 } 2085