/*	$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2008
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

pid_t lfs_cleaner_pid = 0;

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO));
 out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
 out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif
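
/*
 * Illustrative sketch (not kernel code): how a userland cleaner might
 * drive lfs_markv(2) through its libc stub, assuming a BLOCK_INFO
 * array already populated from the segment being cleaned and an fsid
 * obtained from statvfs(3).  NBLK, bi and fsid are placeholder names;
 * error handling and the block-gathering step are elided.
 *
 *	BLOCK_INFO bi[NBLK];
 *	... fill in bi_inode, bi_lbn, bi_daddr, bi_segcreate,
 *	    bi_version, bi_bp and bi_size for each live block ...
 *	if (lfs_markv(&fsid, bi, NBLK) < 0)
 *		err(1, "lfs_markv");
 */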

#define	LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov,
    int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp = NULL;
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	maxino = (fragstoblks(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks) -
	    fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, NULL)) != 0)
		return (error);

	/*
	 * The seglock is here just so that, if we have to sleep, our
	 * blocks cannot become invalid in the meantime.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR)
				continue;

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
			    &vp,
			    (blkp->bi_lbn == LFS_UNUSED_LBN
			     ? blkp->bi_bp
			     : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				      " failed with %d (ino %d, segment %d)\n",
				      error, blkp->bi_inode,
				      dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * locked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* Can't clean VU_DIROP directories in case of truncation */
		/* XXX - maybe we should mark removed dirs specially? */
		if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
			do_again++;
			continue;
		}

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr) {
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_CLEANING);
					mutex_exit(&lfs_lock);
				}
				brelse(bp, 0);
			}
			continue;
		}

		b_daddr = 0;
		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
		    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
		{
			if (dtosn(fs, dbtofsb(fs, b_daddr)) ==
			    dtosn(fs, blkp->bi_daddr))
			{
				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
				      (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr)));
			}
			do_again++;
			continue;
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
			      " size (%ld != %d), try again\n",
			      blkp->bi_inode, (long long)blkp->bi_lbn,
			      (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				    " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			      nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	      nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp, false, NULL);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

 err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

 err3:
	KERNEL_UNLOCK_ONE(NULL);
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp, false, NULL);
#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif

	return (error);
}
527 */ 528 529 if (v_daddr != LFS_UNUSED_DADDR) { 530 lfs_vunref(vp); 531 --numrefed; 532 } 533 534 lfs_segunlock(fs); 535 vfs_unbusy(mntp, false, NULL); 536 #ifdef DIAGNOSTIC 537 if (numrefed != 0) 538 panic("lfs_markv: numrefed=%d", numrefed); 539 #endif 540 541 return (error); 542 } 543 544 /* 545 * sys_lfs_bmapv: 546 * 547 * This will fill in the current disk address for arrays of blocks. 548 * 549 * 0 on success 550 * -1/errno is return on error. 551 */ 552 #ifdef USE_64BIT_SYSCALLS 553 int 554 sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) 555 { 556 /* { 557 syscallarg(fsid_t *) fsidp; 558 syscallarg(struct block_info *) blkiov; 559 syscallarg(int) blkcnt; 560 } */ 561 BLOCK_INFO *blkiov; 562 int blkcnt, error; 563 fsid_t fsid; 564 struct lfs *fs; 565 struct mount *mntp; 566 567 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 568 NULL)) != 0) 569 return (error); 570 571 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 572 return (error); 573 574 if ((mntp = vfs_getvfs(&fsid)) == NULL) 575 return (ENOENT); 576 fs = VFSTOUFS(mntp)->um_lfs; 577 578 blkcnt = SCARG(uap, blkcnt); 579 if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) 580 return (EINVAL); 581 KERNEL_LOCK(1, NULL); 582 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 583 if ((error = copyin(SCARG(uap, blkiov), blkiov, 584 blkcnt * sizeof(BLOCK_INFO))) != 0) 585 goto out; 586 587 if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) 588 copyout(blkiov, SCARG(uap, blkiov), 589 blkcnt * sizeof(BLOCK_INFO)); 590 out: 591 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 592 KERNEL_UNLOCK_ONE(NULL); 593 return error; 594 } 595 #else 596 int 597 sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) 598 { 599 /* { 600 syscallarg(fsid_t *) fsidp; 601 syscallarg(struct block_info *) blkiov; 602 syscallarg(int) blkcnt; 603 } */ 604 BLOCK_INFO *blkiov; 605 BLOCK_INFO_15 *blkiov15; 606 int i, blkcnt, error; 607 fsid_t fsid; 608 struct lfs *fs; 609 struct mount *mntp; 610 611 if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 612 NULL)) != 0) 613 return (error); 614 615 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 616 return (error); 617 618 if ((mntp = vfs_getvfs(&fsid)) == NULL) 619 return (ENOENT); 620 fs = VFSTOUFS(mntp)->um_lfs; 621 622 blkcnt = SCARG(uap, blkcnt); 623 if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) 624 return (EINVAL); 625 KERNEL_LOCK(1, NULL); 626 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 627 blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); 628 if ((error = copyin(SCARG(uap, blkiov), blkiov15, 629 blkcnt * sizeof(BLOCK_INFO_15))) != 0) 630 goto out; 631 632 for (i = 0; i < blkcnt; i++) { 633 blkiov[i].bi_inode = blkiov15[i].bi_inode; 634 blkiov[i].bi_lbn = blkiov15[i].bi_lbn; 635 blkiov[i].bi_daddr = blkiov15[i].bi_daddr; 636 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; 637 blkiov[i].bi_version = blkiov15[i].bi_version; 638 blkiov[i].bi_bp = blkiov15[i].bi_bp; 639 blkiov[i].bi_size = blkiov15[i].bi_size; 640 } 641 642 if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) { 643 for (i = 0; i < blkcnt; i++) { 644 blkiov15[i].bi_inode = blkiov[i].bi_inode; 645 blkiov15[i].bi_lbn = blkiov[i].bi_lbn; 646 blkiov15[i].bi_daddr = blkiov[i].bi_daddr; 647 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; 648 blkiov15[i].bi_version = blkiov[i].bi_version; 649 blkiov15[i].bi_bp = 
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
 out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif

int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			mutex_enter(&ufs_ihash_lock);
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) {
				ip = VTOI(vp);
				mutex_enter(vp->v_interlock);
				mutex_exit(&ufs_ihash_lock);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				mutex_exit(&ufs_ihash_lock);
				/*
				 * Don't VFS_VGET if we're being unmounted,
				 * since we hold vfs_busy().
				 */
				if (mntp->mnt_iflag & IMNT_UNMOUNT) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
					DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino"
					      " %d failed with %d",
					      blkp->bi_inode, error));
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_daddr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
			    &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_bmapv: numrefed=%d", numrefed);
#endif

	vfs_unbusy(mntp, false, NULL);

	return 0;
}
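
/*
 * Illustrative sketch (not kernel code): the intended pairing of
 * lfs_bmapv(2) and lfs_markv(2) in a userland cleaner.  lfs_bmapv
 * overwrites bi_daddr with each block's current address, so the
 * cleaner compares it against the address the block occupied in the
 * segment being cleaned (saved here in a hypothetical seg_daddr
 * array); blocks that still match are live and are handed on to
 * lfs_markv.  NBLK, bi, fsid and seg_daddr are placeholders.
 *
 *	lfs_bmapv(&fsid, bi, NBLK);
 *	nlive = 0;
 *	for (i = 0; i < NBLK; i++)
 *		if (bi[i].bi_daddr == seg_daddr[i])
 *			bi[nlive++] = bi[i];
 *	lfs_markv(&fsid, bi, nlive);
 */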

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
int
sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp, NULL)) != 0)
		return (error);

	KERNEL_LOCK(1, NULL);
	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	KERNEL_UNLOCK_ONE(NULL);
	vfs_unbusy(mntp, false, NULL);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
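
/*
 * Illustrative calling sketch, mirroring sys_lfs_segclean() above
 * (vfs_busy() and KERNEL_LOCK handling elided): take the segment
 * lock, do the work, release the lock.
 *
 *	lfs_seglock(fs, SEGM_PROT);
 *	error = lfs_do_segclean(fs, segnum);
 *	lfs_segunlock(fs);
 */
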
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	extern int lfs_dostats;
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is active\n", segnum));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is already clean\n", segnum));
		brelse(bp, 0);
		return (EALREADY);
	}

	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	mutex_enter(&lfs_lock);
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	mutex_exit(&lfs_lock);
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	mutex_enter(&lfs_lock);
	cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
	wakeup(&fs->lfs_avail);
	mutex_exit(&lfs_lock);
	(void) LFS_BWRITE_LOG(bp);

	if (lfs_dostats)
		++lfs_stats.segs_reclaimed;

	return (0);
}

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * may be specified, which will wake the cleaner automatically.  An fsid of -1
 * means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error;

	KERNEL_LOCK(1, NULL);
	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	timeout = tvtohz(tv);
	error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
	KERNEL_UNLOCK_ONE(NULL);
	return (error == ERESTART ? EINTR : 0);
}
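
/*
 * Illustrative sketch (hypothetical caller): wait up to one second for
 * the next segment write on a known file system.  lfs_segwait() passes
 * tvtohz(tv) to tsleep(), so a zero timeval sleeps until the wakeup
 * arrives (see the XXX above).
 *
 *	struct timeval tv = { 1, 0 };
 *	error = lfs_segwait(&fsid, &tv);
 */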

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 *  0 on success
 *  1 on timeout
 * -1/errno is returned on error.
 */
int
sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * We lfs_vref, and it is the caller's responsibility to lfs_vunref
 * when finished.
 */

int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{
	struct vnode *vp;

	mutex_enter(&ufs_ihash_lock);
	if ((vp = ufs_ihashlookup(dev, ino)) != NULL) {
		mutex_enter(vp->v_interlock);
		mutex_exit(&ufs_ihash_lock);
		if (vp->v_iflag & VI_XLOCK) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n",
			      ino));
			lfs_stats.clean_vnlocked++;
			mutex_exit(vp->v_interlock);
			return EAGAIN;
		}
		if (lfs_vref(vp)) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
			      " for ino %d\n", ino));
			lfs_stats.clean_inlocked++;
			return EAGAIN;
		}
	} else {
		mutex_exit(&ufs_ihash_lock);
	}
	*vpp = vp;

	return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp,
    struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.  This prevents possible problems with roll-forward.
	 */
	mutex_enter(&lfs_lock);
	while (fs->lfs_flags & LFS_NOTYET) {
		mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
		    &lfs_lock);
	}
	mutex_exit(&lfs_lock);

	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	/*
	 * getnewvnode(9) will call vfs_busy, which will block if the
	 * filesystem is being unmounted; but umount(9) is waiting for
	 * us because we're already holding the fs busy.
	 * XXXMP
	 */
	if (mp->mnt_iflag & IMNT_UNMOUNT) {
		*vpp = NULL;
		return EDEADLK;
	}
	error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp);
	if (error) {
		*vpp = NULL;
		return (error);
	}

	mutex_enter(&ufs_hashlock);
	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL) {
		mutex_exit(&ufs_hashlock);
		ungetnewvnode(vp);
		return (error);
	}

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	mutex_exit(&ufs_hashlock);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
			      " for ino %d\n", ino));
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			VOP_UNLOCK(vp);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		retries = 0;
	    again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
			      NOCRED, 0, &bp);
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
			      error));
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			VOP_UNLOCK(vp);
			lfs_vunref(vp);
			brelse(bp, 0);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			brelse(bp, BC_INVAL);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
			      " retrying...\n"));
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp, 0);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	mutex_exit(&lfs_lock);
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}