1 /* $NetBSD: lfs_syscalls.c,v 1.172 2015/10/15 06:15:48 dholland Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Konrad E. Schroder <perseant@hhhh.org>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 /*- 33 * Copyright (c) 1991, 1993, 1994 34 * The Regents of the University of California. All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95 61 */ 62 63 #include <sys/cdefs.h> 64 __KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.172 2015/10/15 06:15:48 dholland Exp $"); 65 66 #ifndef LFS 67 # define LFS /* for prototypes in syscallargs.h */ 68 #endif 69 70 #include <sys/param.h> 71 #include <sys/systm.h> 72 #include <sys/proc.h> 73 #include <sys/buf.h> 74 #include <sys/mount.h> 75 #include <sys/vnode.h> 76 #include <sys/kernel.h> 77 #include <sys/kauth.h> 78 #include <sys/syscallargs.h> 79 80 #include <ufs/lfs/ulfs_inode.h> 81 #include <ufs/lfs/ulfsmount.h> 82 #include <ufs/lfs/ulfs_extern.h> 83 84 #include <ufs/lfs/lfs.h> 85 #include <ufs/lfs/lfs_accessors.h> 86 #include <ufs/lfs/lfs_kernel.h> 87 #include <ufs/lfs/lfs_extern.h> 88 89 static int lfs_fastvget(struct mount *, ino_t, BLOCK_INFO *, int, 90 struct vnode **); 91 static struct buf *lfs_fakebuf(struct lfs *, struct vnode *, daddr_t, 92 size_t, void *); 93 94 /* 95 * sys_lfs_markv: 96 * 97 * This will mark inodes and blocks dirty, so they are written into the log. 98 * It will block until all the blocks have been written. The segment create 99 * time passed in the block_info and inode_info structures is used to decide 100 * if the data is valid for each block (in case some process dirtied a block 101 * or inode that is being cleaned between the determination that a block is 102 * live and the lfs_markv call). 103 * 104 * 0 on success 105 * -1/errno is return on error. 106 */ 107 #ifdef USE_64BIT_SYSCALLS 108 int 109 sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) 110 { 111 /* { 112 syscallarg(fsid_t *) fsidp; 113 syscallarg(struct block_info *) blkiov; 114 syscallarg(int) blkcnt; 115 } */ 116 BLOCK_INFO *blkiov; 117 int blkcnt, error; 118 fsid_t fsid; 119 struct lfs *fs; 120 struct mount *mntp; 121 122 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 123 return (error); 124 125 if ((mntp = vfs_getvfs(&fsid)) == NULL) 126 return (ENOENT); 127 fs = VFSTOULFS(mntp)->um_lfs; 128 129 blkcnt = SCARG(uap, blkcnt); 130 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) 131 return (EINVAL); 132 133 KERNEL_LOCK(1, NULL); 134 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 135 if ((error = copyin(SCARG(uap, blkiov), blkiov, 136 blkcnt * sizeof(BLOCK_INFO))) != 0) 137 goto out; 138 139 if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0) 140 copyout(blkiov, SCARG(uap, blkiov), 141 blkcnt * sizeof(BLOCK_INFO)); 142 out: 143 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 144 KERNEL_UNLOCK_ONE(NULL); 145 return error; 146 } 147 #else 148 int 149 sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval) 150 { 151 /* { 152 syscallarg(fsid_t *) fsidp; 153 syscallarg(struct block_info *) blkiov; 154 syscallarg(int) blkcnt; 155 } */ 156 BLOCK_INFO *blkiov; 157 BLOCK_INFO_15 *blkiov15; 158 int i, blkcnt, error; 159 fsid_t fsid; 160 struct lfs *fs; 161 struct mount *mntp; 162 163 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 164 return (error); 165 166 if ((mntp = vfs_getvfs(&fsid)) == NULL) 167 return (ENOENT); 168 fs = VFSTOULFS(mntp)->um_lfs; 169 170 blkcnt = SCARG(uap, blkcnt); 171 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT) 172 return (EINVAL); 173 174 KERNEL_LOCK(1, NULL); 175 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 176 blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); 177 if ((error = copyin(SCARG(uap, blkiov), blkiov15, 178 blkcnt * sizeof(BLOCK_INFO_15))) != 0) 179 goto out; 180 181 for (i = 0; i < blkcnt; i++) { 182 blkiov[i].bi_inode = blkiov15[i].bi_inode; 183 blkiov[i].bi_lbn = blkiov15[i].bi_lbn; 184 blkiov[i].bi_daddr = blkiov15[i].bi_daddr; 185 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; 186 blkiov[i].bi_version = blkiov15[i].bi_version; 187 blkiov[i].bi_bp = blkiov15[i].bi_bp; 188 blkiov[i].bi_size = blkiov15[i].bi_size; 189 } 190 191 if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0) { 192 for (i = 0; i < blkcnt; i++) { 193 blkiov15[i].bi_inode = blkiov[i].bi_inode; 194 blkiov15[i].bi_lbn = blkiov[i].bi_lbn; 195 blkiov15[i].bi_daddr = blkiov[i].bi_daddr; 196 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; 197 blkiov15[i].bi_version = blkiov[i].bi_version; 198 blkiov15[i].bi_bp = blkiov[i].bi_bp; 199 blkiov15[i].bi_size = blkiov[i].bi_size; 200 } 201 copyout(blkiov15, SCARG(uap, blkiov), 202 blkcnt * sizeof(BLOCK_INFO_15)); 203 } 204 out: 205 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 206 lfs_free(fs, blkiov15, LFS_NB_BLKIOV); 207 KERNEL_UNLOCK_ONE(NULL); 208 return error; 209 } 210 #endif 211 212 #define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS) 213 214 int 215 lfs_markv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, 216 int blkcnt) 217 { 218 BLOCK_INFO *blkp; 219 IFILE *ifp; 220 struct buf *bp; 221 struct inode *ip = NULL; 222 struct lfs *fs; 223 struct mount *mntp; 224 struct ulfsmount *ump; 225 struct vnode *vp; 226 ino_t lastino; 227 daddr_t b_daddr; 228 int cnt, error; 229 int do_again = 0; 230 int numrefed = 0; 231 ino_t maxino; 232 size_t obsize; 233 234 /* number of blocks/inodes that we have already bwrite'ed */ 235 int nblkwritten, ninowritten; 236 237 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, 238 KAUTH_REQ_SYSTEM_LFS_MARKV, NULL, NULL, NULL); 239 if (error) 240 return (error); 241 242 if ((mntp = vfs_getvfs(fsidp)) == NULL) 243 return (ENOENT); 244 245 ump = VFSTOULFS(mntp); 246 fs = ump->um_lfs; 247 248 if (fs->lfs_ronly) 249 return EROFS; 250 251 maxino = (lfs_fragstoblks(fs, lfs_dino_getblocks(fs, VTOI(fs->lfs_ivnode)->i_din)) - 252 lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs); 253 254 cnt = blkcnt; 255 256 if ((error = vfs_busy(mntp, NULL)) != 0) 257 return (error); 258 259 /* 260 * This seglock is just to prevent the fact that we might have to sleep 261 * from allowing the possibility that our blocks might become 262 * invalid. 263 * 264 * It is also important to note here that unless we specify SEGM_CKP, 265 * any Ifile blocks that we might be asked to clean will never get 266 * to the disk. 267 */ 268 lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); 269 270 /* Mark blocks/inodes dirty. */ 271 error = 0; 272 273 /* these were inside the initialization for the for loop */ 274 vp = NULL; 275 lastino = LFS_UNUSED_INUM; 276 nblkwritten = ninowritten = 0; 277 for (blkp = blkiov; cnt--; ++blkp) 278 { 279 /* Bounds-check incoming data, avoid panic for failed VGET */ 280 if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) { 281 error = EINVAL; 282 goto err3; 283 } 284 /* 285 * Get the IFILE entry (only once) and see if the file still 286 * exists. 287 */ 288 if (lastino != blkp->bi_inode) { 289 /* 290 * Finish the old file, if there was one. 291 */ 292 if (vp != NULL) { 293 vput(vp); 294 vp = NULL; 295 numrefed--; 296 } 297 298 /* 299 * Start a new file 300 */ 301 lastino = blkp->bi_inode; 302 303 /* Get the vnode/inode. */ 304 error = lfs_fastvget(mntp, blkp->bi_inode, blkp, 305 LK_EXCLUSIVE | LK_NOWAIT, &vp); 306 if (error) { 307 DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget" 308 " failed with %d (ino %d, segment %d)\n", 309 error, blkp->bi_inode, 310 lfs_dtosn(fs, blkp->bi_daddr))); 311 /* 312 * If we got EAGAIN, that means that the 313 * Inode was locked. This is 314 * recoverable: just clean the rest of 315 * this segment, and let the cleaner try 316 * again with another. (When the 317 * cleaner runs again, this segment will 318 * sort high on the list, since it is 319 * now almost entirely empty.) 320 */ 321 if (error == EAGAIN) { 322 error = 0; 323 do_again++; 324 } else 325 KASSERT(error == ENOENT); 326 KASSERT(vp == NULL); 327 ip = NULL; 328 continue; 329 } 330 331 ip = VTOI(vp); 332 numrefed++; 333 ninowritten++; 334 } else if (vp == NULL) { 335 /* 336 * This can only happen if the vnode is dead (or 337 * in any case we can't get it...e.g., it is 338 * inlocked). Keep going. 339 */ 340 continue; 341 } 342 343 /* Past this point we are guaranteed that vp, ip are valid. */ 344 345 /* Can't clean VU_DIROP directories in case of truncation */ 346 /* XXX - maybe we should mark removed dirs specially? */ 347 if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) { 348 do_again++; 349 continue; 350 } 351 352 /* If this BLOCK_INFO didn't contain a block, keep going. */ 353 if (blkp->bi_lbn == LFS_UNUSED_LBN) { 354 /* XXX need to make sure that the inode gets written in this case */ 355 /* XXX but only write the inode if it's the right one */ 356 if (blkp->bi_inode != LFS_IFILE_INUM) { 357 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); 358 if (lfs_if_getdaddr(fs, ifp) == blkp->bi_daddr) { 359 mutex_enter(&lfs_lock); 360 LFS_SET_UINO(ip, IN_CLEANING); 361 mutex_exit(&lfs_lock); 362 } 363 brelse(bp, 0); 364 } 365 continue; 366 } 367 368 b_daddr = 0; 369 if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) || 370 LFS_DBTOFSB(fs, b_daddr) != blkp->bi_daddr) 371 { 372 if (lfs_dtosn(fs, LFS_DBTOFSB(fs, b_daddr)) == 373 lfs_dtosn(fs, blkp->bi_daddr)) 374 { 375 DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %jx vs %jx\n", 376 (intmax_t)blkp->bi_daddr, (intmax_t)LFS_DBTOFSB(fs, b_daddr))); 377 } 378 do_again++; 379 continue; 380 } 381 382 /* 383 * Check block sizes. The blocks being cleaned come from 384 * disk, so they should have the same size as their on-disk 385 * counterparts. 386 */ 387 if (blkp->bi_lbn >= 0) 388 obsize = lfs_blksize(fs, ip, blkp->bi_lbn); 389 else 390 obsize = lfs_sb_getbsize(fs); 391 /* Check for fragment size change */ 392 if (blkp->bi_lbn >= 0 && blkp->bi_lbn < ULFS_NDADDR) { 393 obsize = ip->i_lfs_fragsize[blkp->bi_lbn]; 394 } 395 if (obsize != blkp->bi_size) { 396 DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %jd wrong" 397 " size (%ld != %d), try again\n", 398 blkp->bi_inode, (intmax_t)blkp->bi_lbn, 399 (long) obsize, blkp->bi_size)); 400 do_again++; 401 continue; 402 } 403 404 /* 405 * If we get to here, then we are keeping the block. If 406 * it is an indirect block, we want to actually put it 407 * in the buffer cache so that it can be updated in the 408 * finish_meta section. If it's not, we need to 409 * allocate a fake buffer so that writeseg can perform 410 * the copyin and write the buffer. 411 */ 412 if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) { 413 /* Data Block */ 414 bp = lfs_fakebuf(fs, vp, blkp->bi_lbn, 415 blkp->bi_size, blkp->bi_bp); 416 /* Pretend we used bread() to get it */ 417 bp->b_blkno = LFS_FSBTODB(fs, blkp->bi_daddr); 418 } else { 419 /* Indirect block or ifile */ 420 if (blkp->bi_size != lfs_sb_getbsize(fs) && 421 ip->i_number != LFS_IFILE_INUM) 422 panic("lfs_markv: partial indirect block?" 423 " size=%d\n", blkp->bi_size); 424 bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0); 425 if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) { 426 /* 427 * The block in question was not found 428 * in the cache; i.e., the block that 429 * getblk() returned is empty. So, we 430 * can (and should) copy in the 431 * contents, because we've already 432 * determined that this was the right 433 * version of this block on disk. 434 * 435 * And, it can't have changed underneath 436 * us, because we have the segment lock. 437 */ 438 error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size); 439 if (error) 440 goto err2; 441 } 442 } 443 if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0) 444 goto err2; 445 446 nblkwritten++; 447 /* 448 * XXX should account indirect blocks and ifile pages as well 449 */ 450 if (nblkwritten + lfs_lblkno(fs, ninowritten * DINOSIZE(fs)) 451 > LFS_MARKV_MAX_BLOCKS) { 452 DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n", 453 nblkwritten, ninowritten)); 454 lfs_segwrite(mntp, SEGM_CLEAN); 455 nblkwritten = ninowritten = 0; 456 } 457 } 458 459 /* 460 * Finish the old file, if there was one 461 */ 462 if (vp != NULL) { 463 vput(vp); 464 vp = NULL; 465 numrefed--; 466 } 467 468 #ifdef DIAGNOSTIC 469 if (numrefed != 0) 470 panic("lfs_markv: numrefed=%d", numrefed); 471 #endif 472 DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n", 473 nblkwritten, ninowritten)); 474 475 /* 476 * The last write has to be SEGM_SYNC, because of calling semantics. 477 * It also has to be SEGM_CKP, because otherwise we could write 478 * over the newly cleaned data contained in a checkpoint, and then 479 * we'd be unhappy at recovery time. 480 */ 481 lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); 482 483 lfs_segunlock(fs); 484 485 vfs_unbusy(mntp, false, NULL); 486 if (error) 487 return (error); 488 else if (do_again) 489 return EAGAIN; 490 491 return 0; 492 493 err2: 494 DLOG((DLOG_CLEAN, "lfs_markv err2\n")); 495 496 /* 497 * XXX we're here because copyin() failed. 498 * XXX it means that we can't trust the cleanerd. too bad. 499 * XXX how can we recover from this? 500 */ 501 502 err3: 503 /* 504 * XXX should do segwrite here anyway? 505 */ 506 507 if (vp != NULL) { 508 vput(vp); 509 vp = NULL; 510 --numrefed; 511 } 512 513 lfs_segunlock(fs); 514 vfs_unbusy(mntp, false, NULL); 515 #ifdef DIAGNOSTIC 516 if (numrefed != 0) 517 panic("lfs_markv: numrefed=%d", numrefed); 518 #endif 519 520 return (error); 521 } 522 523 /* 524 * sys_lfs_bmapv: 525 * 526 * This will fill in the current disk address for arrays of blocks. 527 * 528 * 0 on success 529 * -1/errno is return on error. 530 */ 531 #ifdef USE_64BIT_SYSCALLS 532 int 533 sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) 534 { 535 /* { 536 syscallarg(fsid_t *) fsidp; 537 syscallarg(struct block_info *) blkiov; 538 syscallarg(int) blkcnt; 539 } */ 540 BLOCK_INFO *blkiov; 541 int blkcnt, error; 542 fsid_t fsid; 543 struct lfs *fs; 544 struct mount *mntp; 545 546 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 547 return (error); 548 549 if ((mntp = vfs_getvfs(&fsid)) == NULL) 550 return (ENOENT); 551 fs = VFSTOULFS(mntp)->um_lfs; 552 553 blkcnt = SCARG(uap, blkcnt); 554 #if SIZE_T_MAX <= UINT_MAX 555 if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) 556 return (EINVAL); 557 #endif 558 KERNEL_LOCK(1, NULL); 559 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 560 if ((error = copyin(SCARG(uap, blkiov), blkiov, 561 blkcnt * sizeof(BLOCK_INFO))) != 0) 562 goto out; 563 564 if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0) 565 copyout(blkiov, SCARG(uap, blkiov), 566 blkcnt * sizeof(BLOCK_INFO)); 567 out: 568 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 569 KERNEL_UNLOCK_ONE(NULL); 570 return error; 571 } 572 #else 573 int 574 sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval) 575 { 576 /* { 577 syscallarg(fsid_t *) fsidp; 578 syscallarg(struct block_info *) blkiov; 579 syscallarg(int) blkcnt; 580 } */ 581 BLOCK_INFO *blkiov; 582 BLOCK_INFO_15 *blkiov15; 583 int i, blkcnt, error; 584 fsid_t fsid; 585 struct lfs *fs; 586 struct mount *mntp; 587 588 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 589 return (error); 590 591 if ((mntp = vfs_getvfs(&fsid)) == NULL) 592 return (ENOENT); 593 fs = VFSTOULFS(mntp)->um_lfs; 594 595 blkcnt = SCARG(uap, blkcnt); 596 if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO)) 597 return (EINVAL); 598 KERNEL_LOCK(1, NULL); 599 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV); 600 blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV); 601 if ((error = copyin(SCARG(uap, blkiov), blkiov15, 602 blkcnt * sizeof(BLOCK_INFO_15))) != 0) 603 goto out; 604 605 for (i = 0; i < blkcnt; i++) { 606 blkiov[i].bi_inode = blkiov15[i].bi_inode; 607 blkiov[i].bi_lbn = blkiov15[i].bi_lbn; 608 blkiov[i].bi_daddr = blkiov15[i].bi_daddr; 609 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate; 610 blkiov[i].bi_version = blkiov15[i].bi_version; 611 blkiov[i].bi_bp = blkiov15[i].bi_bp; 612 blkiov[i].bi_size = blkiov15[i].bi_size; 613 } 614 615 if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0) { 616 for (i = 0; i < blkcnt; i++) { 617 blkiov15[i].bi_inode = blkiov[i].bi_inode; 618 blkiov15[i].bi_lbn = blkiov[i].bi_lbn; 619 blkiov15[i].bi_daddr = blkiov[i].bi_daddr; 620 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate; 621 blkiov15[i].bi_version = blkiov[i].bi_version; 622 blkiov15[i].bi_bp = blkiov[i].bi_bp; 623 blkiov15[i].bi_size = blkiov[i].bi_size; 624 } 625 copyout(blkiov15, SCARG(uap, blkiov), 626 blkcnt * sizeof(BLOCK_INFO_15)); 627 } 628 out: 629 lfs_free(fs, blkiov, LFS_NB_BLKIOV); 630 lfs_free(fs, blkiov15, LFS_NB_BLKIOV); 631 KERNEL_UNLOCK_ONE(NULL); 632 return error; 633 } 634 #endif 635 636 int 637 lfs_bmapv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) 638 { 639 BLOCK_INFO *blkp; 640 IFILE *ifp; 641 struct buf *bp; 642 struct inode *ip = NULL; 643 struct lfs *fs; 644 struct mount *mntp; 645 struct ulfsmount *ump; 646 struct vnode *vp; 647 ino_t lastino; 648 daddr_t v_daddr; 649 int cnt, error; 650 int numrefed = 0; 651 652 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, 653 KAUTH_REQ_SYSTEM_LFS_BMAPV, NULL, NULL, NULL); 654 if (error) 655 return (error); 656 657 if ((mntp = vfs_getvfs(fsidp)) == NULL) 658 return (ENOENT); 659 660 if ((error = vfs_busy(mntp, NULL)) != 0) 661 return (error); 662 663 ump = VFSTOULFS(mntp); 664 fs = ump->um_lfs; 665 666 if (fs->lfs_cleaner_thread == NULL) 667 fs->lfs_cleaner_thread = curlwp; 668 KASSERT(fs->lfs_cleaner_thread == curlwp); 669 670 cnt = blkcnt; 671 672 error = 0; 673 674 /* these were inside the initialization for the for loop */ 675 vp = NULL; 676 v_daddr = LFS_UNUSED_DADDR; 677 lastino = LFS_UNUSED_INUM; 678 for (blkp = blkiov; cnt--; ++blkp) 679 { 680 /* 681 * Get the IFILE entry (only once) and see if the file still 682 * exists. 683 */ 684 if (lastino != blkp->bi_inode) { 685 /* 686 * Finish the old file, if there was one. 687 */ 688 if (vp != NULL) { 689 vput(vp); 690 vp = NULL; 691 numrefed--; 692 } 693 694 /* 695 * Start a new file 696 */ 697 lastino = blkp->bi_inode; 698 if (blkp->bi_inode == LFS_IFILE_INUM) 699 v_daddr = lfs_sb_getidaddr(fs); 700 else { 701 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); 702 v_daddr = lfs_if_getdaddr(fs, ifp); 703 brelse(bp, 0); 704 } 705 if (v_daddr == LFS_UNUSED_DADDR) { 706 blkp->bi_daddr = LFS_UNUSED_DADDR; 707 continue; 708 } 709 error = lfs_fastvget(mntp, blkp->bi_inode, NULL, 710 LK_SHARED, &vp); 711 if (error) { 712 DLOG((DLOG_CLEAN, "lfs_bmapv: lfs_fastvget ino" 713 "%d failed with %d", 714 blkp->bi_inode,error)); 715 KASSERT(vp == NULL); 716 continue; 717 } else { 718 KASSERT(VOP_ISLOCKED(vp)); 719 numrefed++; 720 } 721 ip = VTOI(vp); 722 } else if (vp == NULL) { 723 /* 724 * This can only happen if the vnode is dead. 725 * Keep going. Note that we DO NOT set the 726 * bi_addr to anything -- if we failed to get 727 * the vnode, for example, we want to assume 728 * conservatively that all of its blocks *are* 729 * located in the segment in question. 730 * lfs_markv will throw them out if we are 731 * wrong. 732 */ 733 continue; 734 } 735 736 /* Past this point we are guaranteed that vp, ip are valid. */ 737 738 if (blkp->bi_lbn == LFS_UNUSED_LBN) { 739 /* 740 * We just want the inode address, which is 741 * conveniently in v_daddr. 742 */ 743 blkp->bi_daddr = v_daddr; 744 } else { 745 daddr_t bi_daddr; 746 747 error = VOP_BMAP(vp, blkp->bi_lbn, NULL, 748 &bi_daddr, NULL); 749 if (error) 750 { 751 blkp->bi_daddr = LFS_UNUSED_DADDR; 752 continue; 753 } 754 blkp->bi_daddr = LFS_DBTOFSB(fs, bi_daddr); 755 /* Fill in the block size, too */ 756 if (blkp->bi_lbn >= 0) 757 blkp->bi_size = lfs_blksize(fs, ip, blkp->bi_lbn); 758 else 759 blkp->bi_size = lfs_sb_getbsize(fs); 760 } 761 } 762 763 /* 764 * Finish the old file, if there was one. 765 */ 766 if (vp != NULL) { 767 vput(vp); 768 vp = NULL; 769 numrefed--; 770 } 771 772 #ifdef DIAGNOSTIC 773 if (numrefed != 0) 774 panic("lfs_bmapv: numrefed=%d", numrefed); 775 #endif 776 777 vfs_unbusy(mntp, false, NULL); 778 779 return 0; 780 } 781 782 /* 783 * sys_lfs_segclean: 784 * 785 * Mark the segment clean. 786 * 787 * 0 on success 788 * -1/errno is return on error. 789 */ 790 int 791 sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval) 792 { 793 /* { 794 syscallarg(fsid_t *) fsidp; 795 syscallarg(u_long) segment; 796 } */ 797 struct lfs *fs; 798 struct mount *mntp; 799 fsid_t fsid; 800 int error; 801 unsigned long segnum; 802 803 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, 804 KAUTH_REQ_SYSTEM_LFS_SEGCLEAN, NULL, NULL, NULL); 805 if (error) 806 return (error); 807 808 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 809 return (error); 810 if ((mntp = vfs_getvfs(&fsid)) == NULL) 811 return (ENOENT); 812 813 fs = VFSTOULFS(mntp)->um_lfs; 814 segnum = SCARG(uap, segment); 815 816 if ((error = vfs_busy(mntp, NULL)) != 0) 817 return (error); 818 819 KERNEL_LOCK(1, NULL); 820 lfs_seglock(fs, SEGM_PROT); 821 error = lfs_do_segclean(fs, segnum); 822 lfs_segunlock(fs); 823 KERNEL_UNLOCK_ONE(NULL); 824 vfs_unbusy(mntp, false, NULL); 825 return error; 826 } 827 828 /* 829 * Actually mark the segment clean. 830 * Must be called with the segment lock held. 831 */ 832 int 833 lfs_do_segclean(struct lfs *fs, unsigned long segnum) 834 { 835 extern int lfs_dostats; 836 struct buf *bp; 837 CLEANERINFO *cip; 838 SEGUSE *sup; 839 840 if (lfs_dtosn(fs, lfs_sb_getcurseg(fs)) == segnum) { 841 return (EBUSY); 842 } 843 844 LFS_SEGENTRY(sup, fs, segnum, bp); 845 if (sup->su_nbytes) { 846 DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" 847 " %d live bytes\n", segnum, sup->su_nbytes)); 848 brelse(bp, 0); 849 return (EBUSY); 850 } 851 if (sup->su_flags & SEGUSE_ACTIVE) { 852 DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" 853 " segment is active\n", segnum)); 854 brelse(bp, 0); 855 return (EBUSY); 856 } 857 if (!(sup->su_flags & SEGUSE_DIRTY)) { 858 DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:" 859 " segment is already clean\n", segnum)); 860 brelse(bp, 0); 861 return (EALREADY); 862 } 863 864 lfs_sb_addavail(fs, lfs_segtod(fs, 1)); 865 if (sup->su_flags & SEGUSE_SUPERBLOCK) 866 lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_SBPAD)); 867 if (lfs_sb_getversion(fs) > 1 && segnum == 0 && 868 lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD)) 869 lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs)); 870 mutex_enter(&lfs_lock); 871 lfs_sb_addbfree(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) + 872 lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs))); 873 lfs_sb_subdmeta(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) + 874 lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs))); 875 if (lfs_sb_getdmeta(fs) < 0) 876 lfs_sb_setdmeta(fs, 0); 877 mutex_exit(&lfs_lock); 878 sup->su_flags &= ~SEGUSE_DIRTY; 879 LFS_WRITESEGENTRY(sup, fs, segnum, bp); 880 881 LFS_CLEANERINFO(cip, fs, bp); 882 lfs_ci_shiftdirtytoclean(fs, cip, 1); 883 lfs_sb_setnclean(fs, lfs_ci_getclean(fs, cip)); 884 mutex_enter(&lfs_lock); 885 lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs)); 886 lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs) 887 - fs->lfs_ravail - fs->lfs_favail); 888 wakeup(&fs->lfs_availsleep); 889 mutex_exit(&lfs_lock); 890 (void) LFS_BWRITE_LOG(bp); 891 892 if (lfs_dostats) 893 ++lfs_stats.segs_reclaimed; 894 895 return (0); 896 } 897 898 /* 899 * This will block until a segment in file system fsid is written. A timeout 900 * in milliseconds may be specified which will awake the cleaner automatically. 901 * An fsid of -1 means any file system, and a timeout of 0 means forever. 902 */ 903 int 904 lfs_segwait(fsid_t *fsidp, struct timeval *tv) 905 { 906 struct mount *mntp; 907 void *addr; 908 u_long timeout; 909 int error; 910 911 KERNEL_LOCK(1, NULL); 912 if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL) 913 addr = &lfs_allclean_wakeup; 914 else 915 addr = &VFSTOULFS(mntp)->um_lfs->lfs_nextsegsleep; 916 /* 917 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}! 918 * XXX IS THAT WHAT IS INTENDED? 919 */ 920 timeout = tvtohz(tv); 921 error = tsleep(addr, PCATCH | PVFS, "segment", timeout); 922 KERNEL_UNLOCK_ONE(NULL); 923 return (error == ERESTART ? EINTR : 0); 924 } 925 926 /* 927 * sys_lfs_segwait: 928 * 929 * System call wrapper around lfs_segwait(). 930 * 931 * 0 on success 932 * 1 on timeout 933 * -1/errno is return on error. 934 */ 935 int 936 sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap, 937 register_t *retval) 938 { 939 /* { 940 syscallarg(fsid_t *) fsidp; 941 syscallarg(struct timeval *) tv; 942 } */ 943 struct timeval atv; 944 fsid_t fsid; 945 int error; 946 947 /* XXX need we be su to segwait? */ 948 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, 949 KAUTH_REQ_SYSTEM_LFS_SEGWAIT, NULL, NULL, NULL); 950 if (error) 951 return (error); 952 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0) 953 return (error); 954 955 if (SCARG(uap, tv)) { 956 error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval)); 957 if (error) 958 return (error); 959 if (itimerfix(&atv)) 960 return (EINVAL); 961 } else /* NULL or invalid */ 962 atv.tv_sec = atv.tv_usec = 0; 963 return lfs_segwait(&fsid, &atv); 964 } 965 966 /* 967 * VFS_VGET call specialized for the cleaner. If the cleaner is 968 * processing IINFO structures, it may have the ondisk inode already, so 969 * don't go retrieving it again. 970 * 971 * Return the vnode referenced and locked. 972 */ 973 974 static int 975 lfs_fastvget(struct mount *mp, ino_t ino, BLOCK_INFO *blkp, int lk_flags, 976 struct vnode **vpp) 977 { 978 struct ulfsmount *ump; 979 struct lfs *fs; 980 int error; 981 982 ump = VFSTOULFS(mp); 983 fs = ump->um_lfs; 984 fs->lfs_cleaner_hint = blkp; 985 error = vcache_get(mp, &ino, sizeof(ino), vpp); 986 fs->lfs_cleaner_hint = NULL; 987 if (error) 988 return error; 989 error = vn_lock(*vpp, lk_flags); 990 if (error) { 991 if (error == EBUSY) 992 error = EAGAIN; 993 vrele(*vpp); 994 *vpp = NULL; 995 return error; 996 } 997 998 return 0; 999 } 1000 1001 /* 1002 * Make up a "fake" cleaner buffer, copy the data from userland into it. 1003 */ 1004 static struct buf * 1005 lfs_fakebuf(struct lfs *fs, struct vnode *vp, daddr_t lbn, size_t size, void *uaddr) 1006 { 1007 struct buf *bp; 1008 int error; 1009 1010 KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM); 1011 1012 bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN); 1013 error = copyin(uaddr, bp->b_data, size); 1014 if (error) { 1015 lfs_freebuf(fs, bp); 1016 return NULL; 1017 } 1018 KDASSERT(bp->b_iodone == lfs_callback); 1019 1020 #if 0 1021 mutex_enter(&lfs_lock); 1022 ++fs->lfs_iocount; 1023 mutex_exit(&lfs_lock); 1024 #endif 1025 bp->b_bufsize = size; 1026 bp->b_bcount = size; 1027 return (bp); 1028 } 1029