/*	$NetBSD: lfs_syscalls.c,v 1.100 2003/12/04 14:57:47 yamt Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.100 2003/12/04 14:57:47 yamt Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/kernel.h>

#include <sys/sa.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

int debug_cleaner = 0;
int clean_vnlocked = 0;
int clean_inlocked = 0;
int verbose_debug = 0;

pid_t lfs_cleaner_pid = 0;

#define LFS_FORCE_WRITE UNASSIGNED

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
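/*
 * Illustrative sketch (not part of this file): a userland cleaner would
 * normally reach this code through the lfs_markv(2) stub, roughly:
 *
 *	BLOCK_INFO bi[NBLK];	(NBLK is a made-up name for this example)
 *	... fill in bi_inode, bi_lbn, bi_daddr, bi_segcreate, bi_version,
 *	    bi_bp and bi_size for each live block ...
 *	if (lfs_markv(&fsid, bi, NBLK) < 0 && errno == EAGAIN)
 *		... some blocks were skipped; clean this segment again ...
 *
 * The real caller lives in the lfs_cleanerd sources.
 */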
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct proc *p, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
out:
	free(blkiov, M_SEGMENT);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;

	if ((error = suser(l->l_proc->p_ucred, &l->l_proc->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	free(blkiov, M_SEGMENT);
	free(blkiov15, M_SEGMENT);
	return error;
}
#endif

#define	LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp;
#ifdef DEBUG_LFS
	int vputc = 0, iwritten = 0;
#endif
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		  fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	/*
	 * Take the seglock so that, even if we have to sleep here, our
	 * blocks cannot become invalid in the meantime.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

#ifdef DEBUG_LFS
	/* Run through and count the inodes */
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp) {
		if (lastino != blkp->bi_inode) {
			lastino = blkp->bi_inode;
			vputc++;
		}
	}
	cnt = blkcnt;
	printf("[%d/", vputc);
	iwritten = 0;
#endif /* DEBUG_LFS */
	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		if (blkp->bi_daddr == LFS_FORCE_WRITE)
			printf("lfs_markv: warning: force-writing ino %d "
			       "lbn %lld\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn);
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
				if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
					iwritten++;
#endif
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			/* Don't force-write the ifile */
			if (blkp->bi_inode == LFS_IFILE_INUM
			    && blkp->bi_daddr == LFS_FORCE_WRITE)
			{
				continue;
			}
			if (v_daddr == LFS_UNUSED_DADDR
			    && blkp->bi_daddr != LFS_FORCE_WRITE)
			{
				continue;
			}

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
					     &vp,
					     (blkp->bi_lbn == LFS_UNUSED_LBN
					      ? blkp->bi_bp
					      : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
#ifdef DEBUG_LFS
				printf("lfs_markv: lfs_fastvget failed with %d (ino %d, segment %d)\n",
				       error, blkp->bi_inode,
				       dtosn(fs, blkp->bi_daddr));
#endif /* DEBUG_LFS */
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * inlocked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr
				    || blkp->bi_daddr == LFS_FORCE_WRITE)
				{
					LFS_SET_UINO(ip, IN_CLEANING);
				}
				brelse(bp);
			}
			continue;
		}

		b_daddr = 0;
		if (blkp->bi_daddr != LFS_FORCE_WRITE) {
			if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
			    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
			{
				if (dtosn(fs, dbtofsb(fs, b_daddr))
				    == dtosn(fs, blkp->bi_daddr))
				{
					printf("lfs_markv: wrong da same seg: %llx vs %llx\n",
					       (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr));
				}
				do_again++;
				continue;
			}
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			printf("lfs_markv: ino %d lbn %lld wrong size (%ld != %d), try again\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn,
			       (long) obsize, blkp->bi_size);
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				      " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
#ifdef DEBUG_LFS
			printf("lfs_markv: writing %d blks %d inos\n",
			       nblkwritten, ninowritten);
#endif
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
		if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
			iwritten++;
#endif
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	printf("%d]", iwritten);
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

#ifdef DEBUG_LFS
	printf("lfs_markv: writing %d blks %d inos (check point)\n",
	       nblkwritten, ninowritten);
#endif
	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	printf("lfs_markv err2\n");

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
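/*
 * Illustrative sketch (not part of this file): the cleaner typically
 * calls lfs_bmapv(2) first, then compares each returned bi_daddr with
 * the segment being cleaned to decide which blocks are still live, e.g.
 *
 *	if (lfs_bmapv(&fsid, bi, n) == 0)
 *		for (i = 0; i < n; i++)
 *			if (dtosn(fs, bi[i].bi_daddr) == segnum)
 *				... still live; hand it to lfs_markv() ...
 *
 * Here "n", "segnum" and the use of dtosn() in userland are assumptions
 * made for the example only.
 */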
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct proc *p, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
out:
	free(blkiov, M_SEGMENT);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	free(blkiov, M_SEGMENT);
	free(blkiov15, M_SEGMENT);
	return error;
}
#endif

int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

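	/*
	 * For each BLOCK_INFO, translate (bi_inode, bi_lbn) into the
	 * block's current disk address.  Each vnode is referenced at
	 * most once per run of entries with the same inode number,
	 * mirroring the loop structure of lfs_markv() above.
	 */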
	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_flag & VXLOCK)) {
				ip = VTOI(vp);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
#ifdef DEBUG_LFS
					printf("lfs_bmapv: vget of ino %d failed with %d", blkp->bi_inode, error);
#endif
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_bmapv: numrefed=%d", numrefed);
	}
#endif

	vfs_unbusy(mntp);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
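/*
 * Illustrative sketch (not part of this file): once lfs_markv() has
 * rewritten all live data, the cleaner would mark the source segment
 * clean:
 *
 *	if (lfs_segclean(&fsid, segnum) < 0)
 *		... EBUSY: segment still live or active;
 *		    EALREADY: segment was already clean ...
 */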
int
sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segclean_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */ *uap = v;
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;
	struct proc *p = l->l_proc;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	vfs_unbusy(mntp);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		printf("lfs_segclean: not cleaning segment %lu: %d live bytes\n",
		       segnum, sup->su_nbytes);
		brelse(bp);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		brelse(bp);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		brelse(bp);
		return (EALREADY);
	}

	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	cip->avail = fs->lfs_avail - fs->lfs_ravail;
	(void) LFS_BWRITE_LOG(bp);
	wakeup(&fs->lfs_avail);

	return (0);
}

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * in milliseconds may be specified, which will wake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error, s;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	s = splclock();
	timeradd(tv, &time, tv);
	timeout = hzto(tv);
	splx(s);
	error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 *  0 on success
 *  1 on timeout
 * -1/errno is returned on error.
 */
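/*
 * Illustrative sketch (not part of this file): a cleaner main loop might
 * block here between passes.  The ten-minute figure is an arbitrary
 * choice for the example; {0, 0} would sleep forever (see the XXX above).
 *
 *	struct timeval tv = { 600, 0 };
 *	lfs_segwait(&fsid, &tv);
 */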
int
sys_lfs_segwait(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segwait_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
		return (error);
	}
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * we lfs_vref, and it is the caller's responsibility to lfs_vunref
 * when finished.
 */
extern struct lock ufs_hashlock;

int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{

	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */
	if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
		if ((*vpp)->v_flag & VXLOCK) {
			printf("lfs_fastvget: vnode VXLOCKed for ino %d\n",
			       ino);
			clean_vnlocked++;
#ifdef LFS_EAGAIN_FAIL
			return EAGAIN;
#endif
		}
		if (lfs_vref(*vpp)) {
			clean_inlocked++;
			return EAGAIN;
		}
	} else
		*vpp = NULL;

	return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.  This prevents possible problems with roll-forward.
	 */
	while (fs->lfs_flags & LFS_NOTYET) {
		tsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0);
	}
	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
		*vpp = NULL;
		return (error);
	}

	do {
		error = lfs_fasthashget(dev, ino, vpp);
		if (error != 0 || *vpp != NULL) {
			ungetnewvnode(vp);
			return (error);
		}
	} while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	lockmgr(&ufs_hashlock, LK_RELEASE, 0);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			printf("lfs_fastvget: dinode copyin failed for ino %d\n", ino);
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		retries = 0;
	    again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
			      NOCRED, &bp);
		if (error) {
			printf("lfs_fastvget: bread failed with %d\n", error);
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			brelse(bp);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			bp->b_flags |= B_INVAL;
			brelse(bp);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			printf("lfs_fastvget: dinode not found, retrying...\n");
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp, 0);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	++fs->lfs_iocount;
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}