/*	$NetBSD: ufs_readwrite.c,v 1.67 2006/03/01 12:38:33 yamt Exp $	*/

/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_readwrite.c,v 1.67 2006/03/01 12:38:33 yamt Exp $");

#ifdef LFS_READWRITE
#define	BLKSIZE(a, b, c)	blksize(a, b, c)
#define	FS			struct lfs
#define	I_FS			i_lfs
#define	READ			lfs_read
#define	READ_S			"lfs_read"
#define	WRITE			lfs_write
#define	WRITE_S			"lfs_write"
#define	fs_bsize		lfs_bsize
#define	fs_bmask		lfs_bmask
#else
#define	BLKSIZE(a, b, c)	blksize(a, b, c)
#define	FS			struct fs
#define	I_FS			i_fs
#define	READ			ffs_read
#define	READ_S			"ffs_read"
#define	WRITE			ffs_write
#define	WRITE_S			"ffs_write"
#endif
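/*
 * Note on the LFS_READWRITE mechanism above: this file is not compiled
 * on its own.  In the NetBSD tree it is textually included by the FFS
 * and LFS vnode-operation sources, which is how one body of code can
 * provide both ffs_read/ffs_write and lfs_read/lfs_write.  A sketch of
 * the expected usage (the exact include paths are an assumption, from
 * memory of the tree layout):
 *
 *	// in ffs/ffs_vnops.c:
 *	#include <ufs/ufs/ufs_readwrite.c>
 *
 *	// in lfs/lfs_vnops.c:
 *	#define LFS_READWRITE
 *	#include <ufs/ufs/ufs_readwrite.c>
 */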
/*
 * Vnode op for reading.
 */
/* ARGSUSED */
int
READ(void *v)
{
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct ufsmount *ump;
	struct buf *bp;
	FS *fs;
	void *win;
	vsize_t bytelen;
	daddr_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, flags;
	boolean_t usepc = FALSE;

	vp = ap->a_vp;
	ip = VTOI(vp);
	ump = ip->i_ump;
	uio = ap->a_uio;
	error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("%s: mode", READ_S);

	if (vp->v_type == VLNK) {
		if (ip->i_size < ump->um_maxsymlinklen ||
		    (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0))
			panic("%s: short symlink", READ_S);
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("%s: type %d", READ_S, vp->v_type);
#endif
	fs = ip->I_FS;
	if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset >= ip->i_size)
		goto out;

#ifdef LFS_READWRITE
	usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM);
#else /* !LFS_READWRITE */
	usepc = vp->v_type == VREG;
#endif /* !LFS_READWRITE */
	if (usepc) {
		const int advice = IO_ADV_DECODE(ap->a_ioflag);

		while (uio->uio_resid > 0) {
			bytelen = MIN(ip->i_size - uio->uio_offset,
			    uio->uio_resid);
			if (bytelen == 0)
				break;

			win = ubc_alloc(&vp->v_uobj, uio->uio_offset,
			    &bytelen, advice, UBC_READ);
			error = uiomove(win, bytelen, uio);
			flags = UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
			ubc_release(win, flags);
			if (error)
				break;
		}
		goto out;
	}

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = ip->i_size - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = BLKSIZE(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		if (lblktosize(fs, nextlbn) >= ip->i_size)
			error = bread(vp, lbn, size, NOCRED, &bp);
		else {
			int nextsize = BLKSIZE(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp);
	}
	if (bp != NULL)
		brelse(bp);

 out:
	if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
		ip->i_flag |= IN_ACCESS;
		if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
			error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
	}
	return (error);
}
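/*
 * Summary of the read path above: regular-file reads go through the
 * unified buffer cache by mapping a kernel window over the vnode's
 * pages with ubc_alloc(), copying into the caller's buffer with
 * uiomove(), and unmapping with ubc_release().  Directories, symlinks,
 * and the LFS Ifile fall through to the traditional buffer cache loop,
 * which also starts one block of read-ahead via breadn() whenever the
 * next logical block still lies within the file.
 */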
/*
 * Vnode op for writing.
 */
int
WRITE(void *v)
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct genfs_node *gp;
	FS *fs;
	struct buf *bp;
	struct lwp *l;
	struct ucred *cred;
	daddr_t lbn;
	off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;
	int aflag;
	int ubc_alloc_flags, ubc_release_flags;
	int extended = 0;
	void *win;
	vsize_t bytelen;
	boolean_t async;
	boolean_t usepc = FALSE;
#ifdef LFS_READWRITE
	boolean_t need_unreserve = FALSE;
#endif
	struct ufsmount *ump;

	cred = ap->a_cred;
	ioflag = ap->a_ioflag;
	uio = ap->a_uio;
	vp = ap->a_vp;
	ip = VTOI(vp);
	gp = VTOG(vp);
	ump = ip->i_ump;

	KASSERT(vp->v_size == ip->i_size);
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("%s: mode", WRITE_S);
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		if ((ioflag & IO_SYNC) == 0)
			panic("%s: nonsync dir write", WRITE_S);
		break;
	default:
		panic("%s: type", WRITE_S);
	}

	fs = ip->I_FS;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
		return (EFBIG);
#ifdef LFS_READWRITE
	/* Disallow writes to the Ifile, even if the noschg flag is removed */
	/* XXX can this go away when the Ifile is no longer in the namespace? */
	if (vp == fs->lfs_ivnode)
		return (EPERM);
#endif

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	l = curlwp;
	if (vp->v_type == VREG && l &&
	    uio->uio_offset + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(l->l_proc, SIGXFSZ);
		return (EFBIG);
	}
	if (uio->uio_resid == 0)
		return (0);

	flags = ioflag & IO_SYNC ? B_SYNC : 0;
	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	origoff = uio->uio_offset;
	resid = uio->uio_resid;
	osize = ip->i_size;
	error = 0;

	usepc = vp->v_type == VREG;
#ifdef LFS_READWRITE
	async = TRUE;
	lfs_check(vp, LFS_UNUSED_LBN, 0);
#endif /* LFS_READWRITE */
	if (!usepc)
		goto bcache;

	preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset)));
	aflag = ioflag & IO_SYNC ? B_SYNC : 0;
	nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
	endallocoff = nsize - blkoff(fs, nsize);
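	/*
	 * Layout of the offsets computed above: preallocoff is the
	 * page-rounded, block-rounded end of whichever is larger, the
	 * old end of file or the starting write offset; endallocoff is
	 * the new size rounded down to a block boundary.  Writes in
	 * [preallocoff, endallocoff) therefore consist only of whole,
	 * newly allocated blocks, so their pages need not be
	 * initialized before the copy:
	 *
	 *	0 ...... osize ... preallocoff ...... endallocoff .. nsize
	 *	| existing blocks  |  whole new blocks  | partial tail |
	 */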
	/*
	 * if we're increasing the file size, deal with expanding
	 * the fragment if there is one.
	 */

	if (nsize > osize && lblkno(fs, osize) < NDADDR &&
	    lblkno(fs, osize) != lblkno(fs, nsize) &&
	    blkroundup(fs, osize) != osize) {
		off_t eob;

		eob = blkroundup(fs, osize);
		error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
		if (error)
			goto out;
		if (flags & B_SYNC) {
			vp->v_size = eob;
			simple_lock(&vp->v_interlock);
			VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
			    round_page(eob), PGO_CLEANIT | PGO_SYNCIO);
		}
	}

	ubc_alloc_flags = UBC_WRITE;
	while (uio->uio_resid > 0) {
		boolean_t extending; /* if we're extending a whole block */
		off_t newoff;

		oldoff = uio->uio_offset;
		blkoffset = blkoff(fs, uio->uio_offset);
		bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);

		/*
		 * if we're filling in a hole, allocate the blocks now and
		 * initialize the pages first.  if we're extending the file,
		 * we can safely allocate blocks without initializing pages
		 * since the new blocks will be inaccessible until the write
		 * is complete.
		 */
		extending = uio->uio_offset >= preallocoff &&
		    uio->uio_offset < endallocoff;

		if (!extending) {
			error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
			    cred, aflag);
			if (error)
				break;
			ubc_alloc_flags &= ~UBC_FAULTBUSY;
		} else {
			lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL);
			error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
			    aflag, cred);
			lockmgr(&gp->g_glock, LK_RELEASE, NULL);
			if (error)
				break;
			ubc_alloc_flags |= UBC_FAULTBUSY;
		}

		/*
		 * copy the data.
		 */

		win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
		    UVM_ADV_NORMAL, ubc_alloc_flags);
		error = uiomove(win, bytelen, uio);
		if (error && extending) {
			/*
			 * if we haven't initialized the pages yet,
			 * do it now.  it's safe to use memset here
			 * because we just mapped the pages above.
			 */
			memset(win, 0, bytelen);
		}
		ubc_release_flags = UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
		ubc_release(win, ubc_release_flags);

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 *
		 * we should update the size even when uiomove failed.
		 * otherwise ffs_truncate can't flush soft update states.
		 */

		newoff = oldoff + bytelen;
		if (vp->v_size < newoff) {
			uvm_vnp_setsize(vp, newoff);
			extended = 1;
		}

		if (error)
			break;
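		/*
		 * The flush below works in 64kb windows:
		 * "oldoff >> 16 != uio->uio_offset >> 16" is true exactly
		 * when this iteration's copy crossed a 64kb boundary, and
		 * the shifted bounds passed to VOP_PUTPAGES cover every
		 * complete 64kb window behind the current offset.
		 */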
		/*
		 * flush what we just wrote if necessary.
		 * XXXUBC simplistic async flushing.
		 */

		if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
			simple_lock(&vp->v_interlock);
			error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
			    (uio->uio_offset >> 16) << 16, PGO_CLEANIT);
			if (error)
				break;
		}
	}
	if (error == 0 && ioflag & IO_SYNC) {
		simple_lock(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
		    round_page(blkroundup(fs, uio->uio_offset)),
		    PGO_CLEANIT | PGO_SYNCIO);
	}
	goto out;

 bcache:
	simple_lock(&vp->v_interlock);
	VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid),
	    PGO_CLEANIT | PGO_FREE | PGO_SYNCIO);
	while (uio->uio_resid > 0) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
		if (fs->fs_bsize > xfersize)
			flags |= B_CLRBUF;
		else
			flags &= ~B_CLRBUF;

#ifdef LFS_READWRITE
		error = lfs_reserve(fs, vp, NULL,
		    btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
		if (error)
			break;
		need_unreserve = TRUE;
#endif
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);

		if (error)
			break;
		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_ASSIGN(ip, size, ip->i_size);
			uvm_vnp_setsize(vp, ip->i_size);
			extended = 1;
		}
		size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
		if (xfersize > size)
			xfersize = size;

		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);

		/*
		 * if we didn't clear the block and the uiomove failed,
		 * the buf will now contain part of some other file,
		 * so we need to invalidate it.
		 */
		if (error && (flags & B_CLRBUF) == 0) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			break;
		}
#ifdef LFS_READWRITE
		(void)VOP_BWRITE(bp);
		lfs_reserve(fs, vp, NULL,
		    -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
		need_unreserve = FALSE;
#else
		if (ioflag & IO_SYNC)
			(void)bwrite(bp);
		else if (xfersize + blkoffset == fs->fs_bsize)
			bawrite(bp);
		else
			bdwrite(bp);
#endif
		if (error || xfersize == 0)
			break;
	}
#ifdef LFS_READWRITE
	if (need_unreserve) {
		lfs_reserve(fs, vp, NULL,
		    -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
	}
#endif

	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
 out:
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP_ASSIGN(ip, mode, ip->i_mode);
	}
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		(void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred,
		    curlwp);
		uio->uio_offset -= resid - uio->uio_resid;
		uio->uio_resid = resid;
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
		error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
	KASSERT(vp->v_size == ip->i_size);
	return (error);
}
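/*
 * Note on error handling in WRITE above: if any part of the write
 * failed, the file is truncated back to its original size so that no
 * partially written region is exposed, and the uio is rewound to
 * reflect only the bytes actually transferred.  A successful IO_SYNC
 * write additionally flushes the inode itself via UFS_UPDATE.
 */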