1 /* $NetBSD: ufs_readwrite.c,v 1.94 2009/02/22 20:28:07 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.94 2009/02/22 20:28:07 ad Exp $"); 36 37 #ifdef LFS_READWRITE 38 #define FS struct lfs 39 #define I_FS i_lfs 40 #define READ lfs_read 41 #define READ_S "lfs_read" 42 #define WRITE lfs_write 43 #define WRITE_S "lfs_write" 44 #define fs_bsize lfs_bsize 45 #define fs_bmask lfs_bmask 46 #define UFS_WAPBL_BEGIN(mp) 0 47 #define UFS_WAPBL_END(mp) do { } while (0) 48 #define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0) 49 #else 50 #define FS struct fs 51 #define I_FS i_fs 52 #define READ ffs_read 53 #define READ_S "ffs_read" 54 #define WRITE ffs_write 55 #define WRITE_S "ffs_write" 56 #endif 57 58 /* 59 * Vnode op for reading. 60 */ 61 /* ARGSUSED */ 62 int 63 READ(void *v) 64 { 65 struct vop_read_args /* { 66 struct vnode *a_vp; 67 struct uio *a_uio; 68 int a_ioflag; 69 kauth_cred_t a_cred; 70 } */ *ap = v; 71 struct vnode *vp; 72 struct inode *ip; 73 struct uio *uio; 74 struct ufsmount *ump; 75 struct buf *bp; 76 FS *fs; 77 vsize_t bytelen; 78 daddr_t lbn, nextlbn; 79 off_t bytesinfile; 80 long size, xfersize, blkoffset; 81 int error, ioflag; 82 bool usepc = false; 83 84 vp = ap->a_vp; 85 ip = VTOI(vp); 86 ump = ip->i_ump; 87 uio = ap->a_uio; 88 ioflag = ap->a_ioflag; 89 error = 0; 90 91 #ifdef DIAGNOSTIC 92 if (uio->uio_rw != UIO_READ) 93 panic("%s: mode", READ_S); 94 95 if (vp->v_type == VLNK) { 96 if (ip->i_size < ump->um_maxsymlinklen || 97 (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) 98 panic("%s: short symlink", READ_S); 99 } else if (vp->v_type != VREG && vp->v_type != VDIR) 100 panic("%s: type %d", READ_S, vp->v_type); 101 #endif 102 fs = ip->I_FS; 103 if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize) 104 return (EFBIG); 105 if (uio->uio_resid == 0) 106 return (0); 107 108 #ifndef LFS_READWRITE 109 if ((ip->i_flags & SF_SNAPSHOT)) 110 return ffs_snapshot_read(vp, uio, ioflag); 111 #endif /* !LFS_READWRITE */ 112 113 fstrans_start(vp->v_mount, FSTRANS_SHARED); 114 115 if (uio->uio_offset >= ip->i_size) 116 goto out; 117 118 #ifdef LFS_READWRITE 119 usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM); 120 #else /* !LFS_READWRITE */ 121 usepc = vp->v_type == VREG; 122 #endif /* !LFS_READWRITE */ 123 if (usepc) { 124 const int advice = IO_ADV_DECODE(ap->a_ioflag); 125 126 while (uio->uio_resid > 0) { 127 if (ioflag & IO_DIRECT) { 128 genfs_directio(vp, uio, ioflag); 129 } 130 bytelen = MIN(ip->i_size - uio->uio_offset, 131 uio->uio_resid); 132 if (bytelen == 0) 133 break; 134 error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, 135 UBC_READ | UBC_PARTIALOK | 136 (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0)); 137 if (error) 138 break; 139 } 140 goto out; 141 } 142 143 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 144 bytesinfile = ip->i_size - uio->uio_offset; 145 if (bytesinfile <= 0) 146 break; 147 lbn = lblkno(fs, uio->uio_offset); 148 nextlbn = lbn + 1; 149 size = blksize(fs, ip, lbn); 150 blkoffset = blkoff(fs, uio->uio_offset); 151 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), 152 bytesinfile); 153 154 if (lblktosize(fs, nextlbn) >= ip->i_size) 155 error = bread(vp, lbn, size, NOCRED, 0, &bp); 156 else { 157 int nextsize = blksize(fs, ip, nextlbn); 158 error = breadn(vp, lbn, 159 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); 160 } 161 if (error) 162 break; 163 164 /* 165 * We should only get non-zero b_resid when an I/O error 166 * has occurred, which should cause us to break above. 167 * However, if the short read did not cause an error, 168 * then we want to ensure that we do not uiomove bad 169 * or uninitialized data. 170 */ 171 size -= bp->b_resid; 172 if (size < xfersize) { 173 if (size == 0) 174 break; 175 xfersize = size; 176 } 177 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 178 if (error) 179 break; 180 brelse(bp, 0); 181 } 182 if (bp != NULL) 183 brelse(bp, 0); 184 185 out: 186 if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { 187 ip->i_flag |= IN_ACCESS; 188 if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) { 189 error = UFS_WAPBL_BEGIN(vp->v_mount); 190 if (error) { 191 fstrans_done(vp->v_mount); 192 return error; 193 } 194 error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); 195 UFS_WAPBL_END(vp->v_mount); 196 } 197 } 198 199 fstrans_done(vp->v_mount); 200 return (error); 201 } 202 203 /* 204 * Vnode op for writing. 205 */ 206 int 207 WRITE(void *v) 208 { 209 struct vop_write_args /* { 210 struct vnode *a_vp; 211 struct uio *a_uio; 212 int a_ioflag; 213 kauth_cred_t a_cred; 214 } */ *ap = v; 215 struct vnode *vp; 216 struct uio *uio; 217 struct inode *ip; 218 FS *fs; 219 struct buf *bp; 220 struct lwp *l; 221 kauth_cred_t cred; 222 daddr_t lbn; 223 off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize; 224 int blkoffset, error, flags, ioflag, resid, size, xfersize; 225 int aflag; 226 int extended=0; 227 vsize_t bytelen; 228 bool async; 229 bool usepc = false; 230 #ifdef LFS_READWRITE 231 bool need_unreserve = false; 232 #endif 233 struct ufsmount *ump; 234 235 cred = ap->a_cred; 236 ioflag = ap->a_ioflag; 237 uio = ap->a_uio; 238 vp = ap->a_vp; 239 ip = VTOI(vp); 240 ump = ip->i_ump; 241 242 KASSERT(vp->v_size == ip->i_size); 243 #ifdef DIAGNOSTIC 244 if (uio->uio_rw != UIO_WRITE) 245 panic("%s: mode", WRITE_S); 246 #endif 247 248 switch (vp->v_type) { 249 case VREG: 250 if (ioflag & IO_APPEND) 251 uio->uio_offset = ip->i_size; 252 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) 253 return (EPERM); 254 /* FALLTHROUGH */ 255 case VLNK: 256 break; 257 case VDIR: 258 if ((ioflag & IO_SYNC) == 0) 259 panic("%s: nonsync dir write", WRITE_S); 260 break; 261 default: 262 panic("%s: type", WRITE_S); 263 } 264 265 fs = ip->I_FS; 266 if (uio->uio_offset < 0 || 267 (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize) 268 return (EFBIG); 269 #ifdef LFS_READWRITE 270 /* Disallow writes to the Ifile, even if noschg flag is removed */ 271 /* XXX can this go away when the Ifile is no longer in the namespace? */ 272 if (vp == fs->lfs_ivnode) 273 return (EPERM); 274 #endif 275 /* 276 * Maybe this should be above the vnode op call, but so long as 277 * file servers have no limits, I don't think it matters. 278 */ 279 l = curlwp; 280 if (vp->v_type == VREG && l && 281 uio->uio_offset + uio->uio_resid > 282 l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 283 mutex_enter(proc_lock); 284 psignal(l->l_proc, SIGXFSZ); 285 mutex_exit(proc_lock); 286 return (EFBIG); 287 } 288 if (uio->uio_resid == 0) 289 return (0); 290 291 fstrans_start(vp->v_mount, FSTRANS_SHARED); 292 293 flags = ioflag & IO_SYNC ? B_SYNC : 0; 294 async = vp->v_mount->mnt_flag & MNT_ASYNC; 295 origoff = uio->uio_offset; 296 resid = uio->uio_resid; 297 osize = ip->i_size; 298 error = 0; 299 300 usepc = vp->v_type == VREG; 301 302 if ((ioflag & IO_JOURNALLOCKED) == 0) { 303 error = UFS_WAPBL_BEGIN(vp->v_mount); 304 if (error) { 305 fstrans_done(vp->v_mount); 306 return error; 307 } 308 } 309 310 #ifdef LFS_READWRITE 311 async = true; 312 lfs_check(vp, LFS_UNUSED_LBN, 0); 313 #endif /* !LFS_READWRITE */ 314 if (!usepc) 315 goto bcache; 316 317 preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset))); 318 aflag = ioflag & IO_SYNC ? B_SYNC : 0; 319 nsize = MAX(osize, uio->uio_offset + uio->uio_resid); 320 endallocoff = nsize - blkoff(fs, nsize); 321 322 /* 323 * if we're increasing the file size, deal with expanding 324 * the fragment if there is one. 325 */ 326 327 if (nsize > osize && lblkno(fs, osize) < NDADDR && 328 lblkno(fs, osize) != lblkno(fs, nsize) && 329 blkroundup(fs, osize) != osize) { 330 off_t eob; 331 332 eob = blkroundup(fs, osize); 333 uvm_vnp_setwritesize(vp, eob); 334 error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag); 335 if (error) 336 goto out; 337 if (flags & B_SYNC) { 338 mutex_enter(&vp->v_interlock); 339 VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask), 340 round_page(eob), 341 PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); 342 } 343 } 344 345 while (uio->uio_resid > 0) { 346 int ubc_flags = UBC_WRITE; 347 bool overwrite; /* if we're overwrite a whole block */ 348 off_t newoff; 349 350 if (ioflag & IO_DIRECT) { 351 genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED); 352 } 353 354 oldoff = uio->uio_offset; 355 blkoffset = blkoff(fs, uio->uio_offset); 356 bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); 357 if (bytelen == 0) { 358 break; 359 } 360 361 /* 362 * if we're filling in a hole, allocate the blocks now and 363 * initialize the pages first. if we're extending the file, 364 * we can safely allocate blocks without initializing pages 365 * since the new blocks will be inaccessible until the write 366 * is complete. 367 */ 368 overwrite = uio->uio_offset >= preallocoff && 369 uio->uio_offset < endallocoff; 370 if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 && 371 blkoff(fs, uio->uio_offset) == 0 && 372 (uio->uio_offset & PAGE_MASK) == 0) { 373 vsize_t len; 374 375 len = trunc_page(bytelen); 376 len -= blkoff(fs, len); 377 if (len > 0) { 378 overwrite = true; 379 bytelen = len; 380 } 381 } 382 383 newoff = oldoff + bytelen; 384 if (vp->v_size < newoff) { 385 uvm_vnp_setwritesize(vp, newoff); 386 } 387 388 if (!overwrite) { 389 error = ufs_balloc_range(vp, uio->uio_offset, bytelen, 390 cred, aflag); 391 if (error) 392 break; 393 } else { 394 genfs_node_wrlock(vp); 395 error = GOP_ALLOC(vp, uio->uio_offset, bytelen, 396 aflag, cred); 397 genfs_node_unlock(vp); 398 if (error) 399 break; 400 ubc_flags |= UBC_FAULTBUSY; 401 } 402 403 /* 404 * copy the data. 405 */ 406 407 ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0; 408 error = ubc_uiomove(&vp->v_uobj, uio, bytelen, 409 IO_ADV_DECODE(ioflag), ubc_flags); 410 411 /* 412 * update UVM's notion of the size now that we've 413 * copied the data into the vnode's pages. 414 * 415 * we should update the size even when uiomove failed. 416 */ 417 418 if (vp->v_size < newoff) { 419 uvm_vnp_setsize(vp, newoff); 420 extended = 1; 421 } 422 423 if (error) 424 break; 425 426 /* 427 * flush what we just wrote if necessary. 428 * XXXUBC simplistic async flushing. 429 */ 430 431 #ifndef LFS_READWRITE 432 if (!async && oldoff >> 16 != uio->uio_offset >> 16) { 433 mutex_enter(&vp->v_interlock); 434 error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, 435 (uio->uio_offset >> 16) << 16, 436 PGO_CLEANIT | PGO_JOURNALLOCKED); 437 if (error) 438 break; 439 } 440 #endif 441 } 442 if (error == 0 && ioflag & IO_SYNC) { 443 mutex_enter(&vp->v_interlock); 444 error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask), 445 round_page(blkroundup(fs, uio->uio_offset)), 446 PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); 447 } 448 goto out; 449 450 bcache: 451 mutex_enter(&vp->v_interlock); 452 VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid), 453 PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED); 454 while (uio->uio_resid > 0) { 455 lbn = lblkno(fs, uio->uio_offset); 456 blkoffset = blkoff(fs, uio->uio_offset); 457 xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid); 458 if (fs->fs_bsize > xfersize) 459 flags |= B_CLRBUF; 460 else 461 flags &= ~B_CLRBUF; 462 463 #ifdef LFS_READWRITE 464 error = lfs_reserve(fs, vp, NULL, 465 btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); 466 if (error) 467 break; 468 need_unreserve = true; 469 #endif 470 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 471 ap->a_cred, flags, &bp); 472 473 if (error) 474 break; 475 if (uio->uio_offset + xfersize > ip->i_size) { 476 ip->i_size = uio->uio_offset + xfersize; 477 DIP_ASSIGN(ip, size, ip->i_size); 478 uvm_vnp_setsize(vp, ip->i_size); 479 extended = 1; 480 } 481 size = blksize(fs, ip, lbn) - bp->b_resid; 482 if (xfersize > size) 483 xfersize = size; 484 485 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 486 487 /* 488 * if we didn't clear the block and the uiomove failed, 489 * the buf will now contain part of some other file, 490 * so we need to invalidate it. 491 */ 492 if (error && (flags & B_CLRBUF) == 0) { 493 brelse(bp, BC_INVAL); 494 break; 495 } 496 #ifdef LFS_READWRITE 497 (void)VOP_BWRITE(bp); 498 lfs_reserve(fs, vp, NULL, 499 -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); 500 need_unreserve = false; 501 #else 502 if (ioflag & IO_SYNC) 503 (void)bwrite(bp); 504 else if (xfersize + blkoffset == fs->fs_bsize) 505 bawrite(bp); 506 else 507 bdwrite(bp); 508 #endif 509 if (error || xfersize == 0) 510 break; 511 } 512 #ifdef LFS_READWRITE 513 if (need_unreserve) { 514 lfs_reserve(fs, vp, NULL, 515 -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift)); 516 } 517 #endif 518 519 /* 520 * If we successfully wrote any data, and we are not the superuser 521 * we clear the setuid and setgid bits as a precaution against 522 * tampering. 523 */ 524 out: 525 ip->i_flag |= IN_CHANGE | IN_UPDATE; 526 if (resid > uio->uio_resid && ap->a_cred && 527 kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) { 528 ip->i_mode &= ~(ISUID | ISGID); 529 DIP_ASSIGN(ip, mode, ip->i_mode); 530 } 531 if (resid > uio->uio_resid) 532 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); 533 if (error) { 534 (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred); 535 uio->uio_offset -= resid - uio->uio_resid; 536 uio->uio_resid = resid; 537 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) 538 error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); 539 else 540 UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); 541 KASSERT(vp->v_size == ip->i_size); 542 if ((ioflag & IO_JOURNALLOCKED) == 0) 543 UFS_WAPBL_END(vp->v_mount); 544 fstrans_done(vp->v_mount); 545 546 return (error); 547 } 548