1 /* $NetBSD: ufs_readwrite.c,v 1.107 2013/06/23 07:28:37 dholland Exp $ */ 2 3 /*- 4 * Copyright (c) 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 * 31 * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.107 2013/06/23 07:28:37 dholland Exp $"); 36 37 #ifdef LFS_READWRITE 38 #define FS struct lfs 39 #define I_FS i_lfs 40 #define READ lfs_read 41 #define READ_S "lfs_read" 42 #define WRITE lfs_write 43 #define WRITE_S "lfs_write" 44 #define fs_bsize lfs_bsize 45 #define fs_bmask lfs_bmask 46 #define UFS_WAPBL_BEGIN(mp) 0 47 #define UFS_WAPBL_END(mp) do { } while (0) 48 #define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0) 49 #define ufs_blkoff lfs_blkoff 50 #define ufs_blksize lfs_blksize 51 #define ufs_lblkno lfs_lblkno 52 #define ufs_lblktosize lfs_lblktosize 53 #define ufs_blkroundup lfs_blkroundup 54 #else 55 #define FS struct fs 56 #define I_FS i_fs 57 #define READ ffs_read 58 #define READ_S "ffs_read" 59 #define WRITE ffs_write 60 #define WRITE_S "ffs_write" 61 #define ufs_blkoff ffs_blkoff 62 #define ufs_blksize ffs_blksize 63 #define ufs_lblkno ffs_lblkno 64 #define ufs_lblktosize ffs_lblktosize 65 #define ufs_blkroundup ffs_blkroundup 66 #endif 67 68 /* 69 * Vnode op for reading. 
 */
/* ARGSUSED */
int
READ(void *v)
{
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct ufsmount *ump;
	struct buf *bp;
	FS *fs;
	vsize_t bytelen;
	daddr_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, ioflag;
	bool usepc = false;

	vp = ap->a_vp;
	ip = VTOI(vp);
	ump = ip->i_ump;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("%s: mode", READ_S);

	if (vp->v_type == VLNK) {
		/*
		 * A symlink shorter than um_maxsymlinklen should have
		 * been stored in the inode itself (fastlink) and never
		 * be read through here.
		 */
		if (ip->i_size < ump->um_maxsymlinklen ||
		    (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0))
			panic("%s: short symlink", READ_S);
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("%s: type %d", READ_S, vp->v_type);
#endif
	fs = ip->I_FS;
	if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

#ifndef LFS_READWRITE
	/*
	 * Reads from a valid (not invalidated) FFS snapshot bypass the
	 * normal block map and go through the snapshot read path.
	 */
	if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
		return ffs_snapshot_read(vp, uio, ioflag);
#endif /* !LFS_READWRITE */

	fstrans_start(vp->v_mount, FSTRANS_SHARED);

	/* Reading at or past EOF transfers nothing. */
	if (uio->uio_offset >= ip->i_size)
		goto out;

	/*
	 * Regular files are read through the page cache (UBC);
	 * everything else -- directories, symlinks, and under LFS
	 * the Ifile -- goes through the buffer cache loop below.
	 */
#ifdef LFS_READWRITE
	usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM);
#else /* !LFS_READWRITE */
	usepc = vp->v_type == VREG;
#endif /* !LFS_READWRITE */
	if (usepc) {
		const int advice = IO_ADV_DECODE(ap->a_ioflag);

		while (uio->uio_resid > 0) {
			if (ioflag & IO_DIRECT) {
				genfs_directio(vp, uio, ioflag);
			}
			/* Clamp the transfer so we never read past EOF. */
			bytelen = MIN(ip->i_size - uio->uio_offset,
			    uio->uio_resid);
			if (bytelen == 0)
				break;
			error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
			    UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp));
			if (error)
				break;
		}
		goto out;
	}

	/* Buffer-cache read loop: one filesystem block per iteration. */
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = ip->i_size - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = ufs_lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = ufs_blksize(fs, ip, lbn);
		blkoffset = ufs_blkoff(fs, uio->uio_offset);
		/* Transfer at most: rest of block, rest of request, rest of file. */
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		/* Read ahead the next block unless this is the last one. */
		if (ufs_lblktosize(fs, nextlbn) >= ip->i_size)
			error = bread(vp, lbn, size, NOCRED, 0, &bp);
		else {
			int nextsize = ufs_blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp, 0);
	}
	/* Release the buffer still held after an error or short read. */
	if (bp != NULL)
		brelse(bp, 0);

 out:
	/* Mark the access time for update unless mounted noatime. */
	if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
		ip->i_flag |= IN_ACCESS;
		if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) {
			/* Synchronous read: push the timestamp update now. */
			error = UFS_WAPBL_BEGIN(vp->v_mount);
			if (error) {
				fstrans_done(vp->v_mount);
				return error;
			}
			error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
			UFS_WAPBL_END(vp->v_mount);
		}
	}

	fstrans_done(vp->v_mount);
	return (error);
}

/*
 * Vnode op for writing.
 */
int
WRITE(void *v)
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	FS *fs;
	struct buf *bp;
	kauth_cred_t cred;
	daddr_t lbn;
	off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;
	int aflag;
	int extended=0;
	vsize_t bytelen;
	bool async;
	bool usepc = false;
#ifdef LFS_READWRITE
	bool need_unreserve = false;
#endif
	struct ufsmount *ump;

	cred = ap->a_cred;
	ioflag = ap->a_ioflag;
	uio = ap->a_uio;
	vp = ap->a_vp;
	ip = VTOI(vp);
	ump = ip->i_ump;

	KASSERT(vp->v_size == ip->i_size);
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("%s: mode", WRITE_S);
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		/* Append-only files may only be written at EOF. */
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		/* Directory writes come from the kernel and must be sync. */
		if ((ioflag & IO_SYNC) == 0)
			panic("%s: nonsync dir write", WRITE_S);
		break;
	default:
		panic("%s: type", WRITE_S);
	}

	fs = ip->I_FS;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
		return (EFBIG);
#ifdef LFS_READWRITE
	/* Disallow writes to the Ifile, even if noschg flag is removed */
	/* XXX can this go away when the Ifile is no longer in the namespace? */
	if (vp == fs->lfs_ivnode)
		return (EPERM);
#endif
	if (uio->uio_resid == 0)
		return (0);

	fstrans_start(vp->v_mount, FSTRANS_SHARED);

	flags = ioflag & IO_SYNC ? B_SYNC : 0;
	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	/* Remember the starting state so we can roll back on error. */
	origoff = uio->uio_offset;
	resid = uio->uio_resid;
	osize = ip->i_size;
	error = 0;

	/* Regular-file data goes through the page cache (UBC). */
	usepc = vp->v_type == VREG;

	if ((ioflag & IO_JOURNALLOCKED) == 0) {
		error = UFS_WAPBL_BEGIN(vp->v_mount);
		if (error) {
			fstrans_done(vp->v_mount);
			return error;
		}
	}

#ifdef LFS_READWRITE
	/*
	 * LFS behaves as async here; lfs_availwait() presumably blocks
	 * until enough segment space is available for this write --
	 * see the lfs code for the exact semantics.
	 */
	async = true;
	lfs_availwait(fs, btofsb(fs, uio->uio_resid));
	lfs_check(vp, LFS_UNUSED_LBN, 0);
#endif /* !LFS_READWRITE */
	if (!usepc)
		goto bcache;

	/*
	 * [preallocoff, endallocoff) are the offsets (past the old EOF,
	 * page- and block-rounded) at which whole blocks will be
	 * overwritten, so their pages need not be initialized first
	 * (see the "overwrite" logic in the loop below).
	 */
	preallocoff = round_page(ufs_blkroundup(fs, MAX(osize, uio->uio_offset)));
	aflag = ioflag & IO_SYNC ? B_SYNC : 0;
	nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
	endallocoff = nsize - ufs_blkoff(fs, nsize);

	/*
	 * if we're increasing the file size, deal with expanding
	 * the fragment if there is one.
	 */

	if (nsize > osize && ufs_lblkno(fs, osize) < UFS_NDADDR &&
	    ufs_lblkno(fs, osize) != ufs_lblkno(fs, nsize) &&
	    ufs_blkroundup(fs, osize) != osize) {
		off_t eob;

		eob = ufs_blkroundup(fs, osize);
		uvm_vnp_setwritesize(vp, eob);
		error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
		if (error)
			goto out;
		if (flags & B_SYNC) {
			mutex_enter(vp->v_interlock);
			VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
			    round_page(eob),
			    PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
		}
	}

	/* Page-cache write loop: at most one filesystem block per pass. */
	while (uio->uio_resid > 0) {
		int ubc_flags = UBC_WRITE;
		bool overwrite; /* if we're overwrite a whole block */
		off_t newoff;

		if (ioflag & IO_DIRECT) {
			genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
		}

		oldoff = uio->uio_offset;
		blkoffset = ufs_blkoff(fs, uio->uio_offset);
		bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
		if (bytelen == 0) {
			break;
		}

		/*
		 * if we're filling in a hole, allocate the blocks now and
		 * initialize the pages first.  if we're extending the file,
		 * we can safely allocate blocks without initializing pages
		 * since the new blocks will be inaccessible until the write
		 * is complete.
		 */
		overwrite = uio->uio_offset >= preallocoff &&
		    uio->uio_offset < endallocoff;
		if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
		    ufs_blkoff(fs, uio->uio_offset) == 0 &&
		    (uio->uio_offset & PAGE_MASK) == 0) {
			vsize_t len;

			/*
			 * Page- and block-aligned write of one or more
			 * whole blocks: treat as an overwrite too,
			 * trimming bytelen to whole blocks.
			 */
			len = trunc_page(bytelen);
			len -= ufs_blkoff(fs, len);
			if (len > 0) {
				overwrite = true;
				bytelen = len;
			}
		}

		newoff = oldoff + bytelen;
		if (vp->v_size < newoff) {
			uvm_vnp_setwritesize(vp, newoff);
		}

		if (!overwrite) {
			error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
			    cred, aflag);
			if (error)
				break;
		} else {
			/* Allocate blocks without touching pages; the
			 * pages will be busy-faulted by ubc_uiomove. */
			genfs_node_wrlock(vp);
			error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
			    aflag, cred);
			genfs_node_unlock(vp);
			if (error)
				break;
			ubc_flags |= UBC_FAULTBUSY;
		}

		/*
		 * copy the data.
		 */

		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
		    IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp));

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 *
		 * we should update the size even when uiomove failed.
		 */

		if (vp->v_size < newoff) {
			uvm_vnp_setsize(vp, newoff);
			extended = 1;
		}

		if (error)
			break;

		/*
		 * flush what we just wrote if necessary.
		 * XXXUBC simplistic async flushing.
		 */

#ifndef LFS_READWRITE
		/* Flush in 64KB chunks as the write crosses each boundary. */
		if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
			mutex_enter(vp->v_interlock);
			error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
			    (uio->uio_offset >> 16) << 16,
			    PGO_CLEANIT | PGO_JOURNALLOCKED | PGO_LAZY);
			if (error)
				break;
		}
#endif
	}
	if (error == 0 && ioflag & IO_SYNC) {
		/* Synchronous write: push the whole written range to disk. */
		mutex_enter(vp->v_interlock);
		error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
		    round_page(ufs_blkroundup(fs, uio->uio_offset)),
		    PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
	}
	goto out;

 bcache:
	/*
	 * Buffer-cache write path for non-regular files (directories
	 * and symlinks): first flush and free any cached pages over
	 * the range, then write one filesystem block at a time.
	 */
	mutex_enter(vp->v_interlock);
	VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid),
	    PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED);
	while (uio->uio_resid > 0) {
		lbn = ufs_lblkno(fs, uio->uio_offset);
		blkoffset = ufs_blkoff(fs, uio->uio_offset);
		xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
		/* A partial-block write must preserve the rest of the
		 * buffer, so ask for a cleared/filled buffer. */
		if (fs->fs_bsize > xfersize)
			flags |= B_CLRBUF;
		else
			flags &= ~B_CLRBUF;

#ifdef LFS_READWRITE
		/* Reserve worst-case indirect-block space up front. */
		error = lfs_reserve(fs, vp, NULL,
		    btofsb(fs, (UFS_NIADDR + 1) << fs->lfs_bshift));
		if (error)
			break;
		need_unreserve = true;
#endif
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);

		if (error)
			break;
		if (uio->uio_offset + xfersize > ip->i_size) {
			/* Extending the file: update both on-disk and
			 * UVM notions of the size. */
			ip->i_size = uio->uio_offset + xfersize;
			DIP_ASSIGN(ip, size, ip->i_size);
			uvm_vnp_setsize(vp, ip->i_size);
			extended = 1;
		}
		/* Trim the transfer if the buffer came up short. */
		size = ufs_blksize(fs, ip, lbn) - bp->b_resid;
		if (xfersize > size)
			xfersize = size;

		error = uiomove((char *)bp->b_data + blkoffset, blkoffset ? xfersize : xfersize, uio);

		/*
		 * if we didn't clear the block and the uiomove failed,
		 * the buf will now contain part of some other file,
		 * so we need to invalidate it.
		 */
		if (error && (flags & B_CLRBUF) == 0) {
			brelse(bp, BC_INVAL);
			break;
		}
#ifdef LFS_READWRITE
		(void)VOP_BWRITE(bp->b_vp, bp);
		lfs_reserve(fs, vp, NULL,
		    -btofsb(fs, (UFS_NIADDR + 1) << fs->lfs_bshift));
		need_unreserve = false;
#else
		/* Sync write now, full blocks async, partials delayed. */
		if (ioflag & IO_SYNC)
			(void)bwrite(bp);
		else if (xfersize + blkoffset == fs->fs_bsize)
			bawrite(bp);
		else
			bdwrite(bp);
#endif
		if (error || xfersize == 0)
			break;
	}
#ifdef LFS_READWRITE
	/* Drop a reservation left over from an error exit above. */
	if (need_unreserve) {
		lfs_reserve(fs, vp, NULL,
		    -btofsb(fs, (UFS_NIADDR + 1) << fs->lfs_bshift));
	}
#endif

	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
 out:
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if (vp->v_mount->mnt_flag & MNT_RELATIME)
		ip->i_flag |= IN_ACCESS;
	if (resid > uio->uio_resid && ap->a_cred) {
		if (ip->i_mode & ISUID) {
			if (kauth_authorize_vnode(ap->a_cred,
			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) {
				ip->i_mode &= ~ISUID;
				DIP_ASSIGN(ip, mode, ip->i_mode);
			}
		}

		if (ip->i_mode & ISGID) {
			if (kauth_authorize_vnode(ap->a_cred,
			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) {
				ip->i_mode &= ~ISGID;
				DIP_ASSIGN(ip, mode, ip->i_mode);
			}
		}
	}
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		/* On error, undo the write: truncate back to the
		 * original size and rewind the uio. */
		(void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred);
		uio->uio_offset -= resid - uio->uio_resid;
		uio->uio_resid = resid;
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
		error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
	else
		UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
	KASSERT(vp->v_size == ip->i_size);
	if ((ioflag & IO_JOURNALLOCKED) == 0)
		UFS_WAPBL_END(vp->v_mount);
	fstrans_done(vp->v_mount);

	return (error);
}