1 /* $NetBSD: ulfs_readwrite.c,v 1.24 2017/06/10 05:29:36 maya Exp $ */ 2 /* from NetBSD: ufs_readwrite.c,v 1.120 2015/04/12 22:48:38 riastradh Exp */ 3 4 /*- 5 * Copyright (c) 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 33 */ 34 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(1, "$NetBSD: ulfs_readwrite.c,v 1.24 2017/06/10 05:29:36 maya Exp $"); 37 38 #ifdef LFS_READWRITE 39 #define FS struct lfs 40 #define I_FS i_lfs 41 #define READ lfs_read 42 #define READ_S "lfs_read" 43 #define WRITE lfs_write 44 #define WRITE_S "lfs_write" 45 #define BUFRD lfs_bufrd 46 #define BUFWR lfs_bufwr 47 #define fs_sb_getbsize(fs) lfs_sb_getbsize(fs) 48 #define fs_bmask lfs_bmask 49 #else 50 #define FS struct fs 51 #define I_FS i_fs 52 #define READ ffs_read 53 #define READ_S "ffs_read" 54 #define WRITE ffs_write 55 #define WRITE_S "ffs_write" 56 #define BUFRD ffs_bufrd 57 #define BUFWR ffs_bufwr 58 #define fs_sb_getbsize(fs) (fs)->fs_bsize 59 #endif 60 61 static int ulfs_post_read_update(struct vnode *, int, int); 62 static int ulfs_post_write_update(struct vnode *, struct uio *, int, 63 kauth_cred_t, off_t, int, int, int); 64 65 /* 66 * Vnode op for reading. 67 */ 68 /* ARGSUSED */ 69 int 70 READ(void *v) 71 { 72 struct vop_read_args /* { 73 struct vnode *a_vp; 74 struct uio *a_uio; 75 int a_ioflag; 76 kauth_cred_t a_cred; 77 } */ *ap = v; 78 struct vnode *vp; 79 struct inode *ip; 80 struct uio *uio; 81 FS *fs; 82 vsize_t bytelen; 83 int error, ioflag, advice; 84 85 vp = ap->a_vp; 86 ip = VTOI(vp); 87 fs = ip->I_FS; 88 uio = ap->a_uio; 89 ioflag = ap->a_ioflag; 90 error = 0; 91 92 KASSERT(uio->uio_rw == UIO_READ); 93 KASSERT(vp->v_type == VREG || vp->v_type == VDIR); 94 95 /* XXX Eliminate me by refusing directory reads from userland. */ 96 if (vp->v_type == VDIR) 97 return BUFRD(vp, uio, ioflag, ap->a_cred); 98 #ifdef LFS_READWRITE 99 /* XXX Eliminate me by using ufs_bufio in lfs. */ 100 if (vp->v_type == VREG && ip->i_number == LFS_IFILE_INUM) 101 return BUFRD(vp, uio, ioflag, ap->a_cred); 102 #endif 103 if ((u_int64_t)uio->uio_offset > fs->um_maxfilesize) 104 return (EFBIG); 105 if (uio->uio_resid == 0) 106 return (0); 107 108 #ifndef LFS_READWRITE 109 if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT) 110 return ffs_snapshot_read(vp, uio, ioflag); 111 #endif /* !LFS_READWRITE */ 112 113 if (uio->uio_offset >= ip->i_size) 114 goto out; 115 116 KASSERT(vp->v_type == VREG); 117 advice = IO_ADV_DECODE(ap->a_ioflag); 118 while (uio->uio_resid > 0) { 119 if (ioflag & IO_DIRECT) { 120 genfs_directio(vp, uio, ioflag); 121 } 122 bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid); 123 if (bytelen == 0) 124 break; 125 error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice, 126 UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp)); 127 if (error) 128 break; 129 } 130 131 out: 132 error = ulfs_post_read_update(vp, ap->a_ioflag, error); 133 return (error); 134 } 135 136 /* 137 * UFS op for reading via the buffer cache 138 */ 139 int 140 BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred) 141 { 142 struct inode *ip; 143 FS *fs; 144 struct buf *bp; 145 daddr_t lbn, nextlbn; 146 off_t bytesinfile; 147 long size, xfersize, blkoffset; 148 int error; 149 150 KASSERT(VOP_ISLOCKED(vp)); 151 KASSERT(vp->v_type == VDIR || vp->v_type == VLNK || 152 vp->v_type == VREG); 153 KASSERT(uio->uio_rw == UIO_READ); 154 155 ip = VTOI(vp); 156 fs = ip->I_FS; 157 error = 0; 158 159 KASSERT(vp->v_type != VLNK || ip->i_size >= fs->um_maxsymlinklen); 160 KASSERT(vp->v_type != VLNK || fs->um_maxsymlinklen != 0 || 161 DIP(ip, blocks) == 0); 162 KASSERT(vp->v_type != VREG || vp == fs->lfs_ivnode); 163 KASSERT(vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM); 164 165 if (uio->uio_offset > fs->um_maxfilesize) 166 return EFBIG; 167 if (uio->uio_resid == 0) 168 return 0; 169 170 #ifndef LFS_READWRITE 171 KASSERT(!ISSET(ip->i_flags, (SF_SNAPSHOT | SF_SNAPINVAL))); 172 #endif 173 174 if (uio->uio_offset >= ip->i_size) 175 goto out; 176 177 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 178 bytesinfile = ip->i_size - uio->uio_offset; 179 if (bytesinfile <= 0) 180 break; 181 lbn = lfs_lblkno(fs, uio->uio_offset); 182 nextlbn = lbn + 1; 183 size = lfs_blksize(fs, ip, lbn); 184 blkoffset = lfs_blkoff(fs, uio->uio_offset); 185 xfersize = MIN(MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid), 186 bytesinfile); 187 188 if (lfs_lblktosize(fs, nextlbn) >= ip->i_size) 189 error = bread(vp, lbn, size, 0, &bp); 190 else { 191 int nextsize = lfs_blksize(fs, ip, nextlbn); 192 error = breadn(vp, lbn, 193 size, &nextlbn, &nextsize, 1, 0, &bp); 194 } 195 if (error) 196 break; 197 198 /* 199 * We should only get non-zero b_resid when an I/O error 200 * has occurred, which should cause us to break above. 201 * However, if the short read did not cause an error, 202 * then we want to ensure that we do not uiomove bad 203 * or uninitialized data. 204 */ 205 size -= bp->b_resid; 206 if (size < xfersize) { 207 if (size == 0) 208 break; 209 xfersize = size; 210 } 211 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 212 if (error) 213 break; 214 brelse(bp, 0); 215 } 216 if (bp != NULL) 217 brelse(bp, 0); 218 219 out: 220 error = ulfs_post_read_update(vp, ioflag, error); 221 return (error); 222 } 223 224 static int 225 ulfs_post_read_update(struct vnode *vp, int ioflag, int oerror) 226 { 227 struct inode *ip = VTOI(vp); 228 int error = oerror; 229 230 if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { 231 ip->i_state |= IN_ACCESS; 232 if ((ioflag & IO_SYNC) == IO_SYNC) { 233 error = lfs_update(vp, NULL, NULL, UPDATE_WAIT); 234 } 235 } 236 237 /* Read error overrides any inode update error. */ 238 if (oerror) 239 error = oerror; 240 return error; 241 } 242 243 /* 244 * Vnode op for writing. 245 */ 246 int 247 WRITE(void *v) 248 { 249 struct vop_write_args /* { 250 struct vnode *a_vp; 251 struct uio *a_uio; 252 int a_ioflag; 253 kauth_cred_t a_cred; 254 } */ *ap = v; 255 struct vnode *vp; 256 struct uio *uio; 257 struct inode *ip; 258 FS *fs; 259 kauth_cred_t cred; 260 off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize; 261 int blkoffset, error, flags, ioflag, resid; 262 int aflag; 263 int extended=0; 264 vsize_t bytelen; 265 bool async; 266 267 cred = ap->a_cred; 268 ioflag = ap->a_ioflag; 269 uio = ap->a_uio; 270 vp = ap->a_vp; 271 ip = VTOI(vp); 272 273 KASSERT(vp->v_size == ip->i_size); 274 KASSERT(uio->uio_rw == UIO_WRITE); 275 KASSERT(vp->v_type == VREG); 276 277 if (ioflag & IO_APPEND) 278 uio->uio_offset = ip->i_size; 279 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) 280 return (EPERM); 281 282 fs = ip->I_FS; 283 if (uio->uio_offset < 0 || 284 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->um_maxfilesize) 285 return (EFBIG); 286 #ifdef LFS_READWRITE 287 /* Disallow writes to the Ifile, even if noschg flag is removed */ 288 /* XXX can this go away when the Ifile is no longer in the namespace? */ 289 if (vp == fs->lfs_ivnode) 290 return (EPERM); 291 #endif 292 if (uio->uio_resid == 0) 293 return (0); 294 295 flags = ioflag & IO_SYNC ? B_SYNC : 0; 296 async = vp->v_mount->mnt_flag & MNT_ASYNC; 297 origoff = uio->uio_offset; 298 resid = uio->uio_resid; 299 osize = ip->i_size; 300 error = 0; 301 302 KASSERT(vp->v_type == VREG); 303 304 #ifdef LFS_READWRITE 305 async = true; 306 lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid)); 307 lfs_check(vp, LFS_UNUSED_LBN, 0); 308 #endif /* !LFS_READWRITE */ 309 310 preallocoff = round_page(lfs_blkroundup(fs, MAX(osize, uio->uio_offset))); 311 aflag = ioflag & IO_SYNC ? B_SYNC : 0; 312 nsize = MAX(osize, uio->uio_offset + uio->uio_resid); 313 endallocoff = nsize - lfs_blkoff(fs, nsize); 314 315 /* 316 * if we're increasing the file size, deal with expanding 317 * the fragment if there is one. 318 */ 319 320 if (nsize > osize && lfs_lblkno(fs, osize) < ULFS_NDADDR && 321 lfs_lblkno(fs, osize) != lfs_lblkno(fs, nsize) && 322 lfs_blkroundup(fs, osize) != osize) { 323 off_t eob; 324 325 eob = lfs_blkroundup(fs, osize); 326 uvm_vnp_setwritesize(vp, eob); 327 error = ulfs_balloc_range(vp, osize, eob - osize, cred, aflag); 328 if (error) 329 goto out; 330 if (flags & B_SYNC) { 331 mutex_enter(vp->v_interlock); 332 VOP_PUTPAGES(vp, trunc_page(osize & lfs_sb_getbmask(fs)), 333 round_page(eob), 334 PGO_CLEANIT | PGO_SYNCIO); 335 } 336 } 337 338 while (uio->uio_resid > 0) { 339 int ubc_flags = UBC_WRITE; 340 bool overwrite; /* if we're overwrite a whole block */ 341 off_t newoff; 342 343 if (ioflag & IO_DIRECT) { 344 genfs_directio(vp, uio, ioflag); 345 } 346 347 oldoff = uio->uio_offset; 348 blkoffset = lfs_blkoff(fs, uio->uio_offset); 349 bytelen = MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid); 350 if (bytelen == 0) { 351 break; 352 } 353 354 /* 355 * if we're filling in a hole, allocate the blocks now and 356 * initialize the pages first. if we're extending the file, 357 * we can safely allocate blocks without initializing pages 358 * since the new blocks will be inaccessible until the write 359 * is complete. 360 */ 361 overwrite = uio->uio_offset >= preallocoff && 362 uio->uio_offset < endallocoff; 363 if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 && 364 lfs_blkoff(fs, uio->uio_offset) == 0 && 365 (uio->uio_offset & PAGE_MASK) == 0) { 366 vsize_t len; 367 368 len = trunc_page(bytelen); 369 len -= lfs_blkoff(fs, len); 370 if (len > 0) { 371 overwrite = true; 372 bytelen = len; 373 } 374 } 375 376 newoff = oldoff + bytelen; 377 if (vp->v_size < newoff) { 378 uvm_vnp_setwritesize(vp, newoff); 379 } 380 381 if (!overwrite) { 382 error = ulfs_balloc_range(vp, uio->uio_offset, bytelen, 383 cred, aflag); 384 if (error) 385 break; 386 } else { 387 genfs_node_wrlock(vp); 388 error = GOP_ALLOC(vp, uio->uio_offset, bytelen, 389 aflag, cred); 390 genfs_node_unlock(vp); 391 if (error) 392 break; 393 ubc_flags |= UBC_FAULTBUSY; 394 } 395 396 /* 397 * copy the data. 398 */ 399 400 error = ubc_uiomove(&vp->v_uobj, uio, bytelen, 401 IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp)); 402 403 /* 404 * update UVM's notion of the size now that we've 405 * copied the data into the vnode's pages. 406 * 407 * we should update the size even when uiomove failed. 408 */ 409 410 if (vp->v_size < newoff) { 411 uvm_vnp_setsize(vp, newoff); 412 extended = 1; 413 } 414 415 if (error) 416 break; 417 418 /* 419 * flush what we just wrote if necessary. 420 * XXXUBC simplistic async flushing. 421 */ 422 423 #ifndef LFS_READWRITE 424 if (!async && oldoff >> 16 != uio->uio_offset >> 16) { 425 mutex_enter(vp->v_interlock); 426 error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, 427 (uio->uio_offset >> 16) << 16, 428 PGO_CLEANIT | PGO_LAZY); 429 if (error) 430 break; 431 } 432 #else 433 __USE(async); 434 #endif 435 } 436 if (error == 0 && ioflag & IO_SYNC) { 437 mutex_enter(vp->v_interlock); 438 error = VOP_PUTPAGES(vp, trunc_page(origoff & lfs_sb_getbmask(fs)), 439 round_page(lfs_blkroundup(fs, uio->uio_offset)), 440 PGO_CLEANIT | PGO_SYNCIO); 441 } 442 443 out: 444 error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid, 445 extended, error); 446 447 return (error); 448 } 449 450 /* 451 * UFS op for writing via the buffer cache 452 */ 453 int 454 BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred) 455 { 456 struct inode *ip; 457 FS *fs; 458 int flags; 459 struct buf *bp; 460 off_t osize; 461 int resid, xfersize, size, blkoffset; 462 daddr_t lbn; 463 int extended=0; 464 int error; 465 #ifdef LFS_READWRITE 466 bool need_unreserve = false; 467 #endif 468 469 KASSERT(ISSET(ioflag, IO_NODELOCKED)); 470 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 471 KASSERT(vp->v_type == VDIR || vp->v_type == VLNK); 472 KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC)); 473 KASSERT(uio->uio_rw == UIO_WRITE); 474 475 ip = VTOI(vp); 476 fs = ip->I_FS; 477 478 KASSERT(vp->v_size == ip->i_size); 479 480 if (uio->uio_offset < 0 || 481 uio->uio_resid > fs->um_maxfilesize || 482 uio->uio_offset > (fs->um_maxfilesize - uio->uio_resid)) 483 return EFBIG; 484 #ifdef LFS_READWRITE 485 KASSERT(vp != fs->lfs_ivnode); 486 #endif 487 if (uio->uio_resid == 0) 488 return 0; 489 490 flags = ioflag & IO_SYNC ? B_SYNC : 0; 491 resid = uio->uio_resid; 492 osize = ip->i_size; 493 error = 0; 494 495 KASSERT(vp->v_type != VREG); 496 497 #ifdef LFS_READWRITE 498 lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid)); 499 lfs_check(vp, LFS_UNUSED_LBN, 0); 500 #endif /* !LFS_READWRITE */ 501 502 /* XXX Should never have pages cached here. */ 503 KASSERT(vp->v_uobj.uo_npages == 0); 504 while (uio->uio_resid > 0) { 505 lbn = lfs_lblkno(fs, uio->uio_offset); 506 blkoffset = lfs_blkoff(fs, uio->uio_offset); 507 xfersize = MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid); 508 if (fs_sb_getbsize(fs) > xfersize) 509 flags |= B_CLRBUF; 510 else 511 flags &= ~B_CLRBUF; 512 513 #ifdef LFS_READWRITE 514 error = lfs_reserve(fs, vp, NULL, 515 lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs))); 516 if (error) 517 break; 518 need_unreserve = true; 519 #endif 520 error = lfs_balloc(vp, uio->uio_offset, xfersize, cred, flags, 521 &bp); 522 523 if (error) 524 break; 525 if (uio->uio_offset + xfersize > ip->i_size) { 526 ip->i_size = uio->uio_offset + xfersize; 527 DIP_ASSIGN(ip, size, ip->i_size); 528 uvm_vnp_setsize(vp, ip->i_size); 529 extended = 1; 530 } 531 size = lfs_blksize(fs, ip, lbn) - bp->b_resid; 532 if (xfersize > size) 533 xfersize = size; 534 535 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 536 537 /* 538 * if we didn't clear the block and the uiomove failed, 539 * the buf will now contain part of some other file, 540 * so we need to invalidate it. 541 */ 542 if (error && (flags & B_CLRBUF) == 0) { 543 brelse(bp, BC_INVAL); 544 break; 545 } 546 #ifdef LFS_READWRITE 547 (void)VOP_BWRITE(bp->b_vp, bp); 548 lfs_reserve(fs, vp, NULL, 549 -lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs))); 550 need_unreserve = false; 551 #else 552 if (ioflag & IO_SYNC) 553 (void)bwrite(bp); 554 else if (xfersize + blkoffset == fs->fs_bsize) 555 bawrite(bp); 556 else 557 bdwrite(bp); 558 #endif 559 if (error || xfersize == 0) 560 break; 561 } 562 #ifdef LFS_READWRITE 563 if (need_unreserve) { 564 lfs_reserve(fs, vp, NULL, 565 -lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs))); 566 } 567 #endif 568 569 error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid, 570 extended, error); 571 572 return (error); 573 } 574 575 static int 576 ulfs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag, 577 kauth_cred_t cred, off_t osize, int resid, int extended, int oerror) 578 { 579 struct inode *ip = VTOI(vp); 580 int error = oerror; 581 582 /* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */ 583 ip->i_state |= IN_CHANGE | IN_UPDATE; 584 if (vp->v_mount->mnt_flag & MNT_RELATIME) 585 ip->i_state |= IN_ACCESS; 586 587 /* 588 * If we successfully wrote any data and we are not the superuser, 589 * we clear the setuid and setgid bits as a precaution against 590 * tampering. 591 */ 592 if (resid > uio->uio_resid && cred) { 593 if (ip->i_mode & ISUID) { 594 if (kauth_authorize_vnode(cred, 595 KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) { 596 ip->i_mode &= ~ISUID; 597 DIP_ASSIGN(ip, mode, ip->i_mode); 598 } 599 } 600 601 if (ip->i_mode & ISGID) { 602 if (kauth_authorize_vnode(cred, 603 KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) { 604 ip->i_mode &= ~ISGID; 605 DIP_ASSIGN(ip, mode, ip->i_mode); 606 } 607 } 608 } 609 610 /* If we successfully wrote anything, notify kevent listeners. */ 611 if (resid > uio->uio_resid) 612 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); 613 614 /* 615 * Update the size on disk: truncate back to original size on 616 * error, or reflect the new size on success. 617 */ 618 if (error) { 619 (void) lfs_truncate(vp, osize, ioflag & IO_SYNC, cred); 620 uio->uio_offset -= resid - uio->uio_resid; 621 uio->uio_resid = resid; 622 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) { 623 error = lfs_update(vp, NULL, NULL, UPDATE_WAIT); 624 } else { 625 /* nothing */ 626 } 627 628 /* Make sure the vnode uvm size matches the inode file size. */ 629 KASSERT(vp->v_size == ip->i_size); 630 631 /* Write error overrides any inode update error. */ 632 if (oerror) 633 error = oerror; 634 return error; 635 } 636