/*	$NetBSD: nfs_bio.c,v 1.175 2008/04/24 15:35:31 ad Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nfs_bio.c,v 1.175 2008/04/24 15:35:31 ad Exp $");

#include "opt_nfs.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/kauth.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern int nfs_commitsize;
extern struct nfsstats nfsstats;

static int nfs_doio_read __P((struct buf *, struct uio *));
static int nfs_doio_write __P((struct buf *, struct uio *));
static int nfs_doio_phys __P((struct buf *, struct uio *));

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag,
    kauth_cred_t cred, int cflag)
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp = NULL, *rabp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	void *baddr;
	int got_buf = 0, error = 0, n = 0, on = 0, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp, *edp, *ep;
	off_t curoff = 0;
	int advice;
	struct lwp *l = curlwp;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, l);
#endif
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);

	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 *
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * nfs_attrtimeo seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */

	if (vp->v_type != VLNK) {
		error = nfs_flushstalebuf(vp, cred, l,
		    NFS_FLUSHSTALEBUF_MYWRITE);
		if (error)
			return error;
	}

	do {
		/*
		 * Don't cache symlinks.
		 */
		if ((vp->v_vflag & VV_ROOT) && vp->v_type == VLNK) {
			return (nfs_readlinkrpc(vp, uio, cred));
		}
		baddr = (void *)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;

			advice = IO_ADV_DECODE(ioflag);
			error = 0;
			while (uio->uio_resid > 0) {
				vsize_t bytelen;

				nfs_delayedtruncate(vp);
				if (np->n_size <= uio->uio_offset) {
					break;
				}
				bytelen = MIN(np->n_size - uio->uio_offset,
				    uio->uio_resid);
				error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
				    advice, UBC_READ | UBC_PARTIALOK |
				    (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
				if (error) {
					/*
					 * XXXkludge
					 * the file has been truncated on the
					 * server.  there isn't much we can do.
					 */
					if (uio->uio_offset >= np->n_size) {
						/* end of file */
						error = 0;
					} else {
						break;
					}
				}
			}
			break;

		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, l);
			if (!bp)
				return (EINTR);
			if ((bp->b_oflags & BO_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp);
				if (error) {
					brelse(bp, 0);
					return (error);
				}
			}
			n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
			    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache.  If we're not translating
				 * 32 <-> 64, it may be a value that was
				 * flushed out of the cache because it grew
				 * too big.  Let the server judge if it's
				 * valid or not.  In the translation case,
				 * we have no way of validating this value,
				 * so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
				    uio->uio_offset, 0, 0);
			}

			if (NFS_EOFVALID(np) &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfs_putdircache(np, ndp);
				nfsstats.direofcache_hits++;
				return (0);
			}

			bp = nfs_getcacheblk(vp, NFSDC_BLKNO(ndp),
			    NFS_DIRBLKSIZ, l);
			if (!bp)
				return (EINTR);
			if ((bp->b_oflags & BO_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp);
				if (error) {
					/*
					 * Yuck! The directory has been modified
					 * on the server.  Punt and let the
					 * userland code deal with it.
					 */
					nfs_putdircache(np, ndp);
					brelse(bp, 0);
					/*
					 * nfs_request maps NFSERR_BAD_COOKIE
					 * to EINVAL.
					 */
					if (error == EINVAL) { /* NFSERR_BAD_COOKIE */
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred, l, 1);
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block.  Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 *
			 * also, empty block implies EOF.
			 */

			if (bp->b_bcount == bp->b_resid ||
			    (NFS_EOFVALID(np) &&
			    ndp->dc_blkcookie == np->n_direofoffset)) {
				KASSERT(bp->b_bcount != bp->b_resid ||
				    ndp->dc_blkcookie == bp->b_dcookie);
				nfs_putdircache(np, ndp);
				brelse(bp, BC_NOCACHE);
				return 0;
			}

			/*
			 * Find the entry we were looking for in the block.
			 */

			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = (struct dirent *)(void *)((char *)bp->b_data +
			    bp->b_bcount - bp->b_resid);
			enn = 0;
			while (enn < en && dp < edp) {
				pdp = dp;
				dp = _DIRENT_NEXT(dp);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the previous
			 * entry doesn't match, the directory cache is
			 * stale.  Flush it and try again (i.e. go to
			 * the server).
			 */
			if (dp >= edp || (struct dirent *)_DIRENT_NEXT(dp) > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
				printf("invalid cache: %p %p %p off %lx %lx\n",
				    pdp, dp, edp,
				    (unsigned long)uio->uio_offset,
				    (unsigned long)NFS_GETCOOKIE(pdp));
#endif
				nfs_putdircache(np, ndp);
				brelse(bp, 0);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, l, 0);
				goto diragain;
			}

			on = (char *)dp - (char *)bp->b_data;

			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us.  The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			if (en == 0 && pdp == dp) {
				dp = _DIRENT_NEXT(dp);
				enn++;
			}

			if (uio->uio_resid < (bp->b_bcount - bp->b_resid - on)) {
				n = uio->uio_resid;
				enough = 1;
			} else
				n = bp->b_bcount - bp->b_resid - on;

			ep = (struct dirent *)(void *)((char *)bp->b_data + on + n);

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
			 */

			while (dp < ep && (struct dirent *)_DIRENT_NEXT(dp) <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp,
					    NFS_GETCOOKIE(pdp), ndp->dc_blkcookie,
					    enn, bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
					nfs_putdircache(np, nndp);
				}
				pdp = dp;
				dp = _DIRENT_NEXT(dp);
				enn++;
			}
			nfs_putdircache(np, ndp);

			/*
			 * If the last requested entry was not the last in the
			 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
			 * cache the cookie of the last requested one, and
			 * set the offset to it.
			 */

			if ((on + n) < bp->b_bcount - bp->b_resid) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff,
				    ndp->dc_blkcookie, enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
				nfs_putdircache(np, nndp);
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie,
			    bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = (char *)_DIRENT_NEXT(pdp) - ((char *)bp->b_data + on);

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have
			 * the directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    !NFS_EOFVALID(np)) {
				rabp = nfs_getcacheblk(vp, NFSDC_BLKNO(nndp),
				    NFS_DIRBLKSIZ, l);
				if (rabp) {
					if ((rabp->b_oflags & (BO_DONE | BO_DELWRI)) == 0) {
						rabp->b_dcookie = nndp->dc_cookie;
						rabp->b_flags |= (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp)) {
							brelse(rabp, BC_INVAL);
						}
					} else
						brelse(rabp, 0);
				}
			}
			nfs_putdircache(np, nndp);
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n", vp->v_type);
			break;
		}

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove((char *)baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			uio->uio_offset = curoff;
			if (enough)
				n = 0;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n", vp->v_type);
		}
		if (got_buf)
			brelse(bp, 0);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(void *v)
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct uio *uio = ap->a_uio;
	struct lwp *l = curlwp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	kauth_cred_t cred = ap->a_cred;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	voff_t oldoff, origoff;
	vsize_t bytelen;
	int error = 0;
	int ioflag = ap->a_ioflag;
	int extended = 0, wrotedata = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, l);
#endif
	if (ioflag & IO_APPEND) {
		NFS_INVALIDATE_ATTRCACHE(np);
		error = nfs_flushstalebuf(vp, cred, l,
		    NFS_FLUSHSTALEBUF_MYWRITE);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (l && l->l_proc && uio->uio_offset + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		mutex_enter(proc_lock);
		psignal(l->l_proc, SIGXFSZ);
		mutex_exit(proc_lock);
		return (EFBIG);
	}

	origoff = uio->uio_offset;
	do {
		bool overwrite;		/* if we are overwriting whole pages */
		u_quad_t oldsize;
		oldoff = uio->uio_offset;
		bytelen = uio->uio_resid;

		nfsstats.biocache_writes++;

		oldsize = np->n_size;
		np->n_flag |= NMODIFIED;
		if (np->n_size < uio->uio_offset + bytelen) {
			np->n_size = uio->uio_offset + bytelen;
		}
		overwrite = false;
		if ((uio->uio_offset & PAGE_MASK) == 0) {
			if ((vp->v_vflag & VV_MAPPED) == 0 &&
			    bytelen > PAGE_SIZE) {
				bytelen = trunc_page(bytelen);
				overwrite = true;
			} else if ((bytelen & PAGE_MASK) == 0 &&
			    uio->uio_offset >= vp->v_size) {
				overwrite = true;
			}
		}
		if (vp->v_size < uio->uio_offset + bytelen) {
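			/*
			 * The write will grow the file; tell UVM about the
			 * pending new size before the data is copied in.
			 * This is backed out below if the copy fails.
			 */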
			uvm_vnp_setwritesize(vp, uio->uio_offset + bytelen);
		}
		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
		    UVM_ADV_RANDOM, UBC_WRITE | UBC_PARTIALOK |
		    (overwrite ? UBC_FAULTBUSY : 0) |
		    (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
		if (error) {
			uvm_vnp_setwritesize(vp, vp->v_size);
			if (overwrite && np->n_size != oldsize) {
				/*
				 * backout size and free pages past eof.
				 */
				np->n_size = oldsize;
				mutex_enter(&vp->v_interlock);
				(void)VOP_PUTPAGES(vp, round_page(vp->v_size),
				    0, PGO_SYNCIO | PGO_FREE);
			}
			break;
		}
		wrotedata = 1;

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 */

		if (vp->v_size < uio->uio_offset) {
			uvm_vnp_setsize(vp, uio->uio_offset);
			extended = 1;
		}

		if ((oldoff & ~(nmp->nm_wsize - 1)) !=
		    (uio->uio_offset & ~(nmp->nm_wsize - 1))) {
			mutex_enter(&vp->v_interlock);
			error = VOP_PUTPAGES(vp,
			    trunc_page(oldoff & ~(nmp->nm_wsize - 1)),
			    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
			    ~(nmp->nm_wsize - 1)), PGO_CLEANIT);
		}
	} while (uio->uio_resid > 0);
	if (wrotedata)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error == 0 && (ioflag & IO_SYNC) != 0) {
		mutex_enter(&vp->v_interlock);
		error = VOP_PUTPAGES(vp,
		    trunc_page(origoff & ~(nmp->nm_wsize - 1)),
		    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
		    ~(nmp->nm_wsize - 1)),
		    PGO_CLEANIT | PGO_SYNCIO);
	}
	return error;
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct lwp *l)
{
	struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, l))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred,
    struct lwp *l, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slptimeo;
	bool catch;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		catch = true;
		slptimeo = 2 * hz;
	} else {
		catch = false;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	mutex_enter(&vp->v_interlock);
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = mtsleep(&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo, &vp->v_interlock);
		if (error && intrflg && nfs_sigintr(nmp, NULL, l)) {
			mutex_exit(&vp->v_interlock);
			return EINTR;
		}
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	mutex_exit(&vp->v_interlock);
	error = vinvalbuf(vp, flags, cred, l, catch, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, NULL, l)) {
			error = EINTR;
			break;
		}
		error = vinvalbuf(vp, flags, cred, l, 0, slptimeo);
	}
	mutex_enter(&vp->v_interlock);
	if (error == 0)
		np->n_flag &= ~NMODIFIED;
	np->n_flag &= ~NFLUSHINPROG;
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup(&np->n_flag);
	}
	mutex_exit(&vp->v_interlock);
	return error;
}

/*
 * nfs_flushstalebuf: flush cache if it's stale.
 *
 * => caller shouldn't own any pages or buffers which belong to the vnode.
 */

int
nfs_flushstalebuf(struct vnode *vp, kauth_cred_t cred, struct lwp *l,
    int flags)
{
	struct nfsnode *np = VTONFS(vp);
	struct vattr vattr;
	int error;

	if (np->n_flag & NMODIFIED) {
		if ((flags & NFS_FLUSHSTALEBUF_MYWRITE) == 0
		    || vp->v_type != VREG) {
			error = nfs_vinvalbuf(vp, V_SAVE, cred, l, 1);
			if (error)
				return error;
			if (vp->v_type == VDIR) {
				nfs_invaldircache(vp, 0);
			}
		} else {
			/*
			 * XXX assuming writes are ours.
			 */
		}
		NFS_INVALIDATE_ATTRCACHE(np);
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return error;
		np->n_mtime = vattr.va_mtime;
	} else {
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return error;
		if (timespeccmp(&np->n_mtime, &vattr.va_mtime, !=)) {
			if (vp->v_type == VDIR) {
				nfs_invaldircache(vp, 0);
			}
			error = nfs_vinvalbuf(vp, V_SAVE, cred, l, 1);
			if (error)
				return error;
			np->n_mtime = vattr.va_mtime;
		}
	}

	return error;
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */

int
nfs_asyncio(struct buf *bp)
{
	struct nfs_iod *iod;
	struct nfsmount *nmp;
	int slptimeo = 0, error;
	bool catch = false;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		catch = true;

	/*
	 * Find a free iod to process this request.
	 */

	mutex_enter(&nfs_iodlist_lock);
	iod = LIST_FIRST(&nfs_iodlist_idle);
	if (iod) {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		LIST_REMOVE(iod, nid_idle);
		mutex_enter(&iod->nid_lock);
		mutex_exit(&nfs_iodlist_lock);
		KASSERT(iod->nid_mount == NULL);
		iod->nid_mount = nmp;
		cv_signal(&iod->nid_cv);
		mutex_enter(&nmp->nm_lock);
		mutex_exit(&iod->nid_lock);
		nmp->nm_bufqiods++;
		if (nmp->nm_bufqlen < 2 * nmp->nm_bufqiods) {
			cv_broadcast(&nmp->nm_aiocv);
		}
	} else {
		mutex_exit(&nfs_iodlist_lock);
		mutex_enter(&nmp->nm_lock);
	}

	KASSERT(mutex_owned(&nmp->nm_lock));

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.  However, even if we have an iod, do not initiate
	 * queue cleaning if curproc is the pageout daemon.  If the NFS
	 * mount is via local loopback, we may put curproc (pagedaemon)
	 * to sleep waiting for the writes to complete.  But the server
	 * (ourself) may block the write, waiting for its (i.e., our)
	 * pagedaemon to produce clean pages to handle the write: deadlock.
	 * XXX: start non-loopback mounts straight away?  If "lots free",
If "lots free", 777 * let pagedaemon start loopback writes anyway? 778 */ 779 if (nmp->nm_bufqiods > 0) { 780 781 /* 782 * Ensure that the queue never grows too large. 783 */ 784 if (curlwp == uvm.pagedaemon_lwp) { 785 /* Enque for later, to avoid free-page deadlock */ 786 } else while (nmp->nm_bufqlen >= 2 * nmp->nm_bufqiods) { 787 if (catch) { 788 error = cv_timedwait_sig(&nmp->nm_aiocv, 789 &nmp->nm_lock, slptimeo); 790 } else { 791 error = cv_timedwait(&nmp->nm_aiocv, 792 &nmp->nm_lock, slptimeo); 793 } 794 if (error) { 795 if (nfs_sigintr(nmp, NULL, curlwp)) { 796 mutex_exit(&nmp->nm_lock); 797 return (EINTR); 798 } 799 if (catch) { 800 catch = false; 801 slptimeo = 2 * hz; 802 } 803 } 804 805 /* 806 * We might have lost our iod while sleeping, 807 * so check and loop if necessary. 808 */ 809 810 if (nmp->nm_bufqiods == 0) { 811 mutex_exit(&nmp->nm_lock); 812 goto again; 813 } 814 } 815 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); 816 nmp->nm_bufqlen++; 817 mutex_exit(&nmp->nm_lock); 818 return (0); 819 } 820 mutex_exit(&nmp->nm_lock); 821 822 /* 823 * All the iods are busy on other mounts, so return EIO to 824 * force the caller to process the i/o synchronously. 825 */ 826 827 return (EIO); 828 } 829 830 /* 831 * nfs_doio for read. 832 */ 833 static int 834 nfs_doio_read(struct buf *bp, struct uio *uiop) 835 { 836 struct vnode *vp = bp->b_vp; 837 struct nfsnode *np = VTONFS(vp); 838 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 839 int error = 0; 840 841 uiop->uio_rw = UIO_READ; 842 switch (vp->v_type) { 843 case VREG: 844 nfsstats.read_bios++; 845 error = nfs_readrpc(vp, uiop); 846 if (!error && uiop->uio_resid) { 847 int diff, len; 848 849 /* 850 * If uio_resid > 0, there is a hole in the file and 851 * no writes after the hole have been pushed to 852 * the server yet or the file has been truncated 853 * on the server. 854 * Just zero fill the rest of the valid area. 855 */ 856 857 KASSERT(vp->v_size >= 858 uiop->uio_offset + uiop->uio_resid); 859 diff = bp->b_bcount - uiop->uio_resid; 860 len = uiop->uio_resid; 861 memset((char *)bp->b_data + diff, 0, len); 862 uiop->uio_resid = 0; 863 } 864 #if 0 865 if (uiop->uio_lwp && (vp->v_iflag & VI_TEXT) && 866 timespeccmp(&np->n_mtime, &np->n_vattr->va_mtime, !=)) { 867 mutex_enter(proc_lock); 868 killproc(uiop->uio_lwp->l_proc, "process text file was modified"); 869 mutex_exit(proc_lock); 870 #if 0 /* XXX NJWLWP */ 871 uiop->uio_lwp->l_proc->p_holdcnt++; 872 #endif 873 } 874 #endif 875 break; 876 case VLNK: 877 KASSERT(uiop->uio_offset == (off_t)0); 878 nfsstats.readlink_bios++; 879 error = nfs_readlinkrpc(vp, uiop, np->n_rcred); 880 break; 881 case VDIR: 882 nfsstats.readdir_bios++; 883 uiop->uio_offset = bp->b_dcookie; 884 #ifndef NFS_V2_ONLY 885 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 886 error = nfs_readdirplusrpc(vp, uiop, 887 curlwp->l_cred); 888 /* 889 * nfs_request maps NFSERR_NOTSUPP to ENOTSUP. 890 */ 891 if (error == ENOTSUP) 892 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 893 } 894 #else 895 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 896 #endif 897 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 898 error = nfs_readdirrpc(vp, uiop, 899 curlwp->l_cred); 900 if (!error) { 901 bp->b_dcookie = uiop->uio_offset; 902 } 903 break; 904 default: 905 printf("nfs_doio: type %x unexpected\n", vp->v_type); 906 break; 907 } 908 bp->b_error = error; 909 return error; 910 } 911 912 /* 913 * nfs_doio for write. 
 */
static int
nfs_doio_write(struct buf *bp, struct uio *uiop)
{
	struct vnode *vp = bp->b_vp;
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int iomode;
	bool stalewriteverf = false;
	int i, npages = (bp->b_bcount + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct vm_page *pgs[npages];
#ifndef NFS_V2_ONLY
	bool needcommit = true;		/* need only COMMIT RPC */
#else
	bool needcommit = false;	/* need only COMMIT RPC */
#endif
	bool pageprotected;
	struct uvm_object *uobj = &vp->v_uobj;
	int error;
	off_t off, cnt;

	if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) {
		iomode = NFSV3WRITE_UNSTABLE;
	} else {
		iomode = NFSV3WRITE_FILESYNC;
	}

#ifndef NFS_V2_ONLY
again:
#endif
	rw_enter(&nmp->nm_writeverflock, RW_READER);

	for (i = 0; i < npages; i++) {
		pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
		if (pgs[i]->uobject == uobj &&
		    pgs[i]->offset == uiop->uio_offset + (i << PAGE_SHIFT)) {
			KASSERT(pgs[i]->flags & PG_BUSY);
			/*
			 * this page belongs to our object.
			 */
			mutex_enter(&uobj->vmobjlock);
			/*
			 * write out the page stably if it's about to be
			 * released, because we can't resend it if the
			 * server crashes.
			 *
			 * XXX assuming PG_RELEASED|PG_PAGEOUT won't be
			 * changed until we unbusy the page.
			 */
			if (pgs[i]->flags & (PG_RELEASED|PG_PAGEOUT))
				iomode = NFSV3WRITE_FILESYNC;
			/*
			 * if we find a page which hasn't been sent yet,
			 * we need to do a WRITE RPC.
			 */
			if ((pgs[i]->flags & PG_NEEDCOMMIT) == 0)
				needcommit = false;
			mutex_exit(&uobj->vmobjlock);
		} else {
			iomode = NFSV3WRITE_FILESYNC;
			needcommit = false;
		}
	}
	if (!needcommit && iomode == NFSV3WRITE_UNSTABLE) {
		mutex_enter(&uobj->vmobjlock);
		for (i = 0; i < npages; i++) {
			pgs[i]->flags |= PG_NEEDCOMMIT | PG_RDONLY;
			pmap_page_protect(pgs[i], VM_PROT_READ);
		}
		mutex_exit(&uobj->vmobjlock);
		pageprotected = true;	/* pages can't be modified during i/o. */
	} else
		pageprotected = false;

	/*
	 * Send the data to the server if necessary,
	 * otherwise just send a commit rpc.
	 */
#ifndef NFS_V2_ONLY
	if (needcommit) {

		/*
		 * If the buffer is in the range that we already committed,
		 * there's nothing to do.
		 *
		 * If it's in the range that we need to commit, push the
		 * whole range at once, otherwise only push the buffer.
		 * In both these cases, acquire the commit lock to avoid
		 * other processes modifying the range.
		 */

		off = uiop->uio_offset;
		cnt = bp->b_bcount;
		mutex_enter(&np->n_commitlock);
		if (!nfs_in_committed_range(vp, off, bp->b_bcount)) {
			bool pushedrange;
			if (nfs_in_tobecommitted_range(vp, off, bp->b_bcount)) {
				pushedrange = true;
				off = np->n_pushlo;
				cnt = np->n_pushhi - np->n_pushlo;
			} else {
				pushedrange = false;
			}
			error = nfs_commit(vp, off, cnt, curlwp);
			if (error == 0) {
				if (pushedrange) {
					nfs_merge_commit_ranges(vp);
				} else {
					nfs_add_committed_range(vp, off, cnt);
				}
			}
		} else {
			error = 0;
		}
		mutex_exit(&np->n_commitlock);
		rw_exit(&nmp->nm_writeverflock);
		if (!error) {
			/*
			 * pages are now on stable storage.
			 */
			uiop->uio_resid = 0;
			mutex_enter(&uobj->vmobjlock);
			for (i = 0; i < npages; i++) {
				pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
			}
			mutex_exit(&uobj->vmobjlock);
			return 0;
		} else if (error == NFSERR_STALEWRITEVERF) {
			nfs_clearcommit(vp->v_mount);
			goto again;
		}
		if (error) {
			bp->b_error = np->n_error = error;
			np->n_flag |= NWRITEERR;
		}
		return error;
	}
#endif
	off = uiop->uio_offset;
	cnt = bp->b_bcount;
	uiop->uio_rw = UIO_WRITE;
	nfsstats.write_bios++;
	error = nfs_writerpc(vp, uiop, &iomode, pageprotected, &stalewriteverf);
#ifndef NFS_V2_ONLY
	if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		/*
		 * we need to commit pages later.
		 */
		mutex_enter(&np->n_commitlock);
		nfs_add_tobecommitted_range(vp, off, cnt);
		/*
		 * if there are too many uncommitted pages, commit them now.
		 */
		if (np->n_pushhi - np->n_pushlo > nfs_commitsize) {
			off = np->n_pushlo;
			cnt = nfs_commitsize >> 1;
			error = nfs_commit(vp, off, cnt, curlwp);
			if (!error) {
				nfs_add_committed_range(vp, off, cnt);
				nfs_del_tobecommitted_range(vp, off, cnt);
			}
			if (error == NFSERR_STALEWRITEVERF) {
				stalewriteverf = true;
				error = 0;	/* it isn't a real error */
			}
		} else {
			/*
			 * re-dirty pages so that they will be passed
			 * to us later again.
			 */
			mutex_enter(&uobj->vmobjlock);
			for (i = 0; i < npages; i++) {
				pgs[i]->flags &= ~PG_CLEAN;
			}
			mutex_exit(&uobj->vmobjlock);
		}
		mutex_exit(&np->n_commitlock);
	} else
#endif
	if (!error) {
		/*
		 * pages are now on stable storage.
		 */
		mutex_enter(&np->n_commitlock);
		nfs_del_committed_range(vp, off, cnt);
		mutex_exit(&np->n_commitlock);
		mutex_enter(&uobj->vmobjlock);
		for (i = 0; i < npages; i++) {
			pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
		}
		mutex_exit(&uobj->vmobjlock);
	} else {
		/*
		 * we got an error.
		 */
		bp->b_error = np->n_error = error;
		np->n_flag |= NWRITEERR;
	}

	rw_exit(&nmp->nm_writeverflock);

	if (stalewriteverf) {
		nfs_clearcommit(vp->v_mount);
	}
	return error;
}

/*
 * nfs_doio for B_PHYS.
 */
static int
nfs_doio_phys(struct buf *bp, struct uio *uiop)
{
	struct vnode *vp = bp->b_vp;
	int error;

	uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
	if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop);
	} else {
		int iomode = NFSV3WRITE_DATASYNC;
		bool stalewriteverf;
		struct nfsmount *nmp = VFSTONFS(vp->v_mount);

		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		rw_enter(&nmp->nm_writeverflock, RW_READER);
		error = nfs_writerpc(vp, uiop, &iomode, false, &stalewriteverf);
		rw_exit(&nmp->nm_writeverflock);
		if (stalewriteverf) {
			nfs_clearcommit(bp->b_vp->v_mount);
		}
	}
	bp->b_error = error;
	return error;
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(struct buf *bp)
{
	int error;
	struct uio uio;
	struct uio *uiop = &uio;
	struct iovec io;
	UVMHIST_FUNC("nfs_doio"); UVMHIST_CALLED(ubchist);

	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT);
	UIO_SETUP_SYSSPACE(uiop);
	io.iov_base = bp->b_data;
	io.iov_len = uiop->uio_resid = bp->b_bcount;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		error = nfs_doio_phys(bp, uiop);
	} else if (bp->b_flags & B_READ) {
		error = nfs_doio_read(bp, uiop);
	} else {
		error = nfs_doio_write(bp, uiop);
	}
	bp->b_resid = uiop->uio_resid;
	biodone(bp);
	return (error);
}

/*
 * Vnode op for VM getpages.
 */

int
nfs_getpages(void *v)
{
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		voff_t a_offset;
		struct vm_page **a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	} */ *ap = v;

	struct vnode *vp = ap->a_vp;
	struct uvm_object *uobj = &vp->v_uobj;
	struct nfsnode *np = VTONFS(vp);
	const int npages = *ap->a_count;
	struct vm_page *pg, **pgs, *opgs[npages];
	off_t origoffset, len;
	int i, error;
	bool v3 = NFS_ISV3(vp);
	bool write = (ap->a_access_type & VM_PROT_WRITE) != 0;
	bool locked = (ap->a_flags & PGO_LOCKED) != 0;

	/*
	 * call the genfs code to get the pages.  `pgs' may be NULL
	 * when doing read-ahead.
	 */

	pgs = ap->a_m;
	if (write && locked && v3) {
		KASSERT(pgs != NULL);
#ifdef DEBUG

		/*
		 * If PGO_LOCKED is set, real pages shouldn't exist
		 * in the array.
		 */

		for (i = 0; i < npages; i++)
			KDASSERT(pgs[i] == NULL || pgs[i] == PGO_DONTCARE);
#endif
		memcpy(opgs, pgs, npages * sizeof(struct vm_page *));
	}
	error = genfs_getpages(v);
	if (error) {
		return (error);
	}

	/*
	 * for read faults where the nfs node is not yet marked NMODIFIED,
	 * set PG_RDONLY on the pages so that we come back here if someone
	 * tries to modify later via the mapping that will be entered for
	 * this fault.
	 */

	if (!write && (np->n_flag & NMODIFIED) == 0 && pgs != NULL) {
		if (!locked) {
			mutex_enter(&uobj->vmobjlock);
		}
		for (i = 0; i < npages; i++) {
			pg = pgs[i];
			if (pg == NULL || pg == PGO_DONTCARE) {
				continue;
			}
			pg->flags |= PG_RDONLY;
		}
		if (!locked) {
			mutex_exit(&uobj->vmobjlock);
		}
	}
	if (!write) {
		return (0);
	}

	/*
	 * this is a write fault, update the commit info.
	 */

	origoffset = ap->a_offset;
	len = npages << PAGE_SHIFT;

	if (v3) {
		if (!locked) {
			mutex_enter(&np->n_commitlock);
		} else {
			if (!mutex_tryenter(&np->n_commitlock)) {

				/*
				 * Since PGO_LOCKED is set, we need to unbusy
				 * all pages fetched by genfs_getpages() above,
				 * tell the caller that there are no pages
				 * available and put back original pgs array.
				 */

				mutex_enter(&uvm_pageqlock);
				uvm_page_unbusy(pgs, npages);
				mutex_exit(&uvm_pageqlock);
				*ap->a_count = 0;
				memcpy(pgs, opgs,
				    npages * sizeof(struct vm_page *));
				return EBUSY;
			}
		}
		nfs_del_committed_range(vp, origoffset, len);
		nfs_del_tobecommitted_range(vp, origoffset, len);
	}
	np->n_flag |= NMODIFIED;
	if (!locked) {
		mutex_enter(&uobj->vmobjlock);
	}
	for (i = 0; i < npages; i++) {
		pg = pgs[i];
		if (pg == NULL || pg == PGO_DONTCARE) {
			continue;
		}
		pg->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
	}
	if (!locked) {
		mutex_exit(&uobj->vmobjlock);
	}
	if (v3) {
		mutex_exit(&np->n_commitlock);
	}
	return (0);
}