/*	$NetBSD: nfs_bio.c,v 1.40 1997/11/23 13:52:24 fvdl Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/trace.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>

#include <vm/vm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, cflag)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag, cflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff;
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	daddr_t lbn, bn, rabn;
	caddr_t baddr, ep, edp;
	int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp;
	off_t curoff = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = nmp->nm_rsize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
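	/*
	 * Summary of the two paths below (illustrative): if we modified
	 * the file locally (NMODIFIED), flush cached data first and then
	 * refetch attributes; otherwise fetch attributes (possibly served
	 * from the attribute cache) and flush only if the server-side
	 * mtime has moved, e.g. because another client wrote the file.
	 */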
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR) {
					nfs_invaldircache(vp, 0);
					np->n_direofoffset = 0;
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR) {
						nfs_invaldircache(vp, 0);
						np->n_direofoffset = 0;
					}
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldircache(vp, 0);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				np->n_direofoffset = 0;
				if (error)
					return (error);
			}
		}
		/*
		 * Don't cache symlinks.
		 */
		if (np->n_flag & NQNFSNONCACHE
		    || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			};
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			bn = lbn * (biosize / DEV_BSIZE);
			not_readin = 1;
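
			/*
			 * Illustrative arithmetic, assuming biosize = 8192
			 * and DEV_BSIZE = 512: a read at uio_offset = 20000
			 * gives lbn = 2, on = 3616 and bn = 2 * 16 = 32,
			 * i.e. byte 3616 within logical block 2, which
			 * starts at buffer-cache block number 32.
			 */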

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    lbn - 1 == vp->v_lastr) {
				for (nra = 0; nra < nmp->nm_readahead &&
				    (lbn + 1 + nra) * biosize < np->n_size; nra++) {
					rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
							rabp->b_flags |= (B_READ | B_ASYNC);
							if (nfs_asyncio(rabp, cred)) {
								rabp->b_flags |= B_INVAL;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
			if ((bp = incore(vp, bn)) &&
			    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
			    (B_BUSY | B_WRITEINPROG))
				got_buf = 0;
			else {
again:
				bp = nfs_getcacheblk(vp, bn, biosize, p);
				if (!bp)
					return (EINTR);
				got_buf = 1;
				if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
					bp->b_flags |= B_READ;
					not_readin = 0;
					error = nfs_doio(bp, cred, p);
					if (error) {
						brelse(bp);
						return (error);
					}
				}
			}
			n = min((unsigned)(biosize - on), uio->uio_resid);
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					if (!got_buf) {
						bp = nfs_getcacheblk(vp, bn, biosize, p);
						if (!bp)
							return (EINTR);
						got_buf = 1;
					}
					bp->b_flags |= B_INVAFTERWRITE;
					if (bp->b_dirtyend > 0) {
						if ((bp->b_flags & B_DELWRI) == 0)
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
			    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache. If we're not translating
				 * 32 <-> 64, it may be a value that was
				 * flushed out of the cache because it grew
				 * too big. Let the server judge if it's
				 * valid or not. In the translation case,
				 * we have no way of validating this value,
				 * so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
				    uio->uio_offset, 0, 0);
			}

			if (uio->uio_offset != 0 &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfsstats.direofcache_hits++;
				return (0);
			}

			bp = nfs_getcacheblk(vp, ndp->dc_blkno, NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp, cred, p);
				if (error) {
					/*
					 * Yuck! The directory has been modified on the
					 * server. Punt and let the userland code
					 * deal with it.
					 */
					brelse(bp);
					if (error == NFSERR_BAD_COOKIE) {
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred, p, 1);
						error = EINVAL;
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block. Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 */
			if (np->n_direofoffset != 0 &&
			    ndp->dc_blkcookie == np->n_direofoffset) {
				brelse(bp);
				return (0);
			}

			/*
			 * Find the entry we were looking for in the block.
			 */
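			/*
			 * Note (illustrative): dirent records are chained by
			 * d_reclen, so the walk below hops d_reclen bytes per
			 * entry while enn counts entries; the directory cache
			 * stores (block cookie, entry number) pairs that are
			 * revalidated against the cookies found here.
			 */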
			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = bp->b_data + bp->b_validend;
			enn = 0;
			while (enn < en && (caddr_t)dp < edp) {
				pdp = dp;
				dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the previous
			 * entry doesn't match, the directory cache is
			 * stale. Flush it and try again (i.e. go to
			 * the server).
			 */
			if ((caddr_t)dp >= edp || (caddr_t)dp + dp->d_reclen > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
				printf("invalid cache: %p %p %p off %lx %lx\n",
				    pdp, dp, edp,
				    (unsigned long)uio->uio_offset,
				    (unsigned long)NFS_GETCOOKIE(pdp));
#endif
				brelse(bp);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, p, 0);
				goto diragain;
			}

			on = (caddr_t)dp - bp->b_data;

			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us. The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			if (en == 0 && pdp == dp) {
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			if (uio->uio_resid < (bp->b_validend - on)) {
				n = uio->uio_resid;
				enough = 1;
			} else
				n = bp->b_validend - on;

			ep = bp->b_data + on + n;

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
			 */

			while ((caddr_t)dp < ep && (caddr_t)dp + dp->d_reclen <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp, NFS_GETCOOKIE(pdp),
					    ndp->dc_blkcookie, enn, bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
				}
				pdp = dp;
				dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the last requested entry was not the last in the
			 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
			 * cache the cookie of the last requested one, and
			 * set the offset to it.
			 */

			if ((on + n) < bp->b_validend) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff, ndp->dc_blkcookie,
				    enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie, bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = ((caddr_t)pdp + pdp->d_reclen) - (bp->b_data + on);
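
			/*
			 * n now ends at the last complete dirent, so the
			 * uiomove() further down never hands the user a
			 * partial record.
			 */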

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 * directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    np->n_direofoffset == 0 && !(np->n_flag & NQNFSNONCACHE)) {
				rabp = nfs_getcacheblk(vp, nndp->dc_blkno,
				    NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
						rabp->b_dcookie = nndp->dc_cookie;
						rabp->b_flags |= (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp, cred)) {
							rabp->b_flags |= B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n", vp->v_type);
			break;
		};

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			uio->uio_offset = curoff;
			if (enough)
				n = 0;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n", vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(v)
	void *v;
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, bn;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = nmp->nm_rsize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		(void)vnode_pager_uncache(vp);
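
		/*
		 * (Assumed rationale for the uncache above: pages held by
		 * the old Mach-style VM pager could go stale once this bio
		 * write dirties the buffer cache, so the vnode's pages are
		 * tossed first and the buffer cache stays authoritative.)
		 */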

		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
		bn = lbn * (biosize / DEV_BSIZE);
again:
		bp = nfs_getcacheblk(vp, bn, biosize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;
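
		/*
		 * Dispatch summary (illustrative): a non-cachable lease or
		 * IO_SYNC forces a synchronous bwrite(); a write that fills
		 * the block to its end is pushed asynchronously; anything
		 * else is left as a delayed write and flushed later.
		 */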

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}
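
/*
 * Typical call (illustrative, mirroring the callers above):
 *
 *	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
 *
 * i.e. write dirty buffers back before invalidating, and let signals
 * interrupt the wait on NFSMNT_INT mounts.
 */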

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;
	register struct nfsmount *nmp;
	int gotiod, slpflag = 0, slptimeo = 0, error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */

	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}
	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */
	if (!gotiod && nmp->nm_bufqiods > 0)
		gotiod = TRUE;

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0)
				goto again;
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;
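
	/*
	 * The uio set up above describes the transfer with a single
	 * system-space iovec; each case below fills in the base, length
	 * and offset before issuing the rpc.
	 */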

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
					    + diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr->va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_holdcnt++;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = bp->b_dcookie;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			if (!error) {
				bp->b_dcookie = uiop->uio_offset;
				bp->b_validoff = 0;
				bp->b_validend = bp->b_bcount - uiop->uio_resid;
			}
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
			iomode = NFSV3WRITE_UNSTABLE;
		else
			iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
#ifdef fvdl_debug
		printf("nfs_doio(%x): bp %x doff %d dend %d\n",
		    vp, bp, bp->b_dirtyoff, bp->b_dirtyend);
#endif
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE)
			bp->b_flags |= B_NEEDCOMMIT;
		else
			bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid and the
		 * write hasn't been pushed to the server yet, so we can't set
		 * B_ERROR and report the interruption by setting B_EINTR. For
		 * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt
		 * is essentially a noop.
		 * For the case of a V3 write rpc not being committed to stable
		 * storage, the block is still dirty and requires either a commit
		 * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC
		 * before the block is reused. This is indicated by setting the
		 * B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
			 * buffer to the clean list, we have to reassign it back to the
			 * dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC)
				reassignbuf(bp, vp);
			else if (error)
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}