/*	$NetBSD: nfs_bio.c,v 1.50 2000/06/27 17:52:28 mrg Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/trace.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>

#include <uvm/uvm_extern.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, cflag)
	struct vnode *vp;
	struct uio *uio;
	int ioflag, cflag;
	struct ucred *cred;
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, diff;
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	daddr_t lbn, bn, rabn;
	caddr_t baddr, ep, edp;
	int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp;
	off_t curoff = 0, offdiff;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = nmp->nm_rsize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
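	/*
	 * In outline, the revalidation below does (plain NFS only; nqnfs
	 * consistency is handled inside the loop, and symlink contents
	 * are never revalidated here):
	 *
	 *	if (locally modified)	flush dirs, force a fresh GETATTR,
	 *				and record the server's mtime
	 *	else if (mtime changed)	flush cached data, record new mtime
	 */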
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR) {
					nfs_invaldircache(vp, 0);
					np->n_direofoffset = 0;
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ,
					    cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) &&
				     vp->v_type == VDIR)) {
					if (vp->v_type == VDIR) {
						nfs_invaldircache(vp, 0);
						np->n_direofoffset = 0;
					}
					error = nfs_vinvalbuf(vp, V_SAVE,
					    cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR &&
			    (np->n_flag & NMODIFIED)) {
				nfs_invaldircache(vp, 0);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				np->n_direofoffset = 0;
				if (error)
					return (error);
			}
		}
		/*
		 * Don't cache symlinks.
		 */
		if (np->n_flag & NQNFSNONCACHE
		    || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			};
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			bn = lbn * (biosize / DEV_BSIZE);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    lbn - 1 == vp->v_lastr) {
				for (nra = 0; nra < nmp->nm_readahead &&
				    (lbn + 1 + nra) * biosize < np->n_size;
				    nra++) {
					rabn = (lbn + 1 + nra) *
					    (biosize / DEV_BSIZE);
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp,
						    rabn, biosize, p);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags &
						    (B_DELWRI | B_DONE)) == 0) {
							rabp->b_flags |=
							    (B_READ | B_ASYNC);
							if (nfs_asyncio(rabp,
							    cred)) {
								rabp->b_flags |=
								    B_INVAL;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required
			 * data in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
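			/*
			 * A buffer found busy with a write RPC in progress
			 * is used in place: its data is copied out directly,
			 * and got_buf == 0 records that it was not taken via
			 * nfs_getcacheblk() and so must not be brelse()'d
			 * here.
			 */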
			if ((bp = incore(vp, bn)) &&
			    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
			    (B_BUSY | B_WRITEINPROG))
				got_buf = 0;
			else {
again:
				bp = nfs_getcacheblk(vp, bn, biosize, p);
				if (!bp)
					return (EINTR);
				got_buf = 1;
				if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
					bp->b_flags |= B_READ;
					not_readin = 0;
					error = nfs_doio(bp, cred, p);
					if (error) {
						brelse(bp);
						return (error);
					}
				}
			}
			n = min((unsigned)(biosize - on), uio->uio_resid);
			offdiff = np->n_size - uio->uio_offset;
			if (offdiff < (off_t)n)
				n = (int)offdiff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff ||
				    (on + n) > bp->b_validend) {
					if (!got_buf) {
						bp = nfs_getcacheblk(vp, bn,
						    biosize, p);
						if (!bp)
							return (EINTR);
						got_buf = 1;
					}
					bp->b_flags |= B_INVAFTERWRITE;
					if (bp->b_dirtyend > 0) {
						if ((bp->b_flags & B_DELWRI)
						    == 0)
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			diff = (on >= bp->b_validend) ? 0 :
			    (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
			    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache. If we're not translating
				 * 32 <-> 64, it may be a value that was
				 * flushed out of the cache because it grew
				 * too big. Let the server judge if it's
				 * valid or not. In the translation case,
				 * we have no way of validating this value,
				 * so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
				    uio->uio_offset, 0, 0);
			}

			if (uio->uio_offset != 0 &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfsstats.direofcache_hits++;
				return (0);
			}

			bp = nfs_getcacheblk(vp, ndp->dc_blkno,
			    NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp, cred, p);
				if (error) {
					/*
					 * Yuck! The directory has been
					 * modified on the server. Punt
					 * and let the userland code deal
					 * with it.
					 */
					brelse(bp);
					if (error == NFSERR_BAD_COOKIE) {
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred,
						    p, 1);
						error = EINVAL;
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block. Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 */
			if (np->n_direofoffset != 0 &&
			    ndp->dc_blkcookie == np->n_direofoffset) {
				brelse(bp);
				return (0);
			}

			/*
			 * Find the entry we were looking for in the block.
			 */
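			/*
			 * dp walks the d_reclen-chained entries below; pdp
			 * trails one entry behind, so that the cookie stored
			 * in the previous entry can be checked against the
			 * cached cookie.
			 */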
			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = bp->b_data + bp->b_validend;
			enn = 0;
			while (enn < en && (caddr_t)dp < edp) {
				pdp = dp;
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the
			 * previous entry doesn't match, the directory cache
			 * is stale. Flush it and try again (i.e. go to
			 * the server).
			 */
			if ((caddr_t)dp >= edp ||
			    (caddr_t)dp + dp->d_reclen > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
				printf("invalid cache: %p %p %p off %lx %lx\n",
				    pdp, dp, edp,
				    (unsigned long)uio->uio_offset,
				    (unsigned long)NFS_GETCOOKIE(pdp));
#endif
				brelse(bp);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, p, 0);
				goto diragain;
			}

			on = (caddr_t)dp - bp->b_data;

			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us. The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			if (en == 0 && pdp == dp) {
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			if (uio->uio_resid < (bp->b_validend - on)) {
				n = uio->uio_resid;
				enough = 1;
			} else
				n = bp->b_validend - on;

			ep = bp->b_data + on + n;

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
			 */

			while ((caddr_t)dp < ep &&
			    (caddr_t)dp + dp->d_reclen <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp,
					    NFS_GETCOOKIE(pdp),
					    ndp->dc_blkcookie, enn,
					    bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
				}
				pdp = dp;
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the last requested entry was not the last in
			 * the buffer (happens if NFS_DIRFRAGSIZ <
			 * NFS_DIRBLKSIZ), cache the cookie of the last
			 * requested one, and set the offset to it.
			 */

			if ((on + n) < bp->b_validend) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff,
				    ndp->dc_blkcookie, enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie,
			    bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = ((caddr_t)pdp + pdp->d_reclen) -
			    (bp->b_data + on);

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you
			 * have the directory offset cookie of the next
			 * block.)
			 */
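			/*
			 * nndp was entered above for the block that follows
			 * this one, so nndp->dc_blkno and nndp->dc_cookie
			 * give the readahead below the right block number
			 * and directory offset cookie to ask for.
			 */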
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    np->n_direofoffset == 0 &&
			    !(np->n_flag & NQNFSNONCACHE)) {
				rabp = nfs_getcacheblk(vp, nndp->dc_blkno,
				    NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags &
					    (B_DONE | B_DELWRI)) == 0) {
						rabp->b_dcookie =
						    nndp->dc_cookie;
						rabp->b_flags |=
						    (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp, cred)) {
							rabp->b_flags |=
							    B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
			break;
		};

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			uio->uio_offset = curoff;
			if (enough)
				n = 0;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(v)
	void *v;
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	int biosize;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, bn;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = nmp->nm_rsize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		(void)uvm_vnp_uncache(vp);

		/*
		 * Check for a valid write lease.
		 */
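		/*
		 * nqnfs_getlease() below is retried for as long as the only
		 * failure is NQNFS_EXPIRED; any other error aborts the
		 * write.  A changed lease revision (n_lrev != n_brev) means
		 * another client modified the file, so cached buffers are
		 * flushed before new ones are dirtied.
		 */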
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode,
			    &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
		bn = lbn * (biosize / DEV_BSIZE);
again:
		bp = nfs_getcacheblk(vp, bn, biosize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			uvm_vnp_setsize(vp, np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
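		/*
		 * Three ways to push the data, chosen below: a synchronous
		 * bwrite for non-cacheable leases or IO_SYNC, an async
		 * write once the write reaches the end of the block, or a
		 * delayed write (bdwrite) for a partially filled block.
		 */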
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo);
		if (error && intrflg &&
		    nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	struct buf *bp;
	struct ucred *cred;
{
	int i;
	struct nfsmount *nmp;
	int gotiod, slpflag = 0, slptimeo = 0, error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
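	/*
	 * An idle nfsiod parks in nfs_iodwant[]; claiming one below means
	 * clearing its slot, pointing nfs_iodmount[] at this mount and
	 * waking it, after which it drains nm_bufq for us.
	 */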
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}
	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */
	if (!gotiod && nmp->nm_bufqiods > 0)
		gotiod = TRUE;

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0)
				goto again;
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0, s;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
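		/*
		 * Note the (off_t) cast in the offset calculation below:
		 * daddr_t is narrower than off_t, so the multiplication
		 * must be widened to keep large block numbers from
		 * overflowing.
		 */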
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode,
			    &must_commit);
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the
					 * file and no writes after the hole
					 * have been pushed to the server yet.
					 * Just zero fill the rest of the
					 * valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size -
					    (((u_quad_t)bp->b_blkno) *
					    DEV_BSIZE + diff);
					if (len > 0) {
						len = min(len,
						    uiop->uio_resid);
						memset((char *)bp->b_data +
						    diff, 0, len);
						bp->b_validend = diff + len;
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr->va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_holdcnt++;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = bp->b_dcookie;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			if (!error) {
				bp->b_dcookie = uiop->uio_offset;
				bp->b_validoff = 0;
				bp->b_validend = bp->b_bcount -
				    uiop->uio_resid;
			}
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		};
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE))
		    == B_ASYNC)
			iomode = NFSV3WRITE_UNSTABLE;
		else
			iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
#ifdef fvdl_debug
		printf("nfs_doio(%p): bp %p doff %d dend %d\n",
		    vp, bp, bp->b_dirtyoff, bp->b_dirtyend);
#endif
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		s = splbio();
		if (!error && iomode == NFSV3WRITE_UNSTABLE)
			bp->b_flags |= B_NEEDCOMMIT;
		else
			bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

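		/*
		 * The B_NEEDCOMMIT/B_WRITEINPROG updates above and the
		 * error handling below run at splbio() so they cannot race
		 * with buffer handling done from block-I/O interrupt
		 * context.
		 */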
		/*
		 * For an interrupted write, the buffer is still valid and
		 * the write hasn't been pushed to the server yet, so we
		 * can't set B_ERROR; the interruption is reported by
		 * setting B_EINTR instead. For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially a
		 * noop.
		 * For the case of a V3 write rpc not being committed to
		 * stable storage, the block is still dirty and requires
		 * either a commit rpc or another write rpc with iomode ==
		 * NFSV3WRITE_FILESYNC before the block is reused. This is
		 * indicated by setting the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR ||
		    (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has
			 * reassigned the buffer to the clean list, we have
			 * to reassign it back to the dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC) {
				reassignbuf(bp, vp);
			} else if (error)
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
		splx(s);
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}