/*	$NetBSD: nfs_bio.c,v 1.68 2001/06/27 17:33:43 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include "opt_nfs.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/malloc.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, cflag)
	struct vnode *vp;
	struct uio *uio;
	int ioflag, cflag;
	struct ucred *cred;
{
	struct nfsnode *np = VTONFS(vp);
	int biosize;
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	caddr_t baddr, ep, edp;
	int got_buf = 0, error = 0, n = 0, on = 0, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp;
	off_t curoff = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = nmp->nm_rsize;

	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
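	 *
	 * (Roughly, the code below implements this by refetching attributes
	 * and comparing the cached n_mtime against the server's; when it has
	 * changed, or a locally modified directory is involved, the cached
	 * data is flushed with nfs_vinvalbuf().)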
	 */

	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR) {
					nfs_invaldircache(vp, 0);
					np->n_direofoffset = 0;
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}

	/*
	 * update the cached read creds for this node.
	 */

	if (np->n_rcred) {
		crfree(np->n_rcred);
	}
	np->n_rcred = cred;
	crhold(cred);

	do {
#ifndef NFS_V2_ONLY
		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ,
					    cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) &&
				     vp->v_type == VDIR)) {
					if (vp->v_type == VDIR) {
						nfs_invaldircache(vp, 0);
						np->n_direofoffset = 0;
					}
					error = nfs_vinvalbuf(vp, V_SAVE,
					    cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR &&
			    (np->n_flag & NMODIFIED)) {
				nfs_invaldircache(vp, 0);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				np->n_direofoffset = 0;
				if (error)
					return (error);
			}
		}
#endif
		/*
		 * Don't cache symlinks.
		 */
		if (np->n_flag & NQNFSNONCACHE
		    || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			};
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;

			error = 0;
			if (uio->uio_offset >= np->n_size) {
				break;
			}
			while (uio->uio_resid > 0) {
				void *win;
				vsize_t bytelen = MIN(np->n_size -
				    uio->uio_offset, uio->uio_resid);

				if (bytelen == 0)
					break;
				win = ubc_alloc(&vp->v_uvm.u_obj,
				    uio->uio_offset, &bytelen, UBC_READ);
				error = uiomove(win, bytelen, uio);
				ubc_release(win, 0);
				if (error) {
					break;
				}
			}
			n = 0;
			break;

		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
			    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache.
				 * If we're not translating
				 * 32 <-> 64, it may be a value that was
				 * flushed out of the cache because it grew
				 * too big. Let the server judge if it's
				 * valid or not. In the translation case,
				 * we have no way of validating this value,
				 * so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
				    uio->uio_offset, 0, 0);
			}

			if (uio->uio_offset != 0 &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfsstats.direofcache_hits++;
				return (0);
			}

			bp = nfs_getcacheblk(vp, ndp->dc_blkno,
			    NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp, p);
				if (error) {
					/*
					 * Yuck! The directory has been
					 * modified on the server. Punt and
					 * let the userland code deal with it.
					 */
					brelse(bp);
					if (error == NFSERR_BAD_COOKIE) {
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred, p, 1);
						error = EINVAL;
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block. Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 */
			if (np->n_direofoffset != 0 &&
			    ndp->dc_blkcookie == np->n_direofoffset) {
				brelse(bp);
				return (0);
			}

			/*
			 * Find the entry we were looking for in the block.
			 */

			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = bp->b_data + bp->b_bcount - bp->b_resid;
			enn = 0;
			while (enn < en && (caddr_t)dp < edp) {
				pdp = dp;
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the previous
			 * entry doesn't match, the directory cache is
			 * stale. Flush it and try again (i.e. go to
			 * the server).
			 */
			if ((caddr_t)dp >= edp ||
			    (caddr_t)dp + dp->d_reclen > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
				printf("invalid cache: %p %p %p off %lx %lx\n",
				    pdp, dp, edp,
				    (unsigned long)uio->uio_offset,
				    (unsigned long)NFS_GETCOOKIE(pdp));
#endif
				brelse(bp);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, p, 0);
				goto diragain;
			}

			on = (caddr_t)dp - bp->b_data;

			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us. The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			if (en == 0 && pdp == dp) {
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			if (uio->uio_resid < (bp->b_bcount - bp->b_resid - on)) {
				n = uio->uio_resid;
				enough = 1;
			} else
				n = bp->b_bcount - bp->b_resid - on;

			ep = bp->b_data + on + n;

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
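			 *
			 * (In effect, each nfs_enterdircache() call below maps
			 * a server cookie to a (block cookie, entry index,
			 * block number) triple, and on NFSMNT_XLATECOOKIE
			 * mounts NFS_STASHCOOKIE32() also stores the 32-bit
			 * alias in the dirent so userland can hand it back.)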
			 */

			while ((caddr_t)dp < ep &&
			    (caddr_t)dp + dp->d_reclen <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp,
					    NFS_GETCOOKIE(pdp),
					    ndp->dc_blkcookie, enn,
					    bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
				}
				pdp = dp;
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the last requested entry was not the last in the
			 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
			 * cache the cookie of the last requested one, and
			 * set the offset to it.
			 */

			if ((on + n) < bp->b_bcount - bp->b_resid) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff,
				    ndp->dc_blkcookie, enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie,
			    bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = ((caddr_t)pdp + pdp->d_reclen) -
			    (bp->b_data + on);

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have
			 * the directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    np->n_direofoffset == 0 &&
			    !(np->n_flag & NQNFSNONCACHE)) {
				rabp = nfs_getcacheblk(vp, nndp->dc_blkno,
				    NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags &
					    (B_DONE | B_DELWRI)) == 0) {
						rabp->b_dcookie = nndp->dc_cookie;
						rabp->b_flags |=
						    (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp)) {
							rabp->b_flags |= B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
			break;
		}

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			uio->uio_offset = curoff;
			if (enough)
				n = 0;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(v)
	void *v;
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, iomode, must_commit;
	int rv;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * update the cached write creds for this node.
	 */

	if (np->n_wcred) {
		crfree(np->n_wcred);
	}
	np->n_wcred = cred;
	crhold(cred);

	if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		iomode = NFSV3WRITE_FILESYNC;
		error = nfs_writerpc(vp, uio, &iomode, &must_commit);
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
		return (error);
	}

	do {
		void *win;
		voff_t oldoff = uio->uio_offset;
		vsize_t bytelen = uio->uio_resid;

#ifndef NFS_V2_ONLY
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
#endif
		nfsstats.biocache_writes++;

		np->n_flag |= NMODIFIED;
		if (np->n_size < uio->uio_offset + bytelen) {
			np->n_size = uio->uio_offset + bytelen;
			uvm_vnp_setsize(vp, np->n_size);
		}
		win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen,
		    UBC_WRITE);
		error = uiomove(win, bytelen, uio);
		ubc_release(win, 0);
		rv = 1;

		/*
		 * Flush the pages covered by this write: synchronously when
		 * the data must be stable (IO_SYNC or an uncached nqnfs
		 * node), or with a weak (commit-later) flush once the write
		 * crosses an nm_wsize boundary.
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			simple_lock(&vp->v_uvm.u_obj.vmobjlock);
			rv = vp->v_uvm.u_obj.pgops->pgo_flush(
			    &vp->v_uvm.u_obj,
			    oldoff & ~(nmp->nm_wsize - 1),
			    uio->uio_offset & ~(nmp->nm_wsize - 1),
			    PGO_CLEANIT|PGO_SYNCIO);
			simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
		} else if ((oldoff & ~(nmp->nm_wsize - 1)) !=
		    (uio->uio_offset & ~(nmp->nm_wsize - 1))) {
			simple_lock(&vp->v_uvm.u_obj.vmobjlock);
			rv = vp->v_uvm.u_obj.pgops->pgo_flush(
			    &vp->v_uvm.u_obj,
			    oldoff & ~(nmp->nm_wsize - 1),
			    uio->uio_offset & ~(nmp->nm_wsize - 1),
			    PGO_CLEANIT|PGO_WEAK);
			simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
		}
		if (!rv) {
			error = EIO;
		}
		if (error) {
			break;
		}
	} while (uio->uio_resid > 0);
	return error;
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
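 *
 * (On NFSMNT_INT mounts, roughly: getblk() is first tried with PCATCH and
 * then polled every 2*hz ticks, checking nfs_sigintr() each time so that a
 * posted signal can abort the wait.)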
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, p))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, NULL, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, NULL, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp)
	struct buf *bp;
{
	int i;
	struct nfsmount *nmp;
	int gotiod, slpflag = 0, slptimeo = 0, error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */

	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			nfs_iodwant[i] = NULL;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */
	if (!gotiod && nmp->nm_bufqiods > 0)
		gotiod = TRUE;

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
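		 * (The per-mount queue is capped, roughly, at two buffers
		 * per configured nfsiod; writers sleep on nm_bufq until
		 * there is room again or a signal arrives.)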
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				"nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0)
				goto again;
		}
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, p)
	struct buf *bp;
	struct proc *p;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop);
		} else {
			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop);
			if (!error && uiop->uio_resid) {

				/*
				 * If len > 0, there is a hole in the file and
				 * no writes after the hole have been pushed to
				 * the server yet.
				 * Just zero fill the rest of the valid area.
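				 * (E.g., if the server returned only 2k of an
				 * 8k buffer and n_size extends past the end
				 * of the buffer, diff is 2k and the remaining
				 * 6k are cleared; the sizes here are
				 * illustrative only.)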
				 */

				diff = bp->b_bcount - uiop->uio_resid;
				len = np->n_size -
				    ((((off_t)bp->b_blkno) << DEV_BSHIFT)
				    + diff);
				if (len > 0) {
					len = MIN(len, uiop->uio_resid);
					memset((char *)bp->b_data + diff,
					    0, len);
				}
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr->va_mtime.tv_sec))) {
				uprintf("Process killed due to "
					"text file modification\n");
				psignal(p, SIGKILL);
				p->p_holdcnt++;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, curproc->p_ucred);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = bp->b_dcookie;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop,
				    curproc->p_ucred);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop,
				    curproc->p_ucred);
			if (!error) {
				bp->b_dcookie = uiop->uio_offset;
			}
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If B_NEEDCOMMIT is set, a commit rpc may do the trick.
		 * If not, an actual write will have to be scheduled.
		 */

		io.iov_base = bp->b_data;
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT);
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		iomode = NFSV3WRITE_UNSTABLE;
		error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}

/*
 * Vnode op for VM getpages.
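 *
 * (Rough flow: look up or allocate the requested pages, widen the range to
 * nm_rsize alignment, read any PG_FAKE pages from the server with one or
 * more sub-buffers, zero anything past EOF, and, for NFSv3 write faults,
 * clear the pages' commit state before returning them.)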
 */
int
nfs_getpages(v)
	void *v;
{
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		voff_t a_offset;
		struct vm_page **a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	} */ *ap = v;

	off_t eof, offset, origoffset, startoffset, endoffset;
	int s, i, error, npages, orignpages, npgs, ridx, pidx, pcount;
	vaddr_t kva;
	struct buf *bp, *mbp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct uvm_object *uobj = &vp->v_uvm.u_obj;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
	int flags = ap->a_flags;
	int bsize;
	struct vm_page *pgs[16];	/* XXXUBC 16 */
	boolean_t v3 = NFS_ISV3(vp);
	boolean_t async = (flags & PGO_SYNCIO) == 0;
	boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;

	UVMHIST_FUNC("nfs_getpages"); UVMHIST_CALLED(ubchist);
	UVMHIST_LOG(ubchist, "vp %p off 0x%x count %d", vp, (int)ap->a_offset,
	    *ap->a_count, 0);

#ifdef DIAGNOSTIC
	if (ap->a_centeridx < 0 || ap->a_centeridx >= *ap->a_count) {
		panic("nfs_getpages: centeridx %d out of range",
		    ap->a_centeridx);
	}
#endif

	error = 0;
	origoffset = ap->a_offset;
	eof = vp->v_uvm.u_size;
	if (origoffset >= eof) {
		if ((flags & PGO_LOCKED) == 0) {
			simple_unlock(&uobj->vmobjlock);
		}
		UVMHIST_LOG(ubchist, "off 0x%x past EOF 0x%x",
		    (int)origoffset, (int)eof, 0, 0);
		return EINVAL;
	}

	if (flags & PGO_LOCKED) {
		uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
		    UFP_NOWAIT|UFP_NOALLOC);
		return 0;
	}

	/* vnode is VOP_LOCKed, uobj is locked */

	if (write && (vp->v_flag & VONWORKLST) == 0) {
		vn_syncer_add_to_worklist(vp, filedelay);
	}

	bsize = nmp->nm_rsize;
	orignpages = MIN(*ap->a_count,
	    round_page(eof - origoffset) >> PAGE_SHIFT);
	npages = orignpages;
	startoffset = origoffset & ~(bsize - 1);
	endoffset = round_page((origoffset + (npages << PAGE_SHIFT)
	    + bsize - 1) & ~(bsize - 1));
	endoffset = MIN(endoffset, round_page(eof));
	ridx = (origoffset - startoffset) >> PAGE_SHIFT;

	if (!async && !write) {
		int rapages = MAX(PAGE_SIZE, nmp->nm_rsize) >> PAGE_SHIFT;

		(void) VOP_GETPAGES(vp, endoffset, NULL, &rapages, 0,
		    VM_PROT_READ, 0, 0);
		simple_lock(&uobj->vmobjlock);
	}

	UVMHIST_LOG(ubchist, "npages %d offset 0x%x", npages,
	    (int)origoffset, 0, 0);
	memset(pgs, 0, sizeof(pgs));
	uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL);

	if (flags & PGO_OVERWRITE) {
		UVMHIST_LOG(ubchist, "PGO_OVERWRITE", 0, 0, 0, 0);

		/* XXXUBC for now, zero the page if we allocated it */
		for (i = 0; i < npages; i++) {
			struct vm_page *pg = pgs[ridx + i];

			if (pg->flags & PG_FAKE) {
				uvm_pagezero(pg);
				pg->flags &= ~(PG_FAKE);
			}
		}
		npages += ridx;
		if (v3) {
			simple_unlock(&uobj->vmobjlock);
			goto uncommit;
		}
		goto out;
	}

	/*
	 * if the pages are already resident, just return them.
	 */

	for (i = 0; i < npages; i++) {
		struct vm_page *pg = pgs[ridx + i];

		if ((pg->flags & PG_FAKE) != 0 ||
		    ((ap->a_access_type & VM_PROT_WRITE) &&
		     (pg->flags & PG_RDONLY))) {
			break;
		}
	}
	if (i == npages) {
		UVMHIST_LOG(ubchist, "returning cached pages", 0, 0, 0, 0);
		npages += ridx;
		goto out;
	}

	/*
	 * the page wasn't resident and we're not overwriting,
	 * so we're going to have to do some i/o.
	 * find any additional pages needed to cover the expanded range.
	 */

	if (startoffset != origoffset ||
	    startoffset + (npages << PAGE_SHIFT) != endoffset) {

		/*
		 * XXXUBC we need to avoid deadlocks caused by locking
		 * additional pages at lower offsets than pages we
		 * already have locked.  for now, unlock them all and
		 * start over.
		 */

		for (i = 0; i < npages; i++) {
			struct vm_page *pg = pgs[ridx + i];

			if (pg->flags & PG_FAKE) {
				pg->flags |= PG_RELEASED;
			}
		}
		uvm_page_unbusy(&pgs[ridx], npages);
		memset(pgs, 0, sizeof(pgs));

		UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x",
		    startoffset, endoffset, 0, 0);
		npages = (endoffset - startoffset) >> PAGE_SHIFT;
		npgs = npages;
		uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL);
	}
	simple_unlock(&uobj->vmobjlock);

	/*
	 * update the cached read creds for this node.
	 */

	if (np->n_rcred) {
		crfree(np->n_rcred);
	}
	np->n_rcred = curproc->p_ucred;
	crhold(np->n_rcred);

	/*
	 * read the desired page(s).
	 */

	totalbytes = npages << PAGE_SHIFT;
	bytes = MIN(totalbytes, vp->v_uvm.u_size - startoffset);
	tailbytes = totalbytes - bytes;
	skipbytes = 0;

	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
	    UVMPAGER_MAPIN_READ);

	s = splbio();
	mbp = pool_get(&bufpool, PR_WAITOK);
	splx(s);
	mbp->b_bufsize = totalbytes;
	mbp->b_data = (void *)kva;
	mbp->b_resid = mbp->b_bcount = bytes;
	mbp->b_flags = B_BUSY|B_READ|(async ? B_CALL|B_ASYNC : 0);
	mbp->b_iodone = uvm_aio_biodone;
	mbp->b_vp = vp;
	mbp->b_proc = NULL;	/* XXXUBC */
	LIST_INIT(&mbp->b_dep);

	/*
	 * if EOF is in the middle of the last page, zero the part past EOF.
	 */

	if (tailbytes > 0 && (pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE)) {
		memset((char *)kva + bytes, 0, tailbytes);
	}

	/*
	 * now loop over the pages, reading as needed.
	 */

	bp = NULL;
	for (offset = startoffset;
	     bytes > 0;
	     offset += iobytes, bytes -= iobytes) {

		/*
		 * skip pages which don't need to be read.
		 */

		pidx = (offset - startoffset) >> PAGE_SHIFT;
		UVMHIST_LOG(ubchist, "pidx %d offset 0x%x startoffset 0x%x",
		    pidx, (int)offset, (int)startoffset, 0);
		while ((pgs[pidx]->flags & PG_FAKE) == 0) {
			size_t b;

			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
			b = MIN(PAGE_SIZE, bytes);
			offset += b;
			bytes -= b;
			skipbytes += b;
			pidx++;
			UVMHIST_LOG(ubchist, "skipping, new offset 0x%x",
			    (int)offset, 0, 0, 0);
			if (bytes == 0) {
				goto loopdone;
			}
		}

		/*
		 * see how many pages can be read with this i/o.
		 * reduce the i/o size if necessary.
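		 * (contiguous PG_FAKE pages are coalesced into a single
		 * transfer, which is then clipped to nm_rsize.)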
		 */

		iobytes = bytes;
		if (offset + iobytes > round_page(offset)) {
			pcount = 1;
			while (pidx + pcount < npages &&
			    pgs[pidx + pcount]->flags & PG_FAKE) {
				pcount++;
			}
			iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
			    (offset - trunc_page(offset)));
		}
		iobytes = MIN(iobytes, nmp->nm_rsize);

		/*
		 * allocate a sub-buf for this piece of the i/o
		 * (or just use mbp if there's only 1 piece),
		 * and start it going.
		 */

		if (offset == startoffset && iobytes == bytes) {
			bp = mbp;
		} else {
			s = splbio();
			bp = pool_get(&bufpool, PR_WAITOK);
			splx(s);
			bp->b_data = (char *)kva + offset - startoffset;
			bp->b_resid = bp->b_bcount = iobytes;
			bp->b_flags = B_BUSY|B_READ|B_CALL|B_ASYNC;
			bp->b_iodone = uvm_aio_biodone1;
			bp->b_vp = vp;
			bp->b_proc = NULL;	/* XXXUBC */
			LIST_INIT(&bp->b_dep);
		}
		bp->b_private = mbp;
		bp->b_lblkno = bp->b_blkno = offset >> DEV_BSHIFT;

		UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
		    bp, offset, iobytes, bp->b_blkno);

		VOP_STRATEGY(bp);
	}

loopdone:
	if (skipbytes) {
		s = splbio();
		mbp->b_resid -= skipbytes;
		if (mbp->b_resid == 0) {
			biodone(mbp);
		}
		splx(s);
	}
	if (async) {
		UVMHIST_LOG(ubchist, "returning 0 (async)", 0, 0, 0, 0);
		return 0;
	}
	if (bp != NULL) {
		error = biowait(mbp);
	}
	s = splbio();
	pool_put(&bufpool, mbp);
	splx(s);
	uvm_pagermapout(kva, npages);

	if (write && v3) {
uncommit:
		lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
		nfs_del_committed_range(vp, origoffset, npages);
		nfs_del_tobecommitted_range(vp, origoffset, npages);
		simple_lock(&uobj->vmobjlock);
		for (i = 0; i < npages; i++) {
			if (pgs[i] == NULL) {
				continue;
			}
			pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY);
		}
		simple_unlock(&uobj->vmobjlock);
		lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
	}

	simple_lock(&uobj->vmobjlock);

out:
	if (error) {
		uvm_lock_pageq();
		for (i = 0; i < npages; i++) {
			if (pgs[i] == NULL) {
				continue;
			}
			UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
			    pgs[i], pgs[i]->flags, 0, 0);
			if (pgs[i]->flags & PG_WANTED) {
				wakeup(pgs[i]);
			}
			if (pgs[i]->flags & PG_RELEASED) {
				uvm_unlock_pageq();
				(uobj->pgops->pgo_releasepg)(pgs[i], NULL);
				uvm_lock_pageq();
				continue;
			}
			if (pgs[i]->flags & PG_FAKE) {
				uvm_pagefree(pgs[i]);
				continue;
			}
			uvm_pageactivate(pgs[i]);
			pgs[i]->flags &= ~(PG_WANTED|PG_BUSY);
			UVM_PAGE_OWN(pgs[i], NULL);
		}
		uvm_unlock_pageq();
		simple_unlock(&uobj->vmobjlock);
		UVMHIST_LOG(ubchist, "returning error %d", error, 0, 0, 0);
		return error;
	}

	UVMHIST_LOG(ubchist, "ridx %d count %d", ridx, npages, 0, 0);
	uvm_lock_pageq();
	for (i = 0; i < npages; i++) {
		if (pgs[i] == NULL) {
			continue;
		}
		UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
		    pgs[i], pgs[i]->flags, 0, 0);
		if (pgs[i]->flags & PG_FAKE) {
			UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x",
			    pgs[i], (int)pgs[i]->offset, 0, 0);
			pgs[i]->flags &= ~(PG_FAKE);
			pmap_clear_modify(pgs[i]);
			pmap_clear_reference(pgs[i]);
		}
		if (i < ridx || i >= ridx + orignpages || async) {
			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
			    pgs[i], (int)pgs[i]->offset, 0, 0);
			if (pgs[i]->flags & PG_WANTED) {
				wakeup(pgs[i]);
			}
			if (pgs[i]->flags & PG_RELEASED) {
				uvm_unlock_pageq();
				(uobj->pgops->pgo_releasepg)(pgs[i], NULL);
				uvm_lock_pageq();
				continue;
			}
			uvm_pageactivate(pgs[i]);
			pgs[i]->flags &= ~(PG_WANTED|PG_BUSY);
			UVM_PAGE_OWN(pgs[i], NULL);
		}
	}
	uvm_unlock_pageq();
	simple_unlock(&uobj->vmobjlock);
	if (ap->a_m != NULL) {
		memcpy(ap->a_m, &pgs[ridx],
		    *ap->a_count * sizeof(struct vm_page *));
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(v)
	void *v;
{
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		struct vm_page **a_m;
		int a_count;
		int a_flags;
		int *a_rtvals;
	} */ *ap = v;

	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct buf *bp, *mbp;
	struct vm_page **pgs = ap->a_m;
	int flags = ap->a_flags;
	int npages = ap->a_count;
	int s, error, i;
	size_t bytes, iobytes, skipbytes;
	vaddr_t kva;
	off_t offset, origoffset, commitoff;
	uint32_t commitbytes;
	boolean_t v3 = NFS_ISV3(vp);
	boolean_t async = (flags & PGO_SYNCIO) == 0;
	boolean_t weak = (flags & PGO_WEAK) && v3;
	UVMHIST_FUNC("nfs_putpages"); UVMHIST_CALLED(ubchist);

	UVMHIST_LOG(ubchist, "vp %p pgp %p count %d",
	    vp, ap->a_m, ap->a_count, 0);

	simple_unlock(&vp->v_uvm.u_obj.vmobjlock);

	error = 0;
	origoffset = pgs[0]->offset;
	bytes = MIN(ap->a_count << PAGE_SHIFT, vp->v_uvm.u_size - origoffset);
	skipbytes = 0;

	/*
	 * if the range has been committed already, mark the pages thus.
	 * if the range just needs to be committed, we're done
	 * if it's a weak putpage, otherwise commit the range.
	 */

	if (v3) {
		lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
		if (nfs_in_committed_range(vp, origoffset, bytes)) {
			goto committed;
		}
		if (nfs_in_tobecommitted_range(vp, origoffset, bytes)) {
			if (weak) {
				lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
				return 0;
			} else {
				commitoff = np->n_pushlo;
				commitbytes = (uint32_t)(np->n_pushhi -
				    np->n_pushlo);
				goto commit;
			}
		}
		lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
	}

	/*
	 * otherwise write or commit all the pages.
	 */

	kva = uvm_pagermapin(pgs, ap->a_count, UVMPAGER_MAPIN_WAITOK|
	    UVMPAGER_MAPIN_WRITE);

	s = splbio();
	vp->v_numoutput += 2;
	mbp = pool_get(&bufpool, PR_WAITOK);
	UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
	    vp, mbp, vp->v_numoutput, bytes);
	splx(s);
	mbp->b_bufsize = npages << PAGE_SHIFT;
	mbp->b_data = (void *)kva;
	mbp->b_resid = mbp->b_bcount = bytes;
	mbp->b_flags = B_BUSY|B_WRITE|B_AGE |
	    (async ? B_CALL|B_ASYNC : 0) |
	    (curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0);
	mbp->b_iodone = uvm_aio_biodone;
	mbp->b_vp = vp;
	mbp->b_proc = NULL;	/* XXXUBC */
	LIST_INIT(&mbp->b_dep);

	for (offset = origoffset;
	     bytes > 0;
	     offset += iobytes, bytes -= iobytes) {
		iobytes = MIN(nmp->nm_wsize, bytes);

		/*
		 * skip writing any pages which only need a commit.
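		 * (in effect, such pages were already written to the server
		 * with an UNSTABLE write; they are counted as skipbytes here
		 * and left to the commit handling below.)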
		 */

		if ((pgs[(offset - origoffset) >> PAGE_SHIFT]->flags &
		    PG_NEEDCOMMIT) != 0) {
			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
			iobytes = MIN(PAGE_SIZE, bytes);
			skipbytes += iobytes;
			continue;
		}

		/* if it's really one i/o, don't make a second buf */
		if (offset == origoffset && iobytes == bytes) {
			bp = mbp;
		} else {
			s = splbio();
			vp->v_numoutput++;
			bp = pool_get(&bufpool, PR_WAITOK);
			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
			    vp, bp, vp->v_numoutput, 0);
			splx(s);
			bp->b_data = (char *)kva + (offset - origoffset);
			bp->b_resid = bp->b_bcount = iobytes;
			bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
			bp->b_iodone = uvm_aio_biodone1;
			bp->b_vp = vp;
			bp->b_proc = NULL;	/* XXXUBC */
			LIST_INIT(&bp->b_dep);
		}
		bp->b_private = mbp;
		bp->b_lblkno = bp->b_blkno = (daddr_t)(offset >> DEV_BSHIFT);
		UVMHIST_LOG(ubchist, "bp %p numout %d",
		    bp, vp->v_numoutput, 0, 0);
		VOP_STRATEGY(bp);
	}
	if (skipbytes) {
		UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0, 0, 0);
		s = splbio();
		mbp->b_resid -= skipbytes;
		if (mbp->b_resid == 0) {
			biodone(mbp);
		}
		splx(s);
	}
	if (async) {
		return 0;
	}
	if (bp != NULL) {
		error = biowait(mbp);
	}

	s = splbio();
	vwakeup(mbp);
	pool_put(&bufpool, mbp);
	splx(s);

	uvm_pagermapout(kva, ap->a_count);
	if (error || !v3) {
		UVMHIST_LOG(ubchist, "returning error %d", error, 0, 0, 0);
		return error;
	}

	/*
	 * for a weak put, mark the range as "to be committed"
	 * and mark the pages read-only so that we will be notified
	 * to remove the pages from the "to be committed" range
	 * if they are made dirty again.
	 * for a strong put, commit the pages and remove them from the
	 * "to be committed" range.  also, mark them as writable
	 * and not cleanable with just a commit.
	 */

	lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
	if (weak) {
		nfs_add_tobecommitted_range(vp, origoffset,
		    npages << PAGE_SHIFT);
		for (i = 0; i < npages; i++) {
			pgs[i]->flags |= PG_NEEDCOMMIT|PG_RDONLY;
		}
	} else {
		commitoff = origoffset;
		commitbytes = npages << PAGE_SHIFT;
commit:
		error = nfs_commit(vp, commitoff, commitbytes, curproc);
		nfs_del_tobecommitted_range(vp, commitoff, commitbytes);
committed:
		for (i = 0; i < npages; i++) {
			pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY);
		}
	}
	lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
	return error;
}