/*	$NetBSD: nfs_bio.c,v 1.58 2000/12/27 05:15:43 chs Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include "opt_nfs.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/malloc.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, cflag)
	struct vnode *vp;
	struct uio *uio;
	int ioflag, cflag;
	struct ucred *cred;
{
	struct nfsnode *np = VTONFS(vp);
	int biosize;
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	caddr_t baddr, ep, edp;
	int got_buf = 0, error = 0, n = 0, on = 0, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp;
	off_t curoff = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = nmp->nm_rsize;

	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date.  If you find that you need
	 * current attributes this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */

	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR) {
					nfs_invaldircache(vp, 0);
					np->n_direofoffset = 0;
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}

	/*
	 * update the cached read creds for this node.
	 */

	if (np->n_rcred) {
		crfree(np->n_rcred);
	}
	np->n_rcred = cred;
	crhold(cred);

	do {
#ifndef NFS_V2_ONLY
		/*
		 * Get a valid lease.  If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ,
					    cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) &&
				     vp->v_type == VDIR)) {
					if (vp->v_type == VDIR) {
						nfs_invaldircache(vp, 0);
						np->n_direofoffset = 0;
					}
					error = nfs_vinvalbuf(vp, V_SAVE,
					    cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR &&
			    (np->n_flag & NMODIFIED)) {
				nfs_invaldircache(vp, 0);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				np->n_direofoffset = 0;
				if (error)
					return (error);
			}
		}
#endif
		/*
		 * Don't cache symlinks.
		 */
		if (np->n_flag & NQNFSNONCACHE
		    || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			};
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;

			error = 0;
			while (uio->uio_resid > 0) {
				void *win;
				vsize_t bytelen = min(np->n_size -
				    uio->uio_offset, uio->uio_resid);

				if (bytelen == 0)
					break;
				win = ubc_alloc(&vp->v_uvm.u_obj,
				    uio->uio_offset, &bytelen, UBC_READ);
				error = uiomove(win, bytelen, uio);
				ubc_release(win, 0);
				if (error) {
					break;
				}
			}
			n = 0;
			break;

		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
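		/*
		 * Directory reads below go through the NFS directory
		 * cookie cache (nfs_searchdircache()/nfs_enterdircache()).
		 * Roughly, each cached entry maps a directory offset
		 * cookie (64-bit, or a translated 32-bit value when
		 * NFSMNT_XLATECOOKIE is set) to the cache block and entry
		 * index where the entry it refers to was last seen, so a
		 * later read starting at that cookie can find its block
		 * again without asking the server.
		 */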
		case VDIR:
 diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
			    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache.  If we're not translating
				 * 32 <-> 64, it may be a value that was
				 * flushed out of the cache because it grew
				 * too big.  Let the server judge if it's
				 * valid or not.  In the translation case,
				 * we have no way of validating this value,
				 * so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
				    uio->uio_offset, 0, 0);
			}

			if (uio->uio_offset != 0 &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfsstats.direofcache_hits++;
				return (0);
			}

			bp = nfs_getcacheblk(vp, ndp->dc_blkno,
			    NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp, p);
				if (error) {
					/*
					 * Yuck! The directory has been
					 * modified on the server.  Punt and
					 * let the userland code deal with it.
					 */
					brelse(bp);
					if (error == NFSERR_BAD_COOKIE) {
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred, p, 1);
						error = EINVAL;
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block.  Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 */
			if (np->n_direofoffset != 0 &&
			    ndp->dc_blkcookie == np->n_direofoffset) {
				brelse(bp);
				return (0);
			}

			/*
			 * Find the entry we were looking for in the block.
			 */

			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = bp->b_data + bp->b_bcount;
			enn = 0;
			while (enn < en && (caddr_t)dp < edp) {
				pdp = dp;
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the previous
			 * entry doesn't match, the directory cache is
			 * stale.  Flush it and try again (i.e. go to
			 * the server).
			 */
			if ((caddr_t)dp >= edp ||
			    (caddr_t)dp + dp->d_reclen > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
				printf("invalid cache: %p %p %p off %lx %lx\n",
				    pdp, dp, edp,
				    (unsigned long)uio->uio_offset,
				    (unsigned long)NFS_GETCOOKIE(pdp));
#endif
				brelse(bp);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, p, 0);
				goto diragain;
			}

			on = (caddr_t)dp - bp->b_data;

			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us.  The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			if (en == 0 && pdp == dp) {
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			if (uio->uio_resid < (bp->b_bcount - on)) {
				n = uio->uio_resid;
				enough = 1;
			} else
				n = bp->b_bcount - on;

			ep = bp->b_data + on + n;

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
			 */

			while ((caddr_t)dp < ep &&
			    (caddr_t)dp + dp->d_reclen <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp,
					    NFS_GETCOOKIE(pdp),
					    ndp->dc_blkcookie, enn,
					    bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
				}
				pdp = dp;
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the last requested entry was not the last in the
			 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
			 * cache the cookie of the last requested one, and
			 * set the offset to it.
			 */

			if ((on + n) < bp->b_bcount) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff,
				    ndp->dc_blkcookie, enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie,
			    bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = ((caddr_t)pdp + pdp->d_reclen) - (bp->b_data + on);

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have
			 * the directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    np->n_direofoffset == 0 &&
			    !(np->n_flag & NQNFSNONCACHE)) {
				rabp = nfs_getcacheblk(vp, nndp->dc_blkno,
				    NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags &
					    (B_DONE | B_DELWRI)) == 0) {
						rabp->b_dcookie =
						    nndp->dc_cookie;
						rabp->b_flags |=
						    (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp)) {
							rabp->b_flags |=
							    B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
			break;
		}

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			uio->uio_offset = curoff;
			if (enough)
				n = 0;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
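/*
 * Note on the write path below: data is copied through a UBC window
 * (ubc_alloc()/uiomove()/ubc_release()) and flushed in units of the
 * mount's write size.  Masking an offset with ~(nmp->nm_wsize - 1)
 * rounds it down to a wsize boundary; for example, with an 8192-byte
 * (0x2000) wsize, offsets 0x2345 and 0x3fff both round down to 0x2000,
 * so a flush always covers whole write-size blocks.
 */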
int
nfs_write(v)
	void *v;
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, iomode, must_commit;
	int rv;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * update the cached write creds for this node.
	 */

	if (np->n_wcred) {
		crfree(np->n_wcred);
	}
	np->n_wcred = cred;
	crhold(cred);

	if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		iomode = NFSV3WRITE_FILESYNC;
		error = nfs_writerpc(vp, uio, &iomode, &must_commit);
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
		return (error);
	}

	do {
		void *win;
		voff_t oldoff = uio->uio_offset;
		vsize_t bytelen = uio->uio_resid;

#ifndef NFS_V2_ONLY
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
#endif
		nfsstats.biocache_writes++;

		np->n_flag |= NMODIFIED;
		if (np->n_size < uio->uio_offset + bytelen) {
			np->n_size = uio->uio_offset + bytelen;
			uvm_vnp_setsize(vp, np->n_size);
		}
		win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen,
		    UBC_WRITE);
		error = uiomove(win, bytelen, uio);
		ubc_release(win, 0);
		rv = 1;
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			simple_lock(&vp->v_uvm.u_obj.vmobjlock);
			rv = vp->v_uvm.u_obj.pgops->pgo_flush(
			    &vp->v_uvm.u_obj,
			    oldoff & ~(nmp->nm_wsize - 1),
			    uio->uio_offset & ~(nmp->nm_wsize - 1),
			    PGO_CLEANIT|PGO_SYNCIO);
			simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
		} else if ((oldoff & ~(nmp->nm_wsize - 1)) !=
		    (uio->uio_offset & ~(nmp->nm_wsize - 1))) {
			simple_lock(&vp->v_uvm.u_obj.vmobjlock);
			rv = vp->v_uvm.u_obj.pgops->pgo_flush(
			    &vp->v_uvm.u_obj,
			    oldoff & ~(nmp->nm_wsize - 1),
			    uio->uio_offset & ~(nmp->nm_wsize - 1),
			    PGO_CLEANIT|PGO_WEAK);
			simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
		}
		if (!rv) {
			error = EIO;
		}
		if (error) {
			break;
		}
	} while (uio->uio_resid > 0);
	return error;
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
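/*
 * A typical caller (cf. nfs_bioread() above) looks like:
 *
 *	bp = nfs_getcacheblk(vp, bn, size, p);
 *	if (!bp)
 *		return (EINTR);
 */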
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, p))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
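/*
 * Serialization is done with the NFLUSHINPROG/NFLUSHWANT bits in
 * np->n_flag: the first flusher sets NFLUSHINPROG, later callers set
 * NFLUSHWANT and sleep on &np->n_flag until the flusher wakes them up.
 */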
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, NULL, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, NULL, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp)
	struct buf *bp;
{
	int i;
	struct nfsmount *nmp;
	int gotiod, slpflag = 0, slptimeo = 0, error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
 again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */

	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			nfs_iodwant[i] = NULL;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod && nmp->nm_bufqiods > 0)
		gotiod = TRUE;

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0)
				goto again;
		}
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.
 */
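/*
 * Buffers queued by nfs_asyncio() above end up here as well: an nfsiod
 * takes them off nmp->nm_bufq and hands each one to nfs_doio(), so this
 * is the single place where cache blocks are turned into read, write,
 * readdir or readlink RPCs.
 */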
int
nfs_doio(bp, p)
	struct buf *bp;
	struct proc *p;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop);
		} else {
			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop);
			if (!error && uiop->uio_resid) {

				/*
				 * If len > 0, there is a hole in the file and
				 * no writes after the hole have been pushed to
				 * the server yet.
				 * Just zero fill the rest of the valid area.
				 */

				diff = bp->b_bcount - uiop->uio_resid;
				len = np->n_size - ((((off_t)bp->b_blkno) <<
				    DEV_BSHIFT) + diff);
				if (len > 0) {
					len = min(len, uiop->uio_resid);
					memset((char *)bp->b_data + diff, 0,
					    len);
				}
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr->va_mtime.tv_sec))) {
				uprintf("Process killed due to "
				    "text file modification\n");
				psignal(p, SIGKILL);
				p->p_holdcnt++;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, curproc->p_ucred);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = bp->b_dcookie;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop,
				    curproc->p_ucred);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop,
				    curproc->p_ucred);
			if (!error) {
				bp->b_dcookie = uiop->uio_offset;
			}
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If B_NEEDCOMMIT is set, a commit rpc may do the trick.
		 * If not, an actual write will have to be scheduled.
		 */

		io.iov_base = bp->b_data;
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT);
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		iomode = NFSV3WRITE_UNSTABLE;
		error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}
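/*
 * NFSv3 note: nfs_doio() sends ordinary writes with NFSV3WRITE_UNSTABLE,
 * so the server may acknowledge them before the data is on stable
 * storage.  Such data must later be committed; nfs_putpages() below
 * tracks the "to be committed" ranges and issues nfs_commit() calls.
 * When nfs_writerpc() reports must_commit (the server's write verifier
 * changed), nfs_clearcommit() is used to force the data to be resent.
 */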
/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(v)
	void *v;
{
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		voff_t a_offset;
		vm_page_t *a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	} */ *ap = v;

	off_t eof, offset, origoffset, startoffset, endoffset;
	int s, i, error, npages, orignpages, npgs, ridx, pidx, pcount;
	vaddr_t kva;
	struct buf *bp, *mbp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct uvm_object *uobj = &vp->v_uvm.u_obj;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
	int flags = ap->a_flags;
	int bsize;
	struct vm_page *pgs[16];		/* XXXUBC 16 */
	boolean_t v3 = NFS_ISV3(vp);
	boolean_t async = (flags & PGO_SYNCIO) == 0;
	boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;

	UVMHIST_FUNC("nfs_getpages"); UVMHIST_CALLED(ubchist);
	UVMHIST_LOG(ubchist, "vp %p off 0x%x count %d", vp, (int)ap->a_offset,
	    *ap->a_count, 0);

#ifdef DIAGNOSTIC
	if (ap->a_centeridx < 0 || ap->a_centeridx >= *ap->a_count) {
		panic("nfs_getpages: centeridx %d out of range",
		    ap->a_centeridx);
	}
#endif

	error = 0;
	origoffset = ap->a_offset;
	eof = vp->v_uvm.u_size;
	if (origoffset >= eof) {
		if ((flags & PGO_LOCKED) == 0) {
			simple_unlock(&uobj->vmobjlock);
		}
		UVMHIST_LOG(ubchist, "off 0x%x past EOF 0x%x",
		    (int)origoffset, (int)eof, 0, 0);
		return EINVAL;
	}

	if (flags & PGO_LOCKED) {
		uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
		    UFP_NOWAIT|UFP_NOALLOC);
		return 0;
	}

	/* vnode is VOP_LOCKed, uobj is locked */

	bsize = nmp->nm_rsize;
	orignpages = min(*ap->a_count,
	    round_page(eof - origoffset) >> PAGE_SHIFT);
	npages = orignpages;
	startoffset = origoffset & ~(bsize - 1);
	endoffset = round_page((origoffset + (npages << PAGE_SHIFT)
	    + bsize - 1) & ~(bsize - 1));
	endoffset = min(endoffset, round_page(eof));
	ridx = (origoffset - startoffset) >> PAGE_SHIFT;

	if (!async && !write) {
		int rapages = max(PAGE_SIZE, nmp->nm_rsize) >> PAGE_SHIFT;

		(void) VOP_GETPAGES(vp, endoffset, NULL, &rapages, 0,
		    VM_PROT_READ, 0, 0);
		simple_lock(&uobj->vmobjlock);
	}

	UVMHIST_LOG(ubchist, "npages %d offset 0x%x", npages,
	    (int)origoffset, 0, 0);
	memset(pgs, 0, sizeof(pgs));
	uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL);

	if (flags & PGO_OVERWRITE) {
		UVMHIST_LOG(ubchist, "PGO_OVERWRITE", 0, 0, 0, 0);

		/* XXXUBC for now, zero the page if we allocated it */
		for (i = 0; i < npages; i++) {
			struct vm_page *pg = pgs[ridx + i];

			if (pg->flags & PG_FAKE) {
				uvm_pagezero(pg);
				pg->flags &= ~(PG_FAKE);
			}
		}
		if (v3) {
			simple_unlock(&uobj->vmobjlock);
			npages += ridx;
			goto uncommit;
		}
		goto out;
	}

	/*
	 * if the pages are already resident, just return them.
	 */

	for (i = 0; i < npages; i++) {
		struct vm_page *pg = pgs[ridx + i];

		if ((pg->flags & PG_FAKE) != 0 ||
		    ((ap->a_access_type & VM_PROT_WRITE) &&
		     (pg->flags & PG_RDONLY))) {
			break;
		}
	}
	if (i == npages) {
		UVMHIST_LOG(ubchist, "returning cached pages", 0, 0, 0, 0);
		goto out;
	}

	/*
	 * the page wasn't resident and we're not overwriting,
	 * so we're going to have to do some i/o.
	 * find any additional pages needed to cover the expanded range.
	 */

	if (startoffset != origoffset ||
	    startoffset + (npages << PAGE_SHIFT) != endoffset) {

		/*
		 * XXXUBC we need to avoid deadlocks caused by locking
		 * additional pages at lower offsets than pages we
		 * already have locked.  for now, unlock them all and
		 * start over.
		 */

		for (i = 0; i < npages; i++) {
			struct vm_page *pg = pgs[ridx + i];

			if (pg->flags & PG_FAKE) {
				pg->flags |= PG_RELEASED;
			}
		}
		uvm_page_unbusy(&pgs[ridx], npages);
		memset(pgs, 0, sizeof(pgs));

		UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x",
		    startoffset, endoffset, 0, 0);
		npages = (endoffset - startoffset) >> PAGE_SHIFT;
		npgs = npages;
		uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL);
	}
	simple_unlock(&uobj->vmobjlock);

	/*
	 * update the cached read creds for this node.
	 */

	if (np->n_rcred) {
		crfree(np->n_rcred);
	}
	np->n_rcred = curproc->p_ucred;
	crhold(np->n_rcred);

	/*
	 * read the desired page(s).
	 */

	totalbytes = npages << PAGE_SHIFT;
	bytes = min(totalbytes, vp->v_uvm.u_size - startoffset);
	tailbytes = totalbytes - bytes;
	skipbytes = 0;

	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
	    UVMPAGER_MAPIN_READ);

	s = splbio();
	mbp = pool_get(&bufpool, PR_WAITOK);
	splx(s);
	mbp->b_bufsize = totalbytes;
	mbp->b_data = (void *)kva;
	mbp->b_resid = mbp->b_bcount = bytes;
	mbp->b_flags = B_BUSY|B_READ|(async ? B_CALL|B_ASYNC : 0);
	mbp->b_iodone = uvm_aio_biodone;
	mbp->b_vp = vp;
	LIST_INIT(&mbp->b_dep);

	/*
	 * if EOF is in the middle of the last page, zero the part past EOF.
	 */

	if (tailbytes > 0 && (pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE)) {
		memset((char *)kva + bytes, 0, tailbytes);
	}

	/*
	 * now loop over the pages, reading as needed.
	 */

	bp = NULL;
	for (offset = startoffset;
	    bytes > 0;
	    offset += iobytes, bytes -= iobytes) {

		/*
		 * skip pages which don't need to be read.
		 */

		pidx = (offset - startoffset) >> PAGE_SHIFT;
		UVMHIST_LOG(ubchist, "pidx %d offset 0x%x startoffset 0x%x",
		    pidx, (int)offset, (int)startoffset, 0);
		while ((pgs[pidx]->flags & PG_FAKE) == 0) {
			size_t b;

			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
			b = min(PAGE_SIZE, bytes);
			offset += b;
			bytes -= b;
			skipbytes += b;
			pidx++;
			UVMHIST_LOG(ubchist, "skipping, new offset 0x%x",
			    (int)offset, 0, 0, 0);
			if (bytes == 0) {
				goto loopdone;
			}
		}

		/*
		 * see how many pages can be read with this i/o.
		 * reduce the i/o size if necessary.
		 */

		iobytes = bytes;
		if (offset + iobytes > round_page(offset)) {
			pcount = 1;
			while (pidx + pcount < npages &&
			    pgs[pidx + pcount]->flags & PG_FAKE) {
				pcount++;
			}
			iobytes = min(iobytes, (pcount << PAGE_SHIFT) -
			    (offset - trunc_page(offset)));
		}
		iobytes = min(iobytes, nmp->nm_rsize);

		/*
		 * allocate a sub-buf for this piece of the i/o
		 * (or just use mbp if there's only 1 piece),
		 * and start it going.
		 */

		if (offset == startoffset && iobytes == bytes) {
			bp = mbp;
		} else {
			s = splbio();
			bp = pool_get(&bufpool, PR_WAITOK);
			splx(s);
			bp->b_data = (char *)kva + offset - startoffset;
			bp->b_resid = bp->b_bcount = iobytes;
			bp->b_flags = B_BUSY|B_READ|B_CALL|B_ASYNC;
			bp->b_iodone = uvm_aio_biodone1;
			bp->b_vp = vp;
			LIST_INIT(&bp->b_dep);
		}
		bp->b_private = mbp;
		bp->b_lblkno = bp->b_blkno = offset >> DEV_BSHIFT;

		UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
		    bp, offset, iobytes, bp->b_blkno);

		VOP_STRATEGY(bp);
	}

 loopdone:
	if (skipbytes) {
		s = splbio();
		mbp->b_resid -= skipbytes;
		if (mbp->b_resid == 0) {
			biodone(mbp);
		}
		splx(s);
	}
	if (async) {
		UVMHIST_LOG(ubchist, "returning PEND", 0, 0, 0, 0);
		return EINPROGRESS;
	}
	if (bp != NULL) {
		error = biowait(mbp);
	}
	s = splbio();
	pool_put(&bufpool, mbp);
	splx(s);
	uvm_pagermapout(kva, npages);

	if (write && v3) {
 uncommit:
		lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
		nfs_del_committed_range(vp, origoffset, npages);
		nfs_del_tobecommitted_range(vp, origoffset, npages);
		simple_lock(&uobj->vmobjlock);
		for (i = 0; i < npages; i++) {
			if (pgs[i] == NULL) {
				continue;
			}
			pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY);
		}
		simple_unlock(&uobj->vmobjlock);
		lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
	}

	simple_lock(&uobj->vmobjlock);

 out:
	uvm_lock_pageq();
	if (error) {
		for (i = 0; i < npages; i++) {
			if (pgs[i] == NULL) {
				continue;
			}
			UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
			    pgs[i], pgs[i]->flags, 0, 0);
			if ((pgs[i]->flags & PG_FAKE) == 0) {
				continue;
			}
			if (pgs[i]->flags & PG_WANTED) {
				wakeup(pgs[i]);
			}
			uvm_pagefree(pgs[i]);
		}
		goto done;
	}

	UVMHIST_LOG(ubchist, "ridx %d count %d", ridx, npages, 0, 0);
	for (i = 0; i < npages; i++) {
		if (pgs[i] == NULL) {
			continue;
		}
		UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
		    pgs[i], pgs[i]->flags, 0, 0);
		if (pgs[i]->flags & PG_FAKE) {
			UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x",
			    pgs[i], (int)pgs[i]->offset, 0, 0);
			pgs[i]->flags &= ~(PG_FAKE);
			pmap_clear_modify(pgs[i]);
			pmap_clear_reference(pgs[i]);
		}
		if (i < ridx || i >= ridx + orignpages || async) {
			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
			    pgs[i], (int)pgs[i]->offset, 0, 0);
			KASSERT((pgs[i]->flags & PG_RELEASED) == 0);
			if (pgs[i]->flags & PG_WANTED) {
				wakeup(pgs[i]);
			}
			if (pgs[i]->wire_count == 0) {
				uvm_pageactivate(pgs[i]);
			}
			pgs[i]->flags &= ~(PG_WANTED|PG_BUSY);
			UVM_PAGE_OWN(pgs[i], NULL);
		}
	}

 done:
	uvm_unlock_pageq();
	simple_unlock(&uobj->vmobjlock);
	if (ap->a_m != NULL) {
		memcpy(ap->a_m, &pgs[ridx],
		    *ap->a_count * sizeof(struct vm_page *));
	}

	UVMHIST_LOG(ubchist, "done -> %d", error, 0, 0, 0);
	return error;
}
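/*
 * nfs_putpages() below is reached (via the pager flush routine) through
 * the pgo_flush() calls made in nfs_write() above: a synchronous flush
 * passes PGO_CLEANIT|PGO_SYNCIO, while the flush issued after a write
 * crosses into a new write-size block passes PGO_CLEANIT|PGO_WEAK,
 * which becomes a "weak" put (commit deferred) here.
 */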
/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(v)
	void *v;
{
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		struct vm_page **a_m;
		int a_count;
		int a_flags;
		int *a_rtvals;
	} */ *ap = v;

	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct buf *bp, *mbp;
	struct vm_page **pgs = ap->a_m;
	int flags = ap->a_flags;
	int npages = ap->a_count;
	int s, error = 0, i;
	size_t bytes, iobytes, skipbytes;
	vaddr_t kva;
	off_t offset, origoffset, commitoff;
	uint32_t commitbytes;
	boolean_t v3 = NFS_ISV3(vp);
	boolean_t async = (flags & PGO_SYNCIO) == 0;
	boolean_t weak = (flags & PGO_WEAK) && v3;
	UVMHIST_FUNC("nfs_putpages"); UVMHIST_CALLED(ubchist);

	UVMHIST_LOG(ubchist, "vp %p pgp %p count %d",
	    vp, ap->a_m, ap->a_count, 0);

	simple_unlock(&vp->v_uvm.u_obj.vmobjlock);

	origoffset = pgs[0]->offset;
	bytes = min(ap->a_count << PAGE_SHIFT, vp->v_uvm.u_size - origoffset);
	skipbytes = 0;

	/*
	 * if the range has been committed already, mark the pages thus.
	 * if the range just needs to be committed, we're done
	 * if it's a weak putpage, otherwise commit the range.
	 */

	if (v3) {
		lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
		if (nfs_in_committed_range(vp, origoffset, bytes)) {
			goto committed;
		}
		if (nfs_in_tobecommitted_range(vp, origoffset, bytes)) {
			if (weak) {
				lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
				return 0;
			} else {
				commitoff = np->n_pushlo;
				commitbytes = (uint32_t)(np->n_pushhi -
				    np->n_pushlo);
				goto commit;
			}
		}
		lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
	}

	/*
	 * otherwise write or commit all the pages.
	 */

	kva = uvm_pagermapin(pgs, ap->a_count, UVMPAGER_MAPIN_WAITOK|
	    UVMPAGER_MAPIN_WRITE);

	s = splbio();
	vp->v_numoutput += 2;
	mbp = pool_get(&bufpool, PR_WAITOK);
	UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
	    vp, mbp, vp->v_numoutput, bytes);
	splx(s);
	mbp->b_bufsize = npages << PAGE_SHIFT;
	mbp->b_data = (void *)kva;
	mbp->b_resid = mbp->b_bcount = bytes;
	mbp->b_flags = B_BUSY|B_WRITE|B_AGE |
	    (async ? B_CALL|B_ASYNC : 0) |
	    (curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0);
	mbp->b_iodone = uvm_aio_aiodone;
	mbp->b_vp = vp;
	LIST_INIT(&mbp->b_dep);

	for (offset = origoffset;
	    bytes > 0;
	    offset += iobytes, bytes -= iobytes) {
		iobytes = min(nmp->nm_wsize, bytes);

		/*
		 * skip writing any pages which only need a commit.
		 */

		if ((pgs[(offset - origoffset) >> PAGE_SHIFT]->flags &
		    PG_NEEDCOMMIT) != 0) {
			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
			iobytes = min(PAGE_SIZE, bytes);
			skipbytes += iobytes;
			continue;
		}

		/* if it's really one i/o, don't make a second buf */
		if (offset == origoffset && iobytes == bytes) {
			bp = mbp;
		} else {
			s = splbio();
			vp->v_numoutput++;
			bp = pool_get(&bufpool, PR_WAITOK);
			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
			    vp, bp, vp->v_numoutput, 0);
			splx(s);
			bp->b_data = (char *)kva + (offset - origoffset);
			bp->b_resid = bp->b_bcount = iobytes;
			bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
			bp->b_iodone = uvm_aio_biodone1;
			bp->b_vp = vp;
			LIST_INIT(&bp->b_dep);
		}
		bp->b_private = mbp;
		bp->b_lblkno = bp->b_blkno = (daddr_t)(offset >> DEV_BSHIFT);
		UVMHIST_LOG(ubchist, "bp %p numout %d",
		    bp, vp->v_numoutput, 0, 0);
		VOP_STRATEGY(bp);
	}
	if (skipbytes) {
		UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0, 0, 0);
		s = splbio();
		mbp->b_resid -= skipbytes;
		if (mbp->b_resid == 0) {
			biodone(mbp);
		}
		splx(s);
	}
	if (async) {
		return EINPROGRESS;
	}
	error = biowait(mbp);

	s = splbio();
	vwakeup(mbp);
	pool_put(&bufpool, mbp);
	splx(s);

	uvm_pagermapout(kva, ap->a_count);
	if (error || !v3) {
		UVMHIST_LOG(ubchist, "returning error %d", error, 0, 0, 0);
		return error;
	}

	/*
	 * for a weak put, mark the range as "to be committed"
	 * and mark the pages read-only so that we will be notified
	 * to remove the pages from the "to be committed" range
	 * if they are made dirty again.
	 * for a strong put, commit the pages and remove them from the
	 * "to be committed" range.  also, mark them as writable
	 * and not cleanable with just a commit.
	 */

	lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
	if (weak) {
		nfs_add_tobecommitted_range(vp, origoffset,
		    npages << PAGE_SHIFT);
		for (i = 0; i < npages; i++) {
			pgs[i]->flags |= PG_NEEDCOMMIT|PG_RDONLY;
		}
	} else {
		commitoff = origoffset;
		commitbytes = npages << PAGE_SHIFT;
 commit:
		error = nfs_commit(vp, commitoff, commitbytes, curproc);
		nfs_del_tobecommitted_range(vp, commitoff, commitbytes);
 committed:
		for (i = 0; i < npages; i++) {
			pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY);
		}
	}
	lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
	return error;
}