/*	$NetBSD: nfs_bio.c,v 1.40 1997/11/23 13:52:24 fvdl Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/trace.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>

#include <vm/vm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, cflag)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag, cflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff;
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	daddr_t lbn, bn, rabn;
	caddr_t baddr, ep, edp;
	int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp;
	off_t curoff = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = nmp->nm_rsize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR) {
					nfs_invaldircache(vp, 0);
					np->n_direofoffset = 0;
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR) {
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
			}
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldircache(vp, 0);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    np->n_direofoffset = 0;
		    if (error)
			return (error);
		}
	    }
	    /*
	     * Don't use cached data when the lease is non-cacheable,
	     * and never cache a symlink that is the root of a mount.
	     */
	    if (np->n_flag & NQNFSNONCACHE
		|| ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
			    vp->v_type);
		}
	    }
	    baddr = (caddr_t)0;
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
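		/*
		 * Note: lbn counts biosize-sized logical blocks and "on"
		 * is the offset within one, while bn below is expressed
		 * in DEV_BSIZE (512 byte) units, as getblk() expects.
		 */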
		bn = lbn * (biosize / DEV_BSIZE);
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    lbn - 1 == vp->v_lastr) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL;
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
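		/*
		 * A busy buffer with a write in progress still holds
		 * data we can copy from; got_buf == 0 records that we
		 * do not own such a buffer and must not brelse() it
		 * below.
		 */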
		if ((bp = incore(vp, bn)) &&
		    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
		    (B_BUSY | B_WRITEINPROG))
			got_buf = 0;
		else {
again:
			bp = nfs_getcacheblk(vp, bn, biosize, p);
			if (!bp)
				return (EINTR);
			got_buf = 1;
			if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
				bp->b_flags |= B_READ;
				not_readin = 0;
				error = nfs_doio(bp, cred, p);
				if (error) {
				    brelse(bp);
				    return (error);
				}
			}
		}
		n = min((unsigned)(biosize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				if (!got_buf) {
				    bp = nfs_getcacheblk(vp, bn, biosize, p);
				    if (!bp)
					return (EINTR);
				    got_buf = 1;
				}
				bp->b_flags |= B_INVAFTERWRITE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
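		/*
		 * Clamp the transfer to the portion of the block that
		 * is actually valid.
		 */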
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		got_buf = 1;
		on = 0;
		break;
	    case VDIR:
diragain:
		nfsstats.biocache_readdirs++;
		ndp = nfs_searchdircache(vp, uio->uio_offset,
			(nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
		if (!ndp) {
			/*
			 * We've been handed a cookie that is not
			 * in the cache. If we're not translating
			 * 32 <-> 64, it may be a value that was
			 * flushed out of the cache because it grew
			 * too big. Let the server judge if it's
			 * valid or not. In the translation case,
			 * we have no way of validating this value,
			 * so punt.
			 */
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
				return (EINVAL);
			ndp = nfs_enterdircache(vp, uio->uio_offset,
				uio->uio_offset, 0, 0);
		}

		if (uio->uio_offset != 0 &&
		    ndp->dc_cookie == np->n_direofoffset) {
			nfsstats.direofcache_hits++;
			return (0);
		}

		bp = nfs_getcacheblk(vp, ndp->dc_blkno, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_DONE) == 0) {
		    bp->b_flags |= B_READ;
		    bp->b_dcookie = ndp->dc_blkcookie;
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			/*
			 * Yuck! The directory has been modified on the
			 * server. Punt and let the userland code
			 * deal with it.
			 */
			brelse(bp);
			if (error == NFSERR_BAD_COOKIE) {
			    nfs_invaldircache(vp, 0);
			    nfs_vinvalbuf(vp, 0, cred, p, 1);
			    error = EINVAL;
			}
			return (error);
		    }
		}

		/*
		 * Just return if we hit EOF right away with this
		 * block. Always check here, because direofoffset
		 * may have been set by an nfsiod since the last
		 * check.
		 */
		if (np->n_direofoffset != 0 &&
			ndp->dc_blkcookie == np->n_direofoffset) {
			brelse(bp);
			return (0);
		}

		/*
		 * Find the entry we were looking for in the block.
		 */

		en = ndp->dc_entry;

		pdp = dp = (struct dirent *)bp->b_data;
		edp = bp->b_data + bp->b_validend;
		enn = 0;
		while (enn < en && (caddr_t)dp < edp) {
			pdp = dp;
			dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
			enn++;
		}

		/*
		 * If the entry number was bigger than the number of
		 * entries in the block, or the cookie of the previous
		 * entry doesn't match, the directory cache is
		 * stale. Flush it and try again (i.e. go to
		 * the server).
		 */
		if ((caddr_t)dp >= edp || (caddr_t)dp + dp->d_reclen > edp ||
		    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
			printf("invalid cache: %p %p %p off %lx %lx\n",
				pdp, dp, edp,
				(unsigned long)uio->uio_offset,
				(unsigned long)NFS_GETCOOKIE(pdp));
#endif
			brelse(bp);
			nfs_invaldircache(vp, 0);
			nfs_vinvalbuf(vp, 0, cred, p, 0);
			goto diragain;
		}

		on = (caddr_t)dp - bp->b_data;

		/*
		 * Cache all entries that may be exported to the
		 * user, as they may be thrown back at us. The
		 * NFSBIO_CACHECOOKIES flag indicates that all
		 * entries are being 'exported', so cache them all.
		 */

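		/*
		 * If we start at the first entry of the block, the scan
		 * above did not advance, so pdp == dp; step dp forward
		 * one entry so pdp/dp form a proper previous/current
		 * pair for the caching loop below.
		 */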
		if (en == 0 && pdp == dp) {
			dp = (struct dirent *)
			    ((caddr_t)dp + dp->d_reclen);
			enn++;
		}

		if (uio->uio_resid < (bp->b_validend - on)) {
			n = uio->uio_resid;
			enough = 1;
		} else
			n = bp->b_validend - on;

		ep = bp->b_data + on + n;

		/*
		 * Find last complete entry to copy, caching entries
		 * (if requested) as we go.
		 */

		while ((caddr_t)dp < ep && (caddr_t)dp + dp->d_reclen <= ep) {
			if (cflag & NFSBIO_CACHECOOKIES) {
				nndp = nfs_enterdircache(vp, NFS_GETCOOKIE(pdp),
				    ndp->dc_blkcookie, enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
				}
			}
			pdp = dp;
			dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
			enn++;
		}

		/*
		 * If the last requested entry was not the last in the
		 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
		 * cache the cookie of the last requested one, and
		 * set the offset to it.
		 */

		if ((on + n) < bp->b_validend) {
			curoff = NFS_GETCOOKIE(pdp);
			nndp = nfs_enterdircache(vp, curoff, ndp->dc_blkcookie,
			    enn, bp->b_lblkno);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
				curoff = nndp->dc_cookie32;
			}
		} else
			curoff = bp->b_dcookie;

		/*
		 * Always cache the entry for the next block,
		 * so that readaheads can use it.
		 */
		nndp = nfs_enterdircache(vp, bp->b_dcookie, bp->b_dcookie, 0, 0);
		if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
			if (curoff == bp->b_dcookie) {
				NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
				curoff = nndp->dc_cookie32;
			}
		}

		n = ((caddr_t)pdp + pdp->d_reclen) - (bp->b_data + on);

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    np->n_direofoffset == 0 && !(np->n_flag & NQNFSNONCACHE)) {
			rabp = nfs_getcacheblk(vp, nndp->dc_blkno,
						NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
				rabp->b_dcookie = nndp->dc_cookie;
				rabp->b_flags |= (B_READ | B_ASYNC);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL;
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		}
		got_buf = 1;
		break;
	    default:
		printf(" nfsbioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		if (!baddr)
			baddr = bp->b_data;
		error = uiomove(baddr + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		uio->uio_offset = curoff;
		if (enough)
			n = 0;
		break;
	    default:
		printf(" nfsbioread: type %x unexpected\n", vp->v_type);
	    }
	    if (got_buf)
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(v)
	void *v;
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, bn;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = nmp->nm_rsize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		(void)vnode_pager_uncache(vp);

		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
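		/*
		 * With a non-cacheable lease, push the data straight to
		 * the server with a synchronous write RPC, bypassing the
		 * buffer cache (this direct path handles only a single
		 * iovec).
		 */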
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
		bn = lbn * (biosize / DEV_BSIZE);
again:
		bp = nfs_getcacheblk(vp, bn, biosize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check again for a valid write lease and get one as
		 * required, in case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
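		/*
		 * Extend the valid region to cover the newly dirtied
		 * bytes; if the old valid region is disjoint from the
		 * dirty region, replace it rather than leave a gap of
		 * stale data between the two.
		 */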
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cacheable or IO_SYNC is set, do
		 * a synchronous bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
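			/*
			 * The PCATCH sleep in getblk() was interrupted.
			 * Bail out if a signal is pending for this
			 * process; otherwise retry, sleeping at most
			 * 2 * hz ticks so pending signals are noticed.
			 */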
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;
	register struct nfsmount *nmp;
	int gotiod, slpflag = 0, slptimeo = 0, error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */

	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}
	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod && nmp->nm_bufqiods > 0)
		gotiod = TRUE;

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
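		 * (The bound allows roughly two queued buffers per
		 * active nfsiod before queuers are put to sleep.)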
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				"nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0)
				goto again;
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr->va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			p->p_holdcnt++;
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = bp->b_dcookie;
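		/*
		 * Use READDIRPLUS if it is enabled; if the server turns
		 * out not to support it, clear the flag and fall back
		 * to a plain READDIR.
		 */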
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		if (!error) {
			bp->b_dcookie = uiop->uio_offset;
			bp->b_validoff = 0;
			bp->b_validend = bp->b_bcount - uiop->uio_resid;
		}
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    io.iov_len = uiop->uio_resid = bp->b_dirtyend
		- bp->b_dirtyoff;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		+ bp->b_dirtyoff;
	    io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
	    uiop->uio_rw = UIO_WRITE;
	    nfsstats.write_bios++;
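	    /*
	     * Asynchronous writes may go out unstable; the data will be
	     * committed to stable storage later. Everything else is
	     * written FILESYNC, so the server commits it before
	     * replying.
	     */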
	    if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
		iomode = NFSV3WRITE_UNSTABLE;
	    else
		iomode = NFSV3WRITE_FILESYNC;
	    bp->b_flags |= B_WRITEINPROG;
#ifdef fvdl_debug
	    printf("nfs_doio(%x): bp %x doff %d dend %d\n",
		vp, bp, bp->b_dirtyoff, bp->b_dirtyend);
#endif
	    error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
	    if (!error && iomode == NFSV3WRITE_UNSTABLE)
		bp->b_flags |= B_NEEDCOMMIT;
	    else
		bp->b_flags &= ~B_NEEDCOMMIT;
	    bp->b_flags &= ~B_WRITEINPROG;

	    /*
	     * For an interrupted write, the buffer is still valid and the
	     * write hasn't been pushed to the server yet, so we can't set
	     * B_ERROR; instead, report the interruption by setting B_EINTR.
	     * For the B_ASYNC case, B_EINTR is not relevant, so the rpc
	     * attempt is essentially a noop.
	     * For the case of a V3 write rpc not being committed to stable
	     * storage, the block is still dirty and requires either a commit
	     * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC
	     * before the block is reused. This is indicated by setting the
	     * B_DELWRI and B_NEEDCOMMIT flags.
	     */
	    if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
		bp->b_flags |= B_DELWRI;

		/*
		 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
		 * buffer to the clean list, we have to reassign it back to the
		 * dirty one. Ugh.
		 */
		if (bp->b_flags & B_ASYNC)
		    reassignbuf(bp, vp);
		else if (error)
		    bp->b_flags |= B_EINTR;
	    } else {
		if (error) {
		    bp->b_flags |= B_ERROR;
		    bp->b_error = np->n_error = error;
		    np->n_flag |= NWRITEERR;
		}
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}