1 /*	$NetBSD: genfs_vnops.c,v 1.46 2002/01/26 02:44:27 chs Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.46 2002/01/26 02:44:27 chs Exp $");
39 
40 #include "opt_nfsserver.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/proc.h>
45 #include <sys/kernel.h>
46 #include <sys/mount.h>
47 #include <sys/namei.h>
48 #include <sys/vnode.h>
49 #include <sys/fcntl.h>
50 #include <sys/malloc.h>
51 #include <sys/poll.h>
52 #include <sys/mman.h>
53 
54 #include <miscfs/genfs/genfs.h>
55 #include <miscfs/genfs/genfs_node.h>
56 #include <miscfs/specfs/specdev.h>
57 
58 #include <uvm/uvm.h>
59 #include <uvm/uvm_pager.h>
60 
61 #ifdef NFSSERVER
62 #include <nfs/rpcv2.h>
63 #include <nfs/nfsproto.h>
64 #include <nfs/nfs.h>
65 #include <nfs/nqnfs.h>
66 #include <nfs/nfs_var.h>
67 #endif
68 
69 #define MAX_READ_AHEAD	16	/* XXXUBC temporary cap on pages per getpages call */
70 
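/*
 * Generic poll routine: report the vnode as ready for any of the
 * normal read/write events that were asked about.
 */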
71 int
72 genfs_poll(v)
73 	void *v;
74 {
75 	struct vop_poll_args /* {
76 		struct vnode *a_vp;
77 		int a_events;
78 		struct proc *a_p;
79 	} */ *ap = v;
80 
81 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
82 }
83 
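/*
 * Generic fsync: flush the vnode's dirty buffers, then update the
 * inode unless only the file data was requested (FSYNC_DATAONLY).
 */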
84 int
85 genfs_fsync(v)
86 	void *v;
87 {
88 	struct vop_fsync_args /* {
89 		struct vnode *a_vp;
90 		struct ucred *a_cred;
91 		int a_flags;
92 		off_t a_offlo;
93 		off_t a_offhi;
94 		struct proc *a_p;
95 	} */ *ap = v;
96 	struct vnode *vp = ap->a_vp;
97 	int wait;
98 
99 	wait = (ap->a_flags & FSYNC_WAIT) != 0;
100 	vflushbuf(vp, wait);
101 	if ((ap->a_flags & FSYNC_DATAONLY) != 0)
102 		return (0);
103 	else
104 		return (VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0));
105 }
106 
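/*
 * Generic seek: any non-negative offset is acceptable.
 */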
107 int
108 genfs_seek(v)
109 	void *v;
110 {
111 	struct vop_seek_args /* {
112 		struct vnode *a_vp;
113 		off_t a_oldoff;
114 		off_t a_newoff;
115 		struct ucred *a_ucred;
116 	} */ *ap = v;
117 
118 	if (ap->a_newoff < 0)
119 		return (EINVAL);
120 
121 	return (0);
122 }
123 
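/*
 * Generic abortop: release the pathname buffer if one was allocated
 * (HASBUF) and the caller has not asked to keep it (SAVESTART).
 */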
124 int
125 genfs_abortop(v)
126 	void *v;
127 {
128 	struct vop_abortop_args /* {
129 		struct vnode *a_dvp;
130 		struct componentname *a_cnp;
131 	} */ *ap = v;
132 
133 	if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF)
134 		PNBUF_PUT(ap->a_cnp->cn_pnbuf);
135 	return (0);
136 }
137 
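/*
 * Generic fcntl: accept F_SETFL as a no-op, reject everything else.
 */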
138 int
139 genfs_fcntl(v)
140 	void *v;
141 {
142 	struct vop_fcntl_args /* {
143 		struct vnode *a_vp;
144 		u_int a_command;
145 		caddr_t a_data;
146 		int a_fflag;
147 		struct ucred *a_cred;
148 		struct proc *a_p;
149 	} */ *ap = v;
150 
151 	if (ap->a_command == F_SETFL)
152 		return (0);
153 	else
154 		return (EOPNOTSUPP);
155 }
156 
157 /*ARGSUSED*/
158 int
159 genfs_badop(v)
160 	void *v;
161 {
162 
163 	panic("genfs: bad op");
164 }
165 
166 /*ARGSUSED*/
167 int
168 genfs_nullop(v)
169 	void *v;
170 {
171 
172 	return (0);
173 }
174 
175 /*ARGSUSED*/
176 int
177 genfs_einval(v)
178 	void *v;
179 {
180 
181 	return (EINVAL);
182 }
183 
184 /*ARGSUSED*/
185 int
186 genfs_eopnotsupp(v)
187 	void *v;
188 {
189 
190 	return (EOPNOTSUPP);
191 }
192 
193 /*
194  * Called when an fs doesn't support a particular vop but the vop needs to
195  * vrele, vput, or vunlock passed in vnodes.
196  */
197 int
198 genfs_eopnotsupp_rele(v)
199 	void *v;
200 {
201 	struct vop_generic_args /* {
202 		struct vnodeop_desc *a_desc;
203 		/ * other random data follows, presumably * /
204 	} */ *ap = v;
205 	struct vnodeop_desc *desc = ap->a_desc;
206 	struct vnode *vp;
207 	int flags, i, j, offset;
208 
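	/*
	 * Dispose of each vnode argument as the op descriptor's flags
	 * dictate (vput, unlock or vrele) before failing the operation.
	 */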
209 	flags = desc->vdesc_flags;
210 	for (i = 0; i < VDESC_MAX_VPS; flags >>=1, i++) {
211 		if ((offset = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET)
212 			break;	/* stop at end of list */
213 		if ((j = flags & VDESC_VP0_WILLPUT)) {
214 			vp = *VOPARG_OFFSETTO(struct vnode**,offset,ap);
215 			switch (j) {
216 			case VDESC_VP0_WILLPUT:
217 				vput(vp);
218 				break;
219 			case VDESC_VP0_WILLUNLOCK:
220 				VOP_UNLOCK(vp, 0);
221 				break;
222 			case VDESC_VP0_WILLRELE:
223 				vrele(vp);
224 				break;
225 			}
226 		}
227 	}
228 
229 	return (EOPNOTSUPP);
230 }
231 
232 /*ARGSUSED*/
233 int
234 genfs_ebadf(v)
235 	void *v;
236 {
237 
238 	return (EBADF);
239 }
240 
241 /* ARGSUSED */
242 int
243 genfs_enoioctl(v)
244 	void *v;
245 {
246 
247 	return (ENOTTY);
248 }
249 
250 
251 /*
252  * Eliminate all activity associated with the requested vnode
253  * and with all vnodes aliased to the requested vnode.
254  */
255 int
256 genfs_revoke(v)
257 	void *v;
258 {
259 	struct vop_revoke_args /* {
260 		struct vnode *a_vp;
261 		int a_flags;
262 	} */ *ap = v;
263 	struct vnode *vp, *vq;
264 	struct proc *p = curproc;	/* XXX */
265 
266 #ifdef DIAGNOSTIC
267 	if ((ap->a_flags & REVOKEALL) == 0)
268 		panic("genfs_revoke: not revokeall");
269 #endif
270 
271 	vp = ap->a_vp;
272 	simple_lock(&vp->v_interlock);
273 
274 	if (vp->v_flag & VALIASED) {
275 		/*
276 		 * If a vgone (or vclean) is already in progress,
277 		 * wait until it is done and return.
278 		 */
279 		if (vp->v_flag & VXLOCK) {
280 			vp->v_flag |= VXWANT;
281 			simple_unlock(&vp->v_interlock);
282 			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
283 			return (0);
284 		}
285 		/*
286 		 * Ensure that vp will not be vgone'd while we
287 		 * are eliminating its aliases.
288 		 */
289 		vp->v_flag |= VXLOCK;
290 		simple_unlock(&vp->v_interlock);
291 		while (vp->v_flag & VALIASED) {
292 			simple_lock(&spechash_slock);
293 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
294 				if (vq->v_rdev != vp->v_rdev ||
295 				    vq->v_type != vp->v_type || vp == vq)
296 					continue;
297 				simple_unlock(&spechash_slock);
298 				vgone(vq);
299 				break;
300 			}
301 			if (vq == NULLVP)
302 				simple_unlock(&spechash_slock);
303 		}
304 		/*
305 		 * Remove the lock so that vgone below will
306 		 * really eliminate the vnode after which time
307 		 * vgone will awaken any sleepers.
308 		 */
309 		simple_lock(&vp->v_interlock);
310 		vp->v_flag &= ~VXLOCK;
311 	}
312 	vgonel(vp, p);
313 	return (0);
314 }
315 
316 /*
317  * Lock the node.
318  */
319 int
320 genfs_lock(v)
321 	void *v;
322 {
323 	struct vop_lock_args /* {
324 		struct vnode *a_vp;
325 		int a_flags;
326 	} */ *ap = v;
327 	struct vnode *vp = ap->a_vp;
328 
329 	return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock));
330 }
331 
332 /*
333  * Unlock the node.
334  */
335 int
336 genfs_unlock(v)
337 	void *v;
338 {
339 	struct vop_unlock_args /* {
340 		struct vnode *a_vp;
341 		int a_flags;
342 	} */ *ap = v;
343 	struct vnode *vp = ap->a_vp;
344 
345 	return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE,
346 		&vp->v_interlock));
347 }
348 
349 /*
350  * Return whether or not the node is locked.
351  */
352 int
353 genfs_islocked(v)
354 	void *v;
355 {
356 	struct vop_islocked_args /* {
357 		struct vnode *a_vp;
358 	} */ *ap = v;
359 	struct vnode *vp = ap->a_vp;
360 
361 	return (lockstatus(&vp->v_lock));
362 }
363 
364 /*
365  * Stubs to use when there is no locking to be done on the underlying object.
366  */
367 int
368 genfs_nolock(v)
369 	void *v;
370 {
371 	struct vop_lock_args /* {
372 		struct vnode *a_vp;
373 		int a_flags;
374 		struct proc *a_p;
375 	} */ *ap = v;
376 
377 	/*
378 	 * Since we are not using the lock manager, we must clear
379 	 * the interlock here.
380 	 */
381 	if (ap->a_flags & LK_INTERLOCK)
382 		simple_unlock(&ap->a_vp->v_interlock);
383 	return (0);
384 }
385 
386 int
387 genfs_nounlock(v)
388 	void *v;
389 {
390 	return (0);
391 }
392 
393 int
394 genfs_noislocked(v)
395 	void *v;
396 {
397 	return (0);
398 }
399 
400 /*
401  * Local lease check for NFS servers.  Just set up args and let
402  * nqsrv_getlease() do the rest.  If NFSSERVER is not in the kernel,
403  * this is a null operation.
404  */
405 int
406 genfs_lease_check(v)
407 	void *v;
408 {
409 #ifdef NFSSERVER
410 	struct vop_lease_args /* {
411 		struct vnode *a_vp;
412 		struct proc *a_p;
413 		struct ucred *a_cred;
414 		int a_flag;
415 	} */ *ap = v;
416 	u_int32_t duration = 0;
417 	int cache;
418 	u_quad_t frev;
419 
420 	(void) nqsrv_getlease(ap->a_vp, &duration, ND_CHECK | ap->a_flag,
421 	    NQLOCALSLP, ap->a_p, (struct mbuf *)0, &cache, &frev, ap->a_cred);
422 	return (0);
423 #else
424 	return (0);
425 #endif /* NFSSERVER */
426 }
427 
428 int
429 genfs_mmap(v)
430 	void *v;
431 {
432 	return 0;
433 }
434 
435 /*
436  * generic VM getpages routine.
437  * Return PG_BUSY pages for the given range,
438  * reading from backing store if necessary.
439  */
440 
441 int
442 genfs_getpages(v)
443 	void *v;
444 {
445 	struct vop_getpages_args /* {
446 		struct vnode *a_vp;
447 		voff_t a_offset;
448 		struct vm_page **a_m;
449 		int *a_count;
450 		int a_centeridx;
451 		vm_prot_t a_access_type;
452 		int a_advice;
453 		int a_flags;
454 	} */ *ap = v;
455 
456 	off_t newsize, diskeof, memeof;
457 	off_t offset, origoffset, startoffset, endoffset, raoffset;
458 	daddr_t lbn, blkno;
459 	int s, i, error, npages, orignpages, npgs, run, ridx, pidx, pcount;
460 	int fs_bshift, fs_bsize, dev_bshift;
461 	int flags = ap->a_flags;
462 	size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
463 	vaddr_t kva;
464 	struct buf *bp, *mbp;
465 	struct vnode *vp = ap->a_vp;
466 	struct vnode *devvp;
467 	struct genfs_node *gp = VTOG(vp);
468 	struct uvm_object *uobj = &vp->v_uobj;
469 	struct vm_page *pg, *pgs[MAX_READ_AHEAD];
470 	struct ucred *cred = curproc->p_ucred;		/* XXXUBC curproc */
471 	boolean_t async = (flags & PGO_SYNCIO) == 0;
472 	boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
473 	boolean_t sawhole = FALSE;
474 	boolean_t overwrite = (flags & PGO_OVERWRITE) != 0;
475 	UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
476 
477 	UVMHIST_LOG(ubchist, "vp %p off 0x%x/%x count %d",
478 		    vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
479 
480 	/* XXXUBC temp limit */
481 	if (*ap->a_count > MAX_READ_AHEAD) {
482 		panic("genfs_getpages: too many pages");
483 	}
484 
485 	error = 0;
486 	origoffset = ap->a_offset;
487 	orignpages = *ap->a_count;
488 	GOP_SIZE(vp, vp->v_size, &diskeof);
489 	if (flags & PGO_PASTEOF) {
490 		newsize = MAX(vp->v_size,
491 			      origoffset + (orignpages << PAGE_SHIFT));
492 		GOP_SIZE(vp, newsize, &memeof);
493 	} else {
494 		memeof = diskeof;
495 	}
496 	KASSERT(ap->a_centeridx >= 0 && ap->a_centeridx <= orignpages);
497 	KASSERT((origoffset & (PAGE_SIZE - 1)) == 0 && origoffset >= 0);
498 	KASSERT(orignpages > 0);
499 
500 	/*
501 	 * Bounds-check the request.
502 	 */
503 
504 	if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
505 		if ((flags & PGO_LOCKED) == 0) {
506 			simple_unlock(&uobj->vmobjlock);
507 		}
508 		UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x",
509 			    origoffset, *ap->a_count, memeof,0);
510 		return EINVAL;
511 	}
512 
513 	/*
514 	 * For PGO_LOCKED requests, just return whatever's in memory.
515 	 */
516 
517 	if (flags & PGO_LOCKED) {
518 		uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
519 			      UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY);
520 
521 		return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
522 	}
523 
524 	/* vnode is VOP_LOCKed, uobj is locked */
525 
526 	if (write && (vp->v_flag & VONWORKLST) == 0) {
527 		vn_syncer_add_to_worklist(vp, filedelay);
528 	}
529 
530 	/*
531 	 * find the requested pages and make some simple checks.
532 	 * leave space in the page array for a whole block.
533 	 */
534 
535 	if (vp->v_type == VREG) {
536 		fs_bshift = vp->v_mount->mnt_fs_bshift;
537 		dev_bshift = vp->v_mount->mnt_dev_bshift;
538 	} else {
539 		fs_bshift = DEV_BSHIFT;
540 		dev_bshift = DEV_BSHIFT;
541 	}
542 	fs_bsize = 1 << fs_bshift;
543 
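	/*
	 * clamp the request to EOF and expand it outward to filesystem
	 * block boundaries.  ridx is where the originally requested pages
	 * land within the enlarged pgs[] array.
	 */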
544 	orignpages = MIN(orignpages,
545 	    round_page(memeof - origoffset) >> PAGE_SHIFT);
546 	npages = orignpages;
547 	startoffset = origoffset & ~(fs_bsize - 1);
548 	endoffset = round_page((origoffset + (npages << PAGE_SHIFT)
549 				+ fs_bsize - 1) & ~(fs_bsize - 1));
550 	endoffset = MIN(endoffset, round_page(memeof));
551 	ridx = (origoffset - startoffset) >> PAGE_SHIFT;
552 
553 	memset(pgs, 0, sizeof(pgs));
554 	uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL);
555 
556 	/*
557 	 * if the pages are already resident, just return them.
558 	 */
559 
560 	for (i = 0; i < npages; i++) {
561 		struct vm_page *pg = pgs[ridx + i];
562 
563 		if ((pg->flags & PG_FAKE) ||
564 		    (write && (pg->flags & PG_RDONLY))) {
565 			break;
566 		}
567 	}
568 	if (i == npages) {
569 		UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
570 		raoffset = origoffset + (orignpages << PAGE_SHIFT);
571 		npages += ridx;
572 		goto raout;
573 	}
574 
575 	/*
576 	 * if PGO_OVERWRITE is set, don't bother reading the pages.
577 	 */
578 
579 	if (flags & PGO_OVERWRITE) {
580 		UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
581 
582 		for (i = 0; i < npages; i++) {
583 			struct vm_page *pg = pgs[ridx + i];
584 
585 			pg->flags &= ~(PG_RDONLY|PG_CLEAN);
586 		}
587 		npages += ridx;
588 		goto out;
589 	}
590 
591 	/*
592 	 * the page wasn't resident and we're not overwriting,
593 	 * so we're going to have to do some i/o.
594 	 * find any additional pages needed to cover the expanded range.
595 	 */
596 
597 	npages = (endoffset - startoffset) >> PAGE_SHIFT;
598 	if (startoffset != origoffset || npages != orignpages) {
599 
600 		/*
601 		 * we need to avoid deadlocks caused by locking
602 		 * additional pages at lower offsets than pages we
603 		 * already have locked.  unlock them all and start over.
604 		 */
605 
606 		for (i = 0; i < orignpages; i++) {
607 			struct vm_page *pg = pgs[ridx + i];
608 
609 			if (pg->flags & PG_FAKE) {
610 				pg->flags |= PG_RELEASED;
611 			}
612 		}
613 		uvm_page_unbusy(&pgs[ridx], orignpages);
614 		memset(pgs, 0, sizeof(pgs));
615 
616 		UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x",
617 			    startoffset, endoffset, 0,0);
618 		npgs = npages;
619 		uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL);
620 	}
621 	simple_unlock(&uobj->vmobjlock);
622 
623 	/*
624 	 * read the desired page(s).
625 	 */
626 
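	/*
	 * "bytes" is the part of the range backed by on-disk data;
	 * "tailbytes" is the remainder past the on-disk EOF, which is
	 * zero-filled below instead of being read.
	 */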
627 	totalbytes = npages << PAGE_SHIFT;
628 	bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
629 	tailbytes = totalbytes - bytes;
630 	skipbytes = 0;
631 
632 	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
633 			     UVMPAGER_MAPIN_READ);
634 
635 	s = splbio();
636 	mbp = pool_get(&bufpool, PR_WAITOK);
637 	splx(s);
638 	mbp->b_bufsize = totalbytes;
639 	mbp->b_data = (void *)kva;
640 	mbp->b_resid = mbp->b_bcount = bytes;
641 	mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL : 0);
642 	mbp->b_iodone = (async ? uvm_aio_biodone : 0);
643 	mbp->b_vp = vp;
644 	LIST_INIT(&mbp->b_dep);
645 
646 	/*
647 	 * if EOF is in the middle of the range, zero the part past EOF.
648 	 * if the page including EOF is not PG_FAKE, skip over it since
649 	 * in that case it has valid data that we need to preserve.
650 	 */
651 
652 	if (tailbytes > 0) {
653 		size_t tailstart = bytes;
654 
655 		if ((pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE) == 0) {
656 			tailstart = round_page(tailstart);
657 			tailbytes -= tailstart - bytes;
658 		}
659 		UVMHIST_LOG(ubchist, "tailbytes %p 0x%x 0x%x",
660 			    kva, tailstart, tailbytes,0);
661 		memset((void *)(kva + tailstart), 0, tailbytes);
662 	}
663 
664 	/*
665 	 * now loop over the pages, reading as needed.
666 	 */
667 
668 	if (write) {
669 		lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL);
670 	} else {
671 		lockmgr(&gp->g_glock, LK_SHARED, NULL);
672 	}
673 
674 	bp = NULL;
675 	for (offset = startoffset;
676 	     bytes > 0;
677 	     offset += iobytes, bytes -= iobytes) {
678 
679 		/*
680 		 * skip pages which don't need to be read.
681 		 */
682 
683 		pidx = (offset - startoffset) >> PAGE_SHIFT;
684 		while ((pgs[pidx]->flags & (PG_FAKE|PG_RDONLY)) == 0) {
685 			size_t b;
686 
687 			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
688 			b = MIN(PAGE_SIZE, bytes);
689 			offset += b;
690 			bytes -= b;
691 			skipbytes += b;
692 			pidx++;
693 			UVMHIST_LOG(ubchist, "skipping, new offset 0x%x",
694 				    offset, 0,0,0);
695 			if (bytes == 0) {
696 				goto loopdone;
697 			}
698 		}
699 
700 		/*
701 		 * bmap the file to find out the blkno to read from and
702 		 * how much we can read in one i/o.  if bmap returns an error,
703 		 * skip the rest of the top-level i/o.
704 		 */
705 
706 		lbn = offset >> fs_bshift;
707 		error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
708 		if (error) {
709 			UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n",
710 				    lbn, error,0,0);
711 			skipbytes += bytes;
712 			goto loopdone;
713 		}
714 
715 		/*
716 		 * see how many pages can be read with this i/o.
717 		 * reduce the i/o size if necessary to avoid
718 		 * overwriting pages with valid data.
719 		 */
720 
721 		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
722 		    bytes);
723 		if (offset + iobytes > round_page(offset)) {
724 			pcount = 1;
725 			while (pidx + pcount < npages &&
726 			       pgs[pidx + pcount]->flags & PG_FAKE) {
727 				pcount++;
728 			}
729 			iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
730 				      (offset - trunc_page(offset)));
731 		}
732 
733 		/*
734 		 * if this block isn't allocated, zero it instead of reading it.
735 		 * if this is a read access, mark the pages we zeroed PG_RDONLY.
736 		 */
737 
738 		if (blkno < 0) {
739 			int holepages = (round_page(offset + iobytes) -
740 					 trunc_page(offset)) >> PAGE_SHIFT;
741 			UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE", lbn,0,0,0);
742 
743 			sawhole = TRUE;
744 			memset((char *)kva + (offset - startoffset), 0,
745 			       iobytes);
746 			skipbytes += iobytes;
747 
748 			for (i = 0; i < holepages; i++) {
749 				if (write) {
750 					pgs[pidx + i]->flags &= ~PG_CLEAN;
751 				} else {
752 					pgs[pidx + i]->flags |= PG_RDONLY;
753 				}
754 			}
755 			continue;
756 		}
757 
758 		/*
759 		 * allocate a sub-buf for this piece of the i/o
760 		 * (or just use mbp if there's only 1 piece),
761 		 * and start it going.
762 		 */
763 
764 		if (offset == startoffset && iobytes == bytes) {
765 			bp = mbp;
766 		} else {
767 			s = splbio();
768 			bp = pool_get(&bufpool, PR_WAITOK);
769 			splx(s);
770 			bp->b_data = (char *)kva + offset - startoffset;
771 			bp->b_resid = bp->b_bcount = iobytes;
772 			bp->b_flags = B_BUSY|B_READ|B_CALL;
773 			bp->b_iodone = uvm_aio_biodone1;
774 			bp->b_vp = vp;
775 			bp->b_proc = NULL;
776 			LIST_INIT(&bp->b_dep);
777 		}
778 		bp->b_lblkno = 0;
779 		bp->b_private = mbp;
780 		if (devvp->v_type == VBLK) {
781 			bp->b_dev = devvp->v_rdev;
782 		}
783 
784 		/* adjust physical blkno for partial blocks */
785 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
786 				       dev_bshift);
787 
788 		UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
789 			    bp, offset, iobytes, bp->b_blkno);
790 
791 		VOP_STRATEGY(bp);
792 	}
793 
794 loopdone:
795 	if (skipbytes) {
796 		s = splbio();
797 		if (error) {
798 			mbp->b_flags |= B_ERROR;
799 			mbp->b_error = error;
800 		}
801 		mbp->b_resid -= skipbytes;
802 		if (mbp->b_resid == 0) {
803 			biodone(mbp);
804 		}
805 		splx(s);
806 	}
807 
808 	if (async) {
809 		UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
810 		lockmgr(&gp->g_glock, LK_RELEASE, NULL);
811 		return 0;
812 	}
813 	if (bp != NULL) {
814 		error = biowait(mbp);
815 	}
816 	s = splbio();
817 	pool_put(&bufpool, mbp);
818 	splx(s);
819 	uvm_pagermapout(kva, npages);
820 	raoffset = startoffset + totalbytes;
821 
822 	/*
823 	 * if we encountered a hole then we have to do a little more work.
824 	 * for read faults, we marked the page PG_RDONLY so that future
825 	 * write accesses to the page will fault again.
826 	 * for write faults, we must make sure that the backing store for
827 	 * the page is completely allocated while the pages are locked.
828 	 */
829 
830 	if (!error && sawhole && write) {
831 		for (i = 0; i < npages; i++) {
832 			if (pgs[i] == NULL) {
833 				continue;
834 			}
835 			pgs[i]->flags &= ~PG_CLEAN;
836 			UVMHIST_LOG(ubchist, "mark dirty pg %p", pgs[i],0,0,0);
837 		}
838 		error = GOP_ALLOC(vp, startoffset, npages << PAGE_SHIFT, 0,
839 				  cred);
840 		UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d",
841 		    startoffset, npages << PAGE_SHIFT, error,0);
842 	}
843 	lockmgr(&gp->g_glock, LK_RELEASE, NULL);
844 	simple_lock(&uobj->vmobjlock);
845 
846 	/*
847 	 * see if we want to start any readahead.
848 	 * XXXUBC for now, just read the next 128k on 64k boundaries.
849 	 * this is pretty nonsensical, but it is 50% faster than reading
850 	 * just the next 64k.
851 	 */
852 
853 raout:
854 	if (!error && !async && !write && ((int)raoffset & 0xffff) == 0 &&
855 	    PAGE_SHIFT <= 16) {
856 		off_t rasize;
857 		int racount;
858 
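		/*
		 * the page array is not needed here (genfs_getpages only
		 * copies pages back when a_m is non-NULL), and each call
		 * returns with the object unlocked, so retake the lock.
		 */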
859 		/* XXXUBC temp limit, from above */
860 		racount = MIN(1 << (16 - PAGE_SHIFT), MAX_READ_AHEAD);
861 		rasize = racount << PAGE_SHIFT;
862 		(void) VOP_GETPAGES(vp, raoffset, NULL, &racount, 0,
863 				    VM_PROT_READ, 0, 0);
864 		simple_lock(&uobj->vmobjlock);
865 
866 		/* XXXUBC temp limit, from above */
867 		racount = MIN(1 << (16 - PAGE_SHIFT), MAX_READ_AHEAD);
868 		(void) VOP_GETPAGES(vp, raoffset + rasize, NULL, &racount, 0,
869 				    VM_PROT_READ, 0, 0);
870 		simple_lock(&uobj->vmobjlock);
871 	}
872 
873 	/*
874 	 * we're almost done!  release the pages...
875 	 * for errors, we free the pages.
876 	 * otherwise we activate them and mark them as valid and clean.
877 	 * also, unbusy pages that were not actually requested.
878 	 */
879 
880 	if (error) {
881 		for (i = 0; i < npages; i++) {
882 			if (pgs[i] == NULL) {
883 				continue;
884 			}
885 			UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
886 				    pgs[i], pgs[i]->flags, 0,0);
887 			if (pgs[i]->flags & PG_FAKE) {
888 				pgs[i]->flags |= PG_RELEASED;
889 			}
890 		}
891 		uvm_lock_pageq();
892 		uvm_page_unbusy(pgs, npages);
893 		uvm_unlock_pageq();
894 		simple_unlock(&uobj->vmobjlock);
895 		UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0);
896 		return error;
897 	}
898 
899 out:
900 	UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0);
901 	uvm_lock_pageq();
902 	for (i = 0; i < npages; i++) {
903 		pg = pgs[i];
904 		if (pg == NULL) {
905 			continue;
906 		}
907 		UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
908 			    pg, pg->flags, 0,0);
909 		if (pg->flags & PG_FAKE && !overwrite) {
910 			pg->flags &= ~(PG_FAKE);
911 			pmap_clear_modify(pgs[i]);
912 		}
913 		if (write) {
914 			pg->flags &= ~(PG_RDONLY);
915 		}
916 		if (i < ridx || i >= ridx + orignpages || async) {
917 			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
918 				    pg, pg->offset,0,0);
919 			if (pg->flags & PG_WANTED) {
920 				wakeup(pg);
921 			}
922 			if (pg->flags & PG_FAKE) {
923 				KASSERT(overwrite);
924 				uvm_pagezero(pg);
925 			}
926 			if (pg->flags & PG_RELEASED) {
927 				uvm_pagefree(pg);
928 				continue;
929 			}
930 			uvm_pageactivate(pg);
931 			pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE);
932 			UVM_PAGE_OWN(pg, NULL);
933 		}
934 	}
935 	uvm_unlock_pageq();
936 	simple_unlock(&uobj->vmobjlock);
937 	if (ap->a_m != NULL) {
938 		memcpy(ap->a_m, &pgs[ridx],
939 		       orignpages * sizeof(struct vm_page *));
940 	}
941 	return 0;
942 }
943 
944 /*
945  * generic VM putpages routine.
946  * Write the given range of pages to backing store.
947  *
948  * => "offhi == 0" means flush all pages at or after "offlo".
949  * => object should be locked by caller.   we may _unlock_ the object
950  *	if (and only if) we need to clean a page (PGO_CLEANIT), or
951  *	if PGO_SYNCIO is set and there are pages busy.
952  *	we return with the object locked.
953  * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
954  *	thus, a caller might want to unlock higher level resources
955  *	(e.g. vm_map) before calling flush.
956  * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, then we will neither
957  *	unlock the object nor block.
958  * => if PGO_ALLPAGES is set, then all pages in the object will be processed.
959  * => NOTE: we rely on the fact that the object's memq is a TAILQ and
960  *	that new pages are inserted on the tail end of the list.   thus,
961  *	we can make a complete pass through the object in one go by starting
962  *	at the head and working towards the tail (new pages are put in
963  *	front of us).
964  * => NOTE: we are allowed to lock the page queues, so the caller
965  *	must not be holding the page queue lock.
966  *
967  * note on "cleaning" object and PG_BUSY pages:
968  *	this routine is holding the lock on the object.   the only time
969  *	that it can run into a PG_BUSY page that it does not own is if
970  *	some other process has started I/O on the page (e.g. either
971  *	a pagein, or a pageout).    if the PG_BUSY page is being paged
972  *	in, then it can not be dirty (!PG_CLEAN) because no one has
973  *	had a chance to modify it yet.    if the PG_BUSY page is being
974  *	paged out then it means that someone else has already started
975  *	cleaning the page for us (how nice!).    in this case, if we
976  *	have syncio specified, then after we make our pass through the
977  *	object we need to wait for the other PG_BUSY pages to clear
978  *	off (i.e. we need to do an iosync).   also note that once a
979  *	page is PG_BUSY it must stay in its object until it is un-busyed.
980  *
981  * note on page traversal:
982  *	we can traverse the pages in an object either by going down the
983  *	linked list in "uobj->memq", or we can go over the address range
984  *	page by page, doing hash table lookups for each address.    depending
985  *	on how many pages are in the object it may be cheaper to do one
986  *	or the other.   we set "by_list" to true if we are using memq.
987  *	if the cost of a hash lookup was equal to the cost of the list
988  *	traversal we could compare the number of pages in the start->stop
989  *	range to the total number of pages in the object.   however, it
990  *	seems that a hash table lookup is more expensive than the linked
991  *	list traversal, so we multiply the number of pages in the
992  *	range by an estimate of the relatively higher cost of the hash lookup.
993  */
994 
995 int
996 genfs_putpages(v)
997 	void *v;
998 {
999 	struct vop_putpages_args /* {
1000 		struct vnode *a_vp;
1001 		voff_t a_offlo;
1002 		voff_t a_offhi;
1003 		int a_flags;
1004 	} */ *ap = v;
1005 	struct vnode *vp = ap->a_vp;
1006 	struct uvm_object *uobj = &vp->v_uobj;
1007 	struct simplelock *slock = &uobj->vmobjlock;
1008 	off_t startoff = ap->a_offlo;
1009 	off_t endoff = ap->a_offhi;
1010 	off_t off;
1011 	int flags = ap->a_flags;
1012 	int n = MAXBSIZE >> PAGE_SHIFT;
1013 	int i, s, error, npages, nback;
1014 	int freeflag;
1015 	struct vm_page *pgs[n], *pg, *nextpg, *tpg, curmp, endmp;
1016 	boolean_t wasclean, by_list, needs_clean;
1017 	boolean_t async = (flags & PGO_SYNCIO) == 0;
1018 	UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
1019 
1020 	KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
1021 	KASSERT((startoff & PAGE_MASK) == 0 && (endoff & PAGE_MASK) == 0);
1022 	KASSERT(startoff < endoff || endoff == 0);
1023 
1024 	UVMHIST_LOG(ubchist, "vp %p pages %d off 0x%x len 0x%x",
1025 	    vp, uobj->uo_npages, startoff, endoff - startoff);
1026 	if (uobj->uo_npages == 0) {
1027 		if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
1028 		    (vp->v_flag & VONWORKLST)) {
1029 			vp->v_flag &= ~VONWORKLST;
1030 			LIST_REMOVE(vp, v_synclist);
1031 		}
1032 		simple_unlock(slock);
1033 		return 0;
1034 	}
1035 
1036 	/*
1037 	 * the vnode has pages, set up to process the request.
1038 	 */
1039 
1040 	error = 0;
1041 	s = splbio();
1042 	wasclean = (vp->v_numoutput == 0);
1043 	splx(s);
1044 	off = startoff;
1045 	if (endoff == 0 || flags & PGO_ALLPAGES) {
1046 		endoff = trunc_page(LLONG_MAX);
1047 	}
1048 	by_list = (uobj->uo_npages <=
1049 	    ((endoff - startoff) >> PAGE_SHIFT) * UVM_PAGE_HASH_PENALTY);
1050 
1051 	/*
1052 	 * start the loop.  when scanning by list, hold the last page
1053 	 * in the list before we start.  pages allocated after we start
1054 	 * will be added to the end of the list, so we can stop at the
1055 	 * current last page.
1056 	 */
1057 
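	/*
	 * curmp and endmp are fake always-busy pages used only as list
	 * markers: endmp marks where the by-list scan stops, and curmp
	 * holds our place whenever the object lock must be dropped.
	 */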
1058 	freeflag = (curproc == uvm.pagedaemon_proc) ? PG_PAGEOUT : PG_RELEASED;
1059 	curmp.uobject = uobj;
1060 	curmp.offset = (voff_t)-1;
1061 	curmp.flags = PG_BUSY;
1062 	endmp.uobject = uobj;
1063 	endmp.offset = (voff_t)-1;
1064 	endmp.flags = PG_BUSY;
1065 	if (by_list) {
1066 		pg = TAILQ_FIRST(&uobj->memq);
1067 		TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq);
1068 		PHOLD(curproc);
1069 	} else {
1070 		pg = uvm_pagelookup(uobj, off);
1071 	}
1072 	nextpg = NULL;
1073 	while (by_list || off < endoff) {
1074 		if (curproc->p_cpu->ci_schedstate.spc_flags &
1075 		    SPCF_SHOULDYIELD) {
1076 			simple_unlock(slock);
1077 			preempt(NULL);
1078 			simple_lock(slock);
1079 		}
1080 
1081 		/*
1082 		 * if the current page is not interesting, move on to the next.
1083 		 */
1084 
1085 		KASSERT(pg == NULL || pg->uobject == uobj);
1086 		KASSERT(pg == NULL ||
1087 			(pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1088 			(pg->flags & PG_BUSY) != 0);
1089 		if (by_list) {
1090 			if (pg == &endmp) {
1091 				break;
1092 			}
1093 			if (pg->offset < startoff || pg->offset >= endoff ||
1094 			    pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
1095 				pg = TAILQ_NEXT(pg, listq);
1096 				continue;
1097 			}
1098 			off = pg->offset;
1099 		} else if (pg == NULL || pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
1100 			off += PAGE_SIZE;
1101 			if (off < endoff) {
1102 				pg = uvm_pagelookup(uobj, off);
1103 			}
1104 			continue;
1105 		}
1106 
1107 		/*
1108 		 * if the current page needs to be cleaned and it's busy,
1109 		 * wait for it to become unbusy.
1110 		 */
1111 
1112 		if (flags & PGO_FREE) {
1113 			pmap_page_protect(pg, VM_PROT_NONE);
1114 		}
1115 		if (flags & PGO_CLEANIT) {
1116 			needs_clean = pmap_clear_modify(pg) ||
1117 				(pg->flags & PG_CLEAN) == 0;
1118 			pg->flags |= PG_CLEAN;
1119 		} else {
1120 			needs_clean = FALSE;
1121 		}
1122 		if (needs_clean && pg->flags & PG_BUSY) {
1123 			KASSERT(curproc != uvm.pagedaemon_proc);
1124 			UVMHIST_LOG(ubchist, "busy %p", pg,0,0,0);
1125 			if (by_list) {
1126 				TAILQ_INSERT_BEFORE(pg, &curmp, listq);
1127 				UVMHIST_LOG(ubchist, "curmp next %p",
1128 					    TAILQ_NEXT(&curmp, listq), 0,0,0);
1129 			}
1130 			pg->flags |= PG_WANTED;
1131 			pg->flags &= ~PG_CLEAN;
1132 			UVM_UNLOCK_AND_WAIT(pg, slock, 0, "genput", 0);
1133 			simple_lock(slock);
1134 			if (by_list) {
1135 				UVMHIST_LOG(ubchist, "after next %p",
1136 					    TAILQ_NEXT(&curmp, listq), 0,0,0);
1137 				pg = TAILQ_NEXT(&curmp, listq);
1138 				TAILQ_REMOVE(&uobj->memq, &curmp, listq);
1139 			} else {
1140 				pg = uvm_pagelookup(uobj, off);
1141 			}
1142 			continue;
1143 		}
1144 
1145 		/*
1146 		 * if we're cleaning, build a cluster.
1147 		 * the cluster will consist of pages which are currently dirty,
1148 		 * but they will be returned to us marked clean.
1149 		 * if not cleaning, just operate on the one page.
1150 		 */
1151 
1152 		if (needs_clean) {
1153 			wasclean = FALSE;
1154 			memset(pgs, 0, sizeof(pgs));
1155 			pg->flags |= PG_BUSY;
1156 			UVM_PAGE_OWN(pg, "genfs_putpages");
1157 
1158 			/*
1159 			 * first look backward.
1160 			 */
1161 
1162 			npages = MIN(n >> 1, off >> PAGE_SHIFT);
1163 			nback = npages;
1164 			uvn_findpages(uobj, off - PAGE_SIZE, &nback, &pgs[0],
1165 			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
1166 			if (nback) {
1167 				memmove(&pgs[0], &pgs[npages - nback],
1168 				    nback * sizeof(pgs[0]));
1169 			}
1170 			n -= nback;
1171 
1172 			/*
1173 			 * then plug in our page of interest.
1174 			 */
1175 
1176 			pgs[nback] = pg;
1177 
1178 			/*
1179 			 * then look forward to fill in the remaining space in
1180 			 * the array of pages.
1181 			 */
1182 
1183 			npages = MIN(n, (endoff - off) >> PAGE_SHIFT) - 1;
1184 			uvn_findpages(uobj, off + PAGE_SIZE, &npages,
1185 			    &pgs[nback + 1],
1186 			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
1187 			npages += nback + 1;
1188 		} else {
1189 			pgs[0] = pg;
1190 			npages = 1;
1191 		}
1192 
1193 		/*
1194 		 * apply FREE or DEACTIVATE options if requested.
1195 		 */
1196 
1197 		if (flags & (PGO_DEACTIVATE|PGO_FREE)) {
1198 			uvm_lock_pageq();
1199 		}
1200 		for (i = 0; i < npages; i++) {
1201 			tpg = pgs[i];
1202 			KASSERT(tpg->uobject == uobj);
1203 			if (flags & PGO_DEACTIVATE &&
1204 			    (tpg->pqflags & PQ_INACTIVE) == 0 &&
1205 			    tpg->wire_count == 0) {
1206 				(void) pmap_clear_reference(tpg);
1207 				uvm_pagedeactivate(tpg);
1208 			} else if (flags & PGO_FREE) {
1209 				pmap_page_protect(tpg, VM_PROT_NONE);
1210 				if (tpg->flags & PG_BUSY) {
1211 					tpg->flags |= freeflag;
1212 					if (freeflag == PG_PAGEOUT) {
1213 						uvmexp.paging++;
1214 						uvm_pagedequeue(tpg);
1215 					}
1216 				} else {
1217 					nextpg = TAILQ_NEXT(tpg, listq);
1218 					uvm_pagefree(tpg);
1219 				}
1220 			}
1221 		}
1222 		if (flags & (PGO_DEACTIVATE|PGO_FREE)) {
1223 			uvm_unlock_pageq();
1224 		}
1225 		if (needs_clean) {
1226 
1227 			/*
1228 			 * start the i/o.  if we're traversing by list,
1229 			 * keep our place in the list with a marker page.
1230 			 */
1231 
1232 			if (by_list) {
1233 				TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp,
1234 				    listq);
1235 			}
1236 			simple_unlock(slock);
1237 			error = GOP_WRITE(vp, pgs, npages, flags);
1238 			simple_lock(slock);
1239 			if (by_list) {
1240 				pg = TAILQ_NEXT(&curmp, listq);
1241 				TAILQ_REMOVE(&uobj->memq, &curmp, listq);
1242 			}
1243 			if (error == ENOMEM) {
1244 				for (i = 0; i < npages; i++) {
1245 					tpg = pgs[i];
1246 					if (tpg->flags & PG_PAGEOUT) {
1247 						tpg->flags &= ~PG_PAGEOUT;
1248 						uvmexp.paging--;
1249 					}
1250 					tpg->flags &= ~PG_CLEAN;
1251 					uvm_pageactivate(tpg);
1252 				}
1253 				uvm_page_unbusy(pgs, npages);
1254 			}
1255 			if (error) {
1256 				break;
1257 			}
1258 			if (by_list) {
1259 				continue;
1260 			}
1261 		}
1262 
1263 		/*
1264 		 * find the next page and continue if there was no error.
1265 		 */
1266 
1267 		if (by_list) {
1268 			if (nextpg) {
1269 				pg = nextpg;
1270 				nextpg = NULL;
1271 			} else {
1272 				pg = TAILQ_NEXT(pg, listq);
1273 			}
1274 		} else {
1275 			off += npages << PAGE_SHIFT;
1276 			if (off < endoff) {
1277 				pg = uvm_pagelookup(uobj, off);
1278 			}
1279 		}
1280 	}
1281 	if (by_list) {
1282 		TAILQ_REMOVE(&uobj->memq, &endmp, listq);
1283 		PRELE(curproc);
1284 	}
1285 
1286 	/*
1287 	 * if we're cleaning and there was nothing to clean,
1288 	 * take us off the syncer list.  if we started any i/o
1289 	 * and we're doing sync i/o, wait for all writes to finish.
1290 	 */
1291 
1292 	if ((flags & PGO_CLEANIT) && wasclean &&
1293 	    startoff == 0 && endoff == trunc_page(LLONG_MAX) &&
1294 	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
1295 	    (vp->v_flag & VONWORKLST)) {
1296 		vp->v_flag &= ~VONWORKLST;
1297 		LIST_REMOVE(vp, v_synclist);
1298 	}
1299 	if (!wasclean && !async) {
1300 		s = splbio();
1301 		while (vp->v_numoutput != 0) {
1302 			vp->v_flag |= VBWAIT;
1303 			UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, slock, FALSE,
1304 			    "genput2", 0);
1305 			simple_lock(slock);
1306 		}
1307 		splx(s);
1308 	}
1309 	simple_unlock(&uobj->vmobjlock);
1310 	return error;
1311 }
1312 
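/*
 * Generic GOP_WRITE routine used by genfs_putpages: map the pages into
 * kernel virtual space, carve the range into contiguous runs with
 * VOP_BMAP and issue one buf per run, skipping unallocated (hole) blocks.
 */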
1313 int
1314 genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
1315 {
1316 	int s, error, run;
1317 	int fs_bshift, dev_bshift;
1318 	vaddr_t kva;
1319 	off_t eof, offset, startoffset;
1320 	size_t bytes, iobytes, skipbytes;
1321 	daddr_t lbn, blkno;
1322 	struct vm_page *pg;
1323 	struct buf *mbp, *bp;
1324 	struct vnode *devvp;
1325 	boolean_t async = (flags & PGO_SYNCIO) == 0;
1326 	UVMHIST_FUNC("genfs_gop_write"); UVMHIST_CALLED(ubchist);
1327 
1328 	UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
1329 	    vp, pgs, npages, flags);
1330 
1331 	GOP_SIZE(vp, vp->v_size, &eof);
1332 	if (vp->v_type == VREG) {
1333 		fs_bshift = vp->v_mount->mnt_fs_bshift;
1334 		dev_bshift = vp->v_mount->mnt_dev_bshift;
1335 	} else {
1336 		fs_bshift = DEV_BSHIFT;
1337 		dev_bshift = DEV_BSHIFT;
1338 	}
1339 	error = 0;
1340 	pg = pgs[0];
1341 	startoffset = pg->offset;
1342 	bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
1343 	skipbytes = 0;
1344 	KASSERT(bytes != 0);
1345 
1346 	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
1347 			     UVMPAGER_MAPIN_WAITOK);
1348 
1349 	s = splbio();
1350 	vp->v_numoutput += 2;
1351 	mbp = pool_get(&bufpool, PR_WAITOK);
1352 	UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
1353 		    vp, mbp, vp->v_numoutput, bytes);
1354 	splx(s);
1355 	mbp->b_bufsize = npages << PAGE_SHIFT;
1356 	mbp->b_data = (void *)kva;
1357 	mbp->b_resid = mbp->b_bcount = bytes;
1358 	mbp->b_flags = B_BUSY|B_WRITE|B_AGE| (async ? (B_CALL|B_ASYNC) : 0);
1359 	mbp->b_iodone = uvm_aio_biodone;
1360 	mbp->b_vp = vp;
1361 	LIST_INIT(&mbp->b_dep);
1362 
1363 	bp = NULL;
1364 	for (offset = startoffset;
1365 	     bytes > 0;
1366 	     offset += iobytes, bytes -= iobytes) {
1367 		lbn = offset >> fs_bshift;
1368 		error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
1369 		if (error) {
1370 			UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0);
1371 			skipbytes += bytes;
1372 			bytes = 0;
1373 			break;
1374 		}
1375 
1376 		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
1377 		    bytes);
1378 		if (blkno == (daddr_t)-1) {
1379 			skipbytes += iobytes;
1380 			continue;
1381 		}
1382 
1383 		/* if it's really one i/o, don't make a second buf */
1384 		if (offset == startoffset && iobytes == bytes) {
1385 			bp = mbp;
1386 		} else {
1387 			s = splbio();
1388 			vp->v_numoutput++;
1389 			bp = pool_get(&bufpool, PR_WAITOK);
1390 			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
1391 				    vp, bp, vp->v_numoutput, 0);
1392 			splx(s);
1393 			bp->b_data = (char *)kva +
1394 				(vaddr_t)(offset - pg->offset);
1395 			bp->b_resid = bp->b_bcount = iobytes;
1396 			bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
1397 			bp->b_iodone = uvm_aio_biodone1;
1398 			bp->b_vp = vp;
1399 			LIST_INIT(&bp->b_dep);
1400 		}
1401 		bp->b_lblkno = 0;
1402 		bp->b_private = mbp;
1403 		if (devvp->v_type == VBLK) {
1404 			bp->b_dev = devvp->v_rdev;
1405 		}
1406 
1407 		/* adjust physical blkno for partial blocks */
1408 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
1409 				       dev_bshift);
1410 		UVMHIST_LOG(ubchist, "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
1411 			    vp, offset, bp->b_bcount, bp->b_blkno);
1412 		VOP_STRATEGY(bp);
1413 	}
1414 	if (skipbytes) {
1415 		UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0);
1416 		s = splbio();
1417 		if (error) {
1418 			mbp->b_flags |= B_ERROR;
1419 			mbp->b_error = error;
1420 		}
1421 		mbp->b_resid -= skipbytes;
1422 		if (mbp->b_resid == 0) {
1423 			biodone(mbp);
1424 		}
1425 		splx(s);
1426 	}
1427 	if (async) {
1428 		UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
1429 		return 0;
1430 	}
1431 	UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
1432 	error = biowait(mbp);
1433 	uvm_aio_aiodone(mbp);
1434 	UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
1435 	return error;
1436 }
1437 
1438 /*
1439  * VOP_PUTPAGES() for vnodes which never have pages.
1440  */
1441 
1442 int
1443 genfs_null_putpages(void *v)
1444 {
1445 	struct vop_putpages_args /* {
1446 		struct vnode *a_vp;
1447 		voff_t a_offlo;
1448 		voff_t a_offhi;
1449 		int a_flags;
1450 	} */ *ap = v;
1451 	struct vnode *vp = ap->a_vp;
1452 
1453 	KASSERT(vp->v_uobj.uo_npages == 0);
1454 	simple_unlock(&vp->v_interlock);
1455 	return (0);
1456 }
1457 
1458 void
1459 genfs_node_init(struct vnode *vp, struct genfs_ops *ops)
1460 {
1461 	struct genfs_node *gp = VTOG(vp);
1462 
1463 	lockinit(&gp->g_glock, PINOD, "glock", 0, 0);
1464 	gp->g_op = ops;
1465 }
1466 
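/*
 * Round the given file size up to a filesystem block boundary.
 */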
1467 void
1468 genfs_size(struct vnode *vp, off_t size, off_t *eobp)
1469 {
1470 	int bsize;
1471 
1472 	bsize = 1 << vp->v_mount->mnt_fs_bshift;
1473 	*eobp = (size + bsize - 1) & ~(bsize - 1);
1474 }
1475 
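/*
 * Compatibility getpages: satisfy the request by filling any PG_FAKE
 * pages with VOP_READ through a pager mapping.
 */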
1476 int
1477 genfs_compat_getpages(void *v)
1478 {
1479 	struct vop_getpages_args /* {
1480 		struct vnode *a_vp;
1481 		voff_t a_offset;
1482 		struct vm_page **a_m;
1483 		int *a_count;
1484 		int a_centeridx;
1485 		vm_prot_t a_access_type;
1486 		int a_advice;
1487 		int a_flags;
1488 	} */ *ap = v;
1489 
1490 	off_t origoffset;
1491 	struct vnode *vp = ap->a_vp;
1492 	struct uvm_object *uobj = &vp->v_uobj;
1493 	struct vm_page *pg, **pgs;
1494 	vaddr_t kva;
1495 	int i, error, orignpages, npages;
1496 	struct iovec iov;
1497 	struct uio uio;
1498 	struct ucred *cred = curproc->p_ucred;
1499 	boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
1500 
1501 	error = 0;
1502 	origoffset = ap->a_offset;
1503 	orignpages = *ap->a_count;
1504 	pgs = ap->a_m;
1505 
1506 	if (write && (vp->v_flag & VONWORKLST) == 0) {
1507 		vn_syncer_add_to_worklist(vp, filedelay);
1508 	}
1509 	if (ap->a_flags & PGO_LOCKED) {
1510 		uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
1511 			      UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY);
1512 
1513 		return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
1514 	}
1515 	if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) {
1516 		simple_unlock(&uobj->vmobjlock);
1517 		return EINVAL;
1518 	}
1519 	npages = orignpages;
1520 	uvn_findpages(uobj, origoffset, &npages, pgs, UFP_ALL);
1521 	simple_unlock(&uobj->vmobjlock);
1522 	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
1523 			     UVMPAGER_MAPIN_READ);
1524 	for (i = 0; i < npages; i++) {
1525 		pg = pgs[i];
1526 		if ((pg->flags & PG_FAKE) == 0) {
1527 			continue;
1528 		}
1529 		iov.iov_base = (char *)kva + (i << PAGE_SHIFT);
1530 		iov.iov_len = PAGE_SIZE;
1531 		uio.uio_iov = &iov;
1532 		uio.uio_iovcnt = 1;
1533 		uio.uio_offset = origoffset + (i << PAGE_SHIFT);
1534 		uio.uio_segflg = UIO_SYSSPACE;
1535 		uio.uio_rw = UIO_READ;
1536 		uio.uio_resid = PAGE_SIZE;
1537 		uio.uio_procp = curproc;
1538 		error = VOP_READ(vp, &uio, 0, cred);
1539 		if (error) {
1540 			break;
1541 		}
1542 	}
1543 	uvm_pagermapout(kva, npages);
1544 	simple_lock(&uobj->vmobjlock);
1545 	uvm_lock_pageq();
1546 	for (i = 0; i < npages; i++) {
1547 		pg = pgs[i];
1548 		if (error && (pg->flags & PG_FAKE) != 0) {
1549 			pg->flags |= PG_RELEASED;
1550 		} else {
1551 			pmap_clear_modify(pg);
1552 			uvm_pageactivate(pg);
1553 		}
1554 	}
1555 	if (error) {
1556 		uvm_page_unbusy(pgs, npages);
1557 	}
1558 	uvm_unlock_pageq();
1559 	simple_unlock(&uobj->vmobjlock);
1560 	return error;
1561 }
1562 
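/*
 * Compatibility page-out backend: write the pages with VOP_WRITE, then
 * fake up a buf describing the i/o and hand it to uvm_aio_aiodone()
 * for the usual completion processing.
 */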
1563 int
1564 genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
1565     int flags)
1566 {
1567 	off_t offset;
1568 	struct iovec iov;
1569 	struct uio uio;
1570 	struct ucred *cred = curproc->p_ucred;
1571 	struct buf *bp;
1572 	vaddr_t kva;
1573 	int s, error;
1574 
1575 	offset = pgs[0]->offset;
1576 	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
1577 			     UVMPAGER_MAPIN_WAITOK);
1578 
1579 	iov.iov_base = (void *)kva;
1580 	iov.iov_len = npages << PAGE_SHIFT;
1581 	uio.uio_iov = &iov;
1582 	uio.uio_iovcnt = 1;
1583 	uio.uio_offset = offset;
1584 	uio.uio_segflg = UIO_SYSSPACE;
1585 	uio.uio_rw = UIO_WRITE;
1586 	uio.uio_resid = npages << PAGE_SHIFT;
1587 	uio.uio_procp = curproc;
1588 	error = VOP_WRITE(vp, &uio, 0, cred);
1589 
1590 	s = splbio();
1591 	vp->v_numoutput++;
1592 	bp = pool_get(&bufpool, PR_WAITOK);
1593 	splx(s);
1594 
1595 	bp->b_flags = B_BUSY | B_WRITE | B_AGE;
1596 	bp->b_vp = vp;
1597 	bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift;
1598 	bp->b_data = (char *)kva;
1599 	bp->b_bcount = npages << PAGE_SHIFT;
1600 	bp->b_bufsize = npages << PAGE_SHIFT;
1601 	bp->b_resid = 0;
1602 	LIST_INIT(&bp->b_dep);
1603 	if (error) {
1604 		bp->b_flags |= B_ERROR;
1605 		bp->b_error = error;
1606 	}
1607 	uvm_aio_aiodone(bp);
1608 	return error;
1609 }
1610