xref: /netbsd-src/sys/miscfs/genfs/genfs_vnops.c (revision 8a8f936f250a330d54f8a24ed0e92aadf9743a7b)
1 /*	$NetBSD: genfs_vnops.c,v 1.39 2001/10/03 14:13:08 enami Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  */
36 
37 #include "opt_nfsserver.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/kernel.h>
43 #include <sys/mount.h>
44 #include <sys/namei.h>
45 #include <sys/vnode.h>
46 #include <sys/fcntl.h>
47 #include <sys/malloc.h>
48 #include <sys/poll.h>
49 #include <sys/mman.h>
50 
51 #include <miscfs/genfs/genfs.h>
52 #include <miscfs/genfs/genfs_node.h>
53 #include <miscfs/specfs/specdev.h>
54 
55 #include <uvm/uvm.h>
56 #include <uvm/uvm_pager.h>
57 
58 #ifdef NFSSERVER
59 #include <nfs/rpcv2.h>
60 #include <nfs/nfsproto.h>
61 #include <nfs/nfs.h>
62 #include <nfs/nqnfs.h>
63 #include <nfs/nfs_var.h>
64 #endif
65 
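/*
 * Generic poll: report the vnode as always ready for the normal
 * read and write events the caller asked about.
 */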
66 int
67 genfs_poll(v)
68 	void *v;
69 {
70 	struct vop_poll_args /* {
71 		struct vnode *a_vp;
72 		int a_events;
73 		struct proc *a_p;
74 	} */ *ap = v;
75 
76 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
77 }
78 
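/*
 * Generic fsync: flush the vnode's dirty buffers, then bring its
 * on-disk metadata up to date with VOP_UPDATE() unless the caller
 * asked for data only (FSYNC_DATAONLY).
 */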
79 int
80 genfs_fsync(v)
81 	void *v;
82 {
83 	struct vop_fsync_args /* {
84 		struct vnode *a_vp;
85 		struct ucred *a_cred;
86 		int a_flags;
87 		off_t a_offlo;
88 		off_t a_offhi;
89 		struct proc *a_p;
90 	} */ *ap = v;
91 	struct vnode *vp = ap->a_vp;
92 	int wait;
93 
94 	wait = (ap->a_flags & FSYNC_WAIT) != 0;
95 	vflushbuf(vp, wait);
96 	if ((ap->a_flags & FSYNC_DATAONLY) != 0)
97 		return (0);
98 	else
99 		return (VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0));
100 }
101 
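/*
 * Generic seek: the only generic check is that the new offset
 * is not negative.
 */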
102 int
103 genfs_seek(v)
104 	void *v;
105 {
106 	struct vop_seek_args /* {
107 		struct vnode *a_vp;
108 		off_t a_oldoff;
109 		off_t a_newoff;
110 		struct ucred *a_cred;
111 	} */ *ap = v;
112 
113 	if (ap->a_newoff < 0)
114 		return (EINVAL);
115 
116 	return (0);
117 }
118 
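/*
 * Generic abortop: free the pathname buffer left over from a lookup,
 * unless the caller asked to keep it around (SAVESTART).
 */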
119 int
120 genfs_abortop(v)
121 	void *v;
122 {
123 	struct vop_abortop_args /* {
124 		struct vnode *a_dvp;
125 		struct componentname *a_cnp;
126 	} */ *ap = v;
127 
128 	if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF)
129 		PNBUF_PUT(ap->a_cnp->cn_pnbuf);
130 	return (0);
131 }
132 
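/*
 * Generic fcntl: F_SETFL needs no filesystem-specific work;
 * everything else is unsupported.
 */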
133 int
134 genfs_fcntl(v)
135 	void *v;
136 {
137 	struct vop_fcntl_args /* {
138 		struct vnode *a_vp;
139 		u_int a_command;
140 		caddr_t a_data;
141 		int a_fflag;
142 		struct ucred *a_cred;
143 		struct proc *a_p;
144 	} */ *ap = v;
145 
146 	if (ap->a_command == F_SETFL)
147 		return (0);
148 	else
149 		return (EOPNOTSUPP);
150 }
151 
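/*
 * Stock vop for operations that should never be reached on this kind
 * of vnode; getting here indicates a coding error, so panic.
 */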
152 /*ARGSUSED*/
153 int
154 genfs_badop(v)
155 	void *v;
156 {
157 
158 	panic("genfs: bad op");
159 }
160 
161 /*ARGSUSED*/
162 int
163 genfs_nullop(v)
164 	void *v;
165 {
166 
167 	return (0);
168 }
169 
170 /*ARGSUSED*/
171 int
172 genfs_einval(v)
173 	void *v;
174 {
175 
176 	return (EINVAL);
177 }
178 
179 /*ARGSUSED*/
180 int
181 genfs_eopnotsupp(v)
182 	void *v;
183 {
184 
185 	return (EOPNOTSUPP);
186 }
187 
188 /*
189  * Called when an fs doesn't support a particular vop, but the vop still
190  * needs to vrele, vput, or unlock the passed-in vnodes.
191  */
192 int
193 genfs_eopnotsupp_rele(v)
194 	void *v;
195 {
196 	struct vop_generic_args /* {
197 		struct vnodeop_desc *a_desc;
198 		/ * other random data follows, presumably * /
199 	} */ *ap = v;
200 	struct vnodeop_desc *desc = ap->a_desc;
201 	struct vnode *vp;
202 	int flags, i, j, offset;
203 
204 	flags = desc->vdesc_flags;
205 	for (i = 0; i < VDESC_MAX_VPS; flags >>=1, i++) {
206 		if ((offset = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET)
207 			break;	/* stop at end of list */
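		/*
		 * VDESC_VP0_WILLPUT covers the WILLRELE and WILLUNLOCK bits
		 * as well, so this test catches every vnode the caller
		 * expects us to put, unlock, or release; the switch below
		 * tells the three cases apart.
		 */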
208 		if ((j = flags & VDESC_VP0_WILLPUT)) {
209 			vp = *VOPARG_OFFSETTO(struct vnode**,offset,ap);
210 			switch (j) {
211 			case VDESC_VP0_WILLPUT:
212 				vput(vp);
213 				break;
214 			case VDESC_VP0_WILLUNLOCK:
215 				VOP_UNLOCK(vp, 0);
216 				break;
217 			case VDESC_VP0_WILLRELE:
218 				vrele(vp);
219 				break;
220 			}
221 		}
222 	}
223 
224 	return (EOPNOTSUPP);
225 }
226 
227 /*ARGSUSED*/
228 int
229 genfs_ebadf(v)
230 	void *v;
231 {
232 
233 	return (EBADF);
234 }
235 
236 /* ARGSUSED */
237 int
238 genfs_enoioctl(v)
239 	void *v;
240 {
241 
242 	return (ENOTTY);
243 }
244 
245 
246 /*
247  * Eliminate all activity associated with the requested vnode
248  * and with all vnodes aliased to the requested vnode.
249  */
250 int
251 genfs_revoke(v)
252 	void *v;
253 {
254 	struct vop_revoke_args /* {
255 		struct vnode *a_vp;
256 		int a_flags;
257 	} */ *ap = v;
258 	struct vnode *vp, *vq;
259 	struct proc *p = curproc;	/* XXX */
260 
261 #ifdef DIAGNOSTIC
262 	if ((ap->a_flags & REVOKEALL) == 0)
263 		panic("genfs_revoke: not revokeall");
264 #endif
265 
266 	vp = ap->a_vp;
267 	simple_lock(&vp->v_interlock);
268 
269 	if (vp->v_flag & VALIASED) {
270 		/*
271 		 * If a vgone (or vclean) is already in progress,
272 		 * wait until it is done and return.
273 		 */
274 		if (vp->v_flag & VXLOCK) {
275 			vp->v_flag |= VXWANT;
276 			simple_unlock(&vp->v_interlock);
277 			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
278 			return (0);
279 		}
280 		/*
281 		 * Ensure that vp will not be vgone'd while we
282 		 * are eliminating its aliases.
283 		 */
284 		vp->v_flag |= VXLOCK;
285 		simple_unlock(&vp->v_interlock);
286 		while (vp->v_flag & VALIASED) {
287 			simple_lock(&spechash_slock);
288 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
289 				if (vq->v_rdev != vp->v_rdev ||
290 				    vq->v_type != vp->v_type || vp == vq)
291 					continue;
292 				simple_unlock(&spechash_slock);
293 				vgone(vq);
294 				break;
295 			}
296 			if (vq == NULLVP)
297 				simple_unlock(&spechash_slock);
298 		}
299 		/*
300 		 * Remove the lock so that vgone below will
301 		 * really eliminate the vnode after which time
302 		 * vgone will awaken any sleepers.
303 		 */
304 		simple_lock(&vp->v_interlock);
305 		vp->v_flag &= ~VXLOCK;
306 	}
307 	vgonel(vp, p);
308 	return (0);
309 }
310 
311 /*
312  * Lock the node.
313  */
314 int
315 genfs_lock(v)
316 	void *v;
317 {
318 	struct vop_lock_args /* {
319 		struct vnode *a_vp;
320 		int a_flags;
321 	} */ *ap = v;
322 	struct vnode *vp = ap->a_vp;
323 
324 	return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock));
325 }
326 
327 /*
328  * Unlock the node.
329  */
330 int
331 genfs_unlock(v)
332 	void *v;
333 {
334 	struct vop_unlock_args /* {
335 		struct vnode *a_vp;
336 		int a_flags;
337 	} */ *ap = v;
338 	struct vnode *vp = ap->a_vp;
339 
340 	return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE,
341 		&vp->v_interlock));
342 }
343 
344 /*
345  * Return whether or not the node is locked.
346  */
347 int
348 genfs_islocked(v)
349 	void *v;
350 {
351 	struct vop_islocked_args /* {
352 		struct vnode *a_vp;
353 	} */ *ap = v;
354 	struct vnode *vp = ap->a_vp;
355 
356 	return (lockstatus(&vp->v_lock));
357 }
358 
359 /*
360  * Stubs to use when there is no locking to be done on the underlying object.
361  */
362 int
363 genfs_nolock(v)
364 	void *v;
365 {
366 	struct vop_lock_args /* {
367 		struct vnode *a_vp;
368 		int a_flags;
369 		struct proc *a_p;
370 	} */ *ap = v;
371 
372 	/*
373 	 * Since we are not using the lock manager, we must clear
374 	 * the interlock here.
375 	 */
376 	if (ap->a_flags & LK_INTERLOCK)
377 		simple_unlock(&ap->a_vp->v_interlock);
378 	return (0);
379 }
380 
381 int
382 genfs_nounlock(v)
383 	void *v;
384 {
385 	return (0);
386 }
387 
388 int
389 genfs_noislocked(v)
390 	void *v;
391 {
392 	return (0);
393 }
394 
395 /*
396  * Local lease check for NFS servers.  Just set up args and let
397  * nqsrv_getlease() do the rest.  If NFSSERVER is not in the kernel,
398  * this is a null operation.
399  */
400 int
401 genfs_lease_check(v)
402 	void *v;
403 {
404 #ifdef NFSSERVER
405 	struct vop_lease_args /* {
406 		struct vnode *a_vp;
407 		struct proc *a_p;
408 		struct ucred *a_cred;
409 		int a_flag;
410 	} */ *ap = v;
411 	u_int32_t duration = 0;
412 	int cache;
413 	u_quad_t frev;
414 
415 	(void) nqsrv_getlease(ap->a_vp, &duration, ND_CHECK | ap->a_flag,
416 	    NQLOCALSLP, ap->a_p, (struct mbuf *)0, &cache, &frev, ap->a_cred);
417 	return (0);
418 #else
419 	return (0);
420 #endif /* NFSSERVER */
421 }
422 
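/*
 * Generic mmap: nothing for the filesystem to do here, so just succeed.
 */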
423 int
424 genfs_mmap(v)
425 	void *v;
426 {
427 	return 0;
428 }
429 
430 /*
431  * generic VM getpages routine.
432  * Return PG_BUSY pages for the given range,
433  * reading from backing store if necessary.
434  */
435 
436 int
437 genfs_getpages(v)
438 	void *v;
439 {
440 	struct vop_getpages_args /* {
441 		struct vnode *a_vp;
442 		voff_t a_offset;
443 		struct vm_page **a_m;
444 		int *a_count;
445 		int a_centeridx;
446 		vm_prot_t a_access_type;
447 		int a_advice;
448 		int a_flags;
449 	} */ *ap = v;
450 
451 	off_t newsize, diskeof, memeof;
452 	off_t offset, origoffset, startoffset, endoffset, raoffset;
453 	daddr_t lbn, blkno;
454 	int s, i, error, npages, orignpages, npgs, run, ridx, pidx, pcount;
455 	int fs_bshift, fs_bsize, dev_bshift;
456 	int flags = ap->a_flags;
457 	size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
458 	vaddr_t kva;
459 	struct buf *bp, *mbp;
460 	struct vnode *vp = ap->a_vp;
461 	struct vnode *devvp;
462 	struct genfs_node *gp = VTOG(vp);
463 	struct uvm_object *uobj = &vp->v_uobj;
464 	struct vm_page *pg, *pgs[16];			/* XXXUBC 16 */
465 	struct ucred *cred = curproc->p_ucred;		/* XXXUBC curproc */
466 	boolean_t async = (flags & PGO_SYNCIO) == 0;
467 	boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
468 	boolean_t sawhole = FALSE;
469 	boolean_t overwrite = (flags & PGO_OVERWRITE) != 0;
470 	UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
471 
472 	UVMHIST_LOG(ubchist, "vp %p off 0x%x/%x count %d",
473 		    vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
474 
475 	/* XXXUBC temp limit */
476 	if (*ap->a_count > 16) {
477 		panic("genfs_getpages: too many pages");
478 	}
479 
480 	error = 0;
481 	origoffset = ap->a_offset;
482 	orignpages = *ap->a_count;
483 	GOP_SIZE(vp, vp->v_size, &diskeof);
484 	if (flags & PGO_PASTEOF) {
485 		newsize = MAX(vp->v_size,
486 			      origoffset + (orignpages << PAGE_SHIFT));
487 		GOP_SIZE(vp, newsize, &memeof);
488 	} else {
489 		memeof = diskeof;
490 	}
491 	KASSERT(ap->a_centeridx >= 0 && ap->a_centeridx <= orignpages);
492 	KASSERT((origoffset & (PAGE_SIZE - 1)) == 0 && origoffset >= 0);
493 	KASSERT(orignpages > 0);
494 
495 	/*
496 	 * Bounds-check the request.
497 	 */
498 
499 	if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
500 		if ((flags & PGO_LOCKED) == 0) {
501 			simple_unlock(&uobj->vmobjlock);
502 		}
503 		UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x",
504 			    origoffset, *ap->a_count, memeof,0);
505 		return EINVAL;
506 	}
507 
508 	/*
509 	 * For PGO_LOCKED requests, just return whatever's in memory.
510 	 */
511 
512 	if (flags & PGO_LOCKED) {
513 		uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
514 			      UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY);
515 
516 		return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
517 	}
518 
519 	/* vnode is VOP_LOCKed, uobj is locked */
520 
521 	if (write && (vp->v_flag & VONWORKLST) == 0) {
522 		vn_syncer_add_to_worklist(vp, filedelay);
523 	}
524 
525 	/*
526 	 * find the requested pages and make some simple checks.
527 	 * leave space in the page array for a whole block.
528 	 */
529 
530 	if (vp->v_type == VREG) {
531 		fs_bshift = vp->v_mount->mnt_fs_bshift;
532 		dev_bshift = vp->v_mount->mnt_dev_bshift;
533 	} else {
534 		fs_bshift = DEV_BSHIFT;
535 		dev_bshift = DEV_BSHIFT;
536 	}
537 	fs_bsize = 1 << fs_bshift;
538 
539 	orignpages = MIN(orignpages,
540 	    round_page(memeof - origoffset) >> PAGE_SHIFT);
541 	npages = orignpages;
542 	startoffset = origoffset & ~(fs_bsize - 1);
543 	endoffset = round_page((origoffset + (npages << PAGE_SHIFT)
544 				+ fs_bsize - 1) & ~(fs_bsize - 1));
545 	endoffset = MIN(endoffset, round_page(memeof));
546 	ridx = (origoffset - startoffset) >> PAGE_SHIFT;
547 
548 	memset(pgs, 0, sizeof(pgs));
549 	uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL);
550 
551 	/*
552 	 * if the pages are already resident, just return them.
553 	 */
554 
555 	for (i = 0; i < npages; i++) {
556 		struct vm_page *pg = pgs[ridx + i];
557 
558 		if ((pg->flags & PG_FAKE) ||
559 		    (write && (pg->flags & PG_RDONLY))) {
560 			break;
561 		}
562 	}
563 	if (i == npages) {
564 		UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
565 		raoffset = origoffset + (orignpages << PAGE_SHIFT);
566 		npages += ridx;
567 		goto raout;
568 	}
569 
570 	/*
571 	 * if PGO_OVERWRITE is set, don't bother reading the pages.
572 	 */
573 
574 	if (flags & PGO_OVERWRITE) {
575 		UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
576 
577 		for (i = 0; i < npages; i++) {
578 			struct vm_page *pg = pgs[ridx + i];
579 
580 			pg->flags &= ~(PG_RDONLY|PG_CLEAN);
581 		}
582 		npages += ridx;
583 		goto out;
584 	}
585 
586 	/*
587 	 * the page wasn't resident and we're not overwriting,
588 	 * so we're going to have to do some i/o.
589 	 * find any additional pages needed to cover the expanded range.
590 	 */
591 
592 	npages = (endoffset - startoffset) >> PAGE_SHIFT;
593 	if (startoffset != origoffset || npages != orignpages) {
594 
595 		/*
596 		 * we need to avoid deadlocks caused by locking
597 		 * additional pages at lower offsets than pages we
598 		 * already have locked.  unlock them all and start over.
599 		 */
600 
601 		for (i = 0; i < orignpages; i++) {
602 			struct vm_page *pg = pgs[ridx + i];
603 
604 			if (pg->flags & PG_FAKE) {
605 				pg->flags |= PG_RELEASED;
606 			}
607 		}
608 		uvm_page_unbusy(&pgs[ridx], orignpages);
609 		memset(pgs, 0, sizeof(pgs));
610 
611 		UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x",
612 			    startoffset, endoffset, 0,0);
613 		npgs = npages;
614 		uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL);
615 	}
616 	simple_unlock(&uobj->vmobjlock);
617 
618 	/*
619 	 * read the desired page(s).
620 	 */
621 
622 	totalbytes = npages << PAGE_SHIFT;
623 	bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
624 	tailbytes = totalbytes - bytes;
625 	skipbytes = 0;
626 
627 	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
628 			     UVMPAGER_MAPIN_READ);
629 
630 	s = splbio();
631 	mbp = pool_get(&bufpool, PR_WAITOK);
632 	splx(s);
633 	mbp->b_bufsize = totalbytes;
634 	mbp->b_data = (void *)kva;
635 	mbp->b_resid = mbp->b_bcount = bytes;
636 	mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL : 0);
637 	mbp->b_iodone = (async ? uvm_aio_biodone : 0);
638 	mbp->b_vp = vp;
639 	LIST_INIT(&mbp->b_dep);
640 
641 	/*
642 	 * if EOF is in the middle of the range, zero the part past EOF.
643 	 * if the page including EOF is not PG_FAKE, skip over it since
644 	 * in that case it has valid data that we need to preserve.
645 	 */
646 
647 	if (tailbytes > 0) {
648 		size_t tailstart = bytes;
649 
650 		if ((pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE) == 0) {
651 			tailstart = round_page(tailstart);
652 			tailbytes -= tailstart - bytes;
653 		}
654 		UVMHIST_LOG(ubchist, "tailbytes %p 0x%x 0x%x",
655 			    kva, tailstart, tailbytes,0);
656 		memset((void *)(kva + tailstart), 0, tailbytes);
657 	}
658 
659 	/*
660 	 * now loop over the pages, reading as needed.
661 	 */
662 
663 	if (write) {
664 		lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL);
665 	} else {
666 		lockmgr(&gp->g_glock, LK_SHARED, NULL);
667 	}
668 
669 	bp = NULL;
670 	for (offset = startoffset;
671 	     bytes > 0;
672 	     offset += iobytes, bytes -= iobytes) {
673 
674 		/*
675 		 * skip pages which don't need to be read.
676 		 */
677 
678 		pidx = (offset - startoffset) >> PAGE_SHIFT;
679 		while ((pgs[pidx]->flags & (PG_FAKE|PG_RDONLY)) == 0) {
680 			size_t b;
681 
682 			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
683 			b = MIN(PAGE_SIZE, bytes);
684 			offset += b;
685 			bytes -= b;
686 			skipbytes += b;
687 			pidx++;
688 			UVMHIST_LOG(ubchist, "skipping, new offset 0x%x",
689 				    offset, 0,0,0);
690 			if (bytes == 0) {
691 				goto loopdone;
692 			}
693 		}
694 
695 		/*
696 		 * bmap the file to find out the blkno to read from and
697 		 * how much we can read in one i/o.  if bmap returns an error,
698 		 * skip the rest of the top-level i/o.
699 		 */
700 
701 		lbn = offset >> fs_bshift;
702 		error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
703 		if (error) {
704 			UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n",
705 				    lbn, error,0,0);
706 			skipbytes += bytes;
707 			goto loopdone;
708 		}
709 
710 		/*
711 		 * see how many pages can be read with this i/o.
712 		 * reduce the i/o size if necessary to avoid
713 		 * overwriting pages with valid data.
714 		 */
715 
716 		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
717 		    bytes);
718 		if (offset + iobytes > round_page(offset)) {
719 			pcount = 1;
720 			while (pidx + pcount < npages &&
721 			       pgs[pidx + pcount]->flags & PG_FAKE) {
722 				pcount++;
723 			}
724 			iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
725 				      (offset - trunc_page(offset)));
726 		}
727 
728 		/*
729 		 * if this block isn't allocated, zero it instead of reading it.
730 		 * if this is a read access, mark the pages we zeroed PG_RDONLY.
731 		 */
732 
733 		if (blkno < 0) {
734 			int holepages = (round_page(offset + iobytes) -
735 					 trunc_page(offset)) >> PAGE_SHIFT;
736 			UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE", lbn,0,0,0);
737 
738 			sawhole = TRUE;
739 			memset((char *)kva + (offset - startoffset), 0,
740 			       iobytes);
741 			skipbytes += iobytes;
742 
743 			for (i = 0; i < holepages; i++) {
744 				if (write) {
745 					pgs[pidx + i]->flags &= ~PG_CLEAN;
746 				} else {
747 					pgs[pidx + i]->flags |= PG_RDONLY;
748 				}
749 			}
750 			continue;
751 		}
752 
753 		/*
754 		 * allocate a sub-buf for this piece of the i/o
755 		 * (or just use mbp if there's only 1 piece),
756 		 * and start it going.
757 		 */
758 
759 		if (offset == startoffset && iobytes == bytes) {
760 			bp = mbp;
761 		} else {
762 			s = splbio();
763 			bp = pool_get(&bufpool, PR_WAITOK);
764 			splx(s);
765 			bp->b_data = (char *)kva + offset - startoffset;
766 			bp->b_resid = bp->b_bcount = iobytes;
767 			bp->b_flags = B_BUSY|B_READ|B_CALL;
768 			bp->b_iodone = uvm_aio_biodone1;
769 			bp->b_vp = vp;
770 			bp->b_proc = NULL;
771 			LIST_INIT(&bp->b_dep);
772 		}
773 		bp->b_lblkno = 0;
774 		bp->b_private = mbp;
775 		if (devvp->v_type == VBLK) {
776 			bp->b_dev = devvp->v_rdev;
777 		}
778 
779 		/* adjust physical blkno for partial blocks */
780 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
781 				       dev_bshift);
782 
783 		UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
784 			    bp, offset, iobytes, bp->b_blkno);
785 
786 		VOP_STRATEGY(bp);
787 	}
788 
789 loopdone:
790 	if (skipbytes) {
791 		s = splbio();
792 		if (error) {
793 			mbp->b_flags |= B_ERROR;
794 			mbp->b_error = error;
795 		}
796 		mbp->b_resid -= skipbytes;
797 		if (mbp->b_resid == 0) {
798 			biodone(mbp);
799 		}
800 		splx(s);
801 	}
802 
803 	if (async) {
804 		UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
805 		lockmgr(&gp->g_glock, LK_RELEASE, NULL);
806 		return 0;
807 	}
808 	if (bp != NULL) {
809 		error = biowait(mbp);
810 	}
811 	s = splbio();
812 	pool_put(&bufpool, mbp);
813 	splx(s);
814 	uvm_pagermapout(kva, npages);
815 	raoffset = startoffset + totalbytes;
816 
817 	/*
818 	 * if we encountered a hole then we have to do a little more work.
819 	 * for read faults, we marked the page PG_RDONLY so that future
820 	 * write accesses to the page will fault again.
821 	 * for write faults, we must make sure that the backing store for
822 	 * the page is completely allocated while the pages are locked.
823 	 */
824 
825 	if (!error && sawhole && write) {
826 		for (i = 0; i < npages; i++) {
827 			if (pgs[i] == NULL) {
828 				continue;
829 			}
830 			pgs[i]->flags &= ~PG_CLEAN;
831 			UVMHIST_LOG(ubchist, "mark dirty pg %p", pgs[i],0,0,0);
832 		}
833 		error = GOP_ALLOC(vp, startoffset, npages << PAGE_SHIFT, 0,
834 				  cred);
835 		UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d",
836 		    startoffset, npages << PAGE_SHIFT, error,0);
837 	}
838 	lockmgr(&gp->g_glock, LK_RELEASE, NULL);
839 	simple_lock(&uobj->vmobjlock);
840 
841 	/*
842 	 * see if we want to start any readahead.
843 	 * XXXUBC for now, just read the next 128k on 64k boundaries.
844 	 * this is pretty nonsensical, but it is 50% faster than reading
845 	 * just the next 64k.
846 	 */
847 
848 raout:
849 	if (!error && !async && !write && ((int)raoffset & 0xffff) == 0 &&
850 	    PAGE_SHIFT <= 16) {
851 		int racount;
852 
853 		racount = 1 << (16 - PAGE_SHIFT);
854 		(void) VOP_GETPAGES(vp, raoffset, NULL, &racount, 0,
855 				    VM_PROT_READ, 0, 0);
856 		simple_lock(&uobj->vmobjlock);
857 
858 		racount = 1 << (16 - PAGE_SHIFT);
859 		(void) VOP_GETPAGES(vp, raoffset + 0x10000, NULL, &racount, 0,
860 				    VM_PROT_READ, 0, 0);
861 		simple_lock(&uobj->vmobjlock);
862 	}
863 
864 	/*
865 	 * we're almost done!  release the pages...
866 	 * for errors, we free the pages.
867 	 * otherwise we activate them and mark them as valid and clean.
868 	 * also, unbusy pages that were not actually requested.
869 	 */
870 
871 	if (error) {
872 		for (i = 0; i < npages; i++) {
873 			if (pgs[i] == NULL) {
874 				continue;
875 			}
876 			UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
877 				    pgs[i], pgs[i]->flags, 0,0);
878 			if (pgs[i]->flags & PG_FAKE) {
879 				pgs[i]->flags |= PG_RELEASED;
880 			}
881 		}
882 		uvm_lock_pageq();
883 		uvm_page_unbusy(pgs, npages);
884 		uvm_unlock_pageq();
885 		simple_unlock(&uobj->vmobjlock);
886 		UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0);
887 		return error;
888 	}
889 
890 out:
891 	UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0);
892 	uvm_lock_pageq();
893 	for (i = 0; i < npages; i++) {
894 		pg = pgs[i];
895 		if (pg == NULL) {
896 			continue;
897 		}
898 		UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
899 			    pg, pg->flags, 0,0);
900 		if (pg->flags & PG_FAKE && !overwrite) {
901 			pg->flags &= ~(PG_FAKE);
902 			pmap_clear_modify(pgs[i]);
903 		}
904 		if (write) {
905 			pg->flags &= ~(PG_RDONLY);
906 		}
907 		if (i < ridx || i >= ridx + orignpages || async) {
908 			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
909 				    pg, pg->offset,0,0);
910 			if (pg->flags & PG_WANTED) {
911 				wakeup(pg);
912 			}
913 			if (pg->flags & PG_FAKE) {
914 				KASSERT(overwrite);
915 				uvm_pagezero(pg);
916 			}
917 			if (pg->flags & PG_RELEASED) {
918 				uvm_pagefree(pg);
919 				continue;
920 			}
921 			uvm_pageactivate(pg);
922 			pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE);
923 			UVM_PAGE_OWN(pg, NULL);
924 		}
925 	}
926 	uvm_unlock_pageq();
927 	simple_unlock(&uobj->vmobjlock);
928 	if (ap->a_m != NULL) {
929 		memcpy(ap->a_m, &pgs[ridx],
930 		       orignpages * sizeof(struct vm_page *));
931 	}
932 	return 0;
933 }
934 
935 /*
936  * generic VM putpages routine.
937  * Write the given range of pages to backing store.
938  *
939  * => "offhi == 0" means flush all pages at or after "offlo".
940  * => object should be locked by caller.   we may _unlock_ the object
941  *	if (and only if) we need to clean a page (PGO_CLEANIT), or
942  *	if PGO_SYNCIO is set and there are pages busy.
943  *	we return with the object locked.
944  * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
945  *	thus, a caller might want to unlock higher level resources
946  *	(e.g. vm_map) before calling flush.
947  * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, then we will neither
948  *	unlock the object nor block.
949  * => if PGO_ALLPAGES is set, then all pages in the object will be processed.
950  * => NOTE: we rely on the fact that the object's memq is a TAILQ and
951  *	that new pages are inserted on the tail end of the list.   thus,
952  *	we can make a complete pass through the object in one go by starting
953  *	at the head and working towards the tail (new pages are put in
954  *	front of us).
955  * => NOTE: we are allowed to lock the page queues, so the caller
956  *	must not be holding the page queue lock.
957  *
958  * note on "cleaning" object and PG_BUSY pages:
959  *	this routine is holding the lock on the object.   the only time
960  *	that it can run into a PG_BUSY page that it does not own is if
961  *	some other process has started I/O on the page (e.g. either
962  *	a pagein, or a pageout).    if the PG_BUSY page is being paged
963  *	in, then it can not be dirty (!PG_CLEAN) because no one has
964  *	had a chance to modify it yet.    if the PG_BUSY page is being
965  *	paged out then it means that someone else has already started
966  *	cleaning the page for us (how nice!).    in this case, if we
967  *	have syncio specified, then after we make our pass through the
968  *	object we need to wait for the other PG_BUSY pages to clear
969  *	off (i.e. we need to do an iosync).   also note that once a
970  *	page is PG_BUSY it must stay in its object until it is un-busyed.
971  *
972  * note on page traversal:
973  *	we can traverse the pages in an object either by going down the
974  *	linked list in "uobj->memq", or we can go over the address range
975  *	by page doing hash table lookups for each address.    depending
976  *	on how many pages are in the object it may be cheaper to do one
977  *	or the other.   we set "by_list" to true if we are using memq.
978  *	if the cost of a hash lookup was equal to the cost of the list
979  *	traversal we could compare the number of pages in the start->stop
980  *	range to the total number of pages in the object.   however, it
981  *	seems that a hash table lookup is more expensive than the linked
982  *	list traversal, so we multiply the number of pages in the
983  *	range by an estimate of the relatively higher cost of the hash lookup.
984  */
985 
986 int
987 genfs_putpages(v)
988 	void *v;
989 {
990 	struct vop_putpages_args /* {
991 		struct vnode *a_vp;
992 		voff_t a_offlo;
993 		voff_t a_offhi;
994 		int a_flags;
995 	} */ *ap = v;
996 	struct vnode *vp = ap->a_vp;
997 	struct uvm_object *uobj = &vp->v_uobj;
998 	off_t startoff = ap->a_offlo;
999 	off_t endoff = ap->a_offhi;
1000 	off_t off;
1001 	int flags = ap->a_flags;
1002 	int n = MAXBSIZE >> PAGE_SHIFT;
1003 	int i, s, error, npages, nback;
1004 	int freeflag;
1005 	struct vm_page *pgs[n], *pg, *nextpg, *tpg, curmp, endmp;
1006 	boolean_t wasclean, by_list, needs_clean;
1007 	boolean_t async = (flags & PGO_SYNCIO) == 0;
1008 	UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
1009 
1010 	KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
1011 	KASSERT((startoff & PAGE_MASK) == 0 && (endoff & PAGE_MASK) == 0);
1012 	KASSERT(startoff < endoff || endoff == 0);
1013 
1014 	UVMHIST_LOG(ubchist, "vp %p pages %d off 0x%x len 0x%x",
1015 	    vp, uobj->uo_npages, startoff, endoff - startoff);
1016 	if (uobj->uo_npages == 0) {
1017 		if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
1018 		    (vp->v_flag & VONWORKLST)) {
1019 			vp->v_flag &= ~VONWORKLST;
1020 			LIST_REMOVE(vp, v_synclist);
1021 		}
1022 		simple_unlock(&uobj->vmobjlock);
1023 		return 0;
1024 	}
1025 
1026 	/*
1027 	 * the vnode has pages, set up to process the request.
1028 	 */
1029 
1030 	error = 0;
1031 	wasclean = TRUE;
1032 	off = startoff;
1033 	if (endoff == 0 || flags & PGO_ALLPAGES) {
1034 		endoff = trunc_page(LLONG_MAX);
1035 	}
1036 	by_list = (uobj->uo_npages <=
1037 	    ((endoff - startoff) >> PAGE_SHIFT) * UVM_PAGE_HASH_PENALTY);
1038 
1039 	/*
1040 	 * start the loop.  when scanning by list, hold the last page
1041 	 * in the list before we start.  pages allocated after we start
1042 	 * will be added to the end of the list, so we can stop at the
1043 	 * current last page.
1044 	 */
1045 
1046 	freeflag = (curproc == uvm.pagedaemon_proc) ? PG_PAGEOUT : PG_RELEASED;
1047 	curmp.uobject = uobj;
1048 	curmp.offset = (voff_t)-1;
1049 	curmp.flags = PG_BUSY;
1050 	endmp.uobject = uobj;
1051 	endmp.offset = (voff_t)-1;
1052 	endmp.flags = PG_BUSY;
1053 	if (by_list) {
1054 		pg = TAILQ_FIRST(&uobj->memq);
1055 		TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq);
1056 		PHOLD(curproc);
1057 	} else {
1058 		pg = uvm_pagelookup(uobj, off);
1059 	}
1060 	nextpg = NULL;
1061 	while (by_list || off < endoff) {
1062 
1063 		/*
1064 		 * if the current page is not interesting, move on to the next.
1065 		 */
1066 
1067 		KASSERT(pg == NULL || pg->uobject == uobj);
1068 		KASSERT(pg == NULL ||
1069 			(pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1070 			(pg->flags & PG_BUSY) != 0);
1071 		if (by_list) {
1072 			if (pg == &endmp) {
1073 				break;
1074 			}
1075 			if (pg->offset < startoff || pg->offset >= endoff ||
1076 			    pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
1077 				pg = TAILQ_NEXT(pg, listq);
1078 				continue;
1079 			}
1080 			off = pg->offset;
1081 		} else if (pg == NULL || pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
1082 			off += PAGE_SIZE;
1083 			if (off < endoff) {
1084 				pg = uvm_pagelookup(uobj, off);
1085 			}
1086 			continue;
1087 		}
1088 
1089 		/*
1090 		 * if the current page needs to be cleaned and it's busy,
1091 		 * wait for it to become unbusy.
1092 		 */
1093 
1094 		if (flags & PGO_FREE) {
1095 			pmap_page_protect(pg, VM_PROT_NONE);
1096 		}
1097 		if (flags & PGO_CLEANIT) {
1098 			needs_clean = pmap_clear_modify(pg) ||
1099 				(pg->flags & PG_CLEAN) == 0;
1100 			pg->flags |= PG_CLEAN;
1101 		} else {
1102 			needs_clean = FALSE;
1103 		}
1104 		if (needs_clean && pg->flags & PG_BUSY) {
1105 			KASSERT(curproc != uvm.pagedaemon_proc);
1106 			UVMHIST_LOG(ubchist, "busy %p", pg,0,0,0);
1107 			if (by_list) {
1108 				TAILQ_INSERT_BEFORE(pg, &curmp, listq);
1109 				UVMHIST_LOG(ubchist, "curmp next %p",
1110 					    TAILQ_NEXT(&curmp, listq), 0,0,0);
1111 			}
1112 			pg->flags |= PG_WANTED;
1113 			pg->flags &= ~PG_CLEAN;
1114 			UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0,
1115 			    "genput", 0);
1116 			simple_lock(&uobj->vmobjlock);
1117 			if (by_list) {
1118 				UVMHIST_LOG(ubchist, "after next %p",
1119 					    TAILQ_NEXT(&curmp, listq), 0,0,0);
1120 				pg = TAILQ_NEXT(&curmp, listq);
1121 				TAILQ_REMOVE(&uobj->memq, &curmp, listq);
1122 			} else {
1123 				pg = uvm_pagelookup(uobj, off);
1124 			}
1125 			continue;
1126 		}
1127 
1128 		/*
1129 		 * if we're cleaning, build a cluster.
1130 		 * the cluster will consist of pages which are currently dirty,
1131 		 * but they will be returned to us marked clean.
1132 		 * if not cleaning, just operate on the one page.
1133 		 */
1134 
1135 		if (needs_clean) {
1136 			wasclean = FALSE;
1137 			memset(pgs, 0, sizeof(pgs));
1138 			pg->flags |= PG_BUSY;
1139 			UVM_PAGE_OWN(pg, "genfs_putpages");
1140 
1141 			/*
1142 			 * first look backward.
1143 			 */
1144 
1145 			npages = MIN(n >> 1, off >> PAGE_SHIFT);
1146 			nback = npages;
1147 			uvn_findpages(uobj, off - PAGE_SIZE, &nback, &pgs[0],
1148 			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
1149 			if (nback) {
1150 				memmove(&pgs[0], &pgs[npages - nback],
1151 				    nback * sizeof(pgs[0]));
1152 			}
1153 			n -= nback;
1154 
1155 			/*
1156 			 * then plug in our page of interest.
1157 			 */
1158 
1159 			pgs[nback] = pg;
1160 
1161 			/*
1162 			 * then look forward to fill in the remaining space in
1163 			 * the array of pages.
1164 			 */
1165 
1166 			npages = MIN(n, (endoff - off) >> PAGE_SHIFT) - 1;
1167 			uvn_findpages(uobj, off + PAGE_SIZE, &npages,
1168 			    &pgs[nback + 1],
1169 			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
1170 			npages += nback + 1;
1171 		} else {
1172 			pgs[0] = pg;
1173 			npages = 1;
1174 		}
1175 
1176 		/*
1177 		 * apply FREE or DEACTIVATE options if requested.
1178 		 */
1179 
1180 		if (flags & (PGO_DEACTIVATE|PGO_FREE)) {
1181 			uvm_lock_pageq();
1182 		}
1183 		for (i = 0; i < npages; i++) {
1184 			tpg = pgs[i];
1185 			KASSERT(tpg->uobject == uobj);
1186 			if (flags & PGO_DEACTIVATE &&
1187 			    (tpg->pqflags & PQ_INACTIVE) == 0 &&
1188 			    tpg->wire_count == 0) {
1189 				(void) pmap_clear_reference(tpg);
1190 				uvm_pagedeactivate(tpg);
1191 			} else if (flags & PGO_FREE) {
1192 				pmap_page_protect(tpg, VM_PROT_NONE);
1193 				if (tpg->flags & PG_BUSY) {
1194 					tpg->flags |= freeflag;
1195 					if (freeflag == PG_PAGEOUT) {
1196 						uvmexp.paging++;
1197 						uvm_pagedequeue(tpg);
1198 					}
1199 				} else {
1200 					nextpg = TAILQ_NEXT(tpg, listq);
1201 					uvm_pagefree(tpg);
1202 				}
1203 			}
1204 		}
1205 		if (flags & (PGO_DEACTIVATE|PGO_FREE)) {
1206 			uvm_unlock_pageq();
1207 		}
1208 		if (needs_clean) {
1209 
1210 			/*
1211 			 * start the i/o.  if we're traversing by list,
1212 			 * keep our place in the list with a marker page.
1213 			 */
1214 
1215 			if (by_list) {
1216 				TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp,
1217 				    listq);
1218 			}
1219 			simple_unlock(&uobj->vmobjlock);
1220 			error = GOP_WRITE(vp, pgs, npages, flags);
1221 			simple_lock(&uobj->vmobjlock);
1222 			if (by_list) {
1223 				pg = TAILQ_NEXT(&curmp, listq);
1224 				TAILQ_REMOVE(&uobj->memq, &curmp, listq);
1225 			}
1226 			if (error == ENOMEM) {
1227 				for (i = 0; i < npages; i++) {
1228 					tpg = pgs[i];
1229 					if (tpg->flags & PG_PAGEOUT) {
1230 						tpg->flags &= ~PG_PAGEOUT;
1231 						uvmexp.paging--;
1232 					}
1233 					tpg->flags &= ~PG_CLEAN;
1234 					uvm_pageactivate(tpg);
1235 				}
1236 				uvm_page_unbusy(pgs, npages);
1237 			}
1238 			if (error) {
1239 				break;
1240 			}
1241 			if (by_list) {
1242 				continue;
1243 			}
1244 		}
1245 
1246 		/*
1247 		 * find the next page and continue if there was no error.
1248 		 */
1249 
1250 		if (by_list) {
1251 			if (nextpg) {
1252 				pg = nextpg;
1253 				nextpg = NULL;
1254 			} else {
1255 				pg = TAILQ_NEXT(pg, listq);
1256 			}
1257 		} else {
1258 			off += PAGE_SIZE;
1259 			if (off < endoff) {
1260 				pg = uvm_pagelookup(uobj, off);
1261 			}
1262 		}
1263 	}
1264 	if (by_list) {
1265 		TAILQ_REMOVE(&uobj->memq, &endmp, listq);
1266 		PRELE(curproc);
1267 	}
1268 
1269 	/*
1270 	 * if we're cleaning and there was nothing to clean,
1271 	 * take us off the syncer list.  if we started any i/o
1272 	 * and we're doing sync i/o, wait for all writes to finish.
1273 	 */
1274 
1275 	if ((flags & PGO_CLEANIT) && wasclean &&
1276 	    startoff == 0 && endoff == trunc_page(LLONG_MAX) &&
1277 	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
1278 	    (vp->v_flag & VONWORKLST)) {
1279 		vp->v_flag &= ~VONWORKLST;
1280 		LIST_REMOVE(vp, v_synclist);
1281 	}
1282 	if (!wasclean && !async) {
1283 		s = splbio();
1284 		while (vp->v_numoutput != 0) {
1285 			vp->v_flag |= VBWAIT;
1286 			UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, &uobj->vmobjlock,
1287 					    FALSE, "genput2",0);
1288 			simple_lock(&uobj->vmobjlock);
1289 		}
1290 		splx(s);
1291 	}
1292 	simple_unlock(&uobj->vmobjlock);
1293 	return error;
1294 }
1295 
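/*
 * Generic GOP_WRITE routine, normally reached via GOP_WRITE() from
 * genfs_putpages() above: map the pages into kernel virtual space and,
 * for each contiguous on-disk extent reported by VOP_BMAP(), issue a
 * buf through VOP_STRATEGY() (reusing the master buf when the request
 * is a single extent).  Unallocated blocks are simply skipped.
 * Async requests return as soon as the i/o is started; otherwise we
 * wait for the master buf and finish the i/o with uvm_aio_aiodone().
 */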
1296 int
1297 genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
1298 {
1299 	int s, error, run;
1300 	int fs_bshift, dev_bshift;
1301 	vaddr_t kva;
1302 	off_t eof, offset, startoffset;
1303 	size_t bytes, iobytes, skipbytes;
1304 	daddr_t lbn, blkno;
1305 	struct vm_page *pg;
1306 	struct buf *mbp, *bp;
1307 	struct vnode *devvp;
1308 	boolean_t async = (flags & PGO_SYNCIO) == 0;
1309 	UVMHIST_FUNC("genfs_gop_write"); UVMHIST_CALLED(ubchist);
1310 
1311 	UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
1312 	    vp, pgs, npages, flags);
1313 
1314 	GOP_SIZE(vp, vp->v_size, &eof);
1315 	if (vp->v_type == VREG) {
1316 		fs_bshift = vp->v_mount->mnt_fs_bshift;
1317 		dev_bshift = vp->v_mount->mnt_dev_bshift;
1318 	} else {
1319 		fs_bshift = DEV_BSHIFT;
1320 		dev_bshift = DEV_BSHIFT;
1321 	}
1322 	error = 0;
1323 	pg = pgs[0];
1324 	startoffset = pg->offset;
1325 	bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
1326 	skipbytes = 0;
1327 	KASSERT(bytes != 0);
1328 
1329 	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
1330 			     UVMPAGER_MAPIN_WAITOK);
1331 
1332 	s = splbio();
1333 	vp->v_numoutput += 2;
1334 	mbp = pool_get(&bufpool, PR_WAITOK);
1335 	UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
1336 		    vp, mbp, vp->v_numoutput, bytes);
1337 	splx(s);
1338 	mbp->b_bufsize = npages << PAGE_SHIFT;
1339 	mbp->b_data = (void *)kva;
1340 	mbp->b_resid = mbp->b_bcount = bytes;
1341 	mbp->b_flags = B_BUSY|B_WRITE|B_AGE| (async ? B_CALL : 0);
1342 	mbp->b_iodone = uvm_aio_biodone;
1343 	mbp->b_vp = vp;
1344 	LIST_INIT(&mbp->b_dep);
1345 
1346 	bp = NULL;
1347 	for (offset = startoffset;
1348 	     bytes > 0;
1349 	     offset += iobytes, bytes -= iobytes) {
1350 		lbn = offset >> fs_bshift;
1351 		error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
1352 		if (error) {
1353 			UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0);
1354 			skipbytes += bytes;
1355 			bytes = 0;
1356 			break;
1357 		}
1358 
1359 		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
1360 		    bytes);
1361 		if (blkno == (daddr_t)-1) {
1362 			skipbytes += iobytes;
1363 			continue;
1364 		}
1365 
1366 		/* if it's really one i/o, don't make a second buf */
1367 		if (offset == startoffset && iobytes == bytes) {
1368 			bp = mbp;
1369 		} else {
1370 			s = splbio();
1371 			vp->v_numoutput++;
1372 			bp = pool_get(&bufpool, PR_WAITOK);
1373 			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
1374 				    vp, bp, vp->v_numoutput, 0);
1375 			splx(s);
1376 			bp->b_data = (char *)kva +
1377 				(vaddr_t)(offset - pg->offset);
1378 			bp->b_resid = bp->b_bcount = iobytes;
1379 			bp->b_flags = B_BUSY|B_WRITE|B_CALL;
1380 			bp->b_iodone = uvm_aio_biodone1;
1381 			bp->b_vp = vp;
1382 			LIST_INIT(&bp->b_dep);
1383 		}
1384 		bp->b_lblkno = 0;
1385 		bp->b_private = mbp;
1386 		if (devvp->v_type == VBLK) {
1387 			bp->b_dev = devvp->v_rdev;
1388 		}
1389 
1390 		/* adjust physical blkno for partial blocks */
1391 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
1392 				       dev_bshift);
1393 		UVMHIST_LOG(ubchist, "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
1394 			    vp, offset, bp->b_bcount, bp->b_blkno);
1395 		VOP_STRATEGY(bp);
1396 	}
1397 	if (skipbytes) {
1398 		UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0);
1399 		s = splbio();
1400 		if (error) {
1401 			mbp->b_flags |= B_ERROR;
1402 			mbp->b_error = error;
1403 		}
1404 		mbp->b_resid -= skipbytes;
1405 		if (mbp->b_resid == 0) {
1406 			biodone(mbp);
1407 		}
1408 		splx(s);
1409 	}
1410 	if (async) {
1411 		UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
1412 		return 0;
1413 	}
1414 	UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
1415 	error = biowait(mbp);
1416 	uvm_aio_aiodone(mbp);
1417 	UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
1418 	return error;
1419 }
1420 
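/*
 * Initialize the genfs portion of a vnode: set up the per-node glock
 * (used above to serialize getpages with block allocation) and record
 * the filesystem's genfs_ops for later GOP_*() calls.
 */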
1421 void
1422 genfs_node_init(struct vnode *vp, struct genfs_ops *ops)
1423 {
1424 	struct genfs_node *gp = VTOG(vp);
1425 
1426 	lockinit(&gp->g_glock, PINOD, "glock", 0, 0);
1427 	gp->g_op = ops;
1428 }
1429 
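/*
 * Generic GOP_SIZE: round the given size up to a filesystem-block
 * boundary to find the end of the backing object.
 */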
1430 void
1431 genfs_size(struct vnode *vp, off_t size, off_t *eobp)
1432 {
1433 	int bsize;
1434 
1435 	bsize = 1 << vp->v_mount->mnt_fs_bshift;
1436 	*eobp = (size + bsize - 1) & ~(bsize - 1);
1437 }
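/*
 * Usage sketch (illustrative only, not taken from any particular
 * filesystem): a filesystem that is happy with the defaults above
 * points its genfs_ops at genfs_size and genfs_gop_write and attaches
 * them to each vnode it sets up, e.g.
 *
 *	static struct genfs_ops examplefs_genfsops = {
 *		genfs_size,
 *		examplefs_gop_alloc,
 *		genfs_gop_write,
 *	};
 *
 *	void
 *	examplefs_init_vnode(struct vnode *vp)
 *	{
 *		genfs_node_init(vp, &examplefs_genfsops);
 *	}
 *
 * The member order assumed here is gop_size, gop_alloc, gop_write;
 * see genfs_node.h for the authoritative layout.  "examplefs" and
 * examplefs_gop_alloc() are hypothetical names used only for this
 * illustration.
 */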
1438