xref: /netbsd-src/sys/miscfs/genfs/genfs_vnops.c (revision 3b01aba77a7a698587faaae455bbfe740923c1f5)
1 /*	$NetBSD: genfs_vnops.c,v 1.35 2001/06/14 08:22:14 chs Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  */
36 
37 #include "opt_nfsserver.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/kernel.h>
43 #include <sys/mount.h>
44 #include <sys/namei.h>
45 #include <sys/vnode.h>
46 #include <sys/fcntl.h>
47 #include <sys/malloc.h>
48 #include <sys/poll.h>
49 
50 #include <miscfs/genfs/genfs.h>
51 #include <miscfs/specfs/specdev.h>
52 
53 #include <uvm/uvm.h>
54 #include <uvm/uvm_pager.h>
55 
56 #ifdef NFSSERVER
57 #include <nfs/rpcv2.h>
58 #include <nfs/nfsproto.h>
59 #include <nfs/nfs.h>
60 #include <nfs/nqnfs.h>
61 #include <nfs/nfs_var.h>
62 #endif
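
/*
 * The routines below are generic implementations of vnode operations
 * that a file system can reference directly from its vnodeop table
 * whenever it needs no behaviour of its own.  A rough sketch of such
 * table entries (for a hypothetical file system; the descriptor names
 * follow the usual vnode_if convention):
 *
 *	{ &vop_poll_desc, genfs_poll },
 *	{ &vop_seek_desc, genfs_seek },
 *	{ &vop_abortop_desc, genfs_abortop },
 *	{ &vop_mmap_desc, genfs_mmap },
 */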
63 
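/*
 * Generic poll: the object is always considered ready for normal read
 * and write, so just hand back the subset of those events that was
 * asked for.
 */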
64 int
65 genfs_poll(v)
66 	void *v;
67 {
68 	struct vop_poll_args /* {
69 		struct vnode *a_vp;
70 		int a_events;
71 		struct proc *a_p;
72 	} */ *ap = v;
73 
74 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
75 }
76 
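/*
 * Generic fsync: flush the vnode's dirty buffers, waiting for the i/o
 * if FSYNC_WAIT is set, and then write the metadata back via
 * VOP_UPDATE() unless the caller asked for data only (FSYNC_DATAONLY).
 */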
77 int
78 genfs_fsync(v)
79 	void *v;
80 {
81 	struct vop_fsync_args /* {
82 		struct vnode *a_vp;
83 		struct ucred *a_cred;
84 		int a_flags;
85 		off_t a_offlo;
86 		off_t a_offhi;
87 		struct proc *a_p;
88 	} */ *ap = v;
89 	struct vnode *vp = ap->a_vp;
90 	int wait;
91 
92 	wait = (ap->a_flags & FSYNC_WAIT) != 0;
93 	vflushbuf(vp, wait);
94 	if ((ap->a_flags & FSYNC_DATAONLY) != 0)
95 		return (0);
96 	else
97 		return (VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0));
98 }
99 
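/*
 * Generic seek: there is no fs-specific work to do here, just reject
 * negative offsets.
 */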
100 int
101 genfs_seek(v)
102 	void *v;
103 {
104 	struct vop_seek_args /* {
105 		struct vnode *a_vp;
106 		off_t a_oldoff;
107 		off_t a_newoff;
108 		struct ucred *a_ucred;
109 		struct ucred *a_cred;
110 
111 	if (ap->a_newoff < 0)
112 		return (EINVAL);
113 
114 	return (0);
115 }
116 
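/*
 * Generic abortop: free the pathname buffer left over from a lookup
 * if one was allocated (HASBUF) and the caller will not reuse it
 * (SAVESTART not set).
 */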
117 int
118 genfs_abortop(v)
119 	void *v;
120 {
121 	struct vop_abortop_args /* {
122 		struct vnode *a_dvp;
123 		struct componentname *a_cnp;
124 	} */ *ap = v;
125 
126 	if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF)
127 		PNBUF_PUT(ap->a_cnp->cn_pnbuf);
128 	return (0);
129 }
130 
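/*
 * Generic fcntl: F_SETFL is accepted as a no-op, anything else is
 * unsupported.
 */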
131 int
132 genfs_fcntl(v)
133 	void *v;
134 {
135 	struct vop_fcntl_args /* {
136 		struct vnode *a_vp;
137 		u_int a_command;
138 		caddr_t a_data;
139 		int a_fflag;
140 		struct ucred *a_cred;
141 		struct proc *a_p;
142 	} */ *ap = v;
143 
144 	if (ap->a_command == F_SETFL)
145 		return (0);
146 	else
147 		return (EOPNOTSUPP);
148 }
149 
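/*
 * Trivial stubs for operations a file system does not implement:
 * panic, succeed silently, or fail with a fixed errno.
 */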
150 /*ARGSUSED*/
151 int
152 genfs_badop(v)
153 	void *v;
154 {
155 
156 	panic("genfs: bad op");
157 }
158 
159 /*ARGSUSED*/
160 int
161 genfs_nullop(v)
162 	void *v;
163 {
164 
165 	return (0);
166 }
167 
168 /*ARGSUSED*/
169 int
170 genfs_einval(v)
171 	void *v;
172 {
173 
174 	return (EINVAL);
175 }
176 
177 /*ARGSUSED*/
178 int
179 genfs_eopnotsupp(v)
180 	void *v;
181 {
182 
183 	return (EOPNOTSUPP);
184 }
185 
186 /*
187  * Called when an fs doesn't support a particular vop but the vop needs to
188  * vrele, vput, or vunlock passed in vnodes.
189  */
190 int
191 genfs_eopnotsupp_rele(v)
192 	void *v;
193 {
194 	struct vop_generic_args /* {
195 		struct vnodeop_desc *a_desc;
196 		/ * other random data follows, presumably * /
197 	} */ *ap = v;
198 	struct vnodeop_desc *desc = ap->a_desc;
199 	struct vnode *vp;
200 	int flags, i, j, offset;
201 
202 	flags = desc->vdesc_flags;
203 	for (i = 0; i < VDESC_MAX_VPS; flags >>= 1, i++) {
204 		if ((offset = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET)
205 			break;	/* stop at end of list */
206 		if ((j = flags & VDESC_VP0_WILLPUT)) {
207 			vp = *VOPARG_OFFSETTO(struct vnode**,offset,ap);
208 			switch (j) {
209 			case VDESC_VP0_WILLPUT:
210 				vput(vp);
211 				break;
212 			case VDESC_VP0_WILLUNLOCK:
213 				VOP_UNLOCK(vp, 0);
214 				break;
215 			case VDESC_VP0_WILLRELE:
216 				vrele(vp);
217 				break;
218 			}
219 		}
220 	}
221 
222 	return (EOPNOTSUPP);
223 }
224 
225 /*ARGSUSED*/
226 int
227 genfs_ebadf(v)
228 	void *v;
229 {
230 
231 	return (EBADF);
232 }
233 
234 /* ARGSUSED */
235 int
236 genfs_enoioctl(v)
237 	void *v;
238 {
239 
240 	return (ENOTTY);
241 }
242 
243 
244 /*
245  * Eliminate all activity associated with the requested vnode
246  * and with all vnodes aliased to the requested vnode.
247  */
248 int
249 genfs_revoke(v)
250 	void *v;
251 {
252 	struct vop_revoke_args /* {
253 		struct vnode *a_vp;
254 		int a_flags;
255 	} */ *ap = v;
256 	struct vnode *vp, *vq;
257 	struct proc *p = curproc;	/* XXX */
258 
259 #ifdef DIAGNOSTIC
260 	if ((ap->a_flags & REVOKEALL) == 0)
261 		panic("genfs_revoke: not revokeall");
262 #endif
263 
264 	vp = ap->a_vp;
265 	simple_lock(&vp->v_interlock);
266 
267 	if (vp->v_flag & VALIASED) {
268 		/*
269 		 * If a vgone (or vclean) is already in progress,
270 		 * wait until it is done and return.
271 		 */
272 		if (vp->v_flag & VXLOCK) {
273 			vp->v_flag |= VXWANT;
274 			simple_unlock(&vp->v_interlock);
275 			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
276 			return (0);
277 		}
278 		/*
279 		 * Ensure that vp will not be vgone'd while we
280 		 * are eliminating its aliases.
281 		 */
282 		vp->v_flag |= VXLOCK;
283 		simple_unlock(&vp->v_interlock);
284 		while (vp->v_flag & VALIASED) {
285 			simple_lock(&spechash_slock);
286 			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
287 				if (vq->v_rdev != vp->v_rdev ||
288 				    vq->v_type != vp->v_type || vp == vq)
289 					continue;
290 				simple_unlock(&spechash_slock);
291 				vgone(vq);
292 				break;
293 			}
294 			if (vq == NULLVP)
295 				simple_unlock(&spechash_slock);
296 		}
297 		/*
298 		 * Remove the lock so that vgone below will
299 		 * really eliminate the vnode after which time
300 		 * vgone will awaken any sleepers.
301 		 */
302 		simple_lock(&vp->v_interlock);
303 		vp->v_flag &= ~VXLOCK;
304 	}
305 	vgonel(vp, p);
306 	return (0);
307 }
308 
309 /*
310  * Lock the node.
311  */
312 int
313 genfs_lock(v)
314 	void *v;
315 {
316 	struct vop_lock_args /* {
317 		struct vnode *a_vp;
318 		int a_flags;
319 	} */ *ap = v;
320 	struct vnode *vp = ap->a_vp;
321 
322 	return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock));
323 }
324 
325 /*
326  * Unlock the node.
327  */
328 int
329 genfs_unlock(v)
330 	void *v;
331 {
332 	struct vop_unlock_args /* {
333 		struct vnode *a_vp;
334 		int a_flags;
335 	} */ *ap = v;
336 	struct vnode *vp = ap->a_vp;
337 
338 	return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE,
339 		&vp->v_interlock));
340 }
341 
342 /*
343  * Return whether or not the node is locked.
344  */
345 int
346 genfs_islocked(v)
347 	void *v;
348 {
349 	struct vop_islocked_args /* {
350 		struct vnode *a_vp;
351 	} */ *ap = v;
352 	struct vnode *vp = ap->a_vp;
353 
354 	return (lockstatus(&vp->v_lock));
355 }
356 
357 /*
358  * Stubs to use when there is no locking to be done on the underlying object.
359  */
360 int
361 genfs_nolock(v)
362 	void *v;
363 {
364 	struct vop_lock_args /* {
365 		struct vnode *a_vp;
366 		int a_flags;
367 		struct proc *a_p;
368 	} */ *ap = v;
369 
370 	/*
371 	 * Since we are not using the lock manager, we must clear
372 	 * the interlock here.
373 	 */
374 	if (ap->a_flags & LK_INTERLOCK)
375 		simple_unlock(&ap->a_vp->v_interlock);
376 	return (0);
377 }
378 
379 int
380 genfs_nounlock(v)
381 	void *v;
382 {
383 	return (0);
384 }
385 
386 int
387 genfs_noislocked(v)
388 	void *v;
389 {
390 	return (0);
391 }
392 
393 /*
394  * Local lease check for NFS servers.  Just set up args and let
395  * nqsrv_getlease() do the rest.  If NFSSERVER is not in the kernel,
396  * this is a null operation.
397  */
398 int
399 genfs_lease_check(v)
400 	void *v;
401 {
402 #ifdef NFSSERVER
403 	struct vop_lease_args /* {
404 		struct vnode *a_vp;
405 		struct proc *a_p;
406 		struct ucred *a_cred;
407 		int a_flag;
408 	} */ *ap = v;
409 	u_int32_t duration = 0;
410 	int cache;
411 	u_quad_t frev;
412 
413 	(void) nqsrv_getlease(ap->a_vp, &duration, ND_CHECK | ap->a_flag,
414 	    NQLOCALSLP, ap->a_p, (struct mbuf *)0, &cache, &frev, ap->a_cred);
415 	return (0);
416 #else
417 	return (0);
418 #endif /* NFSSERVER */
419 }
420 
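/*
 * Generic mmap: just allow the mapping; the real work happens at
 * fault time through VOP_GETPAGES().
 */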
421 int
422 genfs_mmap(v)
423 	void *v;
424 {
425 	return 0;
426 }
427 
428 /*
429  * generic VM getpages routine.
430  * Return PG_BUSY pages for the given range,
431  * reading from backing store if necessary.
432  */
433 
434 int
435 genfs_getpages(v)
436 	void *v;
437 {
438 	struct vop_getpages_args /* {
439 		struct vnode *a_vp;
440 		voff_t a_offset;
441 		struct vm_page **a_m;
442 		int *a_count;
443 		int a_centeridx;
444 		vm_prot_t a_access_type;
445 		int a_advice;
446 		int a_flags;
447 	} */ *ap = v;
448 
449 	off_t newsize, diskeof, memeof;
450 	off_t offset, origoffset, startoffset, endoffset, raoffset;
451 	daddr_t lbn, blkno;
452 	int s, i, error, npages, orignpages, npgs, run, ridx, pidx, pcount;
453 	int fs_bshift, fs_bsize, dev_bshift, dev_bsize;
454 	int flags = ap->a_flags;
455 	size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
456 	vaddr_t kva;
457 	struct buf *bp, *mbp;
458 	struct vnode *vp = ap->a_vp;
459 	struct uvm_object *uobj = &vp->v_uvm.u_obj;
460 	struct vm_page *pgs[16];			/* XXXUBC 16 */
461 	struct ucred *cred = curproc->p_ucred;		/* XXXUBC curproc */
462 	boolean_t async = (flags & PGO_SYNCIO) == 0;
463 	boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
464 	boolean_t sawhole = FALSE;
465 	UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
466 
467 	UVMHIST_LOG(ubchist, "vp %p off 0x%x/%x count %d",
468 		    vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
469 
470 	/* XXXUBC temp limit */
471 	if (*ap->a_count > 16) {
472 		return EINVAL;
473 	}
474 
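	/*
	 * find the end of the object both on disk (diskeof) and in
	 * memory (memeof), each rounded out by VOP_SIZE().  with
	 * PGO_PASTEOF the caller intends to extend the file, so memory
	 * EOF is computed from the larger of the current size and the
	 * end of the requested range.
	 */
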
475 	error = 0;
476 	origoffset = ap->a_offset;
477 	orignpages = *ap->a_count;
478 	error = VOP_SIZE(vp, vp->v_uvm.u_size, &diskeof);
479 	if (error) {
480 		return error;
481 	}
482 	if (flags & PGO_PASTEOF) {
483 		newsize = MAX(vp->v_uvm.u_size,
484 			      origoffset + (orignpages << PAGE_SHIFT));
485 		error = VOP_SIZE(vp, newsize, &memeof);
486 		if (error) {
487 			return error;
488 		}
489 	} else {
490 		memeof = diskeof;
491 	}
492 	KASSERT(ap->a_centeridx >= 0 && ap->a_centeridx <= orignpages);
493 	KASSERT((origoffset & (PAGE_SIZE - 1)) == 0 && origoffset >= 0);
494 	KASSERT(orignpages > 0);
495 
496 	/*
497 	 * Bounds-check the request.
498 	 */
499 
500 	if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
501 		if ((flags & PGO_LOCKED) == 0) {
502 			simple_unlock(&uobj->vmobjlock);
503 		}
504 		UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x",
505 			    origoffset, *ap->a_count, memeof,0);
506 		return EINVAL;
507 	}
508 
509 	/*
510 	 * For PGO_LOCKED requests, just return whatever's in memory.
511 	 */
512 
513 	if (flags & PGO_LOCKED) {
514 		uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
515 			      UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY);
516 
517 		return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
518 	}
519 
520 	/* vnode is VOP_LOCKed, uobj is locked */
521 
522 	if (write && (vp->v_flag & VONWORKLST) == 0) {
523 		vn_syncer_add_to_worklist(vp, filedelay);
524 	}
525 
526 	/*
527 	 * find the requested pages and make some simple checks.
528 	 * leave space in the page array for a whole block.
529 	 */
530 
531 	fs_bshift = vp->v_mount->mnt_fs_bshift;
532 	fs_bsize = 1 << fs_bshift;
533 	dev_bshift = vp->v_mount->mnt_dev_bshift;
534 	dev_bsize = 1 << dev_bshift;
535 	KASSERT((diskeof & (dev_bsize - 1)) == 0);
536 	KASSERT((memeof & (dev_bsize - 1)) == 0);
537 
538 	orignpages = MIN(orignpages,
539 	    round_page(memeof - origoffset) >> PAGE_SHIFT);
540 	npages = orignpages;
541 	startoffset = origoffset & ~(fs_bsize - 1);
542 	endoffset = round_page((origoffset + (npages << PAGE_SHIFT)
543 				+ fs_bsize - 1) & ~(fs_bsize - 1));
544 	endoffset = MIN(endoffset, round_page(memeof));
545 	ridx = (origoffset - startoffset) >> PAGE_SHIFT;
546 
547 	memset(pgs, 0, sizeof(pgs));
548 	uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL);
549 
550 	/*
551 	 * if PGO_OVERWRITE is set, don't bother reading the pages.
552 	 * PGO_OVERWRITE also means that the caller guarantees
553 	 * that the pages already have backing store allocated.
554 	 */
555 
556 	if (flags & PGO_OVERWRITE) {
557 		UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
558 
559 		for (i = 0; i < npages; i++) {
560 			struct vm_page *pg = pgs[ridx + i];
561 
562 			if (pg->flags & PG_FAKE) {
563 				uvm_pagezero(pg);
564 				pg->flags &= ~(PG_FAKE);
565 			}
566 			pg->flags &= ~(PG_RDONLY);
567 		}
568 		npages += ridx;
569 		goto out;
570 	}
571 
572 	/*
573 	 * if the pages are already resident, just return them.
574 	 */
575 
576 	for (i = 0; i < npages; i++) {
577 		struct vm_page *pg = pgs[ridx + i];
578 
579 		if ((pg->flags & PG_FAKE) ||
580 		    (write && (pg->flags & PG_RDONLY))) {
581 			break;
582 		}
583 	}
584 	if (i == npages) {
585 		UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
586 		raoffset = origoffset + (orignpages << PAGE_SHIFT);
587 		npages += ridx;
588 		goto raout;
589 	}
590 
591 	/*
592 	 * at least one of the pages wasn't resident and we're not overwriting,
593 	 * so we're going to have to do some i/o.
594 	 * find any additional pages needed to cover the expanded range.
595 	 */
596 
597 	npages = (endoffset - startoffset) >> PAGE_SHIFT;
598 	if (startoffset != origoffset || npages != orignpages) {
599 
600 		/*
601 		 * XXXUBC we need to avoid deadlocks caused by locking
602 		 * additional pages at lower offsets than pages we
603 		 * already have locked.  for now, unlock them all and
604 		 * start over.
605 		 */
606 
607 		for (i = 0; i < orignpages; i++) {
608 			struct vm_page *pg = pgs[ridx + i];
609 
610 			if (pg->flags & PG_FAKE) {
611 				pg->flags |= PG_RELEASED;
612 			}
613 		}
614 		uvm_page_unbusy(&pgs[ridx], orignpages);
615 		memset(pgs, 0, sizeof(pgs));
616 
617 		UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x",
618 			    startoffset, endoffset, 0,0);
619 		npgs = npages;
620 		uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL);
621 	}
622 	simple_unlock(&uobj->vmobjlock);
623 
624 	/*
625 	 * read the desired page(s).
626 	 */
627 
628 	totalbytes = npages << PAGE_SHIFT;
629 	bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
630 	tailbytes = totalbytes - bytes;
631 	skipbytes = 0;
632 
633 	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
634 			     UVMPAGER_MAPIN_READ);
635 
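	/*
	 * set up the master buf covering the whole transfer.  sub-bufs
	 * allocated in the loop below point back at it through b_private
	 * so that the i/o completion code can finish the master once
	 * every piece is done.
	 */
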
636 	s = splbio();
637 	mbp = pool_get(&bufpool, PR_WAITOK);
638 	splx(s);
639 	mbp->b_bufsize = totalbytes;
640 	mbp->b_data = (void *)kva;
641 	mbp->b_resid = mbp->b_bcount = bytes;
642 	mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL : 0);
643 	mbp->b_iodone = uvm_aio_biodone;
644 	mbp->b_vp = vp;
645 	LIST_INIT(&mbp->b_dep);
646 
647 	/*
648 	 * if EOF is in the middle of the range, zero the part past EOF.
649 	 */
650 
651 	if (tailbytes > 0) {
652 		memset((void *)(kva + bytes), 0, tailbytes);
653 	}
654 
655 	/*
656 	 * now loop over the pages, reading as needed.
657 	 */
658 
659 	if (write) {
660 		lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL);
661 	} else {
662 		lockmgr(&vp->v_glock, LK_SHARED, NULL);
663 	}
664 
665 	bp = NULL;
666 	for (offset = startoffset;
667 	     bytes > 0;
668 	     offset += iobytes, bytes -= iobytes) {
669 
670 		/*
671 		 * skip pages which don't need to be read.
672 		 */
673 
674 		pidx = (offset - startoffset) >> PAGE_SHIFT;
675 		while ((pgs[pidx]->flags & (PG_FAKE|PG_RDONLY)) == 0) {
676 			size_t b;
677 
678 			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
679 			b = MIN(PAGE_SIZE, bytes);
680 			offset += b;
681 			bytes -= b;
682 			skipbytes += b;
683 			pidx++;
684 			UVMHIST_LOG(ubchist, "skipping, new offset 0x%x",
685 				    offset, 0,0,0);
686 			if (bytes == 0) {
687 				goto loopdone;
688 			}
689 		}
690 
691 		/*
692 		 * bmap the file to find out the blkno to read from and
693 		 * how much we can read in one i/o.  if bmap returns an error,
694 		 * skip the rest of the top-level i/o.
695 		 */
696 
697 		lbn = offset >> fs_bshift;
698 		error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
699 		if (error) {
700 			UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n",
701 				    lbn, error,0,0);
702 			skipbytes += bytes;
703 			goto loopdone;
704 		}
705 
706 		/*
707 		 * see how many pages can be read with this i/o.
708 		 * reduce the i/o size if necessary to avoid
709 		 * overwriting pages with valid data.
710 		 */
711 
712 		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
713 		    bytes);
714 		if (offset + iobytes > round_page(offset)) {
715 			pcount = 1;
716 			while (pidx + pcount < npages &&
717 			       pgs[pidx + pcount]->flags & PG_FAKE) {
718 				pcount++;
719 			}
720 			iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
721 				      (offset - trunc_page(offset)));
722 		}
723 
724 		/*
725 		 * if this block isn't allocated, zero it instead of reading it.
726 		 * if this is a read access, mark the pages we zeroed PG_RDONLY.
727 		 */
728 
729 		if (blkno < 0) {
730 			int holepages = (round_page(offset + iobytes) -
731 					 trunc_page(offset)) >> PAGE_SHIFT;
732 			UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE", lbn,0,0,0);
733 
734 			sawhole = TRUE;
735 			memset((char *)kva + (offset - startoffset), 0,
736 			       iobytes);
737 			skipbytes += iobytes;
738 
739 			for (i = 0; i < holepages; i++) {
740 				if (write) {
741 					pgs[pidx + i]->flags &= ~PG_CLEAN;
742 				} else {
743 					pgs[pidx + i]->flags |= PG_RDONLY;
744 				}
745 			}
746 			continue;
747 		}
748 
749 		/*
750 		 * allocate a sub-buf for this piece of the i/o
751 		 * (or just use mbp if there's only 1 piece),
752 		 * and start it going.
753 		 */
754 
755 		if (offset == startoffset && iobytes == bytes) {
756 			bp = mbp;
757 		} else {
758 			s = splbio();
759 			bp = pool_get(&bufpool, PR_WAITOK);
760 			splx(s);
761 			bp->b_data = (char *)kva + offset - startoffset;
762 			bp->b_resid = bp->b_bcount = iobytes;
763 			bp->b_flags = B_BUSY|B_READ|B_CALL;
764 			bp->b_iodone = uvm_aio_biodone1;
765 			bp->b_vp = vp;
766 			LIST_INIT(&bp->b_dep);
767 		}
768 		bp->b_lblkno = 0;
769 		bp->b_private = mbp;
770 
771 		/* adjust physical blkno for partial blocks */
772 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
773 				       dev_bshift);
774 
775 		UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
776 			    bp, offset, iobytes, bp->b_blkno);
777 
778 		VOP_STRATEGY(bp);
779 	}
780 
781 loopdone:
782 	if (skipbytes) {
783 		s = splbio();
784 		if (error) {
785 			mbp->b_flags |= B_ERROR;
786 			mbp->b_error = error;
787 		}
788 		mbp->b_resid -= skipbytes;
789 		if (mbp->b_resid == 0) {
790 			biodone(mbp);
791 		}
792 		splx(s);
793 	}
794 
795 	if (async) {
796 		UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
797 		lockmgr(&vp->v_glock, LK_RELEASE, NULL);
798 		return 0;
799 	}
800 	if (bp != NULL) {
801 		error = biowait(mbp);
802 	}
803 	s = splbio();
804 	pool_put(&bufpool, mbp);
805 	splx(s);
806 	uvm_pagermapout(kva, npages);
807 	raoffset = startoffset + totalbytes;
808 
809 	/*
810 	 * if we encountered a hole then we have to do a little more work.
811 	 * for read faults, we marked the page PG_RDONLY so that future
812 	 * write accesses to the page will fault again.
813 	 * for write faults, we must make sure that the backing store for
814 	 * the page is completely allocated while the pages are locked.
815 	 */
816 
817 	if (error == 0 && sawhole && write) {
818 		error = VOP_BALLOCN(vp, startoffset, npages << PAGE_SHIFT,
819 				   cred, 0);
820 		if (error) {
821 			UVMHIST_LOG(ubchist, "balloc lbn 0x%x -> %d",
822 				    lbn, error,0,0);
823 			lockmgr(&vp->v_glock, LK_RELEASE, NULL);
824 			simple_lock(&uobj->vmobjlock);
825 			goto out;
826 		}
827 	}
828 	lockmgr(&vp->v_glock, LK_RELEASE, NULL);
829 	simple_lock(&uobj->vmobjlock);
830 
831 	/*
832 	 * see if we want to start any readahead.
833 	 * XXXUBC for now, just read the next 128k on 64k boundaries.
834 	 * this is pretty nonsensical, but it is 50% faster than reading
835 	 * just the next 64k.
836 	 */
837 
838 raout:
839 	if (!error && !async && !write && ((int)raoffset & 0xffff) == 0 &&
840 	    PAGE_SHIFT <= 16) {
841 		int racount;
842 
843 		racount = 1 << (16 - PAGE_SHIFT);
844 		(void) VOP_GETPAGES(vp, raoffset, NULL, &racount, 0,
845 				    VM_PROT_READ, 0, 0);
846 		simple_lock(&uobj->vmobjlock);
847 
848 		racount = 1 << (16 - PAGE_SHIFT);
849 		(void) VOP_GETPAGES(vp, raoffset + 0x10000, NULL, &racount, 0,
850 				    VM_PROT_READ, 0, 0);
851 		simple_lock(&uobj->vmobjlock);
852 	}
853 
854 	/*
855 	 * we're almost done!  release the pages...
856 	 * for errors, we free the pages.
857 	 * otherwise we activate them and mark them as valid and clean.
858 	 * also, unbusy pages that were not actually requested.
859 	 */
860 
861 out:
862 	if (error) {
863 		uvm_lock_pageq();
864 		for (i = 0; i < npages; i++) {
865 			if (pgs[i] == NULL) {
866 				continue;
867 			}
868 			UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
869 				    pgs[i], pgs[i]->flags, 0,0);
870 			if (pgs[i]->flags & PG_WANTED) {
871 				wakeup(pgs[i]);
872 			}
873 			if (pgs[i]->flags & PG_RELEASED) {
874 				uvm_unlock_pageq();
875 				(uobj->pgops->pgo_releasepg)(pgs[i], NULL);
876 				uvm_lock_pageq();
877 				continue;
878 			}
879 			if (pgs[i]->flags & PG_FAKE) {
880 				uvm_pagefree(pgs[i]);
881 				continue;
882 			}
883 			uvm_pageactivate(pgs[i]);
884 			pgs[i]->flags &= ~(PG_WANTED|PG_BUSY);
885 			UVM_PAGE_OWN(pgs[i], NULL);
886 		}
887 		uvm_unlock_pageq();
888 		simple_unlock(&uobj->vmobjlock);
889 		UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0);
890 		return error;
891 	}
892 
893 	UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0);
894 	uvm_lock_pageq();
895 	for (i = 0; i < npages; i++) {
896 		if (pgs[i] == NULL) {
897 			continue;
898 		}
899 		UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
900 			    pgs[i], pgs[i]->flags, 0,0);
901 		if (pgs[i]->flags & PG_FAKE) {
902 			UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x",
903 				    pgs[i], pgs[i]->offset,0,0);
904 			pgs[i]->flags &= ~(PG_FAKE);
905 			pmap_clear_modify(pgs[i]);
906 			pmap_clear_reference(pgs[i]);
907 		}
908 		if (write) {
909 			pgs[i]->flags &= ~(PG_RDONLY);
910 		}
911 		if (i < ridx || i >= ridx + orignpages || async) {
912 			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
913 				    pgs[i], pgs[i]->offset,0,0);
914 			if (pgs[i]->flags & PG_WANTED) {
915 				wakeup(pgs[i]);
916 			}
917 			if (pgs[i]->flags & PG_RELEASED) {
918 				uvm_unlock_pageq();
919 				(uobj->pgops->pgo_releasepg)(pgs[i], NULL);
920 				uvm_lock_pageq();
921 				continue;
922 			}
923 			uvm_pageactivate(pgs[i]);
924 			pgs[i]->flags &= ~(PG_WANTED|PG_BUSY);
925 			UVM_PAGE_OWN(pgs[i], NULL);
926 		}
927 	}
928 	uvm_unlock_pageq();
929 	simple_unlock(&uobj->vmobjlock);
930 	if (ap->a_m != NULL) {
931 		memcpy(ap->a_m, &pgs[ridx],
932 		       orignpages * sizeof(struct vm_page *));
933 	}
934 	return 0;
935 }
936 
937 /*
938  * generic VM putpages routine.
939  * Write the given range of pages to backing store.
940  */
941 
942 int
943 genfs_putpages(v)
944 	void *v;
945 {
946 	struct vop_putpages_args /* {
947 		struct vnode *a_vp;
948 		struct vm_page **a_m;
949 		int a_count;
950 		int a_flags;
951 		int *a_rtvals;
952 	} */ *ap = v;
953 
954 	int s, error, npages, run;
955 	int fs_bshift, dev_bshift, dev_bsize;
956 	vaddr_t kva;
957 	off_t eof, offset, startoffset;
958 	size_t bytes, iobytes, skipbytes;
959 	daddr_t lbn, blkno;
960 	struct vm_page *pg;
961 	struct buf *mbp, *bp;
962 	struct vnode *vp = ap->a_vp;
963 	boolean_t async = (ap->a_flags & PGO_SYNCIO) == 0;
964 	UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
965 	UVMHIST_LOG(ubchist, "vp %p offset 0x%x count %d",
966 		    vp, ap->a_m[0]->offset, ap->a_count, 0);
967 
968 	simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
969 
970 	error = VOP_SIZE(vp, vp->v_uvm.u_size, &eof);
971 	if (error) {
972 		return error;
973 	}
974 
975 	error = 0;
976 	npages = ap->a_count;
977 	fs_bshift = vp->v_mount->mnt_fs_bshift;
978 	dev_bshift = vp->v_mount->mnt_dev_bshift;
979 	dev_bsize = 1 << dev_bshift;
980 	KASSERT((eof & (dev_bsize - 1)) == 0);
981 
982 	pg = ap->a_m[0];
983 	startoffset = pg->offset;
984 	bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
985 	skipbytes = 0;
986 	KASSERT(bytes != 0);
987 
988 	kva = uvm_pagermapin(ap->a_m, npages, UVMPAGER_MAPIN_WAITOK);
989 
990 	s = splbio();
991 	vp->v_numoutput += 2;
992 	mbp = pool_get(&bufpool, PR_WAITOK);
993 	UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
994 		    vp, mbp, vp->v_numoutput, bytes);
995 	splx(s);
996 	mbp->b_bufsize = npages << PAGE_SHIFT;
997 	mbp->b_data = (void *)kva;
998 	mbp->b_resid = mbp->b_bcount = bytes;
999 	mbp->b_flags = B_BUSY|B_WRITE|B_AGE |
1000 		(async ? B_CALL : 0) |
1001 		(curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0);
1002 	mbp->b_iodone = uvm_aio_biodone;
1003 	mbp->b_vp = vp;
1004 	LIST_INIT(&mbp->b_dep);
1005 
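	/*
	 * loop over the range one filesystem-block extent at a time:
	 * VOP_BMAP() finds the on-disk block and how far the contiguous
	 * run extends, holes are skipped (and counted in skipbytes), and
	 * each piece is written with its own buf, or with mbp itself if
	 * the whole range is a single piece.
	 */
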
1006 	bp = NULL;
1007 	for (offset = startoffset;
1008 	     bytes > 0;
1009 	     offset += iobytes, bytes -= iobytes) {
1010 		lbn = offset >> fs_bshift;
1011 		error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
1012 		if (error) {
1013 			UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0);
1014 			skipbytes += bytes;
1015 			bytes = 0;
1016 			break;
1017 		}
1018 
1019 		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
1020 		    bytes);
1021 		if (blkno == (daddr_t)-1) {
1022 			skipbytes += iobytes;
1023 			continue;
1024 		}
1025 
1026 		/* if it's really one i/o, don't make a second buf */
1027 		if (offset == startoffset && iobytes == bytes) {
1028 			bp = mbp;
1029 		} else {
1030 			s = splbio();
1031 			vp->v_numoutput++;
1032 			bp = pool_get(&bufpool, PR_WAITOK);
1033 			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
1034 				    vp, bp, vp->v_numoutput, 0);
1035 			splx(s);
1036 			bp->b_data = (char *)kva +
1037 				(vaddr_t)(offset - pg->offset);
1038 			bp->b_resid = bp->b_bcount = iobytes;
1039 			bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
1040 			bp->b_iodone = uvm_aio_biodone1;
1041 			bp->b_vp = vp;
1042 			LIST_INIT(&bp->b_dep);
1043 		}
1044 		bp->b_lblkno = 0;
1045 		bp->b_private = mbp;
1046 
1047 		/* adjust physical blkno for partial blocks */
1048 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
1049 				       dev_bshift);
1050 		UVMHIST_LOG(ubchist, "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
1051 			    vp, offset, bp->b_bcount, bp->b_blkno);
1052 		VOP_STRATEGY(bp);
1053 	}
1054 	if (skipbytes) {
1055 		UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0);
1056 		s = splbio();
1057 		mbp->b_resid -= skipbytes;
1058 		if (error) {
1059 			mbp->b_flags |= B_ERROR;
1060 			mbp->b_error = error;
1061 		}
1062 		if (mbp->b_resid == 0) {
1063 			biodone(mbp);
1064 		}
1065 		splx(s);
1066 	}
1067 	if (async) {
1068 		UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
1069 		return 0;
1070 	}
1071 	if (bp != NULL) {
1072 		UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
1073 		error = biowait(mbp);
1074 	}
1075 	if (bioops.io_pageiodone) {
1076 		(*bioops.io_pageiodone)(mbp);
1077 	}
1078 	s = splbio();
1079 	vwakeup(mbp);
1080 	pool_put(&bufpool, mbp);
1081 	splx(s);
1082 	uvm_pagermapout(kva, npages);
1083 	UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
1084 	return error;
1085 }
1086 
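/*
 * Generic size: round the given byte size up to a filesystem block
 * boundary to find the end of the backing store.
 */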
1087 int
1088 genfs_size(v)
1089 	void *v;
1090 {
1091 	struct vop_size_args /* {
1092 		struct vnode *a_vp;
1093 		off_t a_size;
1094 		off_t *a_eobp;
1095 	} */ *ap = v;
1096 	int bsize;
1097 
1098 	bsize = 1 << ap->a_vp->v_mount->mnt_fs_bshift;
1099 	*ap->a_eobp = (ap->a_size + bsize - 1) & ~(bsize - 1);
1100 	return 0;
1101 }
1102