/*	$NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/buf.h>
#include <sys/atomic.h>

#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <miscfs/specfs/specdev.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pager.h>
#include <uvm/uvm_page_array.h>

static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *,
    off_t, enum uio_rw);
static void genfs_dio_iodone(struct buf *);

static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t,
    off_t, bool, bool, bool, bool);
static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw,
    void (*)(struct buf *));
static void genfs_rel_pages(struct vm_page **, unsigned int);

int genfs_maxdio = MAXPHYS;

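/*
 * genfs_rel_pages: unbusy the given pages.
 *
 * pages that were never filled with valid data (PG_FAKE) are marked
 * PG_RELEASED first so that uvm_page_unbusy() frees them rather than
 * leaving uninitialized contents in the cache.  the owning object
 * must be locked.
 */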
static void
genfs_rel_pages(struct vm_page **pgs, unsigned int npages)
{
	unsigned int i;

	for (i = 0; i < npages; i++) {
		struct vm_page *pg = pgs[i];

		if (pg == NULL || pg == PGO_DONTCARE)
			continue;
		KASSERT(uvm_page_owner_locked_p(pg, true));
		if (pg->flags & PG_FAKE) {
			pg->flags |= PG_RELEASED;
		}
	}
	uvm_page_unbusy(pgs, npages);
}

/*
 * generic VM getpages routine.
 * Return PG_BUSY pages for the given range,
 * reading from backing store if necessary.
 */

int
genfs_getpages(void *v)
{
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		voff_t a_offset;
		struct vm_page **a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	} */ * const ap = v;

	off_t diskeof, memeof;
	int i, error, npages, iflag;
	const int flags = ap->a_flags;
	struct vnode * const vp = ap->a_vp;
	struct uvm_object * const uobj = &vp->v_uobj;
	const bool async = (flags & PGO_SYNCIO) == 0;
	const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
	const bool overwrite = (flags & PGO_OVERWRITE) != 0;
	const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0;
	const bool need_wapbl = (vp->v_mount->mnt_wapbl &&
			(flags & PGO_JOURNALLOCKED) == 0);
	const bool glocked = (flags & PGO_GLOCKHELD) != 0;
	bool holds_wapbl = false;
	struct mount *trans_mount = NULL;
	UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);

	UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx/%jx count %jd",
	    (uintptr_t)vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);

	KASSERT(memwrite >= overwrite);
	KASSERT(vp->v_type == VREG || vp->v_type == VDIR ||
	    vp->v_type == VLNK || vp->v_type == VBLK);

	/*
	 * the object must be locked.  it can only be a read lock when
	 * processing a read fault with PGO_LOCKED.
	 */

	KASSERT(rw_lock_held(uobj->vmobjlock));
	KASSERT(rw_write_held(uobj->vmobjlock) ||
	    ((flags & PGO_LOCKED) != 0 && !memwrite));

#ifdef DIAGNOSTIC
	if ((flags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
		WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif

	/*
	 * check for reclaimed vnode.  v_interlock is not held here, but
	 * VI_DEADCHECK is set with vmobjlock held.
	 */

	iflag = atomic_load_relaxed(&vp->v_iflag);
	if (__predict_false((iflag & VI_DEADCHECK) != 0)) {
		mutex_enter(vp->v_interlock);
		error = vdead_check(vp, VDEAD_NOWAIT);
		mutex_exit(vp->v_interlock);
		if (error) {
			if ((flags & PGO_LOCKED) == 0)
				rw_exit(uobj->vmobjlock);
			return error;
		}
	}

startover:
	error = 0;
	const voff_t origvsize = vp->v_size;
	const off_t origoffset = ap->a_offset;
	const int orignpages = *ap->a_count;

	GOP_SIZE(vp, origvsize, &diskeof, 0);
	if (flags & PGO_PASTEOF) {
		off_t newsize;
#if defined(DIAGNOSTIC)
		off_t writeeof;
#endif /* defined(DIAGNOSTIC) */

		newsize = MAX(origvsize,
		    origoffset + (orignpages << PAGE_SHIFT));
		GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM);
#if defined(DIAGNOSTIC)
		GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM);
		if (newsize > round_page(writeeof)) {
			panic("%s: past eof: %" PRId64 " vs. %" PRId64,
			    __func__, newsize, round_page(writeeof));
		}
#endif /* defined(DIAGNOSTIC) */
	} else {
		GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM);
	}
	KASSERT(ap->a_centeridx >= 0 && ap->a_centeridx <= orignpages);
	KASSERT((origoffset & (PAGE_SIZE - 1)) == 0);
	KASSERT(origoffset >= 0);
	KASSERT(orignpages > 0);

	/*
	 * Bounds-check the request.
	 */

	if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
		if ((flags & PGO_LOCKED) == 0) {
			rw_exit(uobj->vmobjlock);
		}
		UVMHIST_LOG(ubchist, "off 0x%jx count %jd goes past EOF 0x%jx",
		    origoffset, *ap->a_count, memeof,0);
		error = EINVAL;
		goto out_err;
	}

	/* uobj is locked */

	if ((flags & PGO_NOTIMESTAMP) == 0 &&
	    (vp->v_type != VBLK ||
	    (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
		int updflags = 0;

		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
			updflags = GOP_UPDATE_ACCESSED;
		}
		if (memwrite) {
			updflags |= GOP_UPDATE_MODIFIED;
		}
		if (updflags != 0) {
			GOP_MARKUPDATE(vp, updflags);
		}
	}

	/*
	 * For PGO_LOCKED requests, just return whatever's in memory.
	 */

	if (flags & PGO_LOCKED) {
		int nfound;
		struct vm_page *pg;

		KASSERT(!glocked);
		npages = *ap->a_count;
#if defined(DEBUG)
		for (i = 0; i < npages; i++) {
			pg = ap->a_m[i];
			KASSERT(pg == NULL || pg == PGO_DONTCARE);
		}
#endif /* defined(DEBUG) */
		nfound = uvn_findpages(uobj, origoffset, &npages,
		    ap->a_m, NULL,
		    UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
		    (memwrite ? UFP_NORDONLY : 0));
		KASSERT(npages == *ap->a_count);
		if (nfound == 0) {
			error = EBUSY;
			goto out_err;
		}
		/*
		 * lock and unlock g_glock to ensure that no one is truncating
		 * the file behind us.
		 */
		if (!genfs_node_rdtrylock(vp)) {
			/*
			 * restore the array.
			 */

			for (i = 0; i < npages; i++) {
				pg = ap->a_m[i];

				if (pg != NULL && pg != PGO_DONTCARE) {
					ap->a_m[i] = NULL;
				}
				KASSERT(ap->a_m[i] == NULL ||
				    ap->a_m[i] == PGO_DONTCARE);
			}
		} else {
			genfs_node_unlock(vp);
		}
		error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0);
		if (error == 0 && memwrite) {
			for (i = 0; i < npages; i++) {
				pg = ap->a_m[i];
				if (pg == NULL || pg == PGO_DONTCARE) {
					continue;
				}
				if (uvm_pagegetdirty(pg) ==
				    UVM_PAGE_STATUS_CLEAN) {
					uvm_pagemarkdirty(pg,
					    UVM_PAGE_STATUS_UNKNOWN);
				}
			}
		}
		goto out_err;
	}
	rw_exit(uobj->vmobjlock);

	/*
	 * find the requested pages and make some simple checks.
	 * leave space in the page array for a whole block.
	 */

	const int fs_bshift = (vp->v_type != VBLK) ?
	    vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
	const int fs_bsize = 1 << fs_bshift;
#define	blk_mask	(fs_bsize - 1)
#define	trunc_blk(x)	((x) & ~blk_mask)
#define	round_blk(x)	(((x) + blk_mask) & ~blk_mask)
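	/*
	 * for example, with an 8KB filesystem block (fs_bshift 13,
	 * blk_mask 0x1fff), trunc_blk(0x2a00) is 0x2000 and
	 * round_blk(0x2a00) is 0x4000: the request is widened to whole
	 * filesystem blocks.
	 */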

	const int orignmempages = MIN(orignpages,
	    round_page(memeof - origoffset) >> PAGE_SHIFT);
	npages = orignmempages;
	const off_t startoffset = trunc_blk(origoffset);
	const off_t endoffset = MIN(
	    round_page(round_blk(origoffset + (npages << PAGE_SHIFT))),
	    round_page(memeof));
	const int ridx = (origoffset - startoffset) >> PAGE_SHIFT;
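	/*
	 * the pgs array covers [startoffset, endoffset); the pages the
	 * caller actually asked for occupy slots [ridx, ridx +
	 * orignmempages) within it, while the surrounding slots pad the
	 * request out to filesystem block boundaries.
	 */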

	const int pgs_size = sizeof(struct vm_page *) *
	    ((endoffset - startoffset) >> PAGE_SHIFT);
	struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES];

	if (pgs_size > sizeof(pgs_onstack)) {
		pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP);
		if (pgs == NULL) {
			pgs = pgs_onstack;
			error = ENOMEM;
			goto out_err;
		}
	} else {
		pgs = pgs_onstack;
		(void)memset(pgs, 0, pgs_size);
	}

	UVMHIST_LOG(ubchist, "ridx %jd npages %jd startoff %#jx endoff %#jx",
	    ridx, npages, startoffset, endoffset);

	if (trans_mount == NULL) {
		trans_mount = vp->v_mount;
		fstrans_start(trans_mount);
		/*
		 * check if this vnode is still valid.
		 */
		mutex_enter(vp->v_interlock);
		error = vdead_check(vp, 0);
		mutex_exit(vp->v_interlock);
		if (error)
			goto out_err_free;
		/*
		 * XXX: This assumes that we come here only via
		 * the mmio path
		 */
		if (blockalloc && need_wapbl) {
			error = WAPBL_BEGIN(trans_mount);
			if (error)
				goto out_err_free;
			holds_wapbl = true;
		}
	}
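	/*
	 * note that on the "goto startover" path trans_mount is already
	 * set, so the fstrans/WAPBL state acquired on the first pass is
	 * kept rather than re-entered.
	 */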

	/*
	 * hold g_glock to prevent a race with truncate.
	 *
	 * check if our idea of v_size is still valid.
	 */

	KASSERT(!glocked || genfs_node_wrlocked(vp));
	if (!glocked) {
		if (blockalloc) {
			genfs_node_wrlock(vp);
		} else {
			genfs_node_rdlock(vp);
		}
	}
	rw_enter(uobj->vmobjlock, RW_WRITER);
	if (vp->v_size < origvsize) {
		if (!glocked) {
			genfs_node_unlock(vp);
		}
		if (pgs != pgs_onstack)
			kmem_free(pgs, pgs_size);
		goto startover;
	}

	if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], NULL,
	    async ? UFP_NOWAIT : UFP_ALL) != orignmempages) {
		if (!glocked) {
			genfs_node_unlock(vp);
		}
		KASSERT(async != 0);
		genfs_rel_pages(&pgs[ridx], orignmempages);
		rw_exit(uobj->vmobjlock);
		error = EBUSY;
		goto out_err_free;
	}

	/*
	 * if PGO_OVERWRITE is set, don't bother reading the pages.
	 */

	if (overwrite) {
		if (!glocked) {
			genfs_node_unlock(vp);
		}
		UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);

		for (i = 0; i < npages; i++) {
			struct vm_page *pg = pgs[ridx + i];

			/*
			 * it's caller's responsibility to allocate blocks
			 * beforehand for the overwrite case.
			 */

			KASSERT((pg->flags & PG_RDONLY) == 0 || !blockalloc);
			pg->flags &= ~PG_RDONLY;

			/*
			 * mark the page DIRTY.
			 * otherwise another thread can do putpages and pull
			 * our vnode from syncer's queue before our caller does
			 * ubc_release.  note that putpages won't see CLEAN
			 * pages even if they are BUSY.
			 */

			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
		}
		npages += ridx;
		goto out;
	}

	/*
	 * if the pages are already resident, just return them.
	 */

	for (i = 0; i < npages; i++) {
		struct vm_page *pg = pgs[ridx + i];

		if ((pg->flags & PG_FAKE) ||
		    (blockalloc && (pg->flags & PG_RDONLY) != 0)) {
			break;
		}
	}
	if (i == npages) {
		if (!glocked) {
			genfs_node_unlock(vp);
		}
		UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
		npages += ridx;
		goto out;
	}

	/*
	 * the page wasn't resident and we're not overwriting,
	 * so we're going to have to do some i/o.
	 * find any additional pages needed to cover the expanded range.
	 */

	npages = (endoffset - startoffset) >> PAGE_SHIFT;
	if (startoffset != origoffset || npages != orignmempages) {
		int npgs;

		/*
		 * we need to avoid deadlocks caused by locking
		 * additional pages at lower offsets than pages we
		 * already have locked.  unlock them all and start over.
		 */

		genfs_rel_pages(&pgs[ridx], orignmempages);
		memset(pgs, 0, pgs_size);

		UVMHIST_LOG(ubchist, "reset npages start 0x%jx end 0x%jx",
		    startoffset, endoffset, 0,0);
		npgs = npages;
		if (uvn_findpages(uobj, startoffset, &npgs, pgs, NULL,
		    async ? UFP_NOWAIT : UFP_ALL) != npages) {
			if (!glocked) {
				genfs_node_unlock(vp);
			}
			KASSERT(async != 0);
			genfs_rel_pages(pgs, npages);
			rw_exit(uobj->vmobjlock);
			error = EBUSY;
			goto out_err_free;
		}
	}

	rw_exit(uobj->vmobjlock);
	error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof,
	    async, memwrite, blockalloc, glocked);
	if (!glocked) {
		genfs_node_unlock(vp);
	}
	if (error == 0 && async)
		goto out_err_free;
	rw_enter(uobj->vmobjlock, RW_WRITER);

	/*
	 * we're almost done!  release the pages...
	 * for errors, we free the pages.
	 * otherwise we activate them and mark them as valid and clean.
	 * also, unbusy pages that were not actually requested.
	 */

	if (error) {
		genfs_rel_pages(pgs, npages);
		rw_exit(uobj->vmobjlock);
		UVMHIST_LOG(ubchist, "returning error %jd", error,0,0,0);
		goto out_err_free;
	}

out:
	UVMHIST_LOG(ubchist, "succeeding, npages %jd", npages,0,0,0);
	error = 0;
	for (i = 0; i < npages; i++) {
		struct vm_page *pg = pgs[i];
		if (pg == NULL) {
			continue;
		}
		UVMHIST_LOG(ubchist, "examining pg %#jx flags 0x%jx",
		    (uintptr_t)pg, pg->flags, 0,0);
		if (pg->flags & PG_FAKE && !overwrite) {
			/*
			 * we've read the page's contents from the backing
			 * storage.
			 *
			 * for a read fault, we keep them CLEAN; if we
			 * encountered a hole while reading, the pages can
			 * already have been dirtied with zeros.
			 */
			KASSERTMSG(blockalloc || uvm_pagegetdirty(pg) ==
			    UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
			pg->flags &= ~PG_FAKE;
		}
		KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0);
		if (i < ridx || i >= ridx + orignmempages || async) {
			UVMHIST_LOG(ubchist, "unbusy pg %#jx offset 0x%jx",
			    (uintptr_t)pg, pg->offset,0,0);
			if (pg->flags & PG_FAKE) {
				KASSERT(overwrite);
				uvm_pagezero(pg);
			}
			if (pg->flags & PG_RELEASED) {
				uvm_pagefree(pg);
				continue;
			}
			uvm_pagelock(pg);
			uvm_pageenqueue(pg);
			uvm_pagewakeup(pg);
			uvm_pageunlock(pg);
			pg->flags &= ~(PG_BUSY|PG_FAKE);
			UVM_PAGE_OWN(pg, NULL);
		} else if (memwrite && !overwrite &&
		    uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
			/*
			 * for a write fault, start dirtiness tracking of
			 * requested pages.
			 */
			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
		}
	}
	rw_exit(uobj->vmobjlock);
	if (ap->a_m != NULL) {
		memcpy(ap->a_m, &pgs[ridx],
		    orignmempages * sizeof(struct vm_page *));
	}

out_err_free:
	if (pgs != NULL && pgs != pgs_onstack)
		kmem_free(pgs, pgs_size);
out_err:
	if (trans_mount != NULL) {
		if (holds_wapbl)
			WAPBL_END(trans_mount);
		fstrans_done(trans_mount);
	}
	return error;
}

/*
 * genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY.
 *
 * "glocked" (which is currently not actually used) tells us not whether
 * the genfs_node is locked on entry (it always is) but whether it was
 * locked on entry to genfs_getpages.
 */
static int
genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages,
    off_t startoffset, off_t diskeof,
    bool async, bool memwrite, bool blockalloc, bool glocked)
{
	struct uvm_object * const uobj = &vp->v_uobj;
	const int fs_bshift = (vp->v_type != VBLK) ?
	    vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
	const int dev_bshift = (vp->v_type != VBLK) ?
	    vp->v_mount->mnt_dev_bshift : DEV_BSHIFT;
	kauth_cred_t const cred = curlwp->l_cred;		/* XXXUBC curlwp */
	size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes;
	vaddr_t kva;
	struct buf *bp, *mbp;
	bool sawhole = false;
	int i;
	int error = 0;

	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);

	/*
	 * read the desired page(s).
	 */

	totalbytes = npages << PAGE_SHIFT;
	bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
	tailbytes = totalbytes - bytes;
	skipbytes = 0;

	kva = uvm_pagermapin(pgs, npages,
	    UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK));
	if (kva == 0)
		return EBUSY;

	mbp = getiobuf(vp, true);
	mbp->b_bufsize = totalbytes;
	mbp->b_data = (void *)kva;
	mbp->b_resid = mbp->b_bcount = bytes;
	mbp->b_cflags |= BC_BUSY;
	if (async) {
		mbp->b_flags = B_READ | B_ASYNC;
		mbp->b_iodone = uvm_aio_aiodone;
	} else {
		mbp->b_flags = B_READ;
		mbp->b_iodone = NULL;
	}
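	/*
	 * a sync read has a thread waiting in biowait(), so the master
	 * buffer gets time-critical priority; an async read is
	 * effectively read-ahead and can tolerate time-limited
	 * scheduling.
	 */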
	if (async)
		BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
	else
		BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);

	/*
	 * if EOF is in the middle of the range, zero the part past EOF.
	 * skip over pages which are not PG_FAKE since in that case they have
	 * valid data that we need to preserve.
	 */

	tailstart = bytes;
	while (tailbytes > 0) {
		const int len = PAGE_SIZE - (tailstart & PAGE_MASK);

		KASSERT(len <= tailbytes);
		if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) {
			memset((void *)(kva + tailstart), 0, len);
			UVMHIST_LOG(ubchist, "tailbytes %#jx 0x%jx 0x%jx",
			    (uintptr_t)kva, tailstart, len, 0);
		}
		tailstart += len;
		tailbytes -= len;
	}
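	/*
	 * e.g. with 4KB pages, npages 4 and diskeof 6KB past
	 * startoffset: bytes is 6144, so the second half of page 1 and
	 * all of pages 2 and 3 (10240 tail bytes) are zeroed, one
	 * page-sized chunk at a time after the first partial page.
	 */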

	/*
	 * now loop over the pages, reading as needed.
	 */

	bp = NULL;
	off_t offset;
	for (offset = startoffset;
	    bytes > 0;
	    offset += iobytes, bytes -= iobytes) {
		int run;
		daddr_t lbn, blkno;
		int pidx;
		struct vnode *devvp;

		/*
		 * skip pages which don't need to be read.
		 */

		pidx = (offset - startoffset) >> PAGE_SHIFT;
		while ((pgs[pidx]->flags & PG_FAKE) == 0) {
			size_t b;

			KASSERT((offset & (PAGE_SIZE - 1)) == 0);
			if ((pgs[pidx]->flags & PG_RDONLY)) {
				sawhole = true;
			}
			b = MIN(PAGE_SIZE, bytes);
			offset += b;
			bytes -= b;
			skipbytes += b;
			pidx++;
			UVMHIST_LOG(ubchist, "skipping, new offset 0x%jx",
			    offset, 0,0,0);
			if (bytes == 0) {
				goto loopdone;
			}
		}

		/*
		 * bmap the file to find out the blkno to read from and
		 * how much we can read in one i/o.  if bmap returns an error,
		 * skip the rest of the top-level i/o.
		 */

		lbn = offset >> fs_bshift;
		error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
		if (error) {
			UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
			    lbn,error,0,0);
			skipbytes += bytes;
			bytes = 0;
			goto loopdone;
		}

		/*
		 * see how many pages can be read with this i/o.
		 * reduce the i/o size if necessary to avoid
		 * overwriting pages with valid data.
		 */

		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
		    bytes);
		if (offset + iobytes > round_page(offset)) {
			int pcount;

			pcount = 1;
			while (pidx + pcount < npages &&
			    pgs[pidx + pcount]->flags & PG_FAKE) {
				pcount++;
			}
			iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
			    (offset - trunc_page(offset)));
		}
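		/*
		 * here iobytes is capped both by what VOP_BMAP says is
		 * contiguous on disk (lbn plus "run" extra blocks) and by
		 * the span of consecutive PG_FAKE pages, so a single
		 * VOP_STRATEGY call never touches a page that already
		 * has valid data.
		 */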

		/*
		 * if this block isn't allocated, zero it instead of
		 * reading it.  unless we are going to allocate blocks,
		 * mark the pages we zeroed PG_RDONLY.
		 */

		if (blkno == (daddr_t)-1) {
			int holepages = (round_page(offset + iobytes) -
			    trunc_page(offset)) >> PAGE_SHIFT;
			UVMHIST_LOG(ubchist, "lbn 0x%jx -> HOLE", lbn,0,0,0);

			sawhole = true;
			memset((char *)kva + (offset - startoffset), 0,
			    iobytes);
			skipbytes += iobytes;

			if (!blockalloc) {
				rw_enter(uobj->vmobjlock, RW_WRITER);
				for (i = 0; i < holepages; i++) {
					pgs[pidx + i]->flags |= PG_RDONLY;
				}
				rw_exit(uobj->vmobjlock);
			}
			continue;
		}

		/*
		 * allocate a sub-buf for this piece of the i/o
		 * (or just use mbp if there's only 1 piece),
		 * and start it going.
		 */

		if (offset == startoffset && iobytes == bytes) {
			bp = mbp;
		} else {
			UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
			    (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
			bp = getiobuf(vp, true);
			nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
		}
		bp->b_lblkno = 0;

		/* adjust physical blkno for partial blocks */
		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
		    dev_bshift);
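		/*
		 * e.g. with fs_bshift 13 and dev_bshift 9, an offset 2KB
		 * into the 8KB filesystem block advances b_blkno by four
		 * 512-byte device blocks past the block's start.
		 */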

		UVMHIST_LOG(ubchist,
		    "bp %#jx offset 0x%x bcount 0x%x blkno 0x%x",
		    (uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);

		VOP_STRATEGY(devvp, bp);
	}

loopdone:
	nestiobuf_done(mbp, skipbytes, error);
	if (async) {
		UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
		return 0;
	}
	if (bp != NULL) {
		error = biowait(mbp);
	}

	/* Remove the mapping (make KVA available as soon as possible) */
	uvm_pagermapout(kva, npages);

	/*
	 * if we encountered a hole then we have to do a little more work.
	 * for read faults, we marked the page PG_RDONLY so that future
	 * write accesses to the page will fault again.
	 * for write faults, we must make sure that the backing store for
	 * the page is completely allocated while the pages are locked.
	 */

	if (!error && sawhole && blockalloc) {
		error = GOP_ALLOC(vp, startoffset,
		    npages << PAGE_SHIFT, 0, cred);
		UVMHIST_LOG(ubchist, "gop_alloc off 0x%jx/0x%jx -> %jd",
		    startoffset, npages << PAGE_SHIFT, error,0);
		if (!error) {
			rw_enter(uobj->vmobjlock, RW_WRITER);
			for (i = 0; i < npages; i++) {
				struct vm_page *pg = pgs[i];

				if (pg == NULL) {
					continue;
				}
				pg->flags &= ~PG_RDONLY;
				uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
				UVMHIST_LOG(ubchist, "mark dirty pg %#jx",
				    (uintptr_t)pg, 0, 0, 0);
			}
			rw_exit(uobj->vmobjlock);
		}
	}

	putiobuf(mbp);
	return error;
}

/*
 * generic VM putpages routine.
 * Write the given range of pages to backing store.
 *
 * => "offhi == 0" means flush all pages at or after "offlo".
 * => object should be locked by caller.  we return with the
 *      object unlocked.
 * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
 *	thus, a caller might want to unlock higher level resources
 *	(e.g. vm_map) before calling flush.
 * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block
 * => if PGO_ALLPAGES is set, then all pages in the object will be processed.
 *
 * note on "cleaning" object and PG_BUSY pages:
 *	this routine is holding the lock on the object.   the only time
 *	that it can run into a PG_BUSY page that it does not own is if
 *	some other process has started I/O on the page (e.g. either
 *	a pagein, or a pageout).  if the PG_BUSY page is being paged
 *	in, then it can not be dirty (!UVM_PAGE_STATUS_CLEAN) because no
 *	one has had a chance to modify it yet.  if the PG_BUSY page is
 *	being paged out then it means that someone else has already started
 *	cleaning the page for us (how nice!).  in this case, if we
 *	have syncio specified, then after we make our pass through the
 *	object we need to wait for the other PG_BUSY pages to clear
 *	off (i.e. we need to do an iosync).   also note that once a
 *	page is PG_BUSY it must stay in its object until it is un-busyed.
 */

int
genfs_putpages(void *v)
{
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		voff_t a_offlo;
		voff_t a_offhi;
		int a_flags;
	} */ * const ap = v;

	return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi,
	    ap->a_flags, NULL);
}

int
genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff,
    int origflags, struct vm_page **busypg)
{
	struct uvm_object * const uobj = &vp->v_uobj;
	krwlock_t * const slock = uobj->vmobjlock;
	off_t nextoff;
	int i, error, npages, nback;
	int freeflag;
	/*
	 * This array is larger than it needs to be so that its size is a
	 * compile-time constant.  The right size is MAXPAGES.
	 */
	struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE];
#define MAXPAGES (MAXPHYS / PAGE_SIZE)
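	/* e.g. with MAXPHYS of 64KB and 4KB pages, MAXPAGES is 16. */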
	struct vm_page *pg, *tpg;
	struct uvm_page_array a;
	bool wasclean, needs_clean;
	bool async = (origflags & PGO_SYNCIO) == 0;
	bool pagedaemon = curlwp == uvm.pagedaemon_lwp;
	struct mount *trans_mp;
	int flags;
	bool modified;		/* if we write out any pages */
	bool holds_wapbl;
	bool cleanall;		/* try to pull off from the syncer's list */
	bool onworklst;
	bool nodirty;
	const bool dirtyonly = (origflags & (PGO_DEACTIVATE|PGO_FREE)) == 0;

	UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);

	KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
	KASSERT((startoff & PAGE_MASK) == 0);
	KASSERT((endoff & PAGE_MASK) == 0);
	KASSERT(startoff < endoff || endoff == 0);
	KASSERT(rw_write_held(slock));

	UVMHIST_LOG(ubchist, "vp %#jx pages %jd off 0x%jx len 0x%jx",
	    (uintptr_t)vp, uobj->uo_npages, startoff, endoff - startoff);

#ifdef DIAGNOSTIC
	if ((origflags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
		WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif

	trans_mp = NULL;
	holds_wapbl = false;

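/*
 * we come back to "retry" after dropping the object lock to set up
 * fstrans/WAPBL state, and, for PGO_RECLAIM, until the vnode has been
 * flushed clean and taken off the syncer's worklist.
 */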
retry:
	modified = false;
	flags = origflags;

	/*
	 * shortcut if we have no pages to process.
	 */

	nodirty = uvm_obj_clean_p(uobj);
#ifdef DIAGNOSTIC
	mutex_enter(vp->v_interlock);
	KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || nodirty);
	mutex_exit(vp->v_interlock);
#endif
	if (uobj->uo_npages == 0 || (dirtyonly && nodirty)) {
		mutex_enter(vp->v_interlock);
		if (vp->v_iflag & VI_ONWORKLST && LIST_EMPTY(&vp->v_dirtyblkhd)) {
			vn_syncer_remove_from_worklist(vp);
		}
		mutex_exit(vp->v_interlock);
		if (trans_mp) {
			if (holds_wapbl)
				WAPBL_END(trans_mp);
			fstrans_done(trans_mp);
		}
		rw_exit(slock);
		return (0);
	}

	/*
	 * the vnode has pages, set up to process the request.
	 */

	if (trans_mp == NULL && (flags & PGO_CLEANIT) != 0) {
		if (pagedaemon) {
			/* Pagedaemon must not sleep here. */
			trans_mp = vp->v_mount;
			error = fstrans_start_nowait(trans_mp);
			if (error) {
				rw_exit(slock);
				return error;
			}
		} else {
			/*
			 * Cannot use vdeadcheck() here as this operation
			 * usually gets used from VOP_RECLAIM().  Test for
			 * change of v_mount instead and retry on change.
			 */
			rw_exit(slock);
			trans_mp = vp->v_mount;
			fstrans_start(trans_mp);
			if (vp->v_mount != trans_mp) {
				fstrans_done(trans_mp);
				trans_mp = NULL;
			} else {
				holds_wapbl = (trans_mp->mnt_wapbl &&
				    (origflags & PGO_JOURNALLOCKED) == 0);
				if (holds_wapbl) {
					error = WAPBL_BEGIN(trans_mp);
					if (error) {
						fstrans_done(trans_mp);
						return error;
					}
				}
			}
			rw_enter(slock, RW_WRITER);
			goto retry;
		}
	}

	error = 0;
	wasclean = uvm_obj_nowriteback_p(uobj);
	nextoff = startoff;
	if (endoff == 0 || flags & PGO_ALLPAGES) {
		endoff = trunc_page(LLONG_MAX);
	}

	/*
	 * if this vnode is known not to have dirty pages,
	 * don't bother to clean it out.
	 */

	if (nodirty) {
		/* We handled the dirtyonly && nodirty case above.  */
		KASSERT(!dirtyonly);
		flags &= ~PGO_CLEANIT;
	}

	/*
	 * start the loop to scan pages.
	 */

	cleanall = true;
	freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED;
	uvm_page_array_init(&a, uobj, dirtyonly ? (UVM_PAGE_ARRAY_FILL_DIRTY |
	    (!async ? UVM_PAGE_ARRAY_FILL_WRITEBACK : 0)) : 0);
	for (;;) {
		bool pgprotected;

		/*
		 * if !dirtyonly, iterate over all resident pages in the range.
		 *
		 * if dirtyonly, only possibly dirty pages are interesting.
		 * however, if we are asked to sync for integrity, we should
		 * wait on pages being written back by other threads as well.
		 */

		pg = uvm_page_array_fill_and_peek(&a, nextoff, 0);
		if (pg == NULL) {
			break;
		}

		KASSERT(pg->uobject == uobj);
		KASSERT((pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
		    (pg->flags & (PG_BUSY)) != 0);
		KASSERT(pg->offset >= startoff);
		KASSERT(pg->offset >= nextoff);
		KASSERT(!dirtyonly ||
		    uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN ||
		    uvm_obj_page_writeback_p(pg));

		if (pg->offset >= endoff) {
			break;
		}

		/*
		 * a preempt point.
		 */

		if (preempt_needed()) {
			nextoff = pg->offset; /* visit this page again */
			rw_exit(slock);
			preempt();
			/*
			 * as we dropped the object lock, our cached pages can
			 * be stale.
			 */
			uvm_page_array_clear(&a);
			rw_enter(slock, RW_WRITER);
			continue;
		}

		/*
		 * if the current page is busy, wait for it to become unbusy.
		 */

		if ((pg->flags & PG_BUSY) != 0) {
			UVMHIST_LOG(ubchist, "busy %#jx", (uintptr_t)pg,
			    0, 0, 0);
			if ((pg->flags & (PG_RELEASED|PG_PAGEOUT)) != 0
			    && (flags & PGO_BUSYFAIL) != 0) {
				UVMHIST_LOG(ubchist, "busyfail %#jx",
				    (uintptr_t)pg, 0, 0, 0);
				error = EDEADLK;
				if (busypg != NULL)
					*busypg = pg;
				break;
			}
			if (pagedaemon) {
				/*
				 * someone has taken the page while we
				 * dropped the lock for fstrans_start.
				 */
				break;
			}
			/*
			 * don't bother to wait on other's activities
			 * unless we are asked to sync for integrity.
			 */
			if (!async && (flags & PGO_RECLAIM) == 0) {
				wasclean = false;
				nextoff = pg->offset + PAGE_SIZE;
				uvm_page_array_advance(&a);
				continue;
			}
			nextoff = pg->offset; /* visit this page again */
			uvm_pagewait(pg, slock, "genput");
			/*
			 * as we dropped the object lock, our cached pages can
			 * be stale.
			 */
			uvm_page_array_clear(&a);
			rw_enter(slock, RW_WRITER);
			continue;
		}

		nextoff = pg->offset + PAGE_SIZE;
		uvm_page_array_advance(&a);

		/*
		 * if we're freeing, remove all mappings of the page now.
		 * if we're cleaning, check if the page needs to be cleaned.
		 */

		pgprotected = false;
		if (flags & PGO_FREE) {
			pmap_page_protect(pg, VM_PROT_NONE);
			pgprotected = true;
		} else if (flags & PGO_CLEANIT) {

			/*
			 * if we still have some hope to pull this vnode off
			 * from the syncer queue, write-protect the page.
			 */

			if (cleanall && wasclean) {

				/*
				 * uobj pages get wired only by uvm_fault
				 * where uobj is locked.
				 */

				if (pg->wire_count == 0) {
					pmap_page_protect(pg,
					    VM_PROT_READ|VM_PROT_EXECUTE);
					pgprotected = true;
				} else {
					cleanall = false;
				}
			}
		}

		if (flags & PGO_CLEANIT) {
			needs_clean = uvm_pagecheckdirty(pg, pgprotected);
		} else {
			needs_clean = false;
		}

		/*
		 * if we're cleaning, build a cluster.
		 * the cluster will consist of pages which are currently dirty.
		 * if not cleaning, just operate on the one page.
		 */

		if (needs_clean) {
			wasclean = false;
			memset(pgs, 0, sizeof(pgs));
			pg->flags |= PG_BUSY;
			UVM_PAGE_OWN(pg, "genfs_putpages");

			/*
			 * let the fs constrain the offset range of the cluster.
			 * we additionally constrain the range here such that
			 * it fits in the "pgs" pages array.
			 */

			off_t fslo, fshi, genlo, lo, off = pg->offset;
			GOP_PUTRANGE(vp, off, &fslo, &fshi);
			KASSERT(fslo == trunc_page(fslo));
			KASSERT(fslo <= off);
			KASSERT(fshi == trunc_page(fshi));
			KASSERT(fshi == 0 || off < fshi);

			if (off > MAXPHYS / 2)
				genlo = trunc_page(off - (MAXPHYS / 2));
			else
				genlo = 0;
			lo = MAX(fslo, genlo);

			/*
			 * first look backward.
			 */

			npages = (off - lo) >> PAGE_SHIFT;
			nback = npages;
			uvn_findpages(uobj, off - PAGE_SIZE, &nback,
			    &pgs[0], NULL,
			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
			if (nback) {
				memmove(&pgs[0], &pgs[npages - nback],
				    nback * sizeof(pgs[0]));
				if (npages - nback < nback)
					memset(&pgs[nback], 0,
					    (npages - nback) * sizeof(pgs[0]));
				else
					memset(&pgs[npages - nback], 0,
					    nback * sizeof(pgs[0]));
			}
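			/*
			 * the backward scan deposited the nback pages it
			 * found at the tail of the npages slots; the
			 * memmove shifts them to the front, and the memset
			 * clears exactly the source slots the memmove did
			 * not overwrite, so no stale pointers remain.
			 */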

			/*
			 * then plug in our page of interest.
			 */

			pgs[nback] = pg;

			/*
			 * then look forward to fill in the remaining space in
			 * the array of pages.
			 *
			 * pass our cached array of pages so that hopefully
			 * uvn_findpages can find some good pages in it.
			 * the array a was filled above with one of the
			 * following sets of flags:
			 *	0
			 *	UVM_PAGE_ARRAY_FILL_DIRTY
			 *	UVM_PAGE_ARRAY_FILL_DIRTY|WRITEBACK
			 *
			 * XXX this is fragile but it'll work: the array
			 * was earlier filled sparsely, but UFP_DIRTYONLY
			 * implies dense.  see corresponding comment in
			 * uvn_findpages().
			 */

			npages = MAXPAGES - nback - 1;
			if (fshi)
				npages = MIN(npages,
					     (fshi - off - 1) >> PAGE_SHIFT);
			uvn_findpages(uobj, off + PAGE_SIZE, &npages,
			    &pgs[nback + 1], &a,
			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
			npages += nback + 1;
		} else {
			pgs[0] = pg;
			npages = 1;
			nback = 0;
		}

		/*
		 * apply FREE or DEACTIVATE options if requested.
		 */

		for (i = 0; i < npages; i++) {
			tpg = pgs[i];
			KASSERT(tpg->uobject == uobj);
			KASSERT(i == 0 ||
			    pgs[i-1]->offset + PAGE_SIZE == tpg->offset);
			KASSERT(!needs_clean || uvm_pagegetdirty(pgs[i]) !=
			    UVM_PAGE_STATUS_DIRTY);
			if (needs_clean) {
				/*
				 * mark pages as WRITEBACK so that concurrent
				 * fsync can find and wait for our activities.
				 */
				uvm_obj_page_set_writeback(pgs[i]);
			}
			if (tpg->offset < startoff || tpg->offset >= endoff)
				continue;
			if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) {
				uvm_pagelock(tpg);
				uvm_pagedeactivate(tpg);
				uvm_pageunlock(tpg);
			} else if (flags & PGO_FREE) {
				pmap_page_protect(tpg, VM_PROT_NONE);
				if (tpg->flags & PG_BUSY) {
					tpg->flags |= freeflag;
					if (pagedaemon) {
						uvm_pageout_start(1);
						uvm_pagelock(tpg);
						uvm_pagedequeue(tpg);
						uvm_pageunlock(tpg);
					}
				} else {

					/*
					 * ``page is not busy''
					 * implies that npages is 1
					 * and needs_clean is false.
					 */

					KASSERT(npages == 1);
					KASSERT(!needs_clean);
					KASSERT(pg == tpg);
					KASSERT(nextoff ==
					    tpg->offset + PAGE_SIZE);
					uvm_pagefree(tpg);
					if (pagedaemon)
						uvmexp.pdfreed++;
				}
			}
		}
		if (needs_clean) {
			modified = true;
			KASSERT(nextoff == pg->offset + PAGE_SIZE);
			KASSERT(nback < npages);
			nextoff = pg->offset + ((npages - nback) << PAGE_SHIFT);
			KASSERT(pgs[nback] == pg);
			KASSERT(nextoff == pgs[npages - 1]->offset + PAGE_SIZE);

			/*
			 * start the i/o.
			 */
			rw_exit(slock);
			error = GOP_WRITE(vp, pgs, npages, flags);
			/*
			 * as we dropped the object lock, our cached pages can
			 * be stale.
			 */
			uvm_page_array_clear(&a);
			rw_enter(slock, RW_WRITER);
			if (error) {
				break;
			}
		}
	}
	uvm_page_array_fini(&a);

130605a3457eSad 	/*
130705a3457eSad 	 * update ctime/mtime if the modification we started writing out might
130805a3457eSad 	 * be from an mmap'ed write.
130905a3457eSad 	 *
131005a3457eSad 	 * this is necessary when an application keeps a file mmap'ed and
131105a3457eSad 	 * repeatedly modifies it via the mapping.  note that, because we
131205a3457eSad 	 * don't always write-protect pages when cleaning, such modifications
131305a3457eSad 	 * might not involve any page faults.
131405a3457eSad 	 */
131505a3457eSad 
1316d2a0ebb6Sad 	mutex_enter(vp->v_interlock);
1317da3ef92bSad 	if (modified && (vp->v_iflag & VI_WRMAP) != 0 &&
1318735dd21eSpooka 	    (vp->v_type != VBLK ||
1319735dd21eSpooka 	    (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
1320735dd21eSpooka 		GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
1321735dd21eSpooka 	}
1322735dd21eSpooka 
1323735dd21eSpooka 	/*
132405a3457eSad 	 * if we no longer have any possibly dirty pages, take us off the
132505a3457eSad 	 * syncer list.
1326735dd21eSpooka 	 */
1327735dd21eSpooka 
132819303cecSchs 	if ((vp->v_iflag & VI_ONWORKLST) != 0 && uvm_obj_clean_p(uobj) &&
132919303cecSchs 	    LIST_EMPTY(&vp->v_dirtyblkhd)) {
1330735dd21eSpooka 		vn_syncer_remove_from_worklist(vp);
1331735dd21eSpooka 	}
1332735dd21eSpooka 
13334a780c9aSad 	/* Wait for output to complete. */
1334d2a0ebb6Sad 	rw_exit(slock);
13354a780c9aSad 	if (!wasclean && !async && vp->v_numoutput != 0) {
13364a780c9aSad 		while (vp->v_numoutput != 0)
1337d2a0ebb6Sad 			cv_wait(&vp->v_cv, vp->v_interlock);
1338735dd21eSpooka 	}
133936c701bcSyamt 	onworklst = (vp->v_iflag & VI_ONWORKLST) != 0;
1340d2a0ebb6Sad 	mutex_exit(vp->v_interlock);
1341735dd21eSpooka 
134236c701bcSyamt 	if ((flags & PGO_RECLAIM) != 0 && onworklst) {
134336c701bcSyamt 		/*
134436c701bcSyamt 		 * in the case of PGO_RECLAIM, make sure the vnode ends up clean.
134536c701bcSyamt 		 * retrying is not a big deal because, in many cases,
134636c701bcSyamt 		 * uobj->uo_npages is already 0 here.
134736c701bcSyamt 		 */
1348d2a0ebb6Sad 		rw_enter(slock, RW_WRITER);
134936c701bcSyamt 		goto retry;
135036c701bcSyamt 	}
135136c701bcSyamt 
135206a21e4cShannken 	if (trans_mp) {
135306a21e4cShannken 		if (holds_wapbl)
135406a21e4cShannken 			WAPBL_END(trans_mp);
135506a21e4cShannken 		fstrans_done(trans_mp);
135644f3404fShannken 	}
13570789b071Shannken 
1358735dd21eSpooka 	return (error);
1359735dd21eSpooka }
1360735dd21eSpooka 
1361e406c140Schs /*
1362e406c140Schs  * Default putrange method for file systems that do not care
1363e406c140Schs  * how many pages are given to one GOP_WRITE() call.
1364e406c140Schs  */
1365e406c140Schs void
1366e406c140Schs genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
1367e406c140Schs {
1368e406c140Schs 
1369e406c140Schs 	*lop = 0;
1370e406c140Schs 	*hip = 0;
1371e406c140Schs }
1372e406c140Schs 
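/*
 * Illustrative sketch (not from this file): a file system that wants
 * each GOP_WRITE() confined to the fs block containing "off" could
 * supply a putrange method along these lines.  "xyzfs" is a
 * hypothetical name and the block-sized window is an assumption.
 */
#if 0	/* example only */
static void
xyzfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
{
	const off_t bsize = (off_t)1 << vp->v_mount->mnt_fs_bshift;

	/* bound the pages handed to one GOP_WRITE() by the fs block */
	*lop = off & ~(bsize - 1);
	*hip = *lop + bsize;
}
#endif
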
1373735dd21eSpooka int
1374735dd21eSpooka genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
1375735dd21eSpooka {
1376735dd21eSpooka 	off_t off;
1377735dd21eSpooka 	vaddr_t kva;
1378735dd21eSpooka 	size_t len;
1379735dd21eSpooka 	int error;
1380735dd21eSpooka 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1381735dd21eSpooka 
1382cb32a134Spgoyette 	UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
1383cb32a134Spgoyette 	    (uintptr_t)vp, (uintptr_t)pgs, npages, flags);
1384735dd21eSpooka 
1385735dd21eSpooka 	off = pgs[0]->offset;
1386735dd21eSpooka 	kva = uvm_pagermapin(pgs, npages,
1387735dd21eSpooka 	    UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
1388735dd21eSpooka 	len = npages << PAGE_SHIFT;
1389735dd21eSpooka 
1390735dd21eSpooka 	error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
13915232c510Schs 			    uvm_aio_aiodone);
1392735dd21eSpooka 
1393735dd21eSpooka 	return error;
1394735dd21eSpooka }
1395735dd21eSpooka 
1396881d12e6Sad /*
1397881d12e6Sad  * genfs_gop_write_rwmap:
1398881d12e6Sad  *
1399881d12e6Sad  * a variant of genfs_gop_write.  it's used by UDF for its directory buffers.
1400881d12e6Sad  * this maps pages with PROT_WRITE so that VOP_STRATEGY can modify
1401881d12e6Sad  * the contents before writing them out to the underlying storage.
1402881d12e6Sad  */
1403881d12e6Sad 
1404e979c658Sreinoud int
1405881d12e6Sad genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages,
1406881d12e6Sad     int flags)
1407e979c658Sreinoud {
1408e979c658Sreinoud 	off_t off;
1409e979c658Sreinoud 	vaddr_t kva;
1410e979c658Sreinoud 	size_t len;
1411e979c658Sreinoud 	int error;
1412e979c658Sreinoud 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1413e979c658Sreinoud 
1414cb32a134Spgoyette 	UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
1415cb32a134Spgoyette 	    (uintptr_t)vp, (uintptr_t)pgs, npages, flags);
1416e979c658Sreinoud 
1417e979c658Sreinoud 	off = pgs[0]->offset;
1418e979c658Sreinoud 	kva = uvm_pagermapin(pgs, npages,
1419e979c658Sreinoud 	    UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
1420e979c658Sreinoud 	len = npages << PAGE_SHIFT;
1421e979c658Sreinoud 
1422e979c658Sreinoud 	error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
14235232c510Schs 			    uvm_aio_aiodone);
1424e979c658Sreinoud 
1425e979c658Sreinoud 	return error;
1426e979c658Sreinoud }
1427e979c658Sreinoud 
1428735dd21eSpooka /*
1429735dd21eSpooka  * Backend routine for doing I/O to vnode pages.  Pages are already locked
1430735dd21eSpooka  * and mapped into kernel memory.  Here we just look up the underlying
1431735dd21eSpooka  * device block addresses and call the strategy routine.
1432735dd21eSpooka  */
1433735dd21eSpooka 
1434735dd21eSpooka static int
1435735dd21eSpooka genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags,
1436735dd21eSpooka     enum uio_rw rw, void (*iodone)(struct buf *))
1437735dd21eSpooka {
14381b9d02ceSuebayasi 	int s, error;
1439735dd21eSpooka 	int fs_bshift, dev_bshift;
1440735dd21eSpooka 	off_t eof, offset, startoffset;
1441735dd21eSpooka 	size_t bytes, iobytes, skipbytes;
1442735dd21eSpooka 	struct buf *mbp, *bp;
144364cb3c88Suebayasi 	const bool async = (flags & PGO_SYNCIO) == 0;
14448306a9edSchs 	const bool lazy = (flags & PGO_LAZY) != 0;
144564cb3c88Suebayasi 	const bool iowrite = rw == UIO_WRITE;
144664cb3c88Suebayasi 	const int brw = iowrite ? B_WRITE : B_READ;
1447735dd21eSpooka 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1448735dd21eSpooka 
1449cb32a134Spgoyette 	UVMHIST_LOG(ubchist, "vp %#jx kva %#jx len 0x%jx flags 0x%jx",
1450cb32a134Spgoyette 	    (uintptr_t)vp, (uintptr_t)kva, len, flags);
1451735dd21eSpooka 
1452*fda613dfSriastradh 	KASSERT(vp->v_size != VSIZENOTSET);
1453*fda613dfSriastradh 	KASSERT(vp->v_writesize != VSIZENOTSET);
1454*fda613dfSriastradh 	KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
1455*fda613dfSriastradh 	    " v_size=0x%llx v_writesize=0x%llx", vp,
1456*fda613dfSriastradh 	    (unsigned long long)vp->v_size,
1457*fda613dfSriastradh 	    (unsigned long long)vp->v_writesize);
1458735dd21eSpooka 	GOP_SIZE(vp, vp->v_writesize, &eof, 0);
1459735dd21eSpooka 	if (vp->v_type != VBLK) {
1460735dd21eSpooka 		fs_bshift = vp->v_mount->mnt_fs_bshift;
1461735dd21eSpooka 		dev_bshift = vp->v_mount->mnt_dev_bshift;
1462735dd21eSpooka 	} else {
1463735dd21eSpooka 		fs_bshift = DEV_BSHIFT;
1464735dd21eSpooka 		dev_bshift = DEV_BSHIFT;
1465735dd21eSpooka 	}
1466735dd21eSpooka 	error = 0;
1467735dd21eSpooka 	startoffset = off;
1468735dd21eSpooka 	bytes = MIN(len, eof - startoffset);
1469735dd21eSpooka 	skipbytes = 0;
1470735dd21eSpooka 	KASSERT(bytes != 0);
1471735dd21eSpooka 
147264cb3c88Suebayasi 	if (iowrite) {
1473881d12e6Sad 		/*
1474881d12e6Sad 		 * why += 2?
1475881d12e6Sad 		 * 1 for biodone, 1 for uvm_aio_aiodone.
1476881d12e6Sad 		 */
1477e225b7bdSrmind 		mutex_enter(vp->v_interlock);
1478735dd21eSpooka 		vp->v_numoutput += 2;
1479e225b7bdSrmind 		mutex_exit(vp->v_interlock);
1480735dd21eSpooka 	}
14814a780c9aSad 	mbp = getiobuf(vp, true);
1482cb32a134Spgoyette 	UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
1483cb32a134Spgoyette 	    (uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
1484735dd21eSpooka 	mbp->b_bufsize = len;
1485735dd21eSpooka 	mbp->b_data = (void *)kva;
1486735dd21eSpooka 	mbp->b_resid = mbp->b_bcount = bytes;
148701f564d8Sad 	mbp->b_cflags |= BC_BUSY | BC_AGE;
14884a780c9aSad 	if (async) {
14894a780c9aSad 		mbp->b_flags = brw | B_ASYNC;
1490735dd21eSpooka 		mbp->b_iodone = iodone;
14914a780c9aSad 	} else {
14924a780c9aSad 		mbp->b_flags = brw;
14934a780c9aSad 		mbp->b_iodone = NULL;
14944a780c9aSad 	}
1495735dd21eSpooka 	if (curlwp == uvm.pagedaemon_lwp)
1496735dd21eSpooka 		BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
14978306a9edSchs 	else if (async || lazy)
1498735dd21eSpooka 		BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL);
1499735dd21eSpooka 	else
1500735dd21eSpooka 		BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
1501735dd21eSpooka 
1502735dd21eSpooka 	bp = NULL;
1503735dd21eSpooka 	for (offset = startoffset;
1504735dd21eSpooka 	    bytes > 0;
1505735dd21eSpooka 	    offset += iobytes, bytes -= iobytes) {
15061b9d02ceSuebayasi 		int run;
15071b9d02ceSuebayasi 		daddr_t lbn, blkno;
15081b9d02ceSuebayasi 		struct vnode *devvp;
15091b9d02ceSuebayasi 
15101b9d02ceSuebayasi 		/*
15111b9d02ceSuebayasi 		 * bmap the file to find the underlying device blkno and
15121b9d02ceSuebayasi 		 * how much we can transfer in one i/o.  if bmap returns an error,
15131b9d02ceSuebayasi 		 * skip the rest of the top-level i/o.
15141b9d02ceSuebayasi 		 */
15151b9d02ceSuebayasi 
1516735dd21eSpooka 		lbn = offset >> fs_bshift;
1517735dd21eSpooka 		error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
1518735dd21eSpooka 		if (error) {
1519bf748078Ssimonb 			UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
15201b9d02ceSuebayasi 			    lbn, error, 0, 0);
1521735dd21eSpooka 			skipbytes += bytes;
1522735dd21eSpooka 			bytes = 0;
15231b9d02ceSuebayasi 			goto loopdone;
1524735dd21eSpooka 		}
1525735dd21eSpooka 
15261b9d02ceSuebayasi 		/*
15271b9d02ceSuebayasi 		 * see how much can be transferred with this i/o:
15281b9d02ceSuebayasi 		 * limit it to the run of blocks that VOP_BMAP said
15291b9d02ceSuebayasi 		 * are contiguous on the device, and to the bytes left.
15301b9d02ceSuebayasi 		 */
15311b9d02ceSuebayasi 
1532735dd21eSpooka 		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
1533735dd21eSpooka 		    bytes);
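		/*
		 * e.g. with fs_bshift 13 (8KB blocks), offset 0x3000
		 * (lbn 1) and run 2, the i/o may extend to byte
		 * (1 + 1 + 2) << 13 = 0x8000, so iobytes is capped
		 * at 0x8000 - 0x3000 = 0x5000.  (illustrative figures)
		 */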
15341b9d02ceSuebayasi 
15351b9d02ceSuebayasi 		/*
15361b9d02ceSuebayasi 		 * if this block isn't allocated, skip it: for a read,
15371b9d02ceSuebayasi 		 * zero the corresponding part of the buffer instead
15381b9d02ceSuebayasi 		 * of reading it.
15391b9d02ceSuebayasi 		 */
15401b9d02ceSuebayasi 
1541735dd21eSpooka 		if (blkno == (daddr_t)-1) {
154264cb3c88Suebayasi 			if (!iowrite) {
1543735dd21eSpooka 				memset((char *)kva + (offset - startoffset), 0,
1544735dd21eSpooka 				    iobytes);
1545735dd21eSpooka 			}
1546735dd21eSpooka 			skipbytes += iobytes;
1547735dd21eSpooka 			continue;
1548735dd21eSpooka 		}
1549735dd21eSpooka 
15501b9d02ceSuebayasi 		/*
15511b9d02ceSuebayasi 		 * allocate a sub-buf for this piece of the i/o
15521b9d02ceSuebayasi 		 * (or just use mbp if there's only 1 piece),
15531b9d02ceSuebayasi 		 * and start it going.
15541b9d02ceSuebayasi 		 */
15551b9d02ceSuebayasi 
1556735dd21eSpooka 		if (offset == startoffset && iobytes == bytes) {
1557735dd21eSpooka 			bp = mbp;
1558735dd21eSpooka 		} else {
1559cb32a134Spgoyette 			UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
1560cb32a134Spgoyette 			    (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
15614a780c9aSad 			bp = getiobuf(vp, true);
1562735dd21eSpooka 			nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
1563735dd21eSpooka 		}
1564735dd21eSpooka 		bp->b_lblkno = 0;
1565735dd21eSpooka 
1566735dd21eSpooka 		/* adjust physical blkno for partial blocks */
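		/*
		 * e.g. with fs_bshift 13 and dev_bshift 9, an offset
		 * 4KB into the fs block advances the physical blkno by
		 * 4096 >> 9 = 8 device blocks.  (illustrative figures)
		 */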
1567735dd21eSpooka 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
1568735dd21eSpooka 		    dev_bshift);
15691b9d02ceSuebayasi 
1570735dd21eSpooka 		UVMHIST_LOG(ubchist,
1571cb32a134Spgoyette 		    "bp %#jx offset 0x%jx bcount 0x%jx blkno 0x%jx",
1572cb32a134Spgoyette 		    (uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
1573735dd21eSpooka 
1574735dd21eSpooka 		VOP_STRATEGY(devvp, bp);
1575735dd21eSpooka 	}
15761b9d02ceSuebayasi 
15771b9d02ceSuebayasi loopdone:
1578735dd21eSpooka 	if (skipbytes) {
1579cb32a134Spgoyette 		UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
1580735dd21eSpooka 	}
1581735dd21eSpooka 	nestiobuf_done(mbp, skipbytes, error);
1582735dd21eSpooka 	if (async) {
1583735dd21eSpooka 		UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
1584735dd21eSpooka 		return (0);
1585735dd21eSpooka 	}
1586cb32a134Spgoyette 	UVMHIST_LOG(ubchist, "waiting for mbp %#jx", (uintptr_t)mbp, 0, 0, 0);
1587735dd21eSpooka 	error = biowait(mbp);
1588735dd21eSpooka 	s = splbio();
1589735dd21eSpooka 	(*iodone)(mbp);
1590735dd21eSpooka 	splx(s);
1591cb32a134Spgoyette 	UVMHIST_LOG(ubchist, "returning, error %jd", error, 0, 0, 0);
1592735dd21eSpooka 	return (error);
1593735dd21eSpooka }
1594735dd21eSpooka 
1595735dd21eSpooka int
1596735dd21eSpooka genfs_compat_getpages(void *v)
1597735dd21eSpooka {
1598735dd21eSpooka 	struct vop_getpages_args /* {
1599735dd21eSpooka 		struct vnode *a_vp;
1600735dd21eSpooka 		voff_t a_offset;
1601735dd21eSpooka 		struct vm_page **a_m;
1602735dd21eSpooka 		int *a_count;
1603735dd21eSpooka 		int a_centeridx;
1604735dd21eSpooka 		vm_prot_t a_access_type;
1605735dd21eSpooka 		int a_advice;
1606735dd21eSpooka 		int a_flags;
1607735dd21eSpooka 	} */ *ap = v;
1608735dd21eSpooka 
1609735dd21eSpooka 	off_t origoffset;
1610735dd21eSpooka 	struct vnode *vp = ap->a_vp;
1611735dd21eSpooka 	struct uvm_object *uobj = &vp->v_uobj;
1612735dd21eSpooka 	struct vm_page *pg, **pgs;
1613735dd21eSpooka 	vaddr_t kva;
1614735dd21eSpooka 	int i, error, orignpages, npages;
1615735dd21eSpooka 	struct iovec iov;
1616735dd21eSpooka 	struct uio uio;
1617735dd21eSpooka 	kauth_cred_t cred = curlwp->l_cred;
161864cb3c88Suebayasi 	const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
1619735dd21eSpooka 
1620735dd21eSpooka 	error = 0;
1621735dd21eSpooka 	origoffset = ap->a_offset;
1622735dd21eSpooka 	orignpages = *ap->a_count;
1623735dd21eSpooka 	pgs = ap->a_m;
1624735dd21eSpooka 
1625735dd21eSpooka 	if (ap->a_flags & PGO_LOCKED) {
162605a3457eSad 		uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, NULL,
162764cb3c88Suebayasi 		    UFP_NOWAIT|UFP_NOALLOC|(memwrite ? UFP_NORDONLY : 0));
1628735dd21eSpooka 
1629e15697fcSchs 		error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
1630e15697fcSchs 		return error;
1631735dd21eSpooka 	}
1632735dd21eSpooka 	if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) {
1633d2a0ebb6Sad 		rw_exit(uobj->vmobjlock);
1634e15697fcSchs 		return EINVAL;
1635735dd21eSpooka 	}
1636735dd21eSpooka 	if ((ap->a_flags & PGO_SYNCIO) == 0) {
1637d2a0ebb6Sad 		rw_exit(uobj->vmobjlock);
1638735dd21eSpooka 		return 0;
1639735dd21eSpooka 	}
1640735dd21eSpooka 	npages = orignpages;
164105a3457eSad 	uvn_findpages(uobj, origoffset, &npages, pgs, NULL, UFP_ALL);
1642d2a0ebb6Sad 	rw_exit(uobj->vmobjlock);
1643735dd21eSpooka 	kva = uvm_pagermapin(pgs, npages,
1644735dd21eSpooka 	    UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
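	/*
	 * only pages still marked PG_FAKE lack valid data; read each
	 * of those in with VOP_READ(), one page at a time.
	 */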
1645735dd21eSpooka 	for (i = 0; i < npages; i++) {
1646735dd21eSpooka 		pg = pgs[i];
1647735dd21eSpooka 		if ((pg->flags & PG_FAKE) == 0) {
1648735dd21eSpooka 			continue;
1649735dd21eSpooka 		}
1650735dd21eSpooka 		iov.iov_base = (char *)kva + (i << PAGE_SHIFT);
1651735dd21eSpooka 		iov.iov_len = PAGE_SIZE;
1652735dd21eSpooka 		uio.uio_iov = &iov;
1653735dd21eSpooka 		uio.uio_iovcnt = 1;
1654735dd21eSpooka 		uio.uio_offset = origoffset + (i << PAGE_SHIFT);
1655735dd21eSpooka 		uio.uio_rw = UIO_READ;
1656735dd21eSpooka 		uio.uio_resid = PAGE_SIZE;
1657735dd21eSpooka 		UIO_SETUP_SYSSPACE(&uio);
1658735dd21eSpooka 		/* XXX vn_lock */
1659735dd21eSpooka 		error = VOP_READ(vp, &uio, 0, cred);
1660735dd21eSpooka 		if (error) {
1661735dd21eSpooka 			break;
1662735dd21eSpooka 		}
1663735dd21eSpooka 		if (uio.uio_resid) {
1664735dd21eSpooka 			memset(iov.iov_base, 0, uio.uio_resid);
1665735dd21eSpooka 		}
1666735dd21eSpooka 	}
1667735dd21eSpooka 	uvm_pagermapout(kva, npages);
1668d2a0ebb6Sad 	rw_enter(uobj->vmobjlock, RW_WRITER);
1669735dd21eSpooka 	for (i = 0; i < npages; i++) {
1670735dd21eSpooka 		pg = pgs[i];
1671735dd21eSpooka 		if (error && (pg->flags & PG_FAKE) != 0) {
1672735dd21eSpooka 			pg->flags |= PG_RELEASED;
1673735dd21eSpooka 		} else {
167405a3457eSad 			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
167594843b13Sad 			uvm_pagelock(pg);
1676735dd21eSpooka 			uvm_pageactivate(pg);
167794843b13Sad 			uvm_pageunlock(pg);
1678735dd21eSpooka 		}
1679735dd21eSpooka 	}
1680735dd21eSpooka 	if (error) {
1681735dd21eSpooka 		uvm_page_unbusy(pgs, npages);
1682735dd21eSpooka 	}
1683d2a0ebb6Sad 	rw_exit(uobj->vmobjlock);
1684e15697fcSchs 	return error;
1685735dd21eSpooka }
1686735dd21eSpooka 
1687735dd21eSpooka int
1688735dd21eSpooka genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
1689735dd21eSpooka     int flags)
1690735dd21eSpooka {
1691735dd21eSpooka 	off_t offset;
1692735dd21eSpooka 	struct iovec iov;
1693735dd21eSpooka 	struct uio uio;
1694735dd21eSpooka 	kauth_cred_t cred = curlwp->l_cred;
1695735dd21eSpooka 	struct buf *bp;
1696735dd21eSpooka 	vaddr_t kva;
16974a780c9aSad 	int error;
1698735dd21eSpooka 
1699735dd21eSpooka 	offset = pgs[0]->offset;
1700735dd21eSpooka 	kva = uvm_pagermapin(pgs, npages,
1701735dd21eSpooka 	    UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
1702735dd21eSpooka 
1703735dd21eSpooka 	iov.iov_base = (void *)kva;
1704735dd21eSpooka 	iov.iov_len = npages << PAGE_SHIFT;
1705735dd21eSpooka 	uio.uio_iov = &iov;
1706735dd21eSpooka 	uio.uio_iovcnt = 1;
1707735dd21eSpooka 	uio.uio_offset = offset;
1708735dd21eSpooka 	uio.uio_rw = UIO_WRITE;
1709735dd21eSpooka 	uio.uio_resid = npages << PAGE_SHIFT;
1710735dd21eSpooka 	UIO_SETUP_SYSSPACE(&uio);
1711735dd21eSpooka 	/* XXX vn_lock */
1712735dd21eSpooka 	error = VOP_WRITE(vp, &uio, 0, cred);
1713735dd21eSpooka 
1714e225b7bdSrmind 	mutex_enter(vp->v_interlock);
17154a780c9aSad 	vp->v_numoutput++;
1716e225b7bdSrmind 	mutex_exit(vp->v_interlock);
1717735dd21eSpooka 
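	/*
	 * the data has already been written by VOP_WRITE() above;
	 * fabricate an already-completed buf so that uvm_aio_aiodone()
	 * can do the usual page cleanup and v_numoutput accounting.
	 */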
17184a780c9aSad 	bp = getiobuf(vp, true);
171901f564d8Sad 	bp->b_cflags |= BC_BUSY | BC_AGE;
1720735dd21eSpooka 	bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift;
1721735dd21eSpooka 	bp->b_data = (char *)kva;
1722735dd21eSpooka 	bp->b_bcount = npages << PAGE_SHIFT;
1723735dd21eSpooka 	bp->b_bufsize = npages << PAGE_SHIFT;
1724735dd21eSpooka 	bp->b_resid = 0;
1725735dd21eSpooka 	bp->b_error = error;
1726735dd21eSpooka 	uvm_aio_aiodone(bp);
1727735dd21eSpooka 	return (error);
1728735dd21eSpooka }
1729735dd21eSpooka 
1730735dd21eSpooka /*
1731735dd21eSpooka  * Process a uio using direct I/O.  If we reach a part of the request
1732735dd21eSpooka  * which cannot be processed in this fashion for some reason, just return.
1733735dd21eSpooka  * The caller must handle some additional part of the request using
1734735dd21eSpooka  * buffered I/O before trying direct I/O again.
1735735dd21eSpooka  */
1736735dd21eSpooka 
1737735dd21eSpooka void
1738735dd21eSpooka genfs_directio(struct vnode *vp, struct uio *uio, int ioflag)
1739735dd21eSpooka {
1740735dd21eSpooka 	struct vmspace *vs;
1741735dd21eSpooka 	struct iovec *iov;
1742735dd21eSpooka 	vaddr_t va;
1743735dd21eSpooka 	size_t len;
1744735dd21eSpooka 	const int mask = DEV_BSIZE - 1;
1745735dd21eSpooka 	int error;
1746f5bbefdbSjoerg 	bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl &&
1747f5bbefdbSjoerg 	    (ioflag & IO_JOURNALLOCKED) == 0);
1748735dd21eSpooka 
1749560071c2Sjdolecek #ifdef DIAGNOSTIC
1750560071c2Sjdolecek 	if ((ioflag & IO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
1751560071c2Sjdolecek 		WAPBL_JLOCK_ASSERT(vp->v_mount);
1752560071c2Sjdolecek #endif
1753560071c2Sjdolecek 
1754735dd21eSpooka 	/*
1755735dd21eSpooka 	 * We only support direct I/O to user space for now.
1756735dd21eSpooka 	 */
1757735dd21eSpooka 
1758735dd21eSpooka 	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
1759735dd21eSpooka 		return;
1760735dd21eSpooka 	}
1761735dd21eSpooka 
1762735dd21eSpooka 	/*
1763735dd21eSpooka 	 * If the vnode is mapped, we would need to get the getpages lock
1764735dd21eSpooka 	 * to stabilize the bmap, but then we would get into trouble while
1765735dd21eSpooka 	 * locking the pages if the pages belong to this same vnode (or a
1766735dd21eSpooka 	 * multi-vnode cascade to the same effect).  Just fall back to
1767735dd21eSpooka 	 * buffered I/O if the vnode is mapped to avoid this mess.
1768735dd21eSpooka 	 */
1769735dd21eSpooka 
1770735dd21eSpooka 	if (vp->v_vflag & VV_MAPPED) {
1771735dd21eSpooka 		return;
1772735dd21eSpooka 	}
1773735dd21eSpooka 
1774f5bbefdbSjoerg 	if (need_wapbl) {
1775ac6b1617Shannken 		error = WAPBL_BEGIN(vp->v_mount);
1776ac6b1617Shannken 		if (error)
1777ac6b1617Shannken 			return;
1778ac6b1617Shannken 	}
1779ac6b1617Shannken 
1780735dd21eSpooka 	/*
1781735dd21eSpooka 	 * Do as much of the uio as possible with direct I/O.
1782735dd21eSpooka 	 */
1783735dd21eSpooka 
1784735dd21eSpooka 	vs = uio->uio_vmspace;
1785735dd21eSpooka 	while (uio->uio_resid) {
1786735dd21eSpooka 		iov = uio->uio_iov;
1787735dd21eSpooka 		if (iov->iov_len == 0) {
1788735dd21eSpooka 			uio->uio_iov++;
1789735dd21eSpooka 			uio->uio_iovcnt--;
1790735dd21eSpooka 			continue;
1791735dd21eSpooka 		}
1792735dd21eSpooka 		va = (vaddr_t)iov->iov_base;
1793735dd21eSpooka 		len = MIN(iov->iov_len, genfs_maxdio);
1794735dd21eSpooka 		len &= ~mask;
1795735dd21eSpooka 
1796735dd21eSpooka 		/*
1797735dd21eSpooka 		 * If the next chunk is smaller than DEV_BSIZE or extends past
1798735dd21eSpooka 		 * the current EOF, then fall back to buffered I/O.
1799735dd21eSpooka 		 */
1800735dd21eSpooka 
1801735dd21eSpooka 		if (len == 0 || uio->uio_offset + len > vp->v_size) {
1802ac6b1617Shannken 			break;
1803735dd21eSpooka 		}
1804735dd21eSpooka 
1805735dd21eSpooka 		/*
1806735dd21eSpooka 		 * Check alignment.  The file offset must be at least
1807735dd21eSpooka 		 * sector-aligned.  The exact constraint on memory alignment
1808735dd21eSpooka 		 * is very hardware-dependent, but requiring sector-aligned
1809735dd21eSpooka 		 * addresses there too is safe.
1810735dd21eSpooka 		 */
1811735dd21eSpooka 
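		/*
		 * e.g. with DEV_BSIZE 512, mask is 0x1ff: a file offset
		 * of 8192 passes, while 8200 falls back to buffered i/o.
		 * (illustrative figures)
		 */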
1812735dd21eSpooka 		if (uio->uio_offset & mask || va & mask) {
1813ac6b1617Shannken 			break;
1814735dd21eSpooka 		}
1815735dd21eSpooka 		error = genfs_do_directio(vs, va, len, vp, uio->uio_offset,
1816735dd21eSpooka 					  uio->uio_rw);
1817735dd21eSpooka 		if (error) {
1818735dd21eSpooka 			break;
1819735dd21eSpooka 		}
1820735dd21eSpooka 		iov->iov_base = (char *)iov->iov_base + len;
1821735dd21eSpooka 		iov->iov_len -= len;
1822735dd21eSpooka 		uio->uio_offset += len;
1823735dd21eSpooka 		uio->uio_resid -= len;
1824735dd21eSpooka 	}
1825ac6b1617Shannken 
1826f5bbefdbSjoerg 	if (need_wapbl)
1827ac6b1617Shannken 		WAPBL_END(vp->v_mount);
1828735dd21eSpooka }
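
/*
 * Illustrative caller sketch (not from this file): a file system's
 * read/write vop can hand part of a request to genfs_directio() and
 * fall back to buffered i/o for whatever remains, roughly:
 *
 *	if ((ioflag & IO_DIRECT) != 0)
 *		genfs_directio(vp, uio, ioflag);
 *	if (uio->uio_resid == 0)
 *		return 0;
 *	... buffered path for the rest of the uio ...
 *
 * the IO_DIRECT guard shown here is an assumption; details vary per
 * file system.
 */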
1829735dd21eSpooka 
1830735dd21eSpooka /*
1831735dd21eSpooka  * Iodone routine for direct I/O.  We don't do much here since the request is
1832735dd21eSpooka  * always synchronous, so the caller will do most of the work after biowait().
1833735dd21eSpooka  */
1834735dd21eSpooka 
1835735dd21eSpooka static void
1836735dd21eSpooka genfs_dio_iodone(struct buf *bp)
1837735dd21eSpooka {
1838735dd21eSpooka 
1839735dd21eSpooka 	KASSERT((bp->b_flags & B_ASYNC) == 0);
18404a780c9aSad 	if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) {
18414a780c9aSad 		mutex_enter(bp->b_objlock);
1842735dd21eSpooka 		vwakeup(bp);
18434a780c9aSad 		mutex_exit(bp->b_objlock);
1844735dd21eSpooka 	}
1845735dd21eSpooka 	putiobuf(bp);
1846735dd21eSpooka }
1847735dd21eSpooka 
1848735dd21eSpooka /*
1849735dd21eSpooka  * Process one chunk of a direct I/O request.
1850735dd21eSpooka  */
1851735dd21eSpooka 
1852735dd21eSpooka static int
1853735dd21eSpooka genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp,
1854735dd21eSpooka     off_t off, enum uio_rw rw)
1855735dd21eSpooka {
1856735dd21eSpooka 	struct vm_map *map;
185769d5d9c0Smartin 	struct pmap *upm, *kpm __unused;
1858735dd21eSpooka 	size_t klen = round_page(uva + len) - trunc_page(uva);
1859735dd21eSpooka 	off_t spoff, epoff;
1860735dd21eSpooka 	vaddr_t kva, puva;
1861735dd21eSpooka 	paddr_t pa;
1862735dd21eSpooka 	vm_prot_t prot;
18636a2419feSmartin 	int error, rv __diagused, poff, koff;
1864ac6b1617Shannken 	const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED |
1865735dd21eSpooka 		(rw == UIO_WRITE ? PGO_FREE : 0);
1866735dd21eSpooka 
1867735dd21eSpooka 	/*
1868735dd21eSpooka 	 * For writes, verify that this range of the file already has fully
1869735dd21eSpooka 	 * allocated backing store.  If there are any holes, just punt and
1870735dd21eSpooka 	 * make the caller take the buffered write path.
1871735dd21eSpooka 	 */
1872735dd21eSpooka 
1873735dd21eSpooka 	if (rw == UIO_WRITE) {
1874735dd21eSpooka 		daddr_t lbn, elbn, blkno;
1875735dd21eSpooka 		int bsize, bshift, run;
1876735dd21eSpooka 
1877735dd21eSpooka 		bshift = vp->v_mount->mnt_fs_bshift;
1878735dd21eSpooka 		bsize = 1 << bshift;
1879735dd21eSpooka 		lbn = off >> bshift;
1880735dd21eSpooka 		elbn = (off + len + bsize - 1) >> bshift;
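		/*
		 * e.g. with bshift 13 (bsize 0x2000), off 0x1000 and
		 * len 0x3000: lbn 0, elbn 2, so blocks 0 and 1 must
		 * both be allocated.  (illustrative figures)
		 */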
1881735dd21eSpooka 		while (lbn < elbn) {
1882735dd21eSpooka 			error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
1883735dd21eSpooka 			if (error) {
1884735dd21eSpooka 				return error;
1885735dd21eSpooka 			}
1886735dd21eSpooka 			if (blkno == (daddr_t)-1) {
1887735dd21eSpooka 				return ENOSPC;
1888735dd21eSpooka 			}
1889735dd21eSpooka 			lbn += 1 + run;
1890735dd21eSpooka 		}
1891735dd21eSpooka 	}
1892735dd21eSpooka 
1893735dd21eSpooka 	/*
1894735dd21eSpooka 	 * Flush any cached pages for parts of the file that we're about to
1895735dd21eSpooka 	 * access.  If we're writing, invalidate pages as well.
1896735dd21eSpooka 	 */
1897735dd21eSpooka 
1898735dd21eSpooka 	spoff = trunc_page(off);
1899735dd21eSpooka 	epoff = round_page(off + len);
190013162282Sad 	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1901735dd21eSpooka 	error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags);
1902735dd21eSpooka 	if (error) {
1903735dd21eSpooka 		return error;
1904735dd21eSpooka 	}
1905735dd21eSpooka 
1906735dd21eSpooka 	/*
1907735dd21eSpooka 	 * Wire the user pages and remap them into kernel memory.
1908735dd21eSpooka 	 */
1909735dd21eSpooka 
1910735dd21eSpooka 	prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ;
1911735dd21eSpooka 	error = uvm_vslock(vs, (void *)uva, len, prot);
1912735dd21eSpooka 	if (error) {
1913735dd21eSpooka 		return error;
1914735dd21eSpooka 	}
1915735dd21eSpooka 
1916735dd21eSpooka 	map = &vs->vm_map;
1917735dd21eSpooka 	upm = vm_map_pmap(map);
1918735dd21eSpooka 	kpm = vm_map_pmap(kernel_map);
1919735dd21eSpooka 	puva = trunc_page(uva);
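	/*
	 * allocate kva whose cache color matches the user va
	 * (atop(puva) & uvmexp.colormask), so that virtually indexed
	 * caches see the same alias for both mappings.
	 */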
19202c6de4b4Smatt 	kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask,
19212c6de4b4Smatt 	    UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH);
1922735dd21eSpooka 	for (poff = 0; poff < klen; poff += PAGE_SIZE) {
1923735dd21eSpooka 		rv = pmap_extract(upm, puva + poff, &pa);
1924735dd21eSpooka 		KASSERT(rv);
19252c6de4b4Smatt 		pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED);
19268c8246dcSrmind 	}
1927735dd21eSpooka 	pmap_update(kpm);
1928735dd21eSpooka 
1929735dd21eSpooka 	/*
1930735dd21eSpooka 	 * Do the I/O.
1931735dd21eSpooka 	 */
1932735dd21eSpooka 
1933735dd21eSpooka 	koff = uva - trunc_page(uva);
1934735dd21eSpooka 	error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw,
1935735dd21eSpooka 			    genfs_dio_iodone);
1936735dd21eSpooka 
1937735dd21eSpooka 	/*
1938735dd21eSpooka 	 * Tear down the kernel mapping.
1939735dd21eSpooka 	 */
1940735dd21eSpooka 
19412c6de4b4Smatt 	pmap_kremove(kva, klen);
1942735dd21eSpooka 	pmap_update(kpm);
1943735dd21eSpooka 	uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY);
1944735dd21eSpooka 
1945735dd21eSpooka 	/*
1946735dd21eSpooka 	 * Unwire the user pages.
1947735dd21eSpooka 	 */
1948735dd21eSpooka 
1949735dd21eSpooka 	uvm_vsunlock(vs, (void *)uva, len);
1950735dd21eSpooka 	return error;
1951735dd21eSpooka }