1*fda613dfSriastradh /* $NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $ */
2735dd21eSpooka
3735dd21eSpooka /*
4735dd21eSpooka * Copyright (c) 1982, 1986, 1989, 1993
5735dd21eSpooka * The Regents of the University of California. All rights reserved.
6735dd21eSpooka *
7735dd21eSpooka * Redistribution and use in source and binary forms, with or without
8735dd21eSpooka * modification, are permitted provided that the following conditions
9735dd21eSpooka * are met:
10735dd21eSpooka * 1. Redistributions of source code must retain the above copyright
11735dd21eSpooka * notice, this list of conditions and the following disclaimer.
12735dd21eSpooka * 2. Redistributions in binary form must reproduce the above copyright
13735dd21eSpooka * notice, this list of conditions and the following disclaimer in the
14735dd21eSpooka * documentation and/or other materials provided with the distribution.
15735dd21eSpooka * 3. Neither the name of the University nor the names of its contributors
16735dd21eSpooka * may be used to endorse or promote products derived from this software
17735dd21eSpooka * without specific prior written permission.
18735dd21eSpooka *
19735dd21eSpooka * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20735dd21eSpooka * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21735dd21eSpooka * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22735dd21eSpooka * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23735dd21eSpooka * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24735dd21eSpooka * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25735dd21eSpooka * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26735dd21eSpooka * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27735dd21eSpooka * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28735dd21eSpooka * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29735dd21eSpooka * SUCH DAMAGE.
30735dd21eSpooka *
31735dd21eSpooka */
32735dd21eSpooka
33735dd21eSpooka #include <sys/cdefs.h>
34*fda613dfSriastradh __KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $");
35735dd21eSpooka
36735dd21eSpooka #include <sys/param.h>
37735dd21eSpooka #include <sys/systm.h>
38735dd21eSpooka #include <sys/proc.h>
39735dd21eSpooka #include <sys/kernel.h>
40735dd21eSpooka #include <sys/mount.h>
41735dd21eSpooka #include <sys/vnode.h>
42735dd21eSpooka #include <sys/kmem.h>
43735dd21eSpooka #include <sys/kauth.h>
44735dd21eSpooka #include <sys/fstrans.h>
45010ce493Spooka #include <sys/buf.h>
461d7848adSad #include <sys/atomic.h>
47735dd21eSpooka
48735dd21eSpooka #include <miscfs/genfs/genfs.h>
49735dd21eSpooka #include <miscfs/genfs/genfs_node.h>
50735dd21eSpooka #include <miscfs/specfs/specdev.h>
51735dd21eSpooka
52735dd21eSpooka #include <uvm/uvm.h>
53735dd21eSpooka #include <uvm/uvm_pager.h>
54881d12e6Sad #include <uvm/uvm_page_array.h>
55735dd21eSpooka
56735dd21eSpooka static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *,
57735dd21eSpooka off_t, enum uio_rw);
58735dd21eSpooka static void genfs_dio_iodone(struct buf *);
59735dd21eSpooka
602b81644cSriastradh static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t,
612b81644cSriastradh off_t, bool, bool, bool, bool);
62735dd21eSpooka static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw,
63735dd21eSpooka void (*)(struct buf *));
644f2ae943Syamt static void genfs_rel_pages(struct vm_page **, unsigned int);
65735dd21eSpooka
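/*
 * Upper bound on the number of bytes moved per direct I/O transfer;
 * the direct I/O path below splits larger requests into chunks of at
 * most this size.  Tunable.
 */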
66735dd21eSpooka int genfs_maxdio = MAXPHYS;
67735dd21eSpooka
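/*
 * genfs_rel_pages: unbusy the pages in the given array.  Pages that we
 * allocated ourselves (PG_FAKE) are marked PG_RELEASED first so that
 * uvm_page_unbusy() frees them.  The object lock must be held.
 */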
68e15697fcSchs static void
694f2ae943Syamt genfs_rel_pages(struct vm_page **pgs, unsigned int npages)
70735dd21eSpooka {
714f2ae943Syamt unsigned int i;
72735dd21eSpooka
73735dd21eSpooka for (i = 0; i < npages; i++) {
74735dd21eSpooka struct vm_page *pg = pgs[i];
75735dd21eSpooka
76735dd21eSpooka if (pg == NULL || pg == PGO_DONTCARE)
77735dd21eSpooka continue;
78d2a0ebb6Sad KASSERT(uvm_page_owner_locked_p(pg, true));
79735dd21eSpooka if (pg->flags & PG_FAKE) {
80735dd21eSpooka pg->flags |= PG_RELEASED;
81735dd21eSpooka }
82735dd21eSpooka }
83735dd21eSpooka uvm_page_unbusy(pgs, npages);
84735dd21eSpooka }
85735dd21eSpooka
86735dd21eSpooka /*
87735dd21eSpooka * generic VM getpages routine.
88735dd21eSpooka * Return PG_BUSY pages for the given range,
89735dd21eSpooka * reading from backing store if necessary.
90735dd21eSpooka */
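/*
 * Illustrative caller sketch (not part of this file): a synchronous,
 * single-page fetch in the style of the fault/ubc callers.  The names
 * "vp", "off" and "pg" are assumptions for the example only.
 *
 *	struct vm_page *pg = NULL;
 *	int npages = 1, error;
 *
 *	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
 *	error = VOP_GETPAGES(vp, trunc_page(off), &pg, &npages, 0,
 *	    VM_PROT_READ, 0, PGO_SYNCIO);
 *
 * On success the object lock has been dropped and pg is returned busy;
 * the caller is responsible for unbusying it when done.
 */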
91735dd21eSpooka
92735dd21eSpooka int
93735dd21eSpooka genfs_getpages(void *v)
94735dd21eSpooka {
95735dd21eSpooka struct vop_getpages_args /* {
96735dd21eSpooka struct vnode *a_vp;
97735dd21eSpooka voff_t a_offset;
98735dd21eSpooka struct vm_page **a_m;
99735dd21eSpooka int *a_count;
100735dd21eSpooka int a_centeridx;
101735dd21eSpooka vm_prot_t a_access_type;
102735dd21eSpooka int a_advice;
103735dd21eSpooka int a_flags;
1042372674cSuebayasi } */ * const ap = v;
105735dd21eSpooka
1066903a054Suebayasi off_t diskeof, memeof;
1071d7848adSad int i, error, npages, iflag;
1081907407bSyamt const int flags = ap->a_flags;
1092372674cSuebayasi struct vnode * const vp = ap->a_vp;
1102372674cSuebayasi struct uvm_object * const uobj = &vp->v_uobj;
1111907407bSyamt const bool async = (flags & PGO_SYNCIO) == 0;
11264cb3c88Suebayasi const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
1131907407bSyamt const bool overwrite = (flags & PGO_OVERWRITE) != 0;
11464cb3c88Suebayasi const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0;
1155f7e4301Sjdolecek const bool need_wapbl = (vp->v_mount->mnt_wapbl &&
1165f7e4301Sjdolecek (flags & PGO_JOURNALLOCKED) == 0);
117fca58884Schs const bool glocked = (flags & PGO_GLOCKHELD) != 0;
118f36a7657Shannken bool holds_wapbl = false;
119f36a7657Shannken struct mount *trans_mount = NULL;
120735dd21eSpooka UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
121735dd21eSpooka
122cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx/%jx count %jd",
123cb32a134Spgoyette (uintptr_t)vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
124735dd21eSpooka
12505a3457eSad KASSERT(memwrite >= overwrite);
126735dd21eSpooka KASSERT(vp->v_type == VREG || vp->v_type == VDIR ||
127735dd21eSpooka vp->v_type == VLNK || vp->v_type == VBLK);
128735dd21eSpooka
1291d7848adSad /*
1301d7848adSad * the object must be locked. it can only be a read lock when
131ff872804Sad * processing a read fault with PGO_LOCKED.
1321d7848adSad */
1331d7848adSad
1341d7848adSad KASSERT(rw_lock_held(uobj->vmobjlock));
1351d7848adSad KASSERT(rw_write_held(uobj->vmobjlock) ||
136ff872804Sad ((flags & PGO_LOCKED) != 0 && !memwrite));
1371d7848adSad
138560071c2Sjdolecek #ifdef DIAGNOSTIC
139560071c2Sjdolecek if ((flags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
140560071c2Sjdolecek WAPBL_JLOCK_ASSERT(vp->v_mount);
141560071c2Sjdolecek #endif
142560071c2Sjdolecek
1431d7848adSad /*
1441d7848adSad * check for reclaimed vnode. v_interlock is not held here, but
1451d7848adSad * VI_DEADCHECK is set with vmobjlock held.
1461d7848adSad */
1471d7848adSad
1481d7848adSad iflag = atomic_load_relaxed(&vp->v_iflag);
1491d7848adSad if (__predict_false((iflag & VI_DEADCHECK) != 0)) {
150d2a0ebb6Sad mutex_enter(vp->v_interlock);
151ad2fab45Shannken error = vdead_check(vp, VDEAD_NOWAIT);
152d2a0ebb6Sad mutex_exit(vp->v_interlock);
153ad2fab45Shannken if (error) {
154ad2fab45Shannken if ((flags & PGO_LOCKED) == 0)
155d2a0ebb6Sad rw_exit(uobj->vmobjlock);
156ad2fab45Shannken return error;
157ad2fab45Shannken }
1581d7848adSad }
159ad2fab45Shannken
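/*
 * Restart point: we branch back here, with the object still locked, if
 * v_size changed while the object lock was dropped (see the v_size
 * check after re-locking below).
 */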
160735dd21eSpooka startover:
161735dd21eSpooka error = 0;
162680e7444Suebayasi const voff_t origvsize = vp->v_size;
163680e7444Suebayasi const off_t origoffset = ap->a_offset;
164bb4b25cfSuebayasi const int orignpages = *ap->a_count;
165f4e16ac9Suebayasi
166735dd21eSpooka GOP_SIZE(vp, origvsize, &diskeof, 0);
167735dd21eSpooka if (flags & PGO_PASTEOF) {
1686903a054Suebayasi off_t newsize;
169735dd21eSpooka #if defined(DIAGNOSTIC)
170735dd21eSpooka off_t writeeof;
171735dd21eSpooka #endif /* defined(DIAGNOSTIC) */
172735dd21eSpooka
173735dd21eSpooka newsize = MAX(origvsize,
174735dd21eSpooka origoffset + (orignpages << PAGE_SHIFT));
175735dd21eSpooka GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM);
176735dd21eSpooka #if defined(DIAGNOSTIC)
177735dd21eSpooka GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM);
178735dd21eSpooka if (newsize > round_page(writeeof)) {
1796cd7b7a7Spooka panic("%s: past eof: %" PRId64 " vs. %" PRId64,
1806cd7b7a7Spooka __func__, newsize, round_page(writeeof));
181735dd21eSpooka }
182735dd21eSpooka #endif /* defined(DIAGNOSTIC) */
183735dd21eSpooka } else {
184735dd21eSpooka GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM);
185735dd21eSpooka }
186735dd21eSpooka KASSERT(ap->a_centeridx >= 0 && ap->a_centeridx <= orignpages);
187ab579ad8Sriastradh KASSERT((origoffset & (PAGE_SIZE - 1)) == 0);
188ab579ad8Sriastradh KASSERT(origoffset >= 0);
189735dd21eSpooka KASSERT(orignpages > 0);
190735dd21eSpooka
191735dd21eSpooka /*
192735dd21eSpooka * Bounds-check the request.
193735dd21eSpooka */
194735dd21eSpooka
195735dd21eSpooka if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
196735dd21eSpooka if ((flags & PGO_LOCKED) == 0) {
197d2a0ebb6Sad rw_exit(uobj->vmobjlock);
198735dd21eSpooka }
199cb32a134Spgoyette UVMHIST_LOG(ubchist, "off 0x%jx count %jd goes past EOF 0x%jx",
200735dd21eSpooka origoffset, *ap->a_count, memeof,0);
201735dd21eSpooka error = EINVAL;
202735dd21eSpooka goto out_err;
203735dd21eSpooka }
204735dd21eSpooka
205735dd21eSpooka /* uobj is locked */
206735dd21eSpooka
207735dd21eSpooka if ((flags & PGO_NOTIMESTAMP) == 0 &&
208735dd21eSpooka (vp->v_type != VBLK ||
209735dd21eSpooka (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
210735dd21eSpooka int updflags = 0;
211735dd21eSpooka
212735dd21eSpooka if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
213735dd21eSpooka updflags = GOP_UPDATE_ACCESSED;
214735dd21eSpooka }
21564cb3c88Suebayasi if (memwrite) {
216735dd21eSpooka updflags |= GOP_UPDATE_MODIFIED;
217735dd21eSpooka }
218735dd21eSpooka if (updflags != 0) {
219735dd21eSpooka GOP_MARKUPDATE(vp, updflags);
220735dd21eSpooka }
221735dd21eSpooka }
222735dd21eSpooka
223735dd21eSpooka /*
224735dd21eSpooka * For PGO_LOCKED requests, just return whatever's in memory.
225735dd21eSpooka */
226735dd21eSpooka
227735dd21eSpooka if (flags & PGO_LOCKED) {
228735dd21eSpooka int nfound;
2299fa66d7aSuebayasi struct vm_page *pg;
230735dd21eSpooka
231fca58884Schs KASSERT(!glocked);
232735dd21eSpooka npages = *ap->a_count;
233735dd21eSpooka #if defined(DEBUG)
234735dd21eSpooka for (i = 0; i < npages; i++) {
235735dd21eSpooka pg = ap->a_m[i];
236735dd21eSpooka KASSERT(pg == NULL || pg == PGO_DONTCARE);
237735dd21eSpooka }
238735dd21eSpooka #endif /* defined(DEBUG) */
239735dd21eSpooka nfound = uvn_findpages(uobj, origoffset, &npages,
24005a3457eSad ap->a_m, NULL,
241ff872804Sad UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
242ff872804Sad (memwrite ? UFP_NORDONLY : 0));
243735dd21eSpooka KASSERT(npages == *ap->a_count);
244735dd21eSpooka if (nfound == 0) {
245735dd21eSpooka error = EBUSY;
246735dd21eSpooka goto out_err;
247735dd21eSpooka }
24805a3457eSad /*
24905a3457eSad * lock and unlock g_glock to ensure that no one is truncating
25005a3457eSad * the file behind us.
25105a3457eSad */
252a75c80a0Suebayasi if (!genfs_node_rdtrylock(vp)) {
253735dd21eSpooka /*
254735dd21eSpooka * restore the array.
255735dd21eSpooka */
256735dd21eSpooka
257735dd21eSpooka for (i = 0; i < npages; i++) {
258735dd21eSpooka pg = ap->a_m[i];
259735dd21eSpooka
260c87cbe9fSuebayasi if (pg != NULL && pg != PGO_DONTCARE) {
261735dd21eSpooka ap->a_m[i] = NULL;
262735dd21eSpooka }
2633c4c042eSuebayasi KASSERT(ap->a_m[i] == NULL ||
2643c4c042eSuebayasi ap->a_m[i] == PGO_DONTCARE);
265735dd21eSpooka }
266735dd21eSpooka } else {
267a75c80a0Suebayasi genfs_node_unlock(vp);
268735dd21eSpooka }
269735dd21eSpooka error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0);
270e15697fcSchs if (error == 0 && memwrite) {
27105a3457eSad for (i = 0; i < npages; i++) {
27205a3457eSad pg = ap->a_m[i];
27305a3457eSad if (pg == NULL || pg == PGO_DONTCARE) {
27405a3457eSad continue;
27505a3457eSad }
27605a3457eSad if (uvm_pagegetdirty(pg) ==
27705a3457eSad UVM_PAGE_STATUS_CLEAN) {
27805a3457eSad uvm_pagemarkdirty(pg,
27905a3457eSad UVM_PAGE_STATUS_UNKNOWN);
28005a3457eSad }
28105a3457eSad }
282e15697fcSchs }
283735dd21eSpooka goto out_err;
284735dd21eSpooka }
285d2a0ebb6Sad rw_exit(uobj->vmobjlock);
286735dd21eSpooka
287735dd21eSpooka /*
288735dd21eSpooka * find the requested pages and make some simple checks.
289735dd21eSpooka * leave space in the page array for a whole block.
290735dd21eSpooka */
291735dd21eSpooka
292680e7444Suebayasi const int fs_bshift = (vp->v_type != VBLK) ?
293680e7444Suebayasi vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
294680e7444Suebayasi const int fs_bsize = 1 << fs_bshift;
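/*
 * Helpers for rounding file offsets to filesystem block boundaries
 * (fs_bsize is a power of two).
 */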
295a0629265Suebayasi #define blk_mask (fs_bsize - 1)
296a0629265Suebayasi #define trunc_blk(x) ((x) & ~blk_mask)
297a0629265Suebayasi #define round_blk(x) (((x) + blk_mask) & ~blk_mask)
298735dd21eSpooka
299bb4b25cfSuebayasi const int orignmempages = MIN(orignpages,
300735dd21eSpooka round_page(memeof - origoffset) >> PAGE_SHIFT);
301bb4b25cfSuebayasi npages = orignmempages;
302a0629265Suebayasi const off_t startoffset = trunc_blk(origoffset);
303a0629265Suebayasi const off_t endoffset = MIN(
304a0629265Suebayasi round_page(round_blk(origoffset + (npages << PAGE_SHIFT))),
305a0629265Suebayasi round_page(memeof));
3069fa66d7aSuebayasi const int ridx = (origoffset - startoffset) >> PAGE_SHIFT;
307735dd21eSpooka
308f4e16ac9Suebayasi const int pgs_size = sizeof(struct vm_page *) *
309735dd21eSpooka ((endoffset - startoffset) >> PAGE_SHIFT);
310f4e16ac9Suebayasi struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES];
3119fa66d7aSuebayasi
312735dd21eSpooka if (pgs_size > sizeof(pgs_onstack)) {
313735dd21eSpooka pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP);
314735dd21eSpooka if (pgs == NULL) {
315735dd21eSpooka pgs = pgs_onstack;
316735dd21eSpooka error = ENOMEM;
31729f5c078Suebayasi goto out_err;
318735dd21eSpooka }
319735dd21eSpooka } else {
3202a274197Schristos pgs = pgs_onstack;
3212a274197Schristos (void)memset(pgs, 0, pgs_size);
322735dd21eSpooka }
3232a274197Schristos
3243123ec52Srin UVMHIST_LOG(ubchist, "ridx %jd npages %jd startoff %#jx endoff %#jx",
325735dd21eSpooka ridx, npages, startoffset, endoffset);
326735dd21eSpooka
327f36a7657Shannken if (trans_mount == NULL) {
328f36a7657Shannken trans_mount = vp->v_mount;
329287643b0Shannken fstrans_start(trans_mount);
330f36a7657Shannken /*
331f36a7657Shannken * check if this vnode is still valid.
332f36a7657Shannken */
333f36a7657Shannken mutex_enter(vp->v_interlock);
334f36a7657Shannken error = vdead_check(vp, 0);
335f36a7657Shannken mutex_exit(vp->v_interlock);
336f36a7657Shannken if (error)
337f36a7657Shannken goto out_err_free;
3381c9818e8Shannken /*
3391c9818e8Shannken * XXX: This assumes that we come here only via
3401c9818e8Shannken * the mmio path
3411c9818e8Shannken */
3425f7e4301Sjdolecek if (blockalloc && need_wapbl) {
343f36a7657Shannken error = WAPBL_BEGIN(trans_mount);
344f36a7657Shannken if (error)
3451c9818e8Shannken goto out_err_free;
346f36a7657Shannken holds_wapbl = true;
3471c9818e8Shannken }
3481c9818e8Shannken }
349735dd21eSpooka
350735dd21eSpooka /*
351735dd21eSpooka * hold g_glock to prevent a race with truncate.
352735dd21eSpooka *
353735dd21eSpooka * check if our idea of v_size is still valid.
354735dd21eSpooka */
355735dd21eSpooka
356fca58884Schs KASSERT(!glocked || genfs_node_wrlocked(vp));
357fca58884Schs if (!glocked) {
358735dd21eSpooka if (blockalloc) {
359fca58884Schs genfs_node_wrlock(vp);
360735dd21eSpooka } else {
361fca58884Schs genfs_node_rdlock(vp);
362fca58884Schs }
363735dd21eSpooka }
364d2a0ebb6Sad rw_enter(uobj->vmobjlock, RW_WRITER);
365735dd21eSpooka if (vp->v_size < origvsize) {
366fca58884Schs if (!glocked) {
367a75c80a0Suebayasi genfs_node_unlock(vp);
368fca58884Schs }
369735dd21eSpooka if (pgs != pgs_onstack)
370735dd21eSpooka kmem_free(pgs, pgs_size);
371735dd21eSpooka goto startover;
372735dd21eSpooka }
373735dd21eSpooka
37405a3457eSad if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], NULL,
375bb4b25cfSuebayasi async ? UFP_NOWAIT : UFP_ALL) != orignmempages) {
376fca58884Schs if (!glocked) {
377a75c80a0Suebayasi genfs_node_unlock(vp);
378fca58884Schs }
379735dd21eSpooka KASSERT(async != 0);
380bb4b25cfSuebayasi genfs_rel_pages(&pgs[ridx], orignmempages);
381d2a0ebb6Sad rw_exit(uobj->vmobjlock);
382735dd21eSpooka error = EBUSY;
383f4e16ac9Suebayasi goto out_err_free;
384735dd21eSpooka }
385735dd21eSpooka
386735dd21eSpooka /*
387735dd21eSpooka * if PGO_OVERWRITE is set, don't bother reading the pages.
388735dd21eSpooka */
389735dd21eSpooka
390735dd21eSpooka if (overwrite) {
391fca58884Schs if (!glocked) {
392a75c80a0Suebayasi genfs_node_unlock(vp);
393fca58884Schs }
394735dd21eSpooka UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
395735dd21eSpooka
396735dd21eSpooka for (i = 0; i < npages; i++) {
3979fa66d7aSuebayasi struct vm_page *pg = pgs[ridx + i];
398735dd21eSpooka
39905a3457eSad /*
40005a3457eSad * it's the caller's responsibility to allocate blocks
40105a3457eSad * beforehand for the overwrite case.
40205a3457eSad */
40305a3457eSad
40405a3457eSad KASSERT((pg->flags & PG_RDONLY) == 0 || !blockalloc);
40505a3457eSad pg->flags &= ~PG_RDONLY;
40605a3457eSad
40705a3457eSad /*
40805a3457eSad * mark the page DIRTY.
40905a3457eSad * otherwise another thread can do putpages and pull
41005a3457eSad * our vnode from syncer's queue before our caller does
41105a3457eSad * ubc_release. note that putpages won't see CLEAN
41205a3457eSad * pages even if they are BUSY.
41305a3457eSad */
41405a3457eSad
41505a3457eSad uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
416735dd21eSpooka }
417735dd21eSpooka npages += ridx;
418735dd21eSpooka goto out;
419735dd21eSpooka }
420735dd21eSpooka
421735dd21eSpooka /*
42205a3457eSad * if the pages are already resident, just return them.
42305a3457eSad */
42405a3457eSad
42505a3457eSad for (i = 0; i < npages; i++) {
42605a3457eSad struct vm_page *pg = pgs[ridx + i];
42705a3457eSad
42805a3457eSad if ((pg->flags & PG_FAKE) ||
42905a3457eSad (blockalloc && (pg->flags & PG_RDONLY) != 0)) {
43005a3457eSad break;
43105a3457eSad }
43205a3457eSad }
43305a3457eSad if (i == npages) {
43405a3457eSad if (!glocked) {
43505a3457eSad genfs_node_unlock(vp);
43605a3457eSad }
43705a3457eSad UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
43805a3457eSad npages += ridx;
43905a3457eSad goto out;
44005a3457eSad }
44105a3457eSad
44205a3457eSad /*
443735dd21eSpooka * the page wasn't resident and we're not overwriting,
444735dd21eSpooka * so we're going to have to do some i/o.
445735dd21eSpooka * find any additional pages needed to cover the expanded range.
446735dd21eSpooka */
447735dd21eSpooka
448735dd21eSpooka npages = (endoffset - startoffset) >> PAGE_SHIFT;
449bb4b25cfSuebayasi if (startoffset != origoffset || npages != orignmempages) {
4509fa66d7aSuebayasi int npgs;
451735dd21eSpooka
452735dd21eSpooka /*
453735dd21eSpooka * we need to avoid deadlocks caused by locking
454735dd21eSpooka * additional pages at lower offsets than pages we
455735dd21eSpooka * already have locked. unlock them all and start over.
456735dd21eSpooka */
457735dd21eSpooka
458bb4b25cfSuebayasi genfs_rel_pages(&pgs[ridx], orignmempages);
459735dd21eSpooka memset(pgs, 0, pgs_size);
460735dd21eSpooka
461cb32a134Spgoyette UVMHIST_LOG(ubchist, "reset npages start 0x%jx end 0x%jx",
462735dd21eSpooka startoffset, endoffset, 0,0);
463735dd21eSpooka npgs = npages;
46405a3457eSad if (uvn_findpages(uobj, startoffset, &npgs, pgs, NULL,
465735dd21eSpooka async ? UFP_NOWAIT : UFP_ALL) != npages) {
466fca58884Schs if (!glocked) {
467a75c80a0Suebayasi genfs_node_unlock(vp);
468fca58884Schs }
469735dd21eSpooka KASSERT(async != 0);
470735dd21eSpooka genfs_rel_pages(pgs, npages);
471d2a0ebb6Sad rw_exit(uobj->vmobjlock);
472735dd21eSpooka error = EBUSY;
473f4e16ac9Suebayasi goto out_err_free;
474735dd21eSpooka }
475735dd21eSpooka }
47653000cecSuebayasi
477d2a0ebb6Sad rw_exit(uobj->vmobjlock);
4782b81644cSriastradh error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof,
4792b81644cSriastradh async, memwrite, blockalloc, glocked);
4802b81644cSriastradh if (!glocked) {
4812b81644cSriastradh genfs_node_unlock(vp);
4822b81644cSriastradh }
483446694baSriastradh if (error == 0 && async)
484446694baSriastradh goto out_err_free;
485d2a0ebb6Sad rw_enter(uobj->vmobjlock, RW_WRITER);
486735dd21eSpooka
4872b81644cSriastradh /*
4882b81644cSriastradh * we're almost done! release the pages...
4892b81644cSriastradh * for errors, we free the pages.
4902b81644cSriastradh * otherwise we activate them and mark them as valid and clean.
4912b81644cSriastradh * also, unbusy pages that were not actually requested.
4922b81644cSriastradh */
4932b81644cSriastradh
4942b81644cSriastradh if (error) {
4952b81644cSriastradh genfs_rel_pages(pgs, npages);
496d2a0ebb6Sad rw_exit(uobj->vmobjlock);
497cb32a134Spgoyette UVMHIST_LOG(ubchist, "returning error %jd", error,0,0,0);
4982b81644cSriastradh goto out_err_free;
4992b81644cSriastradh }
5002b81644cSriastradh
5012b81644cSriastradh out:
502cb32a134Spgoyette UVMHIST_LOG(ubchist, "succeeding, npages %jd", npages,0,0,0);
5032b81644cSriastradh error = 0;
5042b81644cSriastradh for (i = 0; i < npages; i++) {
5052b81644cSriastradh struct vm_page *pg = pgs[i];
5062b81644cSriastradh if (pg == NULL) {
5072b81644cSriastradh continue;
5082b81644cSriastradh }
509cb32a134Spgoyette UVMHIST_LOG(ubchist, "examining pg %#jx flags 0x%jx",
510cb32a134Spgoyette (uintptr_t)pg, pg->flags, 0,0);
5112b81644cSriastradh if (pg->flags & PG_FAKE && !overwrite) {
51205a3457eSad /*
51305a3457eSad * we've read the page's contents from the backing storage.
51405a3457eSad *
51505a3457eSad * for a read fault, we keep them CLEAN; if we
51605a3457eSad * encountered a hole while reading, the pages may
51705a3457eSad * already have been dirtied with zeros.
51805a3457eSad */
51905a3457eSad KASSERTMSG(blockalloc || uvm_pagegetdirty(pg) ==
52005a3457eSad UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
52105a3457eSad pg->flags &= ~PG_FAKE;
5222b81644cSriastradh }
5232b81644cSriastradh KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0);
5242b81644cSriastradh if (i < ridx || i >= ridx + orignmempages || async) {
525cb32a134Spgoyette UVMHIST_LOG(ubchist, "unbusy pg %#jx offset 0x%jx",
526cb32a134Spgoyette (uintptr_t)pg, pg->offset,0,0);
5272b81644cSriastradh if (pg->flags & PG_FAKE) {
5282b81644cSriastradh KASSERT(overwrite);
5292b81644cSriastradh uvm_pagezero(pg);
5302b81644cSriastradh }
5312b81644cSriastradh if (pg->flags & PG_RELEASED) {
5322b81644cSriastradh uvm_pagefree(pg);
5332b81644cSriastradh continue;
5342b81644cSriastradh }
53594843b13Sad uvm_pagelock(pg);
5362b81644cSriastradh uvm_pageenqueue(pg);
5371912643fSad uvm_pagewakeup(pg);
53894843b13Sad uvm_pageunlock(pg);
5391912643fSad pg->flags &= ~(PG_BUSY|PG_FAKE);
5402b81644cSriastradh UVM_PAGE_OWN(pg, NULL);
54105a3457eSad } else if (memwrite && !overwrite &&
54205a3457eSad uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
54305a3457eSad /*
54405a3457eSad * for a write fault, start dirtiness tracking of
54505a3457eSad * requested pages.
54605a3457eSad */
54705a3457eSad uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
5482b81644cSriastradh }
5492b81644cSriastradh }
550d2a0ebb6Sad rw_exit(uobj->vmobjlock);
5512b81644cSriastradh if (ap->a_m != NULL) {
5522b81644cSriastradh memcpy(ap->a_m, &pgs[ridx],
5532b81644cSriastradh orignmempages * sizeof(struct vm_page *));
5542b81644cSriastradh }
5552b81644cSriastradh
5562b81644cSriastradh out_err_free:
5572b81644cSriastradh if (pgs != NULL && pgs != pgs_onstack)
5582b81644cSriastradh kmem_free(pgs, pgs_size);
5592b81644cSriastradh out_err:
560f36a7657Shannken if (trans_mount != NULL) {
561f36a7657Shannken if (holds_wapbl)
562f36a7657Shannken WAPBL_END(trans_mount);
563f36a7657Shannken fstrans_done(trans_mount);
5642b81644cSriastradh }
5652b81644cSriastradh return error;
5662b81644cSriastradh }
5672b81644cSriastradh
5682b81644cSriastradh /*
5692b81644cSriastradh * genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY.
57003a2126fSdholland *
57103a2126fSdholland * "glocked" (which is currently not actually used) tells us not whether
57203a2126fSdholland * the genfs_node is locked on entry (it always is) but whether it was
57303a2126fSdholland * locked on entry to genfs_getpages.
5742b81644cSriastradh */
5752b81644cSriastradh static int
5762b81644cSriastradh genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages,
5772b81644cSriastradh off_t startoffset, off_t diskeof,
5782b81644cSriastradh bool async, bool memwrite, bool blockalloc, bool glocked)
57953000cecSuebayasi {
5802b81644cSriastradh struct uvm_object * const uobj = &vp->v_uobj;
5812b81644cSriastradh const int fs_bshift = (vp->v_type != VBLK) ?
5822b81644cSriastradh vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
5832b81644cSriastradh const int dev_bshift = (vp->v_type != VBLK) ?
5842b81644cSriastradh vp->v_mount->mnt_dev_bshift : DEV_BSHIFT;
5852b81644cSriastradh kauth_cred_t const cred = curlwp->l_cred; /* XXXUBC curlwp */
58653000cecSuebayasi size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes;
58753000cecSuebayasi vaddr_t kva;
58853000cecSuebayasi struct buf *bp, *mbp;
58953000cecSuebayasi bool sawhole = false;
5902b81644cSriastradh int i;
5912b81644cSriastradh int error = 0;
59253000cecSuebayasi
59361497d42Sskrll UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
59461497d42Sskrll
595735dd21eSpooka /*
596735dd21eSpooka * read the desired page(s).
597735dd21eSpooka */
598735dd21eSpooka
599735dd21eSpooka totalbytes = npages << PAGE_SHIFT;
600735dd21eSpooka bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
601735dd21eSpooka tailbytes = totalbytes - bytes;
602735dd21eSpooka skipbytes = 0;
603735dd21eSpooka
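/*
 * Map the pages into kernel virtual space for the transfer.  For async
 * requests we do not wait for KVA, so the mapping can fail.
 */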
604735dd21eSpooka kva = uvm_pagermapin(pgs, npages,
6054f2ae943Syamt UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK));
6062b81644cSriastradh if (kva == 0)
6072b81644cSriastradh return EBUSY;
608735dd21eSpooka
6094a780c9aSad mbp = getiobuf(vp, true);
610735dd21eSpooka mbp->b_bufsize = totalbytes;
611735dd21eSpooka mbp->b_data = (void *)kva;
612735dd21eSpooka mbp->b_resid = mbp->b_bcount = bytes;
61301f564d8Sad mbp->b_cflags |= BC_BUSY;
6144a780c9aSad if (async) {
6154a780c9aSad mbp->b_flags = B_READ | B_ASYNC;
6165232c510Schs mbp->b_iodone = uvm_aio_aiodone;
6174a780c9aSad } else {
6184a780c9aSad mbp->b_flags = B_READ;
6194a780c9aSad mbp->b_iodone = NULL;
6204a780c9aSad }
621735dd21eSpooka if (async)
622735dd21eSpooka BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
623735dd21eSpooka else
624735dd21eSpooka BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
625735dd21eSpooka
626735dd21eSpooka /*
627735dd21eSpooka * if EOF is in the middle of the range, zero the part past EOF.
628735dd21eSpooka * skip over pages which are not PG_FAKE since in that case they have
629735dd21eSpooka * valid data that we need to preserve.
630735dd21eSpooka */
631735dd21eSpooka
632735dd21eSpooka tailstart = bytes;
633735dd21eSpooka while (tailbytes > 0) {
634735dd21eSpooka const int len = PAGE_SIZE - (tailstart & PAGE_MASK);
635735dd21eSpooka
636735dd21eSpooka KASSERT(len <= tailbytes);
637735dd21eSpooka if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) {
638735dd21eSpooka memset((void *)(kva + tailstart), 0, len);
639cb32a134Spgoyette UVMHIST_LOG(ubchist, "tailbytes %#jx 0x%jx 0x%jx",
640cb32a134Spgoyette (uintptr_t)kva, tailstart, len, 0);
641735dd21eSpooka }
642735dd21eSpooka tailstart += len;
643735dd21eSpooka tailbytes -= len;
644735dd21eSpooka }
645735dd21eSpooka
646735dd21eSpooka /*
647735dd21eSpooka * now loop over the pages, reading as needed.
648735dd21eSpooka */
649735dd21eSpooka
650735dd21eSpooka bp = NULL;
651b0b6ddc3Suebayasi off_t offset;
652b0b6ddc3Suebayasi for (offset = startoffset;
653735dd21eSpooka bytes > 0;
654735dd21eSpooka offset += iobytes, bytes -= iobytes) {
655a0629265Suebayasi int run;
65664e0246aSuebayasi daddr_t lbn, blkno;
6576903a054Suebayasi int pidx;
6581a2a3af3Suebayasi struct vnode *devvp;
659735dd21eSpooka
660735dd21eSpooka /*
661735dd21eSpooka * skip pages which don't need to be read.
662735dd21eSpooka */
663735dd21eSpooka
664735dd21eSpooka pidx = (offset - startoffset) >> PAGE_SHIFT;
665735dd21eSpooka while ((pgs[pidx]->flags & PG_FAKE) == 0) {
666735dd21eSpooka size_t b;
667735dd21eSpooka
668735dd21eSpooka KASSERT((offset & (PAGE_SIZE - 1)) == 0);
669735dd21eSpooka if ((pgs[pidx]->flags & PG_RDONLY)) {
670735dd21eSpooka sawhole = true;
671735dd21eSpooka }
672735dd21eSpooka b = MIN(PAGE_SIZE, bytes);
673735dd21eSpooka offset += b;
674735dd21eSpooka bytes -= b;
675735dd21eSpooka skipbytes += b;
676735dd21eSpooka pidx++;
677cb32a134Spgoyette UVMHIST_LOG(ubchist, "skipping, new offset 0x%jx",
678735dd21eSpooka offset, 0,0,0);
679735dd21eSpooka if (bytes == 0) {
680735dd21eSpooka goto loopdone;
681735dd21eSpooka }
682735dd21eSpooka }
683735dd21eSpooka
684735dd21eSpooka /*
685735dd21eSpooka * bmap the file to find out the blkno to read from and
686735dd21eSpooka * how much we can read in one i/o. if bmap returns an error,
687735dd21eSpooka * skip the rest of the top-level i/o.
688735dd21eSpooka */
689735dd21eSpooka
690735dd21eSpooka lbn = offset >> fs_bshift;
691735dd21eSpooka error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
692735dd21eSpooka if (error) {
693bf748078Ssimonb UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
694735dd21eSpooka lbn,error,0,0);
695735dd21eSpooka skipbytes += bytes;
6961b9d02ceSuebayasi bytes = 0;
697735dd21eSpooka goto loopdone;
698735dd21eSpooka }
699735dd21eSpooka
700735dd21eSpooka /*
701735dd21eSpooka * see how many pages can be read with this i/o.
702735dd21eSpooka * reduce the i/o size if necessary to avoid
703735dd21eSpooka * overwriting pages with valid data.
704735dd21eSpooka */
705735dd21eSpooka
706735dd21eSpooka iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
707735dd21eSpooka bytes);
708735dd21eSpooka if (offset + iobytes > round_page(offset)) {
7096903a054Suebayasi int pcount;
7106903a054Suebayasi
711735dd21eSpooka pcount = 1;
712735dd21eSpooka while (pidx + pcount < npages &&
713735dd21eSpooka pgs[pidx + pcount]->flags & PG_FAKE) {
714735dd21eSpooka pcount++;
715735dd21eSpooka }
716735dd21eSpooka iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
717735dd21eSpooka (offset - trunc_page(offset)));
718735dd21eSpooka }
719735dd21eSpooka
720735dd21eSpooka /*
721735dd21eSpooka * if this block isn't allocated, zero it instead of
722735dd21eSpooka * reading it. unless we are going to allocate blocks,
723735dd21eSpooka * mark the pages we zeroed PG_RDONLY.
724735dd21eSpooka */
725735dd21eSpooka
7261b9d02ceSuebayasi if (blkno == (daddr_t)-1) {
727735dd21eSpooka int holepages = (round_page(offset + iobytes) -
728735dd21eSpooka trunc_page(offset)) >> PAGE_SHIFT;
729cb32a134Spgoyette UVMHIST_LOG(ubchist, "lbn 0x%jx -> HOLE", lbn,0,0,0);
730735dd21eSpooka
731735dd21eSpooka sawhole = true;
732735dd21eSpooka memset((char *)kva + (offset - startoffset), 0,
733735dd21eSpooka iobytes);
734735dd21eSpooka skipbytes += iobytes;
735735dd21eSpooka
73605a3457eSad if (!blockalloc) {
737d2a0ebb6Sad rw_enter(uobj->vmobjlock, RW_WRITER);
738735dd21eSpooka for (i = 0; i < holepages; i++) {
739735dd21eSpooka pgs[pidx + i]->flags |= PG_RDONLY;
740735dd21eSpooka }
741d2a0ebb6Sad rw_exit(uobj->vmobjlock);
74205a3457eSad }
743735dd21eSpooka continue;
744735dd21eSpooka }
745735dd21eSpooka
746735dd21eSpooka /*
747735dd21eSpooka * allocate a sub-buf for this piece of the i/o
748735dd21eSpooka * (or just use mbp if there's only 1 piece),
749735dd21eSpooka * and start it going.
750735dd21eSpooka */
751735dd21eSpooka
752735dd21eSpooka if (offset == startoffset && iobytes == bytes) {
753735dd21eSpooka bp = mbp;
754735dd21eSpooka } else {
755cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
756cb32a134Spgoyette (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
7574a780c9aSad bp = getiobuf(vp, true);
758735dd21eSpooka nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
759735dd21eSpooka }
760735dd21eSpooka bp->b_lblkno = 0;
761735dd21eSpooka
762735dd21eSpooka /* adjust physical blkno for partial blocks */
763735dd21eSpooka bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
764735dd21eSpooka dev_bshift);
765735dd21eSpooka
766735dd21eSpooka UVMHIST_LOG(ubchist,
767cb32a134Spgoyette "bp %#jx offset 0x%x bcount 0x%x blkno 0x%x",
768cb32a134Spgoyette (uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
769735dd21eSpooka
770735dd21eSpooka VOP_STRATEGY(devvp, bp);
771735dd21eSpooka }
772735dd21eSpooka
773735dd21eSpooka loopdone:
774735dd21eSpooka nestiobuf_done(mbp, skipbytes, error);
775735dd21eSpooka if (async) {
776735dd21eSpooka UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
7772b81644cSriastradh return 0;
778735dd21eSpooka }
779735dd21eSpooka if (bp != NULL) {
780735dd21eSpooka error = biowait(mbp);
781735dd21eSpooka }
782735dd21eSpooka
783e52fb162Srmind /* Remove the mapping (make KVA available as soon as possible) */
784e52fb162Srmind uvm_pagermapout(kva, npages);
785e52fb162Srmind
786735dd21eSpooka /*
787735dd21eSpooka * if we encountered a hole then we have to do a little more work.
788735dd21eSpooka * for read faults, we marked the page PG_RDONLY so that future
789735dd21eSpooka * write accesses to the page will fault again.
790735dd21eSpooka * for write faults, we must make sure that the backing store for
791735dd21eSpooka * the page is completely allocated while the pages are locked.
792735dd21eSpooka */
793735dd21eSpooka
794735dd21eSpooka if (!error && sawhole && blockalloc) {
79536d65f11Ssimonb error = GOP_ALLOC(vp, startoffset,
79636d65f11Ssimonb npages << PAGE_SHIFT, 0, cred);
797cb32a134Spgoyette UVMHIST_LOG(ubchist, "gop_alloc off 0x%jx/0x%jx -> %jd",
798735dd21eSpooka startoffset, npages << PAGE_SHIFT, error,0);
799735dd21eSpooka if (!error) {
800d2a0ebb6Sad rw_enter(uobj->vmobjlock, RW_WRITER);
801735dd21eSpooka for (i = 0; i < npages; i++) {
8029fa66d7aSuebayasi struct vm_page *pg = pgs[i];
8039fa66d7aSuebayasi
8049fa66d7aSuebayasi if (pg == NULL) {
805735dd21eSpooka continue;
806735dd21eSpooka }
80705a3457eSad pg->flags &= ~PG_RDONLY;
80805a3457eSad uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
809cb32a134Spgoyette UVMHIST_LOG(ubchist, "mark dirty pg %#jx",
810cb32a134Spgoyette (uintptr_t)pg, 0, 0, 0);
811735dd21eSpooka }
812d2a0ebb6Sad rw_exit(uobj->vmobjlock);
813735dd21eSpooka }
814735dd21eSpooka }
81578a982c8Srmind
81678a982c8Srmind putiobuf(mbp);
817e15697fcSchs return error;
818735dd21eSpooka }
819735dd21eSpooka
820735dd21eSpooka /*
821735dd21eSpooka * generic VM putpages routine.
822735dd21eSpooka * Write the given range of pages to backing store.
823735dd21eSpooka *
824735dd21eSpooka * => "offhi == 0" means flush all pages at or after "offlo".
825735dd21eSpooka * => object should be locked by caller. we return with the
826735dd21eSpooka * object unlocked.
827735dd21eSpooka * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
828735dd21eSpooka * thus, a caller might want to unlock higher level resources
829735dd21eSpooka * (e.g. vm_map) before calling flush.
830735dd21eSpooka * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block
831735dd21eSpooka * => if PGO_ALLPAGES is set, then all pages in the object will be processed.
832735dd21eSpooka *
833735dd21eSpooka * note on "cleaning" object and PG_BUSY pages:
834735dd21eSpooka * this routine is holding the lock on the object. the only time
835735dd21eSpooka * that it can run into a PG_BUSY page that it does not own is if
836735dd21eSpooka * some other process has started I/O on the page (e.g. either
837735dd21eSpooka * a pagein, or a pageout). if the PG_BUSY page is being paged
83805a3457eSad * in, then it can not be dirty (!UVM_PAGE_STATUS_CLEAN) because no
83905a3457eSad * one has had a chance to modify it yet. if the PG_BUSY page is
84005a3457eSad * being paged out then it means that someone else has already started
841735dd21eSpooka * cleaning the page for us (how nice!). in this case, if we
842735dd21eSpooka * have syncio specified, then after we make our pass through the
843735dd21eSpooka * object we need to wait for the other PG_BUSY pages to clear
844735dd21eSpooka * off (i.e. we need to do an iosync). also note that once a
845735dd21eSpooka * page is PG_BUSY it must stay in its object until it is un-busyed.
846735dd21eSpooka */
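/*
 * Illustrative caller sketch (not part of this file): flush and free
 * every page of a vnode synchronously, roughly what a reclaim/fsync
 * style caller would do.  The flag combination is an assumption for
 * the example only.
 *
 *	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
 *	error = VOP_PUTPAGES(vp, 0, 0,
 *	    PGO_ALLPAGES | PGO_CLEANIT | PGO_FREE | PGO_SYNCIO);
 *
 * VOP_PUTPAGES() returns with the object lock released.
 */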
847735dd21eSpooka
848735dd21eSpooka int
849735dd21eSpooka genfs_putpages(void *v)
850735dd21eSpooka {
851735dd21eSpooka struct vop_putpages_args /* {
852735dd21eSpooka struct vnode *a_vp;
853735dd21eSpooka voff_t a_offlo;
854735dd21eSpooka voff_t a_offhi;
855735dd21eSpooka int a_flags;
8562372674cSuebayasi } */ * const ap = v;
857735dd21eSpooka
858735dd21eSpooka return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi,
859735dd21eSpooka ap->a_flags, NULL);
860735dd21eSpooka }
861735dd21eSpooka
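/*
 * genfs_do_putpages: the implementation behind genfs_putpages(); also
 * callable directly so that a caller can learn which page blocked the
 * flush via "busypg" when PGO_BUSYFAIL is used.
 */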
862735dd21eSpooka int
86336c701bcSyamt genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff,
86436c701bcSyamt int origflags, struct vm_page **busypg)
865735dd21eSpooka {
8662372674cSuebayasi struct uvm_object * const uobj = &vp->v_uobj;
867d2a0ebb6Sad krwlock_t * const slock = uobj->vmobjlock;
868881d12e6Sad off_t nextoff;
8694a780c9aSad int i, error, npages, nback;
870735dd21eSpooka int freeflag;
871600f58d6Schristos /*
872600f58d6Schristos * This array is larger than it needs to be so that its size is constant.
873600f58d6Schristos * The right size is MAXPAGES.
874600f58d6Schristos */
875600f58d6Schristos struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE];
876600f58d6Schristos #define MAXPAGES (MAXPHYS / PAGE_SIZE)
877881d12e6Sad struct vm_page *pg, *tpg;
878881d12e6Sad struct uvm_page_array a;
879881d12e6Sad bool wasclean, needs_clean;
88036c701bcSyamt bool async = (origflags & PGO_SYNCIO) == 0;
881735dd21eSpooka bool pagedaemon = curlwp == uvm.pagedaemon_lwp;
88206a21e4cShannken struct mount *trans_mp;
88336c701bcSyamt int flags;
88405a3457eSad bool modified; /* if we write out any pages */
88506a21e4cShannken bool holds_wapbl;
88605a3457eSad bool cleanall; /* try to pull off from the syncer's list */
88736c701bcSyamt bool onworklst;
888d2a0ebb6Sad bool nodirty;
88905a3457eSad const bool dirtyonly = (origflags & (PGO_DEACTIVATE|PGO_FREE)) == 0;
890735dd21eSpooka
891735dd21eSpooka UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
892735dd21eSpooka
89336c701bcSyamt KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
894ab579ad8Sriastradh KASSERT((startoff & PAGE_MASK) == 0);
895ab579ad8Sriastradh KASSERT((endoff & PAGE_MASK) == 0);
896735dd21eSpooka KASSERT(startoff < endoff || endoff == 0);
897d2a0ebb6Sad KASSERT(rw_write_held(slock));
898735dd21eSpooka
899cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx pages %jd off 0x%jx len 0x%jx",
900cb32a134Spgoyette (uintptr_t)vp, uobj->uo_npages, startoff, endoff - startoff);
901735dd21eSpooka
902560071c2Sjdolecek #ifdef DIAGNOSTIC
903560071c2Sjdolecek if ((origflags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
904560071c2Sjdolecek WAPBL_JLOCK_ASSERT(vp->v_mount);
905560071c2Sjdolecek #endif
906560071c2Sjdolecek
90706a21e4cShannken trans_mp = NULL;
90806a21e4cShannken holds_wapbl = false;
9090789b071Shannken
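/*
 * Restart point: we branch back here after picking up the fstrans /
 * WAPBL state, and again for PGO_RECLAIM while the vnode is still on
 * the syncer worklist.
 */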
91036c701bcSyamt retry:
91136c701bcSyamt modified = false;
91236c701bcSyamt flags = origflags;
91305a3457eSad
91405a3457eSad /*
91505a3457eSad * shortcut if we have no pages to process.
91605a3457eSad */
91705a3457eSad
91819303cecSchs nodirty = uvm_obj_clean_p(uobj);
919da3ef92bSad #ifdef DIAGNOSTIC
920da3ef92bSad mutex_enter(vp->v_interlock);
921da3ef92bSad KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || nodirty);
922da3ef92bSad mutex_exit(vp->v_interlock);
923da3ef92bSad #endif
924d2a0ebb6Sad if (uobj->uo_npages == 0 || (dirtyonly && nodirty)) {
925d2a0ebb6Sad mutex_enter(vp->v_interlock);
92619303cecSchs if (vp->v_iflag & VI_ONWORKLST && LIST_EMPTY(&vp->v_dirtyblkhd)) {
927735dd21eSpooka vn_syncer_remove_from_worklist(vp);
928735dd21eSpooka }
929d2a0ebb6Sad mutex_exit(vp->v_interlock);
93006a21e4cShannken if (trans_mp) {
93106a21e4cShannken if (holds_wapbl)
93206a21e4cShannken WAPBL_END(trans_mp);
93306a21e4cShannken fstrans_done(trans_mp);
93444f3404fShannken }
935d2a0ebb6Sad rw_exit(slock);
936735dd21eSpooka return (0);
937735dd21eSpooka }
938735dd21eSpooka
939735dd21eSpooka /*
940735dd21eSpooka * the vnode has pages, set up to process the request.
941735dd21eSpooka */
942735dd21eSpooka
94306a21e4cShannken if (trans_mp == NULL && (flags & PGO_CLEANIT) != 0) {
944735dd21eSpooka if (pagedaemon) {
94506a21e4cShannken /* Pagedaemon must not sleep here. */
94606a21e4cShannken trans_mp = vp->v_mount;
947287643b0Shannken error = fstrans_start_nowait(trans_mp);
94844f3404fShannken if (error) {
949d2a0ebb6Sad rw_exit(slock);
95006a21e4cShannken return error;
95106a21e4cShannken }
95206a21e4cShannken } else {
95306a21e4cShannken /*
95406a21e4cShannken * Cannot use vdeadcheck() here as this operation
95506a21e4cShannken * usually gets used from VOP_RECLAIM(). Test for
95606a21e4cShannken * change of v_mount instead and retry on change.
95706a21e4cShannken */
958d2a0ebb6Sad rw_exit(slock);
95906a21e4cShannken trans_mp = vp->v_mount;
960287643b0Shannken fstrans_start(trans_mp);
96106a21e4cShannken if (vp->v_mount != trans_mp) {
96206a21e4cShannken fstrans_done(trans_mp);
96306a21e4cShannken trans_mp = NULL;
96406a21e4cShannken } else {
96506a21e4cShannken holds_wapbl = (trans_mp->mnt_wapbl &&
96606a21e4cShannken (origflags & PGO_JOURNALLOCKED) == 0);
96706a21e4cShannken if (holds_wapbl) {
96806a21e4cShannken error = WAPBL_BEGIN(trans_mp);
96906a21e4cShannken if (error) {
97006a21e4cShannken fstrans_done(trans_mp);
97144f3404fShannken return error;
97244f3404fShannken }
97344f3404fShannken }
97406a21e4cShannken }
975d2a0ebb6Sad rw_enter(slock, RW_WRITER);
9760789b071Shannken goto retry;
977735dd21eSpooka }
97806a21e4cShannken }
979735dd21eSpooka
980735dd21eSpooka error = 0;
98119303cecSchs wasclean = uvm_obj_nowriteback_p(uobj);
982881d12e6Sad nextoff = startoff;
983735dd21eSpooka if (endoff == 0 || flags & PGO_ALLPAGES) {
984735dd21eSpooka endoff = trunc_page(LLONG_MAX);
985735dd21eSpooka }
986735dd21eSpooka
987735dd21eSpooka /*
988735dd21eSpooka * if this vnode is known not to have dirty pages,
989735dd21eSpooka * don't bother to clean it out.
990735dd21eSpooka */
991735dd21eSpooka
992d2a0ebb6Sad if (nodirty) {
99306a7b095Sriastradh /* We handled the dirtyonly && nodirty case above. */
99406a7b095Sriastradh KASSERT(!dirtyonly);
995735dd21eSpooka flags &= ~PGO_CLEANIT;
996735dd21eSpooka }
997735dd21eSpooka
998735dd21eSpooka /*
999881d12e6Sad * start the loop to scan pages.
1000735dd21eSpooka */
1001735dd21eSpooka
100205a3457eSad cleanall = true;
1003735dd21eSpooka freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED;
10044bfe0439Sad uvm_page_array_init(&a, uobj, dirtyonly ? (UVM_PAGE_ARRAY_FILL_DIRTY |
10054bfe0439Sad (!async ? UVM_PAGE_ARRAY_FILL_WRITEBACK : 0)) : 0);
1006881d12e6Sad for (;;) {
100705a3457eSad bool pgprotected;
100805a3457eSad
1009735dd21eSpooka /*
101005a3457eSad * if !dirtyonly, iterate over all resident pages in the range.
101105a3457eSad *
101205a3457eSad * if dirtyonly, only possibly dirty pages are interesting.
101305a3457eSad * however, if we are asked to sync for integrity, we should
101405a3457eSad * wait on pages being written back by other threads as well.
1015735dd21eSpooka */
1016735dd21eSpooka
10174bfe0439Sad pg = uvm_page_array_fill_and_peek(&a, nextoff, 0);
1018881d12e6Sad if (pg == NULL) {
1019735dd21eSpooka break;
1020735dd21eSpooka }
1021881d12e6Sad
1022881d12e6Sad KASSERT(pg->uobject == uobj);
1023881d12e6Sad KASSERT((pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1024881d12e6Sad (pg->flags & (PG_BUSY)) != 0);
1025881d12e6Sad KASSERT(pg->offset >= startoff);
1026881d12e6Sad KASSERT(pg->offset >= nextoff);
102705a3457eSad KASSERT(!dirtyonly ||
102805a3457eSad uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN ||
102919303cecSchs uvm_obj_page_writeback_p(pg));
1030881d12e6Sad
1031881d12e6Sad if (pg->offset >= endoff) {
1032881d12e6Sad break;
1033c84e81caShannken }
1034881d12e6Sad
1035881d12e6Sad /*
1036881d12e6Sad * a preempt point.
1037881d12e6Sad */
1038881d12e6Sad
103916d4fad6Sad if (preempt_needed()) {
1040881d12e6Sad nextoff = pg->offset; /* visit this page again */
1041d2a0ebb6Sad rw_exit(slock);
1042881d12e6Sad preempt();
1043881d12e6Sad /*
1044881d12e6Sad * as we dropped the object lock, our cached pages can
1045881d12e6Sad * be stale.
1046881d12e6Sad */
1047881d12e6Sad uvm_page_array_clear(&a);
1048d2a0ebb6Sad rw_enter(slock, RW_WRITER);
1049735dd21eSpooka continue;
1050735dd21eSpooka }
1051735dd21eSpooka
1052735dd21eSpooka /*
105305a3457eSad * if the current page is busy, wait for it to become unbusy.
1054735dd21eSpooka */
1055735dd21eSpooka
105605a3457eSad if ((pg->flags & PG_BUSY) != 0) {
1057cb32a134Spgoyette UVMHIST_LOG(ubchist, "busy %#jx", (uintptr_t)pg,
1058cb32a134Spgoyette 0, 0, 0);
105905a3457eSad if ((pg->flags & (PG_RELEASED|PG_PAGEOUT)) != 0
106005a3457eSad && (flags & PGO_BUSYFAIL) != 0) {
1061cb32a134Spgoyette UVMHIST_LOG(ubchist, "busyfail %#jx",
1062cb32a134Spgoyette (uintptr_t)pg, 0, 0, 0);
1063735dd21eSpooka error = EDEADLK;
1064735dd21eSpooka if (busypg != NULL)
1065735dd21eSpooka *busypg = pg;
1066735dd21eSpooka break;
1067735dd21eSpooka }
1068735dd21eSpooka if (pagedaemon) {
1069735dd21eSpooka /*
1070735dd21eSpooka * someone has taken the page while we
1071735dd21eSpooka * dropped the lock for fstrans_start.
1072735dd21eSpooka */
1073735dd21eSpooka break;
1074735dd21eSpooka }
107505a3457eSad /*
107605a3457eSad * don't bother to wait on other's activities
107705a3457eSad * unless we are asked to sync for integrity.
107805a3457eSad */
107905a3457eSad if (!async && (flags & PGO_RECLAIM) == 0) {
108005a3457eSad wasclean = false;
108105a3457eSad nextoff = pg->offset + PAGE_SIZE;
108205a3457eSad uvm_page_array_advance(&a);
108305a3457eSad continue;
108405a3457eSad }
1085881d12e6Sad nextoff = pg->offset; /* visit this page again */
10865972ba16Sad uvm_pagewait(pg, slock, "genput");
1087881d12e6Sad /*
1088881d12e6Sad * as we dropped the object lock, our cached pages can
1089881d12e6Sad * be stale.
1090881d12e6Sad */
1091881d12e6Sad uvm_page_array_clear(&a);
1092d2a0ebb6Sad rw_enter(slock, RW_WRITER);
1093735dd21eSpooka continue;
1094735dd21eSpooka }
1095735dd21eSpooka
1096881d12e6Sad nextoff = pg->offset + PAGE_SIZE;
1097881d12e6Sad uvm_page_array_advance(&a);
1098881d12e6Sad
1099735dd21eSpooka /*
1100735dd21eSpooka * if we're freeing, remove all mappings of the page now.
1101735dd21eSpooka * if we're cleaning, check if the page needs to be cleaned.
1102735dd21eSpooka */
1103735dd21eSpooka
110405a3457eSad pgprotected = false;
1105735dd21eSpooka if (flags & PGO_FREE) {
1106735dd21eSpooka pmap_page_protect(pg, VM_PROT_NONE);
110705a3457eSad pgprotected = true;
1108735dd21eSpooka } else if (flags & PGO_CLEANIT) {
1109735dd21eSpooka
1110735dd21eSpooka /*
1111735dd21eSpooka * if we still have some hope to pull this vnode off
1112735dd21eSpooka * from the syncer queue, write-protect the page.
1113735dd21eSpooka */
1114735dd21eSpooka
111505a3457eSad if (cleanall && wasclean) {
1116735dd21eSpooka
1117735dd21eSpooka /*
1118735dd21eSpooka * uobj pages get wired only by uvm_fault
1119735dd21eSpooka * where uobj is locked.
1120735dd21eSpooka */
1121735dd21eSpooka
1122735dd21eSpooka if (pg->wire_count == 0) {
1123735dd21eSpooka pmap_page_protect(pg,
1124735dd21eSpooka VM_PROT_READ|VM_PROT_EXECUTE);
112505a3457eSad pgprotected = true;
1126735dd21eSpooka } else {
1127735dd21eSpooka cleanall = false;
1128735dd21eSpooka }
1129735dd21eSpooka }
1130735dd21eSpooka }
1131735dd21eSpooka
1132735dd21eSpooka if (flags & PGO_CLEANIT) {
113305a3457eSad needs_clean = uvm_pagecheckdirty(pg, pgprotected);
1134735dd21eSpooka } else {
1135735dd21eSpooka needs_clean = false;
1136735dd21eSpooka }
1137735dd21eSpooka
1138735dd21eSpooka /*
1139735dd21eSpooka * if we're cleaning, build a cluster.
114005a3457eSad * the cluster will consist of pages which are currently dirty.
1141735dd21eSpooka * if not cleaning, just operate on the one page.
1142735dd21eSpooka */
1143735dd21eSpooka
1144735dd21eSpooka if (needs_clean) {
1145735dd21eSpooka wasclean = false;
1146735dd21eSpooka memset(pgs, 0, sizeof(pgs));
1147735dd21eSpooka pg->flags |= PG_BUSY;
1148735dd21eSpooka UVM_PAGE_OWN(pg, "genfs_putpages");
1149735dd21eSpooka
1150735dd21eSpooka /*
1151e406c140Schs * let the fs constrain the offset range of the cluster.
1152e406c140Schs * we additionally constrain the range here such that
1153e406c140Schs * it fits in the "pgs" pages array.
1154e406c140Schs */
1155e406c140Schs
1156881d12e6Sad off_t fslo, fshi, genlo, lo, off = pg->offset;
1157e406c140Schs GOP_PUTRANGE(vp, off, &fslo, &fshi);
1158e406c140Schs KASSERT(fslo == trunc_page(fslo));
1159e406c140Schs KASSERT(fslo <= off);
1160e406c140Schs KASSERT(fshi == trunc_page(fshi));
1161e406c140Schs KASSERT(fshi == 0 || off < fshi);
1162e406c140Schs
1163e406c140Schs if (off > MAXPHYS / 2)
1164e406c140Schs genlo = trunc_page(off - (MAXPHYS / 2));
1165e406c140Schs else
1166e406c140Schs genlo = 0;
1167e406c140Schs lo = MAX(fslo, genlo);
1168e406c140Schs
1169e406c140Schs /*
1170735dd21eSpooka * first look backward.
1171735dd21eSpooka */
1172735dd21eSpooka
1173e406c140Schs npages = (off - lo) >> PAGE_SHIFT;
1174735dd21eSpooka nback = npages;
117505a3457eSad uvn_findpages(uobj, off - PAGE_SIZE, &nback,
117605a3457eSad &pgs[0], NULL,
1177735dd21eSpooka UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
1178735dd21eSpooka if (nback) {
1179735dd21eSpooka memmove(&pgs[0], &pgs[npages - nback],
1180735dd21eSpooka nback * sizeof(pgs[0]));
1181735dd21eSpooka if (npages - nback < nback)
1182735dd21eSpooka memset(&pgs[nback], 0,
1183735dd21eSpooka (npages - nback) * sizeof(pgs[0]));
1184735dd21eSpooka else
1185735dd21eSpooka memset(&pgs[npages - nback], 0,
1186735dd21eSpooka nback * sizeof(pgs[0]));
1187735dd21eSpooka }
1188735dd21eSpooka
1189735dd21eSpooka /*
1190735dd21eSpooka * then plug in our page of interest.
1191735dd21eSpooka */
1192735dd21eSpooka
1193735dd21eSpooka pgs[nback] = pg;
1194735dd21eSpooka
1195735dd21eSpooka /*
1196735dd21eSpooka * then look forward to fill in the remaining space in
1197735dd21eSpooka * the array of pages.
119805a3457eSad *
119905a3457eSad * pass our cached array of pages so that hopefully
120005a3457eSad * uvn_findpages can find some good pages in it.
120105a3457eSad * the array a was filled above with one of the
120205a3457eSad * following sets of flags:
120305a3457eSad * 0
120405a3457eSad * UVM_PAGE_ARRAY_FILL_DIRTY
120505a3457eSad * UVM_PAGE_ARRAY_FILL_DIRTY|WRITEBACK
12062806b3daSad *
12072806b3daSad * XXX this is fragile but it'll work: the array
12082806b3daSad * was earlier filled sparsely, but UFP_DIRTYONLY
12092806b3daSad * implies dense. see corresponding comment in
12102806b3daSad * uvn_findpages().
1211735dd21eSpooka */
1212735dd21eSpooka
1213d18e278dSchristos npages = MAXPAGES - nback - 1;
1214e406c140Schs if (fshi)
1215e406c140Schs npages = MIN(npages,
1216e406c140Schs (fshi - off - 1) >> PAGE_SHIFT);
1217735dd21eSpooka uvn_findpages(uobj, off + PAGE_SIZE, &npages,
12182806b3daSad &pgs[nback + 1], &a,
1219735dd21eSpooka UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
1220735dd21eSpooka npages += nback + 1;
1221735dd21eSpooka } else {
1222735dd21eSpooka pgs[0] = pg;
1223735dd21eSpooka npages = 1;
1224735dd21eSpooka nback = 0;
1225735dd21eSpooka }
1226735dd21eSpooka
1227735dd21eSpooka /*
1228735dd21eSpooka * apply FREE or DEACTIVATE options if requested.
1229735dd21eSpooka */
1230735dd21eSpooka
1231735dd21eSpooka for (i = 0; i < npages; i++) {
1232735dd21eSpooka tpg = pgs[i];
1233735dd21eSpooka KASSERT(tpg->uobject == uobj);
123405a3457eSad KASSERT(i == 0 ||
123505a3457eSad pgs[i-1]->offset + PAGE_SIZE == tpg->offset);
123605a3457eSad KASSERT(!needs_clean || uvm_pagegetdirty(pgs[i]) !=
123705a3457eSad UVM_PAGE_STATUS_DIRTY);
123805a3457eSad if (needs_clean) {
123905a3457eSad /*
124005a3457eSad * mark pages as WRITEBACK so that concurrent
124105a3457eSad * fsync can find and wait for our activities.
124205a3457eSad */
124319303cecSchs uvm_obj_page_set_writeback(pgs[i]);
124405a3457eSad }
1245735dd21eSpooka if (tpg->offset < startoff || tpg->offset >= endoff)
1246735dd21eSpooka continue;
1247735dd21eSpooka if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) {
124894843b13Sad uvm_pagelock(tpg);
1249735dd21eSpooka uvm_pagedeactivate(tpg);
125094843b13Sad uvm_pageunlock(tpg);
1251735dd21eSpooka } else if (flags & PGO_FREE) {
1252735dd21eSpooka pmap_page_protect(tpg, VM_PROT_NONE);
1253735dd21eSpooka if (tpg->flags & PG_BUSY) {
1254735dd21eSpooka tpg->flags |= freeflag;
1255735dd21eSpooka if (pagedaemon) {
12564a780c9aSad uvm_pageout_start(1);
125794843b13Sad uvm_pagelock(tpg);
1258735dd21eSpooka uvm_pagedequeue(tpg);
125994843b13Sad uvm_pageunlock(tpg);
1260735dd21eSpooka }
1261735dd21eSpooka } else {
1262735dd21eSpooka
1263735dd21eSpooka /*
1264735dd21eSpooka * ``page is not busy''
1265735dd21eSpooka * implies that npages is 1
1266735dd21eSpooka * and needs_clean is false.
1267735dd21eSpooka */
1268735dd21eSpooka
1269881d12e6Sad KASSERT(npages == 1);
1270881d12e6Sad KASSERT(!needs_clean);
1271881d12e6Sad KASSERT(pg == tpg);
1272881d12e6Sad KASSERT(nextoff ==
1273881d12e6Sad tpg->offset + PAGE_SIZE);
1274735dd21eSpooka uvm_pagefree(tpg);
1275735dd21eSpooka if (pagedaemon)
1276735dd21eSpooka uvmexp.pdfreed++;
1277735dd21eSpooka }
1278735dd21eSpooka }
1279735dd21eSpooka }
1280735dd21eSpooka if (needs_clean) {
1281735dd21eSpooka modified = true;
1282881d12e6Sad KASSERT(nextoff == pg->offset + PAGE_SIZE);
1283881d12e6Sad KASSERT(nback < npages);
1284881d12e6Sad nextoff = pg->offset + ((npages - nback) << PAGE_SHIFT);
1285881d12e6Sad KASSERT(pgs[nback] == pg);
1286881d12e6Sad KASSERT(nextoff == pgs[npages - 1]->offset + PAGE_SIZE);
1287735dd21eSpooka
1288735dd21eSpooka /*
1289881d12e6Sad * start the i/o.
1290735dd21eSpooka */
1291d2a0ebb6Sad rw_exit(slock);
1292735dd21eSpooka error = GOP_WRITE(vp, pgs, npages, flags);
1293881d12e6Sad /*
1294881d12e6Sad * as we dropped the object lock, our cached pages can
1295881d12e6Sad * be stale.
1296881d12e6Sad */
1297881d12e6Sad uvm_page_array_clear(&a);
1298d2a0ebb6Sad rw_enter(slock, RW_WRITER);
1299735dd21eSpooka if (error) {
1300735dd21eSpooka break;
1301735dd21eSpooka }
1302735dd21eSpooka }
1303735dd21eSpooka }
1304881d12e6Sad uvm_page_array_fini(&a);
1305735dd21eSpooka
130605a3457eSad /*
130705a3457eSad * update ctime/mtime if the modification we started writing out might
130805a3457eSad * be from mmap'ed write.
130905a3457eSad *
131005a3457eSad * this is necessary when an application keeps a file mmapped and
131105a3457eSad * repeatedly modifies it via the mapping. note that, because we
131205a3457eSad * don't always write-protect pages when cleaning, such modifications
131305a3457eSad * might not involve any page faults.
131405a3457eSad */
131505a3457eSad
1316d2a0ebb6Sad mutex_enter(vp->v_interlock);
1317da3ef92bSad if (modified && (vp->v_iflag & VI_WRMAP) != 0 &&
1318735dd21eSpooka (vp->v_type != VBLK ||
1319735dd21eSpooka (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
1320735dd21eSpooka GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
1321735dd21eSpooka }
1322735dd21eSpooka
1323735dd21eSpooka /*
132405a3457eSad * if we no longer have any possibly dirty pages, take us off the
132505a3457eSad * syncer list.
1326735dd21eSpooka */
1327735dd21eSpooka
132819303cecSchs if ((vp->v_iflag & VI_ONWORKLST) != 0 && uvm_obj_clean_p(uobj) &&
132919303cecSchs LIST_EMPTY(&vp->v_dirtyblkhd)) {
1330735dd21eSpooka vn_syncer_remove_from_worklist(vp);
1331735dd21eSpooka }
1332735dd21eSpooka
13334a780c9aSad /* Wait for output to complete. */
1334d2a0ebb6Sad rw_exit(slock);
13354a780c9aSad if (!wasclean && !async && vp->v_numoutput != 0) {
13364a780c9aSad while (vp->v_numoutput != 0)
1337d2a0ebb6Sad cv_wait(&vp->v_cv, vp->v_interlock);
1338735dd21eSpooka }
133936c701bcSyamt onworklst = (vp->v_iflag & VI_ONWORKLST) != 0;
1340d2a0ebb6Sad mutex_exit(vp->v_interlock);
1341735dd21eSpooka
134236c701bcSyamt if ((flags & PGO_RECLAIM) != 0 && onworklst) {
134336c701bcSyamt /*
134436c701bcSyamt  * in the case of PGO_RECLAIM, make sure the vnode ends up clean.
134536c701bcSyamt * retrying is not a big deal because, in many cases,
134636c701bcSyamt * uobj->uo_npages is already 0 here.
134736c701bcSyamt */
1348d2a0ebb6Sad rw_enter(slock, RW_WRITER);
134936c701bcSyamt goto retry;
135036c701bcSyamt }
135136c701bcSyamt
135206a21e4cShannken if (trans_mp) {
135306a21e4cShannken if (holds_wapbl)
135406a21e4cShannken WAPBL_END(trans_mp);
135506a21e4cShannken fstrans_done(trans_mp);
135644f3404fShannken }
13570789b071Shannken
1358735dd21eSpooka return (error);
1359735dd21eSpooka }
1360735dd21eSpooka
1361e406c140Schs /*
1362e406c140Schs * Default putrange method for file systems that do not care
1363e406c140Schs * how many pages are given to one GOP_WRITE() call.
1364e406c140Schs */
1365e406c140Schs void
1366e406c140Schs genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
1367e406c140Schs {
1368e406c140Schs
1369e406c140Schs *lop = 0;
1370e406c140Schs *hip = 0;
1371e406c140Schs }
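
/*
 * A file system can supply its own putrange method to bound how much is
 * handed to a single GOP_WRITE() call.  The sketch below is illustrative
 * only and is not tree code: "myfs_gop_putrange" is a hypothetical name,
 * and it assumes that *lop/*hip delimit the offset window around "off"
 * that may be clustered into one GOP_WRITE(), with the 0/0 answer of the
 * default method above meaning "no restriction".
 */
#if 0	/* example only */
void
myfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
{
	const off_t window = 1024 * 1024;	/* cluster at most 1 MB */

	*lop = off & ~(window - 1);	/* round down to a 1 MB boundary */
	*hip = *lop + window;		/* and stop at the next one */
}
#endif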
1372e406c140Schs
1373735dd21eSpooka int
1374735dd21eSpooka genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
1375735dd21eSpooka {
1376735dd21eSpooka off_t off;
1377735dd21eSpooka vaddr_t kva;
1378735dd21eSpooka size_t len;
1379735dd21eSpooka int error;
1380735dd21eSpooka UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1381735dd21eSpooka
1382cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
1383cb32a134Spgoyette (uintptr_t)vp, (uintptr_t)pgs, npages, flags);
1384735dd21eSpooka
1385735dd21eSpooka off = pgs[0]->offset;
1386735dd21eSpooka kva = uvm_pagermapin(pgs, npages,
1387735dd21eSpooka UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
1388735dd21eSpooka len = npages << PAGE_SHIFT;
1389735dd21eSpooka
1390735dd21eSpooka error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
13915232c510Schs uvm_aio_aiodone);
1392735dd21eSpooka
1393735dd21eSpooka return error;
1394735dd21eSpooka }
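
/*
 * For context, a minimal sketch of how a file system typically selects
 * genfs_gop_write through its genfs_ops table, so that the GOP_WRITE()
 * calls above resolve to it.  Helper names such as "myfs_gop_size" are
 * placeholders, not actual tree code, and the member names are assumed
 * to correspond to the GOP_*() hooks used in this file.
 */
#if 0	/* example only */
static const struct genfs_ops myfs_genfsops = {
	.gop_size = myfs_gop_size,		/* hypothetical fs helper */
	.gop_alloc = myfs_gop_alloc,		/* hypothetical fs helper */
	.gop_write = genfs_gop_write,		/* plain page write-back */
	.gop_markupdate = myfs_gop_markupdate,	/* hypothetical fs helper */
	.gop_putrange = genfs_gop_putrange,	/* no clustering limit */
};
#endif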
1395735dd21eSpooka
1396881d12e6Sad /*
1397881d12e6Sad * genfs_gop_write_rwmap:
1398881d12e6Sad *
1399881d12e6Sad * a variant of genfs_gop_write. it's used by UDF for its directory buffers.
1400881d12e6Sad  * this maps the pages with PROT_WRITE so that VOP_STRATEGY can modify
1401881d12e6Sad  * their contents before writing them out to the underlying storage.
1402881d12e6Sad */
1403881d12e6Sad
1404e979c658Sreinoud int
1405881d12e6Sad genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages,
1406881d12e6Sad int flags)
1407e979c658Sreinoud {
1408e979c658Sreinoud off_t off;
1409e979c658Sreinoud vaddr_t kva;
1410e979c658Sreinoud size_t len;
1411e979c658Sreinoud int error;
1412e979c658Sreinoud UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1413e979c658Sreinoud
1414cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
1415cb32a134Spgoyette (uintptr_t)vp, (uintptr_t)pgs, npages, flags);
1416e979c658Sreinoud
1417e979c658Sreinoud off = pgs[0]->offset;
1418e979c658Sreinoud kva = uvm_pagermapin(pgs, npages,
1419e979c658Sreinoud UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
1420e979c658Sreinoud len = npages << PAGE_SHIFT;
1421e979c658Sreinoud
1422e979c658Sreinoud error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
14235232c510Schs uvm_aio_aiodone);
1424e979c658Sreinoud
1425e979c658Sreinoud return error;
1426e979c658Sreinoud }
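
/*
 * A file system whose VOP_STRATEGY rewrites buffer contents on the way to
 * disk (as UDF does for its directory buffers) would simply point the
 * gop_write member of its genfs_ops at genfs_gop_write_rwmap instead of
 * genfs_gop_write; everything else stays as in the sketch above.
 */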
1427e979c658Sreinoud
1428735dd21eSpooka /*
1429735dd21eSpooka * Backend routine for doing I/O to vnode pages. Pages are already locked
1430735dd21eSpooka * and mapped into kernel memory. Here we just look up the underlying
1431735dd21eSpooka * device block addresses and call the strategy routine.
1432735dd21eSpooka */
1433735dd21eSpooka
1434735dd21eSpooka static int
1435735dd21eSpooka genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags,
1436735dd21eSpooka enum uio_rw rw, void (*iodone)(struct buf *))
1437735dd21eSpooka {
14381b9d02ceSuebayasi int s, error;
1439735dd21eSpooka int fs_bshift, dev_bshift;
1440735dd21eSpooka off_t eof, offset, startoffset;
1441735dd21eSpooka size_t bytes, iobytes, skipbytes;
1442735dd21eSpooka struct buf *mbp, *bp;
144364cb3c88Suebayasi const bool async = (flags & PGO_SYNCIO) == 0;
14448306a9edSchs 	const bool lazy = (flags & PGO_LAZY) != 0;
144564cb3c88Suebayasi const bool iowrite = rw == UIO_WRITE;
144664cb3c88Suebayasi const int brw = iowrite ? B_WRITE : B_READ;
1447735dd21eSpooka UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1448735dd21eSpooka
1449cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx kva %#jx len 0x%jx flags 0x%jx",
1450cb32a134Spgoyette (uintptr_t)vp, (uintptr_t)kva, len, flags);
1451735dd21eSpooka
1452*fda613dfSriastradh KASSERT(vp->v_size != VSIZENOTSET);
1453*fda613dfSriastradh KASSERT(vp->v_writesize != VSIZENOTSET);
1454*fda613dfSriastradh KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
1455*fda613dfSriastradh " v_size=0x%llx v_writesize=0x%llx", vp,
1456*fda613dfSriastradh (unsigned long long)vp->v_size,
1457*fda613dfSriastradh (unsigned long long)vp->v_writesize);
1458735dd21eSpooka GOP_SIZE(vp, vp->v_writesize, &eof, 0);
1459735dd21eSpooka if (vp->v_type != VBLK) {
1460735dd21eSpooka fs_bshift = vp->v_mount->mnt_fs_bshift;
1461735dd21eSpooka dev_bshift = vp->v_mount->mnt_dev_bshift;
1462735dd21eSpooka } else {
1463735dd21eSpooka fs_bshift = DEV_BSHIFT;
1464735dd21eSpooka dev_bshift = DEV_BSHIFT;
1465735dd21eSpooka }
1466735dd21eSpooka error = 0;
1467735dd21eSpooka startoffset = off;
1468735dd21eSpooka bytes = MIN(len, eof - startoffset);
1469735dd21eSpooka skipbytes = 0;
1470735dd21eSpooka KASSERT(bytes != 0);
1471735dd21eSpooka
147264cb3c88Suebayasi if (iowrite) {
1473881d12e6Sad /*
1474881d12e6Sad * why += 2?
1475881d12e6Sad * 1 for biodone, 1 for uvm_aio_aiodone.
1476881d12e6Sad */
1477e225b7bdSrmind mutex_enter(vp->v_interlock);
1478735dd21eSpooka vp->v_numoutput += 2;
1479e225b7bdSrmind mutex_exit(vp->v_interlock);
1480735dd21eSpooka }
14814a780c9aSad mbp = getiobuf(vp, true);
1482cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
1483cb32a134Spgoyette (uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
1484735dd21eSpooka mbp->b_bufsize = len;
1485735dd21eSpooka mbp->b_data = (void *)kva;
1486735dd21eSpooka mbp->b_resid = mbp->b_bcount = bytes;
148701f564d8Sad mbp->b_cflags |= BC_BUSY | BC_AGE;
14884a780c9aSad if (async) {
14894a780c9aSad mbp->b_flags = brw | B_ASYNC;
1490735dd21eSpooka mbp->b_iodone = iodone;
14914a780c9aSad } else {
14924a780c9aSad mbp->b_flags = brw;
14934a780c9aSad mbp->b_iodone = NULL;
14944a780c9aSad }
1495735dd21eSpooka if (curlwp == uvm.pagedaemon_lwp)
1496735dd21eSpooka BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
14978306a9edSchs else if (async || lazy)
1498735dd21eSpooka BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL);
1499735dd21eSpooka else
1500735dd21eSpooka BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
1501735dd21eSpooka
1502735dd21eSpooka bp = NULL;
1503735dd21eSpooka for (offset = startoffset;
1504735dd21eSpooka bytes > 0;
1505735dd21eSpooka offset += iobytes, bytes -= iobytes) {
15061b9d02ceSuebayasi int run;
15071b9d02ceSuebayasi daddr_t lbn, blkno;
15081b9d02ceSuebayasi struct vnode *devvp;
15091b9d02ceSuebayasi
15101b9d02ceSuebayasi /*
15111b9d02ceSuebayasi 		 * bmap the file to find out the blkno to do the i/o on and
15121b9d02ceSuebayasi 		 * how much we can transfer in one i/o. if bmap returns an error,
15131b9d02ceSuebayasi * skip the rest of the top-level i/o.
15141b9d02ceSuebayasi */
15151b9d02ceSuebayasi
1516735dd21eSpooka lbn = offset >> fs_bshift;
1517735dd21eSpooka error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
1518735dd21eSpooka if (error) {
1519bf748078Ssimonb UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
15201b9d02ceSuebayasi lbn, error, 0, 0);
1521735dd21eSpooka skipbytes += bytes;
1522735dd21eSpooka bytes = 0;
15231b9d02ceSuebayasi goto loopdone;
1524735dd21eSpooka }
1525735dd21eSpooka
15261b9d02ceSuebayasi /*
15271b9d02ceSuebayasi 		 * see how much of the remaining transfer falls within the
15281b9d02ceSuebayasi 		 * contiguous extent that VOP_BMAP returned, and clamp the
15291b9d02ceSuebayasi 		 * size of this i/o to it.
15301b9d02ceSuebayasi */
15311b9d02ceSuebayasi
1532735dd21eSpooka iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
1533735dd21eSpooka bytes);
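		/*
		 * Worked example (illustrative numbers): with 8 kB file
		 * system blocks (fs_bshift = 13), offset = 0x5000, lbn = 2
		 * and run = 3, the extent ends at (2 + 1 + 3) << 13 = 0xc000,
		 * so iobytes = MIN(0xc000 - 0x5000, bytes) = MIN(0x7000, bytes).
		 */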
15341b9d02ceSuebayasi
15351b9d02ceSuebayasi /*
15361b9d02ceSuebayasi 		 * if this block isn't allocated, do no i/o for it:
15371b9d02ceSuebayasi 		 * for a read, zero the corresponding part of the buffer
15381b9d02ceSuebayasi 		 * instead; for a write, just skip it.
15391b9d02ceSuebayasi */
15401b9d02ceSuebayasi
1541735dd21eSpooka if (blkno == (daddr_t)-1) {
154264cb3c88Suebayasi if (!iowrite) {
1543735dd21eSpooka memset((char *)kva + (offset - startoffset), 0,
1544735dd21eSpooka iobytes);
1545735dd21eSpooka }
1546735dd21eSpooka skipbytes += iobytes;
1547735dd21eSpooka continue;
1548735dd21eSpooka }
1549735dd21eSpooka
15501b9d02ceSuebayasi /*
15511b9d02ceSuebayasi * allocate a sub-buf for this piece of the i/o
15521b9d02ceSuebayasi * (or just use mbp if there's only 1 piece),
15531b9d02ceSuebayasi * and start it going.
15541b9d02ceSuebayasi */
15551b9d02ceSuebayasi
1556735dd21eSpooka if (offset == startoffset && iobytes == bytes) {
1557735dd21eSpooka bp = mbp;
1558735dd21eSpooka } else {
1559cb32a134Spgoyette UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
1560cb32a134Spgoyette (uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
15614a780c9aSad bp = getiobuf(vp, true);
1562735dd21eSpooka nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
1563735dd21eSpooka }
1564735dd21eSpooka bp->b_lblkno = 0;
1565735dd21eSpooka
1566735dd21eSpooka /* adjust physical blkno for partial blocks */
1567735dd21eSpooka bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
1568735dd21eSpooka dev_bshift);
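		/*
		 * Worked example (illustrative numbers): with fs_bshift = 13
		 * and dev_bshift = 9 (512-byte device blocks), offset 0x2400
		 * lies 0x400 bytes into lbn 1 (file byte 0x2000), so
		 * b_blkno = blkno + (0x400 >> 9) = blkno + 2 sectors.
		 */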
15691b9d02ceSuebayasi
1570735dd21eSpooka UVMHIST_LOG(ubchist,
1571cb32a134Spgoyette "bp %#jx offset 0x%jx bcount 0x%jx blkno 0x%jx",
1572cb32a134Spgoyette (uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
1573735dd21eSpooka
1574735dd21eSpooka VOP_STRATEGY(devvp, bp);
1575735dd21eSpooka }
15761b9d02ceSuebayasi
15771b9d02ceSuebayasi loopdone:
1578735dd21eSpooka if (skipbytes) {
1579cb32a134Spgoyette UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
1580735dd21eSpooka }
1581735dd21eSpooka nestiobuf_done(mbp, skipbytes, error);
1582735dd21eSpooka if (async) {
1583735dd21eSpooka UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
1584735dd21eSpooka return (0);
1585735dd21eSpooka }
1586cb32a134Spgoyette UVMHIST_LOG(ubchist, "waiting for mbp %#jx", (uintptr_t)mbp, 0, 0, 0);
1587735dd21eSpooka error = biowait(mbp);
1588735dd21eSpooka s = splbio();
1589735dd21eSpooka (*iodone)(mbp);
1590735dd21eSpooka splx(s);
1591cb32a134Spgoyette UVMHIST_LOG(ubchist, "returning, error %jd", error, 0, 0, 0);
1592735dd21eSpooka return (error);
1593735dd21eSpooka }
1594735dd21eSpooka
1595735dd21eSpooka int
1596735dd21eSpooka genfs_compat_getpages(void *v)
1597735dd21eSpooka {
1598735dd21eSpooka struct vop_getpages_args /* {
1599735dd21eSpooka struct vnode *a_vp;
1600735dd21eSpooka voff_t a_offset;
1601735dd21eSpooka struct vm_page **a_m;
1602735dd21eSpooka int *a_count;
1603735dd21eSpooka int a_centeridx;
1604735dd21eSpooka vm_prot_t a_access_type;
1605735dd21eSpooka int a_advice;
1606735dd21eSpooka int a_flags;
1607735dd21eSpooka } */ *ap = v;
1608735dd21eSpooka
1609735dd21eSpooka off_t origoffset;
1610735dd21eSpooka struct vnode *vp = ap->a_vp;
1611735dd21eSpooka struct uvm_object *uobj = &vp->v_uobj;
1612735dd21eSpooka struct vm_page *pg, **pgs;
1613735dd21eSpooka vaddr_t kva;
1614735dd21eSpooka int i, error, orignpages, npages;
1615735dd21eSpooka struct iovec iov;
1616735dd21eSpooka struct uio uio;
1617735dd21eSpooka kauth_cred_t cred = curlwp->l_cred;
161864cb3c88Suebayasi const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
1619735dd21eSpooka
1620735dd21eSpooka error = 0;
1621735dd21eSpooka origoffset = ap->a_offset;
1622735dd21eSpooka orignpages = *ap->a_count;
1623735dd21eSpooka pgs = ap->a_m;
1624735dd21eSpooka
1625735dd21eSpooka if (ap->a_flags & PGO_LOCKED) {
162605a3457eSad uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, NULL,
162764cb3c88Suebayasi 		    UFP_NOWAIT | UFP_NOALLOC | (memwrite ? UFP_NORDONLY : 0));
1628735dd21eSpooka
1629e15697fcSchs error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
1630e15697fcSchs return error;
1631735dd21eSpooka }
1632735dd21eSpooka if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) {
1633d2a0ebb6Sad rw_exit(uobj->vmobjlock);
1634e15697fcSchs return EINVAL;
1635735dd21eSpooka }
1636735dd21eSpooka if ((ap->a_flags & PGO_SYNCIO) == 0) {
1637d2a0ebb6Sad rw_exit(uobj->vmobjlock);
1638735dd21eSpooka return 0;
1639735dd21eSpooka }
1640735dd21eSpooka npages = orignpages;
164105a3457eSad uvn_findpages(uobj, origoffset, &npages, pgs, NULL, UFP_ALL);
1642d2a0ebb6Sad rw_exit(uobj->vmobjlock);
1643735dd21eSpooka kva = uvm_pagermapin(pgs, npages,
1644735dd21eSpooka UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
1645735dd21eSpooka for (i = 0; i < npages; i++) {
1646735dd21eSpooka pg = pgs[i];
1647735dd21eSpooka if ((pg->flags & PG_FAKE) == 0) {
1648735dd21eSpooka continue;
1649735dd21eSpooka }
1650735dd21eSpooka iov.iov_base = (char *)kva + (i << PAGE_SHIFT);
1651735dd21eSpooka iov.iov_len = PAGE_SIZE;
1652735dd21eSpooka uio.uio_iov = &iov;
1653735dd21eSpooka uio.uio_iovcnt = 1;
1654735dd21eSpooka uio.uio_offset = origoffset + (i << PAGE_SHIFT);
1655735dd21eSpooka uio.uio_rw = UIO_READ;
1656735dd21eSpooka uio.uio_resid = PAGE_SIZE;
1657735dd21eSpooka UIO_SETUP_SYSSPACE(&uio);
1658735dd21eSpooka /* XXX vn_lock */
1659735dd21eSpooka error = VOP_READ(vp, &uio, 0, cred);
1660735dd21eSpooka if (error) {
1661735dd21eSpooka break;
1662735dd21eSpooka }
1663735dd21eSpooka if (uio.uio_resid) {
1664735dd21eSpooka memset(iov.iov_base, 0, uio.uio_resid);
1665735dd21eSpooka }
1666735dd21eSpooka }
1667735dd21eSpooka uvm_pagermapout(kva, npages);
1668d2a0ebb6Sad rw_enter(uobj->vmobjlock, RW_WRITER);
1669735dd21eSpooka for (i = 0; i < npages; i++) {
1670735dd21eSpooka pg = pgs[i];
1671735dd21eSpooka if (error && (pg->flags & PG_FAKE) != 0) {
1672735dd21eSpooka pg->flags |= PG_RELEASED;
1673735dd21eSpooka } else {
167405a3457eSad uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
167594843b13Sad uvm_pagelock(pg);
1676735dd21eSpooka uvm_pageactivate(pg);
167794843b13Sad uvm_pageunlock(pg);
1678735dd21eSpooka }
1679735dd21eSpooka }
1680735dd21eSpooka if (error) {
1681735dd21eSpooka uvm_page_unbusy(pgs, npages);
1682735dd21eSpooka }
1683d2a0ebb6Sad rw_exit(uobj->vmobjlock);
1684e15697fcSchs return error;
1685735dd21eSpooka }
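
/*
 * genfs_compat_getpages exists for file systems that lack a native
 * getpages implementation and instead fall back to VOP_READ() per page.
 * A minimal sketch of how such a file system would wire it into its
 * vnode operation vector ("myfs" is a placeholder, not tree code):
 */
#if 0	/* example only */
const struct vnodeopv_entry_desc myfs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_getpages_desc, genfs_compat_getpages },
	{ &vop_putpages_desc, genfs_putpages },
	/* ... remaining operations ... */
	{ NULL, NULL }
};
#endif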
1686735dd21eSpooka
1687735dd21eSpooka int
1688735dd21eSpooka genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
1689735dd21eSpooka int flags)
1690735dd21eSpooka {
1691735dd21eSpooka off_t offset;
1692735dd21eSpooka struct iovec iov;
1693735dd21eSpooka struct uio uio;
1694735dd21eSpooka kauth_cred_t cred = curlwp->l_cred;
1695735dd21eSpooka struct buf *bp;
1696735dd21eSpooka vaddr_t kva;
16974a780c9aSad int error;
1698735dd21eSpooka
1699735dd21eSpooka offset = pgs[0]->offset;
1700735dd21eSpooka kva = uvm_pagermapin(pgs, npages,
1701735dd21eSpooka UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
1702735dd21eSpooka
1703735dd21eSpooka iov.iov_base = (void *)kva;
1704735dd21eSpooka iov.iov_len = npages << PAGE_SHIFT;
1705735dd21eSpooka uio.uio_iov = &iov;
1706735dd21eSpooka uio.uio_iovcnt = 1;
1707735dd21eSpooka uio.uio_offset = offset;
1708735dd21eSpooka uio.uio_rw = UIO_WRITE;
1709735dd21eSpooka uio.uio_resid = npages << PAGE_SHIFT;
1710735dd21eSpooka UIO_SETUP_SYSSPACE(&uio);
1711735dd21eSpooka /* XXX vn_lock */
1712735dd21eSpooka error = VOP_WRITE(vp, &uio, 0, cred);
1713735dd21eSpooka
1714e225b7bdSrmind mutex_enter(vp->v_interlock);
17154a780c9aSad vp->v_numoutput++;
1716e225b7bdSrmind mutex_exit(vp->v_interlock);
1717735dd21eSpooka
17184a780c9aSad bp = getiobuf(vp, true);
171901f564d8Sad bp->b_cflags |= BC_BUSY | BC_AGE;
1720735dd21eSpooka bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift;
1721735dd21eSpooka bp->b_data = (char *)kva;
1722735dd21eSpooka bp->b_bcount = npages << PAGE_SHIFT;
1723735dd21eSpooka bp->b_bufsize = npages << PAGE_SHIFT;
1724735dd21eSpooka bp->b_resid = 0;
1725735dd21eSpooka bp->b_error = error;
1726735dd21eSpooka uvm_aio_aiodone(bp);
1727735dd21eSpooka return (error);
1728735dd21eSpooka }
1729735dd21eSpooka
1730735dd21eSpooka /*
1731735dd21eSpooka * Process a uio using direct I/O. If we reach a part of the request
1732735dd21eSpooka * which cannot be processed in this fashion for some reason, just return.
1733735dd21eSpooka * The caller must handle some additional part of the request using
1734735dd21eSpooka * buffered I/O before trying direct I/O again.
1735735dd21eSpooka */
1736735dd21eSpooka
1737735dd21eSpooka void
1738735dd21eSpooka genfs_directio(struct vnode *vp, struct uio *uio, int ioflag)
1739735dd21eSpooka {
1740735dd21eSpooka struct vmspace *vs;
1741735dd21eSpooka struct iovec *iov;
1742735dd21eSpooka vaddr_t va;
1743735dd21eSpooka size_t len;
1744735dd21eSpooka const int mask = DEV_BSIZE - 1;
1745735dd21eSpooka int error;
1746f5bbefdbSjoerg bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl &&
1747f5bbefdbSjoerg (ioflag & IO_JOURNALLOCKED) == 0);
1748735dd21eSpooka
1749560071c2Sjdolecek #ifdef DIAGNOSTIC
1750560071c2Sjdolecek if ((ioflag & IO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
1751560071c2Sjdolecek WAPBL_JLOCK_ASSERT(vp->v_mount);
1752560071c2Sjdolecek #endif
1753560071c2Sjdolecek
1754735dd21eSpooka /*
1755735dd21eSpooka * We only support direct I/O to user space for now.
1756735dd21eSpooka */
1757735dd21eSpooka
1758735dd21eSpooka if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
1759735dd21eSpooka return;
1760735dd21eSpooka }
1761735dd21eSpooka
1762735dd21eSpooka /*
1763735dd21eSpooka * If the vnode is mapped, we would need to get the getpages lock
1764735dd21eSpooka * to stabilize the bmap, but then we would get into trouble while
1765735dd21eSpooka * locking the pages if the pages belong to this same vnode (or a
1766735dd21eSpooka * multi-vnode cascade to the same effect). Just fall back to
1767735dd21eSpooka * buffered I/O if the vnode is mapped to avoid this mess.
1768735dd21eSpooka */
1769735dd21eSpooka
1770735dd21eSpooka if (vp->v_vflag & VV_MAPPED) {
1771735dd21eSpooka return;
1772735dd21eSpooka }
1773735dd21eSpooka
1774f5bbefdbSjoerg if (need_wapbl) {
1775ac6b1617Shannken error = WAPBL_BEGIN(vp->v_mount);
1776ac6b1617Shannken if (error)
1777ac6b1617Shannken return;
1778ac6b1617Shannken }
1779ac6b1617Shannken
1780735dd21eSpooka /*
1781735dd21eSpooka * Do as much of the uio as possible with direct I/O.
1782735dd21eSpooka */
1783735dd21eSpooka
1784735dd21eSpooka vs = uio->uio_vmspace;
1785735dd21eSpooka while (uio->uio_resid) {
1786735dd21eSpooka iov = uio->uio_iov;
1787735dd21eSpooka if (iov->iov_len == 0) {
1788735dd21eSpooka uio->uio_iov++;
1789735dd21eSpooka uio->uio_iovcnt--;
1790735dd21eSpooka continue;
1791735dd21eSpooka }
1792735dd21eSpooka va = (vaddr_t)iov->iov_base;
1793735dd21eSpooka len = MIN(iov->iov_len, genfs_maxdio);
1794735dd21eSpooka len &= ~mask;
1795735dd21eSpooka
1796735dd21eSpooka /*
1797735dd21eSpooka * If the next chunk is smaller than DEV_BSIZE or extends past
1798735dd21eSpooka * the current EOF, then fall back to buffered I/O.
1799735dd21eSpooka */
1800735dd21eSpooka
1801735dd21eSpooka if (len == 0 || uio->uio_offset + len > vp->v_size) {
1802ac6b1617Shannken break;
1803735dd21eSpooka }
1804735dd21eSpooka
1805735dd21eSpooka /*
1806735dd21eSpooka * Check alignment. The file offset must be at least
1807735dd21eSpooka * sector-aligned. The exact constraint on memory alignment
1808735dd21eSpooka * is very hardware-dependent, but requiring sector-aligned
1809735dd21eSpooka * addresses there too is safe.
1810735dd21eSpooka */
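		/*
		 * Example with the usual DEV_BSIZE of 512 (mask = 0x1ff):
		 * offset 0x10000 with a sector-aligned buffer passes, while
		 * offset 0x10001, or a buffer at an odd address, falls back
		 * to buffered I/O.  (illustrative values)
		 */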
1811735dd21eSpooka
1812735dd21eSpooka if (uio->uio_offset & mask || va & mask) {
1813ac6b1617Shannken break;
1814735dd21eSpooka }
1815735dd21eSpooka error = genfs_do_directio(vs, va, len, vp, uio->uio_offset,
1816735dd21eSpooka uio->uio_rw);
1817735dd21eSpooka if (error) {
1818735dd21eSpooka break;
1819735dd21eSpooka }
1820735dd21eSpooka iov->iov_base = (char *)iov->iov_base + len;
1821735dd21eSpooka iov->iov_len -= len;
1822735dd21eSpooka uio->uio_offset += len;
1823735dd21eSpooka uio->uio_resid -= len;
1824735dd21eSpooka }
1825ac6b1617Shannken
1826f5bbefdbSjoerg if (need_wapbl)
1827ac6b1617Shannken WAPBL_END(vp->v_mount);
1828735dd21eSpooka }
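
/*
 * For context: genfs_directio() is meant to be called from a file system's
 * read/write path when the caller requested direct I/O, with buffered I/O
 * handling whatever remains.  A hedged sketch of the calling convention;
 * the surrounding function and its ioflag handling are assumptions, not
 * tree code.
 */
#if 0	/* example only */
	if ((ioflag & IO_DIRECT) != 0) {
		/* Consume as much of the uio as direct I/O allows. */
		genfs_directio(vp, uio, ioflag);
		if (uio->uio_resid == 0)
			return 0;
	}
	/* ...fall through to the normal buffered read/write loop... */
#endif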
1829735dd21eSpooka
1830735dd21eSpooka /*
1831735dd21eSpooka * Iodone routine for direct I/O. We don't do much here since the request is
1832735dd21eSpooka * always synchronous, so the caller will do most of the work after biowait().
1833735dd21eSpooka */
1834735dd21eSpooka
1835735dd21eSpooka static void
1836735dd21eSpooka genfs_dio_iodone(struct buf *bp)
1837735dd21eSpooka {
1838735dd21eSpooka
1839735dd21eSpooka KASSERT((bp->b_flags & B_ASYNC) == 0);
18404a780c9aSad if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) {
18414a780c9aSad mutex_enter(bp->b_objlock);
1842735dd21eSpooka vwakeup(bp);
18434a780c9aSad mutex_exit(bp->b_objlock);
1844735dd21eSpooka }
1845735dd21eSpooka putiobuf(bp);
1846735dd21eSpooka }
1847735dd21eSpooka
1848735dd21eSpooka /*
1849735dd21eSpooka * Process one chunk of a direct I/O request.
1850735dd21eSpooka */
1851735dd21eSpooka
1852735dd21eSpooka static int
1853735dd21eSpooka genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp,
1854735dd21eSpooka off_t off, enum uio_rw rw)
1855735dd21eSpooka {
1856735dd21eSpooka struct vm_map *map;
185769d5d9c0Smartin struct pmap *upm, *kpm __unused;
1858735dd21eSpooka size_t klen = round_page(uva + len) - trunc_page(uva);
1859735dd21eSpooka off_t spoff, epoff;
1860735dd21eSpooka vaddr_t kva, puva;
1861735dd21eSpooka paddr_t pa;
1862735dd21eSpooka vm_prot_t prot;
18636a2419feSmartin int error, rv __diagused, poff, koff;
1864ac6b1617Shannken const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED |
1865735dd21eSpooka (rw == UIO_WRITE ? PGO_FREE : 0);
1866735dd21eSpooka
1867735dd21eSpooka /*
1868735dd21eSpooka * For writes, verify that this range of the file already has fully
1869735dd21eSpooka * allocated backing store. If there are any holes, just punt and
1870735dd21eSpooka * make the caller take the buffered write path.
1871735dd21eSpooka */
1872735dd21eSpooka
1873735dd21eSpooka if (rw == UIO_WRITE) {
1874735dd21eSpooka daddr_t lbn, elbn, blkno;
1875735dd21eSpooka int bsize, bshift, run;
1876735dd21eSpooka
1877735dd21eSpooka bshift = vp->v_mount->mnt_fs_bshift;
1878735dd21eSpooka bsize = 1 << bshift;
1879735dd21eSpooka lbn = off >> bshift;
1880735dd21eSpooka elbn = (off + len + bsize - 1) >> bshift;
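		/*
		 * Worked example (illustrative numbers): with 8 kB blocks
		 * (bshift = 13), off = 0x5000 and len = 0x4000 give lbn = 2
		 * and elbn = 5, so the loop below checks logical blocks 2..4
		 * for holes before allowing the direct write.
		 */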
1881735dd21eSpooka while (lbn < elbn) {
1882735dd21eSpooka error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
1883735dd21eSpooka if (error) {
1884735dd21eSpooka return error;
1885735dd21eSpooka }
1886735dd21eSpooka if (blkno == (daddr_t)-1) {
1887735dd21eSpooka return ENOSPC;
1888735dd21eSpooka }
1889735dd21eSpooka lbn += 1 + run;
1890735dd21eSpooka }
1891735dd21eSpooka }
1892735dd21eSpooka
1893735dd21eSpooka /*
1894735dd21eSpooka * Flush any cached pages for parts of the file that we're about to
1895735dd21eSpooka * access. If we're writing, invalidate pages as well.
1896735dd21eSpooka */
1897735dd21eSpooka
1898735dd21eSpooka spoff = trunc_page(off);
1899735dd21eSpooka epoff = round_page(off + len);
190013162282Sad rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1901735dd21eSpooka error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags);
1902735dd21eSpooka if (error) {
1903735dd21eSpooka return error;
1904735dd21eSpooka }
1905735dd21eSpooka
1906735dd21eSpooka /*
1907735dd21eSpooka * Wire the user pages and remap them into kernel memory.
1908735dd21eSpooka */
1909735dd21eSpooka
1910735dd21eSpooka prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ;
1911735dd21eSpooka error = uvm_vslock(vs, (void *)uva, len, prot);
1912735dd21eSpooka if (error) {
1913735dd21eSpooka return error;
1914735dd21eSpooka }
1915735dd21eSpooka
1916735dd21eSpooka map = &vs->vm_map;
1917735dd21eSpooka upm = vm_map_pmap(map);
1918735dd21eSpooka kpm = vm_map_pmap(kernel_map);
1919735dd21eSpooka puva = trunc_page(uva);
19202c6de4b4Smatt kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask,
19212c6de4b4Smatt UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH);
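	/*
	 * The KVA is allocated with the same virtual color as the user
	 * address (UVM_KMF_COLORMATCH with atop(puva) & uvmexp.colormask)
	 * so that the kernel alias of each page lands in the same cache
	 * bin as the user mapping, avoiding aliasing problems on
	 * virtually-indexed caches.
	 */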
1922735dd21eSpooka for (poff = 0; poff < klen; poff += PAGE_SIZE) {
1923735dd21eSpooka rv = pmap_extract(upm, puva + poff, &pa);
1924735dd21eSpooka KASSERT(rv);
19252c6de4b4Smatt pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED);
19268c8246dcSrmind }
1927735dd21eSpooka pmap_update(kpm);
1928735dd21eSpooka
1929735dd21eSpooka /*
1930735dd21eSpooka * Do the I/O.
1931735dd21eSpooka */
1932735dd21eSpooka
1933735dd21eSpooka koff = uva - trunc_page(uva);
1934735dd21eSpooka error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw,
1935735dd21eSpooka genfs_dio_iodone);
1936735dd21eSpooka
1937735dd21eSpooka /*
1938735dd21eSpooka * Tear down the kernel mapping.
1939735dd21eSpooka */
1940735dd21eSpooka
19412c6de4b4Smatt pmap_kremove(kva, klen);
1942735dd21eSpooka pmap_update(kpm);
1943735dd21eSpooka uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY);
1944735dd21eSpooka
1945735dd21eSpooka /*
1946735dd21eSpooka * Unwire the user pages.
1947735dd21eSpooka */
1948735dd21eSpooka
1949735dd21eSpooka uvm_vsunlock(vs, (void *)uva, len);
1950735dd21eSpooka return error;
1951735dd21eSpooka }