xref: /openbsd-src/sys/uvm/uvm_fault.c (revision 4016c7defd6f4ebd4fcdcfbd79f9ddc6bf06e457)
1*4016c7deSmpi /*	$OpenBSD: uvm_fault.c,v 1.162 2025/01/22 10:52:09 mpi Exp $	*/
22c932f6fSmiod /*	$NetBSD: uvm_fault.c,v 1.51 2000/08/06 00:22:53 thorpej Exp $	*/
3cd7ee8acSart 
4cd7ee8acSart /*
5cd7ee8acSart  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6cd7ee8acSart  * All rights reserved.
7cd7ee8acSart  *
8cd7ee8acSart  * Redistribution and use in source and binary forms, with or without
9cd7ee8acSart  * modification, are permitted provided that the following conditions
10cd7ee8acSart  * are met:
11cd7ee8acSart  * 1. Redistributions of source code must retain the above copyright
12cd7ee8acSart  *    notice, this list of conditions and the following disclaimer.
13cd7ee8acSart  * 2. Redistributions in binary form must reproduce the above copyright
14cd7ee8acSart  *    notice, this list of conditions and the following disclaimer in the
15cd7ee8acSart  *    documentation and/or other materials provided with the distribution.
16cd7ee8acSart  *
17cd7ee8acSart  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18cd7ee8acSart  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19cd7ee8acSart  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20cd7ee8acSart  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21cd7ee8acSart  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22cd7ee8acSart  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23cd7ee8acSart  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24cd7ee8acSart  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25cd7ee8acSart  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26cd7ee8acSart  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27cd7ee8acSart  *
28cd7ee8acSart  * from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp
29cd7ee8acSart  */
30cd7ee8acSart 
31cd7ee8acSart /*
32cd7ee8acSart  * uvm_fault.c: fault handler
33cd7ee8acSart  */
34cd7ee8acSart 
35cd7ee8acSart #include <sys/param.h>
36cd7ee8acSart #include <sys/systm.h>
37cd7ee8acSart #include <sys/kernel.h>
38627a59d1Smpi #include <sys/percpu.h>
39cd7ee8acSart #include <sys/proc.h>
40cd7ee8acSart #include <sys/malloc.h>
41cd7ee8acSart #include <sys/mman.h>
42c28df561Smpi #include <sys/tracepoint.h>
43cd7ee8acSart 
44cd7ee8acSart #include <uvm/uvm.h>
45cd7ee8acSart 
46cd7ee8acSart /*
47cd7ee8acSart  *
48cd7ee8acSart  * a word on page faults:
49cd7ee8acSart  *
50cd7ee8acSart  * types of page faults we handle:
51cd7ee8acSart  *
52cd7ee8acSart  * CASE 1: upper layer faults                   CASE 2: lower layer faults
53cd7ee8acSart  *
54cd7ee8acSart  *    CASE 1A         CASE 1B                  CASE 2A        CASE 2B
55cd7ee8acSart  *    read/write1     write>1                  read/write   +-cow_write/zero
56cd7ee8acSart  *         |             |                         |        |
57cd7ee8acSart  *      +--|--+       +--|--+     +-----+       +  |  +     | +-----+
582ed91a58Smpi  * amap |  V  |       |  ---------> new |          |        | |  ^  |
59cd7ee8acSart  *      +-----+       +-----+     +-----+       +  |  +     | +--|--+
60cd7ee8acSart  *                                                 |        |    |
61cd7ee8acSart  *      +-----+       +-----+                   +--|--+     | +--|--+
622ed91a58Smpi  * uobj | d/c |       | d/c |                   |  V  |     +----+  |
63cd7ee8acSart  *      +-----+       +-----+                   +-----+       +-----+
64cd7ee8acSart  *
65cd7ee8acSart  * d/c = don't care
66cd7ee8acSart  *
67cd7ee8acSart  *   case [0]: layerless fault
68cd7ee8acSart  *	no amap or uobj is present.   this is an error.
69cd7ee8acSart  *
70cd7ee8acSart  *   case [1]: upper layer fault [anon active]
71cd7ee8acSart  *     1A: [read] or [write with anon->an_ref == 1]
722ed91a58Smpi  *		I/O takes place in upper level anon and uobj is not touched.
73cd7ee8acSart  *     1B: [write with anon->an_ref > 1]
74cd7ee8acSart  *		new anon is alloc'd and data is copied off ["COW"]
75cd7ee8acSart  *
76cd7ee8acSart  *   case [2]: lower layer fault [uobj]
77cd7ee8acSart  *     2A: [read on non-NULL uobj] or [write to non-copy_on_write area]
78cd7ee8acSart  *		I/O takes place directly in object.
79cd7ee8acSart  *     2B: [write to copy_on_write] or [read on NULL uobj]
80cd7ee8acSart  *		data is "promoted" from uobj to a new anon.
81cd7ee8acSart  *		if uobj is null, then we zero fill.
82cd7ee8acSart  *
83cd7ee8acSart  * we follow the standard UVM locking protocol ordering:
84cd7ee8acSart  *
85cd7ee8acSart  * MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ)
86cd7ee8acSart  * we hold a PG_BUSY page if we unlock for I/O
87cd7ee8acSart  *
88cd7ee8acSart  *
89cd7ee8acSart  * the code is structured as follows:
90cd7ee8acSart  *
91cd7ee8acSart  *     - init the "IN" params in the ufi structure
922ed91a58Smpi  *   ReFault: (ERESTART returned to the loop in uvm_fault)
93cd7ee8acSart  *     - do lookups [locks maps], check protection, handle needs_copy
94cd7ee8acSart  *     - check for case 0 fault (error)
95cd7ee8acSart  *     - establish "range" of fault
96cd7ee8acSart  *     - if we have an amap lock it and extract the anons
97cd7ee8acSart  *     - if sequential advice deactivate pages behind us
98cd7ee8acSart  *     - at the same time check pmap for unmapped areas and anon for pages
99cd7ee8acSart  *	 that we could map in (and do map them if found)
100cd7ee8acSart  *     - check object for resident pages that we could map in
101cd7ee8acSart  *     - if (case 2) goto Case2
102cd7ee8acSart  *     - >>> handle case 1
103cd7ee8acSart  *           - ensure source anon is resident in RAM
104cd7ee8acSart  *           - if case 1B alloc new anon and copy from source
105cd7ee8acSart  *           - map the correct page in
106cd7ee8acSart  *   Case2:
107cd7ee8acSart  *     - >>> handle case 2
108cd7ee8acSart  *           - ensure source page is resident (if uobj)
109cd7ee8acSart  *           - if case 2B alloc new anon and copy from source (could be zero
110cd7ee8acSart  *		fill if uobj == NULL)
111cd7ee8acSart  *           - map the correct page in
112cd7ee8acSart  *     - done!
113cd7ee8acSart  *
114cd7ee8acSart  * note on paging:
115cd7ee8acSart  *   if we have to do I/O we place a PG_BUSY page in the correct object,
116cd7ee8acSart  * unlock everything, and do the I/O.   when I/O is done we must reverify
117cd7ee8acSart  * the state of the world before assuming that our data structures are
118cd7ee8acSart  * valid.   [because mappings could change while the map is unlocked]
119cd7ee8acSart  *
120cd7ee8acSart  *  alternative 1: unbusy the page in question and restart the page fault
121cd7ee8acSart  *    from the top (ReFault).   this is easy but does not take advantage
122cd7ee8acSart  *    of the information that we already have from our previous lookup,
123cd7ee8acSart  *    although it is possible that the "hints" in the vm_map will help here.
124cd7ee8acSart  *
125cd7ee8acSart  * alternative 2: the system already keeps track of a "version" number of
126cd7ee8acSart  *    a map.   [i.e. every time you write-lock a map (e.g. to change a
127cd7ee8acSart  *    mapping) you bump the version number up by one...]   so, we can save
128cd7ee8acSart  *    the version number of the map before we release the lock and start I/O.
129cd7ee8acSart  *    then when I/O is done we can relock and check the version numbers
130cd7ee8acSart  *    to see if anything changed.    this might save us some work over
131cd7ee8acSart  *    alternative 1 because we don't have to unbusy the page and may need fewer compares.
132cd7ee8acSart  *
133cd7ee8acSart  * alternative 3: put in backpointers or a way to "hold" part of a map
134cd7ee8acSart  *    in place while I/O is in progress.   this could be complex to
135cd7ee8acSart  *    implement (especially with structures like amap that can be referenced
136cd7ee8acSart  *    by multiple map entries, and figuring out what should wait could be
137cd7ee8acSart  *    complex as well...).
138cd7ee8acSart  *
1392ed91a58Smpi  * we use alternative 2.  given that we are multi-threaded now we may want
1402ed91a58Smpi  * to reconsider the choice.
141cd7ee8acSart  */
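
/*
 * note on alternative 2: in UVM the "version" is the vm_map's timestamp.
 * uvmfault_lookup() saves it in ufi->mapv and uvmfault_relock() (defined
 * later in this file) compares it with the map's current timestamp after
 * the I/O; a mismatch forces a ReFault.
 */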
142cd7ee8acSart 
143cd7ee8acSart /*
144cd7ee8acSart  * local data structures
145cd7ee8acSart  */
146cd7ee8acSart struct uvm_advice {
147cd7ee8acSart 	int nback;
148cd7ee8acSart 	int nforw;
149cd7ee8acSart };
150cd7ee8acSart 
151cd7ee8acSart /*
1521c7ad6bdSmiod  * page range array: set up in uvmfault_init().
153cd7ee8acSart  */
154e087cc70Sguenther static struct uvm_advice uvmadvice[MADV_MASK + 1];
155cd7ee8acSart 
156cd7ee8acSart #define UVM_MAXRANGE 16	/* must be max() of nback+nforw+1 */
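
/*
 * illustration (assuming 4KB pages): uvmfault_init() below gives
 * MADV_NORMAL a 16KB window (nback = 3, nforw = 4, 8 pages total) and
 * MADV_SEQUENTIAL a 32KB window (nback = 8, nforw = 7, 16 pages total),
 * hence UVM_MAXRANGE must be at least 16.
 */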
157cd7ee8acSart 
158cd7ee8acSart /*
159cd7ee8acSart  * private prototypes
160cd7ee8acSart  */
1611e3e475dSoga static void uvmfault_amapcopy(struct uvm_faultinfo *);
16256b7e380Smpi static inline void uvmfault_anonflush(struct vm_anon **, int);
163ca5c6958Soga void	uvmfault_unlockmaps(struct uvm_faultinfo *, boolean_t);
1640372dd1aSariane void	uvmfault_update_stats(struct uvm_faultinfo *);
165cd7ee8acSart 
166cd7ee8acSart /*
167cd7ee8acSart  * inline functions
168cd7ee8acSart  */
169cd7ee8acSart /*
170cd7ee8acSart  * uvmfault_anonflush: try and deactivate pages in specified anons
171cd7ee8acSart  *
172cd7ee8acSart  * => does not have to deactivate page if it is busy
173cd7ee8acSart  */
17456b7e380Smpi static inline void
1752023d591Soga uvmfault_anonflush(struct vm_anon **anons, int n)
176cd7ee8acSart {
177cd7ee8acSart 	int lcv;
178cd7ee8acSart 	struct vm_page *pg;
179cd7ee8acSart 
180cd7ee8acSart 	for (lcv = 0; lcv < n; lcv++) {
181cd7ee8acSart 		if (anons[lcv] == NULL)
182cd7ee8acSart 			continue;
18319dcab73Smpi 		KASSERT(rw_lock_held(anons[lcv]->an_lock));
1848d0b5bafSpedro 		pg = anons[lcv]->an_page;
1856f909936Svisa 		if (pg && (pg->pg_flags & PG_BUSY) == 0) {
186cd7ee8acSart 			uvm_lock_pageq();
187cd7ee8acSart 			if (pg->wire_count == 0) {
188cd7ee8acSart 				uvm_pagedeactivate(pg);
189cd7ee8acSart 			}
190cd7ee8acSart 			uvm_unlock_pageq();
191cd7ee8acSart 		}
192cd7ee8acSart 	}
193cd7ee8acSart }
194cd7ee8acSart 
195cd7ee8acSart /*
196cd7ee8acSart  * normal functions
197cd7ee8acSart  */
198cd7ee8acSart /*
1991c7ad6bdSmiod  * uvmfault_init: compute proper values for the uvmadvice[] array.
2001c7ad6bdSmiod  */
2011c7ad6bdSmiod void
202c799dc6dSnaddy uvmfault_init(void)
2031c7ad6bdSmiod {
2041c7ad6bdSmiod 	int npages;
2051c7ad6bdSmiod 
2061c7ad6bdSmiod 	npages = atop(16384);
2071c7ad6bdSmiod 	if (npages > 0) {
2081c7ad6bdSmiod 		KASSERT(npages <= UVM_MAXRANGE / 2);
20915cd8707Sguenther 		uvmadvice[MADV_NORMAL].nforw = npages;
21015cd8707Sguenther 		uvmadvice[MADV_NORMAL].nback = npages - 1;
2111c7ad6bdSmiod 	}
2121c7ad6bdSmiod 
2131c7ad6bdSmiod 	npages = atop(32768);
2141c7ad6bdSmiod 	if (npages > 0) {
2151c7ad6bdSmiod 		KASSERT(npages <= UVM_MAXRANGE / 2);
21615cd8707Sguenther 		uvmadvice[MADV_SEQUENTIAL].nforw = npages - 1;
21715cd8707Sguenther 		uvmadvice[MADV_SEQUENTIAL].nback = npages;
2181c7ad6bdSmiod 	}
2191c7ad6bdSmiod }
2201c7ad6bdSmiod 
2211c7ad6bdSmiod /*
222cd7ee8acSart  * uvmfault_amapcopy: clear "needs_copy" in a map.
223cd7ee8acSart  *
2242ed91a58Smpi  * => called with VM data structures unlocked (usually, see below)
2252ed91a58Smpi  * => we get a write lock on the maps and clear needs_copy for a VA
226cd7ee8acSart  * => if we are out of RAM we sleep (waiting for more)
227cd7ee8acSart  */
2281e3e475dSoga static void
2292023d591Soga uvmfault_amapcopy(struct uvm_faultinfo *ufi)
230cd7ee8acSart {
2312ed91a58Smpi 	for (;;) {
23252887a38Smpi 		/*
23352887a38Smpi 		 * no mapping?  give up.
23452887a38Smpi 		 */
235cd7ee8acSart 		if (uvmfault_lookup(ufi, TRUE) == FALSE)
236cd7ee8acSart 			return;
237cd7ee8acSart 
23852887a38Smpi 		/*
23952887a38Smpi 		 * copy if needed.
24052887a38Smpi 		 */
241cd7ee8acSart 		if (UVM_ET_ISNEEDSCOPY(ufi->entry))
242003f5e42Sderaadt 			amap_copy(ufi->map, ufi->entry, M_NOWAIT,
243003f5e42Sderaadt 				UVM_ET_ISSTACK(ufi->entry) ? FALSE : TRUE,
244cd7ee8acSart 				ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
245cd7ee8acSart 
24652887a38Smpi 		/*
24752887a38Smpi 		 * didn't work?  must be out of RAM.   unlock and sleep.
24852887a38Smpi 		 */
249cd7ee8acSart 		if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
250cd7ee8acSart 			uvmfault_unlockmaps(ufi, TRUE);
251cd7ee8acSart 			uvm_wait("fltamapcopy");
252cd7ee8acSart 			continue;
253cd7ee8acSart 		}
254cd7ee8acSart 
25552887a38Smpi 		/*
25652887a38Smpi 		 * got it!   unlock and return.
25752887a38Smpi 		 */
258cd7ee8acSart 		uvmfault_unlockmaps(ufi, TRUE);
259cd7ee8acSart 		return;
260cd7ee8acSart 	}
261cd7ee8acSart 	/*NOTREACHED*/
262cd7ee8acSart }
263cd7ee8acSart 
264cd7ee8acSart /*
265cd7ee8acSart  * uvmfault_anonget: get data in an anon into a non-busy, non-released
266cd7ee8acSart  * page in that anon.
267cd7ee8acSart  *
2682ed91a58Smpi  * => Map, amap and thus anon should be locked by caller.
2692ed91a58Smpi  * => If we fail, we unlock everything and error is returned.
2702ed91a58Smpi  * => If we are successful, return with everything still locked.
2712ed91a58Smpi  * => We do not move the page on the queues [gets moved later].  If we
2722ed91a58Smpi  *    allocate a new page [we_own], it gets put on the queues.  Either way,
2732ed91a58Smpi  *    the result is that the page is on the queues at return time.
274cd7ee8acSart  */
27528fbabcfSart int
2762023d591Soga uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
2772023d591Soga     struct vm_anon *anon)
278cd7ee8acSart {
279cd7ee8acSart 	struct vm_page *pg;
2802ed91a58Smpi 	int error;
281cd7ee8acSart 
28219dcab73Smpi 	KASSERT(rw_lock_held(anon->an_lock));
28319dcab73Smpi 	KASSERT(anon->an_lock == amap->am_lock);
28419dcab73Smpi 
2852ed91a58Smpi 	/* Increment the counters. */
286627a59d1Smpi 	counters_inc(uvmexp_counters, flt_anget);
2872ed91a58Smpi 	if (anon->an_page) {
2888f15e6a4Sguenther 		curproc->p_ru.ru_minflt++;
2892ed91a58Smpi 	} else {
2908f15e6a4Sguenther 		curproc->p_ru.ru_majflt++;
2912ed91a58Smpi 	}
2922ed91a58Smpi 	error = 0;
293cd7ee8acSart 
2942ed91a58Smpi 	/*
2952ed91a58Smpi 	 * Loop until we get the anon data, or fail.
2962ed91a58Smpi 	 */
2972ed91a58Smpi 	for (;;) {
2982ed91a58Smpi 		boolean_t we_own, locked;
2992ed91a58Smpi 		/*
3002ed91a58Smpi 		 * Note: 'we_own' will become true if we set PG_BUSY on a page.
3012ed91a58Smpi 		 */
3022ed91a58Smpi 		we_own = FALSE;
3038d0b5bafSpedro 		pg = anon->an_page;
304cd7ee8acSart 
3052ed91a58Smpi 		/*
3062ed91a58Smpi 		 * Is page resident?  Make sure it is not busy/released.
3072ed91a58Smpi 		 */
308cd7ee8acSart 		if (pg) {
3096f909936Svisa 			KASSERT(pg->pg_flags & PQ_ANON);
3106f909936Svisa 			KASSERT(pg->uanon == anon);
3116f909936Svisa 
312cd7ee8acSart 			/*
3136f909936Svisa 			 * if the page is busy, we drop all the locks and
3146f909936Svisa 			 * try again.
315cd7ee8acSart 			 */
3166f909936Svisa 			if ((pg->pg_flags & (PG_BUSY|PG_RELEASED)) == 0)
31734e43087Smpi 				return 0;
318627a59d1Smpi 			counters_inc(uvmexp_counters, flt_pgwait);
319cd7ee8acSart 
320cd7ee8acSart 			/*
3212ed91a58Smpi 			 * The last unlock must be an atomic unlock and wait
3222ed91a58Smpi 			 * on the owner of page.
323cd7ee8acSart 			 */
324b83f5574Smpi 			KASSERT(pg->uobject == NULL);
32519dcab73Smpi 			uvmfault_unlockall(ufi, NULL, NULL);
326b83f5574Smpi 			uvm_pagewait(pg, anon->an_lock, "anonget");
327cd7ee8acSart 		} else {
3282ed91a58Smpi 			/*
3292ed91a58Smpi 			 * No page, therefore allocate one.
3302ed91a58Smpi 			 */
3318a42ed70Sart 			pg = uvm_pagealloc(NULL, 0, anon, 0);
3322ed91a58Smpi 			if (pg == NULL) {
3332ed91a58Smpi 				/* Out of memory.  Wait a little. */
334ec3489eeSmpi 				uvmfault_unlockall(ufi, amap, NULL);
335627a59d1Smpi 				counters_inc(uvmexp_counters, flt_noram);
336cd7ee8acSart 				uvm_wait("flt_noram1");
337cd7ee8acSart 			} else {
3382ed91a58Smpi 				/* PG_BUSY bit is set. */
339cd7ee8acSart 				we_own = TRUE;
340ec3489eeSmpi 				uvmfault_unlockall(ufi, amap, NULL);
341cd7ee8acSart 
342cd7ee8acSart 				/*
34352887a38Smpi 				 * Pass a PG_BUSY+PG_FAKE+PG_CLEAN page into
34452887a38Smpi 				 * the uvm_swap_get() function with all data
34552887a38Smpi 				 * structures unlocked.  Note that it is OK
34652887a38Smpi 				 * to read an_swslot here, because we hold
34752887a38Smpi 				 * PG_BUSY on the page.
348cd7ee8acSart 				 */
349627a59d1Smpi 				counters_inc(uvmexp_counters, pageins);
3502ed91a58Smpi 				error = uvm_swap_get(pg, anon->an_swslot,
351cd7ee8acSart 				    PGO_SYNCIO);
352cd7ee8acSart 
353cd7ee8acSart 				/*
3542ed91a58Smpi 				 * We clean up after the I/O below in the
3552ed91a58Smpi 				 * 'we_own' case.
356cd7ee8acSart 				 */
357cd7ee8acSart 			}
358cd7ee8acSart 		}
359cd7ee8acSart 
3602ed91a58Smpi 		/*
3612ed91a58Smpi 		 * Re-lock the map and anon.
3622ed91a58Smpi 		 */
363cd7ee8acSart 		locked = uvmfault_relock(ufi);
36419dcab73Smpi 		if (locked || we_own) {
36519dcab73Smpi 			rw_enter(anon->an_lock, RW_WRITE);
36619dcab73Smpi 		}
367cd7ee8acSart 
368cd7ee8acSart 		/*
3692ed91a58Smpi 		 * If we own the page (i.e. we set PG_BUSY), then we need
3702ed91a58Smpi 		 * to clean up after the I/O.  There are three cases to
371cd7ee8acSart 		 * consider:
3722ed91a58Smpi 		 *
3732ed91a58Smpi 		 * 1) Page was released during I/O: free anon and ReFault.
3742ed91a58Smpi 		 * 2) I/O not OK.  Free the page and cause the fault to fail.
3752ed91a58Smpi 		 * 3) I/O OK!  Activate the page and sync with the non-we_own
3762ed91a58Smpi 		 *    case (i.e. drop anon lock if not locked).
377cd7ee8acSart 		 */
378cd7ee8acSart 		if (we_own) {
3799662fca4Sart 			if (pg->pg_flags & PG_WANTED) {
3801e3afca1Ssmart 				wakeup(pg);
381cd7ee8acSart 			}
382cd7ee8acSart 
383cd7ee8acSart 			/*
384cd7ee8acSart 			 * if we were RELEASED during I/O, then our anon is
385cd7ee8acSart 			 * no longer part of an amap.   we need to free the
386cd7ee8acSart 			 * anon and try again.
387cd7ee8acSart 			 */
3889662fca4Sart 			if (pg->pg_flags & PG_RELEASED) {
38919dcab73Smpi 				KASSERT(anon->an_ref == 0);
39052887a38Smpi 				/*
39152887a38Smpi 				 * Released while we had unlocked amap.
39252887a38Smpi 				 */
393cd7ee8acSart 				if (locked)
39438b0cdf0Smpi 					uvmfault_unlockall(ufi, NULL, NULL);
39519dcab73Smpi 				uvm_anon_release(anon);	/* frees page for us */
396627a59d1Smpi 				counters_inc(uvmexp_counters, flt_pgrele);
39734e43087Smpi 				return ERESTART;	/* refault! */
398cd7ee8acSart 			}
399cd7ee8acSart 
4002ed91a58Smpi 			if (error != VM_PAGER_OK) {
4012ed91a58Smpi 				KASSERT(error != VM_PAGER_PEND);
4021414b0faSart 
403cd7ee8acSart 				/* remove page from anon */
4048d0b5bafSpedro 				anon->an_page = NULL;
405cd7ee8acSart 
406cd7ee8acSart 				/*
4072ed91a58Smpi 				 * Remove the swap slot from the anon and
4082ed91a58Smpi 				 * mark the anon as having no real slot.
4092ed91a58Smpi 				 * Do not free the swap slot, thus preventing
41028fbabcfSart 				 * it from being used again.
41128fbabcfSart 				 */
41228fbabcfSart 				uvm_swap_markbad(anon->an_swslot, 1);
41328fbabcfSart 				anon->an_swslot = SWSLOT_BAD;
41428fbabcfSart 
41528fbabcfSart 				/*
4162ed91a58Smpi 				 * Note: page was never !PG_BUSY, so it
4172ed91a58Smpi 				 * cannot be mapped and thus no need to
41852887a38Smpi 				 * pmap_page_protect() it.
419cd7ee8acSart 				 */
420cd7ee8acSart 				uvm_lock_pageq();
421cd7ee8acSart 				uvm_pagefree(pg);
422cd7ee8acSart 				uvm_unlock_pageq();
423cd7ee8acSart 
4242ed91a58Smpi 				if (locked) {
42538b0cdf0Smpi 					uvmfault_unlockall(ufi, NULL, NULL);
4262ed91a58Smpi 				}
42719dcab73Smpi 				rw_exit(anon->an_lock);
42834e43087Smpi 				/*
42934e43087Smpi 				 * An error occurred while trying to bring
43034e43087Smpi 				 * in the page -- this is the only error we
43134e43087Smpi 				 * return right now.
43234e43087Smpi 				 */
43334e43087Smpi 				return EACCES;	/* XXX */
434cd7ee8acSart 			}
435cd7ee8acSart 
436cd7ee8acSart 			/*
43752887a38Smpi 			 * We have successfully read the page, activate it.
438cd7ee8acSart 			 */
4392c7adcb7Sart 			pmap_clear_modify(pg);
440cd7ee8acSart 			uvm_lock_pageq();
441cd7ee8acSart 			uvm_pageactivate(pg);
442cd7ee8acSart 			uvm_unlock_pageq();
443f8cbc53aSmpi 			atomic_clearbits_int(&pg->pg_flags,
444f8cbc53aSmpi 			    PG_WANTED|PG_BUSY|PG_FAKE);
445f8cbc53aSmpi 			UVM_PAGE_OWN(pg, NULL);
446cd7ee8acSart 		}
447cd7ee8acSart 
4482ed91a58Smpi 		/*
4492ed91a58Smpi 		 * We were not able to re-lock the map - restart the fault.
4502ed91a58Smpi 		 */
45119dcab73Smpi 		if (!locked) {
45219dcab73Smpi 			if (we_own) {
45319dcab73Smpi 				rw_exit(anon->an_lock);
45419dcab73Smpi 			}
45534e43087Smpi 			return ERESTART;
45619dcab73Smpi 		}
457cd7ee8acSart 
4582ed91a58Smpi 		/*
4592ed91a58Smpi 		 * Verify that no one has touched the amap and moved
4602ed91a58Smpi 		 * the anon on us.
4612ed91a58Smpi 		 */
4622ed91a58Smpi 		if (ufi != NULL && amap_lookup(&ufi->entry->aref,
463cd7ee8acSart 		    ufi->orig_rvaddr - ufi->entry->start) != anon) {
464ec3489eeSmpi 			uvmfault_unlockall(ufi, amap, NULL);
46534e43087Smpi 			return ERESTART;
466cd7ee8acSart 		}
467cd7ee8acSart 
4682ed91a58Smpi 		/*
4692ed91a58Smpi 		 * Retry..
4702ed91a58Smpi 		 */
471627a59d1Smpi 		counters_inc(uvmexp_counters, flt_anretry);
472cd7ee8acSart 		continue;
473cd7ee8acSart 
4742ed91a58Smpi 	}
475cd7ee8acSart 	/*NOTREACHED*/
476cd7ee8acSart }
477cd7ee8acSart 
478cd7ee8acSart /*
479dceff774Smpi  * uvmfault_promote: promote data to a new anon.  used for 1B and 2B.
480dceff774Smpi  *
481dceff774Smpi  *	1. allocate an anon and a page.
482dceff774Smpi  *	2. fill its contents.
483dceff774Smpi  *
484dceff774Smpi  * => if we fail (result != 0) we unlock everything.
485dceff774Smpi  * => on success, return a new locked anon via 'nanon'.
486dceff774Smpi  * => it's the caller's responsibility to put the promoted nanon->an_page
487dceff774Smpi  *    on the page queue.
488dceff774Smpi  */
489dceff774Smpi int
490dceff774Smpi uvmfault_promote(struct uvm_faultinfo *ufi,
491dceff774Smpi     struct vm_page *uobjpage,
492dceff774Smpi     struct vm_anon **nanon, /* OUT: allocated anon */
493dceff774Smpi     struct vm_page **npg)
494dceff774Smpi {
495dceff774Smpi 	struct vm_amap *amap = ufi->entry->aref.ar_amap;
496e9d70b48Smpi 	struct uvm_object *uobj = NULL;
497dceff774Smpi 	struct vm_anon *anon;
498dceff774Smpi 	struct vm_page *pg = NULL;
499dceff774Smpi 
500e9d70b48Smpi 	if (uobjpage != PGO_DONTCARE)
501e9d70b48Smpi 		uobj = uobjpage->uobject;
502e9d70b48Smpi 
503e9d70b48Smpi 	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
504e9d70b48Smpi 
505dceff774Smpi 	anon = uvm_analloc();
506dceff774Smpi 	if (anon) {
507dceff774Smpi 		anon->an_lock = amap->am_lock;
508dceff774Smpi 		pg = uvm_pagealloc(NULL, 0, anon,
509dceff774Smpi 		    (uobjpage == PGO_DONTCARE) ? UVM_PGA_ZERO : 0);
510dceff774Smpi 	}
511dceff774Smpi 
512dceff774Smpi 	/* check for out of RAM */
513dceff774Smpi 	if (anon == NULL || pg == NULL) {
514e9d70b48Smpi 		uvmfault_unlockall(ufi, amap, uobj);
515dceff774Smpi 		if (anon == NULL)
516dceff774Smpi 			counters_inc(uvmexp_counters, flt_noanon);
517dceff774Smpi 		else {
518dceff774Smpi 			anon->an_lock = NULL;
519dceff774Smpi 			anon->an_ref--;
520dceff774Smpi 			uvm_anfree(anon);
521dceff774Smpi 			counters_inc(uvmexp_counters, flt_noram);
522dceff774Smpi 		}
523dceff774Smpi 
524dceff774Smpi 		if (uvm_swapisfull())
525dceff774Smpi 			return ENOMEM;
526dceff774Smpi 
527dceff774Smpi 		/* out of RAM, wait for more */
528dceff774Smpi 		if (anon == NULL)
529dceff774Smpi 			uvm_anwait();
530dceff774Smpi 		else
531dceff774Smpi 			uvm_wait("flt_noram3");
532dceff774Smpi 		return ERESTART;
533dceff774Smpi 	}
534dceff774Smpi 
535dceff774Smpi 	/*
536dceff774Smpi 	 * copy the page [pg now dirty]
537dceff774Smpi 	 */
538dceff774Smpi 	if (uobjpage != PGO_DONTCARE)
539dceff774Smpi 		uvm_pagecopy(uobjpage, pg);
540dceff774Smpi 
541dceff774Smpi 	*nanon = anon;
542dceff774Smpi 	*npg = pg;
543dceff774Smpi 	return 0;
544dceff774Smpi }
545dceff774Smpi 
546dceff774Smpi /*
5470372dd1aSariane  * Update statistics after fault resolution.
5480372dd1aSariane  * - maxrss
5490372dd1aSariane  */
5500372dd1aSariane void
5510372dd1aSariane uvmfault_update_stats(struct uvm_faultinfo *ufi)
5520372dd1aSariane {
5530372dd1aSariane 	struct vm_map		*map;
5540372dd1aSariane 	struct proc		*p;
5550372dd1aSariane 	vsize_t			 res;
5560372dd1aSariane 
5570372dd1aSariane 	map = ufi->orig_map;
5580372dd1aSariane 
559b9032da0Smlarkin 	/*
560b9032da0Smlarkin 	 * If this is a nested pmap (eg, a virtual machine pmap managed
561b9032da0Smlarkin 	 * by vmm(4) on amd64/i386), don't do any updating, just return.
562b9032da0Smlarkin 	 *
563b9032da0Smlarkin 	 * pmap_nested() on other archs is #defined to 0, so this is a
564b9032da0Smlarkin 	 * no-op.
565b9032da0Smlarkin 	 */
566b9032da0Smlarkin 	if (pmap_nested(map->pmap))
567b9032da0Smlarkin 		return;
568b9032da0Smlarkin 
56935164244Stedu 	/* Update the maxrss for the process. */
5700372dd1aSariane 	if (map->flags & VM_MAP_ISVMSPACE) {
5710372dd1aSariane 		p = curproc;
5720372dd1aSariane 		KASSERT(p != NULL && &p->p_vmspace->vm_map == map);
5730372dd1aSariane 
5740372dd1aSariane 		res = pmap_resident_count(map->pmap);
5750372dd1aSariane 		/* Convert res from pages to kilobytes. */
5760372dd1aSariane 		res <<= (PAGE_SHIFT - 10);
5770372dd1aSariane 
5780372dd1aSariane 		if (p->p_ru.ru_maxrss < res)
5790372dd1aSariane 			p->p_ru.ru_maxrss = res;
5800372dd1aSariane 	}
5810372dd1aSariane }
5820372dd1aSariane 
5832ed91a58Smpi /*
5842ed91a58Smpi  *   F A U L T   -   m a i n   e n t r y   p o i n t
5852ed91a58Smpi  */
5862ed91a58Smpi 
5872ed91a58Smpi /*
5882ed91a58Smpi  * uvm_fault: page fault handler
5892ed91a58Smpi  *
5902ed91a58Smpi  * => called from MD code to resolve a page fault
5912ed91a58Smpi  * => VM data structures usually should be unlocked.   however, it is
5922ed91a58Smpi  *	possible to call here with the main map locked if the caller
5932ed91a58Smpi  *	gets a write lock, sets it recursive, and then calls us (c.f.
5942ed91a58Smpi  *	uvm_map_pageable).   this should be avoided because it keeps
5952ed91a58Smpi  *	the map locked during the I/O.
5962ed91a58Smpi  * => MUST NEVER BE CALLED IN INTERRUPT CONTEXT
5972ed91a58Smpi  */
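
/*
 * illustrative sketch (not copied from any particular port): MD trap
 * handlers typically resolve a user fault roughly as follows, with the
 * faulting address and access type taken from the trap frame:
 *
 *	error = uvm_fault(&p->p_vmspace->vm_map, trunc_page(va), 0,
 *	    access_type);
 *	if (error != 0)
 *		(deliver SIGSEGV/SIGBUS to the process based on error)
 */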
5982ed91a58Smpi #define MASK(entry)     (UVM_ET_ISCOPYONWRITE(entry) ? \
5992ed91a58Smpi 			 ~PROT_WRITE : PROT_MASK)
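
/*
 * MASK(entry) strips PROT_WRITE from the access type used for the lower
 * (uobj) layer of copy-on-write entries: writes to such entries are
 * satisfied by promoting the data to an anon (case 2B), never by writing
 * to the object's page directly.
 */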
6007f5d8661Smpi struct uvm_faultctx {
6017f5d8661Smpi 	/*
6027f5d8661Smpi 	 * the following members are set up by uvm_fault_check() and
6037f5d8661Smpi 	 * read-only after that.
6047f5d8661Smpi 	 */
6057f5d8661Smpi 	vm_prot_t enter_prot;
606b004aefeSmpi 	vm_prot_t access_type;
6077f5d8661Smpi 	vaddr_t startva;
6087f5d8661Smpi 	int npages;
6097f5d8661Smpi 	int centeridx;
6107f5d8661Smpi 	boolean_t narrow;
6117f5d8661Smpi 	boolean_t wired;
6127f5d8661Smpi 	paddr_t pa_flags;
613cce913b9Smpi 	boolean_t promote;
614552563d5Smpi 	int lower_lock_type;
6157f5d8661Smpi };
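
/*
 * a single uvm_faultctx lives on the stack of uvm_fault() and is reused
 * across ReFault iterations; this is why "narrow", once set by
 * uvm_fault_check(), suppresses fault-ahead on subsequent passes.
 */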
6167f5d8661Smpi 
6172ed91a58Smpi int		uvm_fault_check(
6182ed91a58Smpi 		    struct uvm_faultinfo *, struct uvm_faultctx *,
619d6897f14Smpi 		    struct vm_anon ***, vm_fault_t);
6202ed91a58Smpi 
6212ed91a58Smpi int		uvm_fault_upper(
6222ed91a58Smpi 		    struct uvm_faultinfo *, struct uvm_faultctx *,
62358243cbfSmpi 		    struct vm_anon **);
6242ed91a58Smpi boolean_t	uvm_fault_upper_lookup(
6252ed91a58Smpi 		    struct uvm_faultinfo *, const struct uvm_faultctx *,
6262ed91a58Smpi 		    struct vm_anon **, struct vm_page **);
6272ed91a58Smpi 
6282ed91a58Smpi int		uvm_fault_lower(
6292ed91a58Smpi 		    struct uvm_faultinfo *, struct uvm_faultctx *,
63058243cbfSmpi 		    struct vm_page **);
631cce913b9Smpi int		uvm_fault_lower_io(
632cce913b9Smpi 		    struct uvm_faultinfo *, struct uvm_faultctx *,
633cce913b9Smpi 		    struct uvm_object **, struct vm_page **);
6343053940aSmpi 
6352ed91a58Smpi int
6362ed91a58Smpi uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
6372ed91a58Smpi     vm_prot_t access_type)
6382ed91a58Smpi {
6392ed91a58Smpi 	struct uvm_faultinfo ufi;
6402ed91a58Smpi 	struct uvm_faultctx flt;
6412ed91a58Smpi 	boolean_t shadowed;
6422ed91a58Smpi 	struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
6432ed91a58Smpi 	struct vm_page *pages[UVM_MAXRANGE];
6442ed91a58Smpi 	int error;
6452ed91a58Smpi 
6462ed91a58Smpi 	counters_inc(uvmexp_counters, faults);
6472ed91a58Smpi 	TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
6482ed91a58Smpi 
6492ed91a58Smpi 	/*
6502ed91a58Smpi 	 * init the IN parameters in the ufi
6512ed91a58Smpi 	 */
6522ed91a58Smpi 	ufi.orig_map = orig_map;
6532ed91a58Smpi 	ufi.orig_rvaddr = trunc_page(vaddr);
6542ed91a58Smpi 	ufi.orig_size = PAGE_SIZE;	/* can't get any smaller than this */
6552ed91a58Smpi 	flt.access_type = access_type;
656d6897f14Smpi 	flt.narrow = FALSE;		/* assume normal fault for now */
657d6897f14Smpi 	flt.wired = FALSE;		/* assume non-wired fault for now */
658552563d5Smpi 	flt.lower_lock_type = RW_WRITE;	/* exclusive lock for now */
6592ed91a58Smpi 
6602ed91a58Smpi 	error = ERESTART;
6612ed91a58Smpi 	while (error == ERESTART) { /* ReFault: */
6622ed91a58Smpi 		anons = anons_store;
6632ed91a58Smpi 
664d6897f14Smpi 		error = uvm_fault_check(&ufi, &flt, &anons, fault_type);
6652ed91a58Smpi 		if (error != 0)
6662ed91a58Smpi 			continue;
6672ed91a58Smpi 
6682ed91a58Smpi 		/* True if there is an anon at the faulting address */
6692ed91a58Smpi 		shadowed = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
6702ed91a58Smpi 		if (shadowed == TRUE) {
6712ed91a58Smpi 			/* case 1: fault on an anon in our amap */
67258243cbfSmpi 			error = uvm_fault_upper(&ufi, &flt, anons);
6732ed91a58Smpi 		} else {
6744bb42341Smpi 			struct uvm_object *uobj = ufi.entry->object.uvm_obj;
6754bb42341Smpi 
6764bb42341Smpi 			/*
6774bb42341Smpi 			 * if the desired page is not shadowed by the amap and
6784bb42341Smpi 			 * we have a backing object, then we check to see if
6794bb42341Smpi 			 * the backing object would prefer to handle the fault
6804bb42341Smpi 			 * itself (rather than letting us do it with the usual
6814bb42341Smpi 			 * pgo_get hook).  the backing object signals this by
6824bb42341Smpi 			 * providing a pgo_fault routine.
6834bb42341Smpi 			 */
6844bb42341Smpi 			if (uobj != NULL && uobj->pgops->pgo_fault != NULL) {
68569c04514Smpi 				rw_enter(uobj->vmobjlock, RW_WRITE);
686f46a341eSmpi 				KERNEL_LOCK();
6874bb42341Smpi 				error = uobj->pgops->pgo_fault(&ufi,
6884bb42341Smpi 				    flt.startva, pages, flt.npages,
6894bb42341Smpi 				    flt.centeridx, fault_type, flt.access_type,
6904bb42341Smpi 				    PGO_LOCKED);
6912ed91a58Smpi 				KERNEL_UNLOCK();
6924bb42341Smpi 			} else {
6934bb42341Smpi 				/* case 2: fault on backing obj or zero fill */
69458243cbfSmpi 				error = uvm_fault_lower(&ufi, &flt, pages);
6954bb42341Smpi 			}
6962ed91a58Smpi 		}
6972ed91a58Smpi 	}
6982ed91a58Smpi 
6992ed91a58Smpi 	return error;
7002ed91a58Smpi }
7012ed91a58Smpi 
7027f5d8661Smpi /*
7037f5d8661Smpi  * uvm_fault_check: check prot, handle needs-copy, etc.
7047f5d8661Smpi  *
7057f5d8661Smpi  *	1. lookup entry.
7067f5d8661Smpi  *	2. check protection.
7077f5d8661Smpi  *	3. adjust fault condition (mainly for simulated fault).
7087f5d8661Smpi  *	4. handle needs-copy (lazy amap copy).
7097f5d8661Smpi  *	5. establish range of interest for neighbor fault (aka pre-fault).
7107f5d8661Smpi  *	6. look up anons (if amap exists).
7117f5d8661Smpi  *	7. flush pages (if MADV_SEQUENTIAL)
7127f5d8661Smpi  *
7137f5d8661Smpi  * => called with nothing locked.
7147f5d8661Smpi  * => if we fail (result != 0) we unlock everything.
7157f5d8661Smpi  * => initialize/adjust many members of flt.
7167f5d8661Smpi  */
7177f5d8661Smpi int
7187f5d8661Smpi uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
719d6897f14Smpi     struct vm_anon ***ranons, vm_fault_t fault_type)
7207f5d8661Smpi {
7217f5d8661Smpi 	struct vm_amap *amap;
7227f5d8661Smpi 	struct uvm_object *uobj;
7237f5d8661Smpi 	int nback, nforw;
7247f5d8661Smpi 
7252ed91a58Smpi 	/*
7262ed91a58Smpi 	 * lookup and lock the maps
7272ed91a58Smpi 	 */
7287f5d8661Smpi 	if (uvmfault_lookup(ufi, FALSE) == FALSE) {
7292ed91a58Smpi 		return EFAULT;
7307f5d8661Smpi 	}
7312ed91a58Smpi 	/* locked: maps(read) */
7327f5d8661Smpi 
7337f5d8661Smpi #ifdef DIAGNOSTIC
7347f5d8661Smpi 	if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0)
7357f5d8661Smpi 		panic("uvm_fault: fault on non-pageable map (%p, 0x%lx)",
7367f5d8661Smpi 		    ufi->map, ufi->orig_rvaddr);
7377f5d8661Smpi #endif
7387f5d8661Smpi 
7392ed91a58Smpi 	/*
7402ed91a58Smpi 	 * check protection
7412ed91a58Smpi 	 */
742b004aefeSmpi 	if ((ufi->entry->protection & flt->access_type) != flt->access_type) {
7437f5d8661Smpi 		uvmfault_unlockmaps(ufi, FALSE);
7442ed91a58Smpi 		return EACCES;
7457f5d8661Smpi 	}
7467f5d8661Smpi 
7477f5d8661Smpi 	/*
7487f5d8661Smpi 	 * "enter_prot" is the protection we want to enter the page in at.
7497f5d8661Smpi 	 * for certain pages (e.g. copy-on-write pages) this protection can
7507f5d8661Smpi 	 * be more strict than ufi->entry->protection.  "wired" means either
7517f5d8661Smpi 	 * the entry is wired or we are fault-wiring the pg.
7527f5d8661Smpi 	 */
7537f5d8661Smpi 	flt->enter_prot = ufi->entry->protection;
7547f5d8661Smpi 	flt->pa_flags = UVM_ET_ISWC(ufi->entry) ? PMAP_WC : 0;
755d6897f14Smpi 	if (VM_MAPENT_ISWIRED(ufi->entry) || (fault_type == VM_FAULT_WIRE)) {
756d6897f14Smpi 		flt->wired = TRUE;
757b004aefeSmpi 		flt->access_type = flt->enter_prot; /* full access for wired */
758d6897f14Smpi 		/*  don't look for neighborhood * pages on "wire" fault */
759d6897f14Smpi 		/* don't look for neighborhood pages on "wire" fault */
760d6897f14Smpi 	}
7617f5d8661Smpi 
7627f5d8661Smpi 	/* handle "needs_copy" case. */
7637f5d8661Smpi 	if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
764b004aefeSmpi 		if ((flt->access_type & PROT_WRITE) ||
7657f5d8661Smpi 		    (ufi->entry->object.uvm_obj == NULL)) {
7667f5d8661Smpi 			/* need to clear */
7677f5d8661Smpi 			uvmfault_unlockmaps(ufi, FALSE);
7687f5d8661Smpi 			uvmfault_amapcopy(ufi);
769627a59d1Smpi 			counters_inc(uvmexp_counters, flt_amcopy);
7702ed91a58Smpi 			return ERESTART;
7717f5d8661Smpi 		} else {
7727f5d8661Smpi 			/*
7737f5d8661Smpi 			 * ensure that we pmap_enter page R/O since
7747f5d8661Smpi 			 * needs_copy is still true
7757f5d8661Smpi 			 */
7767f5d8661Smpi 			flt->enter_prot &= ~PROT_WRITE;
7777f5d8661Smpi 		}
7787f5d8661Smpi 	}
7797f5d8661Smpi 
7802ed91a58Smpi 	/*
7812ed91a58Smpi 	 * identify the players
7822ed91a58Smpi 	 */
7832ed91a58Smpi 	amap = ufi->entry->aref.ar_amap;	/* upper layer */
7842ed91a58Smpi 	uobj = ufi->entry->object.uvm_obj;	/* lower layer */
7857f5d8661Smpi 
7867f5d8661Smpi 	/*
7877f5d8661Smpi 	 * check for a case 0 fault.  if nothing backing the entry then
7887f5d8661Smpi 	 * error now.
7897f5d8661Smpi 	 */
7907f5d8661Smpi 	if (amap == NULL && uobj == NULL) {
7917f5d8661Smpi 		uvmfault_unlockmaps(ufi, FALSE);
7922ed91a58Smpi 		return EFAULT;
7937f5d8661Smpi 	}
7947f5d8661Smpi 
7957f5d8661Smpi 	/*
7963b9e4e18Smpi 	 * for a case 2B fault waste no time on adjacent pages because
7973b9e4e18Smpi 	 * they are likely already entered.
7983b9e4e18Smpi 	 */
7993b9e4e18Smpi 	if (uobj != NULL && amap != NULL &&
8003b9e4e18Smpi 	    (flt->access_type & PROT_WRITE) != 0) {
8013b9e4e18Smpi 		/* wide fault (!narrow) */
8023b9e4e18Smpi 		flt->narrow = TRUE;
8033b9e4e18Smpi 	}
8043b9e4e18Smpi 
8053b9e4e18Smpi 	/*
8067f5d8661Smpi 	 * establish range of interest based on advice from mapper
8077f5d8661Smpi 	 * and then clip to fit map entry.   note that we only want
8087f5d8661Smpi 	 * to do this the first time through the fault.   if we
8097f5d8661Smpi 	 * ReFault we will disable this by setting "narrow" to true.
8107f5d8661Smpi 	 */
8117f5d8661Smpi 	if (flt->narrow == FALSE) {
8127f5d8661Smpi 
8137f5d8661Smpi 		/* wide fault (!narrow) */
8147f5d8661Smpi 		nback = min(uvmadvice[ufi->entry->advice].nback,
8157f5d8661Smpi 		    (ufi->orig_rvaddr - ufi->entry->start) >> PAGE_SHIFT);
8167f5d8661Smpi 		flt->startva = ufi->orig_rvaddr - ((vsize_t)nback << PAGE_SHIFT);
8177f5d8661Smpi 		nforw = min(uvmadvice[ufi->entry->advice].nforw,
8187f5d8661Smpi 		    ((ufi->entry->end - ufi->orig_rvaddr) >> PAGE_SHIFT) - 1);
8197f5d8661Smpi 		/*
8207f5d8661Smpi 		 * note: "-1" because we don't want to count the
8217f5d8661Smpi 		 * faulting page as forw
8227f5d8661Smpi 		 */
8237f5d8661Smpi 		flt->npages = nback + nforw + 1;
8247f5d8661Smpi 		flt->centeridx = nback;
8257f5d8661Smpi 
8267f5d8661Smpi 		flt->narrow = TRUE;	/* ensure only once per-fault */
8277f5d8661Smpi 	} else {
8287f5d8661Smpi 		/* narrow fault! */
8297f5d8661Smpi 		nback = nforw = 0;
8307f5d8661Smpi 		flt->startva = ufi->orig_rvaddr;
8317f5d8661Smpi 		flt->npages = 1;
8327f5d8661Smpi 		flt->centeridx = 0;
8337f5d8661Smpi 	}
8347f5d8661Smpi 
8352ed91a58Smpi 	/*
8362ed91a58Smpi 	 * if we've got an amap then lock it and extract current anons.
8372ed91a58Smpi 	 */
8387f5d8661Smpi 	if (amap) {
839335383c9Smpi 		amap_lock(amap, RW_WRITE);
8407f5d8661Smpi 		amap_lookups(&ufi->entry->aref,
8417f5d8661Smpi 		    flt->startva - ufi->entry->start, *ranons, flt->npages);
8427f5d8661Smpi 	} else {
8437f5d8661Smpi 		*ranons = NULL;	/* to be safe */
8447f5d8661Smpi 	}
8457f5d8661Smpi 
8467f5d8661Smpi 	/*
8477f5d8661Smpi 	 * for MADV_SEQUENTIAL mappings we want to deactivate the back pages
8487f5d8661Smpi 	 * now and then forget about them (for the rest of the fault).
8497f5d8661Smpi 	 */
8507f5d8661Smpi 	if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
8517f5d8661Smpi 		/* flush back-page anons? */
8527f5d8661Smpi 		if (amap)
8537f5d8661Smpi 			uvmfault_anonflush(*ranons, nback);
8547f5d8661Smpi 
85552887a38Smpi 		/*
85652887a38Smpi 		 * flush object?
85752887a38Smpi 		 */
8587f5d8661Smpi 		if (uobj) {
8597f5d8661Smpi 			voff_t uoff;
8607f5d8661Smpi 
8617f5d8661Smpi 			uoff = (flt->startva - ufi->entry->start) + ufi->entry->offset;
86269c04514Smpi 			rw_enter(uobj->vmobjlock, RW_WRITE);
8637f5d8661Smpi 			(void) uobj->pgops->pgo_flush(uobj, uoff, uoff +
8647f5d8661Smpi 			    ((vsize_t)nback << PAGE_SHIFT), PGO_DEACTIVATE);
86569c04514Smpi 			rw_exit(uobj->vmobjlock);
8667f5d8661Smpi 		}
8677f5d8661Smpi 
8687f5d8661Smpi 		/* now forget about the backpages */
8697f5d8661Smpi 		if (amap)
8707f5d8661Smpi 			*ranons += nback;
8717f5d8661Smpi 		flt->startva += ((vsize_t)nback << PAGE_SHIFT);
8727f5d8661Smpi 		flt->npages -= nback;
8737f5d8661Smpi 		flt->centeridx = 0;
8747f5d8661Smpi 	}
8757f5d8661Smpi 
8767f5d8661Smpi 	return 0;
8777f5d8661Smpi }
8787f5d8661Smpi 
8790372dd1aSariane /*
8802ed91a58Smpi  * uvm_fault_upper_lookup: look up existing h/w mapping and amap.
8816d51fca8Smpi  *
8822ed91a58Smpi  * iterate range of interest:
8832ed91a58Smpi  *	1. check if h/w mapping exists.  if yes, we don't care
8842ed91a58Smpi  *	2. check if anon exists.  if not, page is lower.
8852ed91a58Smpi  *	3. if anon exists, enter h/w mapping for neighbors.
8862ed91a58Smpi  *
8872ed91a58Smpi  * => called with amap locked (if exists).
8882ed91a58Smpi  */
8892ed91a58Smpi boolean_t
8902ed91a58Smpi uvm_fault_upper_lookup(struct uvm_faultinfo *ufi,
8912ed91a58Smpi     const struct uvm_faultctx *flt, struct vm_anon **anons,
8922ed91a58Smpi     struct vm_page **pages)
8932ed91a58Smpi {
8942ed91a58Smpi 	struct vm_amap *amap = ufi->entry->aref.ar_amap;
8952ed91a58Smpi 	struct vm_anon *anon;
896a52f395cSmpi 	struct vm_page *pg;
8972ed91a58Smpi 	boolean_t shadowed;
8982ed91a58Smpi 	vaddr_t currva;
8992ed91a58Smpi 	paddr_t pa;
900a52f395cSmpi 	int lcv, entered = 0;
9012ed91a58Smpi 
9022ed91a58Smpi 	/* locked: maps(read), amap(if there) */
9032ed91a58Smpi 	KASSERT(amap == NULL ||
9042ed91a58Smpi 	    rw_write_held(amap->am_lock));
9052ed91a58Smpi 
9062ed91a58Smpi 	/*
9072ed91a58Smpi 	 * map in the backpages and frontpages we found in the amap in hopes
9082ed91a58Smpi 	 * of preventing future faults.    we also init the pages[] array as
9092ed91a58Smpi 	 * we go.
9102ed91a58Smpi 	 */
9112ed91a58Smpi 	currva = flt->startva;
9122ed91a58Smpi 	shadowed = FALSE;
9132ed91a58Smpi 	for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
9142ed91a58Smpi 		/*
9152ed91a58Smpi 		 * unmapped or center page.   check if any anon at this level.
9162ed91a58Smpi 		 */
9172ed91a58Smpi 		if (amap == NULL || anons[lcv] == NULL) {
9182ed91a58Smpi 			pages[lcv] = NULL;
9192ed91a58Smpi 			continue;
9202ed91a58Smpi 		}
9212ed91a58Smpi 
9222ed91a58Smpi 		/*
9232ed91a58Smpi 		 * check for present page and map if possible.
9242ed91a58Smpi 		 */
9252ed91a58Smpi 		pages[lcv] = PGO_DONTCARE;
9262ed91a58Smpi 		if (lcv == flt->centeridx) {	/* save center for later! */
9272ed91a58Smpi 			shadowed = TRUE;
9282ed91a58Smpi 			continue;
9292ed91a58Smpi 		}
930a52f395cSmpi 
9312ed91a58Smpi 		anon = anons[lcv];
932a52f395cSmpi 		pg = anon->an_page;
933a52f395cSmpi 
9342ed91a58Smpi 		KASSERT(anon->an_lock == amap->am_lock);
935a52f395cSmpi 
936a52f395cSmpi 		/*
937a52f395cSmpi 		 * ignore busy pages.
938a52f395cSmpi 		 * don't play with VAs that are already mapped.
939a52f395cSmpi 		 */
940a52f395cSmpi 		if (pg && (pg->pg_flags & (PG_RELEASED|PG_BUSY)) == 0 &&
941a52f395cSmpi 		    !pmap_extract(ufi->orig_map->pmap, currva, &pa)) {
9422ed91a58Smpi 			uvm_lock_pageq();
943a52f395cSmpi 			uvm_pageactivate(pg);	/* reactivate */
9442ed91a58Smpi 			uvm_unlock_pageq();
9452ed91a58Smpi 			counters_inc(uvmexp_counters, flt_namap);
9462ed91a58Smpi 
9478a233859Smpi 			/* No fault-ahead when wired. */
9488a233859Smpi 			KASSERT(flt->wired == FALSE);
9498a233859Smpi 
9502ed91a58Smpi 			/*
9512ed91a58Smpi 			 * Since this isn't the page that's actually faulting,
9522ed91a58Smpi 			 * ignore pmap_enter() failures; it's not critical
9532ed91a58Smpi 			 * that we enter these right now.
9542ed91a58Smpi 			 */
9552ed91a58Smpi 			(void) pmap_enter(ufi->orig_map->pmap, currva,
956a52f395cSmpi 			    VM_PAGE_TO_PHYS(pg) | flt->pa_flags,
9572ed91a58Smpi 			    (anon->an_ref > 1) ?
9582ed91a58Smpi 			    (flt->enter_prot & ~PROT_WRITE) : flt->enter_prot,
9598a233859Smpi 			    PMAP_CANFAIL);
960a52f395cSmpi 			entered++;
9612ed91a58Smpi 		}
9622ed91a58Smpi 	}
963a52f395cSmpi 	if (entered > 0)
9642ed91a58Smpi 		pmap_update(ufi->orig_map->pmap);
9652ed91a58Smpi 
9662ed91a58Smpi 	return shadowed;
9672ed91a58Smpi }
9682ed91a58Smpi 
9692ed91a58Smpi /*
9702ed91a58Smpi  * uvm_fault_upper: handle upper fault.
9712ed91a58Smpi  *
9722ed91a58Smpi  *	1. acquire anon lock.
9732ed91a58Smpi  *	2. get anon.  let uvmfault_anonget do the dirty work.
9742ed91a58Smpi  *	3. if COW, promote data to new anon
9752ed91a58Smpi  *	4. enter h/w mapping
9766d51fca8Smpi  */
9776d51fca8Smpi int
9786d51fca8Smpi uvm_fault_upper(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
97958243cbfSmpi    struct vm_anon **anons)
9806d51fca8Smpi {
9816d51fca8Smpi 	struct vm_amap *amap = ufi->entry->aref.ar_amap;
9826d51fca8Smpi 	struct vm_anon *oanon, *anon = anons[flt->centeridx];
9836d51fca8Smpi 	struct vm_page *pg = NULL;
9846d51fca8Smpi 	int error, ret;
9856d51fca8Smpi 
98652887a38Smpi 	/* locked: maps(read), amap, anon */
98719dcab73Smpi 	KASSERT(rw_write_held(amap->am_lock));
98819dcab73Smpi 	KASSERT(anon->an_lock == amap->am_lock);
98919dcab73Smpi 
9906d51fca8Smpi 	/*
9916d51fca8Smpi 	 * no matter if we have case 1A or case 1B we are going to need to
9926d51fca8Smpi 	 * have the anon's memory resident.   ensure that now.
9936d51fca8Smpi 	 */
9946d51fca8Smpi 	/*
9956d51fca8Smpi 	 * let uvmfault_anonget do the dirty work.
9962ed91a58Smpi 	 * if it fails (!OK) it will unlock everything for us.
9972ed91a58Smpi 	 * if it succeeds, locks are still valid and locked.
9986d51fca8Smpi 	 * also, if it is OK, then the anon's page is on the queues.
9996d51fca8Smpi 	 */
10006d51fca8Smpi 	error = uvmfault_anonget(ufi, amap, anon);
10016d51fca8Smpi 	switch (error) {
100234e43087Smpi 	case 0:
10036d51fca8Smpi 		break;
10046d51fca8Smpi 
100534e43087Smpi 	case ERESTART:
10066d51fca8Smpi 		return ERESTART;
10076d51fca8Smpi 
10086d51fca8Smpi 	default:
100934e43087Smpi 		return error;
10106d51fca8Smpi 	}
10116d51fca8Smpi 
101219dcab73Smpi 	KASSERT(rw_write_held(amap->am_lock));
101319dcab73Smpi 	KASSERT(anon->an_lock == amap->am_lock);
101419dcab73Smpi 
10156d51fca8Smpi 	/*
10166d51fca8Smpi 	 * if we are case 1B then we will need to allocate a new blank
10176d51fca8Smpi 	 * anon to transfer the data into.   note that we have a lock
10186d51fca8Smpi 	 * on anon, so no one can busy or release the page until we are done.
10196d51fca8Smpi 	 * also note that the ref count can't drop to zero here because
10206d51fca8Smpi 	 * it is > 1 and we are only dropping one ref.
10216d51fca8Smpi 	 *
10226d51fca8Smpi 	 * in the (hopefully very rare) case that we are out of RAM we
10232ed91a58Smpi 	 * will unlock, wait for more RAM, and refault.
10246d51fca8Smpi 	 *
10256d51fca8Smpi 	 * if we are out of anon VM we wait for RAM to become available.
10266d51fca8Smpi 	 */
10276d51fca8Smpi 
1028b004aefeSmpi 	if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
1029335383c9Smpi 		/* promoting requires a write lock. */
1030335383c9Smpi 		KASSERT(rw_write_held(amap->am_lock));
1031335383c9Smpi 
1032627a59d1Smpi 		counters_inc(uvmexp_counters, flt_acow);
10336d51fca8Smpi 		oanon = anon;		/* oanon = old */
10346d51fca8Smpi 
1035dceff774Smpi 		error = uvmfault_promote(ufi, oanon->an_page, &anon, &pg);
1036dceff774Smpi 		if (error)
1037dceff774Smpi 			return error;
10386d51fca8Smpi 
10396d51fca8Smpi 		/* un-busy! new page */
10406d51fca8Smpi 		atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE);
10416d51fca8Smpi 		UVM_PAGE_OWN(pg, NULL);
10426d51fca8Smpi 		ret = amap_add(&ufi->entry->aref,
10436d51fca8Smpi 		    ufi->orig_rvaddr - ufi->entry->start, anon, 1);
10446d51fca8Smpi 		KASSERT(ret == 0);
10456d51fca8Smpi 
1046335383c9Smpi 		KASSERT(anon->an_lock == oanon->an_lock);
1047335383c9Smpi 
10486d51fca8Smpi 		/* deref: can not drop to zero here by defn! */
1049335383c9Smpi 		KASSERT(oanon->an_ref > 1);
10506d51fca8Smpi 		oanon->an_ref--;
10516d51fca8Smpi 
10522f953554Sguenther #if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
105343687ba5Sguenther 		/*
105443687ba5Sguenther 		 * If there are multiple threads, either uvm or the
105543687ba5Sguenther 		 * pmap has to make sure no threads see the old RO
105643687ba5Sguenther 		 * mapping once any have seen the new RW mapping.
105743687ba5Sguenther 		 * uvm does it by inserting the new mapping RO and
105843687ba5Sguenther 		 * letting it fault again.
10592f953554Sguenther 		 * This is only a problem on MP systems.
106043687ba5Sguenther 		 */
10617376e01dSguenther 		if (P_HASSIBLING(curproc)) {
106243687ba5Sguenther 			flt->enter_prot &= ~PROT_WRITE;
10637376e01dSguenther 			flt->access_type &= ~PROT_WRITE;
10647376e01dSguenther 		}
106543687ba5Sguenther #endif
106643687ba5Sguenther 
10676d51fca8Smpi 		/*
10686d51fca8Smpi 		 * note: anon is _not_ locked, but we have the sole reference
10696d51fca8Smpi 		 * to it from the amap.
10706d51fca8Smpi 		 * thus, no one can get at it until we are done with it.
10716d51fca8Smpi 		 */
10726d51fca8Smpi 	} else {
1073627a59d1Smpi 		counters_inc(uvmexp_counters, flt_anon);
10746d51fca8Smpi 		oanon = anon;
10756d51fca8Smpi 		pg = anon->an_page;
10766d51fca8Smpi 		if (anon->an_ref > 1)     /* disallow writes to ref > 1 anons */
10776d51fca8Smpi 			flt->enter_prot = flt->enter_prot & ~PROT_WRITE;
10786d51fca8Smpi 	}
10796d51fca8Smpi 
10806d51fca8Smpi 	/*
10812ed91a58Smpi 	 * now map the page in.
10826d51fca8Smpi 	 */
10836d51fca8Smpi 	if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
10846d51fca8Smpi 	    VM_PAGE_TO_PHYS(pg) | flt->pa_flags, flt->enter_prot,
1085b004aefeSmpi 	    flt->access_type | PMAP_CANFAIL | (flt->wired ? PMAP_WIRED : 0)) != 0) {
10866d51fca8Smpi 		/*
10876d51fca8Smpi 		 * No need to undo what we did; we can simply think of
10886d51fca8Smpi 		 * this as the pmap throwing away the mapping information.
10896d51fca8Smpi 		 *
10906d51fca8Smpi 		 * We do, however, have to go through the ReFault path,
10916d51fca8Smpi 		 * as the map may change while we're asleep.
10926d51fca8Smpi 		 */
10936d51fca8Smpi 		uvmfault_unlockall(ufi, amap, NULL);
10946d51fca8Smpi 		if (uvm_swapisfull()) {
10956d51fca8Smpi 			/* XXX instrumentation */
10966d51fca8Smpi 			return ENOMEM;
10976d51fca8Smpi 		}
1098679d40f1Skettenis #ifdef __HAVE_PMAP_POPULATE
1099679d40f1Skettenis 		pmap_populate(ufi->orig_map->pmap, ufi->orig_rvaddr);
1100679d40f1Skettenis #else
11016d51fca8Smpi 		/* XXX instrumentation */
11026d51fca8Smpi 		uvm_wait("flt_pmfail1");
1103679d40f1Skettenis #endif
11046d51fca8Smpi 		return ERESTART;
11056d51fca8Smpi 	}
11066d51fca8Smpi 
110752887a38Smpi 	/*
110852887a38Smpi 	 * ... update the page queues.
110952887a38Smpi 	 */
11106d51fca8Smpi 	uvm_lock_pageq();
111196ec8e93Smpi 	if (flt->wired) {
1112e5ad67b7Smpi 		uvm_pagewire(pg);
111396ec8e93Smpi 	} else {
111496ec8e93Smpi 		uvm_pageactivate(pg);
111596ec8e93Smpi 	}
111696ec8e93Smpi 	uvm_unlock_pageq();
111796ec8e93Smpi 
111896ec8e93Smpi 	if (flt->wired) {
11196d51fca8Smpi 		/*
11206d51fca8Smpi 		 * since the now-wired page cannot be paged out,
11216d51fca8Smpi 		 * release its swap resources for others to use.
11226d51fca8Smpi 		 * since an anon with no swap cannot be PG_CLEAN,
11236d51fca8Smpi 		 * clear its clean flag now.
11246d51fca8Smpi 		 */
11256d51fca8Smpi 		atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
11266d51fca8Smpi 		uvm_anon_dropswap(anon);
11276d51fca8Smpi 	}
11286d51fca8Smpi 
11292ed91a58Smpi 	/*
11302ed91a58Smpi 	 * done case 1!  finish up by unlocking everything and returning success
11312ed91a58Smpi 	 */
11326d51fca8Smpi 	uvmfault_unlockall(ufi, amap, NULL);
11336d51fca8Smpi 	pmap_update(ufi->orig_map->pmap);
11346d51fca8Smpi 	return 0;
11356d51fca8Smpi }
11366d51fca8Smpi 
1137cd713f80Smpi /*
1138a3afc610Smpi  * uvm_fault_lower_lookup: look up on-memory uobj pages.
1139cd713f80Smpi  *
1140a3afc610Smpi  *	1. get on-memory pages.
1141a3afc610Smpi  *	2. if failed, give up (get only center page later).
1142a3afc610Smpi  *	3. if succeeded, enter h/w mapping of neighbor pages.
1143cd713f80Smpi  */
1144427225b6Smpi 
1145a3afc610Smpi struct vm_page *
1146a3afc610Smpi uvm_fault_lower_lookup(
1147a3afc610Smpi 	struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
1148a3afc610Smpi 	struct vm_page **pages)
1149a3afc610Smpi {
1150a3afc610Smpi 	struct uvm_object *uobj = ufi->entry->object.uvm_obj;
1151a3afc610Smpi 	struct vm_page *uobjpage = NULL;
11525797ad06Smpi 	int lcv, gotpages, entered;
1153a3afc610Smpi 	vaddr_t currva;
11545797ad06Smpi 	paddr_t pa;
1155a3afc610Smpi 
1156552563d5Smpi 	rw_enter(uobj->vmobjlock, flt->lower_lock_type);
115769c04514Smpi 
1158627a59d1Smpi 	counters_inc(uvmexp_counters, flt_lget);
11593053940aSmpi 	gotpages = flt->npages;
1160a3afc610Smpi 	(void) uobj->pgops->pgo_get(uobj,
1161a3afc610Smpi 	    ufi->entry->offset + (flt->startva - ufi->entry->start),
11623053940aSmpi 	    pages, &gotpages, flt->centeridx,
1163a3afc610Smpi 	    flt->access_type & MASK(ufi->entry), ufi->entry->advice,
1164a3afc610Smpi 	    PGO_LOCKED);
1165cd7ee8acSart 
1166a3afc610Smpi 	/*
1167a3afc610Smpi 	 * check for pages to map, if we got any
1168a3afc610Smpi 	 */
1169a3afc610Smpi 	if (gotpages == 0) {
1170a3afc610Smpi 		return NULL;
1171a3afc610Smpi 	}
1172a3afc610Smpi 
11735797ad06Smpi 	entered = 0;
11743053940aSmpi 	currva = flt->startva;
1175a3afc610Smpi 	for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
1176cd7ee8acSart 		if (pages[lcv] == NULL ||
1177cd7ee8acSart 		    pages[lcv] == PGO_DONTCARE)
1178cd7ee8acSart 			continue;
1179cd7ee8acSart 
11800528dcd0Smpi 		KASSERT((pages[lcv]->pg_flags & PG_BUSY) == 0);
11819662fca4Sart 		KASSERT((pages[lcv]->pg_flags & PG_RELEASED) == 0);
1182cd7ee8acSart 
1183cd7ee8acSart 		/*
11840528dcd0Smpi 		 * if center page is resident and not PG_BUSY, then pgo_get
11850528dcd0Smpi 		 * gave us a handle to it.
11860528dcd0Smpi 		 * remember this page as "uobjpage." (for later use).
1187cd7ee8acSart 		 */
11883053940aSmpi 		if (lcv == flt->centeridx) {
1189cd7ee8acSart 			uobjpage = pages[lcv];
1190cd7ee8acSart 			continue;
1191cd7ee8acSart 		}
1192cd7ee8acSart 
11935797ad06Smpi 		if (pmap_extract(ufi->orig_map->pmap, currva, &pa))
11940528dcd0Smpi 			continue;
1195cd7ee8acSart 
11960528dcd0Smpi 		/*
11970528dcd0Smpi 		 * calling pgo_get with PGO_LOCKED returns us pages which
11980528dcd0Smpi 		 * are neither busy nor released, so we don't need to check
11990528dcd0Smpi 		 * for this.  we can just directly enter the pages.
12000528dcd0Smpi 		 */
12015797ad06Smpi 		if (pages[lcv]->wire_count == 0) {
1202cd7ee8acSart 			uvm_lock_pageq();
12035797ad06Smpi 			uvm_pageactivate(pages[lcv]);
1204cd7ee8acSart 			uvm_unlock_pageq();
12055797ad06Smpi 		}
1206627a59d1Smpi 		counters_inc(uvmexp_counters, flt_nomap);
1207cac1bff1Sart 
12088a233859Smpi 		/* No fault-ahead when wired. */
12098a233859Smpi 		KASSERT(flt->wired == FALSE);
12108a233859Smpi 
121165f111fbSart 		/*
12120528dcd0Smpi 		 * Since this page isn't the page that's actually faulting,
12130528dcd0Smpi 		 * ignore pmap_enter() failures; it's not critical that we
121465f111fbSart 		 * enter these right now.
12150528dcd0Smpi 		 * NOTE: page can't be PG_WANTED or PG_RELEASED because we've
12160528dcd0Smpi 		 * held the lock the whole time we've had the handle.
121765f111fbSart 		 */
12183053940aSmpi 		(void) pmap_enter(ufi->orig_map->pmap, currva,
12193053940aSmpi 		    VM_PAGE_TO_PHYS(pages[lcv]) | flt->pa_flags,
12208a233859Smpi 		    flt->enter_prot & MASK(ufi->entry), PMAP_CANFAIL);
12215797ad06Smpi 		entered++;
1222cd7ee8acSart 
1223a3afc610Smpi 	}
12245797ad06Smpi 	if (entered > 0)
12253053940aSmpi 		pmap_update(ufi->orig_map->pmap);
1226a3afc610Smpi 
1227a3afc610Smpi 	return uobjpage;
1228a3afc610Smpi }
1229a3afc610Smpi 
1230a3afc610Smpi /*
1231a3afc610Smpi  * uvm_fault_lower: handle lower fault.
1232a3afc610Smpi  *
1233cce913b9Smpi  *	1. check uobj
1234cce913b9Smpi  *	1.1. if null, ZFOD.
1235cce913b9Smpi  *	1.2. if not null, look up unmapped neighbor pages.
1236cce913b9Smpi  *	2. for center page, check if promote.
1237cce913b9Smpi  *	2.1. ZFOD always needs promotion.
1238cce913b9Smpi  *	2.2. other uobjs, when entry is marked COW (usually MAP_PRIVATE vnode).
1239cce913b9Smpi  *	3. if uobj is not ZFOD and page is not found, do i/o.
1240cce913b9Smpi  *	4. dispatch either direct / promote fault.
1241a3afc610Smpi  */
1242a3afc610Smpi int
1243a3afc610Smpi uvm_fault_lower(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
124458243cbfSmpi    struct vm_page **pages)
1245a3afc610Smpi {
1246a3afc610Smpi 	struct vm_amap *amap = ufi->entry->aref.ar_amap;
1247a3afc610Smpi 	struct uvm_object *uobj = ufi->entry->object.uvm_obj;
1248cce913b9Smpi 	int dropswap = 0;
1249a3afc610Smpi 	struct vm_page *uobjpage, *pg = NULL;
1250a3afc610Smpi 	struct vm_anon *anon = NULL;
1251cce913b9Smpi 	int error;
1252a3afc610Smpi 
1253a3afc610Smpi 	/*
1254a3afc610Smpi 	 * now, if the desired page is not shadowed by the amap and we have
1255a3afc610Smpi 	 * a backing object that does not have a special fault routine, then
1256a3afc610Smpi 	 * we ask (with pgo_get) the object for resident pages that we care
1257a3afc610Smpi 	 * about and attempt to map them in.  we do not let pgo_get block
1258a3afc610Smpi 	 * (PGO_LOCKED).
1259a3afc610Smpi 	 */
1260a3afc610Smpi 	if (uobj == NULL) {
126152887a38Smpi 		/* zero fill; we don't care about neighbor pages */
1262cd7ee8acSart 		uobjpage = NULL;
1263a3afc610Smpi 	} else {
1264a3afc610Smpi 		uobjpage = uvm_fault_lower_lookup(ufi, flt, pages);
1265cd7ee8acSart 	}
1266cd7ee8acSart 
1267cd7ee8acSart 	/*
1268cd7ee8acSart 	 * note that at this point we are done with any front or back pages.
1269cd7ee8acSart 	 * we are now going to focus on the center page (i.e. the one we've
1270427225b6Smpi 	 * faulted on).  if we have faulted on the bottom (uobj)
1271cd7ee8acSart 	 * layer [i.e. case 2] and the page was both present and available,
1272cd7ee8acSart 	 * then we've got a pointer to it as "uobjpage" and we've already
1273cd7ee8acSart 	 * made it BUSY.
1274cd7ee8acSart 	 */
1275cd7ee8acSart 
1276cd7ee8acSart 	/*
127769c04514Smpi 	 * locked:
127869c04514Smpi 	 * locked: maps(read), amap(if there), uobj(if !null)
127969c04514Smpi 	KASSERT(amap == NULL ||
128069c04514Smpi 	    rw_write_held(amap->am_lock));
128169c04514Smpi 	KASSERT(uobj == NULL ||
1282552563d5Smpi 	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
128369c04514Smpi 
128469c04514Smpi 	/*
1285cd7ee8acSart 	 * note that uobjpage can not be PGO_DONTCARE at this point.  we now
1286cd7ee8acSart 	 * set uobjpage to PGO_DONTCARE if we are doing a zero fill.  if we
1287cd7ee8acSart 	 * have a backing object, check and see if we are going to promote
1288cd7ee8acSart 	 * the data up to an anon during the fault.
1289cd7ee8acSart 	 */
1290cd7ee8acSart 	if (uobj == NULL) {
1291cd7ee8acSart 		uobjpage = PGO_DONTCARE;
1292cce913b9Smpi 		flt->promote = TRUE;		/* always need anon here */
1293cd7ee8acSart 	} else {
12941496ff33Sart 		KASSERT(uobjpage != PGO_DONTCARE);
1295cce913b9Smpi 		flt->promote = (flt->access_type & PROT_WRITE) &&
12963053940aSmpi 		     UVM_ET_ISCOPYONWRITE(ufi->entry);
1297cd7ee8acSart 	}
1298cd7ee8acSart 
1299cd7ee8acSart 	/*
1300cd7ee8acSart 	 * if uobjpage is not null then we do not need to do I/O to get the
1301cd7ee8acSart 	 * uobjpage.
1302cd7ee8acSart 	 *
1303b8a635f6Stedu 	 * if uobjpage is null, then we need to ask the pager to
1304cd7ee8acSart 	 * get the data for us.   once we have the data, we need to reverify
1305cd7ee8acSart 	 * the state of the world.   we are currently not holding any resources.
1306cd7ee8acSart 	 */
1307cd7ee8acSart 	if (uobjpage) {
1308cd7ee8acSart 		/* update rusage counters */
13098f15e6a4Sguenther 		curproc->p_ru.ru_minflt++;
13100528dcd0Smpi 		if (uobjpage != PGO_DONTCARE) {
13110528dcd0Smpi 			uvm_lock_pageq();
13120528dcd0Smpi 			uvm_pageactivate(uobjpage);
13130528dcd0Smpi 			uvm_unlock_pageq();
13140528dcd0Smpi 		}
1315cd7ee8acSart 	} else {
1316cce913b9Smpi 		error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage);
1317cce913b9Smpi 		if (error != 0)
1318cce913b9Smpi 			return error;
1319cd7ee8acSart 	}
1320cd7ee8acSart 
1321cd7ee8acSart 	/*
1322cd7ee8acSart 	 * notes:
1323cd7ee8acSart 	 *  - at this point uobjpage can not be NULL
1324cd7ee8acSart 	 *  - at this point uobjpage could be PG_WANTED (handle later)
1325cd7ee8acSart 	 */
1326cce913b9Smpi 	if (flt->promote == FALSE) {
1327cd7ee8acSart 		/*
1328cd7ee8acSart 		 * we are not promoting.   if the mapping is COW ensure that we
1329cd7ee8acSart 		 * don't give more access than we should (e.g. when doing a read
1330cd7ee8acSart 		 * fault on a COPYONWRITE mapping we want to map the COW page in
1331cd7ee8acSart 		 * R/O even though the entry protection could be R/W).
1332cd7ee8acSart 		 *
1333cd7ee8acSart 		 * set "pg" to the page we want to map in (uobjpage, usually)
1334cd7ee8acSart 		 */
1335627a59d1Smpi 		counters_inc(uvmexp_counters, flt_obj);
13363053940aSmpi 		if (UVM_ET_ISCOPYONWRITE(ufi->entry))
13373053940aSmpi 			flt->enter_prot &= ~PROT_WRITE;
1338cd7ee8acSart 		pg = uobjpage;		/* map in the actual object */
1339cd7ee8acSart 
1340cd7ee8acSart 		/* assert(uobjpage != PGO_DONTCARE) */
1341cd7ee8acSart 
1342cd7ee8acSart 		/*
13436f909936Svisa 		 * we are faulting directly on the page.
1344cd7ee8acSart 		 */
1345cd7ee8acSart 	} else {
1346552563d5Smpi 		KASSERT(amap != NULL);
1347552563d5Smpi 
1348552563d5Smpi 		/* promoting requires a write lock. */
1349552563d5Smpi 	        KASSERT(rw_write_held(amap->am_lock));
1350552563d5Smpi 	        KASSERT(uobj == NULL ||
1351552563d5Smpi 	            rw_status(uobj->vmobjlock) == flt->lower_lock_type);
1352552563d5Smpi 
1353cd7ee8acSart 		/*
1354cd7ee8acSart 		 * if we are going to promote the data to an anon we
1355cd7ee8acSart 		 * allocate a blank anon here and plug it into our amap.
1356cd7ee8acSart 		 */
1357dceff774Smpi 		error = uvmfault_promote(ufi, uobjpage, &anon, &pg);
1358dceff774Smpi 		if (error)
1359dceff774Smpi 			return error;
1360cd7ee8acSart 
136152887a38Smpi 		/*
136252887a38Smpi 		 * fill in the data
136352887a38Smpi 		 */
1364cd7ee8acSart 		if (uobjpage != PGO_DONTCARE) {
1365627a59d1Smpi 			counters_inc(uvmexp_counters, flt_prcopy);
1366cd7ee8acSart 
1367cd7ee8acSart 			/*
1368cd7ee8acSart 			 * promote to shared amap?  make sure all sharing
1369cd7ee8acSart 			 * procs see it
1370cd7ee8acSart 			 */
1371cd7ee8acSart 			if ((amap_flags(amap) & AMAP_SHARED) != 0) {
13721e8cdc2eSderaadt 				pmap_page_protect(uobjpage, PROT_NONE);
1373cd7ee8acSart 			}
1374b3774972Sguenther #if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
1375b3774972Sguenther 			/*
1376b3774972Sguenther 			 * Otherwise:
1377b3774972Sguenther 			 * If there are multiple threads, either uvm or the
1378b3774972Sguenther 			 * pmap has to make sure no threads see the old RO
1379b3774972Sguenther 			 * mapping once any have seen the new RW mapping.
1380b3774972Sguenther 			 * uvm does it here by forcing it to PROT_NONE before
1381b3774972Sguenther 			 * inserting the new mapping.
1382b3774972Sguenther 			 */
1383b3774972Sguenther 			else if (P_HASSIBLING(curproc)) {
1384b3774972Sguenther 				pmap_page_protect(uobjpage, PROT_NONE);
1385b3774972Sguenther 			}
1386b3774972Sguenther #endif
138707c549d8Smpi 			/* done with copied uobjpage. */
138869c04514Smpi 			rw_exit(uobj->vmobjlock);
1389cd7ee8acSart 			uobj = NULL;
1390cd7ee8acSart 		} else {
1391627a59d1Smpi 			counters_inc(uvmexp_counters, flt_przero);
1392b1990b04Sart 			/*
1393dceff774Smpi 			 * Page is zero'd and marked dirty by uvm_pagealloc(),
1394dceff774Smpi 			 * called in uvmfault_promote() above.
1395b1990b04Sart 			 */
1396cd7ee8acSart 		}
1397cd7ee8acSart 
13983053940aSmpi 		if (amap_add(&ufi->entry->aref,
13993053940aSmpi 		    ufi->orig_rvaddr - ufi->entry->start, anon, 0)) {
1400b16b5f31Smpi 			if (pg->pg_flags & PG_WANTED)
1401b16b5f31Smpi 				wakeup(pg);
1402b16b5f31Smpi 
1403b16b5f31Smpi 			atomic_clearbits_int(&pg->pg_flags,
1404b16b5f31Smpi 			    PG_BUSY|PG_FAKE|PG_WANTED);
1405b16b5f31Smpi 			UVM_PAGE_OWN(pg, NULL);
140669c04514Smpi 			uvmfault_unlockall(ufi, amap, uobj);
14074bfd0d76Sstefan 			uvm_anfree(anon);
1408627a59d1Smpi 			counters_inc(uvmexp_counters, flt_noamap);
14094bfd0d76Sstefan 
1410afd3b31eSmpi 			if (uvm_swapisfull())
14114bfd0d76Sstefan 				return (ENOMEM);
14124bfd0d76Sstefan 
14133053940aSmpi 			amap_populate(&ufi->entry->aref,
14143053940aSmpi 			    ufi->orig_rvaddr - ufi->entry->start);
14153053940aSmpi 			return ERESTART;
14164bfd0d76Sstefan 		}
1417cd7ee8acSart 	}
1418cd7ee8acSart 
1419552563d5Smpi 	/*
1420552563d5Smpi 	 * anon must be write locked (promotion).  uobj can be either.
1421552563d5Smpi 	 *
1422552563d5Smpi 	 * Note: pg is either the uobjpage or the new page in the new anon.
1423552563d5Smpi 	 */
1424552563d5Smpi 	KASSERT(amap == NULL ||
1425552563d5Smpi 	    rw_write_held(amap->am_lock));
1426552563d5Smpi 	KASSERT(uobj == NULL ||
1427552563d5Smpi 	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
1428552563d5Smpi 	KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
1429552563d5Smpi 
1430cd7ee8acSart 	/*
1431cd7ee8acSart 	 * all resources are present.   we can now map it in and free our
1432cd7ee8acSart 	 * resources.
1433cd7ee8acSart 	 */
14343053940aSmpi 	if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
14353053940aSmpi 	    VM_PAGE_TO_PHYS(pg) | flt->pa_flags, flt->enter_prot,
1436b004aefeSmpi 	    flt->access_type | PMAP_CANFAIL | (flt->wired ? PMAP_WIRED : 0)) != 0) {
143765f111fbSart 		/*
143865f111fbSart 		 * No need to undo what we did; we can simply think of
143965f111fbSart 		 * this as the pmap throwing away the mapping information.
144065f111fbSart 		 *
144165f111fbSart 		 * We do, however, have to go through the ReFault path,
144265f111fbSart 		 * as the map may change while we're asleep.
144365f111fbSart 		 */
14449662fca4Sart 		if (pg->pg_flags & PG_WANTED)
1445b8a635f6Stedu 			wakeup(pg);
144665f111fbSart 
144765d6360cSart 		atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE|PG_WANTED);
144865f111fbSart 		UVM_PAGE_OWN(pg, NULL);
14493053940aSmpi 		uvmfault_unlockall(ufi, amap, uobj);
1450afd3b31eSmpi 		if (uvm_swapisfull()) {
145165f111fbSart 			/* XXX instrumentation */
1452159b0ca6Sart 			return (ENOMEM);
145365f111fbSart 		}
1454679d40f1Skettenis #ifdef __HAVE_PMAP_POPULATE
1455679d40f1Skettenis 		pmap_populate(ufi->orig_map->pmap, ufi->orig_rvaddr);
1456679d40f1Skettenis #else
145765f111fbSart 		/* XXX instrumentation */
145865f111fbSart 		uvm_wait("flt_pmfail2");
1459679d40f1Skettenis #endif
14603053940aSmpi 		return ERESTART;
146165f111fbSart 	}
1462cd7ee8acSart 
146369c04514Smpi 	uvm_lock_pageq();
146496ec8e93Smpi 	if (flt->wired) {
1465cd7ee8acSart 		uvm_pagewire(pg);
146665d6360cSart 		if (pg->pg_flags & PQ_AOBJ) {
14678a42ed70Sart 			/*
14688a42ed70Sart 			 * since the now-wired page cannot be paged out,
14698a42ed70Sart 			 * release its swap resources for others to use.
147069c04514Smpi 			 * since an aobj page with no swap cannot be clean,
147169c04514Smpi 			 * mark it dirty now.
147269c04514Smpi 			 *
147369c04514Smpi 			 * use pg->uobject here.  if the page is from a
147469c04514Smpi 			 * tmpfs vnode, the pages are backed by its UAO and
147569c04514Smpi 			 * not the vnode.
14768a42ed70Sart 			 */
147769c04514Smpi 			KASSERT(uobj != NULL);
147869c04514Smpi 			KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
147965d6360cSart 			atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
148096ec8e93Smpi 			dropswap = 1;
14818a42ed70Sart 		}
14828a42ed70Sart 	} else {
1483cd7ee8acSart 		uvm_pageactivate(pg);
1484e5ad67b7Smpi 	}
148596ec8e93Smpi 	uvm_unlock_pageq();
148696ec8e93Smpi 
148796ec8e93Smpi 	if (dropswap)
148896ec8e93Smpi 		uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
1489cd7ee8acSart 
14909662fca4Sart 	if (pg->pg_flags & PG_WANTED)
1491b8a635f6Stedu 		wakeup(pg);
1492cd7ee8acSart 
149365d6360cSart 	atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE|PG_WANTED);
1494cd7ee8acSart 	UVM_PAGE_OWN(pg, NULL);
14953053940aSmpi 	uvmfault_unlockall(ufi, amap, uobj);
14963053940aSmpi 	pmap_update(ufi->orig_map->pmap);
1497cd7ee8acSart 
149873c19439Sart 	return (0);
1499cd7ee8acSart }
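
/*
 * Sketch of the retry convention used by uvm_fault_lower() and its
 * helpers (illustrative only; the actual loop is in uvm_fault(),
 * earlier in this file): ERESTART is not an error, it merely asks the
 * top-level fault loop to redo the lookup because locks were dropped
 * and the world may have changed.  Roughly:
 *
 *	error = ERESTART;
 *	while (error == ERESTART) {
 *		... look up the map entry, handle the upper layer ...
 *		error = uvm_fault_lower(&ufi, &flt, pages);
 *	}
 *	return error;
 */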
1500cd7ee8acSart 
1501cce913b9Smpi /*
1502cce913b9Smpi  * uvm_fault_lower_io: get lower page from backing store.
1503cce913b9Smpi  *
1504cce913b9Smpi  *	1. unlock everything, because i/o will block.
1505cce913b9Smpi  *	2. call pgo_get.
1506cce913b9Smpi  *	3. if failed, recover.
1507cce913b9Smpi  *	4. if succeeded, relock everything and verify things.
1508cce913b9Smpi  */
1509cce913b9Smpi int
1510cce913b9Smpi uvm_fault_lower_io(
1511cce913b9Smpi 	struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
1512cce913b9Smpi 	struct uvm_object **ruobj, struct vm_page **ruobjpage)
1513cce913b9Smpi {
1514cce913b9Smpi 	struct vm_amap * const amap = ufi->entry->aref.ar_amap;
1515cce913b9Smpi 	struct uvm_object *uobj = *ruobj;
1516cce913b9Smpi 	struct vm_page *pg;
1517cce913b9Smpi 	boolean_t locked;
15188774a958Smpi 	int gotpages, advice;
1519cce913b9Smpi 	int result;
1520cce913b9Smpi 	voff_t uoff;
15218774a958Smpi 	vm_prot_t access_type;
15228774a958Smpi 
15238774a958Smpi 	/* grab everything we need from the entry before we unlock */
15248774a958Smpi 	uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset;
15258774a958Smpi 	access_type = flt->access_type & MASK(ufi->entry);
15268774a958Smpi 	advice = ufi->entry->advice;
15278774a958Smpi 
15288774a958Smpi 	uvmfault_unlockall(ufi, amap, NULL);
1529cce913b9Smpi 
1530cce913b9Smpi 	/* update rusage counters */
1531cce913b9Smpi 	curproc->p_ru.ru_majflt++;
1532cce913b9Smpi 
15338774a958Smpi 	KASSERT(rw_write_held(uobj->vmobjlock));
1534cce913b9Smpi 
1535cce913b9Smpi 	counters_inc(uvmexp_counters, flt_get);
1536cce913b9Smpi 	gotpages = 1;
1537cce913b9Smpi 	pg = NULL;
1538cce913b9Smpi 	result = uobj->pgops->pgo_get(uobj, uoff, &pg, &gotpages,
15398774a958Smpi 	    0, access_type, advice, PGO_SYNCIO);
1540cce913b9Smpi 
1541cce913b9Smpi 	/*
1542cce913b9Smpi 	 * recover from I/O
1543cce913b9Smpi 	 */
1544cce913b9Smpi 	if (result != VM_PAGER_OK) {
1545cce913b9Smpi 		KASSERT(result != VM_PAGER_PEND);
1546cce913b9Smpi 
1547cce913b9Smpi 		if (result == VM_PAGER_AGAIN) {
1548cce913b9Smpi 			tsleep_nsec(&nowake, PVM, "fltagain2", MSEC_TO_NSEC(5));
1549cce913b9Smpi 			return ERESTART;
1550cce913b9Smpi 		}
1551cce913b9Smpi 
1552cce913b9Smpi 		if (!UVM_ET_ISNOFAULT(ufi->entry))
1553cce913b9Smpi 			return (EIO);
1554cce913b9Smpi 
1555cce913b9Smpi 		pg = PGO_DONTCARE;
1556cce913b9Smpi 		uobj = NULL;
1557cce913b9Smpi 		flt->promote = TRUE;
1558cce913b9Smpi 	}
1559cce913b9Smpi 
1560cce913b9Smpi 	/* re-verify the state of the world.  */
1561cce913b9Smpi 	locked = uvmfault_relock(ufi);
1562cce913b9Smpi 	if (locked && amap != NULL)
1563335383c9Smpi 		amap_lock(amap, RW_WRITE);
1564cce913b9Smpi 
1565cce913b9Smpi 	/* the object owning pg might have changed while we slept */
1566cce913b9Smpi 	if (pg != PGO_DONTCARE) {
1567cce913b9Smpi 		uobj = pg->uobject;
1568552563d5Smpi 		rw_enter(uobj->vmobjlock, flt->lower_lock_type);
1569552563d5Smpi 		KASSERT((pg->pg_flags & PG_BUSY) != 0);
1570552563d5Smpi 		KASSERT(flt->lower_lock_type == RW_WRITE);
1571cce913b9Smpi 	}
1572cce913b9Smpi 
1573cce913b9Smpi 	/*
1574cce913b9Smpi 	 * Re-verify that amap slot is still free. if there is
1575cce913b9Smpi 	 * a problem, we clean up.
1576cce913b9Smpi 	 */
1577cce913b9Smpi 	if (locked && amap && amap_lookup(&ufi->entry->aref,
1578cce913b9Smpi 	      ufi->orig_rvaddr - ufi->entry->start)) {
1579cce913b9Smpi 		if (locked)
1580cce913b9Smpi 			uvmfault_unlockall(ufi, amap, NULL);
1581cce913b9Smpi 		locked = FALSE;
1582cce913b9Smpi 	}
1583cce913b9Smpi 
15840528dcd0Smpi 	/* release the page now, still holding object lock */
15850528dcd0Smpi 	if (pg != PGO_DONTCARE) {
1586cce913b9Smpi 		uvm_lock_pageq();
1587cce913b9Smpi 		uvm_pageactivate(pg);
1588cce913b9Smpi 		uvm_unlock_pageq();
1589cce913b9Smpi 
1590cce913b9Smpi 		if (pg->pg_flags & PG_WANTED)
1591cce913b9Smpi 			wakeup(pg);
1592cce913b9Smpi 		atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_WANTED);
1593cce913b9Smpi 		UVM_PAGE_OWN(pg, NULL);
1594cce913b9Smpi 	}
1595cce913b9Smpi 
1596cce913b9Smpi 	if (locked == FALSE) {
1597cce913b9Smpi 		if (pg != PGO_DONTCARE)
1598cce913b9Smpi 			rw_exit(uobj->vmobjlock);
1599cce913b9Smpi 		return ERESTART;
1600cce913b9Smpi 	}
1601cce913b9Smpi 
1602cce913b9Smpi 	/*
16030528dcd0Smpi 	 * we have the data in pg.  we are holding object lock (so the page
16040528dcd0Smpi 	 * can't be released on us).
1605cce913b9Smpi 	 */
1606cce913b9Smpi 	*ruobj = uobj;
1607cce913b9Smpi 	*ruobjpage = pg;
1608cce913b9Smpi 	return 0;
1609cce913b9Smpi }
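
/*
 * Illustrative contrast (sketch): uvm_fault_lower_lookup() may only
 * take pages that are already resident and must not sleep, while
 * uvm_fault_lower_io() accepts blocking I/O:
 *
 *	pgo_get(uobj, uoff, pages, &gotpages, centeridx, access_type,
 *	    advice, PGO_LOCKED);	non-blocking, resident pages only
 *	pgo_get(uobj, uoff, &pg, &gotpages, 0, access_type, advice,
 *	    PGO_SYNCIO);		may sleep waiting for backing store
 */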
1610cd7ee8acSart 
1611cd7ee8acSart /*
1612cd7ee8acSart  * uvm_fault_wire: wire down a range of virtual addresses in a map.
1613cd7ee8acSart  *
161407b6088bSart  * => map may be read-locked by caller, but MUST NOT be write-locked.
161507b6088bSart  * => if map is read-locked, any operations which may cause map to
161607b6088bSart  *	be write-locked in uvm_fault() must be taken care of by
161707b6088bSart  *	the caller.  See uvm_map_pageable().
1618cd7ee8acSart  */
1619cd7ee8acSart int
16202023d591Soga uvm_fault_wire(vm_map_t map, vaddr_t start, vaddr_t end, vm_prot_t access_type)
1621cd7ee8acSart {
1622cd7ee8acSart 	vaddr_t va;
16231414b0faSart 	int rv;
16241414b0faSart 
1625cd7ee8acSart 	/*
162628fbabcfSart 	 * now fault it in a page at a time.   if the fault fails then we have
16271e8cdc2eSderaadt 	 * to undo what we have done.   note that in uvm_fault PROT_NONE
162828fbabcfSart 	 * is replaced with the max protection if fault_type is VM_FAULT_WIRE.
1629cd7ee8acSart 	 */
1630cd7ee8acSart 	for (va = start ; va < end ; va += PAGE_SIZE) {
16311414b0faSart 		rv = uvm_fault(map, va, VM_FAULT_WIRE, access_type);
16321414b0faSart 		if (rv) {
1633cd7ee8acSart 			if (va != start) {
16347cb53682Sart 				uvm_fault_unwire(map, start, va);
1635cd7ee8acSart 			}
16361414b0faSart 			return (rv);
1637cd7ee8acSart 		}
1638cd7ee8acSart 	}
16391414b0faSart 
164073c19439Sart 	return (0);
1641cd7ee8acSart }
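
/*
 * Usage sketch (illustrative only; uvm_map_pageable() is the normal
 * caller and takes care of the map locking rules documented above):
 *
 *	if (uvm_fault_wire(map, start, end, PROT_READ | PROT_WRITE) == 0) {
 *		... touch [start, end) without taking page faults ...
 *		uvm_fault_unwire(map, start, end);
 *	}
 */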
1642cd7ee8acSart 
1643cd7ee8acSart /*
1644cd7ee8acSart  * uvm_fault_unwire(): unwire range of virtual space.
1645cd7ee8acSart  */
1646cd7ee8acSart void
16472023d591Soga uvm_fault_unwire(vm_map_t map, vaddr_t start, vaddr_t end)
1648cd7ee8acSart {
164907b6088bSart 
165007b6088bSart 	vm_map_lock_read(map);
165107b6088bSart 	uvm_fault_unwire_locked(map, start, end);
165207b6088bSart 	vm_map_unlock_read(map);
165307b6088bSart }
165407b6088bSart 
165507b6088bSart /*
165607b6088bSart  * uvm_fault_unwire_locked(): the guts of uvm_fault_unwire().
165707b6088bSart  *
165807b6088bSart  * => map must be at least read-locked.
165907b6088bSart  */
166007b6088bSart void
16612023d591Soga uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end)
166207b6088bSart {
166369c04514Smpi 	vm_map_entry_t entry, oentry = NULL, next;
16647cb53682Sart 	pmap_t pmap = vm_map_pmap(map);
1665cd7ee8acSart 	vaddr_t va;
1666cd7ee8acSart 	paddr_t pa;
1667cd7ee8acSart 	struct vm_page *pg;
1668cd7ee8acSart 
16691496ff33Sart 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
167055490b01Smpi 	vm_map_assert_anylock(map);
16717cb53682Sart 
1672cd7ee8acSart 	/*
1673cd7ee8acSart 	 * we assume that the area we are unwiring has actually been wired
1674cd7ee8acSart 	 * in the first place.   this means that we should be able to extract
167569c04514Smpi 	 * the PAs from the pmap.
1676cd7ee8acSart 	 */
167752887a38Smpi 
167852887a38Smpi 	/*
167952887a38Smpi 	 * find the beginning map entry for the region.
168052887a38Smpi 	 */
1681cac1bff1Sart 	KASSERT(start >= vm_map_min(map) && end <= vm_map_max(map));
168207b6088bSart 	if (uvm_map_lookup_entry(map, start, &entry) == FALSE)
168307b6088bSart 		panic("uvm_fault_unwire_locked: address not in map");
168407b6088bSart 
1685cd7ee8acSart 	for (va = start; va < end ; va += PAGE_SIZE) {
168652887a38Smpi 		/*
168752887a38Smpi 		 * find the map entry for the current address.
168852887a38Smpi 		 */
1689cac1bff1Sart 		KASSERT(va >= entry->start);
1690f77c8782Skettenis 		while (entry && va >= entry->end) {
1691415d6aa0Sdlg 			next = RBT_NEXT(uvm_map_addr, entry);
1692181c6205Sariane 			entry = next;
169307b6088bSart 		}
169407b6088bSart 
1695f77c8782Skettenis 		if (entry == NULL)
1696f77c8782Skettenis 			return;
1697f77c8782Skettenis 		if (va < entry->start)
1698f77c8782Skettenis 			continue;
1699f77c8782Skettenis 
170052887a38Smpi 		/*
170169c04514Smpi 		 * lock it.
170269c04514Smpi 		 */
170369c04514Smpi 		if (entry != oentry) {
170469c04514Smpi 			if (oentry != NULL) {
170569c04514Smpi 				uvm_map_unlock_entry(oentry);
170669c04514Smpi 			}
170769c04514Smpi 			uvm_map_lock_entry(entry);
170869c04514Smpi 			oentry = entry;
170969c04514Smpi 		}
171069c04514Smpi 
17110535051cSmpi 		if (!pmap_extract(pmap, va, &pa))
17120535051cSmpi 			continue;
17130535051cSmpi 
171469c04514Smpi 		/*
171552887a38Smpi 		 * if the entry is no longer wired, tell the pmap.
171652887a38Smpi 		 */
171707b6088bSart 		if (VM_MAPENT_ISWIRED(entry) == 0)
1718ebb3c897Sart 			pmap_unwire(pmap, va);
1719fb33f38cSniklas 
1720cd7ee8acSart 		pg = PHYS_TO_VM_PAGE(pa);
172169c04514Smpi 		if (pg) {
172269c04514Smpi 			uvm_lock_pageq();
1723cd7ee8acSart 			uvm_pageunwire(pg);
172469c04514Smpi 			uvm_unlock_pageq();
172569c04514Smpi 		}
1726cd7ee8acSart 	}
1727cd7ee8acSart 
172869c04514Smpi 	if (oentry != NULL) {
1729f77c8782Skettenis 		uvm_map_unlock_entry(oentry);
173069c04514Smpi 	}
1731cd7ee8acSart }
1732ca5c6958Soga 
1733ca5c6958Soga /*
1734ca5c6958Soga  * uvmfault_unlockmaps: unlock the maps
1735ca5c6958Soga  */
1736ca5c6958Soga void
1737ca5c6958Soga uvmfault_unlockmaps(struct uvm_faultinfo *ufi, boolean_t write_locked)
1738ca5c6958Soga {
1739ca5c6958Soga 	/*
1740ca5c6958Soga 	 * ufi can be NULL when this isn't really a fault,
1741ca5c6958Soga 	 * but merely paging in anon data.
1742ca5c6958Soga 	 */
1743ca5c6958Soga 	if (ufi == NULL) {
1744ca5c6958Soga 		return;
1745ca5c6958Soga 	}
1746ca5c6958Soga 
17470372dd1aSariane 	uvmfault_update_stats(ufi);
1748ca5c6958Soga 	if (write_locked) {
1749ca5c6958Soga 		vm_map_unlock(ufi->map);
1750ca5c6958Soga 	} else {
1751ca5c6958Soga 		vm_map_unlock_read(ufi->map);
1752ca5c6958Soga 	}
1753ca5c6958Soga }
1754ca5c6958Soga 
1755ca5c6958Soga /*
1756ca5c6958Soga  * uvmfault_unlockall: unlock everything passed in.
1757ca5c6958Soga  *
1758ca5c6958Soga  * => maps must be read-locked (not write-locked).
1759ca5c6958Soga  */
1760ca5c6958Soga void
1761ca5c6958Soga uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap,
1762ec3489eeSmpi     struct uvm_object *uobj)
1763ca5c6958Soga {
176469c04514Smpi 	if (uobj)
176569c04514Smpi 		rw_exit(uobj->vmobjlock);
176619dcab73Smpi 	if (amap != NULL)
176719dcab73Smpi 		amap_unlock(amap);
1768ca5c6958Soga 	uvmfault_unlockmaps(ufi, FALSE);
1769ca5c6958Soga }
1770ca5c6958Soga 
1771ca5c6958Soga /*
1772ca5c6958Soga  * uvmfault_lookup: lookup a virtual address in a map
1773ca5c6958Soga  *
1774ca5c6958Soga  * => caller must provide a uvm_faultinfo structure with the IN
1775ca5c6958Soga  *	params properly filled in
1776ca5c6958Soga  * => we will lookup the map entry (handling submaps) as we go
1777ca5c6958Soga  * => if the lookup is a success we will return with the maps locked
1778ca5c6958Soga  * => if "write_lock" is TRUE, we write_lock the map, otherwise we only
1779ca5c6958Soga  *	get a read lock.
1780ca5c6958Soga  * => note that submaps can only appear in the kernel and they are
1781ca5c6958Soga  *	required to use the same virtual addresses as the map they
1782ca5c6958Soga  *	are referenced by (thus address translation between the main
1783ca5c6958Soga  *	map and the submap is unnecessary).
1784ca5c6958Soga  */
1785ca5c6958Soga 
1786ca5c6958Soga boolean_t
1787ca5c6958Soga uvmfault_lookup(struct uvm_faultinfo *ufi, boolean_t write_lock)
1788ca5c6958Soga {
1789ca5c6958Soga 	vm_map_t tmpmap;
1790ca5c6958Soga 
179152887a38Smpi 	/*
179252887a38Smpi 	 * init ufi values for lookup.
179352887a38Smpi 	 */
1794ca5c6958Soga 	ufi->map = ufi->orig_map;
1795ca5c6958Soga 	ufi->size = ufi->orig_size;
1796ca5c6958Soga 
1797ca5c6958Soga 	/*
1798ca5c6958Soga 	 * keep going down levels until we are done.   note that there can
1799ca5c6958Soga 	 * only be two levels so we won't loop very long.
1800ca5c6958Soga 	 */
1801ca5c6958Soga 	while (1) {
1802181c6205Sariane 		if (ufi->orig_rvaddr < ufi->map->min_offset ||
1803181c6205Sariane 		    ufi->orig_rvaddr >= ufi->map->max_offset)
1804b9df1565Smpi 			return FALSE;
180597581e8aSariane 
180635164244Stedu 		/* lock map */
1807ca5c6958Soga 		if (write_lock) {
1808ca5c6958Soga 			vm_map_lock(ufi->map);
1809ca5c6958Soga 		} else {
1810ca5c6958Soga 			vm_map_lock_read(ufi->map);
1811ca5c6958Soga 		}
1812ca5c6958Soga 
181335164244Stedu 		/* lookup */
1814ca5c6958Soga 		if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr,
1815ca5c6958Soga 		    &ufi->entry)) {
1816ca5c6958Soga 			uvmfault_unlockmaps(ufi, write_lock);
1817b9df1565Smpi 			return FALSE;
1818ca5c6958Soga 		}
1819ca5c6958Soga 
182035164244Stedu 		/* reduce size if necessary */
1821ca5c6958Soga 		if (ufi->entry->end - ufi->orig_rvaddr < ufi->size)
1822ca5c6958Soga 			ufi->size = ufi->entry->end - ufi->orig_rvaddr;
1823ca5c6958Soga 
1824ca5c6958Soga 		/*
1825ca5c6958Soga 		 * submap?    replace map with the submap and lookup again.
1826ca5c6958Soga 		 * note: VAs in submaps must match VAs in main map.
1827ca5c6958Soga 		 */
1828ca5c6958Soga 		if (UVM_ET_ISSUBMAP(ufi->entry)) {
1829ca5c6958Soga 			tmpmap = ufi->entry->object.sub_map;
18306bc7ce64Soga 			uvmfault_unlockmaps(ufi, write_lock);
1831ca5c6958Soga 			ufi->map = tmpmap;
1832ca5c6958Soga 			continue;
1833ca5c6958Soga 		}
1834ca5c6958Soga 
183552887a38Smpi 		/*
183652887a38Smpi 		 * got it!
183752887a38Smpi 		 */
1838ca5c6958Soga 		ufi->mapv = ufi->map->timestamp;
1839b9df1565Smpi 		return TRUE;
1840ca5c6958Soga 
184152887a38Smpi 	}	/* while loop */
184252887a38Smpi 
1843ca5c6958Soga 	/*NOTREACHED*/
1844ca5c6958Soga }
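
/*
 * Sketch of how the fault handler sets up a uvm_faultinfo before using
 * uvmfault_lookup() (illustrative; field names follow their use in this
 * file, error handling elided):
 *
 *	struct uvm_faultinfo ufi;
 *
 *	ufi.orig_map = map;
 *	ufi.orig_rvaddr = trunc_page(vaddr);
 *	ufi.orig_size = PAGE_SIZE;
 *	if (uvmfault_lookup(&ufi, FALSE) == FALSE)
 *		return EFAULT;			address not mapped
 *	... ufi.map, ufi.entry and ufi.mapv are now valid ...
 *	uvmfault_unlockmaps(&ufi, FALSE);
 */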
1845ca5c6958Soga 
1846ca5c6958Soga /*
1847ca5c6958Soga  * uvmfault_relock: attempt to relock the same version of the map
1848ca5c6958Soga  *
1849ca5c6958Soga  * => fault data structures should be unlocked before calling.
1850ca5c6958Soga  * => if a success (TRUE) maps will be locked after call.
1851ca5c6958Soga  */
1852ca5c6958Soga boolean_t
1853ca5c6958Soga uvmfault_relock(struct uvm_faultinfo *ufi)
1854ca5c6958Soga {
1855ca5c6958Soga 	/*
1856ca5c6958Soga 	 * ufi can be NULL when this isn't really a fault,
1857ca5c6958Soga 	 * but merely paging in anon data.
1858ca5c6958Soga 	 */
1859ca5c6958Soga 	if (ufi == NULL) {
1860ca5c6958Soga 		return TRUE;
1861ca5c6958Soga 	}
1862ca5c6958Soga 
1863627a59d1Smpi 	counters_inc(uvmexp_counters, flt_relck);
1864ca5c6958Soga 
1865ca5c6958Soga 	/*
1866ca5c6958Soga 	 * relock map.   fail if version mismatch (in which case nothing
1867ca5c6958Soga 	 * gets locked).
1868ca5c6958Soga 	 */
1869ca5c6958Soga 	vm_map_lock_read(ufi->map);
1870ca5c6958Soga 	if (ufi->mapv != ufi->map->timestamp) {
1871ca5c6958Soga 		vm_map_unlock_read(ufi->map);
1872b9df1565Smpi 		return FALSE;
1873ca5c6958Soga 	}
1874ca5c6958Soga 
1875627a59d1Smpi 	counters_inc(uvmexp_counters, flt_relckok);
1876b9df1565Smpi 	return TRUE;		/* got it! */
1877ca5c6958Soga }
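
/*
 * Typical unlock/sleep/relock pattern built on the helpers above, as
 * used by uvm_fault_lower_io() (illustrative sketch; cleanup of any
 * busy pages elided):
 *
 *	uvmfault_unlockall(ufi, amap, NULL);
 *	... block for I/O or memory ...
 *	if (uvmfault_relock(ufi) == FALSE)
 *		return ERESTART;	map changed, redo the fault
 *	if (amap != NULL)
 *		amap_lock(amap, RW_WRITE);
 */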
1878