xref: /openbsd-src/sys/uvm/uvm_pmemrange.c (revision 73b046f9615d79a30f562d33b6847cb1722e3d2a)
1*73b046f9Smpi /*	$OpenBSD: uvm_pmemrange.c,v 1.76 2024/11/08 15:54:33 mpi Exp $	*/
2a3544580Soga 
3a3544580Soga /*
482673a18Smpi  * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
5a3544580Soga  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
6a3544580Soga  *
7a3544580Soga  * Permission to use, copy, modify, and distribute this software for any
8a3544580Soga  * purpose with or without fee is hereby granted, provided that the above
9a3544580Soga  * copyright notice and this permission notice appear in all copies.
10a3544580Soga  *
11a3544580Soga  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12a3544580Soga  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13a3544580Soga  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14a3544580Soga  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15a3544580Soga  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16a3544580Soga  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17a3544580Soga  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18a3544580Soga  */
19a3544580Soga 
20a3544580Soga #include <sys/param.h>
2190ee2fe0Sbeck #include <sys/systm.h>
22a3544580Soga #include <uvm/uvm.h>
23a3544580Soga #include <sys/malloc.h>
2490ee2fe0Sbeck #include <sys/kernel.h>
2563cf9471Smpi #include <sys/proc.h>
260bd995e1Stedu #include <sys/mount.h>
27a3544580Soga 
28a3544580Soga /*
29a3544580Soga  * 2 trees: addr tree and size tree.
30a3544580Soga  *
31a3544580Soga  * The allocator keeps chunks of free pages (called a range).
32a3544580Soga  * Two pages are part of the same range if:
33a3544580Soga  * - all pages in between are part of that range,
34a3544580Soga  * - they are of the same memory type (zeroed or non-zeroed),
35a3544580Soga  * - they are part of the same pmemrange.
36a3544580Soga  * A pmemrange is a range of memory which is part of the same vm_physseg
37a3544580Soga  * and has a use-count.
38a3544580Soga  *
39a3544580Soga  * addr tree is vm_page[0].objt
40a3544580Soga  * size tree is vm_page[1].objt
41a3544580Soga  *
42a3544580Soga  * The size tree is not used for memory ranges of 1 page; instead,
43a3544580Soga  * the single-page queue is vm_page[0].pageq
44a3544580Soga  *
459bec9e43Sjsg  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
46a3544580Soga  * are joined, unless:
47a3544580Soga  * - they have pages in between them which are not free
48a3544580Soga  * - they belong to different memtypes (zeroed vs dirty memory)
49a3544580Soga  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
50a3544580Soga  * - they are not a continuation of the same array
51a3544580Soga  * The latter issue is caused by vm_physseg ordering and splitting from the
52a3544580Soga  * MD initialization machinery. The MD code is dependent on freelists and
53a3544580Soga  * happens to split ISA memory from non-ISA memory.
54a3544580Soga  * (Note: freelists die die die!)
55a3544580Soga  *
56a3544580Soga  * uvm_page_init guarantees that every vm_physseg contains an array of
57a3544580Soga  * struct vm_page. Also, uvm_page_physload allocates an array of struct
58a3544580Soga  * vm_page. This code depends on that array. The array may break across
59a3544580Soga  * vm_physseg boundaries.
60a3544580Soga  */
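/*
 * Illustrative example (editorial, not from the original source):
 * a zeroed free range covering pages [0x100, 0x104) and a zeroed
 * free range covering [0x104, 0x108) in the same pmemrange, backed
 * by the same vm_page array, are joined into one 8-page range.
 * If either range were dirty, in another pmemrange, or backed by a
 * different array, they would stay separate.
 */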
61a3544580Soga 
62a3544580Soga /*
63a3544580Soga  * Validate the flags of the page. (Used in asserts.)
64a3544580Soga  * Any free page must have the PQ_FREE flag set.
65a3544580Soga  * Free pages may be zeroed.
66a3544580Soga  * Pmap flags are left untouched.
67a3544580Soga  *
68a3544580Soga  * The PQ_FREE flag is not checked here: by not checking, we can easily use
69a3544580Soga  * this check on pages which are being freed.
70a3544580Soga  */
71a3544580Soga #define VALID_FLAGS(pg_flags)						\
721a0bb2c4Smiod 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
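/*
 * Example (illustrative): pg_flags == (PQ_FREE | PG_ZERO) passes this
 * check; a page that still carries e.g. PG_BUSY does not.
 */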
73a3544580Soga 
74a3544580Soga /* Tree comparators. */
753accc929Sdlg int	uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
763accc929Sdlg 	    const struct uvm_pmemrange *);
77a3544580Soga int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
78a3544580Soga int	uvm_pmr_pg_to_memtype(struct vm_page *);
79a3544580Soga 
80a3544580Soga #ifdef DDB
81a3544580Soga void	uvm_pmr_print(void);
82a3544580Soga #endif
83a3544580Soga 
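/*
 * Editorial note: returns non-zero when the caller runs in the
 * pagedaemon's context (or the syncer's, if allowsyncer is set), and
 * zero when called from interrupt context, where curproc is unrelated
 * to the allocation.
 */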
84d60475a6Smpi static inline int
85d60475a6Smpi in_pagedaemon(int allowsyncer)
86d60475a6Smpi {
87d60475a6Smpi 	if (curcpu()->ci_idepth > 0)
88d60475a6Smpi 		return 0;
89d60475a6Smpi 	if (curproc == uvm.pagedaemon_proc)
90d60475a6Smpi 		return 1;
91d60475a6Smpi 	/* XXX why is the syncer allowed to use the pagedaemon's reserve? */
92d60475a6Smpi 	if (allowsyncer && (curproc == syncerproc))
93d60475a6Smpi 		return 1;
94d60475a6Smpi 	return 0;
95d60475a6Smpi }
96d60475a6Smpi 
97a3544580Soga /*
98a3544580Soga  * Memory types. The page flags are used to derive what the current memory
99a3544580Soga  * type of a page is.
100a3544580Soga  */
101a3544580Soga int
102a3544580Soga uvm_pmr_pg_to_memtype(struct vm_page *pg)
103a3544580Soga {
104a3544580Soga 	if (pg->pg_flags & PG_ZERO)
105a3544580Soga 		return UVM_PMR_MEMTYPE_ZERO;
106a3544580Soga 	/* Default: dirty memory. */
107a3544580Soga 	return UVM_PMR_MEMTYPE_DIRTY;
108a3544580Soga }
109a3544580Soga 
110a3544580Soga /* Trees. */
111262a556aSdlg RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
112262a556aSdlg RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
1133accc929Sdlg RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
114a3544580Soga     uvm_pmemrange_addr_cmp);
115a3544580Soga 
116a3544580Soga /* Validation. */
117a3544580Soga #ifdef DEBUG
118a3544580Soga void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
119a3544580Soga #else
120a3544580Soga #define uvm_pmr_assertvalid(pmr)	do {} while (0)
121a3544580Soga #endif
122a3544580Soga 
123dba5c185Smiod psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
124ef440d08Skettenis 			    paddr_t, paddr_t, int);
125a3544580Soga 
126a3544580Soga struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
127a3544580Soga struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
128a3544580Soga struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
129a3544580Soga 			    struct vm_page *, int);
130a3544580Soga void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
131a3544580Soga 			    struct vm_page *pg, struct vm_page **pg_prev,
132a3544580Soga 			    struct vm_page **pg_next);
133a3544580Soga struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
134a3544580Soga 			    struct vm_page *, paddr_t);
1351e57804eSbeck struct vm_page		*uvm_pmr_findprevsegment(struct uvm_pmemrange *,
1361e57804eSbeck 			    struct vm_page *, paddr_t);
137a3544580Soga psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
138a3544580Soga 			    struct vm_page **, int);
1391e57804eSbeck psize_t			 uvm_pmr_remove_1strange_reverse(struct pglist *,
1401e57804eSbeck     			    paddr_t *);
141a3544580Soga void			 uvm_pmr_split(paddr_t);
142a3544580Soga struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
143a3544580Soga struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
144a3544580Soga 			    struct uvm_pmemrange *);
145a3544580Soga psize_t			 pow2divide(psize_t, psize_t);
146a3544580Soga struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
147a3544580Soga 			    struct vm_page *, paddr_t, paddr_t, int);
148a3544580Soga 
149a3544580Soga /*
150a3544580Soga  * Computes num/denom and rounds it up to the next power-of-2.
151a3544580Soga  *
152a3544580Soga  * This is a division function which calculates an approximation of
153a3544580Soga  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
154a3544580Soga  * have to be accurate.
155a3544580Soga  *
156a3544580Soga  * Returning too large a value makes the allocator slightly faster, at the
157a3544580Soga  * risk of hitting the failure case more often. Returning too small a value
158a3544580Soga  * makes the allocator a bit slower, but less likely to hit a failure case.
159a3544580Soga  */
160a3544580Soga psize_t
161a3544580Soga pow2divide(psize_t num, psize_t denom)
162a3544580Soga {
163a3544580Soga 	int rshift;
164a3544580Soga 
1654b9fb15fSoga 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
1664b9fb15fSoga 		;
167a3544580Soga 	return (paddr_t)1 << rshift;
168a3544580Soga }
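/*
 * Worked example (illustrative): pow2divide(5, 2):
 *   rshift=0, denom=2: 5 > 2  -> rshift=1, denom=4
 *   rshift=1, denom=4: 5 > 4  -> rshift=2, denom=8
 *   rshift=2, denom=8: 5 <= 8 -> stop
 * Result: 1 << 2 = 4, i.e. 5/2 = 2.5 rounded up to the next power of 2.
 */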
169a3544580Soga 
170a3544580Soga /*
171a3544580Soga  * Predicate: lhs is a subrange of rhs.
172a3544580Soga  *
173a3544580Soga  * If rhs_low == 0: don't care about lower bound.
174a3544580Soga  * If rhs_high == 0: don't care about upper bound.
175a3544580Soga  */
176a3544580Soga #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
177a3544580Soga 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
178a3544580Soga 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
179a3544580Soga 
180a3544580Soga /*
181a3544580Soga  * Predicate: lhs intersects with rhs.
182a3544580Soga  *
183a3544580Soga  * If rhs_low == 0: don't care about lower bound.
184a3544580Soga  * If rhs_high == 0: don't care about upper bound.
185a3544580Soga  * Ranges don't intersect if they don't have any page in common; array
186a3544580Soga  * semantics (exclusive high bound) mean that < instead of <= is used here.
187a3544580Soga  */
188a3544580Soga #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
189a3544580Soga 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
190a3544580Soga 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
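/*
 * Examples (illustrative), with ranges expressed as [low, high):
 *   PMR_IS_SUBRANGE_OF(4, 8, 2, 16)  -> true  (fully contained)
 *   PMR_INTERSECTS_WITH(4, 8, 7, 16) -> true  (page 7 is shared)
 *   PMR_INTERSECTS_WITH(4, 8, 8, 16) -> false (the high bound is exclusive)
 */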
191a3544580Soga 
192a3544580Soga /*
193a3544580Soga  * Align to power-of-2 alignment.
194a3544580Soga  */
195a3544580Soga #define PMR_ALIGN(pgno, align)						\
196a3544580Soga 	(((pgno) + ((align) - 1)) & ~((align) - 1))
1971e57804eSbeck #define PMR_ALIGN_DOWN(pgno, align)					\
1981e57804eSbeck 	((pgno) & ~((align) - 1))
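/*
 * Examples (illustrative; align must be a power of 2):
 *   PMR_ALIGN(5, 4) == 8, PMR_ALIGN(8, 4) == 8, PMR_ALIGN_DOWN(5, 4) == 4
 */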
199a3544580Soga 
200a3544580Soga 
201a3544580Soga /*
202a3544580Soga  * Comparator: sort by address ascending.
203a3544580Soga  */
204a3544580Soga int
2053accc929Sdlg uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
2063accc929Sdlg     const struct uvm_pmemrange *rhs)
207a3544580Soga {
208a3544580Soga 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
209a3544580Soga }
210a3544580Soga 
211a3544580Soga /*
212a3544580Soga  * Comparator: sort by use ascending.
213a3544580Soga  *
214a3544580Soga  * The higher the use value of a range, the more devices need memory in
2157a3af63bStb  * this range. Therefore allocate from the range with the lowest use first.
216a3544580Soga  */
217a3544580Soga int
218a3544580Soga uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
219a3544580Soga {
220a3544580Soga 	int result;
221a3544580Soga 
222a3544580Soga 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
223a3544580Soga 	if (result == 0)
224a3544580Soga 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
225a3544580Soga 	return result;
226a3544580Soga }
227a3544580Soga 
228a3544580Soga int
229262a556aSdlg uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
230a3544580Soga {
231a3544580Soga 	paddr_t lhs_addr, rhs_addr;
232a3544580Soga 
233a3544580Soga 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
234a3544580Soga 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
235a3544580Soga 
236a3544580Soga 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
237a3544580Soga }
238a3544580Soga 
239a3544580Soga int
240262a556aSdlg uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
241a3544580Soga {
242a3544580Soga 	psize_t lhs_size, rhs_size;
243a3544580Soga 	int cmp;
244a3544580Soga 
245a3544580Soga 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
246a3544580Soga 	lhs_size = (lhs - 1)->fpgsz;
247a3544580Soga 	rhs_size = (rhs - 1)->fpgsz;
248a3544580Soga 
249a3544580Soga 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
250a3544580Soga 	if (cmp == 0)
251a3544580Soga 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
252a3544580Soga 	return cmp;
253a3544580Soga }
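/*
 * Editorial note: the size tree links a range through its second page
 * (vm_page[1].objt, see the comment at the top of this file), because
 * vm_page[0].objt is already taken by the address tree.  Hence the
 * (lhs - 1)/(rhs - 1) adjustments above and the pg + 1 insertions below.
 */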
254a3544580Soga 
255a3544580Soga /*
256a3544580Soga  * Find the first range of free pages that is at least sz pages long.
257a3544580Soga  */
258a3544580Soga struct vm_page *
259a3544580Soga uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
260a3544580Soga {
261a3544580Soga 	struct	vm_page *node, *best;
262a3544580Soga 
263a3544580Soga 	KASSERT(sz >= 1);
264a3544580Soga 
265a3544580Soga 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
266a3544580Soga 		return TAILQ_FIRST(&pmr->single[mti]);
267a3544580Soga 
268262a556aSdlg 	node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
269a3544580Soga 	best = NULL;
270a3544580Soga 	while (node != NULL) {
271a3544580Soga 		if ((node - 1)->fpgsz >= sz) {
272a3544580Soga 			best = (node - 1);
273262a556aSdlg 			node = RBT_LEFT(uvm_objtree, node);
274a3544580Soga 		} else
275262a556aSdlg 			node = RBT_RIGHT(uvm_objtree, node);
276a3544580Soga 	}
277a3544580Soga 	return best;
278a3544580Soga }
279a3544580Soga 
280a3544580Soga /*
281a3544580Soga  * Finds the next range. The next range has a size >= pg->fpgsz.
282a3544580Soga  * Returns NULL if no more ranges are available.
283a3544580Soga  */
284a3544580Soga struct vm_page *
285a3544580Soga uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
286a3544580Soga {
287a3544580Soga 	struct vm_page *npg;
288a3544580Soga 
289a3544580Soga 	KASSERT(pmr != NULL && pg != NULL);
290a3544580Soga 	if (pg->fpgsz == 1) {
291a3544580Soga 		if (TAILQ_NEXT(pg, pageq) != NULL)
292a3544580Soga 			return TAILQ_NEXT(pg, pageq);
293a3544580Soga 		else
294262a556aSdlg 			npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
295a3544580Soga 	} else
296262a556aSdlg 		npg = RBT_NEXT(uvm_pmr_size, pg + 1);
297a3544580Soga 
298a3544580Soga 	return npg == NULL ? NULL : npg - 1;
299a3544580Soga }
300a3544580Soga 
301a3544580Soga /*
302a3544580Soga  * Finds the previous and next ranges relative to the (uninserted) pg range.
303a3544580Soga  *
304a3544580Soga  * *pg_prev == NULL if no previous range that can join with pg is
305a3544580Soga  * 	available.
306a3544580Soga  * *pg_next == NULL if no next range that can join with pg is
307a3544580Soga  * 	available.
308a3544580Soga  */
309a3544580Soga void
310a3544580Soga uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
311a3544580Soga     struct vm_page **pg_prev, struct vm_page **pg_next)
312a3544580Soga {
313a3544580Soga 	KASSERT(pg_prev != NULL && pg_next != NULL);
314a3544580Soga 
315262a556aSdlg 	*pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
316a3544580Soga 	if (*pg_next == NULL)
317262a556aSdlg 		*pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
318a3544580Soga 	else
319262a556aSdlg 		*pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
320a3544580Soga 
321a3544580Soga 	KDASSERT(*pg_next == NULL ||
322a3544580Soga 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
323a3544580Soga 	KDASSERT(*pg_prev == NULL ||
324a3544580Soga 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
325a3544580Soga 
326a3544580Soga 	/* Reset if not contig. */
327a3544580Soga 	if (*pg_prev != NULL &&
328a3544580Soga 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
329a3544580Soga 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
330a3544580Soga 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
331a3544580Soga 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
332a3544580Soga 		*pg_prev = NULL;
333a3544580Soga 	if (*pg_next != NULL &&
334a3544580Soga 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
335a3544580Soga 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
336a3544580Soga 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
337a3544580Soga 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
338a3544580Soga 		*pg_next = NULL;
339a3544580Soga 	return;
340a3544580Soga }
341a3544580Soga 
342a3544580Soga /*
343a3544580Soga  * Remove a range from the address tree.
344a3544580Soga  * Address tree maintains pmr counters.
345a3544580Soga  */
346a3544580Soga void
347a3544580Soga uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
348a3544580Soga {
349262a556aSdlg 	KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
350a3544580Soga 	KDASSERT(pg->pg_flags & PQ_FREE);
351262a556aSdlg 	RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
352a3544580Soga 
353a3544580Soga 	pmr->nsegs--;
354a3544580Soga }
355a3544580Soga /*
356a3544580Soga  * Remove a range from the size tree.
357a3544580Soga  */
358a3544580Soga void
359a3544580Soga uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
360a3544580Soga {
361a3544580Soga 	int memtype;
362a3544580Soga #ifdef DEBUG
363a3544580Soga 	struct vm_page *i;
364a3544580Soga #endif
365a3544580Soga 
366a3544580Soga 	KDASSERT(pg->fpgsz >= 1);
367a3544580Soga 	KDASSERT(pg->pg_flags & PQ_FREE);
368a3544580Soga 	memtype = uvm_pmr_pg_to_memtype(pg);
369a3544580Soga 
370a3544580Soga 	if (pg->fpgsz == 1) {
371a3544580Soga #ifdef DEBUG
372a3544580Soga 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
373a3544580Soga 			if (i == pg)
374a3544580Soga 				break;
375a3544580Soga 		}
376a3544580Soga 		KDASSERT(i == pg);
377a3544580Soga #endif
378a3544580Soga 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
379a3544580Soga 	} else {
3806d0d0baaSdlg 		KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
381a3544580Soga 		    pg + 1) == pg + 1);
382262a556aSdlg 		RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
383a3544580Soga 	}
384a3544580Soga }
385a3544580Soga /* Remove from both trees. */
386a3544580Soga void
387a3544580Soga uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
388a3544580Soga {
389a3544580Soga 	uvm_pmr_assertvalid(pmr);
390a3544580Soga 	uvm_pmr_remove_size(pmr, pg);
391a3544580Soga 	uvm_pmr_remove_addr(pmr, pg);
392a3544580Soga 	uvm_pmr_assertvalid(pmr);
393a3544580Soga }
394a3544580Soga 
395a3544580Soga /*
396a3544580Soga  * Insert the range described in pg.
397a3544580Soga  * Returns the range thus created (which may be joined with the previous and
398a3544580Soga  * next ranges).
399a3544580Soga  * If no_join, the caller guarantees that the range cannot possibly join
4009bec9e43Sjsg  * with adjacent ranges.
401a3544580Soga  */
402a3544580Soga struct vm_page *
403a3544580Soga uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
404a3544580Soga {
405a3544580Soga 	struct vm_page *prev, *next;
406a3544580Soga 
407a3544580Soga #ifdef DEBUG
408a3544580Soga 	struct vm_page *i;
409a3544580Soga 	int mt;
410a3544580Soga #endif
411a3544580Soga 
412a3544580Soga 	KDASSERT(pg->pg_flags & PQ_FREE);
413a3544580Soga 	KDASSERT(pg->fpgsz >= 1);
414a3544580Soga 
415a3544580Soga #ifdef DEBUG
416a3544580Soga 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
417a3544580Soga 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
418a3544580Soga 			KDASSERT(i != pg);
419a3544580Soga 		if (pg->fpgsz > 1) {
4206d0d0baaSdlg 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
421a3544580Soga 			    pg + 1) == NULL);
422a3544580Soga 		}
4236d0d0baaSdlg 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
424a3544580Soga 	}
425a3544580Soga #endif
426a3544580Soga 
427a3544580Soga 	if (!no_join) {
428a3544580Soga 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
429a3544580Soga 		if (next != NULL) {
430a3544580Soga 			uvm_pmr_remove_size(pmr, next);
431a3544580Soga 			uvm_pmr_remove_addr(pmr, next);
432a3544580Soga 			pg->fpgsz += next->fpgsz;
433a3544580Soga 			next->fpgsz = 0;
434a3544580Soga 		}
435a3544580Soga 		if (prev != NULL) {
436a3544580Soga 			uvm_pmr_remove_size(pmr, prev);
437a3544580Soga 			prev->fpgsz += pg->fpgsz;
438a3544580Soga 			pg->fpgsz = 0;
439a3544580Soga 			return prev;
440a3544580Soga 		}
441a3544580Soga 	}
442a3544580Soga 
443262a556aSdlg 	RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
444a3544580Soga 
445a3544580Soga 	pmr->nsegs++;
446a3544580Soga 
447a3544580Soga 	return pg;
448a3544580Soga }
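/*
 * Illustrative example: inserting a 4-page range starting at page 104
 * while a free range [100, 104) of the same memtype sits directly below
 * it merges the two: prev->fpgsz becomes 8 and prev is returned.
 */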
449a3544580Soga /*
450a3544580Soga  * Insert the range described in pg into the size tree (or, for a
451a3544580Soga  * 1-page range, into the single-page queue).
452a3544580Soga  * The page must already be in the address tree.
454a3544580Soga  */
455a3544580Soga void
456a3544580Soga uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
457a3544580Soga {
458a3544580Soga 	int memtype;
459a3544580Soga #ifdef DEBUG
460a3544580Soga 	struct vm_page *i;
461a3544580Soga 	int mti;
462a3544580Soga #endif
463a3544580Soga 
464a3544580Soga 	KDASSERT(pg->fpgsz >= 1);
465a3544580Soga 	KDASSERT(pg->pg_flags & PQ_FREE);
466a3544580Soga 
467a3544580Soga 	memtype = uvm_pmr_pg_to_memtype(pg);
468a3544580Soga #ifdef DEBUG
469a3544580Soga 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
470a3544580Soga 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
471a3544580Soga 			KDASSERT(i != pg);
472a3544580Soga 		if (pg->fpgsz > 1) {
4736d0d0baaSdlg 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
474a3544580Soga 			    pg + 1) == NULL);
475a3544580Soga 		}
4766d0d0baaSdlg 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
477a3544580Soga 	}
478a3544580Soga 	for (i = pg; i < pg + pg->fpgsz; i++)
479a3544580Soga 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
480a3544580Soga #endif
481a3544580Soga 
482a3544580Soga 	if (pg->fpgsz == 1)
483a3544580Soga 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
484a3544580Soga 	else
485262a556aSdlg 		RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
486a3544580Soga }
487a3544580Soga /* Insert in both trees. */
488a3544580Soga struct vm_page *
489a3544580Soga uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
490a3544580Soga {
491a3544580Soga 	uvm_pmr_assertvalid(pmr);
492a3544580Soga 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
493a3544580Soga 	uvm_pmr_insert_size(pmr, pg);
494a3544580Soga 	uvm_pmr_assertvalid(pmr);
495a3544580Soga 	return pg;
496a3544580Soga }
497a3544580Soga 
498a3544580Soga /*
499a3544580Soga  * Find the last page that is part of this segment.
500a3544580Soga  * => pg: the range at which to start the search.
501a3544580Soga  * => boundary: the page number boundary specification (0 = no boundary).
502a3544580Soga  * => pmr: the pmemrange of the page.
503a3544580Soga  *
504a3544580Soga  * This function returns the page 1 before the next range; to get the
505a3544580Soga  * next range, you need to run TAILQ_NEXT(result, pageq) after calling.
506a3544580Soga  * The reason is that this way, the length of the segment is easily
507a3544580Soga  * calculated using: atop(result) - atop(pg) + 1.
508a3544580Soga  * Hence this function also never returns NULL.
509a3544580Soga  */
510a3544580Soga struct vm_page *
511a3544580Soga uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
512a3544580Soga     struct vm_page *pg, paddr_t boundary)
513a3544580Soga {
514a3544580Soga 	paddr_t	first_boundary;
515a3544580Soga 	struct	vm_page *next;
516a3544580Soga 	struct	vm_page *prev;
517a3544580Soga 
518a3544580Soga 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
519a3544580Soga 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
520a3544580Soga 	if (boundary != 0) {
521a3544580Soga 		first_boundary =
522a3544580Soga 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
523a3544580Soga 	} else
524a3544580Soga 		first_boundary = 0;
525a3544580Soga 
526a3544580Soga 	/*
527a3544580Soga 	 * Increase next until it hits the first page of the next segment.
528a3544580Soga 	 *
529a3544580Soga 	 * While loop checks the following:
530a3544580Soga 	 * - next != NULL	we have not reached the end of pgl
531a3544580Soga 	 * - boundary == 0 || next < first_boundary
532a3544580Soga 	 *			we do not cross a boundary
533a3544580Soga 	 * - atop(prev) + 1 == atop(next)
534a3544580Soga 	 *			still in the same segment
535a3544580Soga 	 * - low <= last
536a3544580Soga 	 * - high > last	still in the same memory range
537a3544580Soga 	 * - memtype is equal	allocator is unable to view different memtypes
538a3544580Soga 	 *			as part of the same segment
539a3544580Soga 	 * - prev + 1 == next	no array breakage occurs
540a3544580Soga 	 */
541a3544580Soga 	prev = pg;
542a3544580Soga 	next = TAILQ_NEXT(prev, pageq);
543a3544580Soga 	while (next != NULL &&
544a3544580Soga 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
545a3544580Soga 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
546a3544580Soga 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
547a3544580Soga 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
548a3544580Soga 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
549a3544580Soga 	    prev + 1 == next) {
550a3544580Soga 		prev = next;
551a3544580Soga 		next = TAILQ_NEXT(prev, pageq);
552a3544580Soga 	}
553a3544580Soga 
554a3544580Soga 	/*
555a3544580Soga 	 * End of this segment.
556a3544580Soga 	 */
557a3544580Soga 	return prev;
558a3544580Soga }
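/*
 * Illustrative example (one pmemrange, one memtype): if the list holds
 * pages 10, 11, 12, 30 in that order and pg points at page 10, this
 * returns page 12; the segment length is 12 - 10 + 1 = 3 and
 * TAILQ_NEXT(result, pageq) is page 30.
 */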
559a3544580Soga 
560a3544580Soga /*
5611e57804eSbeck  * Find the first page that is part of this segment.
5621e57804eSbeck  * => pg: the range at which to start the search.
5631e57804eSbeck  * => boundary: the page number boundary specification (0 = no boundary).
5641e57804eSbeck  * => pmr: the pmemrange of the page.
5651e57804eSbeck  *
5661e57804eSbeck  * This function returns the page 1 after the previous range; to get the
5671e57804eSbeck  * previous range, you need to run TAILQ_NEXT(result, pageq) after calling.
5681e57804eSbeck  * The reason is that this way, the length of the segment is easily
5691e57804eSbeck  * calculated using: atop(pg) - atop(result) + 1.
5701e57804eSbeck  * Hence this function also never returns NULL.
5711e57804eSbeck  */
5721e57804eSbeck struct vm_page *
5731e57804eSbeck uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
5741e57804eSbeck     struct vm_page *pg, paddr_t boundary)
5751e57804eSbeck {
5761e57804eSbeck 	paddr_t	first_boundary;
5771e57804eSbeck 	struct	vm_page *next;
5781e57804eSbeck 	struct	vm_page *prev;
5791e57804eSbeck 
5801e57804eSbeck 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
5811e57804eSbeck 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
5821e57804eSbeck 	if (boundary != 0) {
5831e57804eSbeck 		first_boundary =
5841e57804eSbeck 		    PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
5851e57804eSbeck 	} else
5861e57804eSbeck 		first_boundary = 0;
5871e57804eSbeck 
5881e57804eSbeck 	/*
5891e57804eSbeck 	 * Advance next until it hits the first page of the previous segment.
5901e57804eSbeck 	 *
5911e57804eSbeck 	 * While loop checks the following:
5921e57804eSbeck 	 * - next != NULL	we have not reached the end of pgl
5931e57804eSbeck 	 * - boundary == 0 || next >= first_boundary
5941e57804eSbeck 	 *			we do not cross a boundary
5951e57804eSbeck 	 * - atop(prev) - 1 == atop(next)
5961e57804eSbeck 	 *			still in the same segment
5971e57804eSbeck 	 * - low <= last
5981e57804eSbeck 	 * - high > last	still in the same memory range
5991e57804eSbeck 	 * - memtype is equal	allocator is unable to view different memtypes
6001e57804eSbeck 	 *			as part of the same segment
6011e57804eSbeck 	 * - prev - 1 == next	no array breakage occurs
6021e57804eSbeck 	 */
6031e57804eSbeck 	prev = pg;
6041e57804eSbeck 	next = TAILQ_NEXT(prev, pageq);
6051e57804eSbeck 	while (next != NULL &&
6061e57804eSbeck 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
6071e57804eSbeck 	    atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
6081e57804eSbeck 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
6091e57804eSbeck 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
6101e57804eSbeck 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
6111e57804eSbeck 	    prev - 1 == next) {
6121e57804eSbeck 		prev = next;
6131e57804eSbeck 		next = TAILQ_NEXT(prev, pageq);
6141e57804eSbeck 	}
6151e57804eSbeck 
6161e57804eSbeck 	/*
6171e57804eSbeck 	 * Start of this segment.
6181e57804eSbeck 	 */
6191e57804eSbeck 	return prev;
6201e57804eSbeck }
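/*
 * Illustrative example (one pmemrange, one memtype): if the list holds
 * pages 12, 11, 10, 5 in reverse physical order and pg points at page
 * 12, this returns page 10; the segment length is 12 - 10 + 1 = 3.
 */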
6211e57804eSbeck 
6221e57804eSbeck /*
623a3544580Soga  * Remove the first segment of contiguous pages from pgl.
624a3544580Soga  * A segment ends if it crosses boundary (unless boundary = 0) or
625a3544580Soga  * if it would enter a different uvm_pmemrange.
626a3544580Soga  *
627a3544580Soga  * Work: the page range that the caller is currently working with.
628a3544580Soga  * May be NULL.
629a3544580Soga  *
630a3544580Soga  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
631a3544580Soga  * the first segment is erased (which, if called by uvm_pmr_getpages(),
632a3544580Soga  * probably is the smallest or very close to it).
633a3544580Soga  */
634a3544580Soga psize_t
635a3544580Soga uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
636a3544580Soga     struct vm_page **work, int is_desperate)
637a3544580Soga {
6381e57804eSbeck 	struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
639a3544580Soga 	psize_t count;
640a3544580Soga 	struct uvm_pmemrange *pmr, *pmr_iter;
641a3544580Soga 
642a3544580Soga 	KASSERT(!TAILQ_EMPTY(pgl));
643a3544580Soga 
644a3544580Soga 	/*
645a3544580Soga 	 * Initialize to first page.
646a3544580Soga 	 * Unless the desperate scan finds a better candidate, this is what'll be
647a3544580Soga 	 * erased.
648a3544580Soga 	 */
649a3544580Soga 	start = TAILQ_FIRST(pgl);
650a3544580Soga 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
651a3544580Soga 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
652a3544580Soga 
653a3544580Soga 	/*
654a3544580Soga 	 * If we are desperate, we _really_ want to get rid of the smallest
655a3544580Soga 	 * element (rather than a close match to the smallest element).
656a3544580Soga 	 */
657a3544580Soga 	if (is_desperate) {
658d3b3d34dSthib 		/* Linear search for smallest segment. */
659a3544580Soga 		pmr_iter = pmr;
660a3544580Soga 		for (iter = TAILQ_NEXT(end, pageq);
661a3544580Soga 		    iter != NULL && start != end;
662a3544580Soga 		    iter = TAILQ_NEXT(iter_end, pageq)) {
663a3544580Soga 			/*
664a3544580Soga 			 * Only update pmr if it doesn't match current
665a3544580Soga 			 * iteration.
666a3544580Soga 			 */
667a3544580Soga 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
668a3544580Soga 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
669a3544580Soga 				pmr_iter = uvm_pmemrange_find(atop(
670a3544580Soga 				    VM_PAGE_TO_PHYS(iter)));
671a3544580Soga 			}
672a3544580Soga 
673a3544580Soga 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
674a3544580Soga 			    boundary);
675a3544580Soga 
676a3544580Soga 			/*
677a3544580Soga 			 * Current iteration is smaller than best match so
678a3544580Soga 			 * far; update.
679a3544580Soga 			 */
680a3544580Soga 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
681a3544580Soga 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
682a3544580Soga 				start = iter;
683a3544580Soga 				end = iter_end;
684a3544580Soga 				pmr = pmr_iter;
685a3544580Soga 			}
686a3544580Soga 		}
687a3544580Soga 	}
688a3544580Soga 
689a3544580Soga 	/*
690a3544580Soga 	 * Calculate count and end of the list.
691a3544580Soga 	 */
692a3544580Soga 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
6931e57804eSbeck 	lowest = start;
694a3544580Soga 	end = TAILQ_NEXT(end, pageq);
695a3544580Soga 
696a3544580Soga 	/*
697a3544580Soga 	 * Actually remove the range of pages.
698a3544580Soga 	 *
699a3544580Soga 	 * Sadly, this cannot be done using pointer iteration:
700a3544580Soga 	 * vm_physseg is not guaranteed to be sorted on address, hence
701a3544580Soga 	 * uvm_page_init() may not have initialized its array sorted by
702a3544580Soga 	 * page number.
703a3544580Soga 	 */
704a3544580Soga 	for (iter = start; iter != end; iter = iter_end) {
705a3544580Soga 		iter_end = TAILQ_NEXT(iter, pageq);
706a3544580Soga 		TAILQ_REMOVE(pgl, iter, pageq);
707a3544580Soga 	}
708a3544580Soga 
7091e57804eSbeck 	lowest->fpgsz = count;
7101e57804eSbeck 	inserted = uvm_pmr_insert(pmr, lowest, 0);
711a3544580Soga 
712a3544580Soga 	/*
713a3544580Soga 	 * If the caller was working on a range and this function modified
714a3544580Soga 	 * that range, update the pointer.
715a3544580Soga 	 */
716a3544580Soga 	if (work != NULL && *work != NULL &&
717a3544580Soga 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
718a3544580Soga 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
719a3544580Soga 	    atop(VM_PAGE_TO_PHYS(*work)))
720a3544580Soga 		*work = inserted;
721a3544580Soga 	return count;
722a3544580Soga }
723a3544580Soga 
724a3544580Soga /*
7251e57804eSbeck  * Remove the first segment of contiguous pages from a pgl
7261e57804eSbeck  * with the list elements in reverse order of physaddr.
7271e57804eSbeck  *
7281e57804eSbeck  * A segment ends if it would enter a different uvm_pmemrange.
7291e57804eSbeck  *
7301e57804eSbeck  * Stores starting physical address of the segment in pstart.
7311e57804eSbeck  */
7321e57804eSbeck psize_t
7331e57804eSbeck uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
7341e57804eSbeck {
7351e57804eSbeck 	struct vm_page *start, *end, *iter, *iter_end, *lowest;
7361e57804eSbeck 	psize_t count;
7371e57804eSbeck 	struct uvm_pmemrange *pmr;
7381e57804eSbeck 
7391e57804eSbeck 	KASSERT(!TAILQ_EMPTY(pgl));
7401e57804eSbeck 
7411e57804eSbeck 	start = TAILQ_FIRST(pgl);
7421e57804eSbeck 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
7431e57804eSbeck 	end = uvm_pmr_findprevsegment(pmr, start, 0);
7441e57804eSbeck 
7451e57804eSbeck 	KASSERT(end <= start);
7461e57804eSbeck 
7471e57804eSbeck 	/*
7481e57804eSbeck 	 * Calculate count and end of the list.
7491e57804eSbeck 	 */
7501e57804eSbeck 	count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
7511e57804eSbeck 	lowest = end;
7521e57804eSbeck 	end = TAILQ_NEXT(end, pageq);
7531e57804eSbeck 
7541e57804eSbeck 	/*
7551e57804eSbeck 	 * Actually remove the range of pages.
7561e57804eSbeck 	 *
7571e57804eSbeck 	 * Sadly, this cannot be done using pointer iteration:
7581e57804eSbeck 	 * vm_physseg is not guaranteed to be sorted on address, hence
7591e57804eSbeck 	 * uvm_page_init() may not have initialized its array sorted by
7601e57804eSbeck 	 * page number.
7611e57804eSbeck 	 */
7621e57804eSbeck 	for (iter = start; iter != end; iter = iter_end) {
7631e57804eSbeck 		iter_end = TAILQ_NEXT(iter, pageq);
7641e57804eSbeck 		TAILQ_REMOVE(pgl, iter, pageq);
7651e57804eSbeck 	}
7661e57804eSbeck 
7671e57804eSbeck 	lowest->fpgsz = count;
7681e57804eSbeck 	(void) uvm_pmr_insert(pmr, lowest, 0);
7691e57804eSbeck 
7701e57804eSbeck 	*pstart = VM_PAGE_TO_PHYS(lowest);
7711e57804eSbeck 	return count;
7721e57804eSbeck }
7731e57804eSbeck 
7741e57804eSbeck /*
775a3544580Soga  * Extract a number of pages from a segment of free pages.
776a3544580Soga  * Called by uvm_pmr_getpages.
777a3544580Soga  *
778a3544580Soga  * Returns the segment that was created from pages left over at the tail
779a3544580Soga  * of the removed set of pages, or NULL if no pages were left at the tail.
780a3544580Soga  */
781a3544580Soga struct vm_page *
782a3544580Soga uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
783a3544580Soga     paddr_t start, paddr_t end, struct pglist *result)
784a3544580Soga {
785a3544580Soga 	struct vm_page *after, *pg_i;
786a3544580Soga 	psize_t before_sz, after_sz;
787a3544580Soga #ifdef DEBUG
788a3544580Soga 	psize_t i;
789a3544580Soga #endif
790a3544580Soga 
791a3544580Soga 	KDASSERT(end > start);
792a3544580Soga 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
793a3544580Soga 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
794a3544580Soga 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
795a3544580Soga 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
796a3544580Soga 
797a3544580Soga 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
798a3544580Soga 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
799a3544580Soga 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
800a3544580Soga 	uvm_pmr_assertvalid(pmr);
801a3544580Soga 
802a3544580Soga 	uvm_pmr_remove_size(pmr, pg);
803a3544580Soga 	if (before_sz == 0)
804a3544580Soga 		uvm_pmr_remove_addr(pmr, pg);
80556db1aa6Sariane 	after = pg + before_sz + (end - start);
806a3544580Soga 
807a3544580Soga 	/* Add selected pages to result. */
80856db1aa6Sariane 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
809c6b41277Sariane 		KASSERT(pg_i->pg_flags & PQ_FREE);
810a3544580Soga 		pg_i->fpgsz = 0;
811a3544580Soga 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
812a3544580Soga 	}
813a3544580Soga 
814a3544580Soga 	/* Before handling. */
815a3544580Soga 	if (before_sz > 0) {
816a3544580Soga 		pg->fpgsz = before_sz;
817a3544580Soga 		uvm_pmr_insert_size(pmr, pg);
818a3544580Soga 	}
819a3544580Soga 
820a3544580Soga 	/* After handling. */
821a3544580Soga 	if (after_sz > 0) {
822a3544580Soga #ifdef DEBUG
823a3544580Soga 		for (i = 0; i < after_sz; i++) {
824a3544580Soga 			KASSERT(!uvm_pmr_isfree(after + i));
825a3544580Soga 		}
826a3544580Soga #endif
827a3544580Soga 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
828a3544580Soga 		after->fpgsz = after_sz;
829a3544580Soga 		after = uvm_pmr_insert_addr(pmr, after, 1);
830a3544580Soga 		uvm_pmr_insert_size(pmr, after);
831a3544580Soga 	}
832a3544580Soga 
833a3544580Soga 	uvm_pmr_assertvalid(pmr);
83456db1aa6Sariane 	return (after_sz > 0 ? after : NULL);
835a3544580Soga }
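/*
 * Worked example (illustrative): if pg spans pages [100, 110) and
 * start=102, end=105, then before_sz = 2 and after_sz = 5: pages
 * 102-104 are put on result, pg is re-inserted as [100, 102) and a
 * new free range [105, 110) is created and returned.
 */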
836a3544580Soga 
837a3544580Soga /*
838a3544580Soga  * Acquire a number of pages.
839a3544580Soga  *
840a3544580Soga  * count:	the number of pages requested
841a3544580Soga  * start:	lowest page number
842a3544580Soga  * end:		highest page number +1
843a3544580Soga  * 		(start = end = 0: no limitation)
844a3544580Soga  * align:	power-of-2 alignment constraint (align = 1: no alignment)
845a3544580Soga  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
846a3544580Soga  * maxseg:	maximum number of segments to return
847a3544580Soga  * flags:	UVM_PLA_* flags
848a3544580Soga  * result:	returned pages storage (uses pageq)
849a3544580Soga  */
850a3544580Soga int
851a3544580Soga uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
852a3544580Soga     paddr_t boundary, int maxseg, int flags, struct pglist *result)
853a3544580Soga {
854a3544580Soga 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
855a3544580Soga 	struct	vm_page *found, *f_next; /* Iterate chunks. */
856a3544580Soga 	psize_t	fcount;			/* Current found pages. */
857a3544580Soga 	int	fnsegs;			/* Current segment counter. */
858a3544580Soga 	int	try, start_try;
859a3544580Soga 	psize_t	search[3];
860a3544580Soga 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
861a3544580Soga 	int	memtype;		/* Requested memtype. */
862a3544580Soga 	int	memtype_init;		/* Best memtype. */
863a3544580Soga 	int	desperate;		/* True if allocation failed. */
864a19ffaa9Sariane #ifdef DIAGNOSTIC
865a19ffaa9Sariane 	struct	vm_page *diag_prev;	/* Used during validation. */
866a19ffaa9Sariane #endif /* DIAGNOSTIC */
867a3544580Soga 
868a3544580Soga 	/*
869a3544580Soga 	 * Validate arguments.
870a3544580Soga 	 */
8716aa35f1fSbeck 	KASSERT(count > 0);
8726aa35f1fSbeck 	KASSERT(start == 0 || end == 0 || start < end);
8736aa35f1fSbeck 	KASSERT(align >= 1);
8746aa35f1fSbeck 	KASSERT(powerof2(align));
8756aa35f1fSbeck 	KASSERT(maxseg > 0);
8766aa35f1fSbeck 	KASSERT(boundary == 0 || powerof2(boundary));
8776aa35f1fSbeck 	KASSERT(boundary == 0 || maxseg * boundary >= count);
8786aa35f1fSbeck 	KASSERT(TAILQ_EMPTY(result));
87903c020efSmpi 	KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
880a3544580Soga 
881a3544580Soga 	/*
882a3544580Soga 	 * TRYCONTIG is a noop if you only want a single segment.
883a3544580Soga 	 * Remove it if that's the case: otherwise it'll deny the fast
884a3544580Soga 	 * allocation.
885a3544580Soga 	 */
886a3544580Soga 	if (maxseg == 1 || count == 1)
887a3544580Soga 		flags &= ~UVM_PLA_TRYCONTIG;
888a3544580Soga 
889a3544580Soga 	/*
890a3544580Soga 	 * Configure search.
891a3544580Soga 	 *
892a3544580Soga 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
893a3544580Soga 	 * search[1] is multiple segments, chosen to fulfill the search in
894a3544580Soga 	 *   approximately even-sized segments.
895a3544580Soga 	 *   This is a good trade-off between slightly reduced allocation speed
896a3544580Soga 	 *   and less fragmentation.
897a3544580Soga 	 * search[2] is the worst case, in which all segments are evaluated.
898a3544580Soga 	 *   This provides the least fragmentation, but makes the search
899a3544580Soga 	 *   possibly longer (although in the case where it is selected,
900a3544580Soga 	 *   that hardly matters anymore).
901a3544580Soga 	 *
902a3544580Soga 	 * The exception is when maxseg == 1: since we can only fulfill that
903a3544580Soga 	 * with one segment of size pages, only a single search type has to
904a3544580Soga 	 * be attempted.
905a3544580Soga 	 */
906a3544580Soga 	if (maxseg == 1 || count == 1) {
907a3544580Soga 		start_try = 2;
908a3544580Soga 		search[2] = count;
909a3544580Soga 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
910a3544580Soga 		start_try = 2;
911a3544580Soga 		search[2] = 1;
912a3544580Soga 	} else {
913a3544580Soga 		start_try = 0;
914a3544580Soga 		search[0] = count;
915a3544580Soga 		search[1] = pow2divide(count, maxseg);
916a3544580Soga 		search[2] = 1;
917a3544580Soga 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
918a3544580Soga 			start_try = 1;
919a3544580Soga 		if (search[1] >= search[0]) {
920a3544580Soga 			search[1] = search[0];
921a3544580Soga 			start_try = 1;
922a3544580Soga 		}
923a3544580Soga 		if (search[2] >= search[start_try]) {
924a3544580Soga 			start_try = 2;
925a3544580Soga 		}
926a3544580Soga 	}
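	/*
	 * Worked example (illustrative): count=16, maxseg=4 with
	 * UVM_PLA_TRYCONTIG set yields search[] = { 16, 4, 1 }, since
	 * pow2divide(16, 4) == 4: first try a single 16-page segment,
	 * then ~4-page segments, then segments of any size.
	 */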
927a3544580Soga 
928a3544580Soga 	/*
929a3544580Soga 	 * Memory type: if zeroed memory is requested, traverse the zero set.
930a3544580Soga 	 * Otherwise, traverse the dirty set.
931a3544580Soga 	 *
932a3544580Soga 	 * The memtype iterator is reinitialized to memtype_init on entrance
933a3544580Soga 	 * of a pmemrange.
934a3544580Soga 	 */
935a3544580Soga 	if (flags & UVM_PLA_ZERO)
936a3544580Soga 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
937a3544580Soga 	else
938a3544580Soga 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
939a3544580Soga 
940a3544580Soga 	/*
941a3544580Soga 	 * Initially, we're not desperate.
942a3544580Soga 	 *
943a3544580Soga 	 * Note that if we return from a sleep, we are still desperate.
944a3544580Soga 	 * Chances are that memory pressure is still high, so resetting
945a3544580Soga 	 * seems over-optimistic to me.
946a3544580Soga 	 */
947a3544580Soga 	desperate = 0;
948a3544580Soga 
94990ee2fe0Sbeck 	uvm_lock_fpageq();
95017fb2b23Smpi retry:		/* Return point after sleeping. */
95103c020efSmpi 	/*
95203c020efSmpi 	 * Check to see if we need to generate some free pages by waking
95303c020efSmpi 	 * the pagedaemon.
95403c020efSmpi 	 */
95503c020efSmpi 	if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
95603c020efSmpi 	    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
95703c020efSmpi 	    (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
95803c020efSmpi 		wakeup(&uvm.pagedaemon);
95903c020efSmpi 
96003c020efSmpi 	/*
96103c020efSmpi 	 * Fail if any of these conditions is true:
96203c020efSmpi 	 * [1]  there really are no free pages, or
96303c020efSmpi 	 * [2]  only kernel "reserved" pages remain and
96403c020efSmpi 	 *        the UVM_PLA_USERESERVE flag wasn't used, or
96503c020efSmpi 	 * [3]  only pagedaemon "reserved" pages remain and
96603c020efSmpi 	 *        the requestor is neither the pagedaemon nor the syncer.
96703c020efSmpi 	 */
968b7757c6eSkettenis 	if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
96903c020efSmpi 	    !(flags & UVM_PLA_USERESERVE)) {
97003c020efSmpi 		uvm_unlock_fpageq();
97103c020efSmpi 		return ENOMEM;
97203c020efSmpi 	}
97303c020efSmpi 
97403c020efSmpi 	if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
975d60475a6Smpi 	    !in_pagedaemon(1)) {
9763976fff1Smpi 	    	uvm_unlock_fpageq();
97703c020efSmpi 		if (flags & UVM_PLA_WAITOK) {
97803c020efSmpi 			uvm_wait("uvm_pmr_getpages");
97917fb2b23Smpi 			uvm_lock_fpageq();
98017fb2b23Smpi 			goto retry;
98103c020efSmpi 		}
98203c020efSmpi 		return ENOMEM;
98303c020efSmpi 	}
98403c020efSmpi 
985a3544580Soga 	fcount = 0;
986a3544580Soga 	fnsegs = 0;
987a3544580Soga 
9883d6ee858Soga retry_desperate:
989a3544580Soga 	/*
990a3544580Soga 	 * If we just want any page(s), go for the really fast option.
991a3544580Soga 	 */
992a3544580Soga 	if (count <= maxseg && align == 1 && boundary == 0 &&
993a3544580Soga 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
994a3544580Soga 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
995ef440d08Skettenis 		    result, start, end, 0);
996a3544580Soga 
997a3544580Soga 		/*
9984af3577fSjsg 		 * If we found sufficient pages, go to the success exit code.
999a3544580Soga 		 *
1000a3544580Soga 		 * Otherwise, go immediately to fail, since we collected
1001a3544580Soga 		 * all we could anyway.
1002a3544580Soga 		 */
1003a3544580Soga 		if (fcount == count)
10043d6ee858Soga 			goto out;
1005a3544580Soga 		else
10063d6ee858Soga 			goto fail;
1007a3544580Soga 	}
1008a3544580Soga 
1009a3544580Soga 	/*
1010ddc77f9fSoga 	 * The heart of the contig case.
1011a3544580Soga 	 *
1012a3544580Soga 	 * The code actually looks like this:
1013a3544580Soga 	 *
1014a3544580Soga 	 * foreach (struct pmemrange) {
1015a3544580Soga 	 *	foreach (memtype) {
1016a3544580Soga 	 *		foreach(try) {
1017a3544580Soga 	 *			foreach (free range of memtype in pmemrange,
1018a3544580Soga 	 *			    starting at search[try]) {
1019a3544580Soga 	 *				while (range has space left)
1020a3544580Soga 	 *					take from range
1021a3544580Soga 	 *			}
1022a3544580Soga 	 *		}
1023a3544580Soga 	 *	}
1024a3544580Soga 	 *
1025a3544580Soga 	 *	if next pmemrange has higher usecount than current:
1026a3544580Soga 	 *		enter desperate case (which will drain the pmemranges
1027a3544580Soga 	 *		until empty prior to moving to the next one)
1028a3544580Soga 	 * }
1029a3544580Soga 	 *
1030a3544580Soga 	 * value. The memtype loop uses a goto to the rescan_memtype label.
1031a3544580Soga 	 * The try loop uses a goto to the rescan label.
1032a3544580Soga 	 * The 'range has space left' loop uses the drain_found label.
1033a3544580Soga 	 * The 'range has space left' loop uses label DrainFound.
1034a3544580Soga 	 *
1035a3544580Soga 	 * Writing them all as loops would take up a lot of screen space in
1036a3544580Soga 	 * the form of indentation and some parts are easier to express
1037a3544580Soga 	 * using the labels.
1038a3544580Soga 	 */
1039a3544580Soga 
1040a3544580Soga 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1041a3544580Soga 		/* Empty range. */
1042a3544580Soga 		if (pmr->nsegs == 0)
1043a3544580Soga 			continue;
1044a3544580Soga 
1045a3544580Soga 		/* Outside requested range. */
1046a3544580Soga 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1047a3544580Soga 			continue;
1048a3544580Soga 
1049a3544580Soga 		memtype = memtype_init;
1050a3544580Soga 
10513d6ee858Soga rescan_memtype:	/* Return point at memtype++. */
1052a3544580Soga 		try = start_try;
1053a3544580Soga 
10543d6ee858Soga rescan:		/* Return point at try++. */
1055a3544580Soga 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
1056a3544580Soga 		    found != NULL;
1057a3544580Soga 		    found = f_next) {
1058a3544580Soga 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
1059a3544580Soga 
1060a3544580Soga 			fstart = atop(VM_PAGE_TO_PHYS(found));
1061a3544580Soga 			if (start != 0)
1062a3544580Soga 				fstart = MAX(start, fstart);
10633d6ee858Soga drain_found:
1064a3544580Soga 			/*
1065a3544580Soga 			 * Throw away the first segment if fnsegs == maxseg
1066a3544580Soga 			 *
1067a3544580Soga 			 * Note that f_next is still valid after this call,
1068a3544580Soga 			 * since we only allocated from entries before f_next.
1069a3544580Soga 			 * We don't revisit the entries we already extracted
1070a3544580Soga 			 * from unless we entered the desperate case.
1071a3544580Soga 			 */
1072a3544580Soga 			if (fnsegs == maxseg) {
1073a3544580Soga 				fnsegs--;
1074a3544580Soga 				fcount -=
1075a3544580Soga 				    uvm_pmr_remove_1strange(result, boundary,
1076a3544580Soga 				    &found, desperate);
1077a3544580Soga 			}
1078a3544580Soga 
1079a3544580Soga 			fstart = PMR_ALIGN(fstart, align);
1080a3544580Soga 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
1081c6b41277Sariane 			if (end != 0)
1082c6b41277Sariane 				fend = MIN(end, fend);
1083a3544580Soga 			if (boundary != 0) {
1084a3544580Soga 				fend =
1085a3544580Soga 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
1086a3544580Soga 			}
1087c6b41277Sariane 			if (fstart >= fend)
1088c6b41277Sariane 				continue;
1089a3544580Soga 			if (fend - fstart > count - fcount)
1090a3544580Soga 				fend = fstart + (count - fcount);
1091a3544580Soga 
1092a3544580Soga 			fcount += fend - fstart;
1093a3544580Soga 			fnsegs++;
1094a3544580Soga 			found = uvm_pmr_extract_range(pmr, found,
1095a3544580Soga 			    fstart, fend, result);
1096a3544580Soga 
1097a3544580Soga 			if (fcount == count)
10983d6ee858Soga 				goto out;
1099a3544580Soga 
1100a3544580Soga 			/*
1101a3544580Soga 			 * If there's still space left in found, try to
11024af3577fSjsg 			 * fully drain it prior to continuing.
1103a3544580Soga 			 */
1104a3544580Soga 			if (found != NULL) {
1105a3544580Soga 				fstart = fend;
11063d6ee858Soga 				goto drain_found;
1107a3544580Soga 			}
1108a3544580Soga 		}
1109a3544580Soga 
111035164244Stedu 		/* Try a smaller search now. */
1111a3544580Soga 		if (++try < nitems(search))
11123d6ee858Soga 			goto rescan;
1113a3544580Soga 
1114a3544580Soga 		/*
1115a3544580Soga 		 * Exhaust all memory types prior to moving on to the next
1116a3544580Soga 		 * pmemrange.
1117a3544580Soga 		 * This means that zero-vs-dirty are eaten prior to moving
1118a3544580Soga 		 * to a pmemrange with a higher use-count.
1119a3544580Soga 		 *
1120a3544580Soga 		 * Code is basically a difficult way of writing:
1121a3544580Soga 		 * memtype = memtype_init;
1122a3544580Soga 		 * do {
1123a3544580Soga 		 *	...;
1124a3544580Soga 		 *	memtype += 1;
1125a3544580Soga 		 *	memtype %= MEMTYPE_MAX;
1126a3544580Soga 		 * } while (memtype != memtype_init);
1127a3544580Soga 		 */
1128a3544580Soga 		memtype += 1;
1129a3544580Soga 		if (memtype == UVM_PMR_MEMTYPE_MAX)
1130a3544580Soga 			memtype = 0;
1131a3544580Soga 		if (memtype != memtype_init)
11323d6ee858Soga 			goto rescan_memtype;
1133a3544580Soga 
1134a3544580Soga 		/*
1135a3544580Soga 		 * If not desperate, enter desperate case prior to eating all
1136a3544580Soga 		 * the good stuff in the next range.
1137a3544580Soga 		 */
1138a3544580Soga 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
1139a3544580Soga 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
1140a3544580Soga 			break;
1141a3544580Soga 	}
1142a3544580Soga 
1143a3544580Soga 	/*
1144a3544580Soga 	 * Not enough memory of the requested type available. Fall back to
1145a3544580Soga 	 * less good memory that we'll clean up better later.
1146a3544580Soga 	 *
1147a3544580Soga 	 * This algorithm is not very smart though: it just starts scanning
1148a3544580Soga 	 * a differently typed range, and the nicer ranges of the previous
1149a3544580Soga 	 * iteration may fall out. Hence there is a small chance of a false
1150a3544580Soga 	 * negative.
1151a3544580Soga 	 *
11524af3577fSjsg 	 * When desperate: scan all sizes starting at the smallest
1153a3544580Soga 	 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
1154a3544580Soga 	 * allow us to hit the fast path now).
1155a3544580Soga 	 *
1156a3544580Soga 	 * Also, because we will revisit entries we scanned before, we need
1157a3544580Soga 	 * to reset the page queue, or we may end up releasing entries in
1158a3544580Soga 	 * such a way as to invalidate f_next.
1159a3544580Soga 	 */
1160a3544580Soga 	if (!desperate) {
1161a3544580Soga 		desperate = 1;
1162a3544580Soga 		start_try = nitems(search) - 1;
1163a3544580Soga 		flags &= ~UVM_PLA_TRYCONTIG;
1164a3544580Soga 
1165a3544580Soga 		while (!TAILQ_EMPTY(result))
1166a3544580Soga 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1167a3544580Soga 		fnsegs = 0;
1168a3544580Soga 		fcount = 0;
11693d6ee858Soga 		goto retry_desperate;
1170a3544580Soga 	}
1171a3544580Soga 
11723d6ee858Soga fail:
117335164244Stedu 	/* Allocation failed. */
1174a3544580Soga 	/* XXX: claim from memory reserve here */
1175a3544580Soga 
1176a3544580Soga 	while (!TAILQ_EMPTY(result))
1177a3544580Soga 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1178a3544580Soga 
1179a3544580Soga 	if (flags & UVM_PLA_WAITOK) {
118090ee2fe0Sbeck 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
118190ee2fe0Sbeck 		    flags & UVM_PLA_FAILOK) == 0)
11823d6ee858Soga 			goto retry;
118390ee2fe0Sbeck 		KASSERT(flags & UVM_PLA_FAILOK);
1184ba03bb80Smpi 	} else if (!(flags & UVM_PLA_NOWAKE)) {
1185ba03bb80Smpi 		struct uvm_pmalloc *pma = &nowait_pma;
1186ba03bb80Smpi 
1187ba03bb80Smpi 		if (!(nowait_pma.pm_flags & UVM_PMA_LINKED)) {
1188ba03bb80Smpi 			nowait_pma.pm_flags = UVM_PMA_LINKED;
1189ba03bb80Smpi 			TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, pma, pmq);
11908f004b5aSthib 			wakeup(&uvm.pagedaemon);
1191d4c6c9b5Sbeck 		}
1192d4c6c9b5Sbeck 	}
119390ee2fe0Sbeck 	uvm_unlock_fpageq();
1194a3544580Soga 
1195a3544580Soga 	return ENOMEM;
1196a3544580Soga 
11973d6ee858Soga out:
11984af3577fSjsg 	/* Allocation successful. */
1199a3544580Soga 	uvmexp.free -= fcount;
1200a3544580Soga 
1201a3544580Soga 	uvm_unlock_fpageq();
1202a3544580Soga 
1203a3544580Soga 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1204a19ffaa9Sariane #ifdef DIAGNOSTIC
1205a19ffaa9Sariane 	fnsegs = 0;
1206a19ffaa9Sariane 	fcount = 0;
1207a19ffaa9Sariane 	diag_prev = NULL;
1208a19ffaa9Sariane #endif /* DIAGNOSTIC */
1209a3544580Soga 	TAILQ_FOREACH(found, result, pageq) {
12101a0bb2c4Smiod 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1211a3544580Soga 
1212a3544580Soga 		if (found->pg_flags & PG_ZERO) {
1213e216ee02Sblambert 			uvm_lock_fpageq();
1214a3544580Soga 			uvmexp.zeropages--;
1215ef440d08Skettenis 			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1216ef440d08Skettenis 				wakeup(&uvmexp.zeropages);
1217e216ee02Sblambert 			uvm_unlock_fpageq();
1218a3544580Soga 		}
1219a3544580Soga 		if (flags & UVM_PLA_ZERO) {
1220a3544580Soga 			if (found->pg_flags & PG_ZERO)
1221a3544580Soga 				uvmexp.pga_zerohit++;
1222a3544580Soga 			else {
1223a3544580Soga 				uvmexp.pga_zeromiss++;
1224a3544580Soga 				uvm_pagezero(found);
1225a3544580Soga 			}
1226a3544580Soga 		}
1227a3544580Soga 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1228a3544580Soga 
1229a3544580Soga 		found->uobject = NULL;
1230a3544580Soga 		found->uanon = NULL;
1231a3544580Soga 		found->pg_version++;
1232a3544580Soga 
1233a3544580Soga 		/*
1234a3544580Soga 		 * Validate that the page matches range criterium.
1234a3544580Soga 		 * Validate that the page matches the range criteria.
1236a3544580Soga 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1237a3544580Soga 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1238a19ffaa9Sariane 
1239a19ffaa9Sariane #ifdef DIAGNOSTIC
1240a19ffaa9Sariane 		/*
1241a19ffaa9Sariane 		 * Update fcount (# found pages) and
1242a19ffaa9Sariane 		 * fnsegs (# found segments) counters.
1243a19ffaa9Sariane 		 */
1244a19ffaa9Sariane 		if (diag_prev == NULL ||
1245a19ffaa9Sariane 		    /* new segment if it contains a hole */
1246a19ffaa9Sariane 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1247a19ffaa9Sariane 		    atop(VM_PAGE_TO_PHYS(found)) ||
1248a19ffaa9Sariane 		    /* new segment if it crosses boundary */
1249a19ffaa9Sariane 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1250a19ffaa9Sariane 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1251a19ffaa9Sariane 			fnsegs++;
1252a19ffaa9Sariane 		fcount++;
1253a19ffaa9Sariane 
1254a19ffaa9Sariane 		diag_prev = found;
1255a19ffaa9Sariane #endif /* DIAGNOSTIC */
1256a3544580Soga 	}
1257a3544580Soga 
1258a19ffaa9Sariane #ifdef DIAGNOSTIC
1259a19ffaa9Sariane 	/*
1260a19ffaa9Sariane 	 * Panic on algorithm failure.
1261a19ffaa9Sariane 	 */
1262a19ffaa9Sariane 	if (fcount != count || fnsegs > maxseg) {
1263a19ffaa9Sariane 		panic("pmemrange allocation error: "
1264a19ffaa9Sariane 		    "allocated %ld pages in %d segments, "
1265a19ffaa9Sariane 		    "but request was %ld pages in %d segments",
1266a19ffaa9Sariane 		    fcount, fnsegs, count, maxseg);
1267a19ffaa9Sariane 	}
1268a19ffaa9Sariane #endif /* DIAGNOSTIC */
1269a19ffaa9Sariane 
1270a3544580Soga 	return 0;
1271a3544580Soga }
1272a3544580Soga 
1273a3544580Soga /*
127482673a18Smpi  * Acquire a single page.
127582673a18Smpi  *
127682673a18Smpi  * flags:	UVM_PLA_* flags
127782673a18Smpi  * result:	returned page.
127882673a18Smpi  */
127982673a18Smpi struct vm_page *
128082673a18Smpi uvm_pmr_getone(int flags)
128182673a18Smpi {
128282673a18Smpi 	struct vm_page *pg;
128382673a18Smpi 	struct pglist pgl;
128482673a18Smpi 
128582673a18Smpi 	TAILQ_INIT(&pgl);
128682673a18Smpi 	if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0)
128782673a18Smpi 		return NULL;
128882673a18Smpi 
128982673a18Smpi 	pg = TAILQ_FIRST(&pgl);
129082673a18Smpi 	KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
129182673a18Smpi 
129282673a18Smpi 	return pg;
129382673a18Smpi }
129482673a18Smpi 
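/*
 * Example (hypothetical caller, not part of this file): grab one page
 * without sleeping and hand it straight back:
 *
 *	struct vm_page *pg;
 *
 *	if ((pg = uvm_pmr_getone(UVM_PLA_NOWAIT)) != NULL)
 *		uvm_pmr_freepages(pg, 1);
 */
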
129582673a18Smpi /*
1296a3544580Soga  * Free a number of contiguous pages (invoked by uvm_page_init, among others).
1297a3544580Soga  */
1298a3544580Soga void
1299a3544580Soga uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1300a3544580Soga {
1301a3544580Soga 	struct uvm_pmemrange *pmr;
1302a3544580Soga 	psize_t i, pmr_count;
130332007cedSmiod 	struct vm_page *firstpg = pg;
1304a3544580Soga 
1305a3544580Soga 	for (i = 0; i < count; i++) {
1306a3544580Soga 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1307a3544580Soga 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1308a3544580Soga 
1309a3544580Soga 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1310a3544580Soga 		    VALID_FLAGS(pg[i].pg_flags))) {
1311a3544580Soga 			printf("Flags: 0x%x, will panic now.\n",
1312a3544580Soga 			    pg[i].pg_flags);
1313a3544580Soga 		}
1314a3544580Soga 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1315a3544580Soga 		    VALID_FLAGS(pg[i].pg_flags));
1316a3544580Soga 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1317a3544580Soga 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1318a3544580Soga 	}
1319a3544580Soga 
1320a3544580Soga 	uvm_lock_fpageq();
1321a3544580Soga 
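	/*
	 * The run of pages may span more than one pmemrange; carve it
	 * up at pmemrange boundaries and insert each chunk into its
	 * owning range.
	 */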
132232007cedSmiod 	for (i = count; i > 0; i -= pmr_count) {
1323a3544580Soga 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1324a3544580Soga 		KASSERT(pmr != NULL);
1325a3544580Soga 
132632007cedSmiod 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1327a3544580Soga 		pg->fpgsz = pmr_count;
1328a3544580Soga 		uvm_pmr_insert(pmr, pg, 0);
1329a3544580Soga 
1330a3544580Soga 		uvmexp.free += pmr_count;
1331a3544580Soga 		pg += pmr_count;
1332a3544580Soga 	}
1333a3544580Soga 	wakeup(&uvmexp.free);
1334ef440d08Skettenis 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1335ef440d08Skettenis 		wakeup(&uvmexp.zeropages);
1336a3544580Soga 
133732007cedSmiod 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
133890ee2fe0Sbeck 
1339a3544580Soga 	uvm_unlock_fpageq();
1340a3544580Soga }
1341a3544580Soga 
1342a3544580Soga /*
1343a3544580Soga  * Free all pages in the queue.
1344a3544580Soga  */
1345a3544580Soga void
1346a3544580Soga uvm_pmr_freepageq(struct pglist *pgl)
1347a3544580Soga {
1348a3544580Soga 	struct vm_page *pg;
134990ee2fe0Sbeck 	paddr_t pstart;
135090ee2fe0Sbeck 	psize_t plen;
1351a3544580Soga 
1352a3544580Soga 	TAILQ_FOREACH(pg, pgl, pageq) {
1353a3544580Soga 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1354a3544580Soga 		    VALID_FLAGS(pg->pg_flags))) {
1355a3544580Soga 			printf("Flags: 0x%x, will panic now.\n",
1356a3544580Soga 			    pg->pg_flags);
1357a3544580Soga 		}
1358a3544580Soga 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1359a3544580Soga 		    VALID_FLAGS(pg->pg_flags));
1360a3544580Soga 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1361a3544580Soga 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1362a3544580Soga 	}
1363a3544580Soga 
1364a3544580Soga 	uvm_lock_fpageq();
136590ee2fe0Sbeck 	while (!TAILQ_EMPTY(pgl)) {
13661e57804eSbeck 		pg = TAILQ_FIRST(pgl);
13671e57804eSbeck 		if (pg == TAILQ_NEXT(pg, pageq) + 1) {
13681e57804eSbeck 			/*
13691e57804eSbeck 			 * If pg is one behind the position of the
13701e57804eSbeck 			 * next page in the list in the page array,
13711e57804eSbeck 			 * try going backwards instead of forward.
13721e57804eSbeck 			 */
13731e57804eSbeck 			plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
13741e57804eSbeck 		} else {
137590ee2fe0Sbeck 			pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
137690ee2fe0Sbeck 			plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
13771e57804eSbeck 		}
137890ee2fe0Sbeck 		uvmexp.free += plen;
137990ee2fe0Sbeck 
138090ee2fe0Sbeck 		uvm_wakeup_pla(pstart, ptoa(plen));
138190ee2fe0Sbeck 	}
1382a3544580Soga 	wakeup(&uvmexp.free);
1383ef440d08Skettenis 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1384ef440d08Skettenis 		wakeup(&uvmexp.zeropages);
1385a3544580Soga 	uvm_unlock_fpageq();
1388a3544580Soga }
1389a3544580Soga 
1390a3544580Soga /*
1391a3544580Soga  * Store a pmemrange in the list.
1392a3544580Soga  *
1393a3544580Soga  * The list is sorted by use.
 * Returns NULL on success; if a range that compares as equal is already
 * present, it is returned instead and pmr is not inserted.
1394a3544580Soga  */
1395a3544580Soga struct uvm_pmemrange *
1396a3544580Soga uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1397a3544580Soga     struct uvm_pmemrange *pmr)
1398a3544580Soga {
1399a3544580Soga 	struct uvm_pmemrange *iter;
1400a3544580Soga 	int cmp = 1;
1401a3544580Soga 
1402a3544580Soga 	TAILQ_FOREACH(iter, useq, pmr_use) {
1403a3544580Soga 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1404a3544580Soga 		if (cmp == 0)
1405a3544580Soga 			return iter;
1406a3544580Soga 		if (cmp == -1)
1407a3544580Soga 			break;
1408a3544580Soga 	}
1409a3544580Soga 
1410a3544580Soga 	if (iter == NULL)
1411a3544580Soga 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1412a3544580Soga 	else
1413a3544580Soga 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1414a3544580Soga 	return NULL;
1415a3544580Soga }
1416a3544580Soga 
1417a3544580Soga #ifdef DEBUG
1418a3544580Soga /*
1419a3544580Soga  * Validation of the whole pmemrange.
1420a3544580Soga  * Called with fpageq locked.
1421a3544580Soga  */
1422a3544580Soga void
1423a3544580Soga uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1424a3544580Soga {
1425a3544580Soga 	struct vm_page *prev, *next, *i, *xref;
1426a3544580Soga 	int lcv, mti;
1427a3544580Soga 
1428b0fb607fSthib 	/* Empty range */
1429b0fb607fSthib 	if (pmr->nsegs == 0)
1430b0fb607fSthib 		return;
1431b0fb607fSthib 
1432a3544580Soga 	/* Validate address tree. */
14336d0d0baaSdlg 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1434a3544580Soga 		/* Validate the range. */
1435a3544580Soga 		KASSERT(i->fpgsz > 0);
1436a3544580Soga 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1437a3544580Soga 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1438a3544580Soga 		    <= pmr->high);
1439a3544580Soga 
1440a3544580Soga 		/* Validate each page in this range. */
1441a3544580Soga 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1442a3544580Soga 			/*
1443a3544580Soga 			 * Only the first page has a size specification;
1444a3544580Soga 			 * the rest have size 0.
1445a3544580Soga 			 */
1446a3544580Soga 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1447a3544580Soga 			/*
1448a3544580Soga 			 * Flag check.
1449a3544580Soga 			 */
1450a3544580Soga 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1451a3544580Soga 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1452a3544580Soga 			/*
1453a3544580Soga 			 * Free pages are:
1454a3544580Soga 			 * - not wired
1455a3544580Soga 			 * - have no vm_anon
1456a3544580Soga 			 * - have no uvm_object
1457a3544580Soga 			 */
1458a3544580Soga 			KASSERT(i[lcv].wire_count == 0);
1459a3544580Soga 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1460a3544580Soga 			    i[lcv].uanon == NULL);
1461a3544580Soga 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1462a3544580Soga 			    i[lcv].uobject == NULL);
1463a3544580Soga 			/*
1464a3544580Soga 			 * Pages in a single range always have the same
1465a3544580Soga 			 * memtype.
1466a3544580Soga 			 */
1467a3544580Soga 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1468a3544580Soga 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1469a3544580Soga 		}
1470a3544580Soga 
1471a3544580Soga 		/* Check that it shouldn't be joined with its predecessor. */
14726d0d0baaSdlg 		prev = RBT_PREV(uvm_pmr_addr, i);
1473a3544580Soga 		if (prev != NULL) {
1474a3544580Soga 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1475a3544580Soga 			    uvm_pmr_pg_to_memtype(prev) ||
1476a3544580Soga 			    atop(VM_PAGE_TO_PHYS(i)) >
1477a3544580Soga 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1478a3544580Soga 			    prev + prev->fpgsz != i);
1479a3544580Soga 		}
1480a3544580Soga 
1481a3544580Soga 		/* Assert i is in the size tree as well. */
1482a3544580Soga 		if (i->fpgsz == 1) {
1483a3544580Soga 			TAILQ_FOREACH(xref,
1484a3544580Soga 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1485a3544580Soga 				if (xref == i)
1486a3544580Soga 					break;
1487a3544580Soga 			}
1488a3544580Soga 			KASSERT(xref == i);
1489a3544580Soga 		} else {
14906d0d0baaSdlg 			KASSERT(RBT_FIND(uvm_pmr_size,
1491a3544580Soga 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1492a3544580Soga 			    i + 1);
1493a3544580Soga 		}
1494a3544580Soga 	}
1495a3544580Soga 
1496a3544580Soga 	/* Validate size tree. */
1497a3544580Soga 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1498a3544580Soga 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1499a3544580Soga 			next = uvm_pmr_nextsz(pmr, i, mti);
1500a3544580Soga 			if (next != NULL) {
1501a3544580Soga 				KASSERT(i->fpgsz <=
1502a3544580Soga 				    next->fpgsz);
1503a3544580Soga 			}
1504a3544580Soga 
1505a3544580Soga 			/* Assert i is in the addr tree as well. */
15066d0d0baaSdlg 			KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1507a3544580Soga 
1508a3544580Soga 			/* Assert i is of the correct memory type. */
1509a3544580Soga 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1510a3544580Soga 		}
1511a3544580Soga 	}
1512a3544580Soga 
1513a3544580Soga 	/* Validate nsegs statistic. */
1514a3544580Soga 	lcv = 0;
15156d0d0baaSdlg 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1516a3544580Soga 		lcv++;
1517a3544580Soga 	KASSERT(pmr->nsegs == lcv);
1518a3544580Soga }
1519a3544580Soga #endif /* DEBUG */
1520a3544580Soga 
1521a3544580Soga /*
1522a3544580Soga  * Split pmr at split point pageno.
1523a3544580Soga  * Called with fpageq unlocked.
1524a3544580Soga  *
1525a3544580Soga  * Split is only applied if a pmemrange spans pageno.
1526a3544580Soga  */
1527a3544580Soga void
1528a3544580Soga uvm_pmr_split(paddr_t pageno)
1529a3544580Soga {
1530a3544580Soga 	struct uvm_pmemrange *pmr, *drain;
1531a3544580Soga 	struct vm_page *rebuild, *prev, *next;
1532a3544580Soga 	psize_t prev_sz;
1533a3544580Soga 
1534a3544580Soga 	uvm_lock_fpageq();
1535a3544580Soga 	pmr = uvm_pmemrange_find(pageno);
1536a3544580Soga 	if (pmr == NULL || !(pmr->low < pageno)) {
1537a3544580Soga 		/* No split required. */
1538a3544580Soga 		uvm_unlock_fpageq();
1539a3544580Soga 		return;
1540a3544580Soga 	}
1541a3544580Soga 
1542a3544580Soga 	KASSERT(pmr->low < pageno);
1543a3544580Soga 	KASSERT(pmr->high > pageno);
1544a3544580Soga 
1545b426ab7bSthib 	/*
1546b426ab7bSthib 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1547b426ab7bSthib 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1548b426ab7bSthib 	 * a bit hard, so we just race!
1549b426ab7bSthib 	 */
1550b426ab7bSthib 	uvm_unlock_fpageq();
1551a3544580Soga 	drain = uvm_pmr_allocpmr();
1552b426ab7bSthib 	uvm_lock_fpageq();
1553b426ab7bSthib 	pmr = uvm_pmemrange_find(pageno);
1554b426ab7bSthib 	if (pmr == NULL || !(pmr->low < pageno)) {
1555b426ab7bSthib 		/*
1556b426ab7bSthib 		 * We lost the race because someone else ran this or a
1557b426ab7bSthib 		 * related function; this should happen very rarely, so
1558b426ab7bSthib 		 * we just leak the pmr.
1559b426ab7bSthib 		 */
1560b426ab7bSthib 		printf("uvm_pmr_split: lost one pmr\n");
1561b426ab7bSthib 		uvm_unlock_fpageq();
1562b426ab7bSthib 		return;
1563b426ab7bSthib 	}
1564b426ab7bSthib 
1565a3544580Soga 	drain->low = pageno;
1566a3544580Soga 	drain->high = pmr->high;
1567a3544580Soga 	drain->use = pmr->use;
1568a3544580Soga 
1569a3544580Soga 	uvm_pmr_assertvalid(pmr);
1570a3544580Soga 	uvm_pmr_assertvalid(drain);
1571a3544580Soga 	KASSERT(drain->nsegs == 0);
1572a3544580Soga 
1573262a556aSdlg 	RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1574a3544580Soga 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1575a3544580Soga 			break;
1576a3544580Soga 	}
1577a3544580Soga 	if (rebuild == NULL)
1578262a556aSdlg 		prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1579a3544580Soga 	else
1580262a556aSdlg 		prev = RBT_PREV(uvm_pmr_addr, rebuild);
1581a3544580Soga 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1582a3544580Soga 
1583a3544580Soga 	/*
1584a3544580Soga 	 * Handle free chunk that spans the split point.
1585a3544580Soga 	 */
1586a3544580Soga 	if (prev != NULL &&
1587a3544580Soga 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1588a3544580Soga 		psize_t before, after;
1589a3544580Soga 
1590a3544580Soga 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1591a3544580Soga 
1592a3544580Soga 		uvm_pmr_remove(pmr, prev);
1593a3544580Soga 		prev_sz = prev->fpgsz;
1594a3544580Soga 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1595a3544580Soga 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1596a3544580Soga 
1597a3544580Soga 		KASSERT(before > 0);
1598a3544580Soga 		KASSERT(after > 0);
1599a3544580Soga 
1600a3544580Soga 		prev->fpgsz = before;
1601a3544580Soga 		uvm_pmr_insert(pmr, prev, 1);
1602a3544580Soga 		(prev + before)->fpgsz = after;
1603a3544580Soga 		uvm_pmr_insert(drain, prev + before, 1);
1604a3544580Soga 	}
1605a3544580Soga 
160635164244Stedu 	/* Move free chunks that no longer fall in the range. */
1607a3544580Soga 	for (; rebuild != NULL; rebuild = next) {
1608262a556aSdlg 		next = RBT_NEXT(uvm_pmr_addr, rebuild);
1609a3544580Soga 
1610a3544580Soga 		uvm_pmr_remove(pmr, rebuild);
1611a3544580Soga 		uvm_pmr_insert(drain, rebuild, 1);
1612a3544580Soga 	}
1613a3544580Soga 
1614a3544580Soga 	pmr->high = pageno;
1615a3544580Soga 	uvm_pmr_assertvalid(pmr);
1616a3544580Soga 	uvm_pmr_assertvalid(drain);
1617a3544580Soga 
16183accc929Sdlg 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1619a3544580Soga 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1620a3544580Soga 	uvm_unlock_fpageq();
1621a3544580Soga }
1622a3544580Soga 
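/*
 * Illustration (hypothetical numbers): splitting at page 0x100 turns a
 * pmemrange covering pages [0x80, 0x200) into [0x80, 0x100) plus a new
 * "drain" range [0x100, 0x200); a free chunk spanning page 0x100 is
 * carved in two first, then the chunks at or above 0x100 are moved into
 * the drain range.
 */
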
1623a3544580Soga /*
1624a3544580Soga  * Increase the usage counter for the given range of memory.
1625a3544580Soga  *
1626a3544580Soga  * The higher the use counter of a given range, the harder the
1627a3544580Soga  * allocator will try to avoid allocating from it.
1628a3544580Soga  *
1629a3544580Soga  * Addresses here are byte addresses (paddr_t), not page numbers;
1630a3544580Soga  * low and high are the lowest and highest allowed addresses.
1631a3544580Soga  */
1632a3544580Soga void
1633a3544580Soga uvm_pmr_use_inc(paddr_t low, paddr_t high)
1634a3544580Soga {
1635a3544580Soga 	struct uvm_pmemrange *pmr;
1636b426ab7bSthib 	paddr_t sz;
1637a3544580Soga 
1638b426ab7bSthib 	/* pmr uses page numbers, translate low and high. */
1639a3544580Soga 	high++;
1640a3544580Soga 	high = atop(trunc_page(high));
1641b426ab7bSthib 	low = atop(round_page(low));
1642a3544580Soga 	uvm_pmr_split(low);
1643a3544580Soga 	uvm_pmr_split(high);
1644a3544580Soga 
1645c37416d6Smiod 	sz = 0;
1646a3544580Soga 	uvm_lock_fpageq();
1647a3544580Soga 	/* Increase use count on segments in range. */
16483accc929Sdlg 	RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1649a3544580Soga 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1650a3544580Soga 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1651a3544580Soga 			pmr->use++;
1652b426ab7bSthib 			sz += pmr->high - pmr->low;
1653a3544580Soga 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1654a3544580Soga 		}
1655a3544580Soga 		uvm_pmr_assertvalid(pmr);
1656a3544580Soga 	}
1657a3544580Soga 	uvm_unlock_fpageq();
1658b426ab7bSthib 
1659b426ab7bSthib 	KASSERT(sz >= high - low);
1660a3544580Soga }
1661a3544580Soga 
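/*
 * Worked example (assuming 4K pages): uvm_pmr_use_inc(0x1000, 0x3fff)
 * computes high = atop(trunc_page(0x3fff + 1)) = 4 and
 * low = atop(round_page(0x1000)) = 1, so pages 1 through 3 get their
 * use count bumped.
 */
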
1662a3544580Soga /*
1663a3544580Soga  * Allocate a pmemrange.
1664a3544580Soga  *
1665a3544580Soga  * If called from uvm_page_init, uvm_pageboot_alloc is used.
1666a3544580Soga  * If called after uvm_init, malloc is used.
1667a3544580Soga  * (And if called in between, you're dead.)
1668a3544580Soga  */
1669a3544580Soga struct uvm_pmemrange *
1670b426ab7bSthib uvm_pmr_allocpmr(void)
1671a3544580Soga {
1672a3544580Soga 	struct uvm_pmemrange *nw;
1673a3544580Soga 	int i;
1674a3544580Soga 
1675b426ab7bSthib 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1676a3544580Soga 	if (!uvm.page_init_done) {
1677a3544580Soga 		nw = (struct uvm_pmemrange *)
1678a3544580Soga 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1679a3544580Soga 	} else {
1680a3544580Soga 		nw = malloc(sizeof(struct uvm_pmemrange),
1681b426ab7bSthib 		    M_VMMAP, M_NOWAIT);
1682a3544580Soga 	}
1683b426ab7bSthib 	KASSERT(nw != NULL);
16846c0aa6dcStedu 	memset(nw, 0, sizeof(struct uvm_pmemrange));
1685262a556aSdlg 	RBT_INIT(uvm_pmr_addr, &nw->addr);
1686a3544580Soga 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1687262a556aSdlg 		RBT_INIT(uvm_pmr_size, &nw->size[i]);
1688a3544580Soga 		TAILQ_INIT(&nw->single[i]);
1689a3544580Soga 	}
1690a3544580Soga 	return nw;
1691a3544580Soga }
1692a3544580Soga 
1693a3544580Soga /*
1694a3544580Soga  * Initialization of pmr.
1695a3544580Soga  * Called by uvm_page_init.
1696a3544580Soga  *
1697a3544580Soga  * Sets up pmemranges.
1698a3544580Soga  */
1699a3544580Soga void
1700a3544580Soga uvm_pmr_init(void)
1701a3544580Soga {
1702a3544580Soga 	struct uvm_pmemrange *new_pmr;
1703a3544580Soga 	int i;
1704a3544580Soga 
1705a3544580Soga 	TAILQ_INIT(&uvm.pmr_control.use);
17063accc929Sdlg 	RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
170790ee2fe0Sbeck 	TAILQ_INIT(&uvm.pmr_control.allocs);
1708a3544580Soga 
1709b426ab7bSthib 	/* By default, one range for the entire address space. */
1710a3544580Soga 	new_pmr = uvm_pmr_allocpmr();
1711a3544580Soga 	new_pmr->low = 0;
1712a3544580Soga 	new_pmr->high = atop((paddr_t)-1) + 1;
1713a3544580Soga 
17143accc929Sdlg 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1715a3544580Soga 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1716a3544580Soga 
1717b426ab7bSthib 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1718b426ab7bSthib 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1719b426ab7bSthib 		    uvm_md_constraints[i]->ucr_high);
1720b426ab7bSthib 	}
1721a3544580Soga }
1722a3544580Soga 
1723a3544580Soga /*
1724a3544580Soga  * Find the pmemrange that contains the given page number.
1725a3544580Soga  *
1726a3544580Soga  * (Manually traverses the binary tree, because that is cheaper on stack
1727a3544580Soga  * usage.)
1728a3544580Soga  */
1729a3544580Soga struct uvm_pmemrange *
1730a3544580Soga uvm_pmemrange_find(paddr_t pageno)
1731a3544580Soga {
1732a3544580Soga 	struct uvm_pmemrange *pmr;
1733a3544580Soga 
17343accc929Sdlg 	pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1735a3544580Soga 	while (pmr != NULL) {
1736a3544580Soga 		if (pmr->low > pageno)
17373accc929Sdlg 			pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1738a3544580Soga 		else if (pmr->high <= pageno)
17393accc929Sdlg 			pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1740a3544580Soga 		else
1741a3544580Soga 			break;
1742a3544580Soga 	}
1743a3544580Soga 
1744a3544580Soga 	return pmr;
1745a3544580Soga }
1746a3544580Soga 
1747a3544580Soga #if defined(DDB) || defined(DEBUG)
1748a3544580Soga /*
1749a3544580Soga  * Return true if the given page is in any of the free lists.
1750a3544580Soga  * Used by uvm_page_printit.
1751a3544580Soga  * This function is safe, even if the page is not on the freeq.
1752a3544580Soga  * Note: does not apply locking, only called from ddb.
1753a3544580Soga  */
1754a3544580Soga int
1755a3544580Soga uvm_pmr_isfree(struct vm_page *pg)
1756a3544580Soga {
1757a3544580Soga 	struct vm_page *r;
1758a3544580Soga 	struct uvm_pmemrange *pmr;
1759a3544580Soga 
1760a3544580Soga 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1761a3544580Soga 	if (pmr == NULL)
1762a3544580Soga 		return 0;
1763262a556aSdlg 	r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1764a3544580Soga 	if (r == NULL)
1765262a556aSdlg 		r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
176606c0d188Svisa 	else if (r != pg)
1767262a556aSdlg 		r = RBT_PREV(uvm_pmr_addr, r);
1768a3544580Soga 	if (r == NULL)
1769a3544580Soga 		return 0; /* Empty tree. */
1770a3544580Soga 
1771a3544580Soga 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1772a3544580Soga 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1773a3544580Soga 	    atop(VM_PAGE_TO_PHYS(pg));
1774a3544580Soga }
1775a3544580Soga #endif /* DDB || DEBUG */
1776a3544580Soga 
1777a3544580Soga /*
1778a3544580Soga  * Given a root of the address tree, find a range that intersects
1779a3544580Soga  * [start, end) and is of the given memtype.
1780a3544580Soga  *
1781a3544580Soga  * init_root must be a page in the address tree.
1782a3544580Soga  */
1783a3544580Soga struct vm_page*
1784a3544580Soga uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1785a3544580Soga     paddr_t start, paddr_t end, int memtype)
1786a3544580Soga {
1787a3544580Soga 	int	direction;
1788a3544580Soga 	struct	vm_page *root;
1789a3544580Soga 	struct	vm_page *high, *high_next;
1790a3544580Soga 	struct	vm_page *low, *low_next;
1791a3544580Soga 
1792a3544580Soga 	KDASSERT(pmr != NULL && init_root != NULL);
1793a3544580Soga 	root = init_root;
1794a3544580Soga 
179535164244Stedu 	/* Which direction to use for searching. */
1796a3544580Soga 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1797a3544580Soga 		direction =  1;
1798a3544580Soga 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1799a3544580Soga 		direction = -1;
1800a3544580Soga 	else /* nothing to do */
1801a3544580Soga 		return root;
1802a3544580Soga 
180335164244Stedu 	/* First, update root to fall within the chosen range. */
1804a3544580Soga 	while (root && !PMR_INTERSECTS_WITH(
1805a3544580Soga 	    atop(VM_PAGE_TO_PHYS(root)),
1806a3544580Soga 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1807a3544580Soga 	    start, end)) {
1808a3544580Soga 		if (direction == 1)
1809262a556aSdlg 			root = RBT_RIGHT(uvm_objtree, root);
1810a3544580Soga 		else
1811262a556aSdlg 			root = RBT_LEFT(uvm_objtree, root);
1812a3544580Soga 	}
1813a3544580Soga 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1814a3544580Soga 		return root;
1815a3544580Soga 
1816a3544580Soga 	/*
1817a3544580Soga 	 * Root is valid, but of the wrong memtype.
1818a3544580Soga 	 *
1819a3544580Soga 	 * Try to find a range that has the given memtype in the subtree
1820a3544580Soga 	 * (memtype mismatches are costly, either because the conversion
1821a3544580Soga 	 * is expensive, or a later allocation will need to do the opposite
1822a3544580Soga 	 * conversion, which will be expensive).
1823a3544580Soga 	 *
1825a3544580Soga 	 * First, simply increase address until we hit something we can use.
1826a3544580Soga 	 * Cache the upper page, so we can page-walk later.
1827a3544580Soga 	 */
1828a3544580Soga 	high = root;
1829262a556aSdlg 	high_next = RBT_RIGHT(uvm_objtree, high);
1830a3544580Soga 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1831a3544580Soga 	    atop(VM_PAGE_TO_PHYS(high_next)),
1832a3544580Soga 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1833a3544580Soga 	    start, end)) {
1834a3544580Soga 		high = high_next;
1835a3544580Soga 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1836a3544580Soga 			return high;
1837262a556aSdlg 		high_next = RBT_RIGHT(uvm_objtree, high);
1838a3544580Soga 	}
1839a3544580Soga 
1840a3544580Soga 	/*
1841a3544580Soga 	 * Second, decrease the address until we hit something we can use.
1842a3544580Soga 	 * Cache the lower page, so we can page-walk later.
1843a3544580Soga 	 */
1844a3544580Soga 	low = root;
1845262a556aSdlg 	low_next = RBT_LEFT(uvm_objtree, low);
1846a3544580Soga 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1847a3544580Soga 	    atop(VM_PAGE_TO_PHYS(low_next)),
1848a3544580Soga 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1849a3544580Soga 	    start, end)) {
1850a3544580Soga 		low = low_next;
1851a3544580Soga 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1852a3544580Soga 			return low;
1853262a556aSdlg 		low_next = RBT_LEFT(uvm_objtree, low);
1854a3544580Soga 	}
1855a3544580Soga 
1856baa04740Smiod 	if (low == high)
1857baa04740Smiod 		return NULL;
1858baa04740Smiod 
185935164244Stedu 	/* No hits. Walk the address tree until we find something usable. */
1860262a556aSdlg 	for (low = RBT_NEXT(uvm_pmr_addr, low);
1861a3544580Soga 	    low != high;
1862262a556aSdlg 	    low = RBT_NEXT(uvm_pmr_addr, low)) {
1863e74ab57bSariane 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1864e74ab57bSariane 	    	    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1865a3544580Soga 	    	    start, end));
1866a3544580Soga 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1867a3544580Soga 			return low;
1868a3544580Soga 	}
1869a3544580Soga 
187035164244Stedu 	/* Nothing found. */
1871a3544580Soga 	return NULL;
1872a3544580Soga }
1873a3544580Soga 
1874a3544580Soga  * Allocate any page, the fastest way. Page number constraints only.
 *
 * Each usable pmemrange is searched in three steps: the single-page
 * queue first, then the smallest entry in the size tree, and finally
 * an address-guided walk of the address tree.
1875a3544580Soga  */
1876a3544580Soga  */
1877dba5c185Smiod psize_t
1878a3544580Soga uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1879ef440d08Skettenis     paddr_t start, paddr_t end, int memtype_only)
1880a3544580Soga {
1881a3544580Soga 	struct	uvm_pmemrange *pmr;
1882a3544580Soga 	struct	vm_page *found, *splitpg;
1883a3544580Soga 	psize_t	fcount;
1884a3544580Soga 	int	memtype;
1885a3544580Soga 
1886a3544580Soga 	fcount = 0;
1887d04c0e5aSthib 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1888d04c0e5aSthib 		/* We're done. */
1889d04c0e5aSthib 		if (fcount == count)
1890d04c0e5aSthib 			break;
1891d04c0e5aSthib 
1892a3544580Soga 		/* Outside requested range. */
1893a3544580Soga 		if (!(start == 0 && end == 0) &&
1894a3544580Soga 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1895a3544580Soga 			continue;
1896a3544580Soga 
1897a3544580Soga 		/* Range is empty. */
1898a3544580Soga 		if (pmr->nsegs == 0)
1899a3544580Soga 			continue;
1900a3544580Soga 
1901d04c0e5aSthib 		/* Loop over all memtypes, starting at memtype_init. */
1902a3544580Soga 		memtype = memtype_init;
1903d04c0e5aSthib 		while (fcount != count) {
1904a3544580Soga 			found = TAILQ_FIRST(&pmr->single[memtype]);
1905a3544580Soga 			/*
1906a3544580Soga 			 * If found is outside the range, walk the list
1907a3544580Soga 			 * until we find something that intersects with
1908a3544580Soga 			 * boundaries.
1909a3544580Soga 			 */
1910a3544580Soga 			while (found && !PMR_INTERSECTS_WITH(
1911a3544580Soga 			    atop(VM_PAGE_TO_PHYS(found)),
1912a3544580Soga 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1913a3544580Soga 			    start, end))
1914a3544580Soga 				found = TAILQ_NEXT(found, pageq);
1915a3544580Soga 
1916a3544580Soga 			if (found == NULL) {
19176fd881a3Skettenis 				/*
19186fd881a3Skettenis 				 * Check if the size tree contains a range
19196fd881a3Skettenis 				 * that intersects with the boundaries. As the
19206fd881a3Skettenis 				 * allocation is for any page, try the smallest
19216fd881a3Skettenis 				 * range so that large ranges are preserved for
19226fd881a3Skettenis 				 * more constrained cases. Only one entry is
19236fd881a3Skettenis 				 * checked here, to avoid a brute-force search.
19246fd881a3Skettenis 				 *
19256fd881a3Skettenis 				 * Note that a size tree gives pg[1] instead of
19266fd881a3Skettenis 				 * pg[0].
19276fd881a3Skettenis 				 */
1928262a556aSdlg 				found = RBT_MIN(uvm_pmr_size,
19296fd881a3Skettenis 				    &pmr->size[memtype]);
193088fc093eSariane 				if (found != NULL) {
1931a3544580Soga 					found--;
19326fd881a3Skettenis 					if (!PMR_INTERSECTS_WITH(
19336fd881a3Skettenis 					    atop(VM_PAGE_TO_PHYS(found)),
19346fd881a3Skettenis 					    atop(VM_PAGE_TO_PHYS(found)) +
19356fd881a3Skettenis 					    found->fpgsz, start, end))
19366fd881a3Skettenis 						found = NULL;
19376fd881a3Skettenis 				}
19386fd881a3Skettenis 			}
19396fd881a3Skettenis 			if (found == NULL) {
19406fd881a3Skettenis 				/*
19416fd881a3Skettenis 				 * Try address-guided search to meet the page
19426fd881a3Skettenis 				 * number constraints.
19436fd881a3Skettenis 				 */
1944262a556aSdlg 				found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
19456fd881a3Skettenis 				if (found != NULL) {
1946a3544580Soga 					found = uvm_pmr_rootupdate(pmr, found,
1947a3544580Soga 					    start, end, memtype);
1948a3544580Soga 				}
194988fc093eSariane 			}
1950a3544580Soga 			if (found != NULL) {
1951a3544580Soga 				uvm_pmr_assertvalid(pmr);
1952a3544580Soga 				uvm_pmr_remove_size(pmr, found);
1953a3544580Soga 
1954a3544580Soga 				/*
1955a3544580Soga 				 * If the page intersects the end, then it'll
1956a3544580Soga 				 * need splitting.
1957a3544580Soga 				 *
1958a3544580Soga 				 * Note that we don't need to split if the page
1959a3544580Soga 				 * intersects start: the drain function will
1960a3544580Soga 				 * simply stop on hitting start.
1961a3544580Soga 				 */
1962a3544580Soga 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1963a3544580Soga 				    found->fpgsz > end) {
1964a3544580Soga 					psize_t splitsz =
1965a3544580Soga 					    atop(VM_PAGE_TO_PHYS(found)) +
1966a3544580Soga 					    found->fpgsz - end;
1967a3544580Soga 
1968a3544580Soga 					uvm_pmr_remove_addr(pmr, found);
1969a3544580Soga 					uvm_pmr_assertvalid(pmr);
1970a3544580Soga 					found->fpgsz -= splitsz;
1971a3544580Soga 					splitpg = found + found->fpgsz;
1972a3544580Soga 					splitpg->fpgsz = splitsz;
1973a3544580Soga 					uvm_pmr_insert(pmr, splitpg, 1);
1974a3544580Soga 
1975a3544580Soga 					/*
1976a3544580Soga 					 * At this point, splitpg and found
1977a3544580Soga 					 * actually should be joined.
1978a3544580Soga 					 * But we explicitly disable that,
1979a3544580Soga 					 * because we will start subtracting
1980a3544580Soga 					 * from found.
1981a3544580Soga 					 */
1982a3544580Soga 					KASSERT(start == 0 ||
1983a3544580Soga 					    atop(VM_PAGE_TO_PHYS(found)) +
1984a3544580Soga 					    found->fpgsz > start);
1985a3544580Soga 					uvm_pmr_insert_addr(pmr, found, 1);
1986a3544580Soga 				}
1987a3544580Soga 
1988a3544580Soga 				/*
1989a3544580Soga 				 * Fetch pages from the end.
1990a3544580Soga 				 * If the range is larger than the requested
1991a3544580Soga 				 * number of pages, this saves us an addr-tree
1992a3544580Soga 				 * update.
1993a3544580Soga 				 *
1994a3544580Soga 				 * Since we take from the end and insert at
1995a3544580Soga 				 * the head, contiguous ranges are preserved.
1996a3544580Soga 				 */
1997a3544580Soga 				while (found->fpgsz > 0 && fcount < count &&
1998a3544580Soga 				    (start == 0 ||
1999a3544580Soga 				    atop(VM_PAGE_TO_PHYS(found)) +
2000a3544580Soga 				    found->fpgsz > start)) {
2001a3544580Soga 					found->fpgsz--;
2002a3544580Soga 					fcount++;
2003a3544580Soga 					TAILQ_INSERT_HEAD(result,
2004a3544580Soga 					    &found[found->fpgsz], pageq);
2005a3544580Soga 				}
2006a3544580Soga 				if (found->fpgsz > 0) {
2007a3544580Soga 					uvm_pmr_insert_size(pmr, found);
2008a3544580Soga 					KDASSERT(fcount == count);
2009a3544580Soga 					uvm_pmr_assertvalid(pmr);
2010a3544580Soga 					return fcount;
2011a3544580Soga 				}
2012a3544580Soga 
2013a3544580Soga 				/*
2014a3544580Soga 				 * Delayed addr-tree removal.
2015a3544580Soga 				 */
2016a3544580Soga 				uvm_pmr_remove_addr(pmr, found);
2017a3544580Soga 				uvm_pmr_assertvalid(pmr);
2018a3544580Soga 			} else {
2019ef440d08Skettenis 				if (memtype_only)
2020ef440d08Skettenis 					break;
2021a3544580Soga 				/*
2022a3544580Soga 				 * Skip to the next memtype.
2023a3544580Soga 				 */
2024a3544580Soga 				memtype += 1;
2025a3544580Soga 				if (memtype == UVM_PMR_MEMTYPE_MAX)
2026a3544580Soga 					memtype = 0;
2027d04c0e5aSthib 				if (memtype == memtype_init)
2028d04c0e5aSthib 					break;
2029a3544580Soga 			}
2030d04c0e5aSthib 		}
2031a3544580Soga 	}
2032a3544580Soga 
2033a3544580Soga 	/*
2034a3544580Soga 	 * Search finished.
2035a3544580Soga 	 *
2036a3544580Soga 	 * Ran out of ranges before enough pages were gathered, or we hit the
2037a3544580Soga 	 * case where found->fpgsz == count - fcount, in which case the
2038a3544580Soga 	 * above exit condition didn't trigger.
2039a3544580Soga 	 *
2040a3544580Soga 	 * On failure, caller will free the pages.
2041a3544580Soga 	 */
2042a3544580Soga 	return fcount;
2043a3544580Soga }
2044a3544580Soga 
2045a3544580Soga #ifdef DDB
2046a3544580Soga /*
2047a3544580Soga  * Print information about pmemrange.
2048a3544580Soga  * Does not do locking (so either call it from DDB or acquire fpageq lock
2049a3544580Soga  * before invoking.
2050a3544580Soga  * before invoking).
2051a3544580Soga void
2052a3544580Soga uvm_pmr_print(void)
2053a3544580Soga {
2054a3544580Soga 	struct	uvm_pmemrange *pmr;
2055a3544580Soga 	struct	vm_page *pg;
2056a3544580Soga 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
2057a3544580Soga 	psize_t	free;
2058a3544580Soga 	int	useq_len;
2059a3544580Soga 	int	mt;
2060a3544580Soga 
2061a3544580Soga 	printf("Ranges, use queue:\n");
2062a3544580Soga 	useq_len = 0;
2063a3544580Soga 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
2064a3544580Soga 		useq_len++;
2065a3544580Soga 		free = 0;
2066a3544580Soga 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2067262a556aSdlg 			pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
2068a3544580Soga 			if (pg != NULL)
2069a3544580Soga 				pg--;
2070a3544580Soga 			else
2071a3544580Soga 				pg = TAILQ_FIRST(&pmr->single[mt]);
2072a3544580Soga 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
2073a3544580Soga 
2074262a556aSdlg 			RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
2075a3544580Soga 				free += pg->fpgsz;
2076a3544580Soga 		}
2077a3544580Soga 
2078a3544580Soga 		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
2079a3544580Soga 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
2080a3544580Soga 		    pmr->use, (unsigned long)pmr->nsegs);
2081a3544580Soga 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2082a3544580Soga 			printf(" maxsegsz[%d]=0x%lx", mt,
2083a3544580Soga 			    (unsigned long)size[mt]);
2084a3544580Soga 		}
2085a3544580Soga 		printf(" free=0x%lx\n", (unsigned long)free);
2086a3544580Soga 	}
2087a3544580Soga 	printf("#ranges = %d\n", useq_len);
2088a3544580Soga }
2089a3544580Soga #endif
2090205bb46dSariane 
209190ee2fe0Sbeck /*
209290ee2fe0Sbeck  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
209390ee2fe0Sbeck  * in a specific physmem area.
209490ee2fe0Sbeck  *
209590ee2fe0Sbeck  * Returns ENOMEM if the pagedaemon failed to free any pages and failok
209690ee2fe0Sbeck  * is set; if failok is not set, the failure is logged and the wait is
 * retried forever.
209790ee2fe0Sbeck  *
209890ee2fe0Sbeck  * Must be called with fpageq locked.
209990ee2fe0Sbeck  */
210090ee2fe0Sbeck int
210190ee2fe0Sbeck uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
210290ee2fe0Sbeck {
210390ee2fe0Sbeck 	struct uvm_pmalloc pma;
210490ee2fe0Sbeck 	const char *wmsg = "pmrwait";
210590ee2fe0Sbeck 
2106d60475a6Smpi 	KASSERT(curcpu()->ci_idepth == 0);
2107d60475a6Smpi 
210890ee2fe0Sbeck 	if (curproc == uvm.pagedaemon_proc) {
21099a0f4daeSbeck 		/*
21100bd995e1Stedu 		 * This is not that uncommon when the pagedaemon is trying
21110bd995e1Stedu 		 * to flush out a large mmapped file. VOP_WRITE will circle
21120bd995e1Stedu 		 * back through the buffer cache and try to get more memory.
21130bd995e1Stedu 		 * The pagedaemon starts by calling bufbackoff, but we can
21140bd995e1Stedu 		 * easily use up that reserve in a single scan iteration.
21150bd995e1Stedu 		 */
21160bd995e1Stedu 		uvm_unlock_fpageq();
21170b4f309dSmpi 		if (bufbackoff(NULL, atop(size)) >= atop(size)) {
21180bd995e1Stedu 			uvm_lock_fpageq();
21190bd995e1Stedu 			return 0;
21200bd995e1Stedu 		}
21210bd995e1Stedu 		uvm_lock_fpageq();
21220bd995e1Stedu 
21230bd995e1Stedu 		/*
21249a0f4daeSbeck 		 * XXX detect pagedaemon deadlock - see comment in
21259a0f4daeSbeck 		 * uvm_wait(), as this is exactly the same issue.
21269a0f4daeSbeck 		 */
21279a0f4daeSbeck 		printf("pagedaemon: wait_pla deadlock detected!\n");
21284bc97b15Scheloha 		msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
21294bc97b15Scheloha 		    MSEC_TO_NSEC(125));
21309a0f4daeSbeck #if defined(DEBUG)
21319a0f4daeSbeck 		/* DEBUG: panic so we can debug it */
21329a0f4daeSbeck 		panic("wait_pla pagedaemon deadlock");
21339a0f4daeSbeck #endif
213490ee2fe0Sbeck 		return 0;
213590ee2fe0Sbeck 	}
213690ee2fe0Sbeck 
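	/*
	 * Queue a uvm_pmalloc describing the constrained request, wake
	 * the pagedaemon and sleep until uvm_wakeup_pla() or the daemon
	 * marks the request freed or failed.
	 */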
213790ee2fe0Sbeck 	for (;;) {
213890ee2fe0Sbeck 		pma.pm_constraint.ucr_low = low;
213990ee2fe0Sbeck 		pma.pm_constraint.ucr_high = high;
214090ee2fe0Sbeck 		pma.pm_size = size;
214190ee2fe0Sbeck 		pma.pm_flags = UVM_PMA_LINKED;
214290ee2fe0Sbeck 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
214390ee2fe0Sbeck 
214490ee2fe0Sbeck 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
214590ee2fe0Sbeck 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
2146744ebc9fSmpi 			msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
214790ee2fe0Sbeck 
214890ee2fe0Sbeck 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
214990ee2fe0Sbeck 		    pma.pm_flags & UVM_PMA_FAIL) {
215090ee2fe0Sbeck 			if (failok)
215190ee2fe0Sbeck 				return ENOMEM;
215290ee2fe0Sbeck 			printf("uvm_wait: failed to free %ld pages between "
215390ee2fe0Sbeck 			    "0x%lx-0x%lx\n", atop(size), low, high);
215490ee2fe0Sbeck 		} else
215590ee2fe0Sbeck 			return 0;
215690ee2fe0Sbeck 	}
215790ee2fe0Sbeck 	/* UNREACHABLE */
215890ee2fe0Sbeck }
215990ee2fe0Sbeck 
216090ee2fe0Sbeck /*
216190ee2fe0Sbeck  * Wake up uvm_pmalloc sleepers.
216290ee2fe0Sbeck  */
216390ee2fe0Sbeck void
216490ee2fe0Sbeck uvm_wakeup_pla(paddr_t low, psize_t len)
216590ee2fe0Sbeck {
216690ee2fe0Sbeck 	struct uvm_pmalloc *pma, *pma_next;
216790ee2fe0Sbeck 	paddr_t high;
216890ee2fe0Sbeck 
216990ee2fe0Sbeck 	high = low + len;
217090ee2fe0Sbeck 
217135164244Stedu 	/* Wake specific allocations waiting for this memory. */
217290ee2fe0Sbeck 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
217390ee2fe0Sbeck 	    pma = pma_next) {
217490ee2fe0Sbeck 		pma_next = TAILQ_NEXT(pma, pmq);
217590ee2fe0Sbeck 
217690ee2fe0Sbeck 		if (low < pma->pm_constraint.ucr_high &&
217790ee2fe0Sbeck 		    high > pma->pm_constraint.ucr_low) {
217890ee2fe0Sbeck 			pma->pm_flags |= UVM_PMA_FREED;
217990ee2fe0Sbeck 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
218090ee2fe0Sbeck 				pma->pm_flags &= ~UVM_PMA_LINKED;
218190ee2fe0Sbeck 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
218290ee2fe0Sbeck 				    pmq);
218390ee2fe0Sbeck 				wakeup(pma);
218490ee2fe0Sbeck 			}
218590ee2fe0Sbeck 		}
218690ee2fe0Sbeck 	}
218790ee2fe0Sbeck }
2188ef440d08Skettenis 
2189ef440d08Skettenis void
2190ef440d08Skettenis uvm_pagezero_thread(void *arg)
2191ef440d08Skettenis {
2192ef440d08Skettenis 	struct pglist pgl;
2193ef440d08Skettenis 	struct vm_page *pg;
2194ef440d08Skettenis 	int count;
2195ef440d08Skettenis 
2196ef440d08Skettenis 	/* Run at the lowest possible priority. */
2197ef440d08Skettenis 	curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2198ef440d08Skettenis 
2199ef440d08Skettenis 	KERNEL_UNLOCK();
2200ef440d08Skettenis 
22012dfb4493Sguenther 	TAILQ_INIT(&pgl);
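	/*
	 * Main loop: take up to 16 dirty pages at a time off the free
	 * lists, zero them without holding fpageq, then put them back
	 * as zeroed pages and update the statistics.
	 */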
2202ef440d08Skettenis 	for (;;) {
2203ef440d08Skettenis 		uvm_lock_fpageq();
2204ef440d08Skettenis 		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2205ef440d08Skettenis 		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2206ef440d08Skettenis 		     &pgl, 0, 0, 1)) == 0) {
22072404448fSjsg 			msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
22082404448fSjsg 			    MAXPRI, "pgzero", INFSLP);
2209ef440d08Skettenis 		}
2210ef440d08Skettenis 		uvm_unlock_fpageq();
2211ef440d08Skettenis 
2212ef440d08Skettenis 		TAILQ_FOREACH(pg, &pgl, pageq) {
2213ef440d08Skettenis 			uvm_pagezero(pg);
2214ef440d08Skettenis 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2215ef440d08Skettenis 		}
2216ef440d08Skettenis 
2217ef440d08Skettenis 		uvm_lock_fpageq();
2218ef440d08Skettenis 		while (!TAILQ_EMPTY(&pgl))
2219ef440d08Skettenis 			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2220ef440d08Skettenis 		uvmexp.zeropages += count;
2221ef440d08Skettenis 		uvm_unlock_fpageq();
2222ef440d08Skettenis 
2223ef440d08Skettenis 		yield();
2224ef440d08Skettenis 	}
2225ef440d08Skettenis }
222682673a18Smpi 
222782673a18Smpi #if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU)
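/*
 * Refill an empty magazine with UVM_PMR_CACHEMAGSZ pages from the
 * global allocator, without sleeping or waking the pagedaemon.
 * Returns 0 on success, -1 if no pages could be allocated.
 */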
222882673a18Smpi int
222982673a18Smpi uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci)
223082673a18Smpi {
223182673a18Smpi 	struct vm_page *pg;
223282673a18Smpi 	struct pglist pgl;
223382673a18Smpi 	int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE;
223482673a18Smpi 	int npages = UVM_PMR_CACHEMAGSZ;
223582673a18Smpi 
223682673a18Smpi 	splassert(IPL_VM);
223782673a18Smpi 	KASSERT(upci->upci_npages == 0);
223882673a18Smpi 
223982673a18Smpi 	TAILQ_INIT(&pgl);
224082673a18Smpi 	if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl))
224182673a18Smpi 		return -1;
224282673a18Smpi 
224382673a18Smpi 	while ((pg = TAILQ_FIRST(&pgl)) != NULL) {
224482673a18Smpi 		TAILQ_REMOVE(&pgl, pg, pageq);
224582673a18Smpi 		upci->upci_pages[upci->upci_npages] = pg;
224682673a18Smpi 		upci->upci_npages++;
224782673a18Smpi 	}
224882673a18Smpi 	atomic_add_int(&uvmexp.percpucaches, npages);
224982673a18Smpi 
225082673a18Smpi 	return 0;
225182673a18Smpi }
225282673a18Smpi 
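/*
 * Take a page from the per-CPU cache, a two-magazine scheme: allocate
 * from the active magazine and, when it runs dry, swap to the other
 * one, refilling it from the global allocator if both are empty.
 */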
225382673a18Smpi struct vm_page *
225482673a18Smpi uvm_pmr_cache_get(int flags)
225582673a18Smpi {
225682673a18Smpi 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
225782673a18Smpi 	struct uvm_pmr_cache_item *upci;
225882673a18Smpi 	struct vm_page *pg;
225982673a18Smpi 	int s;
226082673a18Smpi 
226182673a18Smpi 	s = splvm();
226282673a18Smpi 	upci = &upc->upc_magz[upc->upc_actv];
226382673a18Smpi 	if (upci->upci_npages == 0) {
226482673a18Smpi 		unsigned int prev;
226582673a18Smpi 
226682673a18Smpi 		prev = (upc->upc_actv == 0) ? 1 : 0;
226782673a18Smpi 		upci = &upc->upc_magz[prev];
226882673a18Smpi 		if (upci->upci_npages == 0) {
226982673a18Smpi 			atomic_inc_int(&uvmexp.pcpmiss);
227082673a18Smpi 			if (uvm_pmr_cache_alloc(upci)) {
227182673a18Smpi 				splx(s);
227282673a18Smpi 				return uvm_pmr_getone(flags);
227382673a18Smpi 			}
227482673a18Smpi 		}
227582673a18Smpi 		/* Swap magazines */
227682673a18Smpi 		upc->upc_actv = prev;
227782673a18Smpi 	} else {
227882673a18Smpi 		atomic_inc_int(&uvmexp.pcphit);
227982673a18Smpi 	}
228082673a18Smpi 
228182673a18Smpi 	atomic_dec_int(&uvmexp.percpucaches);
228282673a18Smpi 	upci->upci_npages--;
228382673a18Smpi 	pg = upci->upci_pages[upci->upci_npages];
228482673a18Smpi 	splx(s);
228582673a18Smpi 
228682673a18Smpi 	if (flags & UVM_PLA_ZERO)
228782673a18Smpi 		uvm_pagezero(pg);
228882673a18Smpi 
228982673a18Smpi 	return pg;
229082673a18Smpi }
229182673a18Smpi 
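/*
 * Flush a magazine back to the global allocator.
 * Returns the number of pages freed.
 */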
2292804e3d51Smpi unsigned int
229382673a18Smpi uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci)
229482673a18Smpi {
229582673a18Smpi 	struct pglist pgl;
229682673a18Smpi 	unsigned int i;
229782673a18Smpi 
229882673a18Smpi 	splassert(IPL_VM);
229982673a18Smpi 
230082673a18Smpi 	TAILQ_INIT(&pgl);
230182673a18Smpi 	for (i = 0; i < upci->upci_npages; i++)
230282673a18Smpi 		TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq);
230382673a18Smpi 
230482673a18Smpi 	uvm_pmr_freepageq(&pgl);
230582673a18Smpi 
230682673a18Smpi 	atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages);
230782673a18Smpi 	upci->upci_npages = 0;
230882673a18Smpi 	memset(upci->upci_pages, 0, sizeof(upci->upci_pages));
2309804e3d51Smpi 
2310804e3d51Smpi 	return i;
231182673a18Smpi }
231282673a18Smpi 
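/*
 * Return a page to the per-CPU cache; when the active magazine is
 * full, flush the other one and swap.
 */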
231382673a18Smpi void
231482673a18Smpi uvm_pmr_cache_put(struct vm_page *pg)
231582673a18Smpi {
231682673a18Smpi 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
231782673a18Smpi 	struct uvm_pmr_cache_item *upci;
23189326bf8dSmpi 	struct uvm_pmemrange *pmr;
231982673a18Smpi 	int s;
232082673a18Smpi 
23219326bf8dSmpi 	/*
23229326bf8dSmpi 	 * Always give back low pages to the allocator to not accelerate
23239326bf8dSmpi 	 * their exhaustion.
23249326bf8dSmpi 	 */
23259326bf8dSmpi 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
23269326bf8dSmpi 	if (pmr->use > 0) {
23279326bf8dSmpi 		uvm_pmr_freepages(pg, 1);
23289326bf8dSmpi 		return;
23299326bf8dSmpi 	}
23309326bf8dSmpi 
233182673a18Smpi 	s = splvm();
233282673a18Smpi 	upci = &upc->upc_magz[upc->upc_actv];
233382673a18Smpi 	if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) {
233482673a18Smpi 		unsigned int prev;
233582673a18Smpi 
233682673a18Smpi 		prev = (upc->upc_actv == 0) ? 1 : 0;
233782673a18Smpi 		upci = &upc->upc_magz[prev];
233882673a18Smpi 		if (upci->upci_npages > 0)
233982673a18Smpi 			uvm_pmr_cache_free(upci);
234082673a18Smpi 
234182673a18Smpi 		/* Swap magazines */
234282673a18Smpi 		upc->upc_actv = prev;
234382673a18Smpi 		KASSERT(upci->upci_npages == 0);
234482673a18Smpi 	}
234582673a18Smpi 
234682673a18Smpi 	upci->upci_pages[upci->upci_npages] = pg;
234782673a18Smpi 	upci->upci_npages++;
234882673a18Smpi 	atomic_inc_int(&uvmexp.percpucaches);
234982673a18Smpi 	splx(s);
235082673a18Smpi }
235182673a18Smpi 
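/*
 * Empty both local magazines.
 * Returns the number of pages given back to the global allocator.
 */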
2352804e3d51Smpi unsigned int
235382673a18Smpi uvm_pmr_cache_drain(void)
235482673a18Smpi {
235582673a18Smpi 	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2356804e3d51Smpi 	unsigned int freed = 0;
235782673a18Smpi 	int s;
235882673a18Smpi 
235982673a18Smpi 	s = splvm();
2360804e3d51Smpi 	freed += uvm_pmr_cache_free(&upc->upc_magz[0]);
2361804e3d51Smpi 	freed += uvm_pmr_cache_free(&upc->upc_magz[1]);
236282673a18Smpi 	splx(s);
2363804e3d51Smpi 
2364804e3d51Smpi 	return freed;
236582673a18Smpi }
236682673a18Smpi 
236782673a18Smpi #else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */
236882673a18Smpi 
236982673a18Smpi struct vm_page *
237082673a18Smpi uvm_pmr_cache_get(int flags)
237182673a18Smpi {
237282673a18Smpi 	return uvm_pmr_getone(flags);
237382673a18Smpi }
237482673a18Smpi 
237582673a18Smpi void
237682673a18Smpi uvm_pmr_cache_put(struct vm_page *pg)
237782673a18Smpi {
237882673a18Smpi 	uvm_pmr_freepages(pg, 1);
237982673a18Smpi }
238082673a18Smpi 
2381804e3d51Smpi unsigned int
238282673a18Smpi uvm_pmr_cache_drain(void)
238382673a18Smpi {
2384804e3d51Smpi 	return 0;
238582673a18Smpi }
238682673a18Smpi #endif
2387