1 /*	$OpenBSD: uvm_pmemrange.c,v 1.54 2019/05/09 20:36:44 beck Exp $	*/
2 
3 /*
4  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <uvm/uvm.h>
22 #include <sys/malloc.h>
23 #include <sys/kernel.h>
24 #include <sys/kthread.h>
25 #include <sys/mount.h>
26 
27 /*
28  * 2 trees: addr tree and size tree.
29  *
30  * The allocator keeps chunks of free pages (called a range).
31  * Two pages are part of the same range if:
32  * - all pages in between are part of that range,
33  * - they are of the same memory type (zeroed or non-zeroed),
34  * - they are part of the same pmemrange.
35  * A pmemrange is a range of memory which is part of the same vm_physseg
36  * and has a use-count.
37  *
38  * addr tree is vm_page[0].objt
39  * size tree is vm_page[1].objt
40  *
41  * The size tree is not used for memory ranges of 1 page; instead,
42  * the single-page queue vm_page[0].pageq is used.
43  *
44  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45  * are joined, unless:
46  * - they have pages in between them which are not free
47  * - they belong to different memtypes (zeroed vs dirty memory)
48  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49  * - they are not a continuation of the same array
50  * The latter issue is caused by vm_physseg ordering and splitting from the
51  * MD initialization machinery. The MD code is dependent on freelists and
52  * happens to split ISA memory from non-ISA memory.
53  * (Note: freelists die die die!)
54  *
55  * uvm_page_init guarantees that every vm_physseg contains an array of
56  * struct vm_page. Also, uvm_page_physload allocates an array of struct
57  * vm_page. This code depends on that array. The array may break across
58  * vm_physseg boundaries.
59  */
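/*
 * Illustrative layout sketch (not part of the allocator): a free range of
 * three pages starting at struct vm_page *pg is encoded as
 *
 *	pg[0].fpgsz = 3		length, stored in the first page only
 *	pg[0].objt		node in the address tree
 *	pg[1].objt		node in the size tree (ranges > 1 page)
 *	pg[1].fpgsz = pg[2].fpgsz = 0
 *
 * A one-page range skips the size tree and is kept on the single[memtype]
 * queue through pg[0].pageq instead.
 */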
60 
61 /*
62  * Validate the flags of the page. (Used in asserts.)
63  * Any free page must have the PQ_FREE flag set.
64  * Free pages may be zeroed.
65  * Pmap flags are left untouched.
66  *
67  * The PQ_FREE flag is not checked here: by not checking, we can easily use
68  * this check on pages that are being freed.
69  */
70 #define VALID_FLAGS(pg_flags)						\
71 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
72 
73 /* Tree comparators. */
74 int	uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
75 	    const struct uvm_pmemrange *);
76 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
77 int	uvm_pmr_pg_to_memtype(struct vm_page *);
78 
79 #ifdef DDB
80 void	uvm_pmr_print(void);
81 #endif
82 
83 /*
84  * Memory types. The page flags are used to derive what the current memory
85  * type of a page is.
86  */
87 int
88 uvm_pmr_pg_to_memtype(struct vm_page *pg)
89 {
90 	if (pg->pg_flags & PG_ZERO)
91 		return UVM_PMR_MEMTYPE_ZERO;
92 	/* Default: dirty memory. */
93 	return UVM_PMR_MEMTYPE_DIRTY;
94 }
95 
96 /* Trees. */
97 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
98 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
99 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
100     uvm_pmemrange_addr_cmp);
101 
102 /* Validation. */
103 #ifdef DEBUG
104 void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
105 #else
106 #define uvm_pmr_assertvalid(pmr)	do {} while (0)
107 #endif
108 
109 psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
110 			    paddr_t, paddr_t, int);
111 
112 struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
113 struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
114 struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
115 			    struct vm_page *, int);
116 void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
117 			    struct vm_page *pg, struct vm_page **pg_prev,
118 			    struct vm_page **pg_next);
119 struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
120 			    struct vm_page *, paddr_t);
121 psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
122 			    struct vm_page **, int);
123 void			 uvm_pmr_split(paddr_t);
124 struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
125 struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
126 			    struct uvm_pmemrange *);
127 psize_t			 pow2divide(psize_t, psize_t);
128 struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
129 			    struct vm_page *, paddr_t, paddr_t, int);
130 
131 /*
132  * Computes num/denom and rounds it up to the next power-of-2.
133  *
134  * This is a division function which calculates an approximation of
135  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
136  * have to be accurate.
137  *
138  * Too large a result makes the allocator slightly faster, at the
139  * risk of hitting the failure case more often. Too small a result makes
140  * the allocator a bit slower, but less likely to hit a failure case.
141  */
142 psize_t
143 pow2divide(psize_t num, psize_t denom)
144 {
145 	int rshift;
146 
147 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
148 		;
149 	return (paddr_t)1 << rshift;
150 }
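/*
 * Worked example: pow2divide(12, 5) shifts denom 5 -> 10 -> 20 and stops
 * after two shifts, returning 1 << 2 = 4 (12/5 = 2.4, rounded up to the
 * next power of two).
 */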
151 
152 /*
153  * Predicate: lhs is a subrange of rhs.
154  *
155  * If rhs_low == 0: don't care about lower bound.
156  * If rhs_high == 0: don't care about upper bound.
157  */
158 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
159 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
160 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
161 
162 /*
163  * Predicate: lhs intersects with rhs.
164  *
165  * If rhs_low == 0: don't care about lower bound.
166  * If rhs_high == 0: don't care about upper bound.
167  * Ranges don't intersect if they don't have any page in common; array
168  * semantics mean that < instead of <= should be used here.
169  */
170 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
171 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
172 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
173 
174 /*
175  * Align to power-of-2 alignment.
176  */
177 #define PMR_ALIGN(pgno, align)						\
178 	(((pgno) + ((align) - 1)) & ~((align) - 1))
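/*
 * Example: PMR_ALIGN(0x1003, 0x10) == 0x1010, while an already aligned
 * page number such as PMR_ALIGN(0x1000, 0x10) is left unchanged.
 */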
179 
180 
181 /*
182  * Comparator: sort by address ascending.
183  */
184 int
185 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
186     const struct uvm_pmemrange *rhs)
187 {
188 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
189 }
190 
191 /*
192  * Comparator: sort by use ascending.
193  *
194  * The higher the use value of a range, the more devices need memory in
195  * this range. Therefore allocate from the range with the lowest use first.
196  */
197 int
198 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
199 {
200 	int result;
201 
202 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
203 	if (result == 0)
204 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
205 	return result;
206 }
207 
208 int
209 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
210 {
211 	paddr_t lhs_addr, rhs_addr;
212 
213 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
214 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
215 
216 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
217 }
218 
219 int
220 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
221 {
222 	psize_t lhs_size, rhs_size;
223 	int cmp;
224 
225 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
226 	lhs_size = (lhs - 1)->fpgsz;
227 	rhs_size = (rhs - 1)->fpgsz;
228 
229 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
230 	if (cmp == 0)
231 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
232 	return cmp;
233 }
234 
235 /*
236  * Find the first range of free pages that is at least sz pages long.
237  */
238 struct vm_page *
239 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
240 {
241 	struct	vm_page *node, *best;
242 
243 	KASSERT(sz >= 1);
244 
245 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
246 		return TAILQ_FIRST(&pmr->single[mti]);
247 
248 	node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
249 	best = NULL;
250 	while (node != NULL) {
251 		if ((node - 1)->fpgsz >= sz) {
252 			best = (node - 1);
253 			node = RBT_LEFT(uvm_objtree, node);
254 		} else
255 			node = RBT_RIGHT(uvm_objtree, node);
256 	}
257 	return best;
258 }
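/*
 * Example: with free ranges of 2, 4, 4 and 8 pages and sz = 3, the descent
 * above settles on a 4-page range (the lower-addressed one, since
 * uvm_pmr_size_cmp breaks size ties by address). For sz == 1 the single
 * page queue is consulted first, which is cheaper than a tree lookup.
 */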
259 
260 /*
261  * Finds the next range. The next range has a size >= pg->fpgsz.
262  * Returns NULL if no more ranges are available.
263  */
264 struct vm_page *
265 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
266 {
267 	struct vm_page *npg;
268 
269 	KASSERT(pmr != NULL && pg != NULL);
270 	if (pg->fpgsz == 1) {
271 		if (TAILQ_NEXT(pg, pageq) != NULL)
272 			return TAILQ_NEXT(pg, pageq);
273 		else
274 			npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
275 	} else
276 		npg = RBT_NEXT(uvm_pmr_size, pg + 1);
277 
278 	return npg == NULL ? NULL : npg - 1;
279 }
280 
281 /*
282  * Finds the previous and next ranges relative to the (uninserted) pg range.
283  *
284  * *pg_prev == NULL if no previous range that can join with pg is
285  * 	available.
286  * *pg_next == NULL if no next range that can join with pg is
287  * 	available.
288  */
289 void
290 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
291     struct vm_page **pg_prev, struct vm_page **pg_next)
292 {
293 	KASSERT(pg_prev != NULL && pg_next != NULL);
294 
295 	*pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
296 	if (*pg_next == NULL)
297 		*pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
298 	else
299 		*pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
300 
301 	KDASSERT(*pg_next == NULL ||
302 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
303 	KDASSERT(*pg_prev == NULL ||
304 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
305 
306 	/* Reset if not contig. */
307 	if (*pg_prev != NULL &&
308 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
309 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
310 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
311 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
312 		*pg_prev = NULL;
313 	if (*pg_next != NULL &&
314 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
315 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
316 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
317 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
318 		*pg_next = NULL;
319 	return;
320 }
321 
322 /*
323  * Remove a range from the address tree.
324  * Address tree maintains pmr counters.
325  */
326 void
327 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
328 {
329 	KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
330 	KDASSERT(pg->pg_flags & PQ_FREE);
331 	RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
332 
333 	pmr->nsegs--;
334 }
335 /*
336  * Remove a range from the size tree.
337  */
338 void
339 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
340 {
341 	int memtype;
342 #ifdef DEBUG
343 	struct vm_page *i;
344 #endif
345 
346 	KDASSERT(pg->fpgsz >= 1);
347 	KDASSERT(pg->pg_flags & PQ_FREE);
348 	memtype = uvm_pmr_pg_to_memtype(pg);
349 
350 	if (pg->fpgsz == 1) {
351 #ifdef DEBUG
352 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
353 			if (i == pg)
354 				break;
355 		}
356 		KDASSERT(i == pg);
357 #endif
358 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
359 	} else {
360 		KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
361 		    pg + 1) == pg + 1);
362 		RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
363 	}
364 }
365 /* Remove from both trees. */
366 void
367 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
368 {
369 	uvm_pmr_assertvalid(pmr);
370 	uvm_pmr_remove_size(pmr, pg);
371 	uvm_pmr_remove_addr(pmr, pg);
372 	uvm_pmr_assertvalid(pmr);
373 }
374 
375 /*
376  * Insert the range described in pg.
377  * Returns the range thus created (which may be joined with the previous and
378  * next ranges).
379  * If no_join, the caller guarantees that the range cannot possibly join
380  * with adjacent ranges.
381  */
382 struct vm_page *
383 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
384 {
385 	struct vm_page *prev, *next;
386 
387 #ifdef DEBUG
388 	struct vm_page *i;
389 	int mt;
390 #endif
391 
392 	KDASSERT(pg->pg_flags & PQ_FREE);
393 	KDASSERT(pg->fpgsz >= 1);
394 
395 #ifdef DEBUG
396 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
397 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
398 			KDASSERT(i != pg);
399 		if (pg->fpgsz > 1) {
400 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
401 			    pg + 1) == NULL);
402 		}
403 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
404 	}
405 #endif
406 
407 	if (!no_join) {
408 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
409 		if (next != NULL) {
410 			uvm_pmr_remove_size(pmr, next);
411 			uvm_pmr_remove_addr(pmr, next);
412 			pg->fpgsz += next->fpgsz;
413 			next->fpgsz = 0;
414 		}
415 		if (prev != NULL) {
416 			uvm_pmr_remove_size(pmr, prev);
417 			prev->fpgsz += pg->fpgsz;
418 			pg->fpgsz = 0;
419 			return prev;
420 		}
421 	}
422 
423 	RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
424 
425 	pmr->nsegs++;
426 
427 	return pg;
428 }
429 /*
430  * Insert the size of the range described in pg.
431  * The range goes into the size tree, or onto the single-page queue if
432  * it is only one page long.
433  * The page must already be in the address tree.
434  */
435 void
436 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
437 {
438 	int memtype;
439 #ifdef DEBUG
440 	struct vm_page *i;
441 	int mti;
442 #endif
443 
444 	KDASSERT(pg->fpgsz >= 1);
445 	KDASSERT(pg->pg_flags & PQ_FREE);
446 
447 	memtype = uvm_pmr_pg_to_memtype(pg);
448 #ifdef DEBUG
449 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
450 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
451 			KDASSERT(i != pg);
452 		if (pg->fpgsz > 1) {
453 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
454 			    pg + 1) == NULL);
455 		}
456 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
457 	}
458 	for (i = pg; i < pg + pg->fpgsz; i++)
459 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
460 #endif
461 
462 	if (pg->fpgsz == 1)
463 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
464 	else
465 		RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
466 }
467 /* Insert in both trees. */
468 struct vm_page *
469 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
470 {
471 	uvm_pmr_assertvalid(pmr);
472 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
473 	uvm_pmr_insert_size(pmr, pg);
474 	uvm_pmr_assertvalid(pmr);
475 	return pg;
476 }
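/*
 * Example of joining: inserting a free range covering pages [20, 24) while
 * [16, 20) and [24, 32) are already free, of the same memtype and backed by
 * the same contiguous vm_page array, collapses all three into a single
 * range [16, 32) headed by the page at 16, which is what gets returned.
 */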
477 
478 /*
479  * Find the last page that is part of this segment.
480  * => pg: the range at which to start the search.
481  * => boundary: the page number boundary specification (0 = no boundary).
482  * => pmr: the pmemrange of the page.
483  *
484  * This function returns the page just before the next range, so to get
485  * the next range, run TAILQ_NEXT(result, pageq) on the result.
486  * The reason is that this way, the length of the segment is easily
487  * calculated using: atop(result) - atop(pg) + 1.
488  * Hence this function also never returns NULL.
489  */
490 struct vm_page *
491 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
492     struct vm_page *pg, paddr_t boundary)
493 {
494 	paddr_t	first_boundary;
495 	struct	vm_page *next;
496 	struct	vm_page *prev;
497 
498 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
499 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
500 	if (boundary != 0) {
501 		first_boundary =
502 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
503 	} else
504 		first_boundary = 0;
505 
506 	/*
507 	 * Increase next until it hits the first page of the next segment.
508 	 *
509 	 * While loop checks the following:
510 	 * - next != NULL	we have not reached the end of pgl
511 	 * - boundary == 0 || next < first_boundary
512 	 *			we do not cross a boundary
513 	 * - atop(prev) + 1 == atop(next)
514 	 *			still in the same segment
515 	 * - low <= last
516 	 * - high > last	still in the same memory range
517 	 * - memtype is equal	allocator is unable to view different memtypes
518 	 *			as part of the same segment
519 	 * - prev + 1 == next	no array breakage occurs
520 	 */
521 	prev = pg;
522 	next = TAILQ_NEXT(prev, pageq);
523 	while (next != NULL &&
524 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
525 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
526 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
527 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
528 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
529 	    prev + 1 == next) {
530 		prev = next;
531 		next = TAILQ_NEXT(prev, pageq);
532 	}
533 
534 	/*
535 	 * End of this segment.
536 	 */
537 	return prev;
538 }
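/*
 * Example: with boundary = 16 and pg at page number 10, first_boundary is
 * PMR_ALIGN(11, 16) = 16, so the segment returned here never extends past
 * page 15, even if page 16 and beyond are free and contiguous.
 */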
539 
540 /*
541  * Remove the first segment of contiguous pages from pgl.
542  * A segment ends if it crosses boundary (unless boundary = 0) or
543  * if it would enter a different uvm_pmemrange.
544  *
545  * Work: the page range that the caller is currently working with.
546  * May be null.
547  *
548  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
549  * the first segment is erased (which, if called by uvm_pmr_getpages(),
550  * probably is the smallest or very close to it).
551  */
552 psize_t
553 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
554     struct vm_page **work, int is_desperate)
555 {
556 	struct vm_page *start, *end, *iter, *iter_end, *inserted;
557 	psize_t count;
558 	struct uvm_pmemrange *pmr, *pmr_iter;
559 
560 	KASSERT(!TAILQ_EMPTY(pgl));
561 
562 	/*
563 	 * Initialize to first page.
564 	 * Unless desperate scan finds a better candidate, this is what'll be
565 	 * erased.
566 	 */
567 	start = TAILQ_FIRST(pgl);
568 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
569 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
570 
571 	/*
572 	 * If we are desperate, we _really_ want to get rid of the smallest
573 	 * element (rather than a close match to the smallest element).
574 	 */
575 	if (is_desperate) {
576 		/* Linear search for smallest segment. */
577 		pmr_iter = pmr;
578 		for (iter = TAILQ_NEXT(end, pageq);
579 		    iter != NULL && start != end;
580 		    iter = TAILQ_NEXT(iter_end, pageq)) {
581 			/*
582 			 * Only update pmr if it doesn't match current
583 			 * iteration.
584 			 */
585 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
586 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
587 				pmr_iter = uvm_pmemrange_find(atop(
588 				    VM_PAGE_TO_PHYS(iter)));
589 			}
590 
591 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
592 			    boundary);
593 
594 			/*
595 			 * Current iteration is smaller than best match so
596 			 * far; update.
597 			 */
598 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
599 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
600 				start = iter;
601 				end = iter_end;
602 				pmr = pmr_iter;
603 			}
604 		}
605 	}
606 
607 	/*
608 	 * Calculate count and end of the list.
609 	 */
610 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
611 	end = TAILQ_NEXT(end, pageq);
612 
613 	/*
614 	 * Actually remove the range of pages.
615 	 *
616 	 * Sadly, this cannot be done using pointer iteration:
617 	 * vm_physseg is not guaranteed to be sorted on address, hence
618 	 * uvm_page_init() may not have initialized its array sorted by
619 	 * page number.
620 	 */
621 	for (iter = start; iter != end; iter = iter_end) {
622 		iter_end = TAILQ_NEXT(iter, pageq);
623 		TAILQ_REMOVE(pgl, iter, pageq);
624 	}
625 
626 	start->fpgsz = count;
627 	inserted = uvm_pmr_insert(pmr, start, 0);
628 
629 	/*
630 	 * If the caller was working on a range and this function modified
631 	 * that range, update the pointer.
632 	 */
633 	if (work != NULL && *work != NULL &&
634 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
635 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
636 	    atop(VM_PAGE_TO_PHYS(*work)))
637 		*work = inserted;
638 	return count;
639 }
640 
641 /*
642  * Extract a number of pages from a segment of free pages.
643  * Called by uvm_pmr_getpages.
644  *
645  * Returns the segment that was created from pages left over at the tail
646  * of the removed set of pages, or NULL if no pages were left at the tail.
647  */
648 struct vm_page *
649 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
650     paddr_t start, paddr_t end, struct pglist *result)
651 {
652 	struct vm_page *after, *pg_i;
653 	psize_t before_sz, after_sz;
654 #ifdef DEBUG
655 	psize_t i;
656 #endif
657 
658 	KDASSERT(end > start);
659 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
660 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
661 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
662 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
663 
664 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
665 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
666 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
667 	uvm_pmr_assertvalid(pmr);
668 
669 	uvm_pmr_remove_size(pmr, pg);
670 	if (before_sz == 0)
671 		uvm_pmr_remove_addr(pmr, pg);
672 	after = pg + before_sz + (end - start);
673 
674 	/* Add selected pages to result. */
675 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
676 		KASSERT(pg_i->pg_flags & PQ_FREE);
677 		pg_i->fpgsz = 0;
678 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
679 	}
680 
681 	/* Before handling. */
682 	if (before_sz > 0) {
683 		pg->fpgsz = before_sz;
684 		uvm_pmr_insert_size(pmr, pg);
685 	}
686 
687 	/* After handling. */
688 	if (after_sz > 0) {
689 #ifdef DEBUG
690 		for (i = 0; i < after_sz; i++) {
691 			KASSERT(!uvm_pmr_isfree(after + i));
692 		}
693 #endif
694 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
695 		after->fpgsz = after_sz;
696 		after = uvm_pmr_insert_addr(pmr, after, 1);
697 		uvm_pmr_insert_size(pmr, after);
698 	}
699 
700 	uvm_pmr_assertvalid(pmr);
701 	return (after_sz > 0 ? after : NULL);
702 }
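/*
 * Worked example: for a free range pg covering page numbers [100, 110),
 * extracting start = 103, end = 107 gives before_sz = 3 and after_sz = 3;
 * pages 103-106 are appended to result, pg shrinks to [100, 103) and a new
 * range is inserted for [107, 110), which is returned.
 */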
703 
704 /*
705  * Indicate to the page daemon that a nowait call failed and it should
706  * recover at least some memory in the most restricted region (assumed
707  * to be dma_constraint).
708  */
709 extern volatile int uvm_nowait_failed;
710 
711 /*
712  * Acquire a number of pages.
713  *
714  * count:	the number of pages returned
715  * start:	lowest page number
716  * end:		highest page number +1
717  * 		(start = end = 0: no limitation)
718  * align:	power-of-2 alignment constraint (align = 1: no alignment)
719  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
720  * maxseg:	maximum number of segments to return
721  * flags:	UVM_PLA_* flags
722  * result:	returned pages storage (uses pageq)
723  */
724 int
725 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
726     paddr_t boundary, int maxseg, int flags, struct pglist *result)
727 {
728 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
729 	struct	vm_page *found, *f_next; /* Iterate chunks. */
730 	psize_t	fcount;			/* Current found pages. */
731 	int	fnsegs;			/* Current segment counter. */
732 	int	try, start_try;
733 	psize_t	search[3];
734 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
735 	int	memtype;		/* Requested memtype. */
736 	int	memtype_init;		/* Best memtype. */
737 	int	desperate;		/* True if allocation failed. */
738 #ifdef DIAGNOSTIC
739 	struct	vm_page *diag_prev;	/* Used during validation. */
740 #endif /* DIAGNOSTIC */
741 
742 	/*
743 	 * Validate arguments.
744 	 */
745 	KASSERT(count > 0);
746 	KASSERT(start == 0 || end == 0 || start < end);
747 	KASSERT(align >= 1);
748 	KASSERT(powerof2(align));
749 	KASSERT(maxseg > 0);
750 	KASSERT(boundary == 0 || powerof2(boundary));
751 	KASSERT(boundary == 0 || maxseg * boundary >= count);
752 	KASSERT(TAILQ_EMPTY(result));
753 
754 	/*
755 	 * TRYCONTIG is a noop if you only want a single segment.
756 	 * Remove it if that's the case: otherwise it'll deny the fast
757 	 * allocation.
758 	 */
759 	if (maxseg == 1 || count == 1)
760 		flags &= ~UVM_PLA_TRYCONTIG;
761 
762 	/*
763 	 * Configure search.
764 	 *
765 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
766 	 * search[1] is multiple segments, chosen to fulfill the search in
767 	 *   approximately even-sized segments.
768 	 *   This is a good trade-off between slightly reduced allocation speed
769 	 *   and less fragmentation.
770 	 * search[2] is the worst case, in which all segments are evaluated.
771 	 *   This provides the least fragmentation, but makes the search
772  *   possibly longer (although by the time it is selected, that hardly
773  *   matters anymore).
774 	 *
775 	 * The exception is when maxseg == 1: since we can only fulfill that
776 	 * with one segment of size pages, only a single search type has to
777 	 * be attempted.
778 	 */
779 	if (maxseg == 1 || count == 1) {
780 		start_try = 2;
781 		search[2] = count;
782 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
783 		start_try = 2;
784 		search[2] = 1;
785 	} else {
786 		start_try = 0;
787 		search[0] = count;
788 		search[1] = pow2divide(count, maxseg);
789 		search[2] = 1;
790 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
791 			start_try = 1;
792 		if (search[1] >= search[0]) {
793 			search[1] = search[0];
794 			start_try = 1;
795 		}
796 		if (search[2] >= search[start_try]) {
797 			start_try = 2;
798 		}
799 	}
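	/*
	 * Example: count = 64, maxseg = 8 with UVM_PLA_TRYCONTIG set yields
	 * search[0] = 64 (one segment), search[1] = pow2divide(64, 8) = 8
	 * (even-sized segments) and search[2] = 1 (worst case), with
	 * start_try = 0 so the contiguous attempt runs first.
	 */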
800 
801 	/*
802 	 * Memory type: if zeroed memory is requested, traverse the zero set.
803 	 * Otherwise, traverse the dirty set.
804 	 *
805 	 * The memtype iterator is reinitialized to memtype_init on entrance
806 	 * of a pmemrange.
807 	 */
808 	if (flags & UVM_PLA_ZERO)
809 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
810 	else
811 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
812 
813 	/*
814 	 * Initially, we're not desperate.
815 	 *
816 	 * Note that if we return from a sleep, we are still desperate.
817 	 * Chances are that memory pressure is still high, so resetting
818 	 * seems over-optimistic to me.
819 	 */
820 	desperate = 0;
821 
822 	uvm_lock_fpageq();
823 
824 retry:		/* Return point after sleeping. */
825 	fcount = 0;
826 	fnsegs = 0;
827 
828 retry_desperate:
829 	/*
830 	 * If we just want any page(s), go for the really fast option.
831 	 */
832 	if (count <= maxseg && align == 1 && boundary == 0 &&
833 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
834 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
835 		    result, start, end, 0);
836 
837 		/*
838 		 * If we found sufficient pages, go to the success exit code.
839 		 *
840 		 * Otherwise, go immediately to fail, since we collected
841 		 * all we could anyway.
842 		 */
843 		if (fcount == count)
844 			goto out;
845 		else
846 			goto fail;
847 	}
848 
849 	/*
850 	 * The heart of the contig case.
851 	 *
852 	 * The code actually looks like this:
853 	 *
854 	 * foreach (struct pmemrange) {
855 	 *	foreach (memtype) {
856 	 *		foreach(try) {
857 	 *			foreach (free range of memtype in pmemrange,
858 	 *			    starting at search[try]) {
859 	 *				while (range has space left)
860 	 *					take from range
861 	 *			}
862 	 *		}
863 	 *	}
864 	 *
865 	 *	if next pmemrange has higher usecount than current:
866 	 *		enter desperate case (which will drain the pmemranges
867 	 *		until empty prior to moving to the next one)
868 	 * }
869 	 *
870 	 * When desperate is activated, try always starts at the highest
871 	 * value. The memtype loop uses a goto to the rescan_memtype label.
872 	 * The try loop uses a goto to the rescan label.
873 	 * The 'range has space left' loop uses the drain_found label.
874 	 *
875 	 * Writing them all as loops would take up a lot of screen space in
876 	 * the form of indentation and some parts are easier to express
877 	 * using the labels.
878 	 */
879 
880 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
881 		/* Empty range. */
882 		if (pmr->nsegs == 0)
883 			continue;
884 
885 		/* Outside requested range. */
886 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
887 			continue;
888 
889 		memtype = memtype_init;
890 
891 rescan_memtype:	/* Return point at memtype++. */
892 		try = start_try;
893 
894 rescan:		/* Return point at try++. */
895 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
896 		    found != NULL;
897 		    found = f_next) {
898 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
899 
900 			fstart = atop(VM_PAGE_TO_PHYS(found));
901 			if (start != 0)
902 				fstart = MAX(start, fstart);
903 drain_found:
904 			/*
905 			 * Throw away the first segment if fnsegs == maxseg
906 			 *
907 			 * Note that f_next is still valid after this call,
908 			 * since we only allocated from entries before f_next.
909 			 * We don't revisit the entries we already extracted
910 			 * from unless we entered the desperate case.
911 			 */
912 			if (fnsegs == maxseg) {
913 				fnsegs--;
914 				fcount -=
915 				    uvm_pmr_remove_1strange(result, boundary,
916 				    &found, desperate);
917 			}
918 
919 			fstart = PMR_ALIGN(fstart, align);
920 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
921 			if (end != 0)
922 				fend = MIN(end, fend);
923 			if (boundary != 0) {
924 				fend =
925 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
926 			}
927 			if (fstart >= fend)
928 				continue;
929 			if (fend - fstart > count - fcount)
930 				fend = fstart + (count - fcount);
931 
932 			fcount += fend - fstart;
933 			fnsegs++;
934 			found = uvm_pmr_extract_range(pmr, found,
935 			    fstart, fend, result);
936 
937 			if (fcount == count)
938 				goto out;
939 
940 			/*
941 			 * If there's still space left in found, try to
942 			 * fully drain it prior to continuing.
943 			 */
944 			if (found != NULL) {
945 				fstart = fend;
946 				goto drain_found;
947 			}
948 		}
949 
950 		/* Try a smaller search now. */
951 		if (++try < nitems(search))
952 			goto rescan;
953 
954 		/*
955 		 * Exhaust all memory types prior to going to the next memory
956 		 * segment.
957 		 * This means that zero-vs-dirty are eaten prior to moving
958 		 * to a pmemrange with a higher use-count.
959 		 *
960 		 * Code is basically a difficult way of writing:
961 		 * memtype = memtype_init;
962 		 * do {
963 		 *	...;
964 		 *	memtype += 1;
965 		 *	memtype %= MEMTYPE_MAX;
966 		 * } while (memtype != memtype_init);
967 		 */
968 		memtype += 1;
969 		if (memtype == UVM_PMR_MEMTYPE_MAX)
970 			memtype = 0;
971 		if (memtype != memtype_init)
972 			goto rescan_memtype;
973 
974 		/*
975 		 * If not desperate, enter desperate case prior to eating all
976 		 * the good stuff in the next range.
977 		 */
978 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
979 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
980 			break;
981 	}
982 
983 	/*
984 	 * Not enough memory of the requested type available. Fall back to
985 	 * less good memory that we'll clean up better later.
986 	 *
987 	 * This algorithm is not very smart though, it just starts scanning
988 	 * a different typed range, but the nicer ranges of the previous
989 	 * iteration may fall out. Hence there is a small chance of a false
990 	 * negative.
991 	 *
992 	 * When desperate: scan all free ranges, starting at the smallest
993 	 * search size (1 page), and do not consider UVM_PLA_TRYCONTIG (which may
994 	 * allow us to hit the fast path now).
995 	 *
996 	 * Also, because we will revisit entries we scanned before, we need
997 	 * to reset the page queue, or we may end up releasing entries in
998 	 * such a way as to invalidate f_next.
999 	 */
1000 	if (!desperate) {
1001 		desperate = 1;
1002 		start_try = nitems(search) - 1;
1003 		flags &= ~UVM_PLA_TRYCONTIG;
1004 
1005 		while (!TAILQ_EMPTY(result))
1006 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1007 		fnsegs = 0;
1008 		fcount = 0;
1009 		goto retry_desperate;
1010 	}
1011 
1012 fail:
1013 	/* Allocation failed. */
1014 	/* XXX: claim from memory reserve here */
1015 
1016 	while (!TAILQ_EMPTY(result))
1017 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1018 
1019 	if (flags & UVM_PLA_WAITOK) {
1020 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1021 		    flags & UVM_PLA_FAILOK) == 0)
1022 			goto retry;
1023 		KASSERT(flags & UVM_PLA_FAILOK);
1024 	} else {
1025 		if (!(flags & UVM_PLA_NOWAKE)) {
1026 			uvm_nowait_failed = 1;
1027 			wakeup(&uvm.pagedaemon);
1028 		}
1029 	}
1030 	uvm_unlock_fpageq();
1031 
1032 	return ENOMEM;
1033 
1034 out:
1035 	/* Allocation successful. */
1036 	uvmexp.free -= fcount;
1037 
1038 	uvm_unlock_fpageq();
1039 
1040 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1041 #ifdef DIAGNOSTIC
1042 	fnsegs = 0;
1043 	fcount = 0;
1044 	diag_prev = NULL;
1045 #endif /* DIAGNOSTIC */
1046 	TAILQ_FOREACH(found, result, pageq) {
1047 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1048 
1049 		if (found->pg_flags & PG_ZERO) {
1050 			uvm_lock_fpageq();
1051 			uvmexp.zeropages--;
1052 			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1053 				wakeup(&uvmexp.zeropages);
1054 			uvm_unlock_fpageq();
1055 		}
1056 		if (flags & UVM_PLA_ZERO) {
1057 			if (found->pg_flags & PG_ZERO)
1058 				uvmexp.pga_zerohit++;
1059 			else {
1060 				uvmexp.pga_zeromiss++;
1061 				uvm_pagezero(found);
1062 			}
1063 		}
1064 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1065 
1066 		found->uobject = NULL;
1067 		found->uanon = NULL;
1068 		found->pg_version++;
1069 
1070 		/*
1071 		 * Validate that the page matches the range criteria.
1072 		 */
1073 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1074 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1075 
1076 #ifdef DIAGNOSTIC
1077 		/*
1078 		 * Update fcount (# found pages) and
1079 		 * fnsegs (# found segments) counters.
1080 		 */
1081 		if (diag_prev == NULL ||
1082 		    /* new segment if it contains a hole */
1083 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1084 		    atop(VM_PAGE_TO_PHYS(found)) ||
1085 		    /* new segment if it crosses boundary */
1086 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1087 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1088 			fnsegs++;
1089 		fcount++;
1090 
1091 		diag_prev = found;
1092 #endif /* DIAGNOSTIC */
1093 	}
1094 
1095 #ifdef DIAGNOSTIC
1096 	/*
1097 	 * Panic on algorithm failure.
1098 	 */
1099 	if (fcount != count || fnsegs > maxseg) {
1100 		panic("pmemrange allocation error: "
1101 		    "allocated %ld pages in %d segments, "
1102 		    "but request was %ld pages in %d segments",
1103 		    fcount, fnsegs, count, maxseg);
1104 	}
1105 #endif /* DIAGNOSTIC */
1106 
1107 	return 0;
1108 }
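/*
 * Usage sketch (illustrative only; most callers go through the
 * uvm_pglistalloc() wrapper): allocate four zeroed, physically contiguous
 * pages below 16MB in a single segment.
 *
 *	struct pglist pgl;
 *	int error;
 *
 *	TAILQ_INIT(&pgl);
 *	error = uvm_pmr_getpages(4, 0, atop(16 * 1024 * 1024), 1, 0, 1,
 *	    UVM_PLA_ZERO | UVM_PLA_WAITOK, &pgl);
 *	if (error == 0)
 *		... pages are linked on pgl through pageq ...
 */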
1109 
1110 /*
1111  * Free a number of contig pages (invoked by uvm_page_init).
1112  */
1113 void
1114 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1115 {
1116 	struct uvm_pmemrange *pmr;
1117 	psize_t i, pmr_count;
1118 	struct vm_page *firstpg = pg;
1119 
1120 	for (i = 0; i < count; i++) {
1121 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1122 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1123 
1124 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1125 		    VALID_FLAGS(pg[i].pg_flags))) {
1126 			printf("Flags: 0x%x, will panic now.\n",
1127 			    pg[i].pg_flags);
1128 		}
1129 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1130 		    VALID_FLAGS(pg[i].pg_flags));
1131 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1132 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1133 	}
1134 
1135 	uvm_lock_fpageq();
1136 
1137 	for (i = count; i > 0; i -= pmr_count) {
1138 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1139 		KASSERT(pmr != NULL);
1140 
1141 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1142 		pg->fpgsz = pmr_count;
1143 		uvm_pmr_insert(pmr, pg, 0);
1144 
1145 		uvmexp.free += pmr_count;
1146 		pg += pmr_count;
1147 	}
1148 	wakeup(&uvmexp.free);
1149 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1150 		wakeup(&uvmexp.zeropages);
1151 
1152 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1153 
1154 	uvm_unlock_fpageq();
1155 }
1156 
1157 /*
1158  * Free all pages in the queue.
1159  */
1160 void
1161 uvm_pmr_freepageq(struct pglist *pgl)
1162 {
1163 	struct vm_page *pg;
1164 	paddr_t pstart;
1165 	psize_t plen;
1166 
1167 	TAILQ_FOREACH(pg, pgl, pageq) {
1168 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1169 		    VALID_FLAGS(pg->pg_flags))) {
1170 			printf("Flags: 0x%x, will panic now.\n",
1171 			    pg->pg_flags);
1172 		}
1173 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1174 		    VALID_FLAGS(pg->pg_flags));
1175 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1176 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1177 	}
1178 
1179 	uvm_lock_fpageq();
1180 	while (!TAILQ_EMPTY(pgl)) {
1181 		pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1182 		plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1183 		uvmexp.free += plen;
1184 
1185 		uvm_wakeup_pla(pstart, ptoa(plen));
1186 	}
1187 	wakeup(&uvmexp.free);
1188 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1189 		wakeup(&uvmexp.zeropages);
1190 	uvm_unlock_fpageq();
1191 
1192 	return;
1193 }
1194 
1195 /*
1196  * Store a pmemrange in the list.
1197  *
1198  * The list is sorted by use.
1199  */
1200 struct uvm_pmemrange *
1201 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1202     struct uvm_pmemrange *pmr)
1203 {
1204 	struct uvm_pmemrange *iter;
1205 	int cmp = 1;
1206 
1207 	TAILQ_FOREACH(iter, useq, pmr_use) {
1208 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1209 		if (cmp == 0)
1210 			return iter;
1211 		if (cmp == -1)
1212 			break;
1213 	}
1214 
1215 	if (iter == NULL)
1216 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1217 	else
1218 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1219 	return NULL;
1220 }
1221 
1222 #ifdef DEBUG
1223 /*
1224  * Validation of the whole pmemrange.
1225  * Called with fpageq locked.
1226  */
1227 void
1228 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1229 {
1230 	struct vm_page *prev, *next, *i, *xref;
1231 	int lcv, mti;
1232 
1233 	/* Empty range */
1234 	if (pmr->nsegs == 0)
1235 		return;
1236 
1237 	/* Validate address tree. */
1238 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1239 		/* Validate the range. */
1240 		KASSERT(i->fpgsz > 0);
1241 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1242 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1243 		    <= pmr->high);
1244 
1245 		/* Validate each page in this range. */
1246 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1247 			/*
1248 			 * Only the first page has a size specification.
1249 			 * Rest is size 0.
1250 			 */
1251 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1252 			/*
1253 			 * Flag check.
1254 			 */
1255 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1256 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1257 			/*
1258 			 * Free pages are:
1259 			 * - not wired
1260 			 * - have no vm_anon
1261 			 * - have no uvm_object
1262 			 */
1263 			KASSERT(i[lcv].wire_count == 0);
1264 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1265 			    i[lcv].uanon == NULL);
1266 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1267 			    i[lcv].uobject == NULL);
1268 			/*
1269 			 * Pages in a single range always have the same
1270 			 * memtype.
1271 			 */
1272 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1273 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1274 		}
1275 
1276 		/* Check that it shouldn't be joined with its predecessor. */
1277 		prev = RBT_PREV(uvm_pmr_addr, i);
1278 		if (prev != NULL) {
1279 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1280 			    uvm_pmr_pg_to_memtype(prev) ||
1281 			    atop(VM_PAGE_TO_PHYS(i)) >
1282 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1283 			    prev + prev->fpgsz != i);
1284 		}
1285 
1286 		/* Assert i is in the size tree as well. */
1287 		if (i->fpgsz == 1) {
1288 			TAILQ_FOREACH(xref,
1289 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1290 				if (xref == i)
1291 					break;
1292 			}
1293 			KASSERT(xref == i);
1294 		} else {
1295 			KASSERT(RBT_FIND(uvm_pmr_size,
1296 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1297 			    i + 1);
1298 		}
1299 	}
1300 
1301 	/* Validate size tree. */
1302 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1303 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1304 			next = uvm_pmr_nextsz(pmr, i, mti);
1305 			if (next != NULL) {
1306 				KASSERT(i->fpgsz <=
1307 				    next->fpgsz);
1308 			}
1309 
1310 			/* Assert i is in the addr tree as well. */
1311 			KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1312 
1313 			/* Assert i is of the correct memory type. */
1314 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1315 		}
1316 	}
1317 
1318 	/* Validate nsegs statistic. */
1319 	lcv = 0;
1320 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1321 		lcv++;
1322 	KASSERT(pmr->nsegs == lcv);
1323 }
1324 #endif /* DEBUG */
1325 
1326 /*
1327  * Split pmr at split point pageno.
1328  * Called with fpageq unlocked.
1329  *
1330  * Split is only applied if a pmemrange spans pageno.
1331  */
1332 void
1333 uvm_pmr_split(paddr_t pageno)
1334 {
1335 	struct uvm_pmemrange *pmr, *drain;
1336 	struct vm_page *rebuild, *prev, *next;
1337 	psize_t prev_sz;
1338 
1339 	uvm_lock_fpageq();
1340 	pmr = uvm_pmemrange_find(pageno);
1341 	if (pmr == NULL || !(pmr->low < pageno)) {
1342 		/* No split required. */
1343 		uvm_unlock_fpageq();
1344 		return;
1345 	}
1346 
1347 	KASSERT(pmr->low < pageno);
1348 	KASSERT(pmr->high > pageno);
1349 
1350 	/*
1351 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1352 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1353 	 * a bit hard, so we just race!
1354 	 */
1355 	uvm_unlock_fpageq();
1356 	drain = uvm_pmr_allocpmr();
1357 	uvm_lock_fpageq();
1358 	pmr = uvm_pmemrange_find(pageno);
1359 	if (pmr == NULL || !(pmr->low < pageno)) {
1360 		/*
1361 		 * We lost the race since someone else ran this or a related
1362 		 * function, however this should be triggered very rarely so
1363 		 * we just leak the pmr.
1364 		 */
1365 		printf("uvm_pmr_split: lost one pmr\n");
1366 		uvm_unlock_fpageq();
1367 		return;
1368 	}
1369 
1370 	drain->low = pageno;
1371 	drain->high = pmr->high;
1372 	drain->use = pmr->use;
1373 
1374 	uvm_pmr_assertvalid(pmr);
1375 	uvm_pmr_assertvalid(drain);
1376 	KASSERT(drain->nsegs == 0);
1377 
1378 	RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1379 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1380 			break;
1381 	}
1382 	if (rebuild == NULL)
1383 		prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1384 	else
1385 		prev = RBT_PREV(uvm_pmr_addr, rebuild);
1386 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1387 
1388 	/*
1389 	 * Handle free chunk that spans the split point.
1390 	 */
1391 	if (prev != NULL &&
1392 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1393 		psize_t before, after;
1394 
1395 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1396 
1397 		uvm_pmr_remove(pmr, prev);
1398 		prev_sz = prev->fpgsz;
1399 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1400 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1401 
1402 		KASSERT(before > 0);
1403 		KASSERT(after > 0);
1404 
1405 		prev->fpgsz = before;
1406 		uvm_pmr_insert(pmr, prev, 1);
1407 		(prev + before)->fpgsz = after;
1408 		uvm_pmr_insert(drain, prev + before, 1);
1409 	}
1410 
1411 	/* Move free chunks that no longer fall in the range. */
1412 	for (; rebuild != NULL; rebuild = next) {
1413 		next = RBT_NEXT(uvm_pmr_addr, rebuild);
1414 
1415 		uvm_pmr_remove(pmr, rebuild);
1416 		uvm_pmr_insert(drain, rebuild, 1);
1417 	}
1418 
1419 	pmr->high = pageno;
1420 	uvm_pmr_assertvalid(pmr);
1421 	uvm_pmr_assertvalid(drain);
1422 
1423 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1424 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1425 	uvm_unlock_fpageq();
1426 }
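/*
 * Example: if a pmemrange covers pages [0, 4096), uvm_pmr_split(1024)
 * shrinks it to [0, 1024) and hands [1024, 4096) to the newly allocated
 * drain range; a free chunk spanning [1020, 1030) is cut into [1020, 1024)
 * (kept in pmr) and [1024, 1030) (moved to drain).
 */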
1427 
1428 /*
1429  * Increase the usage counter for the given range of memory.
1430  *
1431  * The higher the usage counter of a given range of memory, the harder
1432  * the allocator will try not to allocate from it.
1433  *
1434  * Addresses here are in paddr_t, not page-numbers.
1435  * The lowest and highest allowed address are specified.
1436  */
1437 void
1438 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1439 {
1440 	struct uvm_pmemrange *pmr;
1441 	paddr_t sz;
1442 
1443 	/* pmr uses page numbers, translate low and high. */
1444 	high++;
1445 	high = atop(trunc_page(high));
1446 	low = atop(round_page(low));
1447 	uvm_pmr_split(low);
1448 	uvm_pmr_split(high);
1449 
1450 	sz = 0;
1451 	uvm_lock_fpageq();
1452 	/* Increase use count on segments in range. */
1453 	RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1454 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1455 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1456 			pmr->use++;
1457 			sz += pmr->high - pmr->low;
1458 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1459 		}
1460 		uvm_pmr_assertvalid(pmr);
1461 	}
1462 	uvm_unlock_fpageq();
1463 
1464 	KASSERT(sz >= high - low);
1465 }
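/*
 * Example (assuming 4KB pages): uvm_pmr_use_inc(0, 0xffffff) translates the
 * bounds to page numbers [0, 4096), splits any pmemrange spanning page 4096
 * and increments the use counter of every pmemrange inside [0, 4096), so
 * that ISA-style low memory is only handed out when nothing else will do.
 */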
1466 
1467 /*
1468  * Allocate a pmemrange.
1469  *
1470  * If called from uvm_page_init, the uvm_pageboot_alloc is used.
1471  * If called after uvm_init, malloc is used.
1472  * (And if called in between, you're dead.)
1473  */
1474 struct uvm_pmemrange *
1475 uvm_pmr_allocpmr(void)
1476 {
1477 	struct uvm_pmemrange *nw;
1478 	int i;
1479 
1480 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1481 	if (!uvm.page_init_done) {
1482 		nw = (struct uvm_pmemrange *)
1483 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1484 	} else {
1485 		nw = malloc(sizeof(struct uvm_pmemrange),
1486 		    M_VMMAP, M_NOWAIT);
1487 	}
1488 	KASSERT(nw != NULL);
1489 	memset(nw, 0, sizeof(struct uvm_pmemrange));
1490 	RBT_INIT(uvm_pmr_addr, &nw->addr);
1491 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1492 		RBT_INIT(uvm_pmr_size, &nw->size[i]);
1493 		TAILQ_INIT(&nw->single[i]);
1494 	}
1495 	return nw;
1496 }
1497 
1498 /*
1499  * Initialization of pmr.
1500  * Called by uvm_page_init.
1501  *
1502  * Sets up pmemranges.
1503  */
1504 void
1505 uvm_pmr_init(void)
1506 {
1507 	struct uvm_pmemrange *new_pmr;
1508 	int i;
1509 
1510 	TAILQ_INIT(&uvm.pmr_control.use);
1511 	RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1512 	TAILQ_INIT(&uvm.pmr_control.allocs);
1513 
1514 	/* By default, one range for the entire address space. */
1515 	new_pmr = uvm_pmr_allocpmr();
1516 	new_pmr->low = 0;
1517 	new_pmr->high = atop((paddr_t)-1) + 1;
1518 
1519 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1520 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1521 
1522 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1523 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1524 	    	    uvm_md_constraints[i]->ucr_high);
1525 	}
1526 }
1527 
1528 /*
1529  * Find the pmemrange that contains the given page number.
1530  *
1531  * (Manually traverses the binary tree, because that is cheaper on stack
1532  * usage.)
1533  */
1534 struct uvm_pmemrange *
1535 uvm_pmemrange_find(paddr_t pageno)
1536 {
1537 	struct uvm_pmemrange *pmr;
1538 
1539 	pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1540 	while (pmr != NULL) {
1541 		if (pmr->low > pageno)
1542 			pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1543 		else if (pmr->high <= pageno)
1544 			pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1545 		else
1546 			break;
1547 	}
1548 
1549 	return pmr;
1550 }
1551 
1552 #if defined(DDB) || defined(DEBUG)
1553 /*
1554  * Return true if the given page is in any of the free lists.
1555  * Used by uvm_page_printit.
1556  * This function is safe, even if the page is not on the freeq.
1557  * Note: does not apply locking, only called from ddb.
1558  */
1559 int
1560 uvm_pmr_isfree(struct vm_page *pg)
1561 {
1562 	struct vm_page *r;
1563 	struct uvm_pmemrange *pmr;
1564 
1565 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1566 	if (pmr == NULL)
1567 		return 0;
1568 	r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1569 	if (r == NULL)
1570 		r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1571 	else if (r != pg)
1572 		r = RBT_PREV(uvm_pmr_addr, r);
1573 	if (r == NULL)
1574 		return 0; /* Empty tree. */
1575 
1576 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1577 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1578 	    atop(VM_PAGE_TO_PHYS(pg));
1579 }
1580 #endif /* DDB || DEBUG */
1581 
1582 /*
1583  * Given a root of a tree, find a range which intersects start, end and
1584  * is of the same memtype.
1585  *
1586  * Page must be in the address tree.
1587  */
1588 struct vm_page*
1589 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1590     paddr_t start, paddr_t end, int memtype)
1591 {
1592 	int	direction;
1593 	struct	vm_page *root;
1594 	struct	vm_page *high, *high_next;
1595 	struct	vm_page *low, *low_next;
1596 
1597 	KDASSERT(pmr != NULL && init_root != NULL);
1598 	root = init_root;
1599 
1600 	/* Which direction to use for searching. */
1601 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1602 		direction =  1;
1603 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1604 		direction = -1;
1605 	else /* nothing to do */
1606 		return root;
1607 
1608 	/* First, update root to fall within the chosen range. */
1609 	while (root && !PMR_INTERSECTS_WITH(
1610 	    atop(VM_PAGE_TO_PHYS(root)),
1611 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1612 	    start, end)) {
1613 		if (direction == 1)
1614 			root = RBT_RIGHT(uvm_objtree, root);
1615 		else
1616 			root = RBT_LEFT(uvm_objtree, root);
1617 	}
1618 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1619 		return root;
1620 
1621 	/*
1622 	 * Root is valid, but of the wrong memtype.
1623 	 *
1624 	 * Try to find a range that has the given memtype in the subtree
1625 	 * (memtype mismatches are costly, either because the conversion
1626 	 * is expensive, or a later allocation will need to do the opposite
1627 	 * conversion, which will be expensive).
1628 	 *
1629 	 *
1630 	 * First, simply increase address until we hit something we can use.
1631 	 * Cache the upper page, so we can page-walk later.
1632 	 */
1633 	high = root;
1634 	high_next = RBT_RIGHT(uvm_objtree, high);
1635 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1636 	    atop(VM_PAGE_TO_PHYS(high_next)),
1637 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1638 	    start, end)) {
1639 		high = high_next;
1640 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1641 			return high;
1642 		high_next = RBT_RIGHT(uvm_objtree, high);
1643 	}
1644 
1645 	/*
1646 	 * Second, decrease the address until we hit something we can use.
1647 	 * Cache the lower page, so we can page-walk later.
1648 	 */
1649 	low = root;
1650 	low_next = RBT_LEFT(uvm_objtree, low);
1651 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1652 	    atop(VM_PAGE_TO_PHYS(low_next)),
1653 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1654 	    start, end)) {
1655 		low = low_next;
1656 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1657 			return low;
1658 		low_next = RBT_LEFT(uvm_objtree, low);
1659 	}
1660 
1661 	if (low == high)
1662 		return NULL;
1663 
1664 	/* No hits. Walk the address tree until we find something usable. */
1665 	for (low = RBT_NEXT(uvm_pmr_addr, low);
1666 	    low != high;
1667 	    low = RBT_NEXT(uvm_pmr_addr, low)) {
1668 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1669 	    	    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1670 	    	    start, end));
1671 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1672 			return low;
1673 	}
1674 
1675 	/* Nothing found. */
1676 	return NULL;
1677 }
1678 
1679 /*
1680  * Allocate any page, the fastest way. Page number constraints only.
1681  */
1682 psize_t
1683 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1684     paddr_t start, paddr_t end, int memtype_only)
1685 {
1686 	struct	uvm_pmemrange *pmr;
1687 	struct	vm_page *found, *splitpg;
1688 	psize_t	fcount;
1689 	int	memtype;
1690 
1691 	fcount = 0;
1692 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1693 		/* We're done. */
1694 		if (fcount == count)
1695 			break;
1696 
1697 		/* Outside requested range. */
1698 		if (!(start == 0 && end == 0) &&
1699 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1700 			continue;
1701 
1702 		/* Range is empty. */
1703 		if (pmr->nsegs == 0)
1704 			continue;
1705 
1706 		/* Loop over all memtypes, starting at memtype_init. */
1707 		memtype = memtype_init;
1708 		while (fcount != count) {
1709 			found = TAILQ_FIRST(&pmr->single[memtype]);
1710 			/*
1711 			 * If found is outside the range, walk the list
1712 			 * until we find something that intersects with
1713 			 * boundaries.
1714 			 */
1715 			while (found && !PMR_INTERSECTS_WITH(
1716 			    atop(VM_PAGE_TO_PHYS(found)),
1717 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1718 			    start, end))
1719 				found = TAILQ_NEXT(found, pageq);
1720 
1721 			if (found == NULL) {
1722 				/*
1723 				 * Check if the size tree contains a range
1724 				 * that intersects with the boundaries. As the
1725 				 * allocation is for any page, try the smallest
1726 				 * range so that large ranges are preserved for
1727 				 * more constrained cases. Only one entry is
1728 				 * checked here, to avoid a brute-force search.
1729 				 *
1730 				 * Note that a size tree gives pg[1] instead of
1731 				 * pg[0].
1732 				 */
1733 				found = RBT_MIN(uvm_pmr_size,
1734 				    &pmr->size[memtype]);
1735 				if (found != NULL) {
1736 					found--;
1737 					if (!PMR_INTERSECTS_WITH(
1738 					    atop(VM_PAGE_TO_PHYS(found)),
1739 					    atop(VM_PAGE_TO_PHYS(found)) +
1740 					    found->fpgsz, start, end))
1741 						found = NULL;
1742 				}
1743 			}
1744 			if (found == NULL) {
1745 				/*
1746 				 * Try address-guided search to meet the page
1747 				 * number constraints.
1748 				 */
1749 				found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1750 				if (found != NULL) {
1751 					found = uvm_pmr_rootupdate(pmr, found,
1752 					    start, end, memtype);
1753 				}
1754 			}
1755 			if (found != NULL) {
1756 				uvm_pmr_assertvalid(pmr);
1757 				uvm_pmr_remove_size(pmr, found);
1758 
1759 				/*
1760 				 * If the page intersects the end, then it'll
1761 				 * need splitting.
1762 				 *
1763 				 * Note that we don't need to split if the page
1764 				 * intersects start: the drain function will
1765 				 * simply stop on hitting start.
1766 				 */
1767 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1768 				    found->fpgsz > end) {
1769 					psize_t splitsz =
1770 					    atop(VM_PAGE_TO_PHYS(found)) +
1771 					    found->fpgsz - end;
1772 
1773 					uvm_pmr_remove_addr(pmr, found);
1774 					uvm_pmr_assertvalid(pmr);
1775 					found->fpgsz -= splitsz;
1776 					splitpg = found + found->fpgsz;
1777 					splitpg->fpgsz = splitsz;
1778 					uvm_pmr_insert(pmr, splitpg, 1);
1779 
1780 					/*
1781 					 * At this point, splitpg and found
1782 					 * actually should be joined.
1783 					 * But we explicitly disable that,
1784 					 * because we will start subtracting
1785 					 * from found.
1786 					 */
1787 					KASSERT(start == 0 ||
1788 					    atop(VM_PAGE_TO_PHYS(found)) +
1789 					    found->fpgsz > start);
1790 					uvm_pmr_insert_addr(pmr, found, 1);
1791 				}
1792 
1793 				/*
1794 				 * Fetch pages from the end.
1795 				 * If the range is larger than the requested
1796 				 * number of pages, this saves us an addr-tree
1797 				 * update.
1798 				 *
1799 				 * Since we take from the end and insert at
1800 				 * the head, the ranges stay intact.
1801 				 */
1802 				while (found->fpgsz > 0 && fcount < count &&
1803 				    (start == 0 ||
1804 				    atop(VM_PAGE_TO_PHYS(found)) +
1805 				    found->fpgsz > start)) {
1806 					found->fpgsz--;
1807 					fcount++;
1808 					TAILQ_INSERT_HEAD(result,
1809 					    &found[found->fpgsz], pageq);
1810 				}
1811 				if (found->fpgsz > 0) {
1812 					uvm_pmr_insert_size(pmr, found);
1813 					KDASSERT(fcount == count);
1814 					uvm_pmr_assertvalid(pmr);
1815 					return fcount;
1816 				}
1817 
1818 				/*
1819 				 * Delayed addr-tree removal.
1820 				 */
1821 				uvm_pmr_remove_addr(pmr, found);
1822 				uvm_pmr_assertvalid(pmr);
1823 			} else {
1824 				if (memtype_only)
1825 					break;
1826 				/*
1827 				 * Skip to the next memtype.
1828 				 */
1829 				memtype += 1;
1830 				if (memtype == UVM_PMR_MEMTYPE_MAX)
1831 					memtype = 0;
1832 				if (memtype == memtype_init)
1833 					break;
1834 			}
1835 		}
1836 	}
1837 
1838 	/*
1839 	 * Search finished.
1840 	 *
1841 	 * Ran out of ranges before enough pages were gathered, or we hit the
1842 	 * case where found->fpgsz == count - fcount, in which case the
1843 	 * above exit condition didn't trigger.
1844 	 *
1845 	 * On failure, caller will free the pages.
1846 	 */
1847 	return fcount;
1848 }
1849 
1850 #ifdef DDB
1851 /*
1852  * Print information about pmemrange.
1853  * Does not do locking (so either call it from DDB or acquire fpageq lock
1854  * before invoking).
1855  */
1856 void
1857 uvm_pmr_print(void)
1858 {
1859 	struct	uvm_pmemrange *pmr;
1860 	struct	vm_page *pg;
1861 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
1862 	psize_t	free;
1863 	int	useq_len;
1864 	int	mt;
1865 
1866 	printf("Ranges, use queue:\n");
1867 	useq_len = 0;
1868 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1869 		useq_len++;
1870 		free = 0;
1871 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1872 			pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
1873 			if (pg != NULL)
1874 				pg--;
1875 			else
1876 				pg = TAILQ_FIRST(&pmr->single[mt]);
1877 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
1878 
1879 			RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
1880 				free += pg->fpgsz;
1881 		}
1882 
1883 		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
1884 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
1885 		    pmr->use, (unsigned long)pmr->nsegs);
1886 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1887 			printf(" maxsegsz[%d]=0x%lx", mt,
1888 			    (unsigned long)size[mt]);
1889 		}
1890 		printf(" free=0x%lx\n", (unsigned long)free);
1891 	}
1892 	printf("#ranges = %d\n", useq_len);
1893 }
1894 #endif
1895 
1896 /*
1897  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
1898  * in a specific physmem area.
1899  *
1900  * Returns ENOMEM if the pagedaemon failed to free any pages.
1901  * If not failok, the function keeps retrying until it succeeds.
1902  *
1903  * Must be called with fpageq locked.
1904  */
1905 int
1906 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
1907 {
1908 	struct uvm_pmalloc pma;
1909 	const char *wmsg = "pmrwait";
1910 
1911 	if (curproc == uvm.pagedaemon_proc) {
1912 		/*
1913 		 * This is not that uncommon when the pagedaemon is trying
1914 		 * to flush out a large mmapped file. VOP_WRITE will circle
1915 		 * back through the buffer cache and try to get more memory.
1916 		 * The pagedaemon starts by calling bufbackoff, but we can
1917 		 * easily use up that reserve in a single scan iteration.
1918 		 */
1919 		uvm_unlock_fpageq();
1920 		if (bufbackoff(NULL, atop(size)) == 0) {
1921 			uvm_lock_fpageq();
1922 			return 0;
1923 		}
1924 		uvm_lock_fpageq();
1925 
1926 		/*
1927 		 * XXX detect pagedaemon deadlock - see comment in
1928 		 * uvm_wait(), as this is exactly the same issue.
1929 		 */
1930 		printf("pagedaemon: wait_pla deadlock detected!\n");
1931 		msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3);
1932 #if defined(DEBUG)
1933 		/* DEBUG: panic so we can debug it */
1934 		panic("wait_pla pagedaemon deadlock");
1935 #endif
1936 		return 0;
1937 	}
1938 
1939 	for (;;) {
1940 		pma.pm_constraint.ucr_low = low;
1941 		pma.pm_constraint.ucr_high = high;
1942 		pma.pm_size = size;
1943 		pma.pm_flags = UVM_PMA_LINKED;
1944 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
1945 
1946 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
1947 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
1948 			msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0);
1949 
1950 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
1951 		    pma.pm_flags & UVM_PMA_FAIL) {
1952 			if (failok)
1953 				return ENOMEM;
1954 			printf("uvm_wait: failed to free %ld pages between "
1955 			    "0x%lx-0x%lx\n", atop(size), low, high);
1956 		} else
1957 			return 0;
1958 	}
1959 	/* UNREACHABLE */
1960 }
1961 
1962 /*
1963  * Wake up uvm_pmalloc sleepers.
1964  */
1965 void
1966 uvm_wakeup_pla(paddr_t low, psize_t len)
1967 {
1968 	struct uvm_pmalloc *pma, *pma_next;
1969 	paddr_t high;
1970 
1971 	high = low + len;
1972 
1973 	/* Wake specific allocations waiting for this memory. */
1974 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
1975 	    pma = pma_next) {
1976 		pma_next = TAILQ_NEXT(pma, pmq);
1977 
1978 		if (low < pma->pm_constraint.ucr_high &&
1979 		    high > pma->pm_constraint.ucr_low) {
1980 			pma->pm_flags |= UVM_PMA_FREED;
1981 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
1982 				pma->pm_flags &= ~UVM_PMA_LINKED;
1983 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
1984 				    pmq);
1985 				wakeup(pma);
1986 			}
1987 		}
1988 	}
1989 }
1990 
1991 void
1992 uvm_pagezero_thread(void *arg)
1993 {
1994 	struct pglist pgl;
1995 	struct vm_page *pg;
1996 	int count;
1997 
1998 	/* Run at the lowest possible priority. */
1999 	curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2000 
2001 	KERNEL_UNLOCK();
2002 
2003 	TAILQ_INIT(&pgl);
2004 	for (;;) {
2005 		uvm_lock_fpageq();
2006 		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2007 		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2008 		     &pgl, 0, 0, 1)) == 0) {
2009 			msleep(&uvmexp.zeropages, &uvm.fpageqlock, MAXPRI,
2010 			    "pgzero", 0);
2011 		}
2012 		uvm_unlock_fpageq();
2013 
2014 		TAILQ_FOREACH(pg, &pgl, pageq) {
2015 			uvm_pagezero(pg);
2016 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2017 		}
2018 
2019 		uvm_lock_fpageq();
2020 		while (!TAILQ_EMPTY(&pgl))
2021 			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2022 		uvmexp.zeropages += count;
2023  		uvm_unlock_fpageq();
2024 
2025 		yield();
2026 	}
2027 }
2028