1 /*	$OpenBSD: uvm_pmemrange.c,v 1.40 2014/04/13 23:14:15 tedu Exp $	*/
2 
3 /*
4  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <uvm/uvm.h>
22 #include <sys/malloc.h>
23 #include <sys/proc.h>		/* XXX for atomic */
24 #include <sys/kernel.h>
25 #include <sys/mount.h>
26 
27 /*
28  * 2 trees: addr tree and size tree.
29  *
30  * The allocator keeps chunks of free pages (called a range).
31  * Two pages are part of the same range if:
32  * - all pages in between are part of that range,
33  * - they are of the same memory type (zeroed or non-zeroed),
34  * - they are part of the same pmemrange.
35  * A pmemrange is a range of memory which is part of the same vm_physseg
36  * and has a use-count.
37  *
38  * addr tree is vm_page[0].objt
39  * size tree is vm_page[1].objt
40  *
41  * The size tree is not used for memory ranges of 1 page; instead, those
42  * are kept on the single queue, linked through vm_page[0].pageq
43  *
44  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45  * are joined, unless:
46  * - they have pages in between them which are not free
47  * - they belong to different memtypes (zeroed vs dirty memory)
48  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49  * - they are not a continuation of the same array
50  * The latter issue is caused by vm_physseg ordering and splitting from the
51  * MD initialization machinery. The MD code is dependent on freelists and
52  * happens to split ISA memory from non-ISA memory.
53  * (Note: freelists die die die!)
54  *
55  * uvm_page_init guarantees that every vm_physseg contains an array of
56  * struct vm_page. Also, uvm_page_physload allocates an array of struct
57  * vm_page. This code depends on that array. The array may break across
58  * vm_physseg boundaries.
59  */
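/*
 * Illustrative sketch (not part of the original source): a free range of
 * three pages starting at page frame number 100 is described by the
 * struct vm_page entries of those pages roughly as follows:
 *
 *	pg[0]: fpgsz = 3, objt linked into the pmr address tree
 *	pg[1]: objt linked into the pmr size tree (used since fpgsz > 1)
 *	pg[2]: fpgsz = 0, not in any tree
 *
 * A single free page would instead sit on the per-memtype single queue
 * via pg[0].pageq and stay out of the size tree.
 */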
60 
61 /*
62  * Validate the flags of the page. (Used in asserts.)
63  * Any free page must have the PQ_FREE flag set.
64  * Free pages may be zeroed.
65  * Pmap flags are left untouched.
66  *
67  * The PQ_FREE flag is not checked here: by not checking, we can easily use
68  * this check on pages that are being freed.
69  */
70 #define VALID_FLAGS(pg_flags)						\
71 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
72 
73 /* Tree comparators. */
74 int	uvm_pmemrange_addr_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
75 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
76 int	uvm_pmr_addr_cmp(struct vm_page *, struct vm_page *);
77 int	uvm_pmr_size_cmp(struct vm_page *, struct vm_page *);
78 int	uvm_pmr_pg_to_memtype(struct vm_page *);
79 
80 #ifdef DDB
81 void	uvm_pmr_print(void);
82 #endif
83 
84 /*
85  * Memory types. The page flags are used to derive what the current memory
86  * type of a page is.
87  */
88 int
89 uvm_pmr_pg_to_memtype(struct vm_page *pg)
90 {
91 	if (pg->pg_flags & PG_ZERO)
92 		return UVM_PMR_MEMTYPE_ZERO;
93 	/* Default: dirty memory. */
94 	return UVM_PMR_MEMTYPE_DIRTY;
95 }
96 
97 /* Trees. */
98 RB_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
99 RB_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
100 RB_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
101     uvm_pmemrange_addr_cmp);
102 
103 /* Validation. */
104 #ifdef DEBUG
105 void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
106 #else
107 #define uvm_pmr_assertvalid(pmr)	do {} while (0)
108 #endif
109 
110 int			 uvm_pmr_get1page(psize_t, int, struct pglist *,
111 			    paddr_t, paddr_t);
112 
113 struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
114 struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
115 struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
116 			    struct vm_page *, int);
117 void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
118 			    struct vm_page *pg, struct vm_page **pg_prev,
119 			    struct vm_page **pg_next);
120 struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
121 			    struct vm_page *, paddr_t);
122 psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
123 			    struct vm_page **, int);
124 void			 uvm_pmr_split(paddr_t);
125 struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
126 struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
127 			    struct uvm_pmemrange *);
128 psize_t			 pow2divide(psize_t, psize_t);
129 struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
130 			    struct vm_page *, paddr_t, paddr_t, int);
131 
132 /*
133  * Computes num/denom and rounds it up to the next power-of-2.
134  *
135  * This is a division function which calculates an approximation of
136  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
137  * have to be accurate.
138  *
139  * Returning too large a value makes the allocator slightly faster, at the
140  * risk of hitting the failure case more often. Returning too small a value
141  * makes the allocator a bit slower, but less likely to hit a failure case.
142  */
143 psize_t
144 pow2divide(psize_t num, psize_t denom)
145 {
146 	int rshift;
147 
148 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
149 		;
150 	return (paddr_t)1 << rshift;
151 }
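/*
 * Worked example (illustrative): pow2divide(100, 4) doubles denom
 * 4 -> 8 -> 16 -> 32 -> 64 -> 128 while num is still larger, ending with
 * rshift = 5, and thus returns 1 << 5 = 32: the smallest power of two
 * that is >= 100/4.
 */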
152 
153 /*
154  * Predicate: lhs is a subrange of rhs.
155  *
156  * If rhs_low == 0: don't care about lower bound.
157  * If rhs_high == 0: don't care about upper bound.
158  */
159 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
160 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
161 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
162 
163 /*
164  * Predicate: lhs intersects with rhs.
165  *
166  * If rhs_low == 0: don't care about lower bound.
167  * If rhs_high == 0: don't care about upper bound.
168  * Ranges don't intersect if they have no page in common; the half-open
169  * array semantics mean that < instead of <= should be used here.
170  */
171 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
172 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
173 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
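/*
 * Example (illustrative): ranges [10, 20) and [20, 30) share no page, so
 * PMR_INTERSECTS_WITH(10, 20, 20, 30) is false; PMR_INTERSECTS_WITH(10, 21,
 * 20, 30) is true because page 20 lies in both ranges.
 */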
174 
175 /*
176  * Align to power-of-2 alignment.
177  */
178 #define PMR_ALIGN(pgno, align)						\
179 	(((pgno) + ((align) - 1)) & ~((align) - 1))
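/*
 * Example (illustrative): PMR_ALIGN(5, 4) yields 8 and PMR_ALIGN(8, 4)
 * stays 8; align must be a power of two for the mask trick to be valid.
 */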
180 
181 
182 /*
183  * Comparator: sort by address ascending.
184  */
185 int
186 uvm_pmemrange_addr_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
187 {
188 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
189 }
190 
191 /*
192  * Comparator: sort by use ascending.
193  *
194  * The higher the use value of a range, the more devices need memory in
195  * this range. Therefore, allocate from the range with the lowest use first.
196  */
197 int
198 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
199 {
200 	int result;
201 
202 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
203 	if (result == 0)
204 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
205 	return result;
206 }
207 
208 int
209 uvm_pmr_addr_cmp(struct vm_page *lhs, struct vm_page *rhs)
210 {
211 	paddr_t lhs_addr, rhs_addr;
212 
213 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
214 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
215 
216 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
217 }
218 
219 int
220 uvm_pmr_size_cmp(struct vm_page *lhs, struct vm_page *rhs)
221 {
222 	psize_t lhs_size, rhs_size;
223 	int cmp;
224 
225 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
226 	lhs_size = (lhs - 1)->fpgsz;
227 	rhs_size = (rhs - 1)->fpgsz;
228 
229 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
230 	if (cmp == 0)
231 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
232 	return cmp;
233 }
234 
235 /*
236  * Find the first range of free pages that is at least sz pages long.
237  */
238 struct vm_page *
239 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
240 {
241 	struct	vm_page *node, *best;
242 
243 	KASSERT(sz >= 1);
244 
245 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
246 		return TAILQ_FIRST(&pmr->single[mti]);
247 
248 	node = RB_ROOT(&pmr->size[mti]);
249 	best = NULL;
250 	while (node != NULL) {
251 		if ((node - 1)->fpgsz >= sz) {
252 			best = (node - 1);
253 			node = RB_LEFT(node, objt);
254 		} else
255 			node = RB_RIGHT(node, objt);
256 	}
257 	return best;
258 }
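/*
 * Illustrative note: the descent above is a best-fit search; it remembers
 * the smallest range whose size is still >= sz. With free ranges of 2, 5
 * and 9 pages, uvm_pmr_nfindsz(pmr, 4, mti) would land on the 5-page range.
 */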
259 
260 /*
261  * Finds the next range. The next range has a size >= pg->fpgsz.
262  * Returns NULL if no more ranges are available.
263  */
264 struct vm_page *
265 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
266 {
267 	struct vm_page *npg;
268 
269 	KASSERT(pmr != NULL && pg != NULL);
270 	if (pg->fpgsz == 1) {
271 		if (TAILQ_NEXT(pg, pageq) != NULL)
272 			return TAILQ_NEXT(pg, pageq);
273 		else
274 			npg = RB_MIN(uvm_pmr_size, &pmr->size[mt]);
275 	} else
276 		npg = RB_NEXT(uvm_pmr_size, &pmr->size[mt], pg + 1);
277 
278 	return npg == NULL ? NULL : npg - 1;
279 }
280 
281 /*
282  * Finds the previous and next ranges relative to the (uninserted) pg range.
283  *
284  * *pg_prev == NULL if no previous range that can join with pg is
285  * 	available.
286  * *pg_next == NULL if no next range that can join with pg is
287  * 	available.
288  */
289 void
290 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
291     struct vm_page **pg_prev, struct vm_page **pg_next)
292 {
293 	KASSERT(pg_prev != NULL && pg_next != NULL);
294 
295 	*pg_next = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
296 	if (*pg_next == NULL)
297 		*pg_prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
298 	else
299 		*pg_prev = RB_PREV(uvm_pmr_addr, &pmr->addr, *pg_next);
300 
301 	KDASSERT(*pg_next == NULL ||
302 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
303 	KDASSERT(*pg_prev == NULL ||
304 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
305 
306 	/* Reset if not contig. */
307 	if (*pg_prev != NULL &&
308 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
309 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
310 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
311 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
312 		*pg_prev = NULL;
313 	if (*pg_next != NULL &&
314 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
315 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
316 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
317 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
318 		*pg_next = NULL;
319 	return;
320 }
321 
322 /*
323  * Remove a range from the address tree.
324  * Address tree maintains pmr counters.
325  */
326 void
327 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
328 {
329 	KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
330 	KDASSERT(pg->pg_flags & PQ_FREE);
331 	RB_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
332 
333 	pmr->nsegs--;
334 }
335 /*
336  * Remove a range from the size tree.
337  */
338 void
339 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
340 {
341 	int memtype;
342 #ifdef DEBUG
343 	struct vm_page *i;
344 #endif
345 
346 	KDASSERT(pg->fpgsz >= 1);
347 	KDASSERT(pg->pg_flags & PQ_FREE);
348 	memtype = uvm_pmr_pg_to_memtype(pg);
349 
350 	if (pg->fpgsz == 1) {
351 #ifdef DEBUG
352 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
353 			if (i == pg)
354 				break;
355 		}
356 		KDASSERT(i == pg);
357 #endif
358 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
359 	} else {
360 		KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[memtype],
361 		    pg + 1) == pg + 1);
362 		RB_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
363 	}
364 }
365 /* Remove from both trees. */
366 void
367 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
368 {
369 	uvm_pmr_assertvalid(pmr);
370 	uvm_pmr_remove_size(pmr, pg);
371 	uvm_pmr_remove_addr(pmr, pg);
372 	uvm_pmr_assertvalid(pmr);
373 }
374 
375 /*
376  * Insert the range described in pg.
377  * Returns the range thus created (which may be joined with the previous and
378  * next ranges).
379  * If no_join, the caller guarantees that the range cannot possibly join
380  * with adjacent ranges.
381  */
382 struct vm_page *
383 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
384 {
385 	struct vm_page *prev, *next;
386 
387 #ifdef DEBUG
388 	struct vm_page *i;
389 	int mt;
390 #endif
391 
392 	KDASSERT(pg->pg_flags & PQ_FREE);
393 	KDASSERT(pg->fpgsz >= 1);
394 
395 #ifdef DEBUG
396 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
397 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
398 			KDASSERT(i != pg);
399 		if (pg->fpgsz > 1) {
400 			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mt],
401 			    pg + 1) == NULL);
402 		}
403 		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
404 	}
405 #endif
406 
407 	if (!no_join) {
408 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
409 		if (next != NULL) {
410 			uvm_pmr_remove_size(pmr, next);
411 			uvm_pmr_remove_addr(pmr, next);
412 			pg->fpgsz += next->fpgsz;
413 			next->fpgsz = 0;
414 		}
415 		if (prev != NULL) {
416 			uvm_pmr_remove_size(pmr, prev);
417 			prev->fpgsz += pg->fpgsz;
418 			pg->fpgsz = 0;
419 			return prev;
420 		}
421 	}
422 
423 	RB_INSERT(uvm_pmr_addr, &pmr->addr, pg);
424 
425 	pmr->nsegs++;
426 
427 	return pg;
428 }
429 /*
430  * Insert the range described in pg into the size tree (or, if it is a
431  * single page, onto the single-page queue).
432  * Unlike uvm_pmr_insert_addr, this function does not join ranges.
433  * The page must already be in the address tree.
434  */
435 void
436 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
437 {
438 	int memtype;
439 #ifdef DEBUG
440 	struct vm_page *i;
441 	int mti;
442 #endif
443 
444 	KDASSERT(pg->fpgsz >= 1);
445 	KDASSERT(pg->pg_flags & PQ_FREE);
446 
447 	memtype = uvm_pmr_pg_to_memtype(pg);
448 #ifdef DEBUG
449 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
450 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
451 			KDASSERT(i != pg);
452 		if (pg->fpgsz > 1) {
453 			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mti],
454 			    pg + 1) == NULL);
455 		}
456 		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
457 	}
458 	for (i = pg; i < pg + pg->fpgsz; i++)
459 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
460 #endif
461 
462 	if (pg->fpgsz == 1)
463 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
464 	else
465 		RB_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
466 }
467 /* Insert in both trees. */
468 struct vm_page *
469 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
470 {
471 	uvm_pmr_assertvalid(pmr);
472 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
473 	uvm_pmr_insert_size(pmr, pg);
474 	uvm_pmr_assertvalid(pmr);
475 	return pg;
476 }
477 
478 /*
479  * Find the last page that is part of this segment.
480  * => pg: the range at which to start the search.
481  * => boundary: the page number boundary specification (0 = no boundary).
482  * => pmr: the pmemrange of the page.
483  *
484  * This function returns the last page before the next range, so if you want
485  * the next range, you need to run TAILQ_NEXT(result, pageq) after calling.
486  * The reason is that this way, the length of the segment is easily
487  * calculated using: atop(result) - atop(pg) + 1.
488  * Hence this function also never returns NULL.
489  */
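/*
 * Illustrative example: if pg lies at page frame number 100 and frames
 * 100-103 form one contiguous free segment on the list, the function
 * returns the vm_page for frame 103, giving a length of 103 - 100 + 1 = 4.
 */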
490 struct vm_page *
491 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
492     struct vm_page *pg, paddr_t boundary)
493 {
494 	paddr_t	first_boundary;
495 	struct	vm_page *next;
496 	struct	vm_page *prev;
497 
498 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
499 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
500 	if (boundary != 0) {
501 		first_boundary =
502 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
503 	} else
504 		first_boundary = 0;
505 
506 	/*
507 	 * Increase next until it hits the first page of the next segment.
508 	 *
509 	 * While loop checks the following:
510 	 * - next != NULL	we have not reached the end of pgl
511 	 * - boundary == 0 || next < first_boundary
512 	 *			we do not cross a boundary
513 	 * - atop(prev) + 1 == atop(next)
514 	 *			still in the same segment
515 	 * - low <= last
516 	 * - high > last	still in the same memory range
517 	 * - memtype is equal	allocator is unable to view different memtypes
518 	 *			as part of the same segment
519 	 * - prev + 1 == next	no array breakage occurs
520 	 */
521 	prev = pg;
522 	next = TAILQ_NEXT(prev, pageq);
523 	while (next != NULL &&
524 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
525 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
526 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
527 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
528 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
529 	    prev + 1 == next) {
530 		prev = next;
531 		next = TAILQ_NEXT(prev, pageq);
532 	}
533 
534 	/*
535 	 * End of this segment.
536 	 */
537 	return prev;
538 }
539 
540 /*
541  * Remove the first segment of contiguous pages from pgl.
542  * A segment ends if it crosses boundary (unless boundary = 0) or
543  * if it would enter a different uvm_pmemrange.
544  *
545  * Work: the page range that the caller is currently working with.
546  * May be null.
547  *
548  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
549  * the first segment is erased (which, if called by uvm_pmr_getpages(),
550  * probably is the smallest or very close to it).
551  */
552 psize_t
553 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
554     struct vm_page **work, int is_desperate)
555 {
556 	struct vm_page *start, *end, *iter, *iter_end, *inserted;
557 	psize_t count;
558 	struct uvm_pmemrange *pmr, *pmr_iter;
559 
560 	KASSERT(!TAILQ_EMPTY(pgl));
561 
562 	/*
563 	 * Initialize to first page.
564 	 * Unless the desperate scan finds a better candidate, this is what'll be
565 	 * erased.
566 	 */
567 	start = TAILQ_FIRST(pgl);
568 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
569 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
570 
571 	/*
572 	 * If we are desperate, we _really_ want to get rid of the smallest
573 	 * element (rather than a close match to the smallest element).
574 	 */
575 	if (is_desperate) {
576 		/* Linear search for smallest segment. */
577 		pmr_iter = pmr;
578 		for (iter = TAILQ_NEXT(end, pageq);
579 		    iter != NULL && start != end;
580 		    iter = TAILQ_NEXT(iter_end, pageq)) {
581 			/*
582 			 * Only update pmr if it doesn't match current
583 			 * iteration.
584 			 */
585 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
586 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
587 				pmr_iter = uvm_pmemrange_find(atop(
588 				    VM_PAGE_TO_PHYS(iter)));
589 			}
590 
591 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
592 			    boundary);
593 
594 			/*
595 			 * Current iteration is smaller than best match so
596 			 * far; update.
597 			 */
598 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
599 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
600 				start = iter;
601 				end = iter_end;
602 				pmr = pmr_iter;
603 			}
604 		}
605 	}
606 
607 	/*
608 	 * Calculate count and end of the list.
609 	 */
610 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
611 	end = TAILQ_NEXT(end, pageq);
612 
613 	/*
614 	 * Actually remove the range of pages.
615 	 *
616 	 * Sadly, this cannot be done using pointer iteration:
617 	 * vm_physseg is not guaranteed to be sorted on address, hence
618 	 * uvm_page_init() may not have initialized its array sorted by
619 	 * page number.
620 	 */
621 	for (iter = start; iter != end; iter = iter_end) {
622 		iter_end = TAILQ_NEXT(iter, pageq);
623 		TAILQ_REMOVE(pgl, iter, pageq);
624 	}
625 
626 	start->fpgsz = count;
627 	inserted = uvm_pmr_insert(pmr, start, 0);
628 
629 	/*
630 	 * If the caller was working on a range and this function modified
631 	 * that range, update the pointer.
632 	 */
633 	if (work != NULL && *work != NULL &&
634 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
635 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
636 	    atop(VM_PAGE_TO_PHYS(*work)))
637 		*work = inserted;
638 	return count;
639 }
640 
641 /*
642  * Extract a number of pages from a segment of free pages.
643  * Called by uvm_pmr_getpages.
644  *
645  * Returns the segment that was created from pages left over at the tail
646  * of the removed set of pages, or NULL if no pages were left at the tail.
647  */
648 struct vm_page *
649 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
650     paddr_t start, paddr_t end, struct pglist *result)
651 {
652 	struct vm_page *after, *pg_i;
653 	psize_t before_sz, after_sz;
654 #ifdef DEBUG
655 	psize_t i;
656 #endif
657 
658 	KDASSERT(end > start);
659 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
660 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
661 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
662 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
663 
664 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
665 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
666 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
667 	uvm_pmr_assertvalid(pmr);
668 
669 	uvm_pmr_remove_size(pmr, pg);
670 	if (before_sz == 0)
671 		uvm_pmr_remove_addr(pmr, pg);
672 	after = pg + before_sz + (end - start);
673 
674 	/* Add selected pages to result. */
675 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
676 		KASSERT(pg_i->pg_flags & PQ_FREE);
677 		pg_i->fpgsz = 0;
678 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
679 	}
680 
681 	/* Before handling. */
682 	if (before_sz > 0) {
683 		pg->fpgsz = before_sz;
684 		uvm_pmr_insert_size(pmr, pg);
685 	}
686 
687 	/* After handling. */
688 	if (after_sz > 0) {
689 #ifdef DEBUG
690 		for (i = 0; i < after_sz; i++) {
691 			KASSERT(!uvm_pmr_isfree(after + i));
692 		}
693 #endif
694 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
695 		after->fpgsz = after_sz;
696 		after = uvm_pmr_insert_addr(pmr, after, 1);
697 		uvm_pmr_insert_size(pmr, after);
698 	}
699 
700 	uvm_pmr_assertvalid(pmr);
701 	return (after_sz > 0 ? after : NULL);
702 }
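/*
 * Illustrative example: extracting pages [104, 106) from a free range
 * covering [100, 110) leaves a 4-page "before" range at 100 and a 4-page
 * "after" range at 106; the latter is returned so the caller can keep
 * draining it.
 */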
703 
704 /*
705  * Acquire a number of pages.
706  *
707  * count:	the number of pages returned
708  * start:	lowest page number
709  * end:		highest page number +1
710  * 		(start = end = 0: no limitation)
711  * align:	power-of-2 alignment constraint (align = 1: no alignment)
712  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
713  * maxseg:	maximum number of segments to return
714  * flags:	UVM_PLA_* flags
715  * result:	returned pages storage (uses pageq)
716  */
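/*
 * Hypothetical usage sketch (values are made up, not from this file):
 * gather 16 zeroed pages below 16MB, in up to 16 segments, sleeping if
 * memory is tight:
 *
 *	struct pglist pgl;
 *
 *	TAILQ_INIT(&pgl);
 *	if (uvm_pmr_getpages(16, 0, atop(0x1000000), 1, 0, 16,
 *	    UVM_PLA_ZERO | UVM_PLA_WAITOK, &pgl) != 0)
 *		return ENOMEM;
 */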
717 int
718 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
719     paddr_t boundary, int maxseg, int flags, struct pglist *result)
720 {
721 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
722 	struct	vm_page *found, *f_next; /* Iterate chunks. */
723 	psize_t	fcount;			/* Current found pages. */
724 	int	fnsegs;			/* Current segment counter. */
725 	int	try, start_try;
726 	psize_t	search[3];
727 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
728 	int	memtype;		/* Requested memtype. */
729 	int	memtype_init;		/* Best memtype. */
730 	int	desperate;		/* True if allocation failed. */
731 #ifdef DIAGNOSTIC
732 	struct	vm_page *diag_prev;	/* Used during validation. */
733 #endif /* DIAGNOSTIC */
734 
735 	/*
736 	 * Validate arguments.
737 	 */
738 	KASSERT(count > 0);
739 	KASSERT(start == 0 || end == 0 || start < end);
740 	KASSERT(align >= 1);
741 	KASSERT(powerof2(align));
742 	KASSERT(maxseg > 0);
743 	KASSERT(boundary == 0 || powerof2(boundary));
744 	KASSERT(boundary == 0 || maxseg * boundary >= count);
745 	KASSERT(TAILQ_EMPTY(result));
746 
747 	/*
748 	 * TRYCONTIG is a noop if you only want a single segment.
749 	 * Remove it if that's the case: otherwise it'll deny the fast
750 	 * allocation.
751 	 */
752 	if (maxseg == 1 || count == 1)
753 		flags &= ~UVM_PLA_TRYCONTIG;
754 
755 	/*
756 	 * Configure search.
757 	 *
758 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
759 	 * search[1] is multiple segments, chosen to fulfill the search in
760 	 *   approximately even-sized segments.
761 	 *   This is a good trade-off between slightly reduced allocation speed
762 	 *   and less fragmentation.
763 	 * search[2] is the worst case, in which all segments are evaluated.
764 	 *   This provides the least fragmentation, but makes the search
765  *   possibly longer (although when it is selected, that hardly
766  *   matters anymore).
767 	 *
768 	 * The exception is when maxseg == 1: since we can only fulfill that
769 	 * with one segment of size pages, only a single search type has to
770 	 * be attempted.
771 	 */
772 	if (maxseg == 1 || count == 1) {
773 		start_try = 2;
774 		search[2] = count;
775 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
776 		start_try = 2;
777 		search[2] = 1;
778 	} else {
779 		start_try = 0;
780 		search[0] = count;
781 		search[1] = pow2divide(count, maxseg);
782 		search[2] = 1;
783 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
784 			start_try = 1;
785 		if (search[1] >= search[0]) {
786 			search[1] = search[0];
787 			start_try = 1;
788 		}
789 		if (search[2] >= search[start_try]) {
790 			start_try = 2;
791 		}
792 	}
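	/*
	 * Worked example (illustrative): for count = 100, maxseg = 4 and no
	 * UVM_PLA_TRYCONTIG, the code above yields search[] = { 100, 32, 1 }
	 * (32 being pow2divide(100, 4)) and start_try = 1, so the
	 * single-segment attempt is skipped.
	 */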
793 
794 	/*
795 	 * Memory type: if zeroed memory is requested, traverse the zero set.
796 	 * Otherwise, traverse the dirty set.
797 	 *
798 	 * The memtype iterator is reinitialized to memtype_init on entrance
799 	 * of a pmemrange.
800 	 */
801 	if (flags & UVM_PLA_ZERO)
802 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
803 	else
804 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
805 
806 	/*
807 	 * Initially, we're not desperate.
808 	 *
809 	 * Note that if we return from a sleep, we are still desperate.
810 	 * Chances are that memory pressure is still high, so resetting
811 	 * seems over-optimistic to me.
812 	 */
813 	desperate = 0;
814 
815 	uvm_lock_fpageq();
816 
817 retry:		/* Return point after sleeping. */
818 	fcount = 0;
819 	fnsegs = 0;
820 
821 retry_desperate:
822 	/*
823 	 * If we just want any page(s), go for the really fast option.
824 	 */
825 	if (count <= maxseg && align == 1 && boundary == 0 &&
826 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
827 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
828 		    result, start, end);
829 
830 		/*
831 		 * If we found sufficient pages, go to the success exit code.
832 		 *
833 		 * Otherwise, go immediately to fail, since we collected
834 		 * all we could anyway.
835 		 */
836 		if (fcount == count)
837 			goto out;
838 		else
839 			goto fail;
840 	}
841 
842 	/*
843 	 * The heart of the contig case.
844 	 *
845 	 * The code actually looks like this:
846 	 *
847 	 * foreach (struct pmemrange) {
848 	 *	foreach (memtype) {
849 	 *		foreach(try) {
850 	 *			foreach (free range of memtype in pmemrange,
851 	 *			    starting at search[try]) {
852 	 *				while (range has space left)
853 	 *					take from range
854 	 *			}
855 	 *		}
856 	 *	}
857 	 *
858 	 *	if next pmemrange has higher usecount than current:
859 	 *		enter desperate case (which will drain the pmemranges
860 	 *		until empty prior to moving to the next one)
861 	 * }
862 	 *
863 	 * When desperate is activated, try always starts at the highest
864 	 * value. The memtype loop is using a goto ReScanMemtype.
865 	 * The try loop is using a goto ReScan.
866 	 * The 'range has space left' loop uses label DrainFound.
867 	 *
868 	 * Writing them all as loops would take up a lot of screen space in
869 	 * the form of indentation and some parts are easier to express
870 	 * using the labels.
871 	 */
872 
873 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
874 		/* Empty range. */
875 		if (pmr->nsegs == 0)
876 			continue;
877 
878 		/* Outside requested range. */
879 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
880 			continue;
881 
882 		memtype = memtype_init;
883 
884 rescan_memtype:	/* Return point at memtype++. */
885 		try = start_try;
886 
887 rescan:		/* Return point at try++. */
888 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
889 		    found != NULL;
890 		    found = f_next) {
891 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
892 
893 			fstart = atop(VM_PAGE_TO_PHYS(found));
894 			if (start != 0)
895 				fstart = MAX(start, fstart);
896 drain_found:
897 			/*
898 			 * Throw away the first segment if fnsegs == maxseg
899 			 *
900 			 * Note that f_next is still valid after this call,
901 			 * since we only allocated from entries before f_next.
902 			 * We don't revisit the entries we already extracted
903 			 * from unless we entered the desperate case.
904 			 */
905 			if (fnsegs == maxseg) {
906 				fnsegs--;
907 				fcount -=
908 				    uvm_pmr_remove_1strange(result, boundary,
909 				    &found, desperate);
910 			}
911 
912 			fstart = PMR_ALIGN(fstart, align);
913 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
914 			if (end != 0)
915 				fend = MIN(end, fend);
916 			if (boundary != 0) {
917 				fend =
918 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
919 			}
920 			if (fstart >= fend)
921 				continue;
922 			if (fend - fstart > count - fcount)
923 				fend = fstart + (count - fcount);
924 
925 			fcount += fend - fstart;
926 			fnsegs++;
927 			found = uvm_pmr_extract_range(pmr, found,
928 			    fstart, fend, result);
929 
930 			if (fcount == count)
931 				goto out;
932 
933 			/*
934 			 * If there's still space left in found, try to
935 			 * fully drain it prior to continuing.
936 			 */
937 			if (found != NULL) {
938 				fstart = fend;
939 				goto drain_found;
940 			}
941 		}
942 
943 		/* Try a smaller search now. */
944 		if (++try < nitems(search))
945 			goto rescan;
946 
947 		/*
948 		 * Exhaust all memory types prior to going to the next memory
949 		 * segment.
950 		 * This means that zero-vs-dirty are eaten prior to moving
951 		 * to a pmemrange with a higher use-count.
952 		 *
953 		 * Code is basically a difficult way of writing:
954 		 * memtype = memtype_init;
955 		 * do {
956 		 *	...;
957 		 *	memtype += 1;
958 		 *	memtype %= MEMTYPE_MAX;
959 		 * } while (memtype != memtype_init);
960 		 */
961 		memtype += 1;
962 		if (memtype == UVM_PMR_MEMTYPE_MAX)
963 			memtype = 0;
964 		if (memtype != memtype_init)
965 			goto rescan_memtype;
966 
967 		/*
968 		 * If not desperate, enter desperate case prior to eating all
969 		 * the good stuff in the next range.
970 		 */
971 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
972 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
973 			break;
974 	}
975 
976 	/*
977 	 * Not enough memory of the requested type available. Fall back to
978 	 * less good memory that we'll clean up better later.
979 	 *
980 	 * This algorithm is not very smart, though: it just starts scanning
981 	 * a differently typed range, and the nicer ranges of the previous
982 	 * iteration may fall out. Hence there is a small chance of a false
983 	 * negative.
984 	 *
985 	 * When desperate: scan all sizes starting at the smallest
986 	 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
987 	 * allow us to hit the fast path now).
988 	 *
989 	 * Also, because we will revisit entries we scanned before, we need
990 	 * to reset the page queue, or we may end up releasing entries in
991 	 * such a way as to invalidate f_next.
992 	 */
993 	if (!desperate) {
994 		desperate = 1;
995 		start_try = nitems(search) - 1;
996 		flags &= ~UVM_PLA_TRYCONTIG;
997 
998 		while (!TAILQ_EMPTY(result))
999 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1000 		fnsegs = 0;
1001 		fcount = 0;
1002 		goto retry_desperate;
1003 	}
1004 
1005 fail:
1006 	/* Allocation failed. */
1007 	/* XXX: claim from memory reserve here */
1008 
1009 	while (!TAILQ_EMPTY(result))
1010 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1011 
1012 	if (flags & UVM_PLA_WAITOK) {
1013 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1014 		    flags & UVM_PLA_FAILOK) == 0)
1015 			goto retry;
1016 		KASSERT(flags & UVM_PLA_FAILOK);
1017 	} else
1018 		wakeup(&uvm.pagedaemon);
1019 	uvm_unlock_fpageq();
1020 
1021 	return ENOMEM;
1022 
1023 out:
1024 	/* Allocation successful. */
1025 	uvmexp.free -= fcount;
1026 
1027 	uvm_unlock_fpageq();
1028 
1029 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1030 #ifdef DIAGNOSTIC
1031 	fnsegs = 0;
1032 	fcount = 0;
1033 	diag_prev = NULL;
1034 #endif /* DIAGNOSTIC */
1035 	TAILQ_FOREACH(found, result, pageq) {
1036 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1037 
1038 		if (found->pg_flags & PG_ZERO) {
1039 			uvmexp.zeropages--;
1040 		}
1041 		if (flags & UVM_PLA_ZERO) {
1042 			if (found->pg_flags & PG_ZERO)
1043 				uvmexp.pga_zerohit++;
1044 			else {
1045 				uvmexp.pga_zeromiss++;
1046 				uvm_pagezero(found);
1047 			}
1048 		}
1049 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1050 
1051 		found->uobject = NULL;
1052 		found->uanon = NULL;
1053 		found->pg_version++;
1054 
1055 		/*
1056 		 * Validate that the page matches the range criteria.
1057 		 */
1058 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1059 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1060 
1061 #ifdef DIAGNOSTIC
1062 		/*
1063 		 * Update fcount (# found pages) and
1064 		 * fnsegs (# found segments) counters.
1065 		 */
1066 		if (diag_prev == NULL ||
1067 		    /* new segment if it contains a hole */
1068 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1069 		    atop(VM_PAGE_TO_PHYS(found)) ||
1070 		    /* new segment if it crosses boundary */
1071 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1072 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1073 			fnsegs++;
1074 		fcount++;
1075 
1076 		diag_prev = found;
1077 #endif /* DIAGNOSTIC */
1078 	}
1079 
1080 #ifdef DIAGNOSTIC
1081 	/*
1082 	 * Panic on algorithm failure.
1083 	 */
1084 	if (fcount != count || fnsegs > maxseg) {
1085 		panic("pmemrange allocation error: "
1086 		    "allocated %ld pages in %d segments, "
1087 		    "but request was %ld pages in %d segments",
1088 		    fcount, fnsegs, count, maxseg);
1089 	}
1090 #endif /* DIAGNOSTIC */
1091 
1092 	return 0;
1093 }
1094 
1095 /*
1096  * Free a number of contig pages (invoked by uvm_page_init).
1097  */
1098 void
1099 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1100 {
1101 	struct uvm_pmemrange *pmr;
1102 	psize_t i, pmr_count;
1103 	struct vm_page *firstpg = pg;
1104 
1105 	for (i = 0; i < count; i++) {
1106 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1107 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1108 
1109 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1110 		    VALID_FLAGS(pg[i].pg_flags))) {
1111 			printf("Flags: 0x%x, will panic now.\n",
1112 			    pg[i].pg_flags);
1113 		}
1114 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1115 		    VALID_FLAGS(pg[i].pg_flags));
1116 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1117 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1118 	}
1119 
1120 	uvm_lock_fpageq();
1121 
1122 	for (i = count; i > 0; i -= pmr_count) {
1123 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1124 		KASSERT(pmr != NULL);
1125 
1126 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1127 		pg->fpgsz = pmr_count;
1128 		uvm_pmr_insert(pmr, pg, 0);
1129 
1130 		uvmexp.free += pmr_count;
1131 		pg += pmr_count;
1132 	}
1133 	wakeup(&uvmexp.free);
1134 
1135 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1136 
1137 	uvm_unlock_fpageq();
1138 }
1139 
1140 /*
1141  * Free all pages in the queue.
1142  */
1143 void
1144 uvm_pmr_freepageq(struct pglist *pgl)
1145 {
1146 	struct vm_page *pg;
1147 	paddr_t pstart;
1148 	psize_t plen;
1149 
1150 	TAILQ_FOREACH(pg, pgl, pageq) {
1151 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1152 		    VALID_FLAGS(pg->pg_flags))) {
1153 			printf("Flags: 0x%x, will panic now.\n",
1154 			    pg->pg_flags);
1155 		}
1156 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1157 		    VALID_FLAGS(pg->pg_flags));
1158 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1159 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1160 	}
1161 
1162 	uvm_lock_fpageq();
1163 	while (!TAILQ_EMPTY(pgl)) {
1164 		pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1165 		plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1166 		uvmexp.free += plen;
1167 
1168 		uvm_wakeup_pla(pstart, ptoa(plen));
1169 	}
1170 	wakeup(&uvmexp.free);
1171 	uvm_unlock_fpageq();
1172 
1173 	return;
1174 }
1175 
1176 /*
1177  * Store a pmemrange in the list.
1178  *
1179  * The list is sorted by use.
1180  */
1181 struct uvm_pmemrange *
1182 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1183     struct uvm_pmemrange *pmr)
1184 {
1185 	struct uvm_pmemrange *iter;
1186 	int cmp = 1;
1187 
1188 	TAILQ_FOREACH(iter, useq, pmr_use) {
1189 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1190 		if (cmp == 0)
1191 			return iter;
1192 		if (cmp == -1)
1193 			break;
1194 	}
1195 
1196 	if (iter == NULL)
1197 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1198 	else
1199 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1200 	return NULL;
1201 }
1202 
1203 #ifdef DEBUG
1204 /*
1205  * Validation of the whole pmemrange.
1206  * Called with fpageq locked.
1207  */
1208 void
1209 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1210 {
1211 	struct vm_page *prev, *next, *i, *xref;
1212 	int lcv, mti;
1213 
1214 	/* Empty range */
1215 	if (pmr->nsegs == 0)
1216 		return;
1217 
1218 	/* Validate address tree. */
1219 	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1220 		/* Validate the range. */
1221 		KASSERT(i->fpgsz > 0);
1222 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1223 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1224 		    <= pmr->high);
1225 
1226 		/* Validate each page in this range. */
1227 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1228 			/*
1229 			 * Only the first page has a size specification.
1230 			 * Rest is size 0.
1231 			 */
1232 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1233 			/*
1234 			 * Flag check.
1235 			 */
1236 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1237 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1238 			/*
1239 			 * Free pages are:
1240 			 * - not wired
1241 			 * - not loaned
1242 			 * - have no vm_anon
1243 			 * - have no uvm_object
1244 			 */
1245 			KASSERT(i[lcv].wire_count == 0);
1246 			KASSERT(i[lcv].loan_count == 0);
1247 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1248 			    i[lcv].uanon == NULL);
1249 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1250 			    i[lcv].uobject == NULL);
1251 			/*
1252 			 * Pages in a single range always have the same
1253 			 * memtype.
1254 			 */
1255 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1256 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1257 		}
1258 
1259 		/* Check that it shouldn't be joined with its predecessor. */
1260 		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, i);
1261 		if (prev != NULL) {
1262 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1263 			    uvm_pmr_pg_to_memtype(prev) ||
1264 			    atop(VM_PAGE_TO_PHYS(i)) >
1265 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1266 			    prev + prev->fpgsz != i);
1267 		}
1268 
1269 		/* Assert i is in the size tree as well. */
1270 		if (i->fpgsz == 1) {
1271 			TAILQ_FOREACH(xref,
1272 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1273 				if (xref == i)
1274 					break;
1275 			}
1276 			KASSERT(xref == i);
1277 		} else {
1278 			KASSERT(RB_FIND(uvm_pmr_size,
1279 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1280 			    i + 1);
1281 		}
1282 	}
1283 
1284 	/* Validate size tree. */
1285 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1286 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1287 			next = uvm_pmr_nextsz(pmr, i, mti);
1288 			if (next != NULL) {
1289 				KASSERT(i->fpgsz <=
1290 				    next->fpgsz);
1291 			}
1292 
1293 			/* Assert i is in the addr tree as well. */
1294 			KASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1295 
1296 			/* Assert i is of the correct memory type. */
1297 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1298 		}
1299 	}
1300 
1301 	/* Validate nsegs statistic. */
1302 	lcv = 0;
1303 	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1304 		lcv++;
1305 	KASSERT(pmr->nsegs == lcv);
1306 }
1307 #endif /* DEBUG */
1308 
1309 /*
1310  * Split pmr at split point pageno.
1311  * Called with fpageq unlocked.
1312  *
1313  * Split is only applied if a pmemrange spans pageno.
1314  */
1315 void
1316 uvm_pmr_split(paddr_t pageno)
1317 {
1318 	struct uvm_pmemrange *pmr, *drain;
1319 	struct vm_page *rebuild, *prev, *next;
1320 	psize_t prev_sz;
1321 
1322 	uvm_lock_fpageq();
1323 	pmr = uvm_pmemrange_find(pageno);
1324 	if (pmr == NULL || !(pmr->low < pageno)) {
1325 		/* No split required. */
1326 		uvm_unlock_fpageq();
1327 		return;
1328 	}
1329 
1330 	KASSERT(pmr->low < pageno);
1331 	KASSERT(pmr->high > pageno);
1332 
1333 	/*
1334 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1335 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1336 	 * a bit hard, so we just race!
1337 	 */
1338 	uvm_unlock_fpageq();
1339 	drain = uvm_pmr_allocpmr();
1340 	uvm_lock_fpageq();
1341 	pmr = uvm_pmemrange_find(pageno);
1342 	if (pmr == NULL || !(pmr->low < pageno)) {
1343 		/*
1344 		 * We lost the race since someone else ran this or a related
1345 		 * function, however this should be triggered very rarely so
1346 		 * we just leak the pmr.
1347 		 */
1348 		printf("uvm_pmr_split: lost one pmr\n");
1349 		uvm_unlock_fpageq();
1350 		return;
1351 	}
1352 
1353 	drain->low = pageno;
1354 	drain->high = pmr->high;
1355 	drain->use = pmr->use;
1356 
1357 	uvm_pmr_assertvalid(pmr);
1358 	uvm_pmr_assertvalid(drain);
1359 	KASSERT(drain->nsegs == 0);
1360 
1361 	RB_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1362 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1363 			break;
1364 	}
1365 	if (rebuild == NULL)
1366 		prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
1367 	else
1368 		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, rebuild);
1369 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1370 
1371 	/*
1372 	 * Handle free chunk that spans the split point.
1373 	 */
1374 	if (prev != NULL &&
1375 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1376 		psize_t before, after;
1377 
1378 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1379 
1380 		uvm_pmr_remove(pmr, prev);
1381 		prev_sz = prev->fpgsz;
1382 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1383 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1384 
1385 		KASSERT(before > 0);
1386 		KASSERT(after > 0);
1387 
1388 		prev->fpgsz = before;
1389 		uvm_pmr_insert(pmr, prev, 1);
1390 		(prev + before)->fpgsz = after;
1391 		uvm_pmr_insert(drain, prev + before, 1);
1392 	}
1393 
1394 	/* Move free chunks that no longer fall in the range. */
1395 	for (; rebuild != NULL; rebuild = next) {
1396 		next = RB_NEXT(uvm_pmr_addr, &pmr->addr, rebuild);
1397 
1398 		uvm_pmr_remove(pmr, rebuild);
1399 		uvm_pmr_insert(drain, rebuild, 1);
1400 	}
1401 
1402 	pmr->high = pageno;
1403 	uvm_pmr_assertvalid(pmr);
1404 	uvm_pmr_assertvalid(drain);
1405 
1406 	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1407 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1408 	uvm_unlock_fpageq();
1409 }
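/*
 * Illustrative example: splitting at page 4096 while a 100-page free chunk
 * starts at page 4050 removes that chunk and re-inserts a 46-page chunk
 * [4050, 4096) into the old pmemrange and a 54-page chunk [4096, 4150)
 * into the new one.
 */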
1410 
1411 /*
1412  * Increase the usage counter for the given range of memory.
1413  *
1414  * The more usage counters a given range of memory has, the more will be
1415  * attempted not to allocate from it.
1416  *
1417  * Addresses here are in paddr_t, not page-numbers.
1418  * The lowest and highest allowed address are specified.
1419  */
1420 void
1421 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1422 {
1423 	struct uvm_pmemrange *pmr;
1424 	paddr_t sz;
1425 
1426 	/* pmr uses page numbers, translate low and high. */
1427 	high++;
1428 	high = atop(trunc_page(high));
1429 	low = atop(round_page(low));
1430 	uvm_pmr_split(low);
1431 	uvm_pmr_split(high);
1432 
1433 	sz = 0;
1434 	uvm_lock_fpageq();
1435 	/* Increase use count on segments in range. */
1436 	RB_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1437 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1438 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1439 			pmr->use++;
1440 			sz += pmr->high - pmr->low;
1441 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1442 		}
1443 		uvm_pmr_assertvalid(pmr);
1444 	}
1445 	uvm_unlock_fpageq();
1446 
1447 	KASSERT(sz >= high - low);
1448 }
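/*
 * Hypothetical usage sketch: an MD platform that wants the allocator to
 * stay away from the ISA DMA area unless necessary could run
 *
 *	uvm_pmr_use_inc(0, 0x00ffffff);
 *
 * which splits [0, 16MB) into its own pmemrange(s) and bumps their use
 * counters; uvm_pmr_init() does exactly this for each uvm_md_constraints[]
 * entry.
 */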
1449 
1450 /*
1451  * Allocate a pmemrange.
1452  *
1453  * If called from uvm_page_init, uvm_pageboot_alloc is used.
1454  * If called after uvm_init, malloc is used.
1455  * (And if called in between, you're dead.)
1456  */
1457 struct uvm_pmemrange *
1458 uvm_pmr_allocpmr(void)
1459 {
1460 	struct uvm_pmemrange *nw;
1461 	int i;
1462 
1463 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1464 	if (!uvm.page_init_done) {
1465 		nw = (struct uvm_pmemrange *)
1466 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1467 	} else {
1468 		nw = malloc(sizeof(struct uvm_pmemrange),
1469 		    M_VMMAP, M_NOWAIT);
1470 	}
1471 	KASSERT(nw != NULL);
1472 	bzero(nw, sizeof(struct uvm_pmemrange));
1473 	RB_INIT(&nw->addr);
1474 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1475 		RB_INIT(&nw->size[i]);
1476 		TAILQ_INIT(&nw->single[i]);
1477 	}
1478 	return nw;
1479 }
1480 
1481 /*
1482  * Initialization of pmr.
1483  * Called by uvm_page_init.
1484  *
1485  * Sets up pmemranges.
1486  */
1487 void
1488 uvm_pmr_init(void)
1489 {
1490 	struct uvm_pmemrange *new_pmr;
1491 	int i;
1492 
1493 	TAILQ_INIT(&uvm.pmr_control.use);
1494 	RB_INIT(&uvm.pmr_control.addr);
1495 	TAILQ_INIT(&uvm.pmr_control.allocs);
1496 
1497 	/* By default, one range for the entire address space. */
1498 	new_pmr = uvm_pmr_allocpmr();
1499 	new_pmr->low = 0;
1500 	new_pmr->high = atop((paddr_t)-1) + 1;
1501 
1502 	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1503 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1504 
1505 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1506 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1507 	    	    uvm_md_constraints[i]->ucr_high);
1508 	}
1509 }
1510 
1511 /*
1512  * Find the pmemrange that contains the given page number.
1513  *
1514  * (Manually traverses the binary tree, because that is cheaper on stack
1515  * usage.)
1516  */
1517 struct uvm_pmemrange *
1518 uvm_pmemrange_find(paddr_t pageno)
1519 {
1520 	struct uvm_pmemrange *pmr;
1521 
1522 	pmr = RB_ROOT(&uvm.pmr_control.addr);
1523 	while (pmr != NULL) {
1524 		if (pmr->low > pageno)
1525 			pmr = RB_LEFT(pmr, pmr_addr);
1526 		else if (pmr->high <= pageno)
1527 			pmr = RB_RIGHT(pmr, pmr_addr);
1528 		else
1529 			break;
1530 	}
1531 
1532 	return pmr;
1533 }
1534 
1535 #if defined(DDB) || defined(DEBUG)
1536 /*
1537  * Return true if the given page is in any of the free lists.
1538  * Used by uvm_page_printit.
1539  * This function is safe, even if the page is not on the freeq.
1540  * Note: does not apply locking, only called from ddb.
1541  */
1542 int
1543 uvm_pmr_isfree(struct vm_page *pg)
1544 {
1545 	struct vm_page *r;
1546 	struct uvm_pmemrange *pmr;
1547 
1548 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1549 	if (pmr == NULL)
1550 		return 0;
1551 	r = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1552 	if (r == NULL)
1553 		r = RB_MAX(uvm_pmr_addr, &pmr->addr);
1554 	else
1555 		r = RB_PREV(uvm_pmr_addr, &pmr->addr, r);
1556 	if (r == NULL)
1557 		return 0; /* Empty tree. */
1558 
1559 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1560 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1561 	    atop(VM_PAGE_TO_PHYS(pg));
1562 }
1563 #endif /* DDB || DEBUG */
1564 
1565 /*
1566  * Given a root of a tree, find a range which intersects start, end and
1567  * is of the same memtype.
1568  *
1569  * Page must be in the address tree.
1570  */
1571 struct vm_page*
1572 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1573     paddr_t start, paddr_t end, int memtype)
1574 {
1575 	int	direction;
1576 	struct	vm_page *root;
1577 	struct	vm_page *high, *high_next;
1578 	struct	vm_page *low, *low_next;
1579 
1580 	KDASSERT(pmr != NULL && init_root != NULL);
1581 	root = init_root;
1582 
1583 	/* Which direction to use for searching. */
1584 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1585 		direction =  1;
1586 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1587 		direction = -1;
1588 	else /* nothing to do */
1589 		return root;
1590 
1591 	/* First, update root to fall within the chosen range. */
1592 	while (root && !PMR_INTERSECTS_WITH(
1593 	    atop(VM_PAGE_TO_PHYS(root)),
1594 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1595 	    start, end)) {
1596 		if (direction == 1)
1597 			root = RB_RIGHT(root, objt);
1598 		else
1599 			root = RB_LEFT(root, objt);
1600 	}
1601 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1602 		return root;
1603 
1604 	/*
1605 	 * Root is valid, but of the wrong memtype.
1606 	 *
1607 	 * Try to find a range that has the given memtype in the subtree
1608 	 * (memtype mismatches are costly, either because the conversion
1609 	 * is expensive, or a later allocation will need to do the opposite
1610 	 * conversion, which will be expensive).
1611 	 *
1612 	 *
1613 	 * First, simply increase address until we hit something we can use.
1614 	 * Cache the upper page, so we can page-walk later.
1615 	 */
1616 	high = root;
1617 	high_next = RB_RIGHT(high, objt);
1618 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1619 	    atop(VM_PAGE_TO_PHYS(high_next)),
1620 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1621 	    start, end)) {
1622 		high = high_next;
1623 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1624 			return high;
1625 		high_next = RB_RIGHT(high, objt);
1626 	}
1627 
1628 	/*
1629 	 * Second, decrease the address until we hit something we can use.
1630 	 * Cache the lower page, so we can page-walk later.
1631 	 */
1632 	low = root;
1633 	low_next = RB_LEFT(low, objt);
1634 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1635 	    atop(VM_PAGE_TO_PHYS(low_next)),
1636 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1637 	    start, end)) {
1638 		low = low_next;
1639 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1640 			return low;
1641 		low_next = RB_LEFT(low, objt);
1642 	}
1643 
1644 	if (low == high)
1645 		return NULL;
1646 
1647 	/* No hits. Walk the address tree until we find something usable. */
1648 	for (low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low);
1649 	    low != high;
1650 	    low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low)) {
1651 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1652 	    	    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1653 	    	    start, end));
1654 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1655 			return low;
1656 	}
1657 
1658 	/* Nothing found. */
1659 	return NULL;
1660 }
1661 
1662 /*
1663  * Allocate any page, the fastest way. Page number constraints only.
1664  */
1665 int
1666 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1667     paddr_t start, paddr_t end)
1668 {
1669 	struct	uvm_pmemrange *pmr;
1670 	struct	vm_page *found, *splitpg;
1671 	psize_t	fcount;
1672 	int	memtype;
1673 
1674 	fcount = 0;
1675 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1676 		/* We're done. */
1677 		if (fcount == count)
1678 			break;
1679 
1680 		/* Outside requested range. */
1681 		if (!(start == 0 && end == 0) &&
1682 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1683 			continue;
1684 
1685 		/* Range is empty. */
1686 		if (pmr->nsegs == 0)
1687 			continue;
1688 
1689 		/* Loop over all memtypes, starting at memtype_init. */
1690 		memtype = memtype_init;
1691 		while (fcount != count) {
1692 			found = TAILQ_FIRST(&pmr->single[memtype]);
1693 			/*
1694 			 * If found is outside the range, walk the list
1695 			 * until we find something that intersects with
1696 			 * boundaries.
1697 			 */
1698 			while (found && !PMR_INTERSECTS_WITH(
1699 			    atop(VM_PAGE_TO_PHYS(found)),
1700 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1701 			    start, end))
1702 				found = TAILQ_NEXT(found, pageq);
1703 
1704 			if (found == NULL) {
1705 				found = RB_ROOT(&pmr->size[memtype]);
1706 				/* Size tree gives pg[1] instead of pg[0] */
1707 				if (found != NULL) {
1708 					found--;
1709 
1710 					found = uvm_pmr_rootupdate(pmr, found,
1711 					    start, end, memtype);
1712 				}
1713 			}
1714 			if (found != NULL) {
1715 				uvm_pmr_assertvalid(pmr);
1716 				uvm_pmr_remove_size(pmr, found);
1717 
1718 				/*
1719 				 * If the page intersects the end, then it'll
1720 				 * need splitting.
1721 				 *
1722 				 * Note that we don't need to split if the page
1723 				 * intersects start: the drain function will
1724 				 * simply stop on hitting start.
1725 				 */
1726 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1727 				    found->fpgsz > end) {
1728 					psize_t splitsz =
1729 					    atop(VM_PAGE_TO_PHYS(found)) +
1730 					    found->fpgsz - end;
1731 
1732 					uvm_pmr_remove_addr(pmr, found);
1733 					uvm_pmr_assertvalid(pmr);
1734 					found->fpgsz -= splitsz;
1735 					splitpg = found + found->fpgsz;
1736 					splitpg->fpgsz = splitsz;
1737 					uvm_pmr_insert(pmr, splitpg, 1);
1738 
1739 					/*
1740 					 * At this point, splitpg and found
1741 					 * actually should be joined.
1742 					 * But we explicitly disable that,
1743 					 * because we will start subtracting
1744 					 * from found.
1745 					 */
1746 					KASSERT(start == 0 ||
1747 					    atop(VM_PAGE_TO_PHYS(found)) +
1748 					    found->fpgsz > start);
1749 					uvm_pmr_insert_addr(pmr, found, 1);
1750 				}
1751 
1752 				/*
1753 				 * Fetch pages from the end.
1754 				 * If the range is larger than the requested
1755 				 * number of pages, this saves us an addr-tree
1756 				 * update.
1757 				 *
1758 				 * Since we take from the end and insert at
1759 				 * the head, the ranges are preserved.
1760 				 */
1761 				while (found->fpgsz > 0 && fcount < count &&
1762 				    (start == 0 ||
1763 				    atop(VM_PAGE_TO_PHYS(found)) +
1764 				    found->fpgsz > start)) {
1765 					found->fpgsz--;
1766 					fcount++;
1767 					TAILQ_INSERT_HEAD(result,
1768 					    &found[found->fpgsz], pageq);
1769 				}
1770 				if (found->fpgsz > 0) {
1771 					uvm_pmr_insert_size(pmr, found);
1772 					KDASSERT(fcount == count);
1773 					uvm_pmr_assertvalid(pmr);
1774 					return fcount;
1775 				}
1776 
1777 				/*
1778 				 * Delayed addr-tree removal.
1779 				 */
1780 				uvm_pmr_remove_addr(pmr, found);
1781 				uvm_pmr_assertvalid(pmr);
1782 			} else {
1783 				/*
1784 				 * Skip to the next memtype.
1785 				 */
1786 				memtype += 1;
1787 				if (memtype == UVM_PMR_MEMTYPE_MAX)
1788 					memtype = 0;
1789 				if (memtype == memtype_init)
1790 					break;
1791 			}
1792 		}
1793 	}
1794 
1795 	/*
1796 	 * Search finished.
1797 	 *
1798 	 * Ran out of ranges before enough pages were gathered, or we hit the
1799 	 * case where found->fpgsz == count - fcount, in which case the
1800 	 * above exit condition didn't trigger.
1801 	 *
1802 	 * On failure, caller will free the pages.
1803 	 */
1804 	return fcount;
1805 }
1806 
1807 #ifdef DDB
1808 /*
1809  * Print information about pmemrange.
1810  * Does not do locking (so either call it from DDB or acquire fpageq lock
1811  * before invoking).
1812  */
1813 void
1814 uvm_pmr_print(void)
1815 {
1816 	struct	uvm_pmemrange *pmr;
1817 	struct	vm_page *pg;
1818 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
1819 	psize_t	free;
1820 	int	useq_len;
1821 	int	mt;
1822 
1823 	printf("Ranges, use queue:\n");
1824 	useq_len = 0;
1825 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1826 		useq_len++;
1827 		free = 0;
1828 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1829 			pg = RB_MAX(uvm_pmr_size, &pmr->size[mt]);
1830 			if (pg != NULL)
1831 				pg--;
1832 			else
1833 				pg = TAILQ_FIRST(&pmr->single[mt]);
1834 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
1835 
1836 			RB_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
1837 				free += pg->fpgsz;
1838 		}
1839 
1840 		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
1841 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
1842 		    pmr->use, (unsigned long)pmr->nsegs);
1843 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1844 			printf(" maxsegsz[%d]=0x%lx", mt,
1845 			    (unsigned long)size[mt]);
1846 		}
1847 		printf(" free=0x%lx\n", (unsigned long)free);
1848 	}
1849 	printf("#ranges = %d\n", useq_len);
1850 }
1851 #endif
1852 
1853 /*
1854  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
1855  * in a specific physmem area.
1856  *
1857  * Returns ENOMEM if the pagedaemon failed to free any pages.
1858  * If not failok, failure will lead to panic.
1859  *
1860  * Must be called with fpageq locked.
1861  */
1862 int
1863 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
1864 {
1865 	struct uvm_pmalloc pma;
1866 	const char *wmsg = "pmrwait";
1867 
1868 	if (curproc == uvm.pagedaemon_proc) {
1869 		/*
1870 		 * This is not that uncommon when the pagedaemon is trying
1871 		 * to flush out a large mmapped file. VOP_WRITE will circle
1872 		 * back through the buffer cache and try to get more memory.
1873 		 * The pagedaemon starts by calling bufbackoff, but we can
1874 		 * easily use up that reserve in a single scan iteration.
1875 		 */
1876 		uvm_unlock_fpageq();
1877 		if (bufbackoff(NULL, atop(size)) == 0) {
1878 			uvm_lock_fpageq();
1879 			return 0;
1880 		}
1881 		uvm_lock_fpageq();
1882 
1883 		/*
1884 		 * XXX detect pagedaemon deadlock - see comment in
1885 		 * uvm_wait(), as this is exactly the same issue.
1886 		 */
1887 		printf("pagedaemon: wait_pla deadlock detected!\n");
1888 		msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3);
1889 #if defined(DEBUG)
1890 		/* DEBUG: panic so we can debug it */
1891 		panic("wait_pla pagedaemon deadlock");
1892 #endif
1893 		return 0;
1894 	}
1895 
1896 	for (;;) {
1897 		pma.pm_constraint.ucr_low = low;
1898 		pma.pm_constraint.ucr_high = high;
1899 		pma.pm_size = size;
1900 		pma.pm_flags = UVM_PMA_LINKED;
1901 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
1902 
1903 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
1904 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
1905 			msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0);
1906 
1907 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
1908 		    pma.pm_flags & UVM_PMA_FAIL) {
1909 			if (failok)
1910 				return ENOMEM;
1911 			printf("uvm_wait: failed to free %ld pages between "
1912 			    "0x%lx-0x%lx\n", atop(size), low, high);
1913 		} else
1914 			return 0;
1915 	}
1916 	/* UNREACHABLE */
1917 }
1918 
1919 /*
1920  * Wake up uvm_pmalloc sleepers.
1921  */
1922 void
1923 uvm_wakeup_pla(paddr_t low, psize_t len)
1924 {
1925 	struct uvm_pmalloc *pma, *pma_next;
1926 	paddr_t high;
1927 
1928 	high = low + len;
1929 
1930 	/* Wake specific allocations waiting for this memory. */
1931 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
1932 	    pma = pma_next) {
1933 		pma_next = TAILQ_NEXT(pma, pmq);
1934 
1935 		if (low < pma->pm_constraint.ucr_high &&
1936 		    high > pma->pm_constraint.ucr_low) {
1937 			pma->pm_flags |= UVM_PMA_FREED;
1938 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
1939 				pma->pm_flags &= ~UVM_PMA_LINKED;
1940 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
1941 				    pmq);
1942 				wakeup(pma);
1943 			}
1944 		}
1945 	}
1946 }
1947