1 /*	$OpenBSD: uvm_pmemrange.c,v 1.38 2014/03/21 21:39:36 miod Exp $	*/
2 
3 /*
4  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <uvm/uvm.h>
22 #include <sys/malloc.h>
23 #include <sys/proc.h>		/* XXX for atomic */
24 #include <sys/kernel.h>
25 #include <sys/mount.h>
26 
27 /*
28  * 2 trees: addr tree and size tree.
29  *
30  * The allocator keeps chunks of free pages (called a range).
31  * Two pages are part of the same range if:
32  * - all pages in between are part of that range,
33  * - they are of the same memory type (zeroed or non-zeroed),
34  * - they are part of the same pmemrange.
35  * A pmemrange is a range of memory which is part of the same vm_physseg
36  * and has a use-count.
37  *
38  * addr tree is vm_page[0].objt
39  * size tree is vm_page[1].objt
40  *
41  * The size tree is not used for memory ranges of 1 page; instead,
42  * single-page ranges are queued on vm_page[0].pageq
43  *
44  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45  * are joined, unless:
46  * - they have pages in between them which are not free
47  * - they belong to different memtypes (zeroed vs dirty memory)
48  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49  * - they are not a continuation of the same array
50  * The latter issue is caused by vm_physseg ordering and splitting from the
51  * MD initialization machinery. The MD code is dependent on freelists and
52  * happens to split ISA memory from non-ISA memory.
53  * (Note: freelists die die die!)
54  *
55  * uvm_page_init guarantees that every vm_physseg contains an array of
56  * struct vm_page. Also, uvm_page_physload allocates an array of struct
57  * vm_page. This code depends on that array. The array may break across
58  * vm_physseg boundaries.
59  */
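/*
 * Illustrative sketch (not part of the build): under the representation
 * described above, a free range of three pages starting at pg is recorded
 * roughly as
 *
 *	pg[0].fpgsz = 3;				(size on first page)
 *	RB_INSERT(uvm_pmr_addr, &pmr->addr, &pg[0]);	(addr tree, pg[0].objt)
 *	RB_INSERT(uvm_pmr_size, &pmr->size[mt], &pg[1]);(size tree, pg[1].objt)
 *
 * while a one-page range is instead queued on pmr->single[mt] through
 * pg[0].pageq. See uvm_pmr_insert_addr() and uvm_pmr_insert_size() below
 * for the real insertion paths.
 */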
60 
61 /*
62  * Validate the flags of the page. (Used in asserts.)
63  * Any free page must have the PQ_FREE flag set.
64  * Free pages may be zeroed.
65  * Pmap flags are left untouched.
66  *
67  * The PQ_FREE flag is not checked here: by not checking, we can easily use
68  * this check on pages that are being freed.
69  */
70 #define VALID_FLAGS(pg_flags)						\
71 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
72 
73 /* Tree comparators. */
74 int	uvm_pmemrange_addr_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
75 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
76 int	uvm_pmr_addr_cmp(struct vm_page *, struct vm_page *);
77 int	uvm_pmr_size_cmp(struct vm_page *, struct vm_page *);
78 int	uvm_pmr_pg_to_memtype(struct vm_page *);
79 
80 #ifdef DDB
81 void	uvm_pmr_print(void);
82 #endif
83 
84 /*
85  * Memory types. The page flags are used to derive what the current memory
86  * type of a page is.
87  */
88 int
89 uvm_pmr_pg_to_memtype(struct vm_page *pg)
90 {
91 	if (pg->pg_flags & PG_ZERO)
92 		return UVM_PMR_MEMTYPE_ZERO;
93 	/* Default: dirty memory. */
94 	return UVM_PMR_MEMTYPE_DIRTY;
95 }
96 
97 /* Trees. */
98 RB_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
99 RB_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
100 RB_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
101     uvm_pmemrange_addr_cmp);
102 
103 /* Validation. */
104 #ifdef DEBUG
105 void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
106 #else
107 #define uvm_pmr_assertvalid(pmr)	do {} while (0)
108 #endif
109 
110 int			 uvm_pmr_get1page(psize_t, int, struct pglist *,
111 			    paddr_t, paddr_t);
112 
113 struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
114 struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
115 struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
116 			    struct vm_page *, int);
117 void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
118 			    struct vm_page *pg, struct vm_page **pg_prev,
119 			    struct vm_page **pg_next);
120 struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
121 			    struct vm_page *, paddr_t);
122 psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
123 			    struct vm_page **, int);
124 void			 uvm_pmr_split(paddr_t);
125 struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
126 struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
127 			    struct uvm_pmemrange *);
128 psize_t			 pow2divide(psize_t, psize_t);
129 struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
130 			    struct vm_page *, paddr_t, paddr_t, int);
131 
132 /*
133  * Computes num/denom and rounds it up to the next power-of-2.
134  *
135  * This is a division function which calculates an approximation of
136  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
137  * have to be accurate.
138  *
139  * Providing too large a value makes the allocator slightly faster, at the
140  * risk of hitting the failure case more often. Providing too small a value
141  * makes the allocator a bit slower, but less likely to hit a failure case.
142  */
143 psize_t
144 pow2divide(psize_t num, psize_t denom)
145 {
146 	int rshift;
147 
148 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
149 		;
150 	return (paddr_t)1 << rshift;
151 }
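/*
 * Worked example (illustrative): pow2divide(4000, 3) doubles denom
 * 3, 6, 12, ... until it reaches 6144 >= 4000, which takes 11 doublings,
 * so the result is 1 << 11 = 2048; 4000/3 is roughly 1333, rounded up to
 * the next power of two. uvm_pmr_getpages() uses this as
 * pow2divide(count, maxseg) to pick an approximate per-segment size.
 */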
152 
153 /*
154  * Predicate: lhs is a subrange of rhs.
155  *
156  * If rhs_low == 0: don't care about lower bound.
157  * If rhs_high == 0: don't care about upper bound.
158  */
159 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
160 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
161 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
162 
163 /*
164  * Predicate: lhs intersects with rhs.
165  *
166  * If rhs_low == 0: don't care about lower bound.
167  * If rhs_high == 0: don't care about upper bound.
168  * Ranges don't intersect if they have no page in common; because page
169  * ranges are half-open (array semantics), < instead of <= is used here.
170  */
171 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
172 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
173 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
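/*
 * Example (illustrative): page ranges are half-open, so [5, 10) and
 * [10, 15) share no page and PMR_INTERSECTS_WITH(5, 10, 10, 15) is false,
 * while PMR_INTERSECTS_WITH(5, 11, 10, 15) is true (page 10 is shared).
 */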
174 
175 /*
176  * Align to power-of-2 alignment.
177  */
178 #define PMR_ALIGN(pgno, align)						\
179 	(((pgno) + ((align) - 1)) & ~((align) - 1))
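/*
 * Example (illustrative): PMR_ALIGN(0x1235, 0x10) == 0x1240, and an
 * already aligned page number is returned unchanged:
 * PMR_ALIGN(0x1240, 0x10) == 0x1240. align must be a power of two.
 */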
180 
181 
182 /*
183  * Comparator: sort by address ascending.
184  */
185 int
186 uvm_pmemrange_addr_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
187 {
188 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
189 }
190 
191 /*
192  * Comparator: sort by use ascending.
193  *
194  * The higher the use value of a range, the more devices need memory in
195  * this range. Therefore, allocate from the range with the lowest use first.
196  */
197 int
198 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
199 {
200 	int result;
201 
202 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
203 	if (result == 0)
204 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
205 	return result;
206 }
207 
208 int
209 uvm_pmr_addr_cmp(struct vm_page *lhs, struct vm_page *rhs)
210 {
211 	paddr_t lhs_addr, rhs_addr;
212 
213 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
214 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
215 
216 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
217 }
218 
219 int
220 uvm_pmr_size_cmp(struct vm_page *lhs, struct vm_page *rhs)
221 {
222 	psize_t lhs_size, rhs_size;
223 	int cmp;
224 
225 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
226 	lhs_size = (lhs - 1)->fpgsz;
227 	rhs_size = (rhs - 1)->fpgsz;
228 
229 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
230 	if (cmp == 0)
231 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
232 	return cmp;
233 }
234 
235 /*
236  * Find the first range of free pages that is at least sz pages long.
237  */
238 struct vm_page *
239 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
240 {
241 	struct	vm_page *node, *best;
242 
243 	KASSERT(sz >= 1);
244 
245 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
246 		return TAILQ_FIRST(&pmr->single[mti]);
247 
248 	node = RB_ROOT(&pmr->size[mti]);
249 	best = NULL;
250 	while (node != NULL) {
251 		if ((node - 1)->fpgsz >= sz) {
252 			best = (node - 1);
253 			node = RB_LEFT(node, objt);
254 		} else
255 			node = RB_RIGHT(node, objt);
256 	}
257 	return best;
258 }
259 
260 /*
261  * Finds the next range. The next range has a size >= pg->fpgsz.
262  * Returns NULL if no more ranges are available.
263  */
264 struct vm_page *
265 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
266 {
267 	struct vm_page *npg;
268 
269 	KASSERT(pmr != NULL && pg != NULL);
270 	if (pg->fpgsz == 1) {
271 		if (TAILQ_NEXT(pg, pageq) != NULL)
272 			return TAILQ_NEXT(pg, pageq);
273 		else
274 			npg = RB_MIN(uvm_pmr_size, &pmr->size[mt]);
275 	} else
276 		npg = RB_NEXT(uvm_pmr_size, &pmr->size[mt], pg + 1);
277 
278 	return npg == NULL ? NULL : npg - 1;
279 }
280 
281 /*
282  * Finds the previous and next ranges relative to the (uninserted) pg range.
283  *
284  * *pg_prev == NULL if no previous range that can join with pg is
285  * 	available.
286  * *pg_next == NULL if no next range that can join with pg is
287  * 	available.
288  */
289 void
290 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
291     struct vm_page **pg_prev, struct vm_page **pg_next)
292 {
293 	KASSERT(pg_prev != NULL && pg_next != NULL);
294 
295 	*pg_next = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
296 	if (*pg_next == NULL)
297 		*pg_prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
298 	else
299 		*pg_prev = RB_PREV(uvm_pmr_addr, &pmr->addr, *pg_next);
300 
301 	KDASSERT(*pg_next == NULL ||
302 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
303 	KDASSERT(*pg_prev == NULL ||
304 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
305 
306 	/* Reset if not contig. */
307 	if (*pg_prev != NULL &&
308 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
309 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
310 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
311 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
312 		*pg_prev = NULL;
313 	if (*pg_next != NULL &&
314 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
315 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
316 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
317 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
318 		*pg_next = NULL;
319 	return;
320 }
321 
322 /*
323  * Remove a range from the address tree.
324  * Address tree maintains pmr counters.
325  */
326 void
327 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
328 {
329 	KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
330 	KDASSERT(pg->pg_flags & PQ_FREE);
331 	RB_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
332 
333 	pmr->nsegs--;
334 }
335 /*
336  * Remove a range from the size tree.
337  */
338 void
339 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
340 {
341 	int memtype;
342 #ifdef DEBUG
343 	struct vm_page *i;
344 #endif
345 
346 	KDASSERT(pg->fpgsz >= 1);
347 	KDASSERT(pg->pg_flags & PQ_FREE);
348 	memtype = uvm_pmr_pg_to_memtype(pg);
349 
350 	if (pg->fpgsz == 1) {
351 #ifdef DEBUG
352 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
353 			if (i == pg)
354 				break;
355 		}
356 		KDASSERT(i == pg);
357 #endif
358 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
359 	} else {
360 		KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[memtype],
361 		    pg + 1) == pg + 1);
362 		RB_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
363 	}
364 }
365 /* Remove from both trees. */
366 void
367 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
368 {
369 	uvm_pmr_assertvalid(pmr);
370 	uvm_pmr_remove_size(pmr, pg);
371 	uvm_pmr_remove_addr(pmr, pg);
372 	uvm_pmr_assertvalid(pmr);
373 }
374 
375 /*
376  * Insert the range described in pg.
377  * Returns the range thus created (which may be joined with the previous and
378  * next ranges).
379  * If no_join, the caller guarantees that the range cannot possibly join
380  * with adjacent ranges.
381  */
382 struct vm_page *
383 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
384 {
385 	struct vm_page *prev, *next;
386 
387 #ifdef DEBUG
388 	struct vm_page *i;
389 	int mt;
390 #endif
391 
392 	KDASSERT(pg->pg_flags & PQ_FREE);
393 	KDASSERT(pg->fpgsz >= 1);
394 
395 #ifdef DEBUG
396 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
397 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
398 			KDASSERT(i != pg);
399 		if (pg->fpgsz > 1) {
400 			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mt],
401 			    pg + 1) == NULL);
402 		}
403 		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
404 	}
405 #endif
406 
407 	if (!no_join) {
408 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
409 		if (next != NULL) {
410 			uvm_pmr_remove_size(pmr, next);
411 			uvm_pmr_remove_addr(pmr, next);
412 			pg->fpgsz += next->fpgsz;
413 			next->fpgsz = 0;
414 		}
415 		if (prev != NULL) {
416 			uvm_pmr_remove_size(pmr, prev);
417 			prev->fpgsz += pg->fpgsz;
418 			pg->fpgsz = 0;
419 			return prev;
420 		}
421 	}
422 
423 	RB_INSERT(uvm_pmr_addr, &pmr->addr, pg);
424 
425 	pmr->nsegs++;
426 
427 	return pg;
428 }
429 /*
430  * Insert the range described in pg into the size tree, or onto the
431  * single-page queue if the range is only one page long.
432  *
433  * The range must already be in the address tree.
434  */
435 void
436 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
437 {
438 	int memtype;
439 #ifdef DEBUG
440 	struct vm_page *i;
441 	int mti;
442 #endif
443 
444 	KDASSERT(pg->fpgsz >= 1);
445 	KDASSERT(pg->pg_flags & PQ_FREE);
446 
447 	memtype = uvm_pmr_pg_to_memtype(pg);
448 #ifdef DEBUG
449 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
450 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
451 			KDASSERT(i != pg);
452 		if (pg->fpgsz > 1) {
453 			KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mti],
454 			    pg + 1) == NULL);
455 		}
456 		KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
457 	}
458 	for (i = pg; i < pg + pg->fpgsz; i++)
459 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
460 #endif
461 
462 	if (pg->fpgsz == 1)
463 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
464 	else
465 		RB_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
466 }
467 /* Insert in both trees. */
468 struct vm_page *
469 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
470 {
471 	uvm_pmr_assertvalid(pmr);
472 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
473 	uvm_pmr_insert_size(pmr, pg);
474 	uvm_pmr_assertvalid(pmr);
475 	return pg;
476 }
477 
478 /*
479  * Find the last page that is part of this segment.
480  * => pg: the range at which to start the search.
481  * => boundary: the page number boundary specification (0 = no boundary).
482  * => pmr: the pmemrange of the page.
483  *
484  * This function returns the last page of the segment (the page just before
485  * the next range); to get the next range, run TAILQ_NEXT(result, pageq) on
486  * the return value. The reason is that this way, the length of the segment
487  * is easily calculated using: atop(result) - atop(pg) + 1.
488  * Hence this function also never returns NULL.
489  */
490 struct vm_page *
491 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
492     struct vm_page *pg, paddr_t boundary)
493 {
494 	paddr_t	first_boundary;
495 	struct	vm_page *next;
496 	struct	vm_page *prev;
497 
498 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
499 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
500 	if (boundary != 0) {
501 		first_boundary =
502 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
503 	} else
504 		first_boundary = 0;
505 
506 	/*
507 	 * Increase next until it hits the first page of the next segment.
508 	 *
509 	 * While loop checks the following:
510 	 * - next != NULL	we have not reached the end of pgl
511 	 * - boundary == 0 || next < first_boundary
512 	 *			we do not cross a boundary
513 	 * - atop(prev) + 1 == atop(next)
514 	 *			still in the same segment
515 	 * - low <= next
516 	 * - high > next	still in the same memory range
517 	 * - memtype is equal	allocator is unable to view different memtypes
518 	 *			as part of the same segment
519 	 * - prev + 1 == next	no array breakage occurs
520 	 */
521 	prev = pg;
522 	next = TAILQ_NEXT(prev, pageq);
523 	while (next != NULL &&
524 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
525 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
526 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
527 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
528 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
529 	    prev + 1 == next) {
530 		prev = next;
531 		next = TAILQ_NEXT(prev, pageq);
532 	}
533 
534 	/*
535 	 * End of this segment.
536 	 */
537 	return prev;
538 }
539 
540 /*
541  * Remove the first segment of contiguous pages from pgl.
542  * A segment ends if it crosses boundary (unless boundary = 0) or
543  * if it would enter a different uvm_pmemrange.
544  *
545  * Work: the page range that the caller is currently working with.
546  * May be NULL.
547  *
548  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
549  * the first segment is erased (which, if called by uvm_pmr_getpages(),
550  * probably is the smallest or very close to it).
551  */
552 psize_t
553 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
554     struct vm_page **work, int is_desperate)
555 {
556 	struct vm_page *start, *end, *iter, *iter_end, *inserted;
557 	psize_t count;
558 	struct uvm_pmemrange *pmr, *pmr_iter;
559 
560 	KASSERT(!TAILQ_EMPTY(pgl));
561 
562 	/*
563 	 * Initialize to first page.
564 	 * Unless desperate scan finds a better candidate, this is what'll be
565 	 * erased.
566 	 */
567 	start = TAILQ_FIRST(pgl);
568 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
569 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
570 
571 	/*
572 	 * If we are desperate, we _really_ want to get rid of the smallest
573 	 * element (rather than a close match to the smallest element).
574 	 */
575 	if (is_desperate) {
576 		/* Linear search for smallest segment. */
577 		pmr_iter = pmr;
578 		for (iter = TAILQ_NEXT(end, pageq);
579 		    iter != NULL && start != end;
580 		    iter = TAILQ_NEXT(iter_end, pageq)) {
581 			/*
582 			 * Only update pmr if it doesn't match current
583 			 * iteration.
584 			 */
585 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
586 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
587 				pmr_iter = uvm_pmemrange_find(atop(
588 				    VM_PAGE_TO_PHYS(iter)));
589 			}
590 
591 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
592 			    boundary);
593 
594 			/*
595 			 * Current iteration is smaller than best match so
596 			 * far; update.
597 			 */
598 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
599 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
600 				start = iter;
601 				end = iter_end;
602 				pmr = pmr_iter;
603 			}
604 		}
605 	}
606 
607 	/*
608 	 * Calculate count and end of the list.
609 	 */
610 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
611 	end = TAILQ_NEXT(end, pageq);
612 
613 	/*
614 	 * Actually remove the range of pages.
615 	 *
616 	 * Sadly, this cannot be done using pointer iteration:
617 	 * vm_physseg is not guaranteed to be sorted on address, hence
618 	 * uvm_page_init() may not have initialized its array sorted by
619 	 * page number.
620 	 */
621 	for (iter = start; iter != end; iter = iter_end) {
622 		iter_end = TAILQ_NEXT(iter, pageq);
623 		TAILQ_REMOVE(pgl, iter, pageq);
624 	}
625 
626 	start->fpgsz = count;
627 	inserted = uvm_pmr_insert(pmr, start, 0);
628 
629 	/*
630 	 * If the caller was working on a range and this function modified
631 	 * that range, update the pointer.
632 	 */
633 	if (work != NULL && *work != NULL &&
634 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
635 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
636 	    atop(VM_PAGE_TO_PHYS(*work)))
637 		*work = inserted;
638 	return count;
639 }
640 
641 /*
642  * Extract a number of pages from a segment of free pages.
643  * Called by uvm_pmr_getpages.
644  *
645  * Returns the segment that was created from pages left over at the tail
646  * of the removed set of pages, or NULL if no pages were left at the tail.
647  */
648 struct vm_page *
649 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
650     paddr_t start, paddr_t end, struct pglist *result)
651 {
652 	struct vm_page *after, *pg_i;
653 	psize_t before_sz, after_sz;
654 #ifdef DEBUG
655 	psize_t i;
656 #endif
657 
658 	KDASSERT(end > start);
659 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
660 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
661 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
662 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
663 
664 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
665 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
666 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
667 	uvm_pmr_assertvalid(pmr);
668 
669 	uvm_pmr_remove_size(pmr, pg);
670 	if (before_sz == 0)
671 		uvm_pmr_remove_addr(pmr, pg);
672 	after = pg + before_sz + (end - start);
673 
674 	/* Add selected pages to result. */
675 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
676 		KASSERT(pg_i->pg_flags & PQ_FREE);
677 		pg_i->fpgsz = 0;
678 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
679 	}
680 
681 	/* Before handling. */
682 	if (before_sz > 0) {
683 		pg->fpgsz = before_sz;
684 		uvm_pmr_insert_size(pmr, pg);
685 	}
686 
687 	/* After handling. */
688 	if (after_sz > 0) {
689 #ifdef DEBUG
690 		for (i = 0; i < after_sz; i++) {
691 			KASSERT(!uvm_pmr_isfree(after + i));
692 		}
693 #endif
694 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
695 		after->fpgsz = after_sz;
696 		after = uvm_pmr_insert_addr(pmr, after, 1);
697 		uvm_pmr_insert_size(pmr, after);
698 	}
699 
700 	uvm_pmr_assertvalid(pmr);
701 	return (after_sz > 0 ? after : NULL);
702 }
703 
704 /*
705  * Acquire a number of pages.
706  *
707  * count:	the number of pages returned
708  * start:	lowest page number
709  * end:		highest page number +1
710  * 		(start = end = 0: no limitation)
711  * align:	power-of-2 alignment constraint (align = 1: no alignment)
712  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
713  * maxseg:	maximum number of segments to return
714  * flags:	UVM_PLA_* flags
715  * result:	returned pages storage (uses pageq)
716  */
717 int
718 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
719     paddr_t boundary, int maxseg, int flags, struct pglist *result)
720 {
721 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
722 	struct	vm_page *found, *f_next; /* Iterate chunks. */
723 	psize_t	fcount;			/* Current found pages. */
724 	int	fnsegs;			/* Current segment counter. */
725 	int	try, start_try;
726 	psize_t	search[3];
727 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
728 	int	memtype;		/* Requested memtype. */
729 	int	memtype_init;		/* Best memtype. */
730 	int	desperate;		/* True if allocation failed. */
731 #ifdef DIAGNOSTIC
732 	struct	vm_page *diag_prev;	/* Used during validation. */
733 #endif /* DIAGNOSTIC */
734 
735 	/*
736 	 * Validate arguments.
737 	 */
738 	KASSERT(count > 0);
739 	KASSERT(start == 0 || end == 0 || start < end);
740 	KASSERT(align >= 1);
741 	KASSERT(powerof2(align));
742 	KASSERT(maxseg > 0);
743 	KASSERT(boundary == 0 || powerof2(boundary));
744 	KASSERT(boundary == 0 || maxseg * boundary >= count);
745 	KASSERT(TAILQ_EMPTY(result));
746 
747 	/*
748 	 * TRYCONTIG is a noop if you only want a single segment.
749 	 * Remove it if that's the case: otherwise it'll deny the fast
750 	 * allocation.
751 	 */
752 	if (maxseg == 1 || count == 1)
753 		flags &= ~UVM_PLA_TRYCONTIG;
754 
755 	/*
756 	 * Configure search.
757 	 *
758 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
759 	 * search[1] is multiple segments, chosen to fulfill the search in
760 	 *   approximately even-sized segments.
761 	 *   This is a good trade-off between slightly reduced allocation speed
762 	 *   and less fragmentation.
763 	 * search[2] is the worst case, in which all segments are evaluated.
764 	 *   This provides the least fragmentation, but makes the search
765 	 *   possibly longer (although by the time this case is selected,
766 	 *   search time usually no longer matters).
767 	 *
768 	 * The exception is when maxseg == 1: since we can only fulfill that
769 	 * with one segment of count pages, only a single search type has to
770 	 * be attempted.
771 	 */
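	/*
	 * Worked example (illustrative): for count = 100 and maxseg = 8
	 * without UVM_PLA_TRYCONTIG, search[] becomes { 100, 16, 1 }
	 * (pow2divide(100, 8) == 16) and start_try is 1, so the first
	 * attempt looks for free ranges of at least 16 pages.
	 */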
772 	if (maxseg == 1 || count == 1) {
773 		start_try = 2;
774 		search[2] = count;
775 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
776 		start_try = 2;
777 		search[2] = 1;
778 	} else {
779 		start_try = 0;
780 		search[0] = count;
781 		search[1] = pow2divide(count, maxseg);
782 		search[2] = 1;
783 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
784 			start_try = 1;
785 		if (search[1] >= search[0]) {
786 			search[1] = search[0];
787 			start_try = 1;
788 		}
789 		if (search[2] >= search[start_try]) {
790 			start_try = 2;
791 		}
792 	}
793 
794 	/*
795 	 * Memory type: if zeroed memory is requested, traverse the zero set.
796 	 * Otherwise, traverse the dirty set.
797 	 *
798 	 * The memtype iterator is reinitialized to memtype_init on entrance
799 	 * of a pmemrange.
800 	 */
801 	if (flags & UVM_PLA_ZERO)
802 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
803 	else
804 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
805 
806 	/*
807 	 * Initially, we're not desperate.
808 	 *
809 	 * Note that if we return from a sleep, we are still desperate.
810 	 * Chances are that memory pressure is still high, so resetting
811 	 * seems over-optimistic to me.
812 	 */
813 	desperate = 0;
814 
815 	uvm_lock_fpageq();
816 
817 retry:		/* Return point after sleeping. */
818 	fcount = 0;
819 	fnsegs = 0;
820 
821 retry_desperate:
822 	/*
823 	 * If we just want any page(s), go for the really fast option.
824 	 */
825 	if (count <= maxseg && align == 1 && boundary == 0 &&
826 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
827 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
828 		    result, start, end);
829 
830 		/*
831 		 * If we found sufficient pages, go to the success exit code.
832 		 *
833 		 * Otherwise, go immediately to fail, since we collected
834 		 * all we could anyway.
835 		 */
836 		if (fcount == count)
837 			goto out;
838 		else
839 			goto fail;
840 	}
841 
842 	/*
843 	 * The heart of the contig case.
844 	 *
845 	 * The code actually looks like this:
846 	 *
847 	 * foreach (struct pmemrange) {
848 	 *	foreach (memtype) {
849 	 *		foreach(try) {
850 	 *			foreach (free range of memtype in pmemrange,
851 	 *			    starting at search[try]) {
852 	 *				while (range has space left)
853 	 *					take from range
854 	 *			}
855 	 *		}
856 	 *	}
857 	 *
858 	 *	if next pmemrange has higher usecount than current:
859 	 *		enter desperate case (which will drain the pmemranges
860 	 *		until empty prior to moving to the next one)
861 	 * }
862 	 *
863 	 * When desperate is activated, try always starts at the highest
864 	 * value. The memtype loop is using a goto ReScanMemtype.
865 	 * The try loop is using a goto ReScan.
866 	 * The 'range has space left' loop uses label DrainFound.
867 	 *
868 	 * Writing them all as loops would take up a lot of screen space in
869 	 * the form of indentation and some parts are easier to express
870 	 * using the labels.
871 	 */
872 
873 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
874 		/* Empty range. */
875 		if (pmr->nsegs == 0)
876 			continue;
877 
878 		/* Outside requested range. */
879 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
880 			continue;
881 
882 		memtype = memtype_init;
883 
884 rescan_memtype:	/* Return point at memtype++. */
885 		try = start_try;
886 
887 rescan:		/* Return point at try++. */
888 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
889 		    found != NULL;
890 		    found = f_next) {
891 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
892 
893 			fstart = atop(VM_PAGE_TO_PHYS(found));
894 			if (start != 0)
895 				fstart = MAX(start, fstart);
896 drain_found:
897 			/*
898 			 * Throw away the first segment if fnsegs == maxseg
899 			 *
900 			 * Note that f_next is still valid after this call,
901 			 * since we only allocated from entries before f_next.
902 			 * We don't revisit the entries we already extracted
903 			 * from unless we entered the desperate case.
904 			 */
905 			if (fnsegs == maxseg) {
906 				fnsegs--;
907 				fcount -=
908 				    uvm_pmr_remove_1strange(result, boundary,
909 				    &found, desperate);
910 			}
911 
912 			fstart = PMR_ALIGN(fstart, align);
913 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
914 			if (end != 0)
915 				fend = MIN(end, fend);
916 			if (boundary != 0) {
917 				fend =
918 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
919 			}
920 			if (fstart >= fend)
921 				continue;
922 			if (fend - fstart > count - fcount)
923 				fend = fstart + (count - fcount);
924 
925 			fcount += fend - fstart;
926 			fnsegs++;
927 			found = uvm_pmr_extract_range(pmr, found,
928 			    fstart, fend, result);
929 
930 			if (fcount == count)
931 				goto out;
932 
933 			/*
934 			 * If there's still space left in found, try to
935 			 * fully drain it prior to continuing.
936 			 */
937 			if (found != NULL) {
938 				fstart = fend;
939 				goto drain_found;
940 			}
941 		}
942 
943 		/*
944 		 * Try a smaller search now.
945 		 */
946 		if (++try < nitems(search))
947 			goto rescan;
948 
949 		/*
950 		 * Exhaust all memory types prior to going to the next memory
951 		 * segment.
952 		 * This means that zero-vs-dirty are eaten prior to moving
953 		 * to a pmemrange with a higher use-count.
954 		 *
955 		 * Code is basically a difficult way of writing:
956 		 * memtype = memtype_init;
957 		 * do {
958 		 *	...;
959 		 *	memtype += 1;
960 		 *	memtype %= MEMTYPE_MAX;
961 		 * } while (memtype != memtype_init);
962 		 */
963 		memtype += 1;
964 		if (memtype == UVM_PMR_MEMTYPE_MAX)
965 			memtype = 0;
966 		if (memtype != memtype_init)
967 			goto rescan_memtype;
968 
969 		/*
970 		 * If not desperate, enter desperate case prior to eating all
971 		 * the good stuff in the next range.
972 		 */
973 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
974 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
975 			break;
976 	}
977 
978 	/*
979 	 * Not enough memory of the requested type available. Fall back to
980 	 * less good memory that we'll clean up better later.
981 	 *
982 	 * This algorithm is not very smart though, it just starts scanning
983 	 * a differently typed range, but the nicer ranges of the previous
984 	 * iteration may fall out. Hence there is a small chance of a false
985 	 * negative.
986 	 *
987 	 * When desperate: scan all search sizes, starting at the smallest
988 	 * (the last search index), and do not consider UVM_PLA_TRYCONTIG
989 	 * (which may allow us to hit the fast path now).
990 	 *
991 	 * Also, because we will revisit entries we scanned before, we need
992 	 * to reset the page queue, or we may end up releasing entries in
993 	 * such a way as to invalidate f_next.
994 	 */
995 	if (!desperate) {
996 		desperate = 1;
997 		start_try = nitems(search) - 1;
998 		flags &= ~UVM_PLA_TRYCONTIG;
999 
1000 		while (!TAILQ_EMPTY(result))
1001 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1002 		fnsegs = 0;
1003 		fcount = 0;
1004 		goto retry_desperate;
1005 	}
1006 
1007 fail:
1008 	/*
1009 	 * Allocation failed.
1010 	 */
1011 
1012 	/* XXX: claim from memory reserve here */
1013 
1014 	while (!TAILQ_EMPTY(result))
1015 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1016 
1017 	if (flags & UVM_PLA_WAITOK) {
1018 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1019 		    flags & UVM_PLA_FAILOK) == 0)
1020 			goto retry;
1021 		KASSERT(flags & UVM_PLA_FAILOK);
1022 	} else
1023 		wakeup(&uvm.pagedaemon);
1024 	uvm_unlock_fpageq();
1025 
1026 	return ENOMEM;
1027 
1028 out:
1029 
1030 	/*
1031 	 * Allocation successful.
1032 	 */
1033 
1034 	uvmexp.free -= fcount;
1035 
1036 	uvm_unlock_fpageq();
1037 
1038 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1039 #ifdef DIAGNOSTIC
1040 	fnsegs = 0;
1041 	fcount = 0;
1042 	diag_prev = NULL;
1043 #endif /* DIAGNOSTIC */
1044 	TAILQ_FOREACH(found, result, pageq) {
1045 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1046 
1047 		if (found->pg_flags & PG_ZERO) {
1048 			uvmexp.zeropages--;
1049 		}
1050 		if (flags & UVM_PLA_ZERO) {
1051 			if (found->pg_flags & PG_ZERO)
1052 				uvmexp.pga_zerohit++;
1053 			else {
1054 				uvmexp.pga_zeromiss++;
1055 				uvm_pagezero(found);
1056 			}
1057 		}
1058 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1059 
1060 		found->uobject = NULL;
1061 		found->uanon = NULL;
1062 		found->pg_version++;
1063 
1064 		/*
1065 		 * Validate that the page matches the range criteria.
1066 		 */
1067 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1068 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1069 
1070 #ifdef DIAGNOSTIC
1071 		/*
1072 		 * Update fcount (# found pages) and
1073 		 * fnsegs (# found segments) counters.
1074 		 */
1075 		if (diag_prev == NULL ||
1076 		    /* new segment if it contains a hole */
1077 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1078 		    atop(VM_PAGE_TO_PHYS(found)) ||
1079 		    /* new segment if it crosses boundary */
1080 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1081 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1082 			fnsegs++;
1083 		fcount++;
1084 
1085 		diag_prev = found;
1086 #endif /* DIAGNOSTIC */
1087 	}
1088 
1089 #ifdef DIAGNOSTIC
1090 	/*
1091 	 * Panic on algorithm failure.
1092 	 */
1093 	if (fcount != count || fnsegs > maxseg) {
1094 		panic("pmemrange allocation error: "
1095 		    "allocated %ld pages in %d segments, "
1096 		    "but request was %ld pages in %d segments",
1097 		    fcount, fnsegs, count, maxseg);
1098 	}
1099 #endif /* DIAGNOSTIC */
1100 
1101 	return 0;
1102 }
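/*
 * Example call (illustrative sketch, not taken from this file; the numbers
 * are arbitrary): allocate 16 pages below 16MB, 64KB-aligned, in a single
 * segment, sleeping until memory is available. The caller must not hold
 * the fpageq lock; note that align and the end limit are in pages.
 *
 *	struct pglist pgl;
 *	int error;
 *
 *	TAILQ_INIT(&pgl);
 *	error = uvm_pmr_getpages(16, 0, atop(16 * 1024 * 1024),
 *	    atop(64 * 1024), 0, 1, UVM_PLA_WAITOK, &pgl);
 *	if (error == 0)
 *		... the pages are linked through pageq in pgl ...
 */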
1103 
1104 /*
1105  * Free a number of contig pages (invoked by uvm_page_init).
1106  */
1107 void
1108 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1109 {
1110 	struct uvm_pmemrange *pmr;
1111 	psize_t i, pmr_count;
1112 	struct vm_page *firstpg = pg;
1113 
1114 	for (i = 0; i < count; i++) {
1115 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1116 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1117 
1118 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1119 		    VALID_FLAGS(pg[i].pg_flags))) {
1120 			printf("Flags: 0x%x, will panic now.\n",
1121 			    pg[i].pg_flags);
1122 		}
1123 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1124 		    VALID_FLAGS(pg[i].pg_flags));
1125 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1126 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1127 	}
1128 
1129 	uvm_lock_fpageq();
1130 
1131 	for (i = count; i > 0; i -= pmr_count) {
1132 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1133 		KASSERT(pmr != NULL);
1134 
1135 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1136 		pg->fpgsz = pmr_count;
1137 		uvm_pmr_insert(pmr, pg, 0);
1138 
1139 		uvmexp.free += pmr_count;
1140 		pg += pmr_count;
1141 	}
1142 	wakeup(&uvmexp.free);
1143 
1144 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1145 
1146 	uvm_unlock_fpageq();
1147 }
1148 
1149 /*
1150  * Free all pages in the queue.
1151  */
1152 void
1153 uvm_pmr_freepageq(struct pglist *pgl)
1154 {
1155 	struct vm_page *pg;
1156 	paddr_t pstart;
1157 	psize_t plen;
1158 
1159 	TAILQ_FOREACH(pg, pgl, pageq) {
1160 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1161 		    VALID_FLAGS(pg->pg_flags))) {
1162 			printf("Flags: 0x%x, will panic now.\n",
1163 			    pg->pg_flags);
1164 		}
1165 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1166 		    VALID_FLAGS(pg->pg_flags));
1167 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1168 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1169 	}
1170 
1171 	uvm_lock_fpageq();
1172 	while (!TAILQ_EMPTY(pgl)) {
1173 		pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1174 		plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1175 		uvmexp.free += plen;
1176 
1177 		uvm_wakeup_pla(pstart, ptoa(plen));
1178 	}
1179 	wakeup(&uvmexp.free);
1180 	uvm_unlock_fpageq();
1181 
1182 	return;
1183 }
1184 
1185 /*
1186  * Store a pmemrange in the list.
1187  *
1188  * The list is sorted by use.
1189  */
1190 struct uvm_pmemrange *
1191 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1192     struct uvm_pmemrange *pmr)
1193 {
1194 	struct uvm_pmemrange *iter;
1195 	int cmp = 1;
1196 
1197 	TAILQ_FOREACH(iter, useq, pmr_use) {
1198 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1199 		if (cmp == 0)
1200 			return iter;
1201 		if (cmp == -1)
1202 			break;
1203 	}
1204 
1205 	if (iter == NULL)
1206 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1207 	else
1208 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1209 	return NULL;
1210 }
1211 
1212 #ifdef DEBUG
1213 /*
1214  * Validation of the whole pmemrange.
1215  * Called with fpageq locked.
1216  */
1217 void
1218 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1219 {
1220 	struct vm_page *prev, *next, *i, *xref;
1221 	int lcv, mti;
1222 
1223 	/* Empty range */
1224 	if (pmr->nsegs == 0)
1225 		return;
1226 
1227 	/* Validate address tree. */
1228 	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1229 		/* Validate the range. */
1230 		KASSERT(i->fpgsz > 0);
1231 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1232 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1233 		    <= pmr->high);
1234 
1235 		/* Validate each page in this range. */
1236 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1237 			/*
1238 			 * Only the first page has a size specification.
1239 			 * Rest is size 0.
1240 			 */
1241 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1242 			/*
1243 			 * Flag check.
1244 			 */
1245 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1246 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1247 			/*
1248 			 * Free pages are:
1249 			 * - not wired
1250 			 * - not loaned
1251 			 * - have no vm_anon
1252 			 * - have no uvm_object
1253 			 */
1254 			KASSERT(i[lcv].wire_count == 0);
1255 			KASSERT(i[lcv].loan_count == 0);
1256 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1257 			    i[lcv].uanon == NULL);
1258 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1259 			    i[lcv].uobject == NULL);
1260 			/*
1261 			 * Pages in a single range always have the same
1262 			 * memtype.
1263 			 */
1264 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1265 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1266 		}
1267 
1268 		/* Check that it shouldn't be joined with its predecessor. */
1269 		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, i);
1270 		if (prev != NULL) {
1271 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1272 			    uvm_pmr_pg_to_memtype(prev) ||
1273 			    atop(VM_PAGE_TO_PHYS(i)) >
1274 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1275 			    prev + prev->fpgsz != i);
1276 		}
1277 
1278 		/* Assert i is in the size tree as well. */
1279 		if (i->fpgsz == 1) {
1280 			TAILQ_FOREACH(xref,
1281 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1282 				if (xref == i)
1283 					break;
1284 			}
1285 			KASSERT(xref == i);
1286 		} else {
1287 			KASSERT(RB_FIND(uvm_pmr_size,
1288 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1289 			    i + 1);
1290 		}
1291 	}
1292 
1293 	/* Validate size tree. */
1294 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1295 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1296 			next = uvm_pmr_nextsz(pmr, i, mti);
1297 			if (next != NULL) {
1298 				KASSERT(i->fpgsz <=
1299 				    next->fpgsz);
1300 			}
1301 
1302 			/* Assert i is in the addr tree as well. */
1303 			KASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1304 
1305 			/* Assert i is of the correct memory type. */
1306 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1307 		}
1308 	}
1309 
1310 	/* Validate nsegs statistic. */
1311 	lcv = 0;
1312 	RB_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1313 		lcv++;
1314 	KASSERT(pmr->nsegs == lcv);
1315 }
1316 #endif /* DEBUG */
1317 
1318 /*
1319  * Split pmr at split point pageno.
1320  * Called with fpageq unlocked.
1321  *
1322  * Split is only applied if a pmemrange spans pageno.
1323  */
1324 void
1325 uvm_pmr_split(paddr_t pageno)
1326 {
1327 	struct uvm_pmemrange *pmr, *drain;
1328 	struct vm_page *rebuild, *prev, *next;
1329 	psize_t prev_sz;
1330 
1331 	uvm_lock_fpageq();
1332 	pmr = uvm_pmemrange_find(pageno);
1333 	if (pmr == NULL || !(pmr->low < pageno)) {
1334 		/* No split required. */
1335 		uvm_unlock_fpageq();
1336 		return;
1337 	}
1338 
1339 	KASSERT(pmr->low < pageno);
1340 	KASSERT(pmr->high > pageno);
1341 
1342 	/*
1343 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1344 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1345 	 * a bit hard, so we just race!
1346 	 */
1347 	uvm_unlock_fpageq();
1348 	drain = uvm_pmr_allocpmr();
1349 	uvm_lock_fpageq();
1350 	pmr = uvm_pmemrange_find(pageno);
1351 	if (pmr == NULL || !(pmr->low < pageno)) {
1352 		/*
1353 		 * We lost the race since someone else ran this or a related
1354 		 * function, however this should be triggered very rarely so
1355 		 * we just leak the pmr.
1356 		 */
1357 		printf("uvm_pmr_split: lost one pmr\n");
1358 		uvm_unlock_fpageq();
1359 		return;
1360 	}
1361 
1362 	drain->low = pageno;
1363 	drain->high = pmr->high;
1364 	drain->use = pmr->use;
1365 
1366 	uvm_pmr_assertvalid(pmr);
1367 	uvm_pmr_assertvalid(drain);
1368 	KASSERT(drain->nsegs == 0);
1369 
1370 	RB_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1371 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1372 			break;
1373 	}
1374 	if (rebuild == NULL)
1375 		prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
1376 	else
1377 		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, rebuild);
1378 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1379 
1380 	/*
1381 	 * Handle free chunk that spans the split point.
1382 	 */
1383 	if (prev != NULL &&
1384 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1385 		psize_t before, after;
1386 
1387 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1388 
1389 		uvm_pmr_remove(pmr, prev);
1390 		prev_sz = prev->fpgsz;
1391 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1392 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1393 
1394 		KASSERT(before > 0);
1395 		KASSERT(after > 0);
1396 
1397 		prev->fpgsz = before;
1398 		uvm_pmr_insert(pmr, prev, 1);
1399 		(prev + before)->fpgsz = after;
1400 		uvm_pmr_insert(drain, prev + before, 1);
1401 	}
1402 
1403 	/*
1404 	 * Move free chunks that no longer fall in the range.
1405 	 */
1406 	for (; rebuild != NULL; rebuild = next) {
1407 		next = RB_NEXT(uvm_pmr_addr, &pmr->addr, rebuild);
1408 
1409 		uvm_pmr_remove(pmr, rebuild);
1410 		uvm_pmr_insert(drain, rebuild, 1);
1411 	}
1412 
1413 	pmr->high = pageno;
1414 	uvm_pmr_assertvalid(pmr);
1415 	uvm_pmr_assertvalid(drain);
1416 
1417 	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1418 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1419 	uvm_unlock_fpageq();
1420 }
1421 
1422 /*
1423  * Increase the usage counter for the given range of memory.
1424  *
1425  * The higher the use counter of a given range of memory, the harder the
1426  * allocator tries to avoid allocating from it.
1427  *
1428  * Addresses here are in paddr_t, not page-numbers.
1429  * The lowest and highest allowed addresses are specified.
1430  */
1431 void
1432 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1433 {
1434 	struct uvm_pmemrange *pmr;
1435 	paddr_t sz;
1436 
1437 	/* pmr uses page numbers, translate low and high. */
1438 	high++;
1439 	high = atop(trunc_page(high));
1440 	low = atop(round_page(low));
1441 	uvm_pmr_split(low);
1442 	uvm_pmr_split(high);
1443 
1444 	sz = 0;
1445 	uvm_lock_fpageq();
1446 	/* Increase use count on segments in range. */
1447 	RB_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1448 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1449 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1450 			pmr->use++;
1451 			sz += pmr->high - pmr->low;
1452 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1453 		}
1454 		uvm_pmr_assertvalid(pmr);
1455 	}
1456 	uvm_unlock_fpageq();
1457 
1458 	KASSERT(sz >= high - low);
1459 }
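/*
 * Example (illustrative): marking the 16MB ISA DMA region as constrained
 * would look like
 *
 *	uvm_pmr_use_inc(0, 0x00ffffff);
 *
 * which splits off a pmemrange covering that region and bumps its use
 * counter, so uvm_pmr_getpages() prefers other ranges when it can.
 * uvm_pmr_init() below does exactly this for every entry in
 * uvm_md_constraints[].
 */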
1460 
1461 /*
1462  * Allocate a pmemrange.
1463  *
1464  * If called from uvm_page_init, the uvm_pageboot_alloc is used.
1465  * If called after uvm_init, malloc is used.
1466  * (And if called in between, you're dead.)
1467  */
1468 struct uvm_pmemrange *
1469 uvm_pmr_allocpmr(void)
1470 {
1471 	struct uvm_pmemrange *nw;
1472 	int i;
1473 
1474 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1475 	if (!uvm.page_init_done) {
1476 		nw = (struct uvm_pmemrange *)
1477 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1478 	} else {
1479 		nw = malloc(sizeof(struct uvm_pmemrange),
1480 		    M_VMMAP, M_NOWAIT);
1481 	}
1482 	KASSERT(nw != NULL);
1483 	bzero(nw, sizeof(struct uvm_pmemrange));
1484 	RB_INIT(&nw->addr);
1485 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1486 		RB_INIT(&nw->size[i]);
1487 		TAILQ_INIT(&nw->single[i]);
1488 	}
1489 	return nw;
1490 }
1491 
1492 /*
1493  * Initialization of pmr.
1494  * Called by uvm_page_init.
1495  *
1496  * Sets up pmemranges.
1497  */
1498 void
1499 uvm_pmr_init(void)
1500 {
1501 	struct uvm_pmemrange *new_pmr;
1502 	int i;
1503 
1504 	TAILQ_INIT(&uvm.pmr_control.use);
1505 	RB_INIT(&uvm.pmr_control.addr);
1506 	TAILQ_INIT(&uvm.pmr_control.allocs);
1507 
1508 	/* By default, one range for the entire address space. */
1509 	new_pmr = uvm_pmr_allocpmr();
1510 	new_pmr->low = 0;
1511 	new_pmr->high = atop((paddr_t)-1) + 1;
1512 
1513 	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1514 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1515 
1516 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1517 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1518 	    	    uvm_md_constraints[i]->ucr_high);
1519 	}
1520 }
1521 
1522 /*
1523  * Find the pmemrange that contains the given page number.
1524  *
1525  * (Manually traverses the binary tree, because that is cheaper on stack
1526  * usage.)
1527  */
1528 struct uvm_pmemrange *
1529 uvm_pmemrange_find(paddr_t pageno)
1530 {
1531 	struct uvm_pmemrange *pmr;
1532 
1533 	pmr = RB_ROOT(&uvm.pmr_control.addr);
1534 	while (pmr != NULL) {
1535 		if (pmr->low > pageno)
1536 			pmr = RB_LEFT(pmr, pmr_addr);
1537 		else if (pmr->high <= pageno)
1538 			pmr = RB_RIGHT(pmr, pmr_addr);
1539 		else
1540 			break;
1541 	}
1542 
1543 	return pmr;
1544 }
1545 
1546 #if defined(DDB) || defined(DEBUG)
1547 /*
1548  * Return true if the given page is in any of the free lists.
1549  * Used by uvm_page_printit.
1550  * This function is safe, even if the page is not on the freeq.
1551  * Note: does not apply locking, only called from ddb.
1552  */
1553 int
1554 uvm_pmr_isfree(struct vm_page *pg)
1555 {
1556 	struct vm_page *r;
1557 	struct uvm_pmemrange *pmr;
1558 
1559 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1560 	if (pmr == NULL)
1561 		return 0;
1562 	r = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1563 	if (r == NULL)
1564 		r = RB_MAX(uvm_pmr_addr, &pmr->addr);
1565 	else
1566 		r = RB_PREV(uvm_pmr_addr, &pmr->addr, r);
1567 	if (r == NULL)
1568 		return 0; /* Empty tree. */
1569 
1570 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1571 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1572 	    atop(VM_PAGE_TO_PHYS(pg));
1573 }
1574 #endif /* DDB || DEBUG */
1575 
1576 /*
1577  * Given a root of a tree, find a range which intersects start, end and
1578  * is of the same memtype.
1579  *
1580  * Page must be in the address tree.
1581  */
1582 struct vm_page*
1583 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1584     paddr_t start, paddr_t end, int memtype)
1585 {
1586 	int	direction;
1587 	struct	vm_page *root;
1588 	struct	vm_page *high, *high_next;
1589 	struct	vm_page *low, *low_next;
1590 
1591 	KDASSERT(pmr != NULL && init_root != NULL);
1592 	root = init_root;
1593 
1594 	/*
1595 	 * Which direction to use for searching.
1596 	 */
1597 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1598 		direction =  1;
1599 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1600 		direction = -1;
1601 	else /* nothing to do */
1602 		return root;
1603 
1604 	/*
1605 	 * First, update root to fall within the chosen range.
1606 	 */
1607 	while (root && !PMR_INTERSECTS_WITH(
1608 	    atop(VM_PAGE_TO_PHYS(root)),
1609 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1610 	    start, end)) {
1611 		if (direction == 1)
1612 			root = RB_RIGHT(root, objt);
1613 		else
1614 			root = RB_LEFT(root, objt);
1615 	}
1616 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1617 		return root;
1618 
1619 	/*
1620 	 * Root is valid, but of the wrong memtype.
1621 	 *
1622 	 * Try to find a range that has the given memtype in the subtree
1623 	 * (memtype mismatches are costly, either because the conversion
1624 	 * is expensive, or a later allocation will need to do the opposite
1625 	 * conversion, which will be expensive).
1626 	 *
1627 	 *
1628 	 * First, simply increase address until we hit something we can use.
1629 	 * Cache the upper page, so we can page-walk later.
1630 	 */
1631 	high = root;
1632 	high_next = RB_RIGHT(high, objt);
1633 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1634 	    atop(VM_PAGE_TO_PHYS(high_next)),
1635 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1636 	    start, end)) {
1637 		high = high_next;
1638 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1639 			return high;
1640 		high_next = RB_RIGHT(high, objt);
1641 	}
1642 
1643 	/*
1644 	 * Second, decrease the address until we hit something we can use.
1645 	 * Cache the lower page, so we can page-walk later.
1646 	 */
1647 	low = root;
1648 	low_next = RB_LEFT(low, objt);
1649 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1650 	    atop(VM_PAGE_TO_PHYS(low_next)),
1651 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1652 	    start, end)) {
1653 		low = low_next;
1654 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1655 			return low;
1656 		low_next = RB_LEFT(low, objt);
1657 	}
1658 
1659 	/*
1660 	 * Ack, no hits. Walk the address tree until we find something usable.
1661 	 */
1662 	for (low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low);
1663 	    low != high;
1664 	    low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low)) {
1665 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1666 	    	    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1667 	    	    start, end));
1668 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1669 			return low;
1670 	}
1671 
1672 	/*
1673 	 * Nothing found.
1674 	 */
1675 	return NULL;
1676 }
1677 
1678 /*
1679  * Allocate any page, the fastest way. Page number constraints only.
1680  */
1681 int
1682 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1683     paddr_t start, paddr_t end)
1684 {
1685 	struct	uvm_pmemrange *pmr;
1686 	struct	vm_page *found, *splitpg;
1687 	psize_t	fcount;
1688 	int	memtype;
1689 
1690 	fcount = 0;
1691 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1692 		/* We're done. */
1693 		if (fcount == count)
1694 			break;
1695 
1696 		/* Outside requested range. */
1697 		if (!(start == 0 && end == 0) &&
1698 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1699 			continue;
1700 
1701 		/* Range is empty. */
1702 		if (pmr->nsegs == 0)
1703 			continue;
1704 
1705 		/* Loop over all memtypes, starting at memtype_init. */
1706 		memtype = memtype_init;
1707 		while (fcount != count) {
1708 			found = TAILQ_FIRST(&pmr->single[memtype]);
1709 			/*
1710 			 * If found is outside the range, walk the list
1711 			 * until we find something that intersects with
1712 			 * boundaries.
1713 			 */
1714 			while (found && !PMR_INTERSECTS_WITH(
1715 			    atop(VM_PAGE_TO_PHYS(found)),
1716 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1717 			    start, end))
1718 				found = TAILQ_NEXT(found, pageq);
1719 
1720 			if (found == NULL) {
1721 				found = RB_ROOT(&pmr->size[memtype]);
1722 				/* Size tree gives pg[1] instead of pg[0] */
1723 				if (found != NULL) {
1724 					found--;
1725 
1726 					found = uvm_pmr_rootupdate(pmr, found,
1727 					    start, end, memtype);
1728 				}
1729 			}
1730 			if (found != NULL) {
1731 				uvm_pmr_assertvalid(pmr);
1732 				uvm_pmr_remove_size(pmr, found);
1733 
1734 				/*
1735 				 * If the page intersects the end, then it'll
1736 				 * need splitting.
1737 				 *
1738 				 * Note that we don't need to split if the page
1739 				 * intersects start: the drain function will
1740 				 * simply stop on hitting start.
1741 				 */
1742 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1743 				    found->fpgsz > end) {
1744 					psize_t splitsz =
1745 					    atop(VM_PAGE_TO_PHYS(found)) +
1746 					    found->fpgsz - end;
1747 
1748 					uvm_pmr_remove_addr(pmr, found);
1749 					uvm_pmr_assertvalid(pmr);
1750 					found->fpgsz -= splitsz;
1751 					splitpg = found + found->fpgsz;
1752 					splitpg->fpgsz = splitsz;
1753 					uvm_pmr_insert(pmr, splitpg, 1);
1754 
1755 					/*
1756 					 * At this point, splitpg and found
1757 					 * actually should be joined.
1758 					 * But we explicitly disable that,
1759 					 * because we will start subtracting
1760 					 * from found.
1761 					 */
1762 					KASSERT(start == 0 ||
1763 					    atop(VM_PAGE_TO_PHYS(found)) +
1764 					    found->fpgsz > start);
1765 					uvm_pmr_insert_addr(pmr, found, 1);
1766 				}
1767 
1768 				/*
1769 				 * Fetch pages from the end.
1770 				 * If the range is larger than the requested
1771 				 * number of pages, this saves us an addr-tree
1772 				 * update.
1773 				 *
1774 				 * Since we take from the end and insert at
1775 				 * the head, existing ranges in result are preserved.
1776 				 */
1777 				while (found->fpgsz > 0 && fcount < count &&
1778 				    (start == 0 ||
1779 				    atop(VM_PAGE_TO_PHYS(found)) +
1780 				    found->fpgsz > start)) {
1781 					found->fpgsz--;
1782 					fcount++;
1783 					TAILQ_INSERT_HEAD(result,
1784 					    &found[found->fpgsz], pageq);
1785 				}
1786 				if (found->fpgsz > 0) {
1787 					uvm_pmr_insert_size(pmr, found);
1788 					KDASSERT(fcount == count);
1789 					uvm_pmr_assertvalid(pmr);
1790 					return fcount;
1791 				}
1792 
1793 				/*
1794 				 * Delayed addr-tree removal.
1795 				 */
1796 				uvm_pmr_remove_addr(pmr, found);
1797 				uvm_pmr_assertvalid(pmr);
1798 			} else {
1799 				/*
1800 				 * Skip to the next memtype.
1801 				 */
1802 				memtype += 1;
1803 				if (memtype == UVM_PMR_MEMTYPE_MAX)
1804 					memtype = 0;
1805 				if (memtype == memtype_init)
1806 					break;
1807 			}
1808 		}
1809 	}
1810 
1811 	/*
1812 	 * Search finished.
1813 	 *
1814 	 * Ran out of ranges before enough pages were gathered, or we hit the
1815 	 * case where found->fpgsz == count - fcount, in which case the
1816 	 * above exit condition didn't trigger.
1817 	 *
1818 	 * On failure, caller will free the pages.
1819 	 */
1820 	return fcount;
1821 }
1822 
1823 #ifdef DDB
1824 /*
1825  * Print information about pmemrange.
1826  * Does not do locking (so either call it from DDB or acquire fpageq lock
1827  * before invoking).
1828  */
1829 void
1830 uvm_pmr_print(void)
1831 {
1832 	struct	uvm_pmemrange *pmr;
1833 	struct	vm_page *pg;
1834 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
1835 	psize_t	free;
1836 	int	useq_len;
1837 	int	mt;
1838 
1839 	printf("Ranges, use queue:\n");
1840 	useq_len = 0;
1841 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1842 		useq_len++;
1843 		free = 0;
1844 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1845 			pg = RB_MAX(uvm_pmr_size, &pmr->size[mt]);
1846 			if (pg != NULL)
1847 				pg--;
1848 			else
1849 				pg = TAILQ_FIRST(&pmr->single[mt]);
1850 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
1851 
1852 			RB_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
1853 				free += pg->fpgsz;
1854 		}
1855 
1856 		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
1857 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
1858 		    pmr->use, (unsigned long)pmr->nsegs);
1859 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
1860 			printf(" maxsegsz[%d]=0x%lx", mt,
1861 			    (unsigned long)size[mt]);
1862 		}
1863 		printf(" free=0x%lx\n", (unsigned long)free);
1864 	}
1865 	printf("#ranges = %d\n", useq_len);
1866 }
1867 #endif
1868 
1869 /*
1870  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
1871  * in a specific physmem area.
1872  *
1873  * Returns ENOMEM if the pagedaemon failed to free any pages.
1874  * If not failok, failure will lead to panic.
1875  *
1876  * Must be called with fpageq locked.
1877  */
1878 int
1879 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
1880 {
1881 	struct uvm_pmalloc pma;
1882 	const char *wmsg = "pmrwait";
1883 
1884 	if (curproc == uvm.pagedaemon_proc) {
1885 		/*
1886 		 * This is not that uncommon when the pagedaemon is trying
1887 		 * to flush out a large mmapped file. VOP_WRITE will circle
1888 		 * back through the buffer cache and try to get more memory.
1889 		 * The pagedaemon starts by calling bufbackoff, but we can
1890 		 * easily use up that reserve in a single scan iteration.
1891 		 */
1892 		uvm_unlock_fpageq();
1893 		if (bufbackoff(NULL, atop(size)) == 0) {
1894 			uvm_lock_fpageq();
1895 			return 0;
1896 		}
1897 		uvm_lock_fpageq();
1898 
1899 		/*
1900 		 * XXX detect pagedaemon deadlock - see comment in
1901 		 * uvm_wait(), as this is exactly the same issue.
1902 		 */
1903 		printf("pagedaemon: wait_pla deadlock detected!\n");
1904 		msleep(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, hz >> 3);
1905 #if defined(DEBUG)
1906 		/* DEBUG: panic so we can debug it */
1907 		panic("wait_pla pagedaemon deadlock");
1908 #endif
1909 		return 0;
1910 	}
1911 
1912 	for (;;) {
1913 		pma.pm_constraint.ucr_low = low;
1914 		pma.pm_constraint.ucr_high = high;
1915 		pma.pm_size = size;
1916 		pma.pm_flags = UVM_PMA_LINKED;
1917 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
1918 
1919 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
1920 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
1921 			msleep(&pma, &uvm.fpageqlock, PVM, wmsg, 0);
1922 
1923 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
1924 		    pma.pm_flags & UVM_PMA_FAIL) {
1925 			if (failok)
1926 				return ENOMEM;
1927 			printf("uvm_wait: failed to free %ld pages between "
1928 			    "0x%lx-0x%lx\n", atop(size), low, high);
1929 		} else
1930 			return 0;
1931 	}
1932 	/* UNREACHABLE */
1933 }
1934 
1935 /*
1936  * Wake up uvm_pmalloc sleepers.
1937  */
1938 void
1939 uvm_wakeup_pla(paddr_t low, psize_t len)
1940 {
1941 	struct uvm_pmalloc *pma, *pma_next;
1942 	paddr_t high;
1943 
1944 	high = low + len;
1945 
1946 	/*
1947 	 * Wake specific allocations waiting for this memory.
1948 	 */
1949 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
1950 	    pma = pma_next) {
1951 		pma_next = TAILQ_NEXT(pma, pmq);
1952 
1953 		if (low < pma->pm_constraint.ucr_high &&
1954 		    high > pma->pm_constraint.ucr_low) {
1955 			pma->pm_flags |= UVM_PMA_FREED;
1956 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
1957 				pma->pm_flags &= ~UVM_PMA_LINKED;
1958 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
1959 				    pmq);
1960 				wakeup(pma);
1961 			}
1962 		}
1963 	}
1964 }
1965