1 /*	$OpenBSD: uvm_pmemrange.c,v 1.61 2021/03/12 14:15:49 jsg Exp $	*/
2 
3 /*
4  * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <uvm/uvm.h>
22 #include <sys/malloc.h>
23 #include <sys/kernel.h>
24 #include <sys/proc.h>
25 #include <sys/mount.h>
26 
27 /*
28  * 2 trees: addr tree and size tree.
29  *
30  * The allocator keeps chunks of free pages (called a range).
31  * Two pages are part of the same range if:
32  * - all pages in between are part of that range,
33  * - they are of the same memory type (zeroed or non-zeroed),
34  * - they are part of the same pmemrange.
35  * A pmemrange is a range of memory which is part of the same vm_physseg
36  * and has a use-count.
37  *
38  * addr tree is vm_page[0].objt
39  * size tree is vm_page[1].objt
40  *
41  * The size tree is not used for memory ranges of 1 page; instead,
42  * the single-page queue is vm_page[0].pageq
43  *
44  * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
45  * are joined, unless:
46  * - they have pages in between them which are not free
47  * - they belong to different memtypes (zeroed vs dirty memory)
48  * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
49  * - they are not a continuation of the same array
50  * The latter issue is caused by vm_physseg ordering and splitting from the
51  * MD initialization machinery. The MD code is dependent on freelists and
52  * happens to split ISA memory from non-ISA memory.
53  * (Note: freelists die die die!)
54  *
55  * uvm_page_init guarantees that every vm_physseg contains an array of
56  * struct vm_page. Also, uvm_page_physload allocates an array of struct
57  * vm_page. This code depends on that array. The array may break across
58  * vm_physseg boundaries.
59  */
60 
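/*
 * Illustrative sketch (hypothetical numbers): a free range of 3 pages
 * starting at page number 100 is represented by the vm_page array entries
 * of those pages:
 *
 *	pg[0]	fpgsz = 3, linked into the addr tree through pg[0].objt
 *	pg[1]	fpgsz = 0, linked into the size tree through pg[1].objt
 *	pg[2]	fpgsz = 0, not linked anywhere
 *
 * A free range of exactly 1 page is instead queued on the single-page
 * queue of its memtype through pg[0].pageq.
 */
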
61 /*
62  * Validate the flags of the page. (Used in asserts.)
63  * Any free page must have the PQ_FREE flag set.
64  * Free pages may be zeroed.
65  * Pmap flags are left untouched.
66  *
67  * The PQ_FREE flag is not required here: by not requiring it, this check
68  * can also be used on pages that are in the process of being freed.
69  */
70 #define VALID_FLAGS(pg_flags)						\
71 	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
72 
73 /* Tree comparators. */
74 int	uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
75 	    const struct uvm_pmemrange *);
76 int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
77 int	uvm_pmr_pg_to_memtype(struct vm_page *);
78 
79 #ifdef DDB
80 void	uvm_pmr_print(void);
81 #endif
82 
83 /*
84  * Memory types. The page flags are used to derive what the current memory
85  * type of a page is.
86  */
87 int
88 uvm_pmr_pg_to_memtype(struct vm_page *pg)
89 {
90 	if (pg->pg_flags & PG_ZERO)
91 		return UVM_PMR_MEMTYPE_ZERO;
92 	/* Default: dirty memory. */
93 	return UVM_PMR_MEMTYPE_DIRTY;
94 }
95 
96 /* Trees. */
97 RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
98 RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
99 RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
100     uvm_pmemrange_addr_cmp);
101 
102 /* Validation. */
103 #ifdef DEBUG
104 void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
105 #else
106 #define uvm_pmr_assertvalid(pmr)	do {} while (0)
107 #endif
108 
109 psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
110 			    paddr_t, paddr_t, int);
111 
112 struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
113 struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
114 struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
115 			    struct vm_page *, int);
116 void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
117 			    struct vm_page *pg, struct vm_page **pg_prev,
118 			    struct vm_page **pg_next);
119 struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
120 			    struct vm_page *, paddr_t);
121 struct vm_page		*uvm_pmr_findprevsegment(struct uvm_pmemrange *,
122 			    struct vm_page *, paddr_t);
123 psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
124 			    struct vm_page **, int);
125 psize_t			 uvm_pmr_remove_1strange_reverse(struct pglist *,
126     			    paddr_t *);
127 void			 uvm_pmr_split(paddr_t);
128 struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
129 struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
130 			    struct uvm_pmemrange *);
131 psize_t			 pow2divide(psize_t, psize_t);
132 struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
133 			    struct vm_page *, paddr_t, paddr_t, int);
134 
135 /*
136  * Computes num/denom and rounds it up to the next power-of-2.
137  *
138  * This is a division function which calculates an approximation of
139  * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
140  * have to be accurate.
141  *
142  * Returning too large a value makes the allocator slightly faster, at the
143  * risk of hitting the failure case more often. Returning too small a value
144  * makes the allocator a bit slower, but less likely to hit a failure case.
145  */
146 psize_t
147 pow2divide(psize_t num, psize_t denom)
148 {
149 	int rshift;
150 
151 	for (rshift = 0; num > denom; rshift++, denom <<= 1)
152 		;
153 	return (paddr_t)1 << rshift;
154 }
155 
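/*
 * Worked example (hypothetical values): pow2divide(100, 3) doubles denom
 * 3 -> 6 -> 12 -> 24 -> 48 -> 96 -> 192 while rshift counts the steps,
 * yielding 1 << 6 = 64, the next power-of-2 above 100/3.
 *
 *	pow2divide(100, 3)	== 64
 *	pow2divide(8, 8)	== 1	(num <= denom, loop never runs)
 *	pow2divide(9, 8)	== 2
 */
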
156 /*
157  * Predicate: lhs is a subrange of rhs.
158  *
159  * If rhs_low == 0: don't care about lower bound.
160  * If rhs_high == 0: don't care about upper bound.
161  */
162 #define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
163 	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
164 	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
165 
166 /*
167  * Predicate: lhs intersects with rhs.
168  *
169  * If rhs_low == 0: don't care about lower bound.
170  * If rhs_high == 0: don't care about upper bound.
171  * Ranges don't intersect if they have no page in common; the exclusive
172  * upper bound (array semantics) means < instead of <= is used here.
173  */
174 #define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
175 	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
176 	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
177 
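/*
 * Example (hypothetical page numbers): since the upper bound is exclusive,
 * the ranges [10, 20) and [20, 30) do not intersect:
 *
 *	PMR_INTERSECTS_WITH(10, 20, 20, 30)	-> false (20 < 20 fails)
 *	PMR_INTERSECTS_WITH(10, 20, 15, 30)	-> true
 *	PMR_INTERSECTS_WITH(10, 20, 0, 0)	-> true  (no constraint)
 */
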
178 /*
179  * Align to power-of-2 alignment.
180  */
181 #define PMR_ALIGN(pgno, align)						\
182 	(((pgno) + ((align) - 1)) & ~((align) - 1))
183 #define PMR_ALIGN_DOWN(pgno, align)					\
184 	((pgno) & ~((align) - 1))
185 
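/*
 * Example (hypothetical page numbers; align must be a power of 2):
 *
 *	PMR_ALIGN(13, 4)	== 16	(round up)
 *	PMR_ALIGN(16, 4)	== 16	(already aligned)
 *	PMR_ALIGN_DOWN(13, 4)	== 12	(round down)
 */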
186 
187 /*
188  * Comparator: sort by address ascending.
189  */
190 int
191 uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
192     const struct uvm_pmemrange *rhs)
193 {
194 	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
195 }
196 
197 /*
198  * Comparator: sort by use ascending.
199  *
200  * The higher the use value of a range, the more devices need memory in
201  * this range. Therefore allocate from the range with the lowest use first.
202  */
203 int
204 uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
205 {
206 	int result;
207 
208 	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
209 	if (result == 0)
210 		result = uvm_pmemrange_addr_cmp(lhs, rhs);
211 	return result;
212 }
213 
214 int
215 uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
216 {
217 	paddr_t lhs_addr, rhs_addr;
218 
219 	lhs_addr = VM_PAGE_TO_PHYS(lhs);
220 	rhs_addr = VM_PAGE_TO_PHYS(rhs);
221 
222 	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
223 }
224 
225 int
226 uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
227 {
228 	psize_t lhs_size, rhs_size;
229 	int cmp;
230 
231 	/* Using second tree, so we receive pg[1] instead of pg[0]. */
232 	lhs_size = (lhs - 1)->fpgsz;
233 	rhs_size = (rhs - 1)->fpgsz;
234 
235 	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
236 	if (cmp == 0)
237 		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
238 	return cmp;
239 }
240 
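/*
 * Note on uvm_pmr_size_cmp(): the size tree links the second page of a
 * range (pg + 1), so the comparator steps back one element to read fpgsz
 * from the first page. Lookups therefore pass pg + 1 as well, as done
 * elsewhere in this file:
 *
 *	RBT_FIND(uvm_pmr_size, &pmr->size[memtype], pg + 1);
 */
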
241 /*
242  * Find the first range of free pages that is at least sz pages long.
243  */
244 struct vm_page *
245 uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
246 {
247 	struct	vm_page *node, *best;
248 
249 	KASSERT(sz >= 1);
250 
251 	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
252 		return TAILQ_FIRST(&pmr->single[mti]);
253 
254 	node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
255 	best = NULL;
256 	while (node != NULL) {
257 		if ((node - 1)->fpgsz >= sz) {
258 			best = (node - 1);
259 			node = RBT_LEFT(uvm_objtree, node);
260 		} else
261 			node = RBT_RIGHT(uvm_objtree, node);
262 	}
263 	return best;
264 }
265 
266 /*
267  * Finds the next range. The next range has a size >= pg->fpgsz.
268  * Returns NULL if no more ranges are available.
269  */
270 struct vm_page *
271 uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
272 {
273 	struct vm_page *npg;
274 
275 	KASSERT(pmr != NULL && pg != NULL);
276 	if (pg->fpgsz == 1) {
277 		if (TAILQ_NEXT(pg, pageq) != NULL)
278 			return TAILQ_NEXT(pg, pageq);
279 		else
280 			npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
281 	} else
282 		npg = RBT_NEXT(uvm_pmr_size, pg + 1);
283 
284 	return npg == NULL ? NULL : npg - 1;
285 }
286 
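/*
 * A minimal iteration sketch (assumes the fpageq is locked and sz,
 * memtype are valid): walk all free ranges of at least sz pages of one
 * memtype, from the smallest suitable range upward, the way
 * uvm_pmr_getpages() does below.
 *
 *	struct vm_page *found, *f_next;
 *
 *	for (found = uvm_pmr_nfindsz(pmr, sz, memtype);
 *	    found != NULL; found = f_next) {
 *		f_next = uvm_pmr_nextsz(pmr, found, memtype);
 *		...
 *	}
 */
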
287 /*
288  * Finds the previous and next ranges relative to the (uninserted) pg range.
289  *
290  * *pg_prev == NULL if no previous range that can join with pg is
291  * 	available.
292  * *pg_next == NULL if no next range that can join with pg is
293  * 	available.
294  */
295 void
296 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
297     struct vm_page **pg_prev, struct vm_page **pg_next)
298 {
299 	KASSERT(pg_prev != NULL && pg_next != NULL);
300 
301 	*pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
302 	if (*pg_next == NULL)
303 		*pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
304 	else
305 		*pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
306 
307 	KDASSERT(*pg_next == NULL ||
308 	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
309 	KDASSERT(*pg_prev == NULL ||
310 	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
311 
312 	/* Reset if not contig. */
313 	if (*pg_prev != NULL &&
314 	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
315 	    != atop(VM_PAGE_TO_PHYS(pg)) ||
316 	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
317 	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
318 		*pg_prev = NULL;
319 	if (*pg_next != NULL &&
320 	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
321 	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
322 	    pg + pg->fpgsz != *pg_next || /* Array broke. */
323 	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
324 		*pg_next = NULL;
325 	return;
326 }
327 
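/*
 * Example (hypothetical page numbers): if a free range [100, 104) of the
 * same memtype already exists and pg describes [104, 106) in the same
 * vm_page array, uvm_pmr_pnaddr() returns that range in *pg_prev so the
 * caller can merge the two into [100, 106). A gap, a memtype change or an
 * array break in between resets the corresponding pointer to NULL.
 */
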
328 /*
329  * Remove a range from the address tree.
330  * Address tree maintains pmr counters.
331  */
332 void
333 uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
334 {
335 	KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
336 	KDASSERT(pg->pg_flags & PQ_FREE);
337 	RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
338 
339 	pmr->nsegs--;
340 }
341 /*
342  * Remove a range from the size tree.
343  */
344 void
345 uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
346 {
347 	int memtype;
348 #ifdef DEBUG
349 	struct vm_page *i;
350 #endif
351 
352 	KDASSERT(pg->fpgsz >= 1);
353 	KDASSERT(pg->pg_flags & PQ_FREE);
354 	memtype = uvm_pmr_pg_to_memtype(pg);
355 
356 	if (pg->fpgsz == 1) {
357 #ifdef DEBUG
358 		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
359 			if (i == pg)
360 				break;
361 		}
362 		KDASSERT(i == pg);
363 #endif
364 		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
365 	} else {
366 		KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
367 		    pg + 1) == pg + 1);
368 		RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
369 	}
370 }
371 /* Remove from both trees. */
372 void
373 uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
374 {
375 	uvm_pmr_assertvalid(pmr);
376 	uvm_pmr_remove_size(pmr, pg);
377 	uvm_pmr_remove_addr(pmr, pg);
378 	uvm_pmr_assertvalid(pmr);
379 }
380 
381 /*
382  * Insert the range described in pg.
383  * Returns the range thus created (which may be joined with the previous and
384  * next ranges).
385  * If no_join, the caller guarantees that the range cannot possibly join
386  * with adjacent ranges.
387  */
388 struct vm_page *
389 uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
390 {
391 	struct vm_page *prev, *next;
392 
393 #ifdef DEBUG
394 	struct vm_page *i;
395 	int mt;
396 #endif
397 
398 	KDASSERT(pg->pg_flags & PQ_FREE);
399 	KDASSERT(pg->fpgsz >= 1);
400 
401 #ifdef DEBUG
402 	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
403 		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
404 			KDASSERT(i != pg);
405 		if (pg->fpgsz > 1) {
406 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
407 			    pg + 1) == NULL);
408 		}
409 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
410 	}
411 #endif
412 
413 	if (!no_join) {
414 		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
415 		if (next != NULL) {
416 			uvm_pmr_remove_size(pmr, next);
417 			uvm_pmr_remove_addr(pmr, next);
418 			pg->fpgsz += next->fpgsz;
419 			next->fpgsz = 0;
420 		}
421 		if (prev != NULL) {
422 			uvm_pmr_remove_size(pmr, prev);
423 			prev->fpgsz += pg->fpgsz;
424 			pg->fpgsz = 0;
425 			return prev;
426 		}
427 	}
428 
429 	RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
430 
431 	pmr->nsegs++;
432 
433 	return pg;
434 }
435 /*
436  * Insert the range described in pg into the size tree, or into the
437  * single-page queue if the range is exactly one page long.
438  * No joining with adjacent ranges is done here.
439  * The page must already be in the address tree.
440  */
441 void
442 uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
443 {
444 	int memtype;
445 #ifdef DEBUG
446 	struct vm_page *i;
447 	int mti;
448 #endif
449 
450 	KDASSERT(pg->fpgsz >= 1);
451 	KDASSERT(pg->pg_flags & PQ_FREE);
452 
453 	memtype = uvm_pmr_pg_to_memtype(pg);
454 #ifdef DEBUG
455 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
456 		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
457 			KDASSERT(i != pg);
458 		if (pg->fpgsz > 1) {
459 			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
460 			    pg + 1) == NULL);
461 		}
462 		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
463 	}
464 	for (i = pg; i < pg + pg->fpgsz; i++)
465 		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
466 #endif
467 
468 	if (pg->fpgsz == 1)
469 		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
470 	else
471 		RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
472 }
473 /* Insert in both trees. */
474 struct vm_page *
475 uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
476 {
477 	uvm_pmr_assertvalid(pmr);
478 	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
479 	uvm_pmr_insert_size(pmr, pg);
480 	uvm_pmr_assertvalid(pmr);
481 	return pg;
482 }
483 
484 /*
485  * Find the last page that is part of this segment.
486  * => pg: the range at which to start the search.
487  * => boundary: the page number boundary specification (0 = no boundary).
488  * => pmr: the pmemrange of the page.
489  *
490  * This function returns the page 1 before the next range, so if you want
491  * the next range itself, run TAILQ_NEXT(result, pageq) on the result.
492  * The reason is that this way, the length of the segment is easily
493  * calculated using: atop(result) - atop(pg) + 1.
494  * Hence this function also never returns NULL.
495  */
496 struct vm_page *
497 uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
498     struct vm_page *pg, paddr_t boundary)
499 {
500 	paddr_t	first_boundary;
501 	struct	vm_page *next;
502 	struct	vm_page *prev;
503 
504 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
505 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
506 	if (boundary != 0) {
507 		first_boundary =
508 		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
509 	} else
510 		first_boundary = 0;
511 
512 	/*
513 	 * Increase next until it hits the first page of the next segment.
514 	 *
515 	 * While loop checks the following:
516 	 * - next != NULL	we have not reached the end of pgl
517 	 * - boundary == 0 || next < first_boundary
518 	 *			we do not cross a boundary
519 	 * - atop(prev) + 1 == atop(next)
520 	 *			still in the same segment
521 	 * - low <= last
522 	 * - high > last	still in the same memory range
523 	 * - memtype is equal	allocator is unable to view different memtypes
524 	 *			as part of the same segment
525 	 * - prev + 1 == next	no array breakage occurs
526 	 */
527 	prev = pg;
528 	next = TAILQ_NEXT(prev, pageq);
529 	while (next != NULL &&
530 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
531 	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
532 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
533 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
534 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
535 	    prev + 1 == next) {
536 		prev = next;
537 		next = TAILQ_NEXT(prev, pageq);
538 	}
539 
540 	/*
541 	 * End of this segment.
542 	 */
543 	return prev;
544 }
545 
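/*
 * Worked example (hypothetical page numbers): with a pglist holding the
 * contiguous pages 100..105 in that order (same memtype, same pmemrange)
 * and boundary = 4, the search started at page 100 stops at page 103,
 * because page 104 would cross the boundary:
 *
 *	end = uvm_pmr_findnextsegment(pmr, pg100, 4);
 *	len = atop(VM_PAGE_TO_PHYS(end)) - atop(VM_PAGE_TO_PHYS(pg100)) + 1;
 *	-> len == 4
 */
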
546 /*
547  * Find the first page that is part of this segment.
548  * => pg: the range at which to start the search.
549  * => boundary: the page number boundary specification (0 = no boundary).
550  * => pmr: the pmemrange of the page.
551  *
552  * This function returns the page 1 after the previous range, so if you want
553  * the previous range itself, run TAILQ_NEXT(result, pageq) on the result.
554  * The reason is that this way, the length of the segment is easily
555  * calculated using: atop(pg) - atop(result) + 1.
556  * Hence this function also never returns NULL.
557  */
558 struct vm_page *
559 uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
560     struct vm_page *pg, paddr_t boundary)
561 {
562 	paddr_t	first_boundary;
563 	struct	vm_page *next;
564 	struct	vm_page *prev;
565 
566 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
567 	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
568 	if (boundary != 0) {
569 		first_boundary =
570 		    PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
571 	} else
572 		first_boundary = 0;
573 
574 	/*
575 	 * Increase next until it hits the first page of the previous segment.
576 	 *
577 	 * While loop checks the following:
578 	 * - next != NULL	we have not reached the end of pgl
579 	 * - boundary == 0 || next >= first_boundary
580 	 *			we do not cross a boundary
581 	 * - atop(prev) - 1 == atop(next)
582 	 *			still in the same segment
583 	 * - low <= last
584 	 * - high > last	still in the same memory range
585 	 * - memtype is equal	allocator is unable to view different memtypes
586 	 *			as part of the same segment
587 	 * - prev - 1 == next	no array breakage occurs
588 	 */
589 	prev = pg;
590 	next = TAILQ_NEXT(prev, pageq);
591 	while (next != NULL &&
592 	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
593 	    atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
594 	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
595 	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
596 	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
597 	    prev - 1 == next) {
598 		prev = next;
599 		next = TAILQ_NEXT(prev, pageq);
600 	}
601 
602 	/*
603 	 * Start of this segment.
604 	 */
605 	return prev;
606 }
607 
608 /*
609  * Remove the first segment of contiguous pages from pgl.
610  * A segment ends if it crosses boundary (unless boundary = 0) or
611  * if it would enter a different uvm_pmemrange.
612  *
613  * Work: the page range that the caller is currently working with.
614  * May be null.
615  *
616  * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
617  * the first segment is erased (which, if called by uvm_pmr_getpages(),
618  * probably is the smallest or very close to it).
619  */
620 psize_t
621 uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
622     struct vm_page **work, int is_desperate)
623 {
624 	struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
625 	psize_t count;
626 	struct uvm_pmemrange *pmr, *pmr_iter;
627 
628 	KASSERT(!TAILQ_EMPTY(pgl));
629 
630 	/*
631 	 * Initialize to first page.
632 	 * Unless desperate scan finds a better candidate, this is what'll be
633 	 * erased.
634 	 */
635 	start = TAILQ_FIRST(pgl);
636 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
637 	end = uvm_pmr_findnextsegment(pmr, start, boundary);
638 
639 	/*
640 	 * If we are desperate, we _really_ want to get rid of the smallest
641 	 * element (rather than a close match to the smallest element).
642 	 */
643 	if (is_desperate) {
644 		/* Linear search for smallest segment. */
645 		pmr_iter = pmr;
646 		for (iter = TAILQ_NEXT(end, pageq);
647 		    iter != NULL && start != end;
648 		    iter = TAILQ_NEXT(iter_end, pageq)) {
649 			/*
650 			 * Only update pmr if it doesn't match current
651 			 * iteration.
652 			 */
653 			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
654 			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
655 				pmr_iter = uvm_pmemrange_find(atop(
656 				    VM_PAGE_TO_PHYS(iter)));
657 			}
658 
659 			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
660 			    boundary);
661 
662 			/*
663 			 * Current iteration is smaller than best match so
664 			 * far; update.
665 			 */
666 			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
667 			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
668 				start = iter;
669 				end = iter_end;
670 				pmr = pmr_iter;
671 			}
672 		}
673 	}
674 
675 	/*
676 	 * Calculate count and end of the list.
677 	 */
678 	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
679 	lowest = start;
680 	end = TAILQ_NEXT(end, pageq);
681 
682 	/*
683 	 * Actually remove the range of pages.
684 	 *
685 	 * Sadly, this cannot be done using pointer iteration:
686 	 * vm_physseg is not guaranteed to be sorted on address, hence
687 	 * uvm_page_init() may not have initialized its array sorted by
688 	 * page number.
689 	 */
690 	for (iter = start; iter != end; iter = iter_end) {
691 		iter_end = TAILQ_NEXT(iter, pageq);
692 		TAILQ_REMOVE(pgl, iter, pageq);
693 	}
694 
695 	lowest->fpgsz = count;
696 	inserted = uvm_pmr_insert(pmr, lowest, 0);
697 
698 	/*
699 	 * If the caller was working on a range and this function modified
700 	 * that range, update the pointer.
701 	 */
702 	if (work != NULL && *work != NULL &&
703 	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
704 	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
705 	    atop(VM_PAGE_TO_PHYS(*work)))
706 		*work = inserted;
707 	return count;
708 }
709 
710 /*
711  * Remove the first segment of contiguous pages from a pgl
712  * with the list elements in reverse order of physaddr.
713  *
714  * A segment ends if it would enter a different uvm_pmemrange.
715  *
716  * Stores starting physical address of the segment in pstart.
717  */
718 psize_t
719 uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
720 {
721 	struct vm_page *start, *end, *iter, *iter_end, *lowest;
722 	psize_t count;
723 	struct uvm_pmemrange *pmr;
724 
725 	KASSERT(!TAILQ_EMPTY(pgl));
726 
727 	start = TAILQ_FIRST(pgl);
728 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
729 	end = uvm_pmr_findprevsegment(pmr, start, 0);
730 
731 	KASSERT(end <= start);
732 
733 	/*
734 	 * Calculate count and end of the list.
735 	 */
736 	count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
737 	lowest = end;
738 	end = TAILQ_NEXT(end, pageq);
739 
740 	/*
741 	 * Actually remove the range of pages.
742 	 *
743 	 * Sadly, this cannot be done using pointer iteration:
744 	 * vm_physseg is not guaranteed to be sorted on address, hence
745 	 * uvm_page_init() may not have initialized its array sorted by
746 	 * page number.
747 	 */
748 	for (iter = start; iter != end; iter = iter_end) {
749 		iter_end = TAILQ_NEXT(iter, pageq);
750 		TAILQ_REMOVE(pgl, iter, pageq);
751 	}
752 
753 	lowest->fpgsz = count;
754 	(void) uvm_pmr_insert(pmr, lowest, 0);
755 
756 	*pstart = VM_PAGE_TO_PHYS(lowest);
757 	return count;
758 }
759 
760 /*
761  * Extract a number of pages from a segment of free pages.
762  * Called by uvm_pmr_getpages.
763  *
764  * Returns the segment that was created from pages left over at the tail
765  * of the removed set of pages, or NULL if no pages were left at the tail.
766  */
767 struct vm_page *
768 uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
769     paddr_t start, paddr_t end, struct pglist *result)
770 {
771 	struct vm_page *after, *pg_i;
772 	psize_t before_sz, after_sz;
773 #ifdef DEBUG
774 	psize_t i;
775 #endif
776 
777 	KDASSERT(end > start);
778 	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
779 	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
780 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
781 	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
782 
783 	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
784 	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
785 	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
786 	uvm_pmr_assertvalid(pmr);
787 
788 	uvm_pmr_remove_size(pmr, pg);
789 	if (before_sz == 0)
790 		uvm_pmr_remove_addr(pmr, pg);
791 	after = pg + before_sz + (end - start);
792 
793 	/* Add selected pages to result. */
794 	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
795 		KASSERT(pg_i->pg_flags & PQ_FREE);
796 		pg_i->fpgsz = 0;
797 		TAILQ_INSERT_TAIL(result, pg_i, pageq);
798 	}
799 
800 	/* Before handling. */
801 	if (before_sz > 0) {
802 		pg->fpgsz = before_sz;
803 		uvm_pmr_insert_size(pmr, pg);
804 	}
805 
806 	/* After handling. */
807 	if (after_sz > 0) {
808 #ifdef DEBUG
809 		for (i = 0; i < after_sz; i++) {
810 			KASSERT(!uvm_pmr_isfree(after + i));
811 		}
812 #endif
813 		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
814 		after->fpgsz = after_sz;
815 		after = uvm_pmr_insert_addr(pmr, after, 1);
816 		uvm_pmr_insert_size(pmr, after);
817 	}
818 
819 	uvm_pmr_assertvalid(pmr);
820 	return (after_sz > 0 ? after : NULL);
821 }
822 
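/*
 * Worked example for uvm_pmr_extract_range() (hypothetical numbers): with
 * a free range pg covering pages [100, 110) and a request for [103, 107),
 * before_sz = 3 and after_sz = 3. Pages 103..106 are appended to result,
 * pg is re-inserted as the range [100, 103), and a new range covering
 * [107, 110) is created and returned.
 */
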
823 /*
824  * Indicate to the page daemon that a nowait call failed and it should
825  * recover at least some memory in the most restricted region (assumed
826  * to be dma_constraint).
827  */
828 extern volatile int uvm_nowait_failed;
829 
830 /*
831  * Acquire a number of pages.
832  *
833  * count:	the number of pages returned
834  * start:	lowest page number
835  * end:		highest page number +1
836  * 		(start = end = 0: no limitation)
837  * align:	power-of-2 alignment constraint (align = 1: no alignment)
838  * boundary:	power-of-2 boundary (boundary = 0: no boundary)
839  * maxseg:	maximum number of segments to return
840  * flags:	UVM_PLA_* flags
841  * result:	returned pages storage (uses pageq)
842  */
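/*
 * A minimal usage sketch (the constraint values are hypothetical and pgl
 * is an empty, caller-provided list): allocate 16 pages anywhere at or
 * above 16 MB, in up to 16 segments, sleeping until memory is available.
 *
 *	struct pglist pgl;
 *
 *	TAILQ_INIT(&pgl);
 *	if (uvm_pmr_getpages(16, atop(16 * 1024 * 1024), 0, 1, 0, 16,
 *	    UVM_PLA_WAITOK, &pgl) == 0) {
 *		... use the pages on pgl ...
 *	}
 */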
843 int
844 uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
845     paddr_t boundary, int maxseg, int flags, struct pglist *result)
846 {
847 	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
848 	struct	vm_page *found, *f_next; /* Iterate chunks. */
849 	psize_t	fcount;			/* Current found pages. */
850 	int	fnsegs;			/* Current segment counter. */
851 	int	try, start_try;
852 	psize_t	search[3];
853 	paddr_t	fstart, fend;		/* Pages to be taken from found. */
854 	int	memtype;		/* Requested memtype. */
855 	int	memtype_init;		/* Best memtype. */
856 	int	desperate;		/* True if allocation failed. */
857 #ifdef DIAGNOSTIC
858 	struct	vm_page *diag_prev;	/* Used during validation. */
859 #endif /* DIAGNOSTIC */
860 
861 	/*
862 	 * Validate arguments.
863 	 */
864 	KASSERT(count > 0);
865 	KASSERT(start == 0 || end == 0 || start < end);
866 	KASSERT(align >= 1);
867 	KASSERT(powerof2(align));
868 	KASSERT(maxseg > 0);
869 	KASSERT(boundary == 0 || powerof2(boundary));
870 	KASSERT(boundary == 0 || maxseg * boundary >= count);
871 	KASSERT(TAILQ_EMPTY(result));
872 	KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
873 
874 	/*
875 	 * TRYCONTIG is a noop if you only want a single segment.
876 	 * Remove it if that's the case: otherwise it'll deny the fast
877 	 * allocation.
878 	 */
879 	if (maxseg == 1 || count == 1)
880 		flags &= ~UVM_PLA_TRYCONTIG;
881 
882 	/*
883 	 * Configure search.
884 	 *
885 	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
886 	 * search[1] is multiple segments, chosen to fulfill the search in
887 	 *   approximately even-sized segments.
888 	 *   This is a good trade-off between slightly reduced allocation speed
889 	 *   and less fragmentation.
890 	 * search[2] is the worst case, in which all segments are evaluated.
891 	 *   This provides the least fragmentation, but makes the search
892  *   possibly longer (although when it is selected, speed no longer
893  *   matters much).
894 	 *
895 	 * The exception is when maxseg == 1: since we can only fulfill that
896  * with one segment of count pages, only a single search type has to
897 	 * be attempted.
898 	 */
899 	if (maxseg == 1 || count == 1) {
900 		start_try = 2;
901 		search[2] = count;
902 	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
903 		start_try = 2;
904 		search[2] = 1;
905 	} else {
906 		start_try = 0;
907 		search[0] = count;
908 		search[1] = pow2divide(count, maxseg);
909 		search[2] = 1;
910 		if ((flags & UVM_PLA_TRYCONTIG) == 0)
911 			start_try = 1;
912 		if (search[1] >= search[0]) {
913 			search[1] = search[0];
914 			start_try = 1;
915 		}
916 		if (search[2] >= search[start_try]) {
917 			start_try = 2;
918 		}
919 	}
920 
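	/*
	 * Example search configuration (hypothetical request): for
	 * count = 100, maxseg = 3 and UVM_PLA_TRYCONTIG set, the above
	 * yields
	 *
	 *	search[0] = 100			(one contiguous segment)
	 *	search[1] = pow2divide(100, 3)	(= 64, roughly even segments)
	 *	search[2] = 1			(any free range at all)
	 *
	 * with start_try = 0; without UVM_PLA_TRYCONTIG, start_try is 1.
	 */
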
921 	/*
922 	 * Memory type: if zeroed memory is requested, traverse the zero set.
923 	 * Otherwise, traverse the dirty set.
924 	 *
925 	 * The memtype iterator is reinitialized to memtype_init on entrance
926 	 * of a pmemrange.
927 	 */
928 	if (flags & UVM_PLA_ZERO)
929 		memtype_init = UVM_PMR_MEMTYPE_ZERO;
930 	else
931 		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
932 
933 	/*
934 	 * Initially, we're not desperate.
935 	 *
936 	 * Note that if we return from a sleep, we are still desperate.
937 	 * Chances are that memory pressure is still high, so resetting
938 	 * seems over-optimistic to me.
939 	 */
940 	desperate = 0;
941 
942 again:
943 	uvm_lock_fpageq();
944 
945 	/*
946 	 * Check to see if we need to generate some free pages by waking
947 	 * the pagedaemon.
948 	 */
949 	if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
950 	    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
951 	    (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
952 		wakeup(&uvm.pagedaemon);
953 
954 	/*
955 	 * fail if any of these conditions is true:
956 	 * [1]  there really are no free pages, or
957 	 * [2]  only kernel "reserved" pages remain and
958 	 *        the UVM_PLA_USERESERVE flag wasn't used.
959 	 * [3]  only pagedaemon "reserved" pages remain and
960 	 *        the requestor is neither the pagedaemon nor the syncer.
961 	 */
962 	if ((uvmexp.free <= uvmexp.reserve_kernel) &&
963 	    !(flags & UVM_PLA_USERESERVE)) {
964 		uvm_unlock_fpageq();
965 		return ENOMEM;
966 	}
967 
968 	if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
969 	    (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
970 		uvm_unlock_fpageq();
971 		if (flags & UVM_PLA_WAITOK) {
972 			uvm_wait("uvm_pmr_getpages");
973 			goto again;
974 		}
975 		return ENOMEM;
976 	}
977 
978 retry:		/* Return point after sleeping. */
979 	fcount = 0;
980 	fnsegs = 0;
981 
982 retry_desperate:
983 	/*
984 	 * If we just want any page(s), go for the really fast option.
985 	 */
986 	if (count <= maxseg && align == 1 && boundary == 0 &&
987 	    (flags & UVM_PLA_TRYCONTIG) == 0) {
988 		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
989 		    result, start, end, 0);
990 
991 		/*
992 		 * If we found sufficient pages, go to the success exit code.
993 		 *
994 		 * Otherwise, go immediately to fail, since we collected
995 		 * all we could anyway.
996 		 */
997 		if (fcount == count)
998 			goto out;
999 		else
1000 			goto fail;
1001 	}
1002 
1003 	/*
1004 	 * The heart of the contig case.
1005 	 *
1006 	 * The code actually looks like this:
1007 	 *
1008 	 * foreach (struct pmemrange) {
1009 	 *	foreach (memtype) {
1010 	 *		foreach(try) {
1011 	 *			foreach (free range of memtype in pmemrange,
1012 	 *			    starting at search[try]) {
1013 	 *				while (range has space left)
1014 	 *					take from range
1015 	 *			}
1016 	 *		}
1017 	 *	}
1018 	 *
1019 	 *	if next pmemrange has higher usecount than current:
1020 	 *		enter desperate case (which will drain the pmemranges
1021 	 *		until empty prior to moving to the next one)
1022 	 * }
1023 	 *
1024 	 * When desperate is activated, try always starts at the highest
1025 	 * value. The memtype loop uses a goto rescan_memtype.
1026 	 * The try loop uses a goto rescan.
1027 	 * The 'range has space left' loop uses the label drain_found.
1028 	 *
1029 	 * Writing them all as loops would take up a lot of screen space in
1030 	 * the form of indentation and some parts are easier to express
1031 	 * using the labels.
1032 	 */
1033 
1034 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1035 		/* Empty range. */
1036 		if (pmr->nsegs == 0)
1037 			continue;
1038 
1039 		/* Outside requested range. */
1040 		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1041 			continue;
1042 
1043 		memtype = memtype_init;
1044 
1045 rescan_memtype:	/* Return point at memtype++. */
1046 		try = start_try;
1047 
1048 rescan:		/* Return point at try++. */
1049 		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
1050 		    found != NULL;
1051 		    found = f_next) {
1052 			f_next = uvm_pmr_nextsz(pmr, found, memtype);
1053 
1054 			fstart = atop(VM_PAGE_TO_PHYS(found));
1055 			if (start != 0)
1056 				fstart = MAX(start, fstart);
1057 drain_found:
1058 			/*
1059 			 * Throw away the first segment if fnsegs == maxseg
1060 			 *
1061 			 * Note that f_next is still valid after this call,
1062 			 * since we only allocated from entries before f_next.
1063 			 * We don't revisit the entries we already extracted
1064 			 * from unless we entered the desperate case.
1065 			 */
1066 			if (fnsegs == maxseg) {
1067 				fnsegs--;
1068 				fcount -=
1069 				    uvm_pmr_remove_1strange(result, boundary,
1070 				    &found, desperate);
1071 			}
1072 
1073 			fstart = PMR_ALIGN(fstart, align);
1074 			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
1075 			if (end != 0)
1076 				fend = MIN(end, fend);
1077 			if (boundary != 0) {
1078 				fend =
1079 				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
1080 			}
1081 			if (fstart >= fend)
1082 				continue;
1083 			if (fend - fstart > count - fcount)
1084 				fend = fstart + (count - fcount);
1085 
1086 			fcount += fend - fstart;
1087 			fnsegs++;
1088 			found = uvm_pmr_extract_range(pmr, found,
1089 			    fstart, fend, result);
1090 
1091 			if (fcount == count)
1092 				goto out;
1093 
1094 			/*
1095 			 * If there's still space left in found, try to
1096 			 * fully drain it prior to continuing.
1097 			 */
1098 			if (found != NULL) {
1099 				fstart = fend;
1100 				goto drain_found;
1101 			}
1102 		}
1103 
1104 		/* Try a smaller search now. */
1105 		if (++try < nitems(search))
1106 			goto rescan;
1107 
1108 		/*
1109 		 * Exhaust all memory types prior to going to the next memory
1110 		 * segment.
1111 		 * This means that both zeroed and dirty memory are consumed
1112 		 * prior to moving to a pmemrange with a higher use-count.
1113 		 *
1114 		 * Code is basically a difficult way of writing:
1115 		 * memtype = memtype_init;
1116 		 * do {
1117 		 *	...;
1118 		 *	memtype += 1;
1119 		 *	memtype %= MEMTYPE_MAX;
1120 		 * } while (memtype != memtype_init);
1121 		 */
1122 		memtype += 1;
1123 		if (memtype == UVM_PMR_MEMTYPE_MAX)
1124 			memtype = 0;
1125 		if (memtype != memtype_init)
1126 			goto rescan_memtype;
1127 
1128 		/*
1129 		 * If not desperate, enter desperate case prior to eating all
1130 		 * the good stuff in the next range.
1131 		 */
1132 		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
1133 		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
1134 			break;
1135 	}
1136 
1137 	/*
1138 	 * Not enough memory of the requested type available. Fall back to
1139 	 * less good memory that we'll clean up better later.
1140 	 *
1141 	 * This algorithm is not very smart though: it just starts scanning
1142 	 * a differently typed range, so the nicer ranges of the previous
1143 	 * iteration may fall out. Hence there is a small chance of a false
1144 	 * negative.
1145 	 *
1146 	 * When desperate: scan all sizes starting at the smallest
1147 	 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
1148 	 * allow us to hit the fast path now).
1149 	 *
1150 	 * Also, because we will revisit entries we scanned before, we need
1151 	 * to reset the page queue, or we may end up releasing entries in
1152 	 * such a way as to invalidate f_next.
1153 	 */
1154 	if (!desperate) {
1155 		desperate = 1;
1156 		start_try = nitems(search) - 1;
1157 		flags &= ~UVM_PLA_TRYCONTIG;
1158 
1159 		while (!TAILQ_EMPTY(result))
1160 			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1161 		fnsegs = 0;
1162 		fcount = 0;
1163 		goto retry_desperate;
1164 	}
1165 
1166 fail:
1167 	/* Allocation failed. */
1168 	/* XXX: claim from memory reserve here */
1169 
1170 	while (!TAILQ_EMPTY(result))
1171 		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1172 
1173 	if (flags & UVM_PLA_WAITOK) {
1174 		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1175 		    flags & UVM_PLA_FAILOK) == 0)
1176 			goto retry;
1177 		KASSERT(flags & UVM_PLA_FAILOK);
1178 	} else {
1179 		if (!(flags & UVM_PLA_NOWAKE)) {
1180 			uvm_nowait_failed = 1;
1181 			wakeup(&uvm.pagedaemon);
1182 		}
1183 	}
1184 	uvm_unlock_fpageq();
1185 
1186 	return ENOMEM;
1187 
1188 out:
1189 	/* Allocation successful. */
1190 	uvmexp.free -= fcount;
1191 
1192 	uvm_unlock_fpageq();
1193 
1194 	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1195 #ifdef DIAGNOSTIC
1196 	fnsegs = 0;
1197 	fcount = 0;
1198 	diag_prev = NULL;
1199 #endif /* DIAGNOSTIC */
1200 	TAILQ_FOREACH(found, result, pageq) {
1201 		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1202 
1203 		if (found->pg_flags & PG_ZERO) {
1204 			uvm_lock_fpageq();
1205 			uvmexp.zeropages--;
1206 			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1207 				wakeup(&uvmexp.zeropages);
1208 			uvm_unlock_fpageq();
1209 		}
1210 		if (flags & UVM_PLA_ZERO) {
1211 			if (found->pg_flags & PG_ZERO)
1212 				uvmexp.pga_zerohit++;
1213 			else {
1214 				uvmexp.pga_zeromiss++;
1215 				uvm_pagezero(found);
1216 			}
1217 		}
1218 		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1219 
1220 		found->uobject = NULL;
1221 		found->uanon = NULL;
1222 		found->pg_version++;
1223 
1224 		/*
1225 		 * Validate that the page matches the range criteria.
1226 		 */
1227 		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1228 		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1229 
1230 #ifdef DIAGNOSTIC
1231 		/*
1232 		 * Update fcount (# found pages) and
1233 		 * fnsegs (# found segments) counters.
1234 		 */
1235 		if (diag_prev == NULL ||
1236 		    /* new segment if it contains a hole */
1237 		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1238 		    atop(VM_PAGE_TO_PHYS(found)) ||
1239 		    /* new segment if it crosses boundary */
1240 		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1241 		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1242 			fnsegs++;
1243 		fcount++;
1244 
1245 		diag_prev = found;
1246 #endif /* DIAGNOSTIC */
1247 	}
1248 
1249 #ifdef DIAGNOSTIC
1250 	/*
1251 	 * Panic on algorithm failure.
1252 	 */
1253 	if (fcount != count || fnsegs > maxseg) {
1254 		panic("pmemrange allocation error: "
1255 		    "allocated %ld pages in %d segments, "
1256 		    "but request was %ld pages in %d segments",
1257 		    fcount, fnsegs, count, maxseg);
1258 	}
1259 #endif /* DIAGNOSTIC */
1260 
1261 	return 0;
1262 }
1263 
1264 /*
1265  * Free a number of contig pages (invoked by uvm_page_init).
1266  */
1267 void
1268 uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1269 {
1270 	struct uvm_pmemrange *pmr;
1271 	psize_t i, pmr_count;
1272 	struct vm_page *firstpg = pg;
1273 
1274 	for (i = 0; i < count; i++) {
1275 		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1276 		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1277 
1278 		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1279 		    VALID_FLAGS(pg[i].pg_flags))) {
1280 			printf("Flags: 0x%x, will panic now.\n",
1281 			    pg[i].pg_flags);
1282 		}
1283 		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1284 		    VALID_FLAGS(pg[i].pg_flags));
1285 		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1286 		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1287 	}
1288 
1289 	uvm_lock_fpageq();
1290 
1291 	for (i = count; i > 0; i -= pmr_count) {
1292 		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1293 		KASSERT(pmr != NULL);
1294 
1295 		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1296 		pg->fpgsz = pmr_count;
1297 		uvm_pmr_insert(pmr, pg, 0);
1298 
1299 		uvmexp.free += pmr_count;
1300 		pg += pmr_count;
1301 	}
1302 	wakeup(&uvmexp.free);
1303 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1304 		wakeup(&uvmexp.zeropages);
1305 
1306 	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1307 
1308 	uvm_unlock_fpageq();
1309 }
1310 
1311 /*
1312  * Free all pages in the queue.
1313  */
1314 void
1315 uvm_pmr_freepageq(struct pglist *pgl)
1316 {
1317 	struct vm_page *pg;
1318 	paddr_t pstart;
1319 	psize_t plen;
1320 
1321 	TAILQ_FOREACH(pg, pgl, pageq) {
1322 		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1323 		    VALID_FLAGS(pg->pg_flags))) {
1324 			printf("Flags: 0x%x, will panic now.\n",
1325 			    pg->pg_flags);
1326 		}
1327 		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1328 		    VALID_FLAGS(pg->pg_flags));
1329 		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1330 		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1331 	}
1332 
1333 	uvm_lock_fpageq();
1334 	while (!TAILQ_EMPTY(pgl)) {
1335 		pg = TAILQ_FIRST(pgl);
1336 		if (pg == TAILQ_NEXT(pg, pageq) + 1) {
1337 			/*
1338 			 * If pg sits one array entry after the next
1339 			 * page in the list, the list is in reverse order;
1340 			 * walk backwards instead of forwards.
1341 			 */
1342 			plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
1343 		} else {
1344 			pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1345 			plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1346 		}
1347 		uvmexp.free += plen;
1348 
1349 		uvm_wakeup_pla(pstart, ptoa(plen));
1350 	}
1351 	wakeup(&uvmexp.free);
1352 	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1353 		wakeup(&uvmexp.zeropages);
1354 	uvm_unlock_fpageq();
1355 
1356 	return;
1357 }
1358 
1359 /*
1360  * Store a pmemrange in the list.
1361  *
1362  * The list is sorted by use.
1363  */
1364 struct uvm_pmemrange *
1365 uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1366     struct uvm_pmemrange *pmr)
1367 {
1368 	struct uvm_pmemrange *iter;
1369 	int cmp = 1;
1370 
1371 	TAILQ_FOREACH(iter, useq, pmr_use) {
1372 		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1373 		if (cmp == 0)
1374 			return iter;
1375 		if (cmp == -1)
1376 			break;
1377 	}
1378 
1379 	if (iter == NULL)
1380 		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1381 	else
1382 		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1383 	return NULL;
1384 }
1385 
1386 #ifdef DEBUG
1387 /*
1388  * Validation of the whole pmemrange.
1389  * Called with fpageq locked.
1390  */
1391 void
1392 uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1393 {
1394 	struct vm_page *prev, *next, *i, *xref;
1395 	int lcv, mti;
1396 
1397 	/* Empty range */
1398 	if (pmr->nsegs == 0)
1399 		return;
1400 
1401 	/* Validate address tree. */
1402 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1403 		/* Validate the range. */
1404 		KASSERT(i->fpgsz > 0);
1405 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1406 		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1407 		    <= pmr->high);
1408 
1409 		/* Validate each page in this range. */
1410 		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1411 			/*
1412 			 * Only the first page has a size specification.
1413 			 * Rest is size 0.
1414 			 */
1415 			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1416 			/*
1417 			 * Flag check.
1418 			 */
1419 			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1420 			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1421 			/*
1422 			 * Free pages are:
1423 			 * - not wired
1424 			 * - have no vm_anon
1425 			 * - have no uvm_object
1426 			 */
1427 			KASSERT(i[lcv].wire_count == 0);
1428 			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1429 			    i[lcv].uanon == NULL);
1430 			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1431 			    i[lcv].uobject == NULL);
1432 			/*
1433 			 * Pages in a single range always have the same
1434 			 * memtype.
1435 			 */
1436 			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1437 			    uvm_pmr_pg_to_memtype(&i[lcv]));
1438 		}
1439 
1440 		/* Check that it shouldn't be joined with its predecessor. */
1441 		prev = RBT_PREV(uvm_pmr_addr, i);
1442 		if (prev != NULL) {
1443 			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1444 			    uvm_pmr_pg_to_memtype(prev) ||
1445 			    atop(VM_PAGE_TO_PHYS(i)) >
1446 			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1447 			    prev + prev->fpgsz != i);
1448 		}
1449 
1450 		/* Assert i is in the size tree as well. */
1451 		if (i->fpgsz == 1) {
1452 			TAILQ_FOREACH(xref,
1453 			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1454 				if (xref == i)
1455 					break;
1456 			}
1457 			KASSERT(xref == i);
1458 		} else {
1459 			KASSERT(RBT_FIND(uvm_pmr_size,
1460 			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1461 			    i + 1);
1462 		}
1463 	}
1464 
1465 	/* Validate size tree. */
1466 	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1467 		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1468 			next = uvm_pmr_nextsz(pmr, i, mti);
1469 			if (next != NULL) {
1470 				KASSERT(i->fpgsz <=
1471 				    next->fpgsz);
1472 			}
1473 
1474 			/* Assert i is in the addr tree as well. */
1475 			KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1476 
1477 			/* Assert i is of the correct memory type. */
1478 			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1479 		}
1480 	}
1481 
1482 	/* Validate nsegs statistic. */
1483 	lcv = 0;
1484 	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1485 		lcv++;
1486 	KASSERT(pmr->nsegs == lcv);
1487 }
1488 #endif /* DEBUG */
1489 
1490 /*
1491  * Split pmr at split point pageno.
1492  * Called with fpageq unlocked.
1493  *
1494  * Split is only applied if a pmemrange spans pageno.
1495  */
1496 void
1497 uvm_pmr_split(paddr_t pageno)
1498 {
1499 	struct uvm_pmemrange *pmr, *drain;
1500 	struct vm_page *rebuild, *prev, *next;
1501 	psize_t prev_sz;
1502 
1503 	uvm_lock_fpageq();
1504 	pmr = uvm_pmemrange_find(pageno);
1505 	if (pmr == NULL || !(pmr->low < pageno)) {
1506 		/* No split required. */
1507 		uvm_unlock_fpageq();
1508 		return;
1509 	}
1510 
1511 	KASSERT(pmr->low < pageno);
1512 	KASSERT(pmr->high > pageno);
1513 
1514 	/*
1515 	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1516 	 * uvm_kmemalloc which calls into pmemrange, making the locking
1517 	 * a bit hard, so we just race!
1518 	 */
1519 	uvm_unlock_fpageq();
1520 	drain = uvm_pmr_allocpmr();
1521 	uvm_lock_fpageq();
1522 	pmr = uvm_pmemrange_find(pageno);
1523 	if (pmr == NULL || !(pmr->low < pageno)) {
1524 		/*
1525 		 * We lost the race since someone else ran this or a related
1526 		 * function, however this should be triggered very rarely so
1527 		 * we just leak the pmr.
1528 		 */
1529 		printf("uvm_pmr_split: lost one pmr\n");
1530 		uvm_unlock_fpageq();
1531 		return;
1532 	}
1533 
1534 	drain->low = pageno;
1535 	drain->high = pmr->high;
1536 	drain->use = pmr->use;
1537 
1538 	uvm_pmr_assertvalid(pmr);
1539 	uvm_pmr_assertvalid(drain);
1540 	KASSERT(drain->nsegs == 0);
1541 
1542 	RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1543 		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1544 			break;
1545 	}
1546 	if (rebuild == NULL)
1547 		prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1548 	else
1549 		prev = RBT_PREV(uvm_pmr_addr, rebuild);
1550 	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1551 
1552 	/*
1553 	 * Handle free chunk that spans the split point.
1554 	 */
1555 	if (prev != NULL &&
1556 	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1557 		psize_t before, after;
1558 
1559 		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1560 
1561 		uvm_pmr_remove(pmr, prev);
1562 		prev_sz = prev->fpgsz;
1563 		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1564 		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1565 
1566 		KASSERT(before > 0);
1567 		KASSERT(after > 0);
1568 
1569 		prev->fpgsz = before;
1570 		uvm_pmr_insert(pmr, prev, 1);
1571 		(prev + before)->fpgsz = after;
1572 		uvm_pmr_insert(drain, prev + before, 1);
1573 	}
1574 
1575 	/* Move free chunks that no longer fall in the range. */
1576 	for (; rebuild != NULL; rebuild = next) {
1577 		next = RBT_NEXT(uvm_pmr_addr, rebuild);
1578 
1579 		uvm_pmr_remove(pmr, rebuild);
1580 		uvm_pmr_insert(drain, rebuild, 1);
1581 	}
1582 
1583 	pmr->high = pageno;
1584 	uvm_pmr_assertvalid(pmr);
1585 	uvm_pmr_assertvalid(drain);
1586 
1587 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1588 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1589 	uvm_unlock_fpageq();
1590 }
1591 
1592 /*
1593  * Increase the usage counter for the given range of memory.
1594  *
1595  * The higher the use count of a given range of memory, the harder the
1596  * allocator will try not to allocate from it.
1597  *
1598  * Addresses here are in paddr_t, not page numbers.
1599  * The lowest and highest allowed addresses are specified.
1600  */
1601 void
1602 uvm_pmr_use_inc(paddr_t low, paddr_t high)
1603 {
1604 	struct uvm_pmemrange *pmr;
1605 	paddr_t sz;
1606 
1607 	/* pmr uses page numbers, translate low and high. */
1608 	high++;
1609 	high = atop(trunc_page(high));
1610 	low = atop(round_page(low));
1611 	uvm_pmr_split(low);
1612 	uvm_pmr_split(high);
1613 
1614 	sz = 0;
1615 	uvm_lock_fpageq();
1616 	/* Increase use count on segments in range. */
1617 	RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1618 		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1619 			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1620 			pmr->use++;
1621 			sz += pmr->high - pmr->low;
1622 			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1623 		}
1624 		uvm_pmr_assertvalid(pmr);
1625 	}
1626 	uvm_unlock_fpageq();
1627 
1628 	KASSERT(sz >= high - low);
1629 }
1630 
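/*
 * Example (hypothetical addresses, 4 KB pages): uvm_pmr_use_inc(0, 0xffffff)
 * covers the first 16 MB. After high++ the affected page numbers are
 * [atop(0), atop(0x1000000)) = [0, 4096); the pmemranges are split at those
 * page numbers and then their use counters are raised.
 */
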
1631 /*
1632  * Allocate a pmemrange.
1633  *
1634  * If called from uvm_page_init, the uvm_pageboot_alloc is used.
1635  * If called after uvm_init, malloc is used.
1636  * (And if called in between, you're dead.)
1637  */
1638 struct uvm_pmemrange *
1639 uvm_pmr_allocpmr(void)
1640 {
1641 	struct uvm_pmemrange *nw;
1642 	int i;
1643 
1644 	/* We're only ever hitting the !uvm.page_init_done case for now. */
1645 	if (!uvm.page_init_done) {
1646 		nw = (struct uvm_pmemrange *)
1647 		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1648 	} else {
1649 		nw = malloc(sizeof(struct uvm_pmemrange),
1650 		    M_VMMAP, M_NOWAIT);
1651 	}
1652 	KASSERT(nw != NULL);
1653 	memset(nw, 0, sizeof(struct uvm_pmemrange));
1654 	RBT_INIT(uvm_pmr_addr, &nw->addr);
1655 	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1656 		RBT_INIT(uvm_pmr_size, &nw->size[i]);
1657 		TAILQ_INIT(&nw->single[i]);
1658 	}
1659 	return nw;
1660 }
1661 
1662 /*
1663  * Initialization of pmr.
1664  * Called by uvm_page_init.
1665  *
1666  * Sets up pmemranges.
1667  */
1668 void
1669 uvm_pmr_init(void)
1670 {
1671 	struct uvm_pmemrange *new_pmr;
1672 	int i;
1673 
1674 	TAILQ_INIT(&uvm.pmr_control.use);
1675 	RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1676 	TAILQ_INIT(&uvm.pmr_control.allocs);
1677 
1678 	/* By default, one range for the entire address space. */
1679 	new_pmr = uvm_pmr_allocpmr();
1680 	new_pmr->low = 0;
1681 	new_pmr->high = atop((paddr_t)-1) + 1;
1682 
1683 	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1684 	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1685 
1686 	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1687 		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1688 	    	    uvm_md_constraints[i]->ucr_high);
1689 	}
1690 }
1691 
1692 /*
1693  * Find the pmemrange that contains the given page number.
1694  *
1695  * (Manually traverses the binary tree, because that is cheaper on stack
1696  * usage.)
1697  */
1698 struct uvm_pmemrange *
1699 uvm_pmemrange_find(paddr_t pageno)
1700 {
1701 	struct uvm_pmemrange *pmr;
1702 
1703 	pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1704 	while (pmr != NULL) {
1705 		if (pmr->low > pageno)
1706 			pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1707 		else if (pmr->high <= pageno)
1708 			pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1709 		else
1710 			break;
1711 	}
1712 
1713 	return pmr;
1714 }
1715 
1716 #if defined(DDB) || defined(DEBUG)
1717 /*
1718  * Return true if the given page is in any of the free lists.
1719  * Used by uvm_page_printit.
1720  * This function is safe, even if the page is not on the freeq.
1721  * Note: does not apply locking, only called from ddb.
1722  */
1723 int
1724 uvm_pmr_isfree(struct vm_page *pg)
1725 {
1726 	struct vm_page *r;
1727 	struct uvm_pmemrange *pmr;
1728 
1729 	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1730 	if (pmr == NULL)
1731 		return 0;
1732 	r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1733 	if (r == NULL)
1734 		r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1735 	else if (r != pg)
1736 		r = RBT_PREV(uvm_pmr_addr, r);
1737 	if (r == NULL)
1738 		return 0; /* Empty tree. */
1739 
1740 	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1741 	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1742 	    atop(VM_PAGE_TO_PHYS(pg));
1743 }
1744 #endif /* DDB || DEBUG */
1745 
1746 /*
1747  * Given a root of a tree, find a range which intersects start, end and
1748  * is of the same memtype.
1749  *
1750  * Page must be in the address tree.
1751  */
1752 struct vm_page*
1753 uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1754     paddr_t start, paddr_t end, int memtype)
1755 {
1756 	int	direction;
1757 	struct	vm_page *root;
1758 	struct	vm_page *high, *high_next;
1759 	struct	vm_page *low, *low_next;
1760 
1761 	KDASSERT(pmr != NULL && init_root != NULL);
1762 	root = init_root;
1763 
1764 	/* Which direction to use for searching. */
1765 	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1766 		direction =  1;
1767 	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1768 		direction = -1;
1769 	else /* nothing to do */
1770 		return root;
1771 
1772 	/* First, update root to fall within the chosen range. */
1773 	while (root && !PMR_INTERSECTS_WITH(
1774 	    atop(VM_PAGE_TO_PHYS(root)),
1775 	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1776 	    start, end)) {
1777 		if (direction == 1)
1778 			root = RBT_RIGHT(uvm_objtree, root);
1779 		else
1780 			root = RBT_LEFT(uvm_objtree, root);
1781 	}
1782 	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1783 		return root;
1784 
1785 	/*
1786 	 * Root is valid, but of the wrong memtype.
1787 	 *
1788 	 * Try to find a range that has the given memtype in the subtree
1789 	 * (memtype mismatches are costly, either because the conversion
1790 	 * is expensive, or a later allocation will need to do the opposite
1791 	 * conversion, which will be expensive).
1792 	 *
1793 	 *
1794 	 * First, simply increase address until we hit something we can use.
1795 	 * Cache the upper page, so we can page-walk later.
1796 	 */
1797 	high = root;
1798 	high_next = RBT_RIGHT(uvm_objtree, high);
1799 	while (high_next != NULL && PMR_INTERSECTS_WITH(
1800 	    atop(VM_PAGE_TO_PHYS(high_next)),
1801 	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1802 	    start, end)) {
1803 		high = high_next;
1804 		if (uvm_pmr_pg_to_memtype(high) == memtype)
1805 			return high;
1806 		high_next = RBT_RIGHT(uvm_objtree, high);
1807 	}
1808 
1809 	/*
1810 	 * Second, decrease the address until we hit something we can use.
1811 	 * Cache the lower page, so we can page-walk later.
1812 	 */
1813 	low = root;
1814 	low_next = RBT_LEFT(uvm_objtree, low);
1815 	while (low_next != NULL && PMR_INTERSECTS_WITH(
1816 	    atop(VM_PAGE_TO_PHYS(low_next)),
1817 	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1818 	    start, end)) {
1819 		low = low_next;
1820 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1821 			return low;
1822 		low_next = RBT_LEFT(uvm_objtree, low);
1823 	}
1824 
1825 	if (low == high)
1826 		return NULL;
1827 
1828 	/* No hits. Walk the address tree until we find something usable. */
1829 	for (low = RBT_NEXT(uvm_pmr_addr, low);
1830 	    low != high;
1831 	    low = RBT_NEXT(uvm_pmr_addr, low)) {
1832 		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1833 		    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1834 		    start, end));
1835 		if (uvm_pmr_pg_to_memtype(low) == memtype)
1836 			return low;
1837 	}
1838 
1839 	/* Nothing found. */
1840 	return NULL;
1841 }
1842 
1843 /*
1844  * Allocate any page, the fastest way. Page number constraints only.
1845  */
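/*
 * Arguments (as used by the code below):
 *	count		number of pages wanted
 *	memtype_init	memtype to try first
 *	result		list the allocated pages are prepended to
 *	start, end	page number constraints (0 means unconstrained)
 *	memtype_only	if set, do not fall back to other memtypes
 * Returns the number of pages placed on result, which may be less than
 * count; on a short return the caller is expected to free the pages.
 *
 * Example call (from uvm_pagezero_thread() below), grabbing up to 16
 * dirty pages anywhere in memory:
 *	uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, &pgl, 0, 0, 1);
 */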
1846 psize_t
1847 uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1848     paddr_t start, paddr_t end, int memtype_only)
1849 {
1850 	struct	uvm_pmemrange *pmr;
1851 	struct	vm_page *found, *splitpg;
1852 	psize_t	fcount;
1853 	int	memtype;
1854 
1855 	fcount = 0;
1856 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1857 		/* We're done. */
1858 		if (fcount == count)
1859 			break;
1860 
1861 		/* Outside requested range. */
1862 		if (!(start == 0 && end == 0) &&
1863 		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1864 			continue;
1865 
1866 		/* Range is empty. */
1867 		if (pmr->nsegs == 0)
1868 			continue;
1869 
1870 		/* Loop over all memtypes, starting at memtype_init. */
1871 		memtype = memtype_init;
1872 		while (fcount != count) {
1873 			found = TAILQ_FIRST(&pmr->single[memtype]);
1874 			/*
1875 			 * If found is outside the range, walk the list
1876 			 * until we find something that intersects with
1877 			 * boundaries.
1878 			 */
1879 			while (found && !PMR_INTERSECTS_WITH(
1880 			    atop(VM_PAGE_TO_PHYS(found)),
1881 			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1882 			    start, end))
1883 				found = TAILQ_NEXT(found, pageq);
1884 
1885 			if (found == NULL) {
1886 				/*
1887 				 * Check if the size tree contains a range
1888 				 * that intersects with the boundaries. As the
1889 				 * allocation is for any page, try the smallest
1890 				 * range so that large ranges are preserved for
1891 				 * more constrained cases. Only one entry is
1892 				 * checked here, to avoid a brute-force search.
1893 				 *
1894 				 * Note that a size tree gives pg[1] instead of
1895 				 * pg[0].
1896 				 */
1897 				found = RBT_MIN(uvm_pmr_size,
1898 				    &pmr->size[memtype]);
1899 				if (found != NULL) {
1900 					found--;
1901 					if (!PMR_INTERSECTS_WITH(
1902 					    atop(VM_PAGE_TO_PHYS(found)),
1903 					    atop(VM_PAGE_TO_PHYS(found)) +
1904 					    found->fpgsz, start, end))
1905 						found = NULL;
1906 				}
1907 			}
1908 			if (found == NULL) {
1909 				/*
1910 				 * Try address-guided search to meet the page
1911 				 * number constraints.
1912 				 */
1913 				found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1914 				if (found != NULL) {
1915 					found = uvm_pmr_rootupdate(pmr, found,
1916 					    start, end, memtype);
1917 				}
1918 			}
1919 			if (found != NULL) {
1920 				uvm_pmr_assertvalid(pmr);
1921 				uvm_pmr_remove_size(pmr, found);
1922 
1923 				/*
1924 				 * If the page intersects the end, then it'll
1925 				 * need splitting.
1926 				 *
1927 				 * Note that we don't need to split if the page
1928 				 * intersects start: the drain function will
1929 				 * simply stop on hitting start.
1930 				 */
1931 				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1932 				    found->fpgsz > end) {
1933 					psize_t splitsz =
1934 					    atop(VM_PAGE_TO_PHYS(found)) +
1935 					    found->fpgsz - end;
1936 
1937 					uvm_pmr_remove_addr(pmr, found);
1938 					uvm_pmr_assertvalid(pmr);
1939 					found->fpgsz -= splitsz;
1940 					splitpg = found + found->fpgsz;
1941 					splitpg->fpgsz = splitsz;
1942 					uvm_pmr_insert(pmr, splitpg, 1);
1943 
1944 					/*
1945 					 * At this point, splitpg and found
1946 					 * actually should be joined.
1947 					 * But we explicitly disable that,
1948 					 * because we will start subtracting
1949 					 * from found.
1950 					 */
1951 					KASSERT(start == 0 ||
1952 					    atop(VM_PAGE_TO_PHYS(found)) +
1953 					    found->fpgsz > start);
1954 					uvm_pmr_insert_addr(pmr, found, 1);
1955 				}
1956 
1957 				/*
1958 				 * Fetch pages from the end.
1959 				 * If the range is larger than the requested
1960 				 * number of pages, this saves us an addr-tree
1961 				 * update.
1962 				 *
1963 				 * Since we take from the end and insert at
1964 				 * the head, the ranges are preserved.
1965 				 */
1966 				while (found->fpgsz > 0 && fcount < count &&
1967 				    (start == 0 ||
1968 				    atop(VM_PAGE_TO_PHYS(found)) +
1969 				    found->fpgsz > start)) {
1970 					found->fpgsz--;
1971 					fcount++;
1972 					TAILQ_INSERT_HEAD(result,
1973 					    &found[found->fpgsz], pageq);
1974 				}
1975 				if (found->fpgsz > 0) {
1976 					uvm_pmr_insert_size(pmr, found);
1977 					KDASSERT(fcount == count);
1978 					uvm_pmr_assertvalid(pmr);
1979 					return fcount;
1980 				}
1981 
1982 				/*
1983 				 * Delayed addr-tree removal.
1984 				 */
1985 				uvm_pmr_remove_addr(pmr, found);
1986 				uvm_pmr_assertvalid(pmr);
1987 			} else {
1988 				if (memtype_only)
1989 					break;
1990 				/*
1991 				 * Skip to the next memtype.
1992 				 */
1993 				memtype += 1;
1994 				if (memtype == UVM_PMR_MEMTYPE_MAX)
1995 					memtype = 0;
1996 				if (memtype == memtype_init)
1997 					break;
1998 			}
1999 		}
2000 	}
2001 
2002 	/*
2003 	 * Search finished.
2004 	 *
2005 	 * Ran out of ranges before enough pages were gathered, or we hit the
2006 	 * case where found->fpgsz == count - fcount, in which case the
2007 	 * above exit condition didn't trigger.
2008 	 *
2009 	 * On failure, caller will free the pages.
2010 	 */
2011 	return fcount;
2012 }
2013 
2014 #ifdef DDB
2015 /*
2016  * Print information about pmemrange.
2017  * Does not do locking (so either call it from DDB or acquire fpageq lock
2018  * before invoking).
2019  */
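/*
 * Output is one line per range on the use queue, roughly:
 *	* [0xLOW-0xHIGH] use=U nsegs=N maxsegsz[mt]=0x... free=0x...
 * with one maxsegsz field per memtype, followed by the total range count.
 */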
2020 void
2021 uvm_pmr_print(void)
2022 {
2023 	struct	uvm_pmemrange *pmr;
2024 	struct	vm_page *pg;
2025 	psize_t	size[UVM_PMR_MEMTYPE_MAX];
2026 	psize_t	free;
2027 	int	useq_len;
2028 	int	mt;
2029 
2030 	printf("Ranges, use queue:\n");
2031 	useq_len = 0;
2032 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
2033 		useq_len++;
2034 		free = 0;
2035 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2036 			pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
2037 			if (pg != NULL)
2038 				pg--;
2039 			else
2040 				pg = TAILQ_FIRST(&pmr->single[mt]);
2041 			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
2042 
2043 			RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
2044 				free += pg->fpgsz;
2045 		}
2046 
2047 		printf("* [0x%lx-0x%lx] use=%d nsegs=%lu",
2048 		    (unsigned long)pmr->low, (unsigned long)pmr->high,
2049 		    pmr->use, (unsigned long)pmr->nsegs);
2050 		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2051 			printf(" maxsegsz[%d]=0x%lx", mt,
2052 			    (unsigned long)size[mt]);
2053 		}
2054 		printf(" free=0x%lx\n", (unsigned long)free);
2055 	}
2056 	printf("#ranges = %d\n", useq_len);
2057 }
2058 #endif
2059 
2060 /*
2061  * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
2062  * in a specific physmem area.
2063  *
2064  * Returns ENOMEM if the pagedaemon failed to free any pages.
2065  * If not failok, a warning is printed and the wait is retried instead.
2066  *
2067  * Must be called with fpageq locked.
2068  */
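/*
 * Rough flow of the non-pagedaemon case: a struct uvm_pmalloc describing
 * the constraint is queued on uvm.pmr_control.allocs, the page daemon is
 * woken, and we sleep until uvm_wakeup_pla() clears UVM_PMA_LINKED
 * (memory was freed in our window) or the request ends up marked
 * UVM_PMA_FAIL (presumably by the page daemon when it gives up).
 */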
2069 int
2070 uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
2071 {
2072 	struct uvm_pmalloc pma;
2073 	const char *wmsg = "pmrwait";
2074 
2075 	if (curproc == uvm.pagedaemon_proc) {
2076 		/*
2077 		 * This is not that uncommon when the pagedaemon is trying
2078 		 * to flush out a large mmapped file. VOP_WRITE will circle
2079 		 * back through the buffer cache and try to get more memory.
2080 		 * The pagedaemon starts by calling bufbackoff, but we can
2081 		 * easily use up that reserve in a single scan iteration.
2082 		 */
2083 		uvm_unlock_fpageq();
2084 		if (bufbackoff(NULL, atop(size)) == 0) {
2085 			uvm_lock_fpageq();
2086 			return 0;
2087 		}
2088 		uvm_lock_fpageq();
2089 
2090 		/*
2091 		 * XXX detect pagedaemon deadlock - see comment in
2092 		 * uvm_wait(), as this is exactly the same issue.
2093 		 */
2094 		printf("pagedaemon: wait_pla deadlock detected!\n");
2095 		msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
2096 		    MSEC_TO_NSEC(125));
2097 #if defined(DEBUG)
2098 		/* DEBUG: panic so we can debug it */
2099 		panic("wait_pla pagedaemon deadlock");
2100 #endif
2101 		return 0;
2102 	}
2103 
2104 	for (;;) {
2105 		pma.pm_constraint.ucr_low = low;
2106 		pma.pm_constraint.ucr_high = high;
2107 		pma.pm_size = size;
2108 		pma.pm_flags = UVM_PMA_LINKED;
2109 		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
2110 
2111 		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
2112 		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
2113 			msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
2114 
2115 		if (!(pma.pm_flags & UVM_PMA_FREED) &&
2116 		    pma.pm_flags & UVM_PMA_FAIL) {
2117 			if (failok)
2118 				return ENOMEM;
2119 			printf("uvm_wait: failed to free %lu pages between "
2120 			    "0x%lx-0x%lx\n", atop(size), low, high);
2121 		} else
2122 			return 0;
2123 	}
2124 	/* UNREACHABLE */
2125 }
2126 
2127 /*
2128  * Wake up uvm_pmalloc sleepers.
2129  */
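/*
 * Any queued uvm_pmalloc whose constraint window overlaps [low, low + len)
 * is marked UVM_PMA_FREED; unless it is currently UVM_PMA_BUSY it is also
 * unlinked from the allocs queue and its sleeper woken.
 */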
2130 void
2131 uvm_wakeup_pla(paddr_t low, psize_t len)
2132 {
2133 	struct uvm_pmalloc *pma, *pma_next;
2134 	paddr_t high;
2135 
2136 	high = low + len;
2137 
2138 	/* Wake specific allocations waiting for this memory. */
2139 	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
2140 	    pma = pma_next) {
2141 		pma_next = TAILQ_NEXT(pma, pmq);
2142 
2143 		if (low < pma->pm_constraint.ucr_high &&
2144 		    high > pma->pm_constraint.ucr_low) {
2145 			pma->pm_flags |= UVM_PMA_FREED;
2146 			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
2147 				pma->pm_flags &= ~UVM_PMA_LINKED;
2148 				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
2149 				    pmq);
2150 				wakeup(pma);
2151 			}
2152 		}
2153 	}
2154 }
2155 
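/*
 * Page zeroing thread: runs at the lowest possible priority and, while the
 * pool of pre-zeroed pages is below UVM_PAGEZERO_TARGET, pulls batches of
 * up to 16 dirty free pages, zeroes them, marks them PG_ZERO and hands
 * them back to the free page allocator, bumping uvmexp.zeropages.
 */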
2156 void
2157 uvm_pagezero_thread(void *arg)
2158 {
2159 	struct pglist pgl;
2160 	struct vm_page *pg;
2161 	int count;
2162 
2163 	/* Run at the lowest possible priority. */
2164 	curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2165 
2166 	KERNEL_UNLOCK();
2167 
2168 	TAILQ_INIT(&pgl);
2169 	for (;;) {
2170 		uvm_lock_fpageq();
2171 		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2172 		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2173 		     &pgl, 0, 0, 1)) == 0) {
2174 			msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
2175 			    MAXPRI, "pgzero", INFSLP);
2176 		}
2177 		uvm_unlock_fpageq();
2178 
2179 		TAILQ_FOREACH(pg, &pgl, pageq) {
2180 			uvm_pagezero(pg);
2181 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2182 		}
2183 
2184 		uvm_lock_fpageq();
2185 		while (!TAILQ_EMPTY(&pgl))
2186 			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2187 		uvmexp.zeropages += count;
2188 		uvm_unlock_fpageq();
2189 
2190 		yield();
2191 	}
2192 }
2193