xref: /freebsd-src/sys/vm/vm_phys.c (revision 0078df5f025854600595462e56fa95d34e732141)
111752d88SAlan Cox /*-
24d846d26SWarner Losh  * SPDX-License-Identifier: BSD-2-Clause
3fe267a55SPedro F. Giffuni  *
411752d88SAlan Cox  * Copyright (c) 2002-2006 Rice University
511752d88SAlan Cox  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
611752d88SAlan Cox  * All rights reserved.
711752d88SAlan Cox  *
811752d88SAlan Cox  * This software was developed for the FreeBSD Project by Alan L. Cox,
911752d88SAlan Cox  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
1011752d88SAlan Cox  *
1111752d88SAlan Cox  * Redistribution and use in source and binary forms, with or without
1211752d88SAlan Cox  * modification, are permitted provided that the following conditions
1311752d88SAlan Cox  * are met:
1411752d88SAlan Cox  * 1. Redistributions of source code must retain the above copyright
1511752d88SAlan Cox  *    notice, this list of conditions and the following disclaimer.
1611752d88SAlan Cox  * 2. Redistributions in binary form must reproduce the above copyright
1711752d88SAlan Cox  *    notice, this list of conditions and the following disclaimer in the
1811752d88SAlan Cox  *    documentation and/or other materials provided with the distribution.
1911752d88SAlan Cox  *
2011752d88SAlan Cox  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
2111752d88SAlan Cox  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
2211752d88SAlan Cox  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
2311752d88SAlan Cox  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
2411752d88SAlan Cox  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
2511752d88SAlan Cox  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
2611752d88SAlan Cox  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
2711752d88SAlan Cox  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
2811752d88SAlan Cox  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2911752d88SAlan Cox  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
3011752d88SAlan Cox  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
3111752d88SAlan Cox  * POSSIBILITY OF SUCH DAMAGE.
3211752d88SAlan Cox  */
3311752d88SAlan Cox 
34fbd80bd0SAlan Cox /*
35fbd80bd0SAlan Cox  *	Physical memory system implementation
36fbd80bd0SAlan Cox  *
37fbd80bd0SAlan Cox  * Any external functions defined by this module are only to be used by the
38fbd80bd0SAlan Cox  * virtual memory system.
39fbd80bd0SAlan Cox  */
40fbd80bd0SAlan Cox 
4111752d88SAlan Cox #include <sys/cdefs.h>
4211752d88SAlan Cox #include "opt_ddb.h"
43174b5f38SJohn Baldwin #include "opt_vm.h"
4411752d88SAlan Cox 
4511752d88SAlan Cox #include <sys/param.h>
4611752d88SAlan Cox #include <sys/systm.h>
47662e7fa8SMark Johnston #include <sys/domainset.h>
4811752d88SAlan Cox #include <sys/lock.h>
4911752d88SAlan Cox #include <sys/kernel.h>
50b16b4c22SMark Johnston #include <sys/kthread.h>
5111752d88SAlan Cox #include <sys/malloc.h>
5211752d88SAlan Cox #include <sys/mutex.h>
537e226537SAttilio Rao #include <sys/proc.h>
5411752d88SAlan Cox #include <sys/queue.h>
5538d6b2dcSRoger Pau Monné #include <sys/rwlock.h>
5611752d88SAlan Cox #include <sys/sbuf.h>
57b16b4c22SMark Johnston #include <sys/sched.h>
5811752d88SAlan Cox #include <sys/sysctl.h>
5938d6b2dcSRoger Pau Monné #include <sys/tree.h>
60b16b4c22SMark Johnston #include <sys/tslog.h>
61b16b4c22SMark Johnston #include <sys/unistd.h>
6211752d88SAlan Cox #include <sys/vmmeter.h>
6311752d88SAlan Cox 
6411752d88SAlan Cox #include <ddb/ddb.h>
6511752d88SAlan Cox 
6611752d88SAlan Cox #include <vm/vm.h>
6701e115abSDoug Moore #include <vm/vm_extern.h>
6811752d88SAlan Cox #include <vm/vm_param.h>
6911752d88SAlan Cox #include <vm/vm_kern.h>
7011752d88SAlan Cox #include <vm/vm_object.h>
7111752d88SAlan Cox #include <vm/vm_page.h>
7211752d88SAlan Cox #include <vm/vm_phys.h>
73e2068d0bSJeff Roberson #include <vm/vm_pagequeue.h>
7411752d88SAlan Cox 
75449c2e92SKonstantin Belousov _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
76449c2e92SKonstantin Belousov     "Too many physsegs.");
77c9b06fa5SDoug Moore _Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
78c9b06fa5SDoug Moore     "vm_paddr_t too big for ffsll, flsll.");
7911752d88SAlan Cox 
80b6715dabSJeff Roberson #ifdef NUMA
81cdfeced8SJeff Roberson struct mem_affinity __read_mostly *mem_affinity;
82cdfeced8SJeff Roberson int __read_mostly *mem_locality;
83c415cfc8SZhenlei Huang 
84c415cfc8SZhenlei Huang static int numa_disabled;
85c415cfc8SZhenlei Huang static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
86c415cfc8SZhenlei Huang     "NUMA options");
87c415cfc8SZhenlei Huang SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
88c415cfc8SZhenlei Huang     &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
8962d70a81SJohn Baldwin #endif
90a3870a18SJohn Baldwin 
91cdfeced8SJeff Roberson int __read_mostly vm_ndomains = 1;
92463406acSMark Johnston domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);
937e226537SAttilio Rao 
94cdfeced8SJeff Roberson struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
95cdfeced8SJeff Roberson int __read_mostly vm_phys_nsegs;
9681302f1dSMark Johnston static struct vm_phys_seg vm_phys_early_segs[8];
9781302f1dSMark Johnston static int vm_phys_early_nsegs;
9811752d88SAlan Cox 
9938d6b2dcSRoger Pau Monné struct vm_phys_fictitious_seg;
10038d6b2dcSRoger Pau Monné static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
10138d6b2dcSRoger Pau Monné     struct vm_phys_fictitious_seg *);
10238d6b2dcSRoger Pau Monné 
10338d6b2dcSRoger Pau Monné RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
104b649c2acSDoug Moore     RB_INITIALIZER(&vm_phys_fictitious_tree);
10538d6b2dcSRoger Pau Monné 
10638d6b2dcSRoger Pau Monné struct vm_phys_fictitious_seg {
10738d6b2dcSRoger Pau Monné 	RB_ENTRY(vm_phys_fictitious_seg) node;
10838d6b2dcSRoger Pau Monné 	/* Memory region data */
109b6de32bdSKonstantin Belousov 	vm_paddr_t	start;
110b6de32bdSKonstantin Belousov 	vm_paddr_t	end;
111b6de32bdSKonstantin Belousov 	vm_page_t	first_page;
11238d6b2dcSRoger Pau Monné };
11338d6b2dcSRoger Pau Monné 
11438d6b2dcSRoger Pau Monné RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
11538d6b2dcSRoger Pau Monné     vm_phys_fictitious_cmp);
11638d6b2dcSRoger Pau Monné 
117cdfeced8SJeff Roberson static struct rwlock_padalign vm_phys_fictitious_reg_lock;
118c0432fc3SMark Johnston MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
119b6de32bdSKonstantin Belousov 
120cdfeced8SJeff Roberson static struct vm_freelist __aligned(CACHE_LINE_SIZE)
121f2a496d6SKonstantin Belousov     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
122f2a496d6SKonstantin Belousov     [VM_NFREEORDER_MAX];
12311752d88SAlan Cox 
124cdfeced8SJeff Roberson static int __read_mostly vm_nfreelists;
125d866a563SAlan Cox 
126d866a563SAlan Cox /*
12721943937SJeff Roberson  * These "avail lists" are globals used to communicate boot-time physical
12821943937SJeff Roberson  * memory layout to other parts of the kernel.  Each physically contiguous
12921943937SJeff Roberson  * region of memory is defined by a start address at an even index and an
13021943937SJeff Roberson  * end address at the following odd index.  Each list is terminated by a
13121943937SJeff Roberson  * pair of zero entries.
13221943937SJeff Roberson  *
13321943937SJeff Roberson  * dump_avail tells the dump code what regions to include in a crash dump, and
13421943937SJeff Roberson  * phys_avail is all of the remaining physical memory that is available for
13521943937SJeff Roberson  * the vm system.
13621943937SJeff Roberson  *
13721943937SJeff Roberson  * Initially dump_avail and phys_avail are identical.  Boot time memory
13821943937SJeff Roberson  * allocations remove extents from phys_avail that may still be included
13921943937SJeff Roberson  * in dumps.
14021943937SJeff Roberson  */
14121943937SJeff Roberson vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
14221943937SJeff Roberson vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
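
/*
 * Illustrative example (made-up addresses, not from any particular machine):
 * a machine with two usable RAM ranges, where early boot code later allocates
 * the first 1M of the second range, might end up with
 *
 *	dump_avail[] = { 0x1000, 0x9f000, 0x100000, 0xbfe00000, 0, 0 };
 *	phys_avail[] = { 0x1000, 0x9f000, 0x200000, 0xbfe00000, 0, 0 };
 *
 * that is, [start, end) pairs at (even, odd) indices, terminated by a pair of
 * zeroes, with phys_avail missing the extent claimed by the boot-time
 * allocation while dump_avail still covers it.
 */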
14321943937SJeff Roberson 
14421943937SJeff Roberson /*
145d866a563SAlan Cox  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
146d866a563SAlan Cox  */
147cdfeced8SJeff Roberson static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
148b16b4c22SMark Johnston static int __read_mostly vm_default_freepool;
149d866a563SAlan Cox 
150d866a563SAlan Cox CTASSERT(VM_FREELIST_DEFAULT == 0);
151d866a563SAlan Cox 
152d866a563SAlan Cox #ifdef VM_FREELIST_DMA32
153d866a563SAlan Cox #define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
154d866a563SAlan Cox #endif
155d866a563SAlan Cox 
156d866a563SAlan Cox /*
157d866a563SAlan Cox  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
158d866a563SAlan Cox  * the ordering of the free list boundaries.
159d866a563SAlan Cox  */
160d866a563SAlan Cox #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
161d866a563SAlan Cox CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
162d866a563SAlan Cox #endif
16311752d88SAlan Cox 
16411752d88SAlan Cox static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
1657029da5cSPawel Biernacki SYSCTL_OID(_vm, OID_AUTO, phys_free,
166114484b7SMark Johnston     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1677029da5cSPawel Biernacki     sysctl_vm_phys_free, "A",
1687029da5cSPawel Biernacki     "Phys Free Info");
16911752d88SAlan Cox 
17011752d88SAlan Cox static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
1717029da5cSPawel Biernacki SYSCTL_OID(_vm, OID_AUTO, phys_segs,
172114484b7SMark Johnston     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1737029da5cSPawel Biernacki     sysctl_vm_phys_segs, "A",
1747029da5cSPawel Biernacki     "Phys Seg Info");
17511752d88SAlan Cox 
176b6715dabSJeff Roberson #ifdef NUMA
177415d7ccaSAdrian Chadd static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
1787029da5cSPawel Biernacki SYSCTL_OID(_vm, OID_AUTO, phys_locality,
179114484b7SMark Johnston     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1807029da5cSPawel Biernacki     sysctl_vm_phys_locality, "A",
1817029da5cSPawel Biernacki     "Phys Locality Info");
1826520495aSAdrian Chadd #endif
183415d7ccaSAdrian Chadd 
1847e226537SAttilio Rao SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
1857e226537SAttilio Rao     &vm_ndomains, 0, "Number of physical memory domains available.");
186a3870a18SJohn Baldwin 
187d866a563SAlan Cox static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
188d866a563SAlan Cox static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
18911752d88SAlan Cox static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
190*0078df5fSDoug Moore     int order, int pool, int tail);
191c606ab59SDoug Moore 
192b16b4c22SMark Johnston static bool __diagused
193b16b4c22SMark Johnston vm_phys_pool_valid(int pool)
194b16b4c22SMark Johnston {
195b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
196b16b4c22SMark Johnston 	if (pool == VM_FREEPOOL_LAZYINIT)
197b16b4c22SMark Johnston 		return (false);
198b16b4c22SMark Johnston #endif
199b16b4c22SMark Johnston 	return (pool >= 0 && pool < VM_NFREEPOOL);
200b16b4c22SMark Johnston }
201b16b4c22SMark Johnston 
20238d6b2dcSRoger Pau Monné /*
20338d6b2dcSRoger Pau Monné  * Red-black tree helpers for vm fictitious range management.
20438d6b2dcSRoger Pau Monné  */
20538d6b2dcSRoger Pau Monné static inline int
20638d6b2dcSRoger Pau Monné vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
20738d6b2dcSRoger Pau Monné     struct vm_phys_fictitious_seg *range)
20838d6b2dcSRoger Pau Monné {
20938d6b2dcSRoger Pau Monné 
21038d6b2dcSRoger Pau Monné 	KASSERT(range->start != 0 && range->end != 0,
21138d6b2dcSRoger Pau Monné 	    ("Invalid range passed on search for vm_fictitious page"));
21238d6b2dcSRoger Pau Monné 	if (p->start >= range->end)
21338d6b2dcSRoger Pau Monné 		return (1);
21438d6b2dcSRoger Pau Monné 	if (p->start < range->start)
21538d6b2dcSRoger Pau Monné 		return (-1);
21638d6b2dcSRoger Pau Monné 
21738d6b2dcSRoger Pau Monné 	return (0);
21838d6b2dcSRoger Pau Monné }
21938d6b2dcSRoger Pau Monné 
22038d6b2dcSRoger Pau Monné static int
22138d6b2dcSRoger Pau Monné vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
22238d6b2dcSRoger Pau Monné     struct vm_phys_fictitious_seg *p2)
22338d6b2dcSRoger Pau Monné {
22438d6b2dcSRoger Pau Monné 
22538d6b2dcSRoger Pau Monné 	/* Check if this is a search for a page */
22638d6b2dcSRoger Pau Monné 	if (p1->end == 0)
22738d6b2dcSRoger Pau Monné 		return (vm_phys_fictitious_in_range(p1, p2));
22838d6b2dcSRoger Pau Monné 
22938d6b2dcSRoger Pau Monné 	KASSERT(p2->end != 0,
23038d6b2dcSRoger Pau Monné     ("Invalid range passed as second parameter to vm fictitious comparison"));
23138d6b2dcSRoger Pau Monné 
23238d6b2dcSRoger Pau Monné 	/* Searching to add a new range */
23338d6b2dcSRoger Pau Monné 	if (p1->end <= p2->start)
23438d6b2dcSRoger Pau Monné 		return (-1);
23538d6b2dcSRoger Pau Monné 	if (p1->start >= p2->end)
23638d6b2dcSRoger Pau Monné 		return (1);
23738d6b2dcSRoger Pau Monné 
23838d6b2dcSRoger Pau Monné 	panic("Trying to add overlapping vm fictitious ranges:\n"
23938d6b2dcSRoger Pau Monné 	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
24038d6b2dcSRoger Pau Monné 	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
24138d6b2dcSRoger Pau Monné }
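
/*
 * Illustrative lookup sketch: a point query against the fictitious tree is
 * keyed by a degenerate segment whose end is zero, which makes the comparator
 * above defer to vm_phys_fictitious_in_range():
 *
 *	struct vm_phys_fictitious_seg tmp;
 *
 *	tmp.start = pa;
 *	tmp.end = 0;
 *	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 *
 * vm_phys_fictitious_to_vm_page() below performs exactly this lookup while
 * holding vm_phys_fictitious_reg_lock.
 */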
24238d6b2dcSRoger Pau Monné 
2436f4acaf4SJeff Roberson int
244cb20a74cSStephen J. Kiernan vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
245cb20a74cSStephen J. Kiernan     vm_paddr_t high __numa_used)
246449c2e92SKonstantin Belousov {
247b6715dabSJeff Roberson #ifdef NUMA
2486f4acaf4SJeff Roberson 	domainset_t mask;
2496f4acaf4SJeff Roberson 	int i;
250449c2e92SKonstantin Belousov 
2516f4acaf4SJeff Roberson 	if (vm_ndomains == 1 || mem_affinity == NULL)
2526f4acaf4SJeff Roberson 		return (0);
2536f4acaf4SJeff Roberson 
2546f4acaf4SJeff Roberson 	DOMAINSET_ZERO(&mask);
2556f4acaf4SJeff Roberson 	/*
2566f4acaf4SJeff Roberson 	 * Check for any memory that overlaps low, high.
2576f4acaf4SJeff Roberson 	 */
2586f4acaf4SJeff Roberson 	for (i = 0; mem_affinity[i].end != 0; i++)
2596f4acaf4SJeff Roberson 		if (mem_affinity[i].start <= high &&
2606f4acaf4SJeff Roberson 		    mem_affinity[i].end >= low)
2616f4acaf4SJeff Roberson 			DOMAINSET_SET(mem_affinity[i].domain, &mask);
2626f4acaf4SJeff Roberson 	if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
2636f4acaf4SJeff Roberson 		return (prefer);
2646f4acaf4SJeff Roberson 	if (DOMAINSET_EMPTY(&mask))
2656f4acaf4SJeff Roberson 		panic("vm_phys_domain_match:  Impossible constraint");
2666f4acaf4SJeff Roberson 	return (DOMAINSET_FFS(&mask) - 1);
2676f4acaf4SJeff Roberson #else
2686f4acaf4SJeff Roberson 	return (0);
2696f4acaf4SJeff Roberson #endif
270449c2e92SKonstantin Belousov }
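
/*
 * Worked example (hypothetical two-domain layout, for illustration only):
 * with mem_affinity describing domain 0 as [0, 4G) and domain 1 as [4G, 8G),
 * vm_phys_domain_match(-1, 5G, 6G) sets only domain 1 in the mask and returns
 * 1, whereas vm_phys_domain_match(0, 3G, 6G) finds both domains overlapping
 * the range and returns the preferred domain, 0.
 */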
271449c2e92SKonstantin Belousov 
27211752d88SAlan Cox /*
27311752d88SAlan Cox  * Outputs the state of the physical memory allocator, specifically,
27411752d88SAlan Cox  * the amount of physical memory in each free list.
27511752d88SAlan Cox  */
27611752d88SAlan Cox static int
27711752d88SAlan Cox sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
27811752d88SAlan Cox {
27911752d88SAlan Cox 	struct sbuf sbuf;
28011752d88SAlan Cox 	struct vm_freelist *fl;
2817e226537SAttilio Rao 	int dom, error, flind, oind, pind;
28211752d88SAlan Cox 
28300f0e671SMatthew D Fleming 	error = sysctl_wire_old_buffer(req, 0);
28400f0e671SMatthew D Fleming 	if (error != 0)
28500f0e671SMatthew D Fleming 		return (error);
2867e226537SAttilio Rao 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
2877e226537SAttilio Rao 	for (dom = 0; dom < vm_ndomains; dom++) {
288eb2f42fbSAlan Cox 		sbuf_printf(&sbuf, "\nDOMAIN %d:\n", dom);
28911752d88SAlan Cox 		for (flind = 0; flind < vm_nfreelists; flind++) {
290eb2f42fbSAlan Cox 			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
29111752d88SAlan Cox 			    "\n  ORDER (SIZE)  |  NUMBER"
29211752d88SAlan Cox 			    "\n              ", flind);
29311752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
29411752d88SAlan Cox 				sbuf_printf(&sbuf, "  |  POOL %d", pind);
29511752d88SAlan Cox 			sbuf_printf(&sbuf, "\n--            ");
29611752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
29711752d88SAlan Cox 				sbuf_printf(&sbuf, "-- --      ");
29811752d88SAlan Cox 			sbuf_printf(&sbuf, "--\n");
29911752d88SAlan Cox 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
300d689bc00SAlan Cox 				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
30111752d88SAlan Cox 				    1 << (PAGE_SHIFT - 10 + oind));
30211752d88SAlan Cox 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
3037e226537SAttilio Rao 				fl = vm_phys_free_queues[dom][flind][pind];
304eb2f42fbSAlan Cox 					sbuf_printf(&sbuf, "  |  %6d",
3057e226537SAttilio Rao 					    fl[oind].lcnt);
30611752d88SAlan Cox 				}
30711752d88SAlan Cox 				sbuf_printf(&sbuf, "\n");
30811752d88SAlan Cox 			}
3097e226537SAttilio Rao 		}
31011752d88SAlan Cox 	}
3114e657159SMatthew D Fleming 	error = sbuf_finish(&sbuf);
31211752d88SAlan Cox 	sbuf_delete(&sbuf);
31311752d88SAlan Cox 	return (error);
31411752d88SAlan Cox }
31511752d88SAlan Cox 
31611752d88SAlan Cox /*
31711752d88SAlan Cox  * Outputs the set of physical memory segments.
31811752d88SAlan Cox  */
31911752d88SAlan Cox static int
32011752d88SAlan Cox sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
32111752d88SAlan Cox {
32211752d88SAlan Cox 	struct sbuf sbuf;
32311752d88SAlan Cox 	struct vm_phys_seg *seg;
32411752d88SAlan Cox 	int error, segind;
32511752d88SAlan Cox 
32600f0e671SMatthew D Fleming 	error = sysctl_wire_old_buffer(req, 0);
32700f0e671SMatthew D Fleming 	if (error != 0)
32800f0e671SMatthew D Fleming 		return (error);
3294e657159SMatthew D Fleming 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
33011752d88SAlan Cox 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
33111752d88SAlan Cox 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
33211752d88SAlan Cox 		seg = &vm_phys_segs[segind];
33311752d88SAlan Cox 		sbuf_printf(&sbuf, "start:     %#jx\n",
33411752d88SAlan Cox 		    (uintmax_t)seg->start);
33511752d88SAlan Cox 		sbuf_printf(&sbuf, "end:       %#jx\n",
33611752d88SAlan Cox 		    (uintmax_t)seg->end);
337a3870a18SJohn Baldwin 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
33811752d88SAlan Cox 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
33911752d88SAlan Cox 	}
3404e657159SMatthew D Fleming 	error = sbuf_finish(&sbuf);
34111752d88SAlan Cox 	sbuf_delete(&sbuf);
34211752d88SAlan Cox 	return (error);
34311752d88SAlan Cox }
34411752d88SAlan Cox 
345415d7ccaSAdrian Chadd /*
346415d7ccaSAdrian Chadd  * Return affinity, or -1 if there's no affinity information.
347415d7ccaSAdrian Chadd  */
3486520495aSAdrian Chadd int
349cb20a74cSStephen J. Kiernan vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
350415d7ccaSAdrian Chadd {
351415d7ccaSAdrian Chadd 
352b6715dabSJeff Roberson #ifdef NUMA
353415d7ccaSAdrian Chadd 	if (mem_locality == NULL)
354415d7ccaSAdrian Chadd 		return (-1);
355415d7ccaSAdrian Chadd 	if (f >= vm_ndomains || t >= vm_ndomains)
356415d7ccaSAdrian Chadd 		return (-1);
357415d7ccaSAdrian Chadd 	return (mem_locality[f * vm_ndomains + t]);
3586520495aSAdrian Chadd #else
3596520495aSAdrian Chadd 	return (-1);
3606520495aSAdrian Chadd #endif
361415d7ccaSAdrian Chadd }
362415d7ccaSAdrian Chadd 
363b6715dabSJeff Roberson #ifdef NUMA
364415d7ccaSAdrian Chadd /*
365415d7ccaSAdrian Chadd  * Outputs the VM locality table.
366415d7ccaSAdrian Chadd  */
367415d7ccaSAdrian Chadd static int
368415d7ccaSAdrian Chadd sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
369415d7ccaSAdrian Chadd {
370415d7ccaSAdrian Chadd 	struct sbuf sbuf;
371415d7ccaSAdrian Chadd 	int error, i, j;
372415d7ccaSAdrian Chadd 
373415d7ccaSAdrian Chadd 	error = sysctl_wire_old_buffer(req, 0);
374415d7ccaSAdrian Chadd 	if (error != 0)
375415d7ccaSAdrian Chadd 		return (error);
376415d7ccaSAdrian Chadd 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
377415d7ccaSAdrian Chadd 
378415d7ccaSAdrian Chadd 	sbuf_printf(&sbuf, "\n");
379415d7ccaSAdrian Chadd 
380415d7ccaSAdrian Chadd 	for (i = 0; i < vm_ndomains; i++) {
381415d7ccaSAdrian Chadd 		sbuf_printf(&sbuf, "%d: ", i);
382415d7ccaSAdrian Chadd 		for (j = 0; j < vm_ndomains; j++) {
383415d7ccaSAdrian Chadd 			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
384415d7ccaSAdrian Chadd 		}
385415d7ccaSAdrian Chadd 		sbuf_printf(&sbuf, "\n");
386415d7ccaSAdrian Chadd 	}
387415d7ccaSAdrian Chadd 	error = sbuf_finish(&sbuf);
388415d7ccaSAdrian Chadd 	sbuf_delete(&sbuf);
389415d7ccaSAdrian Chadd 	return (error);
390415d7ccaSAdrian Chadd }
3916520495aSAdrian Chadd #endif
392415d7ccaSAdrian Chadd 
3937e226537SAttilio Rao static void
394*0078df5fSDoug Moore vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool,
395*0078df5fSDoug Moore     int tail)
396a3870a18SJohn Baldwin {
397a3870a18SJohn Baldwin 
3987e226537SAttilio Rao 	m->order = order;
399*0078df5fSDoug Moore 	m->pool = pool;
4007e226537SAttilio Rao 	if (tail)
4015cd29d0fSMark Johnston 		TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
4027e226537SAttilio Rao 	else
4035cd29d0fSMark Johnston 		TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
4047e226537SAttilio Rao 	fl[order].lcnt++;
405a3870a18SJohn Baldwin }
4067e226537SAttilio Rao 
4077e226537SAttilio Rao static void
4087e226537SAttilio Rao vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
4097e226537SAttilio Rao {
4107e226537SAttilio Rao 
4115cd29d0fSMark Johnston 	TAILQ_REMOVE(&fl[order].pl, m, listq);
4127e226537SAttilio Rao 	fl[order].lcnt--;
4137e226537SAttilio Rao 	m->order = VM_NFREEORDER;
414a3870a18SJohn Baldwin }
415a3870a18SJohn Baldwin 
41611752d88SAlan Cox /*
41711752d88SAlan Cox  * Create a physical memory segment.
41811752d88SAlan Cox  */
41911752d88SAlan Cox static void
420d866a563SAlan Cox _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
42111752d88SAlan Cox {
42211752d88SAlan Cox 	struct vm_phys_seg *seg;
42311752d88SAlan Cox 
42411752d88SAlan Cox 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
42511752d88SAlan Cox 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
426ef435ae7SJeff Roberson 	KASSERT(domain >= 0 && domain < vm_ndomains,
4277e226537SAttilio Rao 	    ("vm_phys_create_seg: invalid domain provided"));
42811752d88SAlan Cox 	seg = &vm_phys_segs[vm_phys_nsegs++];
429271f0f12SAlan Cox 	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
430271f0f12SAlan Cox 		*seg = *(seg - 1);
431271f0f12SAlan Cox 		seg--;
432271f0f12SAlan Cox 	}
43311752d88SAlan Cox 	seg->start = start;
43411752d88SAlan Cox 	seg->end = end;
435a3870a18SJohn Baldwin 	seg->domain = domain;
43611752d88SAlan Cox }
43711752d88SAlan Cox 
438a3870a18SJohn Baldwin static void
439d866a563SAlan Cox vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
440a3870a18SJohn Baldwin {
441b6715dabSJeff Roberson #ifdef NUMA
442a3870a18SJohn Baldwin 	int i;
443a3870a18SJohn Baldwin 
444a3870a18SJohn Baldwin 	if (mem_affinity == NULL) {
445d866a563SAlan Cox 		_vm_phys_create_seg(start, end, 0);
446a3870a18SJohn Baldwin 		return;
447a3870a18SJohn Baldwin 	}
448a3870a18SJohn Baldwin 
449a3870a18SJohn Baldwin 	for (i = 0;; i++) {
450a3870a18SJohn Baldwin 		if (mem_affinity[i].end == 0)
451a3870a18SJohn Baldwin 			panic("Reached end of affinity info");
452a3870a18SJohn Baldwin 		if (mem_affinity[i].end <= start)
453a3870a18SJohn Baldwin 			continue;
454a3870a18SJohn Baldwin 		if (mem_affinity[i].start > start)
455a3870a18SJohn Baldwin 			panic("No affinity info for start %jx",
456a3870a18SJohn Baldwin 			    (uintmax_t)start);
457a3870a18SJohn Baldwin 		if (mem_affinity[i].end >= end) {
458d866a563SAlan Cox 			_vm_phys_create_seg(start, end,
459a3870a18SJohn Baldwin 			    mem_affinity[i].domain);
460a3870a18SJohn Baldwin 			break;
461a3870a18SJohn Baldwin 		}
462d866a563SAlan Cox 		_vm_phys_create_seg(start, mem_affinity[i].end,
463a3870a18SJohn Baldwin 		    mem_affinity[i].domain);
464a3870a18SJohn Baldwin 		start = mem_affinity[i].end;
465a3870a18SJohn Baldwin 	}
46662d70a81SJohn Baldwin #else
46762d70a81SJohn Baldwin 	_vm_phys_create_seg(start, end, 0);
46862d70a81SJohn Baldwin #endif
469a3870a18SJohn Baldwin }
470a3870a18SJohn Baldwin 
47111752d88SAlan Cox /*
472271f0f12SAlan Cox  * Add a physical memory segment.
473271f0f12SAlan Cox  */
474271f0f12SAlan Cox void
475271f0f12SAlan Cox vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
476271f0f12SAlan Cox {
477d866a563SAlan Cox 	vm_paddr_t paddr;
478271f0f12SAlan Cox 
479271f0f12SAlan Cox 	KASSERT((start & PAGE_MASK) == 0,
480271f0f12SAlan Cox 	    ("vm_phys_add_seg: start is not page aligned"));
481271f0f12SAlan Cox 	KASSERT((end & PAGE_MASK) == 0,
482271f0f12SAlan Cox 	    ("vm_phys_add_seg: end is not page aligned"));
483d866a563SAlan Cox 
484d866a563SAlan Cox 	/*
485d866a563SAlan Cox 	 * Split the physical memory segment if it spans two or more free
486d866a563SAlan Cox 	 * list boundaries.
487d866a563SAlan Cox 	 */
488d866a563SAlan Cox 	paddr = start;
489d866a563SAlan Cox #ifdef	VM_FREELIST_LOWMEM
490d866a563SAlan Cox 	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
491d866a563SAlan Cox 		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
492d866a563SAlan Cox 		paddr = VM_LOWMEM_BOUNDARY;
493d866a563SAlan Cox 	}
494271f0f12SAlan Cox #endif
495d866a563SAlan Cox #ifdef	VM_FREELIST_DMA32
496d866a563SAlan Cox 	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
497d866a563SAlan Cox 		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
498d866a563SAlan Cox 		paddr = VM_DMA32_BOUNDARY;
499d866a563SAlan Cox 	}
500d866a563SAlan Cox #endif
501d866a563SAlan Cox 	vm_phys_create_seg(paddr, end);
502271f0f12SAlan Cox }
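
/*
 * Illustrative example (assuming a platform that defines VM_FREELIST_LOWMEM
 * with a 16M VM_LOWMEM_BOUNDARY): vm_phys_add_seg(0x400000, 0x2000000) is
 * recorded as two segments, [0x400000, 0x1000000) and [0x1000000, 0x2000000),
 * so that each segment is covered by exactly one free list.
 */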
503271f0f12SAlan Cox 
504271f0f12SAlan Cox /*
50511752d88SAlan Cox  * Initialize the physical memory allocator.
506d866a563SAlan Cox  *
507d866a563SAlan Cox  * Requires that vm_page_array is initialized!
50811752d88SAlan Cox  */
50911752d88SAlan Cox void
51011752d88SAlan Cox vm_phys_init(void)
51111752d88SAlan Cox {
51211752d88SAlan Cox 	struct vm_freelist *fl;
51372aebdd7SAlan Cox 	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
51452526922SJohn Baldwin #if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
515d866a563SAlan Cox 	u_long npages;
51652526922SJohn Baldwin #endif
517d866a563SAlan Cox 	int dom, flind, freelist, oind, pind, segind;
51811752d88SAlan Cox 
519d866a563SAlan Cox 	/*
520d866a563SAlan Cox 	 * Compute the number of free lists, and generate the mapping from the
521d866a563SAlan Cox 	 * manifest constants VM_FREELIST_* to the free list indices.
522d866a563SAlan Cox 	 *
523d866a563SAlan Cox 	 * Initially, the entries of vm_freelist_to_flind[] are set to either
524d866a563SAlan Cox 	 * 0 or 1 to indicate which free lists should be created.
525d866a563SAlan Cox 	 */
52652526922SJohn Baldwin #ifdef	VM_DMA32_NPAGES_THRESHOLD
527d866a563SAlan Cox 	npages = 0;
52852526922SJohn Baldwin #endif
529d866a563SAlan Cox 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
530d866a563SAlan Cox 		seg = &vm_phys_segs[segind];
531d866a563SAlan Cox #ifdef	VM_FREELIST_LOWMEM
532d866a563SAlan Cox 		if (seg->end <= VM_LOWMEM_BOUNDARY)
533d866a563SAlan Cox 			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
534d866a563SAlan Cox 		else
535d866a563SAlan Cox #endif
536d866a563SAlan Cox #ifdef	VM_FREELIST_DMA32
537d866a563SAlan Cox 		if (
538d866a563SAlan Cox #ifdef	VM_DMA32_NPAGES_THRESHOLD
539d866a563SAlan Cox 		    /*
540d866a563SAlan Cox 		     * Create the DMA32 free list only if the amount of
541d866a563SAlan Cox 		     * physical memory above physical address 4G exceeds the
542d866a563SAlan Cox 		     * given threshold.
543d866a563SAlan Cox 		     */
544d866a563SAlan Cox 		    npages > VM_DMA32_NPAGES_THRESHOLD &&
545d866a563SAlan Cox #endif
546d866a563SAlan Cox 		    seg->end <= VM_DMA32_BOUNDARY)
547d866a563SAlan Cox 			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
548d866a563SAlan Cox 		else
549d866a563SAlan Cox #endif
550d866a563SAlan Cox 		{
55152526922SJohn Baldwin #ifdef	VM_DMA32_NPAGES_THRESHOLD
552d866a563SAlan Cox 			npages += atop(seg->end - seg->start);
55352526922SJohn Baldwin #endif
554d866a563SAlan Cox 			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
555d866a563SAlan Cox 		}
556d866a563SAlan Cox 	}
557d866a563SAlan Cox 	/* Change each entry into a running total of the free lists. */
558d866a563SAlan Cox 	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
559d866a563SAlan Cox 		vm_freelist_to_flind[freelist] +=
560d866a563SAlan Cox 		    vm_freelist_to_flind[freelist - 1];
561d866a563SAlan Cox 	}
562d866a563SAlan Cox 	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
563d866a563SAlan Cox 	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
564d866a563SAlan Cox 	/* Change each entry into a free list index. */
565d866a563SAlan Cox 	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
566d866a563SAlan Cox 		vm_freelist_to_flind[freelist]--;
567d866a563SAlan Cox 
568d866a563SAlan Cox 	/*
569d866a563SAlan Cox 	 * Initialize the first_page and free_queues fields of each physical
570d866a563SAlan Cox 	 * memory segment.
571d866a563SAlan Cox 	 */
572271f0f12SAlan Cox #ifdef VM_PHYSSEG_SPARSE
573d866a563SAlan Cox 	npages = 0;
57411752d88SAlan Cox #endif
575271f0f12SAlan Cox 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
576271f0f12SAlan Cox 		seg = &vm_phys_segs[segind];
577271f0f12SAlan Cox #ifdef VM_PHYSSEG_SPARSE
578d866a563SAlan Cox 		seg->first_page = &vm_page_array[npages];
579d866a563SAlan Cox 		npages += atop(seg->end - seg->start);
580271f0f12SAlan Cox #else
581271f0f12SAlan Cox 		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
58211752d88SAlan Cox #endif
583d866a563SAlan Cox #ifdef	VM_FREELIST_LOWMEM
584d866a563SAlan Cox 		if (seg->end <= VM_LOWMEM_BOUNDARY) {
585d866a563SAlan Cox 			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
586d866a563SAlan Cox 			KASSERT(flind >= 0,
587d866a563SAlan Cox 			    ("vm_phys_init: LOWMEM flind < 0"));
588d866a563SAlan Cox 		} else
589d866a563SAlan Cox #endif
590d866a563SAlan Cox #ifdef	VM_FREELIST_DMA32
591d866a563SAlan Cox 		if (seg->end <= VM_DMA32_BOUNDARY) {
592d866a563SAlan Cox 			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
593d866a563SAlan Cox 			KASSERT(flind >= 0,
594d866a563SAlan Cox 			    ("vm_phys_init: DMA32 flind < 0"));
595d866a563SAlan Cox 		} else
596d866a563SAlan Cox #endif
597d866a563SAlan Cox 		{
598d866a563SAlan Cox 			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
599d866a563SAlan Cox 			KASSERT(flind >= 0,
600d866a563SAlan Cox 			    ("vm_phys_init: DEFAULT flind < 0"));
60111752d88SAlan Cox 		}
602d866a563SAlan Cox 		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
603d866a563SAlan Cox 	}
604d866a563SAlan Cox 
605d866a563SAlan Cox 	/*
60672aebdd7SAlan Cox 	 * Coalesce physical memory segments that are contiguous and share the
60772aebdd7SAlan Cox 	 * same per-domain free queues.
60872aebdd7SAlan Cox 	 */
60972aebdd7SAlan Cox 	prev_seg = vm_phys_segs;
61072aebdd7SAlan Cox 	seg = &vm_phys_segs[1];
61172aebdd7SAlan Cox 	end_seg = &vm_phys_segs[vm_phys_nsegs];
61272aebdd7SAlan Cox 	while (seg < end_seg) {
61372aebdd7SAlan Cox 		if (prev_seg->end == seg->start &&
61472aebdd7SAlan Cox 		    prev_seg->free_queues == seg->free_queues) {
61572aebdd7SAlan Cox 			prev_seg->end = seg->end;
61672aebdd7SAlan Cox 			KASSERT(prev_seg->domain == seg->domain,
61772aebdd7SAlan Cox 			    ("vm_phys_init: free queues cannot span domains"));
61872aebdd7SAlan Cox 			vm_phys_nsegs--;
61972aebdd7SAlan Cox 			end_seg--;
62072aebdd7SAlan Cox 			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
62172aebdd7SAlan Cox 				*tmp_seg = *(tmp_seg + 1);
62272aebdd7SAlan Cox 		} else {
62372aebdd7SAlan Cox 			prev_seg = seg;
62472aebdd7SAlan Cox 			seg++;
62572aebdd7SAlan Cox 		}
62672aebdd7SAlan Cox 	}
62772aebdd7SAlan Cox 
62872aebdd7SAlan Cox 	/*
629d866a563SAlan Cox 	 * Initialize the free queues.
630d866a563SAlan Cox 	 */
6317e226537SAttilio Rao 	for (dom = 0; dom < vm_ndomains; dom++) {
63211752d88SAlan Cox 		for (flind = 0; flind < vm_nfreelists; flind++) {
63311752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
6347e226537SAttilio Rao 				fl = vm_phys_free_queues[dom][flind][pind];
63511752d88SAlan Cox 				for (oind = 0; oind < VM_NFREEORDER; oind++)
63611752d88SAlan Cox 					TAILQ_INIT(&fl[oind].pl);
63711752d88SAlan Cox 			}
63811752d88SAlan Cox 		}
639a3870a18SJohn Baldwin 	}
640d866a563SAlan Cox 
641b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
642b16b4c22SMark Johnston 	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
643b16b4c22SMark Johnston #else
644b16b4c22SMark Johnston 	vm_default_freepool = VM_FREEPOOL_DEFAULT;
645b16b4c22SMark Johnston #endif
646b16b4c22SMark Johnston 
64738d6b2dcSRoger Pau Monné 	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
64811752d88SAlan Cox }
64911752d88SAlan Cox 
65011752d88SAlan Cox /*
651662e7fa8SMark Johnston  * Register info about the NUMA topology of the system.
652662e7fa8SMark Johnston  *
653662e7fa8SMark Johnston  * Invoked by platform-dependent code prior to vm_phys_init().
654662e7fa8SMark Johnston  */
655662e7fa8SMark Johnston void
656cb20a74cSStephen J. Kiernan vm_phys_register_domains(int ndomains __numa_used,
657cb20a74cSStephen J. Kiernan     struct mem_affinity *affinity __numa_used, int *locality __numa_used)
658662e7fa8SMark Johnston {
659662e7fa8SMark Johnston #ifdef NUMA
660c415cfc8SZhenlei Huang 	int i;
661662e7fa8SMark Johnston 
662b61f3142SMark Johnston 	/*
663b61f3142SMark Johnston 	 * For now the only override value that we support is 1, which
664b61f3142SMark Johnston 	 * effectively disables NUMA-awareness in the allocators.
665b61f3142SMark Johnston 	 */
666c415cfc8SZhenlei Huang 	TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
667c415cfc8SZhenlei Huang 	if (numa_disabled)
668b61f3142SMark Johnston 		ndomains = 1;
669b61f3142SMark Johnston 
670b61f3142SMark Johnston 	if (ndomains > 1) {
671662e7fa8SMark Johnston 		vm_ndomains = ndomains;
672662e7fa8SMark Johnston 		mem_affinity = affinity;
673662e7fa8SMark Johnston 		mem_locality = locality;
674b61f3142SMark Johnston 	}
675662e7fa8SMark Johnston 
676662e7fa8SMark Johnston 	for (i = 0; i < vm_ndomains; i++)
677662e7fa8SMark Johnston 		DOMAINSET_SET(i, &all_domains);
678662e7fa8SMark Johnston #endif
679662e7fa8SMark Johnston }
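
/*
 * Illustrative registration sketch (made-up addresses and a hypothetical
 * 'locality' matrix): platform code for a two-domain machine might pass
 *
 *	static struct mem_affinity affinity[] = {
 *		{ .start = 0x0,         .end = 0x100000000, .domain = 0 },
 *		{ .start = 0x100000000, .end = 0x200000000, .domain = 1 },
 *		{ 0 },
 *	};
 *
 *	vm_phys_register_domains(2, affinity, locality);
 *
 * where the affinity array is terminated by an entry whose end is zero and
 * locality points at a 2x2 row-major distance matrix.
 */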
680662e7fa8SMark Johnston 
681662e7fa8SMark Johnston /*
68211752d88SAlan Cox  * Split a contiguous, power of two-sized set of physical pages.
683370a338aSAlan Cox  *
684370a338aSAlan Cox  * When this function is called by a page allocation function, the caller
685370a338aSAlan Cox  * should request insertion at the head unless the order [order, oind) queues
686370a338aSAlan Cox  * are known to be empty.  The objective is to reduce the likelihood of
687370a338aSAlan Cox  * long-term fragmentation by promoting contemporaneous allocation and
688370a338aSAlan Cox  * (hopefully) deallocation.
68911752d88SAlan Cox  */
69011752d88SAlan Cox static __inline void
691370a338aSAlan Cox vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
692*0078df5fSDoug Moore     int pool, int tail)
69311752d88SAlan Cox {
69411752d88SAlan Cox 	vm_page_t m_buddy;
69511752d88SAlan Cox 
69611752d88SAlan Cox 	while (oind > order) {
69711752d88SAlan Cox 		oind--;
69811752d88SAlan Cox 		m_buddy = &m[1 << oind];
69911752d88SAlan Cox 		KASSERT(m_buddy->order == VM_NFREEORDER,
70011752d88SAlan Cox 		    ("vm_phys_split_pages: page %p has unexpected order %d",
70111752d88SAlan Cox 		    m_buddy, m_buddy->order));
702*0078df5fSDoug Moore 		vm_freelist_add(fl, m_buddy, oind, pool, tail);
70311752d88SAlan Cox 	}
70411752d88SAlan Cox }
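
/*
 * Worked example (illustrative): splitting an order 3 chunk that starts at
 * page m to satisfy an order 0 request leaves the caller holding m and queues
 * the buddies &m[4] at order 2, &m[2] at order 1, and &m[1] at order 0 in the
 * given pool.
 */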
70511752d88SAlan Cox 
706d7ec4a88SMark Johnston static void
707*0078df5fSDoug Moore vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool,
708*0078df5fSDoug Moore     int tail)
709d7ec4a88SMark Johnston {
710d7ec4a88SMark Johnston 	KASSERT(order >= 0 && order < VM_NFREEORDER,
711d7ec4a88SMark Johnston 	    ("%s: invalid order %d", __func__, order));
712d7ec4a88SMark Johnston 
713*0078df5fSDoug Moore 	vm_freelist_add(fl, m, order, pool, tail);
714b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
715*0078df5fSDoug Moore 	if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) {
716b16b4c22SMark Johnston 		vm_page_t m_next;
717517c5854SMark Johnston 		vm_paddr_t pa;
718b16b4c22SMark Johnston 		int npages;
719b16b4c22SMark Johnston 
720b16b4c22SMark Johnston 		npages = 1 << order;
721b16b4c22SMark Johnston 		m_next = m + npages;
722517c5854SMark Johnston 		pa = m->phys_addr + ptoa(npages);
723517c5854SMark Johnston 		if (pa < vm_phys_segs[m->segind].end) {
724517c5854SMark Johnston 			vm_page_init_page(m_next, pa, m->segind,
725b16b4c22SMark Johnston 			    VM_FREEPOOL_LAZYINIT);
726b16b4c22SMark Johnston 		}
727517c5854SMark Johnston 	}
728b16b4c22SMark Johnston #endif
729d7ec4a88SMark Johnston }
730d7ec4a88SMark Johnston 
73111752d88SAlan Cox /*
732e77f4e7fSDoug Moore  * Add the physical pages [m, m + npages) at the beginning of a power-of-two
733e77f4e7fSDoug Moore  * aligned and sized set to the specified free list.
734e77f4e7fSDoug Moore  *
735e77f4e7fSDoug Moore  * When this function is called by a page allocation function, the caller
736e77f4e7fSDoug Moore  * known to be empty.  The objective is to reduce the likelihood of long-
737e77f4e7fSDoug Moore  * known to be empty.  The objective being to reduce the likelihood of long-
738e77f4e7fSDoug Moore  * term fragmentation by promoting contemporaneous allocation and (hopefully)
739e77f4e7fSDoug Moore  * deallocation.
740e77f4e7fSDoug Moore  *
741e77f4e7fSDoug Moore  * The physical page m's buddy must not be free.
742e77f4e7fSDoug Moore  */
743e77f4e7fSDoug Moore static void
744*0078df5fSDoug Moore vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
745*0078df5fSDoug Moore     int tail)
746e77f4e7fSDoug Moore {
747e77f4e7fSDoug Moore 	int order;
748e77f4e7fSDoug Moore 
749e77f4e7fSDoug Moore 	KASSERT(npages == 0 ||
750e77f4e7fSDoug Moore 	    (VM_PAGE_TO_PHYS(m) &
751543d55d7SDoug Moore 	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
752e77f4e7fSDoug Moore 	    ("%s: page %p and npages %u are misaligned",
753e77f4e7fSDoug Moore 	    __func__, m, npages));
754e77f4e7fSDoug Moore 	while (npages > 0) {
755e77f4e7fSDoug Moore 		KASSERT(m->order == VM_NFREEORDER,
756e77f4e7fSDoug Moore 		    ("%s: page %p has unexpected order %d",
757e77f4e7fSDoug Moore 		    __func__, m, m->order));
758543d55d7SDoug Moore 		order = ilog2(npages);
759e77f4e7fSDoug Moore 		KASSERT(order < VM_NFREEORDER,
760e77f4e7fSDoug Moore 		    ("%s: order %d is out of range", __func__, order));
761*0078df5fSDoug Moore 		vm_phys_enq_chunk(fl, m, order, pool, tail);
762e77f4e7fSDoug Moore 		m += 1 << order;
763e77f4e7fSDoug Moore 		npages -= 1 << order;
764e77f4e7fSDoug Moore 	}
765e77f4e7fSDoug Moore }
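
/*
 * Worked example (illustrative): with m aligned to a four-page boundary as
 * asserted above, vm_phys_enq_beg(m, 6, fl, pool, tail) queues &m[0] as an
 * order 2 chunk and then &m[4] as an order 1 chunk, i.e., the largest
 * buddy-sized runs are carved off the front of the range first.
 */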
766e77f4e7fSDoug Moore 
767e77f4e7fSDoug Moore /*
7687493904eSAlan Cox  * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
7697493904eSAlan Cox  * and sized set to the specified free list.
7707493904eSAlan Cox  *
7717493904eSAlan Cox  * When this function is called by a page allocation function, the caller
7727493904eSAlan Cox  * should request insertion at the head unless the lower-order queues are
7737493904eSAlan Cox  * known to be empty.  The objective is to reduce the likelihood of long-
7747493904eSAlan Cox  * term fragmentation by promoting contemporaneous allocation and (hopefully)
7757493904eSAlan Cox  * deallocation.
7767493904eSAlan Cox  *
777ccdb2827SDoug Moore  * If npages is zero, this function does nothing and ignores the physical page
778ccdb2827SDoug Moore  * parameter m.  Otherwise, the physical page m's buddy must not be free.
7797493904eSAlan Cox  */
780c9b06fa5SDoug Moore static vm_page_t
781*0078df5fSDoug Moore vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
782*0078df5fSDoug Moore     int tail)
7837493904eSAlan Cox {
7847493904eSAlan Cox 	int order;
7857493904eSAlan Cox 
786ccdb2827SDoug Moore 	KASSERT(npages == 0 ||
787ccdb2827SDoug Moore 	    ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
788543d55d7SDoug Moore 	    ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
7897493904eSAlan Cox 	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
7907493904eSAlan Cox 	    m, npages));
791c9b06fa5SDoug Moore 	while (npages > 0) {
7927493904eSAlan Cox 		KASSERT(m->order == VM_NFREEORDER,
7937493904eSAlan Cox 		    ("vm_phys_enq_range: page %p has unexpected order %d",
7947493904eSAlan Cox 		    m, m->order));
7957493904eSAlan Cox 		order = ffs(npages) - 1;
796*0078df5fSDoug Moore 		vm_phys_enq_chunk(fl, m, order, pool, tail);
797c9b06fa5SDoug Moore 		m += 1 << order;
798c9b06fa5SDoug Moore 		npages -= 1 << order;
799c9b06fa5SDoug Moore 	}
800c9b06fa5SDoug Moore 	return (m);
8017493904eSAlan Cox }
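
/*
 * Worked example (illustrative): with the end of the run buddy-aligned as
 * asserted above, vm_phys_enq_range(m, 6, fl, pool, tail) queues &m[0] as an
 * order 1 chunk and &m[2] as an order 2 chunk, then returns &m[6].  Unlike
 * vm_phys_enq_beg(), the smallest chunks are carved off the front first.
 */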
8027493904eSAlan Cox 
8037493904eSAlan Cox /*
804*0078df5fSDoug Moore  * Complete initialization of a contiguous, power of two-sized set of physical
805*0078df5fSDoug Moore  * pages.
806b16b4c22SMark Johnston  *
807b16b4c22SMark Johnston  * If the pages currently belong to the lazy init pool, then the corresponding
808b16b4c22SMark Johnston  * page structures must be initialized.  In this case it is assumed that the
809b16b4c22SMark Johnston  * first page in the run has already been initialized.
810e3537f92SDoug Moore  */
811e3537f92SDoug Moore static void
812*0078df5fSDoug Moore vm_phys_finish_init(vm_page_t m, int order)
813e3537f92SDoug Moore {
814b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
815b16b4c22SMark Johnston 	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
816b16b4c22SMark Johnston 		vm_paddr_t pa;
817b16b4c22SMark Johnston 		int segind;
818e3537f92SDoug Moore 
819b16b4c22SMark Johnston 		TSENTER();
820b16b4c22SMark Johnston 		pa = m->phys_addr + PAGE_SIZE;
821b16b4c22SMark Johnston 		segind = m->segind;
822b16b4c22SMark Johnston 		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
823b16b4c22SMark Johnston 		    m_tmp++, pa += PAGE_SIZE)
824*0078df5fSDoug Moore 			vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL);
825b16b4c22SMark Johnston 		TSEXIT();
826*0078df5fSDoug Moore 	}
827b16b4c22SMark Johnston #endif
828e3537f92SDoug Moore }
829e3537f92SDoug Moore 
830e3537f92SDoug Moore /*
83189ea39a7SAlan Cox  * Tries to allocate the specified number of pages from the specified pool
83289ea39a7SAlan Cox  * within the specified domain.  Returns the actual number of allocated pages
83389ea39a7SAlan Cox  * and a pointer to each page through the array ma[].
83489ea39a7SAlan Cox  *
83532d81f21SAlan Cox  * The returned pages may not be physically contiguous.  However, in contrast
83632d81f21SAlan Cox  * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
83732d81f21SAlan Cox  * calling this function once to allocate the desired number of pages will
838*0078df5fSDoug Moore  * avoid wasted time in vm_phys_split_pages().  The allocated pages have no
839*0078df5fSDoug Moore  * valid pool field set.
84089ea39a7SAlan Cox  *
84189ea39a7SAlan Cox  * The free page queues for the specified domain must be locked.
84289ea39a7SAlan Cox  */
84389ea39a7SAlan Cox int
84489ea39a7SAlan Cox vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
84589ea39a7SAlan Cox {
84689ea39a7SAlan Cox 	struct vm_freelist *alt, *fl;
84789ea39a7SAlan Cox 	vm_page_t m;
848c9b06fa5SDoug Moore 	int avail, end, flind, freelist, i, oind, pind;
84989ea39a7SAlan Cox 
85089ea39a7SAlan Cox 	KASSERT(domain >= 0 && domain < vm_ndomains,
85189ea39a7SAlan Cox 	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
852b16b4c22SMark Johnston 	KASSERT(vm_phys_pool_valid(pool),
85389ea39a7SAlan Cox 	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
85489ea39a7SAlan Cox 	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
85589ea39a7SAlan Cox 	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
85689ea39a7SAlan Cox 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
85789ea39a7SAlan Cox 	i = 0;
85889ea39a7SAlan Cox 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
85989ea39a7SAlan Cox 		flind = vm_freelist_to_flind[freelist];
86089ea39a7SAlan Cox 		if (flind < 0)
86189ea39a7SAlan Cox 			continue;
86289ea39a7SAlan Cox 		fl = vm_phys_free_queues[domain][flind][pool];
86389ea39a7SAlan Cox 		for (oind = 0; oind < VM_NFREEORDER; oind++) {
86489ea39a7SAlan Cox 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
86589ea39a7SAlan Cox 				vm_freelist_rem(fl, m, oind);
866c9b06fa5SDoug Moore 				avail = i + (1 << oind);
867c9b06fa5SDoug Moore 				end = imin(npages, avail);
868e3537f92SDoug Moore 				while (i < end)
86989ea39a7SAlan Cox 					ma[i++] = m++;
870c9b06fa5SDoug Moore 				if (i == npages) {
8717493904eSAlan Cox 					/*
872c9b06fa5SDoug Moore 					 * Return excess pages to fl.  Its order
873c9b06fa5SDoug Moore 					 * [0, oind) queues are empty.
8747493904eSAlan Cox 					 */
875*0078df5fSDoug Moore 					vm_phys_enq_range(m, avail - i, fl,
876*0078df5fSDoug Moore 					    pool, 1);
87789ea39a7SAlan Cox 					return (npages);
878c9b06fa5SDoug Moore 				}
87989ea39a7SAlan Cox 			}
88089ea39a7SAlan Cox 		}
88189ea39a7SAlan Cox 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
882b16b4c22SMark Johnston 			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
883b16b4c22SMark Johnston 			    pind++) {
88489ea39a7SAlan Cox 				alt = vm_phys_free_queues[domain][flind][pind];
88589ea39a7SAlan Cox 				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
88689ea39a7SAlan Cox 				    NULL) {
88789ea39a7SAlan Cox 					vm_freelist_rem(alt, m, oind);
888*0078df5fSDoug Moore 					vm_phys_finish_init(m, oind);
889c9b06fa5SDoug Moore 					avail = i + (1 << oind);
890c9b06fa5SDoug Moore 					end = imin(npages, avail);
891e3537f92SDoug Moore 					while (i < end)
89289ea39a7SAlan Cox 						ma[i++] = m++;
893c9b06fa5SDoug Moore 					if (i == npages) {
8947493904eSAlan Cox 						/*
8957493904eSAlan Cox 						 * Return excess pages to fl.
8967493904eSAlan Cox 						 * Its order [0, oind) queues
8977493904eSAlan Cox 						 * are empty.
8987493904eSAlan Cox 						 */
899c9b06fa5SDoug Moore 						vm_phys_enq_range(m, avail - i,
900*0078df5fSDoug Moore 						    fl, pool, 1);
90189ea39a7SAlan Cox 						return (npages);
902c9b06fa5SDoug Moore 					}
90389ea39a7SAlan Cox 				}
90489ea39a7SAlan Cox 			}
90589ea39a7SAlan Cox 		}
90689ea39a7SAlan Cox 	}
90789ea39a7SAlan Cox 	return (i);
90889ea39a7SAlan Cox }
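
/*
 * Usage sketch (illustrative only, not a complete caller): the target
 * domain's free queues must be locked around the call, and the caller must
 * cope with a short count:
 *
 *	vm_domain_free_lock(VM_DOMAIN(domain));
 *	count = vm_phys_alloc_npages(domain, VM_FREEPOOL_DEFAULT, 16, ma);
 *	vm_domain_free_unlock(VM_DOMAIN(domain));
 *	if (count < 16)
 *		...handle the partial allocation of 'count' pages...
 */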
90989ea39a7SAlan Cox 
91089ea39a7SAlan Cox /*
911d866a563SAlan Cox  * Allocate a contiguous, power of two-sized set of physical pages from the
912d866a563SAlan Cox  * specified free list.  The free list must be specified using one of the
913e3537f92SDoug Moore  * manifest constants VM_FREELIST_*.
914d866a563SAlan Cox  *
915d866a563SAlan Cox  * The free page queues must be locked.
91649ca10d4SJayachandran C.  */
9176aede562SDoug Moore static vm_page_t
9180db2102aSMichael Zhilin vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
91949ca10d4SJayachandran C. {
920ef435ae7SJeff Roberson 	struct vm_freelist *alt, *fl;
92111752d88SAlan Cox 	vm_page_t m;
9220db2102aSMichael Zhilin 	int oind, pind, flind;
92311752d88SAlan Cox 
924ef435ae7SJeff Roberson 	KASSERT(domain >= 0 && domain < vm_ndomains,
925ef435ae7SJeff Roberson 	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
926ef435ae7SJeff Roberson 	    domain));
9270db2102aSMichael Zhilin 	KASSERT(freelist < VM_NFREELIST,
928d866a563SAlan Cox 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
9295be93778SAndrew Turner 	    freelist));
930b16b4c22SMark Johnston 	KASSERT(vm_phys_pool_valid(pool),
93149ca10d4SJayachandran C. 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
93211752d88SAlan Cox 	KASSERT(order < VM_NFREEORDER,
93349ca10d4SJayachandran C. 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
9346520495aSAdrian Chadd 
9350db2102aSMichael Zhilin 	flind = vm_freelist_to_flind[freelist];
9360db2102aSMichael Zhilin 	/* Check if freelist is present */
9370db2102aSMichael Zhilin 	if (flind < 0)
9380db2102aSMichael Zhilin 		return (NULL);
9390db2102aSMichael Zhilin 
940e2068d0bSJeff Roberson 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
9417e226537SAttilio Rao 	fl = &vm_phys_free_queues[domain][flind][pool][0];
94211752d88SAlan Cox 	for (oind = order; oind < VM_NFREEORDER; oind++) {
94311752d88SAlan Cox 		m = TAILQ_FIRST(&fl[oind].pl);
94411752d88SAlan Cox 		if (m != NULL) {
9457e226537SAttilio Rao 			vm_freelist_rem(fl, m, oind);
946370a338aSAlan Cox 			/* The order [order, oind) queues are empty. */
947*0078df5fSDoug Moore 			vm_phys_split_pages(m, oind, fl, order, pool, 1);
94811752d88SAlan Cox 			return (m);
94911752d88SAlan Cox 		}
95011752d88SAlan Cox 	}
95111752d88SAlan Cox 
95211752d88SAlan Cox 	/*
95311752d88SAlan Cox 	 * The given pool was empty.  Find the largest
95411752d88SAlan Cox 	 * contiguous, power-of-two-sized set of pages in any
95511752d88SAlan Cox 	 * pool.  Transfer these pages to the given pool, and
95611752d88SAlan Cox 	 * use them to satisfy the allocation.
95711752d88SAlan Cox 	 */
95811752d88SAlan Cox 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
959b16b4c22SMark Johnston 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
9607e226537SAttilio Rao 			alt = &vm_phys_free_queues[domain][flind][pind][0];
96111752d88SAlan Cox 			m = TAILQ_FIRST(&alt[oind].pl);
96211752d88SAlan Cox 			if (m != NULL) {
9637e226537SAttilio Rao 				vm_freelist_rem(alt, m, oind);
964*0078df5fSDoug Moore 				vm_phys_finish_init(m, oind);
965370a338aSAlan Cox 				/* The order [order, oind) queues are empty. */
966*0078df5fSDoug Moore 				vm_phys_split_pages(m, oind, fl, order, pool, 1);
96711752d88SAlan Cox 				return (m);
96811752d88SAlan Cox 			}
96911752d88SAlan Cox 		}
97011752d88SAlan Cox 	}
97111752d88SAlan Cox 	return (NULL);
97211752d88SAlan Cox }
97311752d88SAlan Cox 
97411752d88SAlan Cox /*
9756aede562SDoug Moore  * Allocate a contiguous, power of two-sized set of physical pages
9766aede562SDoug Moore  * from the free lists.
9776aede562SDoug Moore  *
9786aede562SDoug Moore  * The free page queues must be locked.
9796aede562SDoug Moore  */
9806aede562SDoug Moore vm_page_t
9816aede562SDoug Moore vm_phys_alloc_pages(int domain, int pool, int order)
9826aede562SDoug Moore {
9836aede562SDoug Moore 	vm_page_t m;
9846aede562SDoug Moore 	int freelist;
9856aede562SDoug Moore 
9866aede562SDoug Moore 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
9876aede562SDoug Moore 		m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
9886aede562SDoug Moore 		if (m != NULL)
9896aede562SDoug Moore 			return (m);
9906aede562SDoug Moore 	}
9916aede562SDoug Moore 	return (NULL);
9926aede562SDoug Moore }
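
/*
 * Usage sketch (illustrative): with the domain's free queues locked, a caller
 * needing a naturally aligned run of 2^4 pages from the default pool would do
 *
 *	m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 4);
 *
 * and receive NULL only if no free list in the domain holds a chunk of order
 * 4 or larger in any available pool.
 */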
9936aede562SDoug Moore 
9946aede562SDoug Moore /*
99569cbb187SMark Johnston  * Find the vm_page corresponding to the given physical address, which must lie
99669cbb187SMark Johnston  * within the given physical memory segment.
99769cbb187SMark Johnston  */
99869cbb187SMark Johnston vm_page_t
99969cbb187SMark Johnston vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
100069cbb187SMark Johnston {
100169cbb187SMark Johnston 	KASSERT(pa >= seg->start && pa < seg->end,
100269cbb187SMark Johnston 	    ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));
100369cbb187SMark Johnston 
100469cbb187SMark Johnston 	return (&seg->first_page[atop(pa - seg->start)]);
100569cbb187SMark Johnston }
100669cbb187SMark Johnston 
100769cbb187SMark Johnston /*
100811752d88SAlan Cox  * Find the vm_page corresponding to the given physical address.
100911752d88SAlan Cox  */
101011752d88SAlan Cox vm_page_t
101111752d88SAlan Cox vm_phys_paddr_to_vm_page(vm_paddr_t pa)
101211752d88SAlan Cox {
101311752d88SAlan Cox 	struct vm_phys_seg *seg;
101411752d88SAlan Cox 
10159e817428SDoug Moore 	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
101669cbb187SMark Johnston 		return (vm_phys_seg_paddr_to_vm_page(seg, pa));
1017f06a3a36SAndrew Thompson 	return (NULL);
101811752d88SAlan Cox }
101911752d88SAlan Cox 
1020b6de32bdSKonstantin Belousov vm_page_t
1021b6de32bdSKonstantin Belousov vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
1022b6de32bdSKonstantin Belousov {
102338d6b2dcSRoger Pau Monné 	struct vm_phys_fictitious_seg tmp, *seg;
1024b6de32bdSKonstantin Belousov 	vm_page_t m;
1025b6de32bdSKonstantin Belousov 
1026b6de32bdSKonstantin Belousov 	m = NULL;
102738d6b2dcSRoger Pau Monné 	tmp.start = pa;
102838d6b2dcSRoger Pau Monné 	tmp.end = 0;
102938d6b2dcSRoger Pau Monné 
103038d6b2dcSRoger Pau Monné 	rw_rlock(&vm_phys_fictitious_reg_lock);
103138d6b2dcSRoger Pau Monné 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
103238d6b2dcSRoger Pau Monné 	rw_runlock(&vm_phys_fictitious_reg_lock);
103338d6b2dcSRoger Pau Monné 	if (seg == NULL)
103438d6b2dcSRoger Pau Monné 		return (NULL);
103538d6b2dcSRoger Pau Monné 
1036b6de32bdSKonstantin Belousov 	m = &seg->first_page[atop(pa - seg->start)];
103738d6b2dcSRoger Pau Monné 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
103838d6b2dcSRoger Pau Monné 
1039b6de32bdSKonstantin Belousov 	return (m);
1040b6de32bdSKonstantin Belousov }
1041b6de32bdSKonstantin Belousov 
10425ebe728dSRoger Pau Monné static inline void
10435ebe728dSRoger Pau Monné vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
10445ebe728dSRoger Pau Monné     long page_count, vm_memattr_t memattr)
10455ebe728dSRoger Pau Monné {
10465ebe728dSRoger Pau Monné 	long i;
10475ebe728dSRoger Pau Monné 
1048f93f7cf1SMark Johnston 	bzero(range, page_count * sizeof(*range));
10495ebe728dSRoger Pau Monné 	for (i = 0; i < page_count; i++) {
10505ebe728dSRoger Pau Monné 		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
10515ebe728dSRoger Pau Monné 		range[i].oflags &= ~VPO_UNMANAGED;
10525ebe728dSRoger Pau Monné 		range[i].busy_lock = VPB_UNBUSIED;
10535ebe728dSRoger Pau Monné 	}
10545ebe728dSRoger Pau Monné }
10555ebe728dSRoger Pau Monné 
1056b6de32bdSKonstantin Belousov int
1057b6de32bdSKonstantin Belousov vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
1058b6de32bdSKonstantin Belousov     vm_memattr_t memattr)
1059b6de32bdSKonstantin Belousov {
1060b6de32bdSKonstantin Belousov 	struct vm_phys_fictitious_seg *seg;
1061b6de32bdSKonstantin Belousov 	vm_page_t fp;
10625ebe728dSRoger Pau Monné 	long page_count;
1063b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
10645ebe728dSRoger Pau Monné 	long pi, pe;
10655ebe728dSRoger Pau Monné 	long dpage_count;
1066b6de32bdSKonstantin Belousov #endif
1067b6de32bdSKonstantin Belousov 
10685ebe728dSRoger Pau Monné 	KASSERT(start < end,
10695ebe728dSRoger Pau Monné 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
10705ebe728dSRoger Pau Monné 	    (uintmax_t)start, (uintmax_t)end));
10715ebe728dSRoger Pau Monné 
1072b6de32bdSKonstantin Belousov 	page_count = (end - start) / PAGE_SIZE;
1073b6de32bdSKonstantin Belousov 
1074b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
1075b6de32bdSKonstantin Belousov 	pi = atop(start);
10765ebe728dSRoger Pau Monné 	pe = atop(end);
10775ebe728dSRoger Pau Monné 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1078b6de32bdSKonstantin Belousov 		fp = &vm_page_array[pi - first_page];
10795ebe728dSRoger Pau Monné 		if ((pe - first_page) > vm_page_array_size) {
10805ebe728dSRoger Pau Monné 			/*
10815ebe728dSRoger Pau Monné 			 * We have a segment that starts inside
10825ebe728dSRoger Pau Monné 			 * of vm_page_array, but ends outside of it.
10835ebe728dSRoger Pau Monné 			 *
10845ebe728dSRoger Pau Monné 			 * Use vm_page_array pages for those that are
10855ebe728dSRoger Pau Monné 			 * inside of the vm_page_array range, and
10865ebe728dSRoger Pau Monné 			 * allocate the remaining ones.
10875ebe728dSRoger Pau Monné 			 */
10885ebe728dSRoger Pau Monné 			dpage_count = vm_page_array_size - (pi - first_page);
10895ebe728dSRoger Pau Monné 			vm_phys_fictitious_init_range(fp, start, dpage_count,
10905ebe728dSRoger Pau Monné 			    memattr);
10915ebe728dSRoger Pau Monné 			page_count -= dpage_count;
10925ebe728dSRoger Pau Monné 			start += ptoa(dpage_count);
10935ebe728dSRoger Pau Monné 			goto alloc;
10945ebe728dSRoger Pau Monné 		}
10955ebe728dSRoger Pau Monné 		/*
10965ebe728dSRoger Pau Monné 		 * We can allocate the full range from vm_page_array,
10975ebe728dSRoger Pau Monné 		 * so there's no need to register the range in the tree.
10985ebe728dSRoger Pau Monné 		 */
10995ebe728dSRoger Pau Monné 		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
11005ebe728dSRoger Pau Monné 		return (0);
11015ebe728dSRoger Pau Monné 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
11025ebe728dSRoger Pau Monné 		/*
11035ebe728dSRoger Pau Monné 		 * We have a segment that ends inside of vm_page_array,
11045ebe728dSRoger Pau Monné 		 * but starts outside of it.
11055ebe728dSRoger Pau Monné 		 */
11065ebe728dSRoger Pau Monné 		fp = &vm_page_array[0];
11075ebe728dSRoger Pau Monné 		dpage_count = pe - first_page;
11085ebe728dSRoger Pau Monné 		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
11095ebe728dSRoger Pau Monné 		    memattr);
11105ebe728dSRoger Pau Monné 		end -= ptoa(dpage_count);
11115ebe728dSRoger Pau Monné 		page_count -= dpage_count;
11125ebe728dSRoger Pau Monné 		goto alloc;
11135ebe728dSRoger Pau Monné 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
11145ebe728dSRoger Pau Monné 		/*
11155ebe728dSRoger Pau Monné 		 * Trying to register a fictitious range that extends before
11165ebe728dSRoger Pau Monné 		 * and after vm_page_array.
11175ebe728dSRoger Pau Monné 		 */
11185ebe728dSRoger Pau Monné 		return (EINVAL);
11195ebe728dSRoger Pau Monné 	} else {
11205ebe728dSRoger Pau Monné alloc:
1121b6de32bdSKonstantin Belousov #endif
1122b6de32bdSKonstantin Belousov 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
1123f93f7cf1SMark Johnston 		    M_WAITOK);
11245ebe728dSRoger Pau Monné #ifdef VM_PHYSSEG_DENSE
1125b6de32bdSKonstantin Belousov 	}
11265ebe728dSRoger Pau Monné #endif
11275ebe728dSRoger Pau Monné 	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
112838d6b2dcSRoger Pau Monné 
112938d6b2dcSRoger Pau Monné 	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
1130b6de32bdSKonstantin Belousov 	seg->start = start;
1131b6de32bdSKonstantin Belousov 	seg->end = end;
1132b6de32bdSKonstantin Belousov 	seg->first_page = fp;
113338d6b2dcSRoger Pau Monné 
113438d6b2dcSRoger Pau Monné 	rw_wlock(&vm_phys_fictitious_reg_lock);
113538d6b2dcSRoger Pau Monné 	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
113638d6b2dcSRoger Pau Monné 	rw_wunlock(&vm_phys_fictitious_reg_lock);
113738d6b2dcSRoger Pau Monné 
1138b6de32bdSKonstantin Belousov 	return (0);
1139b6de32bdSKonstantin Belousov }
1140b6de32bdSKonstantin Belousov 
1141b6de32bdSKonstantin Belousov void
1142b6de32bdSKonstantin Belousov vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1143b6de32bdSKonstantin Belousov {
114438d6b2dcSRoger Pau Monné 	struct vm_phys_fictitious_seg *seg, tmp;
1145b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
11465ebe728dSRoger Pau Monné 	long pi, pe;
1147b6de32bdSKonstantin Belousov #endif
1148b6de32bdSKonstantin Belousov 
11495ebe728dSRoger Pau Monné 	KASSERT(start < end,
11505ebe728dSRoger Pau Monné 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
11515ebe728dSRoger Pau Monné 	    (uintmax_t)start, (uintmax_t)end));
11525ebe728dSRoger Pau Monné 
1153b6de32bdSKonstantin Belousov #ifdef VM_PHYSSEG_DENSE
1154b6de32bdSKonstantin Belousov 	pi = atop(start);
11555ebe728dSRoger Pau Monné 	pe = atop(end);
11565ebe728dSRoger Pau Monné 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
11575ebe728dSRoger Pau Monné 		if ((pe - first_page) <= vm_page_array_size) {
11585ebe728dSRoger Pau Monné 			/*
11595ebe728dSRoger Pau Monné 			 * This segment was allocated using vm_page_array
11605ebe728dSRoger Pau Monné 			 * only; there's nothing to do since those pages
11615ebe728dSRoger Pau Monné 			 * were never added to the tree.
11625ebe728dSRoger Pau Monné 			 */
11635ebe728dSRoger Pau Monné 			return;
11645ebe728dSRoger Pau Monné 		}
11655ebe728dSRoger Pau Monné 		/*
11665ebe728dSRoger Pau Monné 		 * We have a segment that starts inside
11675ebe728dSRoger Pau Monné 		 * of vm_page_array, but ends outside of it.
11685ebe728dSRoger Pau Monné 		 *
11695ebe728dSRoger Pau Monné 		 * Calculate how many pages were added to the
11705ebe728dSRoger Pau Monné 		 * tree and free them.
11715ebe728dSRoger Pau Monné 		 */
11725ebe728dSRoger Pau Monné 		start = ptoa(first_page + vm_page_array_size);
11735ebe728dSRoger Pau Monné 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
11745ebe728dSRoger Pau Monné 		/*
11755ebe728dSRoger Pau Monné 		 * We have a segment that ends inside of vm_page_array,
11765ebe728dSRoger Pau Monné 		 * but starts outside of it.
11775ebe728dSRoger Pau Monné 		 */
11785ebe728dSRoger Pau Monné 		end = ptoa(first_page);
11795ebe728dSRoger Pau Monné 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
11805ebe728dSRoger Pau Monné 		/* Since it's not possible to register such a range, panic. */
11815ebe728dSRoger Pau Monné 		panic(
11825ebe728dSRoger Pau Monné 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
11835ebe728dSRoger Pau Monné 		    (uintmax_t)start, (uintmax_t)end);
11845ebe728dSRoger Pau Monné 	}
1185b6de32bdSKonstantin Belousov #endif
118638d6b2dcSRoger Pau Monné 	tmp.start = start;
118738d6b2dcSRoger Pau Monné 	tmp.end = 0;
1188b6de32bdSKonstantin Belousov 
118938d6b2dcSRoger Pau Monné 	rw_wlock(&vm_phys_fictitious_reg_lock);
119038d6b2dcSRoger Pau Monné 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
119138d6b2dcSRoger Pau Monné 	if (seg == NULL || seg->start != start || seg->end != end) {
119238d6b2dcSRoger Pau Monné 		rw_wunlock(&vm_phys_fictitious_reg_lock);
119338d6b2dcSRoger Pau Monné 		panic(
119438d6b2dcSRoger Pau Monné 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
119538d6b2dcSRoger Pau Monné 		    (uintmax_t)start, (uintmax_t)end);
119638d6b2dcSRoger Pau Monné 	}
119738d6b2dcSRoger Pau Monné 	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
119838d6b2dcSRoger Pau Monné 	rw_wunlock(&vm_phys_fictitious_reg_lock);
119938d6b2dcSRoger Pau Monné 	free(seg->first_page, M_FICT_PAGES);
120038d6b2dcSRoger Pau Monné 	free(seg, M_FICT_PAGES);
1201b6de32bdSKonstantin Belousov }
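
/*
 * Example usage (a sketch with placeholder values): a driver exposing a
 * 2 MB MMIO aperture as fictitious pages might pair the two functions
 * above as follows, assuming "base" holds the aperture's physical
 * address and errors are handled by the caller:
 *
 *	error = vm_phys_fictitious_reg_range(base, base + 2 * 1024 * 1024,
 *	    VM_MEMATTR_UNCACHEABLE);
 *	...
 *	vm_phys_fictitious_unreg_range(base, base + 2 * 1024 * 1024);
 */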
1202b6de32bdSKonstantin Belousov 
120311752d88SAlan Cox /*
1204e3537f92SDoug Moore  * Free a contiguous, power of two-sized set of physical pages.
1205*0078df5fSDoug Moore  * The pool parameter determines the destination pool for the freed pages.
12068941dc44SAlan Cox  *
12078941dc44SAlan Cox  * The free page queues must be locked.
120811752d88SAlan Cox  */
120911752d88SAlan Cox void
1210*0078df5fSDoug Moore vm_phys_free_pages(vm_page_t m, int pool, int order)
121111752d88SAlan Cox {
121211752d88SAlan Cox 	struct vm_freelist *fl;
121311752d88SAlan Cox 	struct vm_phys_seg *seg;
12145c1f2cc4SAlan Cox 	vm_paddr_t pa;
121511752d88SAlan Cox 	vm_page_t m_buddy;
121611752d88SAlan Cox 
121711752d88SAlan Cox 	KASSERT(m->order == VM_NFREEORDER,
1218*0078df5fSDoug Moore 	    ("%s: page %p has unexpected order %d",
1219*0078df5fSDoug Moore 	    __func__, m, m->order));
1220*0078df5fSDoug Moore 	KASSERT(vm_phys_pool_valid(pool),
1221*0078df5fSDoug Moore 	    ("%s: unexpected pool param %d", __func__, pool));
122211752d88SAlan Cox 	KASSERT(order < VM_NFREEORDER,
1223*0078df5fSDoug Moore 	    ("%s: order %d is out of range", __func__, order));
122411752d88SAlan Cox 	seg = &vm_phys_segs[m->segind];
1225e2068d0bSJeff Roberson 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
12265c1f2cc4SAlan Cox 	if (order < VM_NFREEORDER - 1) {
12275c1f2cc4SAlan Cox 		pa = VM_PAGE_TO_PHYS(m);
12285c1f2cc4SAlan Cox 		do {
12295c1f2cc4SAlan Cox 			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
12305c1f2cc4SAlan Cox 			if (pa < seg->start || pa >= seg->end)
123111752d88SAlan Cox 				break;
123269cbb187SMark Johnston 			m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
123311752d88SAlan Cox 			if (m_buddy->order != order)
123411752d88SAlan Cox 				break;
123511752d88SAlan Cox 			fl = (*seg->free_queues)[m_buddy->pool];
12367e226537SAttilio Rao 			vm_freelist_rem(fl, m_buddy, order);
1237*0078df5fSDoug Moore 			vm_phys_finish_init(m_buddy, order);
123811752d88SAlan Cox 			order++;
12395c1f2cc4SAlan Cox 			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
124069cbb187SMark Johnston 			m = vm_phys_seg_paddr_to_vm_page(seg, pa);
12415c1f2cc4SAlan Cox 		} while (order < VM_NFREEORDER - 1);
124211752d88SAlan Cox 	}
1243*0078df5fSDoug Moore 	fl = (*seg->free_queues)[pool];
1244*0078df5fSDoug Moore 	vm_freelist_add(fl, m, order, pool, 1);
124511752d88SAlan Cox }
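
/*
 * For example, assuming 4 KB pages (PAGE_SHIFT == 12): freeing the
 * order-0 page at physical address 0x5000 first probes its buddy at
 * 0x4000 (0x5000 ^ 0x1000).  If that buddy is free at order 0, the two
 * merge into the order-1 block at 0x4000, whose own buddy at 0x6000 is
 * probed next, and so on until a buddy is missing or the maximum order
 * is reached.
 */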
124611752d88SAlan Cox 
1247b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
1248b16b4c22SMark Johnston /*
1249b16b4c22SMark Johnston  * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
1250b16b4c22SMark Johnston  * them to the default pool.  This is a prerequisite for some rare operations
1251b16b4c22SMark Johnston  * which need to scan the page array and thus depend on all pages being
1252b16b4c22SMark Johnston  * initialized.
1253b16b4c22SMark Johnston  */
1254b16b4c22SMark Johnston static void
1255b16b4c22SMark Johnston vm_phys_lazy_init_domain(int domain, bool locked)
1256b16b4c22SMark Johnston {
1257b16b4c22SMark Johnston 	static bool initdone[MAXMEMDOM];
1258b16b4c22SMark Johnston 	struct vm_domain *vmd;
1259b16b4c22SMark Johnston 	struct vm_freelist *fl;
1260b16b4c22SMark Johnston 	vm_page_t m;
1261b16b4c22SMark Johnston 	int pind;
1262b16b4c22SMark Johnston 	bool unlocked;
1263b16b4c22SMark Johnston 
1264b16b4c22SMark Johnston 	if (__predict_true(atomic_load_bool(&initdone[domain])))
1265b16b4c22SMark Johnston 		return;
1266b16b4c22SMark Johnston 
1267b16b4c22SMark Johnston 	vmd = VM_DOMAIN(domain);
1268b16b4c22SMark Johnston 	if (locked)
1269b16b4c22SMark Johnston 		vm_domain_free_assert_locked(vmd);
1270b16b4c22SMark Johnston 	else
1271b16b4c22SMark Johnston 		vm_domain_free_lock(vmd);
1272b16b4c22SMark Johnston 	if (atomic_load_bool(&initdone[domain]))
1273b16b4c22SMark Johnston 		goto out;
1274b16b4c22SMark Johnston 	pind = VM_FREEPOOL_LAZYINIT;
1275b16b4c22SMark Johnston 	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
1276b16b4c22SMark Johnston 		int flind;
1277b16b4c22SMark Johnston 
1278b16b4c22SMark Johnston 		flind = vm_freelist_to_flind[freelist];
1279b16b4c22SMark Johnston 		if (flind < 0)
1280b16b4c22SMark Johnston 			continue;
1281b16b4c22SMark Johnston 		fl = vm_phys_free_queues[domain][flind][pind];
1282b16b4c22SMark Johnston 		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
1283b16b4c22SMark Johnston 			if (atomic_load_int(&fl[oind].lcnt) == 0)
1284b16b4c22SMark Johnston 				continue;
1285b16b4c22SMark Johnston 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
1286b16b4c22SMark Johnston 				/*
1287b16b4c22SMark Johnston 				 * Avoid holding the lock across the
1288b16b4c22SMark Johnston 				 * initialization unless there's a free page
1289b16b4c22SMark Johnston 				 * shortage.
1290b16b4c22SMark Johnston 				 */
1291b16b4c22SMark Johnston 				vm_freelist_rem(fl, m, oind);
1292b16b4c22SMark Johnston 				unlocked = vm_domain_allocate(vmd,
1293b16b4c22SMark Johnston 				    VM_ALLOC_NORMAL, 1 << oind);
1294b16b4c22SMark Johnston 				if (unlocked)
1295b16b4c22SMark Johnston 					vm_domain_free_unlock(vmd);
1296*0078df5fSDoug Moore 				vm_phys_finish_init(m, oind);
1297b16b4c22SMark Johnston 				if (unlocked) {
1298b16b4c22SMark Johnston 					vm_domain_freecnt_inc(vmd, 1 << oind);
1299b16b4c22SMark Johnston 					vm_domain_free_lock(vmd);
1300b16b4c22SMark Johnston 				}
1301*0078df5fSDoug Moore 				vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT,
1302*0078df5fSDoug Moore 				    oind);
1303b16b4c22SMark Johnston 			}
1304b16b4c22SMark Johnston 		}
1305b16b4c22SMark Johnston 	}
1306b16b4c22SMark Johnston 	atomic_store_bool(&initdone[domain], true);
1307b16b4c22SMark Johnston out:
1308b16b4c22SMark Johnston 	if (!locked)
1309b16b4c22SMark Johnston 		vm_domain_free_unlock(vmd);
1310b16b4c22SMark Johnston }
1311b16b4c22SMark Johnston 
1312b16b4c22SMark Johnston static void
1313b16b4c22SMark Johnston vm_phys_lazy_init(void)
1314b16b4c22SMark Johnston {
1315b16b4c22SMark Johnston 	for (int domain = 0; domain < vm_ndomains; domain++)
1316b16b4c22SMark Johnston 		vm_phys_lazy_init_domain(domain, false);
1317b16b4c22SMark Johnston 	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
1318b16b4c22SMark Johnston }
1319b16b4c22SMark Johnston 
1320b16b4c22SMark Johnston static void
1321b16b4c22SMark Johnston vm_phys_lazy_init_kthr(void *arg __unused)
1322b16b4c22SMark Johnston {
1323b16b4c22SMark Johnston 	vm_phys_lazy_init();
1324b16b4c22SMark Johnston 	kthread_exit();
1325b16b4c22SMark Johnston }
1326b16b4c22SMark Johnston 
1327b16b4c22SMark Johnston static void
1328b16b4c22SMark Johnston vm_phys_lazy_sysinit(void *arg __unused)
1329b16b4c22SMark Johnston {
1330b16b4c22SMark Johnston 	struct thread *td;
1331b16b4c22SMark Johnston 	int error;
1332b16b4c22SMark Johnston 
1333b16b4c22SMark Johnston 	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
1334b16b4c22SMark Johnston 	    RFSTOPPED, 0, "vmlazyinit");
1335b16b4c22SMark Johnston 	if (error == 0) {
1336b16b4c22SMark Johnston 		thread_lock(td);
1337b16b4c22SMark Johnston 		sched_prio(td, PRI_MIN_IDLE);
1338b16b4c22SMark Johnston 		sched_add(td, SRQ_BORING);
1339b16b4c22SMark Johnston 	} else {
1340b16b4c22SMark Johnston 		printf("%s: could not create lazy init thread: %d\n",
1341b16b4c22SMark Johnston 		    __func__, error);
1342b16b4c22SMark Johnston 		vm_phys_lazy_init();
1343b16b4c22SMark Johnston 	}
1344b16b4c22SMark Johnston }
1345b16b4c22SMark Johnston SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
1346b16b4c22SMark Johnston     NULL);
1347b16b4c22SMark Johnston #endif /* VM_FREEPOOL_LAZYINIT */
1348b16b4c22SMark Johnston 
134911752d88SAlan Cox /*
1350e3537f92SDoug Moore  * Free a contiguous, arbitrarily sized set of physical pages, without
1351*0078df5fSDoug Moore  * merging across set boundaries.  Assumes no pages have a valid pool field.
1352b8590daeSDoug Moore  *
1353b8590daeSDoug Moore  * The free page queues must be locked.
1354b8590daeSDoug Moore  */
1355b8590daeSDoug Moore void
1356*0078df5fSDoug Moore vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
1357b8590daeSDoug Moore {
1358b8590daeSDoug Moore 	struct vm_freelist *fl;
1359b8590daeSDoug Moore 	struct vm_phys_seg *seg;
1360b8590daeSDoug Moore 	vm_page_t m_end;
1361c9b06fa5SDoug Moore 	vm_paddr_t diff, lo;
1362b8590daeSDoug Moore 	int order;
1363b8590daeSDoug Moore 
1364b8590daeSDoug Moore 	/*
1365b8590daeSDoug Moore 	 * Avoid unnecessary coalescing by freeing the pages in the largest
1366b8590daeSDoug Moore 	 * possible power-of-two-sized subsets.
1367b8590daeSDoug Moore 	 */
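	/*
	 * For example (a sketch, assuming the maximum order is at least 5):
	 * freeing 35 pages starting at page frame 3 releases frames 3..31
	 * as blocks of increasing size (1, 4, 8 and 16 pages), no
	 * maximum-sized blocks, and finally frames 32..37 as blocks of
	 * decreasing size (4 and 2 pages).
	 */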
1368b8590daeSDoug Moore 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1369b8590daeSDoug Moore 	seg = &vm_phys_segs[m->segind];
1370*0078df5fSDoug Moore 	fl = (*seg->free_queues)[pool];
1371b8590daeSDoug Moore 	m_end = m + npages;
1372b8590daeSDoug Moore 	/* Free blocks of increasing size. */
13736dd15b7aSDoug Moore 	lo = atop(VM_PAGE_TO_PHYS(m));
1374c9b06fa5SDoug Moore 	if (m < m_end &&
1375c9b06fa5SDoug Moore 	    (diff = lo ^ (lo + npages - 1)) != 0) {
1376543d55d7SDoug Moore 		order = min(ilog2(diff), VM_NFREEORDER - 1);
1377*0078df5fSDoug Moore 		m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
1378*0078df5fSDoug Moore 		    pool, 1);
13795c1f2cc4SAlan Cox 	}
1380c9b06fa5SDoug Moore 
1381b8590daeSDoug Moore 	/* Free blocks of maximum size. */
1382c9b06fa5SDoug Moore 	order = VM_NFREEORDER - 1;
1383b8590daeSDoug Moore 	while (m + (1 << order) <= m_end) {
1384b8590daeSDoug Moore 		KASSERT(seg == &vm_phys_segs[m->segind],
1385b8590daeSDoug Moore 		    ("%s: page range [%p,%p) spans multiple segments",
1386b8590daeSDoug Moore 		    __func__, m_end - npages, m));
1387*0078df5fSDoug Moore 		vm_phys_enq_chunk(fl, m, order, pool, 1);
1388b8590daeSDoug Moore 		m += 1 << order;
1389b8590daeSDoug Moore 	}
1390b8590daeSDoug Moore 	/* Free blocks of diminishing size. */
1391*0078df5fSDoug Moore 	vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
1392b8590daeSDoug Moore }
1393b8590daeSDoug Moore 
1394b8590daeSDoug Moore /*
1395b8590daeSDoug Moore  * Free a contiguous, arbitrarily sized set of physical pages.
1396*0078df5fSDoug Moore  * Assumes that the pages have no valid pool field set.
1397*0078df5fSDoug Moore  * The pool parameter determines the destination pool for the freed pages.
1398b8590daeSDoug Moore  *
1399b8590daeSDoug Moore  * The free page queues must be locked.
1400b8590daeSDoug Moore  */
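/*
 * For example (a sketch, assuming a sufficiently large maximum order):
 * freeing 13 pages starting at page frame 6 enqueues frames 8..17 via
 * vm_phys_enqueue_contig(), then frees frames 6..7 as an order-1 block
 * and frame 18 as an order-0 block, both of which may merge with
 * neighboring free blocks.
 */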
1401b8590daeSDoug Moore void
1402*0078df5fSDoug Moore vm_phys_free_contig(vm_page_t m, int pool, u_long npages)
1403b8590daeSDoug Moore {
14046dd15b7aSDoug Moore 	vm_paddr_t lo;
1405b8590daeSDoug Moore 	vm_page_t m_start, m_end;
14066dd15b7aSDoug Moore 	unsigned max_order, order_start, order_end;
1407b8590daeSDoug Moore 
1408b8590daeSDoug Moore 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1409b8590daeSDoug Moore 
14106dd15b7aSDoug Moore 	lo = atop(VM_PAGE_TO_PHYS(m));
1411543d55d7SDoug Moore 	max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);
1412e3537f92SDoug Moore 
1413e3537f92SDoug Moore 	m_start = m;
14146dd15b7aSDoug Moore 	order_start = ffsll(lo) - 1;
14156dd15b7aSDoug Moore 	if (order_start < max_order)
1416b8590daeSDoug Moore 		m_start += 1 << order_start;
1417e3537f92SDoug Moore 	m_end = m + npages;
14186dd15b7aSDoug Moore 	order_end = ffsll(lo + npages) - 1;
14196dd15b7aSDoug Moore 	if (order_end < max_order)
1420b8590daeSDoug Moore 		m_end -= 1 << order_end;
1421b8590daeSDoug Moore 	/*
1422b8590daeSDoug Moore 	 * Avoid unnecessary coalescing by freeing the pages at the start and
1423b8590daeSDoug Moore 	 * end of the range last.
1424b8590daeSDoug Moore 	 */
1425b8590daeSDoug Moore 	if (m_start < m_end)
1426*0078df5fSDoug Moore 		vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
1427e3537f92SDoug Moore 	if (order_start < max_order)
1428*0078df5fSDoug Moore 		vm_phys_free_pages(m, pool, order_start);
1429e3537f92SDoug Moore 	if (order_end < max_order)
1430*0078df5fSDoug Moore 		vm_phys_free_pages(m_end, pool, order_end);
14315c1f2cc4SAlan Cox }
14325c1f2cc4SAlan Cox 
14335c1f2cc4SAlan Cox /*
14349e817428SDoug Moore  * Identify the first address range within segment segind or greater
14359e817428SDoug Moore  * that matches the domain, lies within the low/high range, and has
14369e817428SDoug Moore  * enough pages.  Return -1 if there is none.
1437c869e672SAlan Cox  */
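/*
 * A typical caller iterates over all matching ranges, e.g. (a sketch
 * with placeholder variables):
 *
 *	for (segind = 0; (segind = vm_phys_find_range(bounds, segind,
 *	    domain, npages, low, high)) != -1; segind++) {
 *		(examine the pages in [bounds[0], bounds[1]))
 *	}
 */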
14389e817428SDoug Moore int
14399e817428SDoug Moore vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
14409e817428SDoug Moore     u_long npages, vm_paddr_t low, vm_paddr_t high)
1441c869e672SAlan Cox {
14429e817428SDoug Moore 	vm_paddr_t pa_end, pa_start;
14439e817428SDoug Moore 	struct vm_phys_seg *end_seg, *seg;
1444c869e672SAlan Cox 
14459e817428SDoug Moore 	KASSERT(npages > 0, ("npages is zero"));
144658d42717SAlan Cox 	KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
14479e817428SDoug Moore 	end_seg = &vm_phys_segs[vm_phys_nsegs];
14489e817428SDoug Moore 	for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
14493f289c3fSJeff Roberson 		if (seg->domain != domain)
14503f289c3fSJeff Roberson 			continue;
1451c869e672SAlan Cox 		if (seg->start >= high)
14529e817428SDoug Moore 			return (-1);
14539e817428SDoug Moore 		pa_start = MAX(low, seg->start);
14549e817428SDoug Moore 		pa_end = MIN(high, seg->end);
14559e817428SDoug Moore 		if (pa_end - pa_start < ptoa(npages))
1456c869e672SAlan Cox 			continue;
1457b16b4c22SMark Johnston #ifdef VM_FREEPOOL_LAZYINIT
1458b16b4c22SMark Johnston 		/*
1459b16b4c22SMark Johnston 		 * The pages on the free lists must be initialized.
1460b16b4c22SMark Johnston 		 */
1461b16b4c22SMark Johnston 		vm_phys_lazy_init_domain(domain, false);
1462b16b4c22SMark Johnston #endif
146369cbb187SMark Johnston 		bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
1464fbff6d54SMark Johnston 		bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
14659e817428SDoug Moore 		return (seg - vm_phys_segs);
1466c869e672SAlan Cox 	}
14679e817428SDoug Moore 	return (-1);
1468c869e672SAlan Cox }
1469c869e672SAlan Cox 
1470c869e672SAlan Cox /*
14719742373aSAlan Cox  * Search the free lists for the page at the given physical address "pa".  If
14726062d9faSMark Johnston  * the search succeeds, remove that page from the free lists and return true.
14736062d9faSMark Johnston  * Otherwise, return false, indicating that the page is not in the free lists.
14747bfda801SAlan Cox  *
14757bfda801SAlan Cox  * The free page queues must be locked.
14767bfda801SAlan Cox  */
14776062d9faSMark Johnston bool
1478b16b4c22SMark Johnston vm_phys_unfree_page(vm_paddr_t pa)
14797bfda801SAlan Cox {
14807bfda801SAlan Cox 	struct vm_freelist *fl;
14817bfda801SAlan Cox 	struct vm_phys_seg *seg;
1482b16b4c22SMark Johnston 	vm_paddr_t pa_half;
1483b16b4c22SMark Johnston 	vm_page_t m, m_set, m_tmp;
1484*0078df5fSDoug Moore 	int order, pool;
14857bfda801SAlan Cox 
1486b16b4c22SMark Johnston 	seg = vm_phys_paddr_to_seg(pa);
1487b16b4c22SMark Johnston 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1488b16b4c22SMark Johnston 
1489*0078df5fSDoug Moore #ifdef VM_FREEPOOL_LAZYINIT
1490b16b4c22SMark Johnston 	/*
1491b16b4c22SMark Johnston 	 * The pages on the free lists must be initialized.
1492b16b4c22SMark Johnston 	 */
1493b16b4c22SMark Johnston 	vm_phys_lazy_init_domain(seg->domain, true);
1494b16b4c22SMark Johnston #endif
1495b16b4c22SMark Johnston 
14967bfda801SAlan Cox 	/*
14977bfda801SAlan Cox 	 * First, find the contiguous, power of two-sized set of free
14987bfda801SAlan Cox 	 * physical pages containing the given physical page "m" and
14997bfda801SAlan Cox 	 * assign it to "m_set".
15007bfda801SAlan Cox 	 */
1501b16b4c22SMark Johnston 	m = vm_phys_paddr_to_vm_page(pa);
15027bfda801SAlan Cox 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1503bc8794a1SAlan Cox 	    order < VM_NFREEORDER - 1; ) {
15047bfda801SAlan Cox 		order++;
15057bfda801SAlan Cox 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
15062fbced65SAlan Cox 		if (pa >= seg->start)
150769cbb187SMark Johnston 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
1508e35395ceSAlan Cox 		else
15096062d9faSMark Johnston 			return (false);
15107bfda801SAlan Cox 	}
1511e35395ceSAlan Cox 	if (m_set->order < order)
15126062d9faSMark Johnston 		return (false);
1513e35395ceSAlan Cox 	if (m_set->order == VM_NFREEORDER)
15146062d9faSMark Johnston 		return (false);
15157bfda801SAlan Cox 	KASSERT(m_set->order < VM_NFREEORDER,
15167bfda801SAlan Cox 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
15177bfda801SAlan Cox 	    m_set, m_set->order));
15187bfda801SAlan Cox 
15197bfda801SAlan Cox 	/*
15207bfda801SAlan Cox 	 * Next, remove "m_set" from the free lists.  Finally, extract
15217bfda801SAlan Cox 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
15227bfda801SAlan Cox 	 * is larger than a page, shrink "m_set" by returning the half
15237bfda801SAlan Cox 	 * of "m_set" that does not contain "m" to the free lists.
15247bfda801SAlan Cox 	 */
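	/*
	 * For example, assuming 4 KB pages: to extract the page at 0x5000
	 * from a free order-2 block starting at 0x4000, the whole block is
	 * removed, the order-1 half at 0x6000 is returned to the free
	 * lists, then the order-0 page at 0x4000 is returned, leaving only
	 * the page at 0x5000 out of the free lists.
	 */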
1525*0078df5fSDoug Moore 	pool = m_set->pool;
1526*0078df5fSDoug Moore 	fl = (*seg->free_queues)[pool];
15277bfda801SAlan Cox 	order = m_set->order;
15287e226537SAttilio Rao 	vm_freelist_rem(fl, m_set, order);
15297bfda801SAlan Cox 	while (order > 0) {
15307bfda801SAlan Cox 		order--;
15317bfda801SAlan Cox 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
15327bfda801SAlan Cox 		if (m->phys_addr < pa_half)
153369cbb187SMark Johnston 			m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
15347bfda801SAlan Cox 		else {
15357bfda801SAlan Cox 			m_tmp = m_set;
153669cbb187SMark Johnston 			m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
15377bfda801SAlan Cox 		}
1538*0078df5fSDoug Moore 		vm_freelist_add(fl, m_tmp, order, pool, 0);
15397bfda801SAlan Cox 	}
15407bfda801SAlan Cox 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
15416062d9faSMark Johnston 	return (true);
15427bfda801SAlan Cox }
15437bfda801SAlan Cox 
15447bfda801SAlan Cox /*
15452a4897bdSDoug Moore  * Find a run of contiguous physical pages, meeting alignment requirements, from
15462a4897bdSDoug Moore  * a list of max-sized page blocks, where we need at least two consecutive
15472a4897bdSDoug Moore  * blocks to satisfy the (large) page request.
1548fa8a6585SDoug Moore  */
1549fa8a6585SDoug Moore static vm_page_t
15502a4897bdSDoug Moore vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1551fa8a6585SDoug Moore     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1552fa8a6585SDoug Moore {
1553fa8a6585SDoug Moore 	struct vm_phys_seg *seg;
15542a4897bdSDoug Moore 	vm_page_t m, m_iter, m_ret;
15552a4897bdSDoug Moore 	vm_paddr_t max_size, size;
15562a4897bdSDoug Moore 	int max_order;
1557fa8a6585SDoug Moore 
15582a4897bdSDoug Moore 	max_order = VM_NFREEORDER - 1;
1559fa8a6585SDoug Moore 	size = npages << PAGE_SHIFT;
15602a4897bdSDoug Moore 	max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
15612a4897bdSDoug Moore 	KASSERT(size > max_size, ("size is too small"));
15622a4897bdSDoug Moore 
1563fa8a6585SDoug Moore 	/*
15642a4897bdSDoug Moore 	 * In order to avoid examining any free max-sized page block more than
15652a4897bdSDoug Moore 	 * twice, identify the ones that are first in a physically-contiguous
15662a4897bdSDoug Moore 	 * sequence of such blocks, and only for those walk the sequence to
15672a4897bdSDoug Moore 	 * check if there are enough free blocks starting at a properly aligned
15682a4897bdSDoug Moore 	 * block.  Thus, no block is checked for free-ness more than twice.
1569fa8a6585SDoug Moore 	 */
15702a4897bdSDoug Moore 	TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
15712a4897bdSDoug Moore 		/*
15722a4897bdSDoug Moore 		 * Skip m unless it is first in a sequence of free max page
15732a4897bdSDoug Moore 		 * blocks >= low in its segment.
15742a4897bdSDoug Moore 		 */
15752a4897bdSDoug Moore 		seg = &vm_phys_segs[m->segind];
15762a4897bdSDoug Moore 		if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
15772a4897bdSDoug Moore 			continue;
15782a4897bdSDoug Moore 		if (VM_PAGE_TO_PHYS(m) >= max_size &&
15792a4897bdSDoug Moore 		    VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
15802a4897bdSDoug Moore 		    max_order == m[-1 << max_order].order)
1581fa8a6585SDoug Moore 			continue;
1582fa8a6585SDoug Moore 
1583fa8a6585SDoug Moore 		/*
15842a4897bdSDoug Moore 		 * Advance m_ret from m to the first of the sequence, if any,
15852a4897bdSDoug Moore 		 * that satisfies alignment conditions and might leave enough
15862a4897bdSDoug Moore 		 * space.
1587fa8a6585SDoug Moore 		 */
15882a4897bdSDoug Moore 		m_ret = m;
15892a4897bdSDoug Moore 		while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
15902a4897bdSDoug Moore 		    size, alignment, boundary) &&
15912a4897bdSDoug Moore 		    VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
15922a4897bdSDoug Moore 		    max_order == m_ret[1 << max_order].order)
15932a4897bdSDoug Moore 			m_ret += 1 << max_order;
15942a4897bdSDoug Moore 
15952a4897bdSDoug Moore 		/*
15962a4897bdSDoug Moore 		 * Skip m unless some block m_ret in the sequence is properly
15972a4897bdSDoug Moore 		 * aligned and begins a run of enough pages that ends at or
15982a4897bdSDoug Moore 		 * below high, within the same segment.
15992a4897bdSDoug Moore 		 */
16002a4897bdSDoug Moore 		if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1601fa8a6585SDoug Moore 			continue;
1602fa8a6585SDoug Moore 
1603fa8a6585SDoug Moore 		/*
16042a4897bdSDoug Moore 		 * Skip m unless the blocks to allocate starting at m_ret are
16052a4897bdSDoug Moore 		 * all free.
1606fa8a6585SDoug Moore 		 */
16072a4897bdSDoug Moore 		for (m_iter = m_ret;
16082a4897bdSDoug Moore 		    m_iter < m_ret + npages && max_order == m_iter->order;
16092a4897bdSDoug Moore 		    m_iter += 1 << max_order) {
1610fa8a6585SDoug Moore 		}
16112a4897bdSDoug Moore 		if (m_iter < m_ret + npages)
1612fa8a6585SDoug Moore 			continue;
1613fa8a6585SDoug Moore 		return (m_ret);
1614fa8a6585SDoug Moore 	}
1615fa8a6585SDoug Moore 	return (NULL);
1616fa8a6585SDoug Moore }
1617fa8a6585SDoug Moore 
1618fa8a6585SDoug Moore /*
1619fa8a6585SDoug Moore  * Find a run of contiguous physical pages from the specified free list
1620342056faSDoug Moore  * table.
1621c869e672SAlan Cox  */
1622c869e672SAlan Cox static vm_page_t
1623fa8a6585SDoug Moore vm_phys_find_queues_contig(
1624342056faSDoug Moore     struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1625342056faSDoug Moore     u_long npages, vm_paddr_t low, vm_paddr_t high,
1626342056faSDoug Moore     u_long alignment, vm_paddr_t boundary)
1627c869e672SAlan Cox {
1628c869e672SAlan Cox 	struct vm_freelist *fl;
1629fa8a6585SDoug Moore 	vm_page_t m_ret;
1630c869e672SAlan Cox 	vm_paddr_t pa, pa_end, size;
1631c869e672SAlan Cox 	int oind, order, pind;
1632c869e672SAlan Cox 
1633c869e672SAlan Cox 	KASSERT(npages > 0, ("npages is 0"));
1634c869e672SAlan Cox 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1635c869e672SAlan Cox 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1636c869e672SAlan Cox 	/* Compute the queue that is the best fit for npages. */
16379161b4deSAlan Cox 	order = flsl(npages - 1);
1638fa8a6585SDoug Moore 	/* Search for a large enough free block. */
1639c869e672SAlan Cox 	size = npages << PAGE_SHIFT;
1640fa8a6585SDoug Moore 	for (oind = order; oind < VM_NFREEORDER; oind++) {
1641b16b4c22SMark Johnston 		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1642342056faSDoug Moore 			fl = (*queues)[pind];
16435cd29d0fSMark Johnston 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
1644c869e672SAlan Cox 				/*
1645da92ecbcSDoug Moore 				 * Determine if the address range starting at pa
1646da92ecbcSDoug Moore 				 * is within the given range, satisfies the
1647da92ecbcSDoug Moore 				 * given alignment, and does not cross the given
1648da92ecbcSDoug Moore 				 * boundary.
164911752d88SAlan Cox 				 */
1650da92ecbcSDoug Moore 				pa = VM_PAGE_TO_PHYS(m_ret);
1651da92ecbcSDoug Moore 				pa_end = pa + size;
1652fa8a6585SDoug Moore 				if (low <= pa && pa_end <= high &&
1653fa8a6585SDoug Moore 				    vm_addr_ok(pa, size, alignment, boundary))
1654fa8a6585SDoug Moore 					return (m_ret);
1655fa8a6585SDoug Moore 			}
1656fa8a6585SDoug Moore 		}
1657fa8a6585SDoug Moore 	}
1658da92ecbcSDoug Moore 	if (order < VM_NFREEORDER)
1659fa8a6585SDoug Moore 		return (NULL);
16602a4897bdSDoug Moore 	/* Search for a long-enough sequence of max-order blocks. */
1661b16b4c22SMark Johnston 	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1662fa8a6585SDoug Moore 		fl = (*queues)[pind];
16632a4897bdSDoug Moore 		m_ret = vm_phys_find_freelist_contig(fl, npages,
1664fa8a6585SDoug Moore 		    low, high, alignment, boundary);
1665fa8a6585SDoug Moore 		if (m_ret != NULL)
1666fa8a6585SDoug Moore 			return (m_ret);
166711752d88SAlan Cox 	}
166811752d88SAlan Cox 	return (NULL);
166911752d88SAlan Cox }
167011752d88SAlan Cox 
1671b7565d44SJeff Roberson /*
1672342056faSDoug Moore  * Allocate a contiguous set of physical pages of the given size
1673342056faSDoug Moore  * "npages" from the free lists.  All of the physical pages must be at
1674342056faSDoug Moore  * or above the given physical address "low" and below the given
1675342056faSDoug Moore  * physical address "high".  The given value "alignment" determines the
1676342056faSDoug Moore  * alignment of the first physical page in the set.  If the given value
1677342056faSDoug Moore  * "boundary" is non-zero, then the set of physical pages cannot cross
1678342056faSDoug Moore  * any physical address boundary that is a multiple of that value.  Both
1679*0078df5fSDoug Moore  * "alignment" and "boundary" must be a power of two.  Sets the pool
1680*0078df5fSDoug Moore  * field to DEFAULT in the first allocated page.
1681342056faSDoug Moore  */
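/*
 * For example (a sketch with placeholder variables), with the domain's
 * free queue lock held, 16 physically contiguous pages below 4 GB with
 * 64 KB alignment and no boundary restriction could be requested as:
 *
 *	m = vm_phys_alloc_contig(domain, 16, 0, (vm_paddr_t)1 << 32,
 *	    64 * 1024, 0);
 */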
1682342056faSDoug Moore vm_page_t
1683342056faSDoug Moore vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1684342056faSDoug Moore     u_long alignment, vm_paddr_t boundary)
1685342056faSDoug Moore {
1686342056faSDoug Moore 	vm_paddr_t pa_end, pa_start;
1687fa8a6585SDoug Moore 	struct vm_freelist *fl;
1688fa8a6585SDoug Moore 	vm_page_t m, m_run;
1689342056faSDoug Moore 	struct vm_phys_seg *seg;
1690342056faSDoug Moore 	struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1691fa8a6585SDoug Moore 	int oind, segind;
1692342056faSDoug Moore 
1693342056faSDoug Moore 	KASSERT(npages > 0, ("npages is 0"));
1694342056faSDoug Moore 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1695342056faSDoug Moore 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1696342056faSDoug Moore 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
1697342056faSDoug Moore 	if (low >= high)
1698342056faSDoug Moore 		return (NULL);
1699342056faSDoug Moore 	queues = NULL;
1700342056faSDoug Moore 	m_run = NULL;
1701342056faSDoug Moore 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1702342056faSDoug Moore 		seg = &vm_phys_segs[segind];
1703342056faSDoug Moore 		if (seg->start >= high || seg->domain != domain)
1704342056faSDoug Moore 			continue;
1705342056faSDoug Moore 		if (low >= seg->end)
1706342056faSDoug Moore 			break;
1707342056faSDoug Moore 		if (low <= seg->start)
1708342056faSDoug Moore 			pa_start = seg->start;
1709342056faSDoug Moore 		else
1710342056faSDoug Moore 			pa_start = low;
1711342056faSDoug Moore 		if (high < seg->end)
1712342056faSDoug Moore 			pa_end = high;
1713342056faSDoug Moore 		else
1714342056faSDoug Moore 			pa_end = seg->end;
1715342056faSDoug Moore 		if (pa_end - pa_start < ptoa(npages))
1716342056faSDoug Moore 			continue;
1717342056faSDoug Moore 		/*
1718342056faSDoug Moore 		 * If a previous segment led to a search using
1719342056faSDoug Moore 		 * the same free lists as would this segment, then
1720342056faSDoug Moore 		 * we've actually already searched this segment
1721342056faSDoug Moore 		 * too.  So skip it.
1722342056faSDoug Moore 		 */
1723342056faSDoug Moore 		if (seg->free_queues == queues)
1724342056faSDoug Moore 			continue;
1725342056faSDoug Moore 		queues = seg->free_queues;
1726fa8a6585SDoug Moore 		m_run = vm_phys_find_queues_contig(queues, npages,
1727342056faSDoug Moore 		    low, high, alignment, boundary);
1728342056faSDoug Moore 		if (m_run != NULL)
1729342056faSDoug Moore 			break;
1730342056faSDoug Moore 	}
1731fa8a6585SDoug Moore 	if (m_run == NULL)
1732fa8a6585SDoug Moore 		return (NULL);
1733fa8a6585SDoug Moore 
1734fa8a6585SDoug Moore 	/* Allocate pages from the page-range found. */
1735fa8a6585SDoug Moore 	for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1736fa8a6585SDoug Moore 		fl = (*queues)[m->pool];
1737fa8a6585SDoug Moore 		oind = m->order;
1738fa8a6585SDoug Moore 		vm_freelist_rem(fl, m, oind);
1739*0078df5fSDoug Moore 		vm_phys_finish_init(m, oind);
1740fa8a6585SDoug Moore 	}
1741fa8a6585SDoug Moore 	/* Return excess pages to the free lists. */
1742fa8a6585SDoug Moore 	fl = (*queues)[VM_FREEPOOL_DEFAULT];
1743*0078df5fSDoug Moore 	vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
1744*0078df5fSDoug Moore 	    VM_FREEPOOL_DEFAULT, 0);
17452a4897bdSDoug Moore 
17462a4897bdSDoug Moore 	/* Return page verified to satisfy conditions of request. */
17472a4897bdSDoug Moore 	pa_start = VM_PAGE_TO_PHYS(m_run);
17482a4897bdSDoug Moore 	KASSERT(low <= pa_start,
17492a4897bdSDoug Moore 	    ("memory allocated below minimum requested range"));
17502a4897bdSDoug Moore 	KASSERT(pa_start + ptoa(npages) <= high,
17512a4897bdSDoug Moore 	    ("memory allocated above maximum requested range"));
17522a4897bdSDoug Moore 	seg = &vm_phys_segs[m_run->segind];
17532a4897bdSDoug Moore 	KASSERT(seg->domain == domain,
17542a4897bdSDoug Moore 	    ("memory not allocated from specified domain"));
17552a4897bdSDoug Moore 	KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
17562a4897bdSDoug Moore 	    ("memory alignment/boundary constraints not satisfied"));
1757342056faSDoug Moore 	return (m_run);
1758342056faSDoug Moore }
1759342056faSDoug Moore 
1760342056faSDoug Moore /*
1761b7565d44SJeff Roberson  * Return the index of the first unused slot which may be the terminating
1762b7565d44SJeff Roberson  * entry.
1763b7565d44SJeff Roberson  */
1764b7565d44SJeff Roberson static int
1765b7565d44SJeff Roberson vm_phys_avail_count(void)
1766b7565d44SJeff Roberson {
1767b7565d44SJeff Roberson 	int i;
1768b7565d44SJeff Roberson 
1769b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1]; i += 2)
1770b7565d44SJeff Roberson 		continue;
1771b7565d44SJeff Roberson 	if (i > PHYS_AVAIL_ENTRIES)
1772b7565d44SJeff Roberson 		panic("Improperly terminated phys_avail %d entries", i);
1773b7565d44SJeff Roberson 
1774b7565d44SJeff Roberson 	return (i);
1775b7565d44SJeff Roberson }
1776b7565d44SJeff Roberson 
1777b7565d44SJeff Roberson /*
1778b7565d44SJeff Roberson  * Assert that a phys_avail entry is valid.
1779b7565d44SJeff Roberson  */
1780b7565d44SJeff Roberson static void
1781b7565d44SJeff Roberson vm_phys_avail_check(int i)
1782b7565d44SJeff Roberson {
1783b7565d44SJeff Roberson 	if (phys_avail[i] & PAGE_MASK)
1784b7565d44SJeff Roberson 		panic("Unaligned phys_avail[%d]: %#jx", i,
1785b7565d44SJeff Roberson 		    (intmax_t)phys_avail[i]);
1786b7565d44SJeff Roberson 	if (phys_avail[i+1] & PAGE_MASK)
1787b7565d44SJeff Roberson 		panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1788b7565d44SJeff Roberson 		    (intmax_t)phys_avail[i + 1]);
1789b7565d44SJeff Roberson 	if (phys_avail[i + 1] < phys_avail[i])
1790b7565d44SJeff Roberson 		panic("phys_avail[%d] end %#jx < start %#jx", i,
1791b7565d44SJeff Roberson 		    (intmax_t)phys_avail[i + 1], (intmax_t)phys_avail[i]);
1792b7565d44SJeff Roberson }
1793b7565d44SJeff Roberson 
1794b7565d44SJeff Roberson /*
1795b7565d44SJeff Roberson  * Return the index of an overlapping phys_avail entry or -1.
1796b7565d44SJeff Roberson  */
1797be3f5f29SJeff Roberson #ifdef NUMA
1798b7565d44SJeff Roberson static int
1799b7565d44SJeff Roberson vm_phys_avail_find(vm_paddr_t pa)
1800b7565d44SJeff Roberson {
1801b7565d44SJeff Roberson 	int i;
1802b7565d44SJeff Roberson 
1803b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1]; i += 2)
1804b7565d44SJeff Roberson 		if (phys_avail[i] <= pa && phys_avail[i + 1] > pa)
1805b7565d44SJeff Roberson 			return (i);
1806b7565d44SJeff Roberson 	return (-1);
1807b7565d44SJeff Roberson }
1808be3f5f29SJeff Roberson #endif
1809b7565d44SJeff Roberson 
1810b7565d44SJeff Roberson /*
1811b7565d44SJeff Roberson  * Return the index of the largest entry.
1812b7565d44SJeff Roberson  */
1813b7565d44SJeff Roberson int
1814b7565d44SJeff Roberson vm_phys_avail_largest(void)
1815b7565d44SJeff Roberson {
1816b7565d44SJeff Roberson 	vm_paddr_t sz, largesz;
1817b7565d44SJeff Roberson 	int largest;
1818b7565d44SJeff Roberson 	int i;
1819b7565d44SJeff Roberson 
1820b7565d44SJeff Roberson 	largest = 0;
1821b7565d44SJeff Roberson 	largesz = 0;
1822b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1]; i += 2) {
1823b7565d44SJeff Roberson 		sz = vm_phys_avail_size(i);
1824b7565d44SJeff Roberson 		if (sz > largesz) {
1825b7565d44SJeff Roberson 			largesz = sz;
1826b7565d44SJeff Roberson 			largest = i;
1827b7565d44SJeff Roberson 		}
1828b7565d44SJeff Roberson 	}
1829b7565d44SJeff Roberson 
1830b7565d44SJeff Roberson 	return (largest);
1831b7565d44SJeff Roberson }
1832b7565d44SJeff Roberson 
1833b7565d44SJeff Roberson vm_paddr_t
1834b7565d44SJeff Roberson vm_phys_avail_size(int i)
1835b7565d44SJeff Roberson {
1836b7565d44SJeff Roberson 
1837b7565d44SJeff Roberson 	return (phys_avail[i + 1] - phys_avail[i]);
1838b7565d44SJeff Roberson }
1839b7565d44SJeff Roberson 
1840b7565d44SJeff Roberson /*
1841b7565d44SJeff Roberson  * Split an entry at the address 'pa'.  Return zero on success or errno.
1842b7565d44SJeff Roberson  */
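/*
 * For example, splitting the entry [4 MB, 64 MB) at 16 MB leaves
 * [4 MB, 16 MB) in place and inserts [16 MB, 64 MB) after it, consuming
 * one additional pair of phys_avail[] slots.
 */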
1843b7565d44SJeff Roberson static int
1844b7565d44SJeff Roberson vm_phys_avail_split(vm_paddr_t pa, int i)
1845b7565d44SJeff Roberson {
1846b7565d44SJeff Roberson 	int cnt;
1847b7565d44SJeff Roberson 
1848b7565d44SJeff Roberson 	vm_phys_avail_check(i);
1849b7565d44SJeff Roberson 	if (pa <= phys_avail[i] || pa >= phys_avail[i + 1])
1850b7565d44SJeff Roberson 		panic("vm_phys_avail_split: invalid address");
1851b7565d44SJeff Roberson 	cnt = vm_phys_avail_count();
1852b7565d44SJeff Roberson 	if (cnt >= PHYS_AVAIL_ENTRIES)
1853b7565d44SJeff Roberson 		return (ENOSPC);
1854b7565d44SJeff Roberson 	memmove(&phys_avail[i + 2], &phys_avail[i],
1855b7565d44SJeff Roberson 	    (cnt - i) * sizeof(phys_avail[0]));
1856b7565d44SJeff Roberson 	phys_avail[i + 1] = pa;
1857b7565d44SJeff Roberson 	phys_avail[i + 2] = pa;
1858b7565d44SJeff Roberson 	vm_phys_avail_check(i);
1859b7565d44SJeff Roberson 	vm_phys_avail_check(i+2);
1860b7565d44SJeff Roberson 
1861b7565d44SJeff Roberson 	return (0);
1862b7565d44SJeff Roberson }
1863b7565d44SJeff Roberson 
186431991a5aSMitchell Horne /*
186531991a5aSMitchell Horne  * Check if a given physical address can be included as part of a crash dump.
186631991a5aSMitchell Horne  */
186731991a5aSMitchell Horne bool
186831991a5aSMitchell Horne vm_phys_is_dumpable(vm_paddr_t pa)
186931991a5aSMitchell Horne {
187031991a5aSMitchell Horne 	vm_page_t m;
187131991a5aSMitchell Horne 	int i;
187231991a5aSMitchell Horne 
187331991a5aSMitchell Horne 	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
187431991a5aSMitchell Horne 		return ((m->flags & PG_NODUMP) == 0);
187531991a5aSMitchell Horne 
187631991a5aSMitchell Horne 	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
187731991a5aSMitchell Horne 		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
187831991a5aSMitchell Horne 			return (true);
187931991a5aSMitchell Horne 	}
188031991a5aSMitchell Horne 	return (false);
188131991a5aSMitchell Horne }
188231991a5aSMitchell Horne 
188381302f1dSMark Johnston void
188481302f1dSMark Johnston vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
188581302f1dSMark Johnston {
188681302f1dSMark Johnston 	struct vm_phys_seg *seg;
188781302f1dSMark Johnston 
188881302f1dSMark Johnston 	if (vm_phys_early_nsegs == -1)
188981302f1dSMark Johnston 		panic("%s: called after initialization", __func__);
189081302f1dSMark Johnston 	if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
189181302f1dSMark Johnston 		panic("%s: ran out of early segments", __func__);
189281302f1dSMark Johnston 
189381302f1dSMark Johnston 	seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
189481302f1dSMark Johnston 	seg->start = start;
189581302f1dSMark Johnston 	seg->end = end;
189681302f1dSMark Johnston }
189781302f1dSMark Johnston 
1898b7565d44SJeff Roberson /*
1899b7565d44SJeff Roberson  * This routine allocates NUMA node specific memory before the page
1900b7565d44SJeff Roberson  * allocator is bootstrapped.
1901b7565d44SJeff Roberson  */
1902b7565d44SJeff Roberson vm_paddr_t
1903b7565d44SJeff Roberson vm_phys_early_alloc(int domain, size_t alloc_size)
1904b7565d44SJeff Roberson {
19052e7838aeSJohn Baldwin #ifdef NUMA
19062e7838aeSJohn Baldwin 	int mem_index;
19072e7838aeSJohn Baldwin #endif
19082e7838aeSJohn Baldwin 	int i, biggestone;
1909b7565d44SJeff Roberson 	vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1910b7565d44SJeff Roberson 
191181302f1dSMark Johnston 	KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
191281302f1dSMark Johnston 	    ("%s: invalid domain index %d", __func__, domain));
1913b7565d44SJeff Roberson 
1914b7565d44SJeff Roberson 	/*
1915b7565d44SJeff Roberson 	 * Search the mem_affinity array for the biggest address
1916b7565d44SJeff Roberson 	 * range in the desired domain.  This is used to constrain
1917b7565d44SJeff Roberson 	 * the phys_avail selection below.
1918b7565d44SJeff Roberson 	 */
1919b7565d44SJeff Roberson 	biggestsize = 0;
1920b7565d44SJeff Roberson 	mem_start = 0;
1921b7565d44SJeff Roberson 	mem_end = -1;
1922b7565d44SJeff Roberson #ifdef NUMA
19232e7838aeSJohn Baldwin 	mem_index = 0;
1924b7565d44SJeff Roberson 	if (mem_affinity != NULL) {
1925b7565d44SJeff Roberson 		for (i = 0;; i++) {
1926b7565d44SJeff Roberson 			size = mem_affinity[i].end - mem_affinity[i].start;
1927b7565d44SJeff Roberson 			if (size == 0)
1928b7565d44SJeff Roberson 				break;
192981302f1dSMark Johnston 			if (domain != -1 && mem_affinity[i].domain != domain)
1930b7565d44SJeff Roberson 				continue;
1931b7565d44SJeff Roberson 			if (size > biggestsize) {
1932b7565d44SJeff Roberson 				mem_index = i;
1933b7565d44SJeff Roberson 				biggestsize = size;
1934b7565d44SJeff Roberson 			}
1935b7565d44SJeff Roberson 		}
1936b7565d44SJeff Roberson 		mem_start = mem_affinity[mem_index].start;
1937b7565d44SJeff Roberson 		mem_end = mem_affinity[mem_index].end;
1938b7565d44SJeff Roberson 	}
1939b7565d44SJeff Roberson #endif
1940b7565d44SJeff Roberson 
1941b7565d44SJeff Roberson 	/*
1942b7565d44SJeff Roberson 	 * Now find the biggest physical segment within the desired
1943b7565d44SJeff Roberson 	 * NUMA domain.
1944b7565d44SJeff Roberson 	 */
1945b7565d44SJeff Roberson 	biggestsize = 0;
1946b7565d44SJeff Roberson 	biggestone = 0;
1947b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1948b7565d44SJeff Roberson 		/* skip regions that are out of range */
1949b7565d44SJeff Roberson 		if (phys_avail[i+1] - alloc_size < mem_start ||
1950b7565d44SJeff Roberson 		    phys_avail[i+1] > mem_end)
1951b7565d44SJeff Roberson 			continue;
1952b7565d44SJeff Roberson 		size = vm_phys_avail_size(i);
1953b7565d44SJeff Roberson 		if (size > biggestsize) {
1954b7565d44SJeff Roberson 			biggestone = i;
1955b7565d44SJeff Roberson 			biggestsize = size;
1956b7565d44SJeff Roberson 		}
1957b7565d44SJeff Roberson 	}
1958b7565d44SJeff Roberson 	alloc_size = round_page(alloc_size);
1959b7565d44SJeff Roberson 
1960b7565d44SJeff Roberson 	/*
1961b7565d44SJeff Roberson 	 * Grab single pages from the front to reduce fragmentation.
1962b7565d44SJeff Roberson 	 */
1963b7565d44SJeff Roberson 	if (alloc_size == PAGE_SIZE) {
1964b7565d44SJeff Roberson 		pa = phys_avail[biggestone];
1965b7565d44SJeff Roberson 		phys_avail[biggestone] += PAGE_SIZE;
1966b7565d44SJeff Roberson 		vm_phys_avail_check(biggestone);
1967b7565d44SJeff Roberson 		return (pa);
1968b7565d44SJeff Roberson 	}
1969b7565d44SJeff Roberson 
1970b7565d44SJeff Roberson 	/*
1971b7565d44SJeff Roberson 	 * Naturally align large allocations.
1972b7565d44SJeff Roberson 	 */
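	/*
	 * For example, if the chosen region ends at 0x12345000 and
	 * alloc_size is 2 MB, then align is 0x145000; trimming it moves the
	 * region's end down to 0x12200000, and the allocation itself ends
	 * up at 0x12000000, which is 2 MB aligned.
	 */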
1973b7565d44SJeff Roberson 	align = phys_avail[biggestone + 1] & (alloc_size - 1);
1974b7565d44SJeff Roberson 	if (alloc_size + align > biggestsize)
1975b7565d44SJeff Roberson 		panic("cannot find a large enough size\n");
1976b7565d44SJeff Roberson 	if (align != 0 &&
1977b7565d44SJeff Roberson 	    vm_phys_avail_split(phys_avail[biggestone + 1] - align,
1978b7565d44SJeff Roberson 	    biggestone) != 0)
1979b7565d44SJeff Roberson 		/* Wasting memory. */
1980b7565d44SJeff Roberson 		phys_avail[biggestone + 1] -= align;
1981b7565d44SJeff Roberson 
1982b7565d44SJeff Roberson 	phys_avail[biggestone + 1] -= alloc_size;
1983b7565d44SJeff Roberson 	vm_phys_avail_check(biggestone);
1984b7565d44SJeff Roberson 	pa = phys_avail[biggestone + 1];
1985b7565d44SJeff Roberson 	return (pa);
1986b7565d44SJeff Roberson }
1987b7565d44SJeff Roberson 
1988b7565d44SJeff Roberson void
1989b7565d44SJeff Roberson vm_phys_early_startup(void)
1990b7565d44SJeff Roberson {
199181302f1dSMark Johnston 	struct vm_phys_seg *seg;
1992b7565d44SJeff Roberson 	int i;
1993b7565d44SJeff Roberson 
1994b7565d44SJeff Roberson 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1995b7565d44SJeff Roberson 		phys_avail[i] = round_page(phys_avail[i]);
1996b7565d44SJeff Roberson 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
1997b7565d44SJeff Roberson 	}
1998b7565d44SJeff Roberson 
199981302f1dSMark Johnston 	for (i = 0; i < vm_phys_early_nsegs; i++) {
200081302f1dSMark Johnston 		seg = &vm_phys_early_segs[i];
200181302f1dSMark Johnston 		vm_phys_add_seg(seg->start, seg->end);
200281302f1dSMark Johnston 	}
200381302f1dSMark Johnston 	vm_phys_early_nsegs = -1;
200481302f1dSMark Johnston 
2005b7565d44SJeff Roberson #ifdef NUMA
2006b7565d44SJeff Roberson 	/* Force phys_avail to be split by domain. */
2007b7565d44SJeff Roberson 	if (mem_affinity != NULL) {
2008b7565d44SJeff Roberson 		int idx;
2009b7565d44SJeff Roberson 
2010b7565d44SJeff Roberson 		for (i = 0; mem_affinity[i].end != 0; i++) {
2011b7565d44SJeff Roberson 			idx = vm_phys_avail_find(mem_affinity[i].start);
2012b7565d44SJeff Roberson 			if (idx != -1 &&
2013b7565d44SJeff Roberson 			    phys_avail[idx] != mem_affinity[i].start)
2014b7565d44SJeff Roberson 				vm_phys_avail_split(mem_affinity[i].start, idx);
2015b7565d44SJeff Roberson 			idx = vm_phys_avail_find(mem_affinity[i].end);
2016b7565d44SJeff Roberson 			if (idx != -1 &&
2017b7565d44SJeff Roberson 			    phys_avail[idx] != mem_affinity[i].end)
2018b7565d44SJeff Roberson 				vm_phys_avail_split(mem_affinity[i].end, idx);
2019b7565d44SJeff Roberson 		}
2020b7565d44SJeff Roberson 	}
2021b7565d44SJeff Roberson #endif
2022b7565d44SJeff Roberson }
2023b7565d44SJeff Roberson 
202411752d88SAlan Cox #ifdef DDB
202511752d88SAlan Cox /*
202611752d88SAlan Cox  * Show the number of physical pages in each of the free lists.
202711752d88SAlan Cox  */
2028c84c5e00SMitchell Horne DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
202911752d88SAlan Cox {
203011752d88SAlan Cox 	struct vm_freelist *fl;
20317e226537SAttilio Rao 	int flind, oind, pind, dom;
203211752d88SAlan Cox 
20337e226537SAttilio Rao 	for (dom = 0; dom < vm_ndomains; dom++) {
20347e226537SAttilio Rao 		db_printf("DOMAIN: %d\n", dom);
203511752d88SAlan Cox 		for (flind = 0; flind < vm_nfreelists; flind++) {
203611752d88SAlan Cox 			db_printf("FREE LIST %d:\n"
203711752d88SAlan Cox 			    "\n  ORDER (SIZE)  |  NUMBER"
203811752d88SAlan Cox 			    "\n              ", flind);
203911752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
204011752d88SAlan Cox 				db_printf("  |  POOL %d", pind);
204111752d88SAlan Cox 			db_printf("\n--            ");
204211752d88SAlan Cox 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
204311752d88SAlan Cox 				db_printf("-- --      ");
204411752d88SAlan Cox 			db_printf("--\n");
204511752d88SAlan Cox 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
204611752d88SAlan Cox 				db_printf("  %2.2d (%6.6dK)", oind,
204711752d88SAlan Cox 				    1 << (PAGE_SHIFT - 10 + oind));
204811752d88SAlan Cox 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
20497e226537SAttilio Rao 					fl = vm_phys_free_queues[dom][flind][pind];
205011752d88SAlan Cox 					db_printf("  |  %6.6d", fl[oind].lcnt);
205111752d88SAlan Cox 				}
205211752d88SAlan Cox 				db_printf("\n");
205311752d88SAlan Cox 			}
205411752d88SAlan Cox 			db_printf("\n");
205511752d88SAlan Cox 		}
20567e226537SAttilio Rao 		db_printf("\n");
20577e226537SAttilio Rao 	}
205811752d88SAlan Cox }
205911752d88SAlan Cox #endif
2060