xref: /onnv-gate/usr/src/uts/common/vm/vm_pagelist.c (revision 12284:3d1135425dbe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26 /*	All Rights Reserved   */
27 
28 /*
29  * Portions of this source code were derived from Berkeley 4.3 BSD
30  * under license from the Regents of the University of California.
31  */
32 
33 
34 /*
35  * This file contains common functions to access and manage the page lists.
36  * Many of these routines originated from platform dependent modules
37  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
38  * a platform independent manner.
39  *
40  * vm/vm_dep.h provides for platform specific support.
41  */
42 
43 #include <sys/types.h>
44 #include <sys/debug.h>
45 #include <sys/cmn_err.h>
46 #include <sys/systm.h>
47 #include <sys/atomic.h>
48 #include <sys/sysmacros.h>
49 #include <vm/as.h>
50 #include <vm/page.h>
51 #include <vm/seg_kmem.h>
52 #include <vm/seg_vn.h>
53 #include <sys/vmsystm.h>
54 #include <sys/memnode.h>
55 #include <vm/vm_dep.h>
56 #include <sys/lgrp.h>
57 #include <sys/mem_config.h>
58 #include <sys/callb.h>
59 #include <sys/mem_cage.h>
60 #include <sys/kflt_mem.h>
61 #include <sys/sdt.h>
62 #include <sys/dumphdr.h>
63 #include <sys/swap.h>
64 
65 extern uint_t	vac_colors;
66 
67 #define	MAX_PRAGMA_ALIGN	128
68 
69 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
70 
71 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
72 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
73 #else
74 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
75 #endif
76 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
77 
78 /*
79  * number of page colors equivalent to the requested color in page_get routines.
80  * If set, keeps large pages intact longer and favors MPO allocation
81  * from the local mnode over acquiring the 'correct' page color from
82  * a demoted large page or from a remote mnode.
83  */
84 uint_t	colorequiv;
85 
86 /*
87  * color equivalency mask for each page size.
88  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
89  * High 4 bits determine the number of high order bits of the color to ignore.
90  * Low 4 bits determine the number of low order bits of color to ignore (it's only
91  * relevant for hashed index based page coloring).
92  */
93 uchar_t colorequivszc[MMU_PAGE_SIZES];
94 
95 /*
96  * If set, specifies the percentage of pages within a large page region that
97  * must be free before attempting to lock those pages for
98  * page_get_contig_pages processing.
99  *
100  * Should be turned on when kpr is available, since page_trylock_contig_pages
101  * can then be more selective.
102  */
103 
104 int	ptcpthreshold;
105 
106 /*
107  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
108  * Enabled by default via pgcplimitsearch.
109  *
110  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
111  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
112  * bound. This upper bound range guarantees:
113  *    - all large page 'slots' will be searched over time
114  *    - at least one large page candidate is considered on each pgcp call
115  *    - count doesn't wrap around to 0
116  */
117 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
118 int	pgcplimitsearch = 1;
119 
120 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
121 #define	SETPGCPFAILCNT(szc)						\
122 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
123 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
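/*
 * Hedged worked example (numbers assumed, not from this source): on a
 * machine with physinstalled = 0x180000 base pages, highbit() returns 21,
 * so PGCPFAILMAX = 1 << 20 = 0x100000.  Once pgcpfailcnt[szc] climbs to
 * that bound, SETPGCPFAILCNT() resets it to 0x80000, so the counter keeps
 * oscillating in the upper half of the range rather than wrapping to 0.
 */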
124 
125 /*
126  * Two page freelist types are supported: flt_user, the user page freelist
127  * type, and flt_kern, the kernel page freelist type.
128  */
129 
130 page_freelist_type_t flt_user;
131 page_freelist_type_t flt_kern;
132 page_freelist_type_t *ufltp = &flt_user;
133 page_freelist_type_t *kfltp = &flt_kern;
134 
135 #ifdef VM_STATS
136 struct vmm_vmstats_str  vmm_vmstats;
137 #endif /* VM_STATS */
138 
139 #if defined(__sparc)
140 #define	LPGCREATE	0
141 #else
142 /* enable page_get_contig_pages */
143 #define	LPGCREATE	1
144 #endif
145 
146 int pg_contig_disable;
147 int pg_lpgcreate_nocage = LPGCREATE;
148 
149 /*
150  * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
151  */
152 #define	PFNNULL		0
153 
154 /* Flags involved in promotion and demotion routines */
155 #define	PC_FREE		0x1	/* put page on freelist */
156 #define	PC_ALLOC	0x2	/* return page for allocation */
157 
158 /*
159  * Flag for page_demote, used with PC_FREE, to denote that we don't care
160  * what the color is; the color parameter to the function is ignored.
161  */
162 #define	PC_NO_COLOR	(-1)
163 
164 /* mtype value for page_promote to use when mtype does not matter */
165 #define	PC_MTYPE_ANY	(-1)
166 
167 /*
168  * page counters candidates info
169  * See page_ctrs_cands comment below for more details.
170  * fields are as follows:
171  *	pcc_pages_free:		# pages which freelist coalesce can create
172  *	pcc_color_free:		pointer to page free counts per color
173  */
174 typedef struct pcc_info {
175 	pgcnt_t	pcc_pages_free;
176 	pgcnt_t	*pcc_color_free;
177 	uint_t	pad[12];
178 } pcc_info_t;
179 
180 /*
181  * On big machines it can take a long time to check page_counters
182  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
183  * updated sum of all elements of the corresponding page_counters arrays.
184  * page_freelist_coalesce() searches page_counters only if an appropriate
185  * element of page_ctrs_cands array is greater than 0.
186  *
187  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
188  */
189 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
190 
191 /*
192  * Return in val the total number of free pages which can be created
193  * for the given mnode (m), mrange (g), and region size (r)
194  */
195 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
196 	int i;								\
197 	val = 0;							\
198 	for (i = 0; i < NPC_MUTEX; i++) {				\
199 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
200 	}								\
201 }
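/*
 * Hedged usage sketch (illustrative, not from this source): a coalesce
 * routine would typically consult this summary before walking the much
 * larger page_counters arrays, e.g.
 *
 *	pgcnt_t npgs;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, npgs);
 *	if (npgs == 0)
 *		return (NULL);		/+ no coalesce candidates +/
 */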
202 
203 /*
204  * Return in val the total number of free pages which can be created
205  * for the given mnode (m), mrange (g), region size (r), and color (c)
206  */
207 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
208 	int i;								\
209 	val = 0;							\
210 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
211 	for (i = 0; i < NPC_MUTEX; i++) {				\
212 	    val +=							\
213 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
214 	}								\
215 }
216 
217 /*
218  * We can only allow a single thread to update a counter within the physical
219  * range of the largest supported page size. That is the finest granularity
220  * possible since the counter values are dependent on each other
221  * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
222  * ctr_mutex lock index for a particular physical range.
223  */
224 static kmutex_t	*ctr_mutex[NPC_MUTEX];
225 
226 #define	PP_CTR_LOCK_INDX(pp)						\
227 	(((pp)->p_pagenum >>						\
228 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
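/*
 * Illustrative note (sizes assumed): with 8K base pages and a largest
 * supported page size of 4M, PAGE_BSZS_SHIFT(mmu_page_sizes - 1) is 9,
 * so all pages within the same 4M-aligned physical region hash to the
 * same ctr_mutex slot:
 *
 *	lckidx = ((pp)->p_pagenum >> 9) & (NPC_MUTEX - 1);
 */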
229 
230 #define	INVALID_COLOR 0xffffffff
231 #define	INVALID_MASK  0xffffffff
232 
233 /*
234  * Local functions prototypes.
235  */
236 
237 void page_ctr_add(int, int, page_t *, int);
238 void page_ctr_add_internal(int, int, page_t *, int);
239 void page_ctr_sub(int, int, page_t *, int);
240 void page_ctr_sub_internal(int, int, page_t *, int);
241 void page_freelist_lock(int);
242 void page_freelist_unlock(int);
243 page_t *page_promote(int, pfn_t, uchar_t, int, int);
244 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
245 page_t *page_freelist_split(uchar_t,
246     uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
247 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
248 static page_t *page_get_flist(page_freelist_type_t *, uint_t, int,
249     uchar_t, uint_t, struct lgrp *);
250 
251 static int page_trylock_cons(page_t *pp, se_t se);
252 
253 /*
254  * The page_counters array below is used to keep track of free contiguous
255  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
256  * This contains an array of counters, the size of the array, a shift value
257  * used to convert a pagenum into a counter array index or vice versa, as
258  * well as a cache of the last successful index to be promoted to a larger
259  * page size.  As an optimization, we keep track of the last successful index
260  * to be promoted per page color for the given size region, and this is
261  * allocated dynamically based upon the number of colors for a given
262  * region size.
263  *
264  * Conceptually, the page counters are represented as:
265  *
266  *	page_counters[region_size][mnode]
267  *
268  *	region_size:	size code of a candidate larger page made up
269  *			of contiguous free smaller pages.
270  *
271  *	page_counters[region_size][mnode].hpm_counters[index]:
272  *		represents how many (region_size - 1) pages either
273  *		exist or can be created within the given index range.
274  *
275  * Let's look at a sparc example:
276  *	If we want to create a free 512k page, we look at region_size 2
277  *	for the mnode we want.  We calculate the index and look at a specific
278  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
279  *	this location, it means that 8 64k pages either exist or can be created
280  *	from 8K pages in order to make a single free 512k page at the given
281  *	index.  Note that when a region is full, it will contribute to the
282  *	counts in the region above it.  Thus we will not know what page
283  *	size the free pages will be which can be promoted to this new free
284  *	page unless we look at all regions below the current region.
285  */
286 
287 /*
288  * Note: hpmctr_t is defined in platform vm_dep.h
289  * hw_page_map_t contains all the information needed for the page_counters
290  * logic. The fields are as follows:
291  *
292  *	hpm_counters:	dynamically allocated array to hold counter data
293  *	hpm_entries:	entries in hpm_counters
294  *	hpm_shift:	shift for pnum/array index conv
295  *	hpm_base:	PFN mapped to counter index 0
296  *	hpm_color_current:	last index in counter array for this color at
297  *				which we successfully created a large page
298  */
299 typedef struct hw_page_map {
300 	hpmctr_t	*hpm_counters;
301 	size_t		hpm_entries;
302 	int		hpm_shift;
303 	pfn_t		hpm_base;
304 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
305 #if defined(__sparc)
306 	uint_t		pad[4];
307 #endif
308 } hw_page_map_t;
309 
310 /*
311  * Element zero is not used, but is allocated for convenience.
312  */
313 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
314 
315 /*
316  * Cached value of MNODE_RANGE_CNT(mnode).
317  * This is a function call on x86.
318  */
319 static int mnode_nranges[MAX_MEM_NODES];
320 static int mnode_maxmrange[MAX_MEM_NODES];
321 
322 /*
323  * The following macros are convenient ways to get access to the individual
324  * elements of the page_counters arrays.  They can be used on both
325  * the left side and right side of equations.
326  */
327 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
328 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
329 
330 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
331 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
332 
333 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
334 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
335 
336 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
337 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
338 
339 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
340 	(page_counters[(rg_szc)][(mnode)].hpm_base)
341 
342 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
343 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
344 
345 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
346 	(page_counters[(rg_szc)][(mnode)].				\
347 	hpm_color_current[(mrange)][(color)])
348 
349 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
350 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
351 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
352 
353 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
354 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
355 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
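/*
 * Hedged worked example (values assumed): if hpm_base is pfn 0x40000 and
 * hpm_shift is 3 (8 base pages per region), then pfn 0x40013 maps to
 * counter index (0x40013 - 0x40000) >> 3 = 2, and IDX_TO_PNUM(mnode, r, 2)
 * returns 0x40010, the base pfn of that region.
 */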
356 
357 /*
358  * Protects the hpm_counters and hpm_color_current memory from changing while
359  * looking at page counters information.
360  * Grab the write lock to modify what these fields point at.
361  * Grab the read lock to prevent any pointers from changing.
362  * The write lock cannot be held during memory allocation due to a possible
363  * recursion deadlock caused by trying to grab the read lock while the
364  * write lock is already held.
365  */
366 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
367 
368 /*
369  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
370  */
371 void
372 cpu_vm_data_init(struct cpu *cp)
373 {
374 	if (cp == CPU0) {
375 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
376 	} else {
377 		void	*kmptr;
378 		int	align;
379 		size_t	sz;
380 
381 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
382 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
383 		kmptr = kmem_zalloc(sz, KM_SLEEP);
384 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
385 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
386 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
387 	}
388 }
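/*
 * Illustrative note (addresses assumed): the extra 'align' bytes in sz
 * guarantee that rounding the pointer up cannot run past the allocation.
 * For example, with align = 64 and kmem_zalloc() returning ...c828,
 * P2ROUNDUP yields ...c840; the pointer moves forward at most align - 1
 * bytes, so the aligned vm_cpu_data_t still fits inside the allocation.
 */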
389 
390 /*
391  * free cpu_vm_data
392  */
393 void
394 cpu_vm_data_destroy(struct cpu *cp)
395 {
396 	if (cp->cpu_seqid && cp->cpu_vm_data) {
397 		ASSERT(cp != CPU0);
398 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
399 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
400 	}
401 	cp->cpu_vm_data = NULL;
402 }
403 
404 
405 /*
406  * page size to page size code
407  */
408 int
409 page_szc(size_t pagesize)
410 {
411 	int	i = 0;
412 
413 	while (hw_page_array[i].hp_size) {
414 		if (pagesize == hw_page_array[i].hp_size)
415 			return (i);
416 		i++;
417 	}
418 	return (-1);
419 }
420 
421 /*
422  * page size to page size code with the restriction that it be a supported
423  * user page size.  If it's not a supported user page size, -1 will be returned.
424  */
425 int
426 page_szc_user_filtered(size_t pagesize)
427 {
428 	int szc = page_szc(pagesize);
429 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
430 		return (szc);
431 	}
432 	return (-1);
433 }
434 
435 /*
436  * Return how many page sizes are available for the user to use.  This is
437  * what the hardware supports and not based upon how the OS implements the
438  * support of different page sizes.
439  *
440  * If legacy is non-zero, return the number of pagesizes available to legacy
441  * applications. The number of legacy page sizes might be less than the
442  * exported user page sizes. This is to prevent legacy applications that
443  * use the largest page size returned from getpagesizes(3c) from inadvertently
444  * using the 'new' large pagesizes.
445  */
446 uint_t
447 page_num_user_pagesizes(int legacy)
448 {
449 	if (legacy)
450 		return (mmu_legacy_page_sizes);
451 	return (mmu_exported_page_sizes);
452 }
453 
454 uint_t
455 page_num_pagesizes(void)
456 {
457 	return (mmu_page_sizes);
458 }
459 
460 /*
461  * returns the number of base pagesize pages associated with szc
462  */
463 pgcnt_t
464 page_get_pagecnt(uint_t szc)
465 {
466 	if (szc >= mmu_page_sizes)
467 		panic("page_get_pagecnt: out of range %d", szc);
468 	return (hw_page_array[szc].hp_pgcnt);
469 }
470 
471 size_t
472 page_get_pagesize(uint_t szc)
473 {
474 	if (szc >= mmu_page_sizes)
475 		panic("page_get_pagesize: out of range %d", szc);
476 	return (hw_page_array[szc].hp_size);
477 }
478 
479 /*
480  * Return the size of a page based upon the index passed in.  An index of
481  * zero refers to the smallest page size in the system, and as index increases
482  * it refers to the next larger supported page size in the system.
483  * Note that szc and userszc may not be the same due to unsupported szc's on
484  * some systems.
485  */
486 size_t
487 page_get_user_pagesize(uint_t userszc)
488 {
489 	uint_t szc = USERSZC_2_SZC(userszc);
490 
491 	if (szc >= mmu_page_sizes)
492 		panic("page_get_user_pagesize: out of range %d", szc);
493 	return (hw_page_array[szc].hp_size);
494 }
495 
496 uint_t
497 page_get_shift(uint_t szc)
498 {
499 	if (szc >= mmu_page_sizes)
500 		panic("page_get_shift: out of range %d", szc);
501 	return (PAGE_GET_SHIFT(szc));
502 }
503 
504 uint_t
505 page_get_pagecolors(uint_t szc)
506 {
507 	if (szc >= mmu_page_sizes)
508 		panic("page_get_pagecolors: out of range %d", szc);
509 	return (PAGE_GET_PAGECOLORS(szc));
510 }
511 
512 /*
513  * this assigns the desired equivalent color after a split
514  */
515 uint_t
516 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
517     uint_t ncolor, uint_t ceq_mask)
518 {
519 	ASSERT(nszc > szc);
520 	ASSERT(szc < mmu_page_sizes);
521 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
522 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
523 
524 	color &= ceq_mask;
525 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
526 	return (color | (ncolor & ~ceq_mask));
527 }
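/*
 * Hedged example (values assumed): with ceq_mask = 0x3, a requested
 * color of 0x5 and a converted ncolor of 0xa, the result is
 * (0x5 & 0x3) | (0xa & ~0x3) = 0x1 | 0x8 = 0x9: the low (equivalent)
 * bits come from the requested color, the rest from the parent page.
 */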
528 
529 /*
530  * The interleaved_mnodes flag is set when mnodes overlap in
531  * the physbase..physmax range, but have disjoint slices.
532  * In this case hpm_counters is shared by all mnodes.
533  * This flag is set dynamically by the platform.
534  */
535 int interleaved_mnodes = 0;
536 
537 /*
538  * Called by startup().
539  * Size up the per page size free list counters based on physmax
540  * of each node and max_mem_nodes.
541  *
542  * If interleaved_mnodes is set we need to find the first mnode that
543  * exists. hpm_counters for the first mnode will then be shared by
544  * all other mnodes. If interleaved_mnodes is not set, just set
545  * first=mnode each time. That means there will be no sharing.
546  */
547 size_t
548 page_ctrs_sz(void)
549 {
550 	int	r;		/* region size */
551 	int	mnode;
552 	int	firstmn;	/* first mnode that exists */
553 	int	nranges;
554 	pfn_t	physbase;
555 	pfn_t	physmax;
556 	uint_t	ctrs_sz = 0;
557 	int 	i;
558 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
559 
560 	/*
561 	 * We need to determine how many page colors there are for each
562 	 * page size in order to allocate memory for any color specific
563 	 * arrays.
564 	 */
565 	for (i = 0; i < mmu_page_sizes; i++) {
566 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
567 	}
568 
569 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
570 
571 		pgcnt_t r_pgcnt;
572 		pfn_t   r_base;
573 		pgcnt_t r_align;
574 
575 		if (mem_node_config[mnode].exists == 0)
576 			continue;
577 
578 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
579 		nranges = MNODE_RANGE_CNT(mnode);
580 		mnode_nranges[mnode] = nranges;
581 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
582 
583 		/*
584 		 * determine size needed for page counter arrays with
585 		 * base aligned to large page size.
586 		 */
587 		for (r = 1; r < mmu_page_sizes; r++) {
588 			/* add in space for hpm_color_current */
589 			ctrs_sz += sizeof (size_t) *
590 			    colors_per_szc[r] * nranges;
591 
592 			if (firstmn != mnode)
593 				continue;
594 
595 			/* add in space for hpm_counters */
596 			r_align = page_get_pagecnt(r);
597 			r_base = physbase;
598 			r_base &= ~(r_align - 1);
599 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
600 
601 			/*
602 			 * Round up to always allocate on pointer sized
603 			 * boundaries.
604 			 */
605 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
606 			    sizeof (hpmctr_t *));
607 		}
608 	}
609 
610 	for (r = 1; r < mmu_page_sizes; r++) {
611 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
612 	}
613 
614 	/* add in space for page_ctrs_cands and pcc_color_free */
615 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
616 	    mmu_page_sizes * NPC_MUTEX;
617 
618 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
619 
620 		if (mem_node_config[mnode].exists == 0)
621 			continue;
622 
623 		nranges = mnode_nranges[mnode];
624 		ctrs_sz += sizeof (pcc_info_t) * nranges *
625 		    mmu_page_sizes * NPC_MUTEX;
626 		for (r = 1; r < mmu_page_sizes; r++) {
627 			ctrs_sz += sizeof (pgcnt_t) * nranges *
628 			    colors_per_szc[r] * NPC_MUTEX;
629 		}
630 	}
631 
632 	/* ctr_mutex */
633 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
634 
635 	/* size for page list counts */
636 	PLCNT_SZ(ctrs_sz);
637 
638 	/*
639 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
640 	 * address of the counters to ecache_alignsize boundary for every
641 	 * memory node.
642 	 */
643 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
644 }
645 
646 caddr_t
647 page_ctrs_alloc(caddr_t alloc_base)
648 {
649 	int	mnode;
650 	int	mrange, nranges;
651 	int	r;		/* region size */
652 	int	i;
653 	int	firstmn;	/* first mnode that exists */
654 	pfn_t	physbase;
655 	pfn_t	physmax;
656 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
657 
658 	/*
659 	 * We need to determine how many page colors there are for each
660 	 * page size in order to allocate memory for any color specific
661 	 * arrays.
662 	 */
663 	for (i = 0; i < mmu_page_sizes; i++) {
664 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
665 	}
666 
667 	for (r = 1; r < mmu_page_sizes; r++) {
668 		page_counters[r] = (hw_page_map_t *)alloc_base;
669 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
670 	}
671 
672 	/* page_ctrs_cands and pcc_color_free array */
673 	for (i = 0; i < NPC_MUTEX; i++) {
674 		for (r = 1; r < mmu_page_sizes; r++) {
675 
676 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
677 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
678 
679 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
680 				pcc_info_t *pi;
681 
682 				if (mem_node_config[mnode].exists == 0)
683 					continue;
684 
685 				nranges = mnode_nranges[mnode];
686 
687 				pi = (pcc_info_t *)alloc_base;
688 				alloc_base += sizeof (pcc_info_t) * nranges;
689 				page_ctrs_cands[i][r][mnode] = pi;
690 
691 				for (mrange = 0; mrange < nranges; mrange++) {
692 					pi->pcc_color_free =
693 					    (pgcnt_t *)alloc_base;
694 					alloc_base += sizeof (pgcnt_t) *
695 					    colors_per_szc[r];
696 					pi++;
697 				}
698 			}
699 		}
700 	}
701 
702 	/* ctr_mutex */
703 	for (i = 0; i < NPC_MUTEX; i++) {
704 		ctr_mutex[i] = (kmutex_t *)alloc_base;
705 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
706 	}
707 
708 	/* initialize page list counts */
709 	PLCNT_INIT(alloc_base);
710 
711 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
712 
713 		pgcnt_t r_pgcnt;
714 		pfn_t	r_base;
715 		pgcnt_t r_align;
716 		int	r_shift;
717 		int	nranges = mnode_nranges[mnode];
718 
719 		if (mem_node_config[mnode].exists == 0)
720 			continue;
721 
722 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
723 
724 		for (r = 1; r < mmu_page_sizes; r++) {
725 			/*
726 			 * the page_counters base has to be aligned to the
727 			 * page count of page size code r, otherwise the counts
728 			 * will cross large page boundaries.
729 			 */
730 			r_align = page_get_pagecnt(r);
731 			r_base = physbase;
732 			/* base needs to be aligned - lower to aligned value */
733 			r_base &= ~(r_align - 1);
734 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
735 			r_shift = PAGE_BSZS_SHIFT(r);
736 
737 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
738 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
739 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
740 			for (mrange = 0; mrange < nranges; mrange++) {
741 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
742 				    r, mrange) = (size_t *)alloc_base;
743 				alloc_base += sizeof (size_t) *
744 				    colors_per_szc[r];
745 			}
746 			for (i = 0; i < colors_per_szc[r]; i++) {
747 				uint_t color_mask = colors_per_szc[r] - 1;
748 				pfn_t  pfnum = r_base;
749 				size_t idx;
750 				int mrange;
751 				MEM_NODE_ITERATOR_DECL(it);
752 
753 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
754 				if (pfnum == (pfn_t)-1) {
755 					idx = 0;
756 				} else {
757 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
758 					    color_mask, color_mask, &it);
759 					idx = PNUM_TO_IDX(mnode, r, pfnum);
760 					idx = (idx >= r_pgcnt) ? 0 : idx;
761 				}
762 				for (mrange = 0; mrange < nranges; mrange++) {
763 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
764 					    r, i, mrange) = idx;
765 				}
766 			}
767 
768 			/* hpm_counters may be shared by all mnodes */
769 			if (firstmn == mnode) {
770 				PAGE_COUNTERS_COUNTERS(mnode, r) =
771 				    (hpmctr_t *)alloc_base;
772 				alloc_base +=
773 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
774 				    sizeof (hpmctr_t *));
775 			} else {
776 				PAGE_COUNTERS_COUNTERS(mnode, r) =
777 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
778 			}
779 
780 			/*
781 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
782 			 * satisfy the identity requirement.
783 			 * We should be able to go from one to the other
784 			 * and get consistent values.
785 			 */
786 			ASSERT(PNUM_TO_IDX(mnode, r,
787 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
788 			ASSERT(IDX_TO_PNUM(mnode, r,
789 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
790 		}
791 		/*
792 		 * Roundup the start address of the page_counters to
793 		 * cache aligned boundary for every memory node.
794 		 * page_ctrs_sz() has added some slop for these roundups.
795 		 */
796 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
797 		    L2CACHE_ALIGN);
798 	}
799 
800 	/* Initialize other page counter specific data structures. */
801 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
802 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
803 	}
804 
805 	return (alloc_base);
806 }
807 
808 /*
809  * Functions to adjust region counters for each size free list.
810  * The caller is responsible for acquiring the ctr_mutex lock if necessary,
811  * so these functions can be called during startup without locks.
812  */
813 /* ARGSUSED */
814 void
815 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
816 {
817 	ssize_t		r;	/* region size */
818 	ssize_t		idx;
819 	pfn_t		pfnum;
820 	int		lckidx;
821 
822 	ASSERT(mnode == PP_2_MEM_NODE(pp));
823 	ASSERT(mtype == PP_2_MTYPE(pp));
824 
825 	ASSERT(pp->p_szc < mmu_page_sizes);
826 
827 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
828 
829 	/* no counter update needed for largest page size */
830 	if (pp->p_szc >= mmu_page_sizes - 1) {
831 		return;
832 	}
833 
834 	r = pp->p_szc + 1;
835 	pfnum = pp->p_pagenum;
836 	lckidx = PP_CTR_LOCK_INDX(pp);
837 
838 	/*
839 	 * Increment the count of free pages for the current
840 	 * region. Continue looping up in region size incrementing
841  * count if the preceding region is full.
842 	 */
843 	while (r < mmu_page_sizes) {
844 		idx = PNUM_TO_IDX(mnode, r, pfnum);
845 
846 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
847 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
848 
849 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
850 			break;
851 		} else {
852 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
853 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
854 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
855 
856 			cand->pcc_pages_free++;
857 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
858 		}
859 		r++;
860 	}
861 }
862 
863 void
864 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
865 {
866 	int		lckidx = PP_CTR_LOCK_INDX(pp);
867 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
868 
869 	mutex_enter(lock);
870 	page_ctr_add_internal(mnode, mtype, pp, flags);
871 	mutex_exit(lock);
872 }
873 
874 void
875 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
876 {
877 	int		lckidx;
878 	ssize_t		r;	/* region size */
879 	ssize_t		idx;
880 	pfn_t		pfnum;
881 
882 	ASSERT(mnode == PP_2_MEM_NODE(pp));
883 	ASSERT(mtype == PP_2_MTYPE(pp));
884 
885 	ASSERT(pp->p_szc < mmu_page_sizes);
886 
887 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
888 
889 	/* no counter update needed for largest page size */
890 	if (pp->p_szc >= mmu_page_sizes - 1) {
891 		return;
892 	}
893 
894 	r = pp->p_szc + 1;
895 	pfnum = pp->p_pagenum;
896 	lckidx = PP_CTR_LOCK_INDX(pp);
897 
898 	/*
899 	 * Decrement the count of free pages for the current
900 	 * region. Continue looping up in region size decrementing
901  * count if the preceding region was full.
902 	 */
903 	while (r < mmu_page_sizes) {
904 		idx = PNUM_TO_IDX(mnode, r, pfnum);
905 
906 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
907 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
908 
909 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
910 			break;
911 		} else {
912 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
913 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
914 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
915 
916 			ASSERT(cand->pcc_pages_free != 0);
917 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
918 
919 			cand->pcc_pages_free--;
920 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
921 		}
922 		r++;
923 	}
924 }
925 
926 void
927 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
928 {
929 	int		lckidx = PP_CTR_LOCK_INDX(pp);
930 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
931 
932 	mutex_enter(lock);
933 	page_ctr_sub_internal(mnode, mtype, pp, flags);
934 	mutex_exit(lock);
935 }
936 
937 /*
938  * Adjust page counters following a memory attach, since typically the
939  * size of the array needs to change, and the PFN to counter index
940  * mapping needs to change.
941  *
942  * It is possible this mnode did not exist at startup. In that case
943  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
944  * to change (a theoretical possibility on x86), which means pcc_color_free
945  * arrays must be extended.
946  */
947 uint_t
948 page_ctrs_adjust(int mnode)
949 {
950 	pgcnt_t npgs;
951 	int	r;		/* region size */
952 	int	i;
953 	size_t	pcsz, old_csz;
954 	hpmctr_t *new_ctr, *old_ctr;
955 	pfn_t	oldbase, newbase;
956 	pfn_t	physbase, physmax;
957 	size_t	old_npgs;
958 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
959 	size_t	size_cache[MMU_PAGE_SIZES];
960 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
961 	size_t	*old_color_array[MAX_MNODE_MRANGES];
962 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
963 	pcc_info_t **cands_cache;
964 	pcc_info_t *old_pi, *pi;
965 	pgcnt_t *pgcntp;
966 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
967 	int cands_cache_nranges;
968 	int old_maxmrange, new_maxmrange;
969 	int rc = 0;
970 	int oldmnode;
971 
972 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
973 	    MMU_PAGE_SIZES, KM_NOSLEEP);
974 	if (cands_cache == NULL)
975 		return (ENOMEM);
976 
977 	i = -1;
978 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
979 
980 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
981 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
982 
983 	/* prepare to free non-null pointers on the way out */
984 	cands_cache_nranges = nranges;
985 	bzero(ctr_cache, sizeof (ctr_cache));
986 	bzero(color_cache, sizeof (color_cache));
987 
988 	/*
989 	 * We need to determine how many page colors there are for each
990 	 * page size in order to allocate memory for any color specific
991 	 * arrays.
992 	 */
993 	for (r = 0; r < mmu_page_sizes; r++) {
994 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
995 	}
996 
997 	/*
998 	 * Preallocate all of the new hpm_counters arrays as we can't
999 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1000 	 * If we can't allocate all of the arrays, undo our work so far
1001 	 * and return failure.
1002 	 */
1003 	for (r = 1; r < mmu_page_sizes; r++) {
1004 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
1005 		size_cache[r] = pcsz;
1006 		ctr_cache[r] = kmem_zalloc(pcsz *
1007 		    sizeof (hpmctr_t), KM_NOSLEEP);
1008 		if (ctr_cache[r] == NULL) {
1009 			rc = ENOMEM;
1010 			goto cleanup;
1011 		}
1012 	}
1013 
1014 	/*
1015 	 * Preallocate all of the new color current arrays as we can't
1016 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1017 	 * If we can't allocate all of the arrays, undo our work so far
1018 	 * and return failure.
1019 	 */
1020 	for (r = 1; r < mmu_page_sizes; r++) {
1021 		for (mrange = 0; mrange < nranges; mrange++) {
1022 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1023 			    colors_per_szc[r], KM_NOSLEEP);
1024 			if (color_cache[r][mrange] == NULL) {
1025 				rc = ENOMEM;
1026 				goto cleanup;
1027 			}
1028 		}
1029 	}
1030 
1031 	/*
1032 	 * Preallocate all of the new pcc_info_t arrays as we can't
1033 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1034 	 * If we can't allocate all of the arrays, undo our work so far
1035 	 * and return failure.
1036 	 */
1037 	for (r = 1; r < mmu_page_sizes; r++) {
1038 		for (i = 0; i < NPC_MUTEX; i++) {
1039 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1040 			    KM_NOSLEEP);
1041 			if (pi == NULL) {
1042 				rc = ENOMEM;
1043 				goto cleanup;
1044 			}
1045 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1046 
1047 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
1048 				pgcntp = kmem_zalloc(colors_per_szc[r] *
1049 				    sizeof (pgcnt_t), KM_NOSLEEP);
1050 				if (pgcntp == NULL) {
1051 					rc = ENOMEM;
1052 					goto cleanup;
1053 				}
1054 				pi->pcc_color_free = pgcntp;
1055 			}
1056 		}
1057 	}
1058 
1059 	/*
1060 	 * Grab the write lock to prevent others from walking these arrays
1061 	 * while we are modifying them.
1062 	 */
1063 	PAGE_CTRS_WRITE_LOCK(mnode);
1064 
1065 	/*
1066 	 * For interleaved mnodes, find the first mnode
1067 	 * with valid page counters since the current
1068 	 * mnode may have just been added and not have
1069 	 * valid page counters.
1070 	 */
1071 	if (interleaved_mnodes) {
1072 		for (i = 0; i < max_mem_nodes; i++)
1073 			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1074 				break;
1075 		ASSERT(i < max_mem_nodes);
1076 		oldmnode = i;
1077 	} else
1078 		oldmnode = mnode;
1079 
1080 	old_nranges = mnode_nranges[mnode];
1081 	cands_cache_nranges = old_nranges;
1082 	mnode_nranges[mnode] = nranges;
1083 	old_maxmrange = mnode_maxmrange[mnode];
1084 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1085 	new_maxmrange = mnode_maxmrange[mnode];
1086 
1087 	for (r = 1; r < mmu_page_sizes; r++) {
1088 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1089 		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1090 		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1091 		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1092 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1093 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1094 			old_color_array[mrange] =
1095 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1096 			    r, mrange);
1097 		}
1098 
1099 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1100 		new_ctr = ctr_cache[r];
1101 		ctr_cache[r] = NULL;
1102 		if (old_ctr != NULL &&
1103 		    (oldbase + old_npgs > newbase) &&
1104 		    (newbase + npgs > oldbase)) {
1105 			/*
1106 			 * Map the intersection of the old and new
1107 			 * counters into the new array.
1108 			 */
1109 			size_t offset;
1110 			if (newbase > oldbase) {
1111 				offset = (newbase - oldbase) >>
1112 				    PAGE_COUNTERS_SHIFT(mnode, r);
1113 				bcopy(old_ctr + offset, new_ctr,
1114 				    MIN(pcsz, (old_csz - offset)) *
1115 				    sizeof (hpmctr_t));
1116 			} else {
1117 				offset = (oldbase - newbase) >>
1118 				    PAGE_COUNTERS_SHIFT(mnode, r);
1119 				bcopy(old_ctr, new_ctr + offset,
1120 				    MIN(pcsz - offset, old_csz) *
1121 				    sizeof (hpmctr_t));
1122 			}
1123 		}
1124 
1125 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1126 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1127 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1128 
1129 		/* update shared hpm_counters in other mnodes */
1130 		if (interleaved_mnodes) {
1131 			for (i = 0; i < max_mem_nodes; i++) {
1132 				if ((i == mnode) ||
1133 				    (mem_node_config[i].exists == 0))
1134 					continue;
1135 				ASSERT(
1136 				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1137 				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1138 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1139 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1140 				PAGE_COUNTERS_BASE(i, r) = newbase;
1141 			}
1142 		}
1143 
1144 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1145 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1146 			    color_cache[r][mrange];
1147 			color_cache[r][mrange] = NULL;
1148 		}
1149 		/*
1150 		 * for now, just reset on these events as it's probably
1151 		 * not worthwhile to try and optimize this.
1152 		 */
1153 		for (i = 0; i < colors_per_szc[r]; i++) {
1154 			uint_t color_mask = colors_per_szc[r] - 1;
1155 			int mlo = interleaved_mnodes ? 0 : mnode;
1156 			int mhi = interleaved_mnodes ? max_mem_nodes :
1157 			    (mnode + 1);
1158 			int m;
1159 			pfn_t  pfnum;
1160 			size_t idx;
1161 			MEM_NODE_ITERATOR_DECL(it);
1162 
1163 			for (m = mlo; m < mhi; m++) {
1164 				if (mem_node_config[m].exists == 0)
1165 					continue;
1166 				pfnum = newbase;
1167 				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1168 				if (pfnum == (pfn_t)-1) {
1169 					idx = 0;
1170 				} else {
1171 					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1172 					    color_mask, color_mask, &it);
1173 					idx = PNUM_TO_IDX(m, r, pfnum);
1174 					idx = (idx < pcsz) ? idx : 0;
1175 				}
1176 				for (mrange = 0; mrange < nranges; mrange++) {
1177 					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1178 					    r, mrange) != NULL)
1179 						PAGE_COUNTERS_CURRENT_COLOR(m,
1180 						    r, i, mrange) = idx;
1181 				}
1182 			}
1183 		}
1184 
1185 		/* cache info for freeing out of the critical path */
1186 		if ((caddr_t)old_ctr >= kernelheap &&
1187 		    (caddr_t)old_ctr < ekernelheap) {
1188 			ctr_cache[r] = old_ctr;
1189 			size_cache[r] = old_csz;
1190 		}
1191 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1192 			size_t *tmp = old_color_array[mrange];
1193 			if ((caddr_t)tmp >= kernelheap &&
1194 			    (caddr_t)tmp < ekernelheap) {
1195 				color_cache[r][mrange] = tmp;
1196 			}
1197 		}
1198 		/*
1199 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1200 		 * satisfy the identity requirement.
1201 		 * We should be able to go from one to the other
1202 		 * and get consistent values.
1203 		 */
1204 		ASSERT(PNUM_TO_IDX(mnode, r,
1205 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
1206 		ASSERT(IDX_TO_PNUM(mnode, r,
1207 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1208 
1209 		/* pcc_info_t and pcc_color_free */
1210 		for (i = 0; i < NPC_MUTEX; i++) {
1211 			pcc_info_t *epi;
1212 			pcc_info_t *eold_pi;
1213 
1214 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1215 			old_pi = page_ctrs_cands[i][r][mnode];
1216 			page_ctrs_cands[i][r][mnode] = pi;
1217 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1218 
1219 			/* preserve old pcc_color_free values, if any */
1220 			if (old_pi == NULL)
1221 				continue;
1222 
1223 			/*
1224 			 * when/if x86 does DR, must account for
1225 			 * possible change in range index when
1226 			 * preserving pcc_info
1227 			 */
1228 			epi = &pi[nranges];
1229 			eold_pi = &old_pi[old_nranges];
1230 			if (new_maxmrange > old_maxmrange) {
1231 				pi += new_maxmrange - old_maxmrange;
1232 			} else if (new_maxmrange < old_maxmrange) {
1233 				old_pi += old_maxmrange - new_maxmrange;
1234 			}
1235 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1236 				pcc_info_t tmp = *pi;
1237 				*pi = *old_pi;
1238 				*old_pi = tmp;
1239 			}
1240 		}
1241 	}
1242 	PAGE_CTRS_WRITE_UNLOCK(mnode);
1243 
1244 	/*
1245 	 * Now that we have dropped the write lock, it is safe to free all
1246 	 * of the memory we have cached above.
1247 	 * We come thru here to free memory when pre-alloc fails, and also to
1248 	 * free old pointers which were recorded while locked.
1249 	 */
1250 cleanup:
1251 	for (r = 1; r < mmu_page_sizes; r++) {
1252 		if (ctr_cache[r] != NULL) {
1253 			kmem_free(ctr_cache[r],
1254 			    size_cache[r] * sizeof (hpmctr_t));
1255 		}
1256 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1257 			if (color_cache[r][mrange] != NULL) {
1258 				kmem_free(color_cache[r][mrange],
1259 				    colors_per_szc[r] * sizeof (size_t));
1260 			}
1261 		}
1262 		for (i = 0; i < NPC_MUTEX; i++) {
1263 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1264 			if (pi == NULL)
1265 				continue;
1266 			nr = cands_cache_nranges;
1267 			for (mrange = 0; mrange < nr; mrange++, pi++) {
1268 				pgcntp = pi->pcc_color_free;
1269 				if (pgcntp == NULL)
1270 					continue;
1271 				if ((caddr_t)pgcntp >= kernelheap &&
1272 				    (caddr_t)pgcntp < ekernelheap) {
1273 					kmem_free(pgcntp,
1274 					    colors_per_szc[r] *
1275 					    sizeof (pgcnt_t));
1276 				}
1277 			}
1278 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1279 			if ((caddr_t)pi >= kernelheap &&
1280 			    (caddr_t)pi < ekernelheap) {
1281 				kmem_free(pi, nr * sizeof (pcc_info_t));
1282 			}
1283 		}
1284 	}
1285 
1286 	kmem_free(cands_cache,
1287 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1288 	return (rc);
1289 }
1290 
1291 /*
1292  * Cleanup the hpm_counters field in the page counters
1293  * array.
1294  */
1295 void
1296 page_ctrs_cleanup(void)
1297 {
1298 	int r;	/* region size */
1299 	int i;	/* mnode index */
1300 
1301 	/*
1302 	 * Get the page counters write lock while we are
1303 	 * setting the page hpm_counters field to NULL
1304 	 * for non-existent mnodes.
1305 	 */
1306 	for (i = 0; i < max_mem_nodes; i++) {
1307 		PAGE_CTRS_WRITE_LOCK(i);
1308 		if (mem_node_config[i].exists) {
1309 			PAGE_CTRS_WRITE_UNLOCK(i);
1310 			continue;
1311 		}
1312 		for (r = 1; r < mmu_page_sizes; r++) {
1313 			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1314 		}
1315 		PAGE_CTRS_WRITE_UNLOCK(i);
1316 	}
1317 }
1318 
1319 #ifdef DEBUG
1320 
1321 /*
1322  * confirm pp is a large page corresponding to szc
1323  */
1324 void
1325 chk_lpg(page_t *pp, uchar_t szc)
1326 {
1327 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1328 	uint_t noreloc;
1329 
1330 	if (npgs == 1) {
1331 		ASSERT(pp->p_szc == 0);
1332 		ASSERT(pp->p_next == pp);
1333 		ASSERT(pp->p_prev == pp);
1334 		return;
1335 	}
1336 
1337 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1338 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1339 
1340 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1341 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1342 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1343 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1344 
1345 	/*
1346 	 * Check list of pages.
1347 	 */
1348 	noreloc = PP_ISNORELOC(pp);
1349 	while (npgs--) {
1350 		if (npgs != 0) {
1351 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1352 			ASSERT(pp->p_next == (pp + 1));
1353 		}
1354 		ASSERT(pp->p_szc == szc);
1355 		ASSERT(PP_ISFREE(pp));
1356 		ASSERT(PP_ISAGED(pp));
1357 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1358 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1359 		ASSERT(pp->p_vnode  == NULL);
1360 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1361 
1362 		pp = pp->p_next;
1363 	}
1364 }
1365 #endif /* DEBUG */
1366 
1367 void
1368 page_freelist_lock(int mnode)
1369 {
1370 	int i;
1371 	for (i = 0; i < NPC_MUTEX; i++) {
1372 		mutex_enter(FPC_MUTEX(mnode, i));
1373 		mutex_enter(CPC_MUTEX(mnode, i));
1374 	}
1375 }
1376 
1377 void
1378 page_freelist_unlock(int mnode)
1379 {
1380 	int i;
1381 	for (i = 0; i < NPC_MUTEX; i++) {
1382 		mutex_exit(FPC_MUTEX(mnode, i));
1383 		mutex_exit(CPC_MUTEX(mnode, i));
1384 	}
1385 }
1386 
1387 /*
1388  * add pp to the specified page list. Defaults to head of the page list
1389  * unless PG_LIST_TAIL is specified.
1390  */
1391 void
1392 page_list_add(page_t *pp, int flags)
1393 {
1394 	page_t		**ppp;
1395 	kmutex_t	*pcm;
1396 	uint_t		bin, mtype;
1397 	int		mnode;
1398 
1399 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1400 	ASSERT(PP_ISFREE(pp));
1401 	ASSERT(!hat_page_is_mapped(pp));
1402 	ASSERT(hat_page_getshare(pp) == 0);
1403 
1404 	/*
1405 	 * Large pages should be freed via page_list_add_pages().
1406 	 */
1407 	ASSERT(pp->p_szc == 0);
1408 
1409 	/*
1410 	 * Don't need to lock the freelist first here
1411 	 * because the page isn't on the freelist yet.
1412 	 * This means p_szc can't change on us.
1413 	 */
1414 
1415 	bin = PP_2_BIN(pp);
1416 	mnode = PP_2_MEM_NODE(pp);
1417 	mtype = PP_2_MTYPE(pp);
1418 
1419 	if (flags & PG_LIST_ISINIT) {
1420 		/*
1421 		 * PG_LIST_ISINIT is set during system startup (i.e. single
1422 		 * threaded), so add the page to the free list and to the
1423 		 * free region counters w/o any locking
1424 		 */
1425 		ASSERT(!PP_ISKFLT(pp));
1426 		ppp = PAGE_FREELISTP(PFLT_USER, mnode, 0, bin, mtype);
1427 
1428 		/* inline version of page_add() */
1429 		if (*ppp != NULL) {
1430 			pp->p_next = *ppp;
1431 			pp->p_prev = (*ppp)->p_prev;
1432 			(*ppp)->p_prev = pp;
1433 			pp->p_prev->p_next = pp;
1434 		} else
1435 			*ppp = pp;
1436 
1437 		page_ctr_add_internal(mnode, mtype, pp, flags);
1438 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1439 	} else {
1440 		pcm = PC_BIN_MUTEX(PP_ISKFLT(pp), mnode, bin, flags);
1441 
1442 		if (flags & PG_FREE_LIST) {
1443 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1444 			ASSERT(PP_ISAGED(pp));
1445 			ppp = PAGE_FREELISTP(PP_ISKFLT(pp), mnode, 0,
1446 			    bin, mtype);
1447 		} else {
1448 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1449 			ASSERT(pp->p_vnode);
1450 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1451 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1452 		}
1453 		mutex_enter(pcm);
1454 		page_add(ppp, pp);
1455 
1456 		if (flags & PG_LIST_TAIL)
1457 			*ppp = (*ppp)->p_next;
1458 		/*
1459 		 * Add counters before releasing pcm mutex to avoid a race with
1460 		 * page_freelist_coalesce and page_freelist_split.
1461 		 */
1462 		page_ctr_add(mnode, mtype, pp, flags);
1463 		mutex_exit(pcm);
1464 	}
1465 
1466 
1467 #if defined(__sparc)
1468 	if (PP_ISNORELOC(pp)) {
1469 		kcage_freemem_add(1);
1470 	}
1471 #elif defined(__amd64) && !defined(__xpv)
1472 	if (PP_ISKFLT(pp)) {
1473 		kflt_freemem_add(1);
1474 		if (PP_ISUSERKFLT(pp)) {
1475 			ASSERT(kflt_user_alloc > 0);
1476 			atomic_add_long(&kflt_user_alloc, -1);
1477 			PP_CLRUSERKFLT(pp);
1478 		}
1479 	}
1480 #endif /* __sparc */
1481 	/*
1482 	 * It is up to the caller to unlock the page!
1483 	 */
1484 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1485 }
1486 
1487 
1488 #ifdef __sparc
1489 /*
1490  * This routine is only used by kcage_init during system startup.
1491  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1492  * without the overhead of taking locks and updating counters.
1493  */
1494 void
1495 page_list_noreloc_startup(page_t *pp)
1496 {
1497 	page_t		**ppp;
1498 	uint_t		bin;
1499 	int		mnode;
1500 	int		mtype;
1501 	int		flags = 0;
1502 
1503 	/*
1504 	 * If this is a large page on the freelist then
1505 	 * break it up into smaller pages.
1506 	 */
1507 	if (pp->p_szc != 0)
1508 		page_boot_demote(pp);
1509 
1510 	/*
1511 	 * Get list page is currently on.
1512 	 */
1513 	bin = PP_2_BIN(pp);
1514 	mnode = PP_2_MEM_NODE(pp);
1515 	mtype = PP_2_MTYPE(pp);
1516 	ASSERT(mtype == MTYPE_RELOC);
1517 	ASSERT(pp->p_szc == 0);
1518 
1519 	if (PP_ISAGED(pp)) {
1520 		ASSERT(!PP_ISKFLT(pp));
1521 		ppp = PAGE_FREELISTP(PFLT_USER, mnode, 0, bin, mtype);
1522 		flags |= PG_FREE_LIST;
1523 	} else {
1524 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1525 		flags |= PG_CACHE_LIST;
1526 	}
1527 
1528 	ASSERT(*ppp != NULL);
1529 
1530 	/*
1531 	 * Delete page from current list.
1532 	 */
1533 	if (*ppp == pp)
1534 		*ppp = pp->p_next;		/* go to next page */
1535 	if (*ppp == pp) {
1536 		*ppp = NULL;			/* page list is gone */
1537 	} else {
1538 		pp->p_prev->p_next = pp->p_next;
1539 		pp->p_next->p_prev = pp->p_prev;
1540 	}
1541 
1542 	/*
1543 	 * Decrement page counters
1544 	 */
1545 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1546 
1547 	/*
1548 	 * Set no reloc for cage initted pages.
1549 	 */
1550 	PP_SETNORELOC(pp);
1551 
1552 	mtype = PP_2_MTYPE(pp);
1553 	ASSERT(mtype == MTYPE_NORELOC);
1554 
1555 	/*
1556 	 * Get new list for page.
1557 	 */
1558 	if (PP_ISAGED(pp)) {
1559 		ASSERT(!PP_ISKFLT(pp));
1560 		ppp = PAGE_FREELISTP(PFLT_USER, mnode, 0, bin, mtype);
1561 	} else {
1562 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1563 	}
1564 
1565 	/*
1566 	 * Insert page on new list.
1567 	 */
1568 	if (*ppp == NULL) {
1569 		*ppp = pp;
1570 		pp->p_next = pp->p_prev = pp;
1571 	} else {
1572 		pp->p_next = *ppp;
1573 		pp->p_prev = (*ppp)->p_prev;
1574 		(*ppp)->p_prev = pp;
1575 		pp->p_prev->p_next = pp;
1576 	}
1577 
1578 	/*
1579 	 * Increment page counters
1580 	 */
1581 	page_ctr_add_internal(mnode, mtype, pp, flags);
1582 
1583 	/*
1584 	 * Update cage freemem counter
1585 	 */
1586 	atomic_add_long(&kcage_freemem, 1);
1587 }
1588 #else	/* __sparc */
1589 
1590 /* ARGSUSED */
1591 void
1592 page_list_noreloc_startup(page_t *pp)
1593 {
1594 	panic("page_list_noreloc_startup: should be here only for sparc");
1595 }
1596 #endif
1597 
1598 void
1599 page_list_add_pages(page_t *pp, int flags)
1600 {
1601 	kmutex_t *pcm;
1602 	pgcnt_t	pgcnt;
1603 	uint_t	bin, mtype, i;
1604 	int	mnode;
1605 
1606 	/* default to freelist/head */
1607 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1608 
1609 	CHK_LPG(pp, pp->p_szc);
1610 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1611 
1612 	bin = PP_2_BIN(pp);
1613 	mnode = PP_2_MEM_NODE(pp);
1614 	mtype = PP_2_MTYPE(pp);
1615 
1616 	if (flags & PG_LIST_ISINIT) {
1617 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1618 		page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc,
1619 		    bin, mtype), pp);
1620 		ASSERT(!PP_ISNORELOC(pp));
1621 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1622 	} else {
1623 
1624 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1625 
1626 		pcm = PC_BIN_MUTEX(PFLT_USER, mnode, bin, PG_FREE_LIST);
1627 
1628 		mutex_enter(pcm);
1629 		ASSERT(!PP_ISKFLT(pp));
1630 		page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc,
1631 		    bin, mtype), pp);
1632 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1633 		mutex_exit(pcm);
1634 
1635 		pgcnt = page_get_pagecnt(pp->p_szc);
1636 #if defined(__sparc)
1637 		if (PP_ISNORELOC(pp)) {
1638 			kcage_freemem_add(pgcnt);
1639 		}
1640 #elif defined(__amd64) && !defined(__xpv)
1641 		ASSERT(!PP_ISKFLT(pp));
1642 #endif /* __sparc */
1643 		for (i = 0; i < pgcnt; i++, pp++)
1644 			page_unlock_nocapture(pp);
1645 	}
1646 }
1647 
1648 /*
1649  * During boot, need to demote a large page to base
1650  * pagesize pages for seg_kmem for use in boot_alloc()
1651  */
1652 void
1653 page_boot_demote(page_t *pp)
1654 {
1655 	ASSERT(pp->p_szc != 0);
1656 	ASSERT(PP_ISFREE(pp));
1657 	ASSERT(PP_ISAGED(pp));
1658 
1659 	(void) page_demote(PP_2_MEM_NODE(pp),
1660 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1661 	    PC_FREE);
1662 
1663 	ASSERT(PP_ISFREE(pp));
1664 	ASSERT(PP_ISAGED(pp));
1665 	ASSERT(pp->p_szc == 0);
1666 }
1667 
1668 /*
1669  * Take a particular page off of whatever freelist the page
1670  * is claimed to be on.
1671  *
1672  * NOTE: Only used for PAGESIZE pages.
1673  */
1674 void
1675 page_list_sub(page_t *pp, int flags)
1676 {
1677 	int		bin;
1678 	uint_t		mtype;
1679 	int		mnode;
1680 	kmutex_t	*pcm;
1681 	page_t		**ppp;
1682 
1683 	ASSERT(PAGE_EXCL(pp));
1684 	ASSERT(PP_ISFREE(pp));
1685 
1686 	/*
1687 	 * The p_szc field can only be changed by page_promote()
1688 	 * and page_demote(). Only free pages can be promoted and
1689 	 * demoted and the free list MUST be locked during these
1690 	 * operations. So to prevent a race in page_list_sub()
1691 	 * between computing which bin of the freelist lock to
1692 	 * grab and actually grabbing the lock we check again that
1693 	 * the bin we locked is still the correct one. Notice that
1694 	 * the p_szc field could have actually changed on us but
1695 	 * if the bin happens to still be the same we are safe.
1696 	 */
1697 try_again:
1698 	bin = PP_2_BIN(pp);
1699 	mnode = PP_2_MEM_NODE(pp);
1700 	pcm = PC_BIN_MUTEX(PP_ISKFLT(pp), mnode, bin, flags);
1701 	mutex_enter(pcm);
1702 	if (PP_2_BIN(pp) != bin) {
1703 		mutex_exit(pcm);
1704 		goto try_again;
1705 	}
1706 	mtype = PP_2_MTYPE(pp);
1707 
1708 	if (flags & PG_FREE_LIST) {
1709 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1710 		ASSERT(PP_ISAGED(pp));
1711 		ppp = PAGE_FREELISTP(PP_ISKFLT(pp), mnode, pp->p_szc,
1712 		    bin, mtype);
1713 	} else {
1714 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1715 		ASSERT(!PP_ISAGED(pp));
1716 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1717 	}
1718 
1719 	/*
1720 	 * Common PAGESIZE case.
1721 	 *
1722 	 * Note that we locked the freelist. This prevents
1723 	 * any page promotion/demotion operations. Therefore
1724 	 * the p_szc will not change until we drop pcm mutex.
1725 	 */
1726 	if (pp->p_szc == 0) {
1727 		page_sub(ppp, pp);
1728 		/*
1729 		 * Subtract counters before releasing pcm mutex
1730 		 * to avoid race with page_freelist_coalesce.
1731 		 */
1732 		page_ctr_sub(mnode, mtype, pp, flags);
1733 		mutex_exit(pcm);
1734 
1735 #if defined(__sparc)
1736 		if (PP_ISNORELOC(pp)) {
1737 			kcage_freemem_sub(1);
1738 		}
1739 #elif defined(__amd64) && !defined(__xpv)
1740 		if (PP_ISKFLT(pp)) {
1741 			kflt_freemem_sub(1);
1742 		}
1743 #endif /* __sparc */
1744 		return;
1745 	}
1746 
1747 	/*
1748 	 * Large pages on the cache list are not supported.
1749 	 */
1750 	if (flags & PG_CACHE_LIST)
1751 		panic("page_list_sub: large page on cachelist");
1752 
1753 	/*
1754 	 * Slow but rare.
1755 	 *
1756 	 * Somebody wants this particular page which is part
1757 	 * of a large page. In this case we just demote the page
1758 	 * if it's on the freelist.
1759 	 *
1760 	 * We have to drop pcm before locking the entire freelist.
1761 	 * Once we have re-locked the freelist check to make sure
1762 	 * the page hasn't already been demoted or completely
1763 	 * freed.
1764 	 */
1765 	mutex_exit(pcm);
1766 	page_freelist_lock(mnode);
1767 	if (pp->p_szc != 0) {
1768 		/*
1769 		 * Large page is on freelist.
1770 		 */
1771 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1772 		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1773 	}
1774 	ASSERT(PP_ISFREE(pp));
1775 	ASSERT(PP_ISAGED(pp));
1776 	ASSERT(pp->p_szc == 0);
1777 
1778 	/* Large pages on the kernel freelist are not supported. */
1779 	ASSERT(!PP_ISKFLT(pp));
1780 
1781 	/*
1782 	 * Subtract counters before releasing pcm mutex
1783 	 * to avoid race with page_freelist_coalesce.
1784 	 */
1785 	bin = PP_2_BIN(pp);
1786 	mtype = PP_2_MTYPE(pp);
1787 	ppp = PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc, bin, mtype);
1788 	page_sub(ppp, pp);
1789 	page_ctr_sub(mnode, mtype, pp, flags);
1790 	page_freelist_unlock(mnode);
1791 
1792 #if defined(__sparc)
1793 	if (PP_ISNORELOC(pp)) {
1794 		kcage_freemem_sub(1);
1795 	}
1796 #endif /* __sparc */
1797 }
1798 
1799 void
1800 page_list_sub_pages(page_t *pp, uint_t szc)
1801 {
1802 	kmutex_t *pcm;
1803 	uint_t	bin, mtype;
1804 	int	mnode;
1805 
1806 	ASSERT(PAGE_EXCL(pp));
1807 	ASSERT(PP_ISFREE(pp));
1808 	ASSERT(PP_ISAGED(pp));
1809 
1810 	/*
1811 	 * See comment in page_list_sub().
1812 	 */
1813 try_again:
1814 	bin = PP_2_BIN(pp);
1815 	mnode = PP_2_MEM_NODE(pp);
1816 	pcm = PC_BIN_MUTEX(PP_ISKFLT(pp), mnode, bin, PG_FREE_LIST);
1817 	mutex_enter(pcm);
1818 	if (PP_2_BIN(pp) != bin) {
1819 		mutex_exit(pcm);
1820 		goto	try_again;
1821 	}
1822 
1823 	/*
1824 	 * If we're called with a page larger than szc or it got
1825 	 * promoted above szc before we locked the freelist then
1826 	 * drop pcm and re-lock entire freelist. If page still larger
1827 	 * than szc then demote it.
1828 	 */
1829 	if (pp->p_szc > szc) {
1830 		mutex_exit(pcm);
1831 		pcm = NULL;
1832 		page_freelist_lock(mnode);
1833 		if (pp->p_szc > szc) {
1834 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1835 			(void) page_demote(mnode,
1836 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1837 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1838 		}
1839 		bin = PP_2_BIN(pp);
1840 	}
1841 	ASSERT(PP_ISFREE(pp));
1842 	ASSERT(PP_ISAGED(pp));
1843 	ASSERT(pp->p_szc <= szc);
1844 	ASSERT(pp == PP_PAGEROOT(pp));
1845 	ASSERT(!PP_ISKFLT(pp));
1846 
1847 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1848 
1849 	mtype = PP_2_MTYPE(pp);
1850 	if (pp->p_szc != 0) {
1851 		page_vpsub(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc,
1852 		    bin, mtype), pp);
1853 		CHK_LPG(pp, pp->p_szc);
1854 	} else {
1855 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1856 		page_sub(PAGE_FREELISTP(PFLT_USER, mnode, pp->p_szc,
1857 		    bin, mtype), pp);
1858 	}
1859 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1860 
1861 	if (pcm != NULL) {
1862 		mutex_exit(pcm);
1863 	} else {
1864 		page_freelist_unlock(mnode);
1865 	}
1866 
1867 #if defined(__sparc)
1868 	if (PP_ISNORELOC(pp)) {
1869 		pgcnt_t	pgcnt;
1870 
1871 		pgcnt = page_get_pagecnt(pp->p_szc);
1872 		kcage_freemem_sub(pgcnt);
1873 	}
1874 #endif /* __sparc */
1875 }
1876 
1877 /*
1878  * Add the page to the front of a linked list of pages
1879  * using the p_next & p_prev pointers for the list.
1880  * The caller is responsible for protecting the list pointers.
1881  */
1882 void
1883 mach_page_add(page_t **ppp, page_t *pp)
1884 {
1885 	if (*ppp == NULL) {
1886 		pp->p_next = pp->p_prev = pp;
1887 	} else {
1888 		pp->p_next = *ppp;
1889 		pp->p_prev = (*ppp)->p_prev;
1890 		(*ppp)->p_prev = pp;
1891 		pp->p_prev->p_next = pp;
1892 	}
1893 	*ppp = pp;
1894 }
1895 
1896 /*
1897  * Remove this page from a linked list of pages
1898  * using the p_next & p_prev pointers for the list.
1899  *
1900  * The caller is responsible for protecting the list pointers.
1901  */
1902 void
1903 mach_page_sub(page_t **ppp, page_t *pp)
1904 {
1905 	ASSERT(PP_ISFREE(pp));
1906 
1907 	if (*ppp == NULL || pp == NULL)
1908 		panic("mach_page_sub");
1909 
1910 	if (*ppp == pp)
1911 		*ppp = pp->p_next;		/* go to next page */
1912 
1913 	if (*ppp == pp)
1914 		*ppp = NULL;			/* page list is gone */
1915 	else {
1916 		pp->p_prev->p_next = pp->p_next;
1917 		pp->p_next->p_prev = pp->p_prev;
1918 	}
1919 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1920 }
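/*
 * Illustrative sketch, not part of the original code: mach_page_add() and
 * mach_page_sub() maintain a circular, doubly linked list threaded through
 * the p_next/p_prev fields.  Assuming pp1 and pp2 are free pages owned by
 * the caller, building and draining a private list might look like:
 *
 *	page_t *list = NULL;
 *	mach_page_add(&list, pp1);	list: pp1
 *	mach_page_add(&list, pp2);	list: pp2 -> pp1 (pp2 is the new head)
 *	mach_page_sub(&list, pp2);	list: pp1 again
 *
 * The caller must hold whatever lock protects the list (for the freelists
 * in this file, the pcm bin mutex or the per-mnode freelist lock).
 */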
1921 
1922 /*
1923  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1924  */
1925 void
1926 page_promote_size(page_t *pp, uint_t cur_szc)
1927 {
1928 	pfn_t pfn;
1929 	int mnode;
1930 	int idx;
1931 	int new_szc = cur_szc + 1;
1932 	int full = FULL_REGION_CNT(new_szc);
1933 
1934 	pfn = page_pptonum(pp);
1935 	mnode = PFN_2_MEM_NODE(pfn);
1936 
1937 	page_freelist_lock(mnode);
1938 
1939 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1940 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1941 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1942 
1943 	page_freelist_unlock(mnode);
1944 }
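/*
 * Illustrative note, not part of the original code: page_promote_size()
 * only attempts a promotion when the page_counters entry for the enclosing
 * new_szc region has reached FULL_REGION_CNT(new_szc), i.e. every
 * constituent region of the next smaller size within it is already free.
 */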
1945 
1946 static uint_t page_promote_err;
1947 static uint_t page_promote_noreloc_err;
1948 static uint_t page_promote_kflt_err;
1949 /*
1950  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1951  * for the given mnode starting at pfnum. Pages involved are on the freelist
1952  * before the call and may be returned to the caller if requested, otherwise
1953  * they will be placed back on the freelist.
1954  * If flags is PC_ALLOC, then the large page will be returned to the user in
1955  * a state which is consistent with a page being taken off the freelist.  If
1956  * we failed to lock the new large page, then we will return NULL to the
1957  * caller and put the large page on the freelist instead.
1958  * If flags is PC_FREE, then the large page will be placed on the freelist,
1959  * and NULL will be returned.
1960  * If the PC_KFLT_EXPORT flag is set, the large page will be returned to the
1961  * caller unlocked, as the caller is going to put it on the user page
1962  * freelist.
1963  * The caller is responsible for locking the freelist as well as any other
1964  * accounting which needs to be done for a returned page.
1965  *
1966  * RFE: For performance pass in pp instead of pfnum so
1967  * 	we can avoid excessive calls to page_numtopp_nolock().
1968  *	This would depend on an assumption that all contiguous
1969  *	pages are in the same memseg so we can just add/dec
1970  *	our pp.
1971  *
1972  * Lock ordering:
1973  *
1974  *	There is a potential but rare deadlock situation
1975  *	for page promotion and demotion operations. The problem
1976  *	is there are two paths into the freelist manager and
1977  *	they have different lock orders:
1978  *
1979  *	page_create()
1980  *		lock freelist
1981  *		page_lock(EXCL)
1982  *		unlock freelist
1983  *		return
1984  *		caller drops page_lock
1985  *
1986  *	page_free() and page_reclaim()
1987  *		caller grabs page_lock(EXCL)
1988  *
1989  *		lock freelist
1990  *		unlock freelist
1991  *		drop page_lock
1992  *
1993  *	What prevents a thread in page_create() from deadlocking
1994  *	with a thread freeing or reclaiming the same page is the
1995  *	page_trylock() in page_get_freelist(). If the trylock fails
1996  *	it skips the page.
1997  *
1998  *	The lock ordering for promotion and demotion is the same as
1999  *	for page_create(). Since the same deadlock could occur during
2000  *	page promotion and freeing or reclaiming of a page on the
2001  *	cache list, we might have to fail the operation and undo what
2002  *	we have done so far. Again, this is rare.
2003  */
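/*
 * Illustrative sketch, not part of the original code: the coalescing path
 * in page_freelist_coalesce() below invokes this routine as
 *
 *	ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
 *
 * with the freelist of 'mnode' already locked, and gets back either an
 * exclusively locked large page or NULL if the promotion was undone.
 */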
2004 page_t *
2005 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
2006 {
2007 	page_t		*pp, *pplist, *tpp, *start_pp;
2008 	pgcnt_t		new_npgs, npgs;
2009 	uint_t		bin;
2010 	pgcnt_t		tmpnpgs, pages_left;
2011 	uint_t		noreloc;
2012 	int 		which_list;
2013 	ulong_t		index;
2014 	kmutex_t	*phm;
2015 
2016 	/*
2017 	 * General algorithm:
2018 	 * Find the starting page
2019 	 * Walk each page struct removing it from the freelist,
2020 	 * and linking it to all the other pages removed.
2021 	 * Once all pages are off the freelist,
2022 	 * walk the list, modifying p_szc to new_szc and whatever
2023 	 * other info needs to be set up to create a large free page.
2024 	 * According to the flags, either return the page or put it
2025 	 * on the freelist.
2026 	 */
2027 
2028 	start_pp = page_numtopp_nolock(pfnum);
2029 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
2030 	new_npgs = page_get_pagecnt(new_szc);
2031 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
2032 
2033 	/* don't return page of the wrong mtype */
2034 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
2035 		return (NULL);
2036 
2037 	/*
2038 	 * Loop through smaller pages to confirm that all pages
2039 	 * give the same result for PP_ISNORELOC().
2040 	 * We can check this reliably here as the protocol for setting
2041 	 * P_NORELOC requires pages to be taken off the free list first.
2042 	 */
2043 	noreloc = PP_ISNORELOC(start_pp);
2044 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2045 		if (noreloc != PP_ISNORELOC(pp)) {
2046 			page_promote_noreloc_err++;
2047 			page_promote_err++;
2048 			return (NULL);
2049 		}
2050 
2051 		/*
2052 		 * page_promote() can only legitimately be called for
2053 		 * pages on the kernel freelist from the kflt_export()
2054 		 * routine, which sets the PC_KFLT_EXPORT flag.
2055 		 */
2056 		if (PP_ISKFLT(pp) && !(flags & PC_KFLT_EXPORT)) {
2057 			page_promote_kflt_err++;
2058 			page_promote_err++;
2059 			return (NULL);
2060 		}
2061 	}
2062 
2063 	pages_left = new_npgs;
2064 	pplist = NULL;
2065 	pp = start_pp;
2066 
2067 	/* Loop around coalescing the smaller pages into a big page. */
2068 	while (pages_left) {
2069 		/*
2070 		 * Remove from the freelist.
2071 		 */
2072 		ASSERT(PP_ISFREE(pp));
2073 		bin = PP_2_BIN(pp);
2074 		ASSERT(mnode == PP_2_MEM_NODE(pp));
2075 		mtype = PP_2_MTYPE(pp);
2076 		if (PP_ISAGED(pp)) {
2077 
2078 			/*
2079 			 * PG_FREE_LIST
2080 			 */
2081 			if (pp->p_szc) {
2082 				page_vpsub(PAGE_FREELISTP(PFLT_USER, mnode,
2083 				    pp->p_szc, bin, mtype), pp);
2084 			} else {
2085 				ASSERT(!PP_ISKFLT(pp) ||
2086 				    (flags & PC_KFLT_EXPORT));
2087 				mach_page_sub(PAGE_FREELISTP(PP_ISKFLT(pp),
2088 				    mnode, 0, bin, mtype), pp);
2089 			}
2090 			which_list = PG_FREE_LIST;
2091 		} else {
2092 			ASSERT(pp->p_szc == 0);
2093 
2094 			/*
2095 			 * PG_CACHE_LIST
2096 			 *
2097 			 * Since this page comes from the
2098 			 * cachelist, we must destroy the
2099 			 * vnode association.
2100 			 */
2101 			if (!page_trylock(pp, SE_EXCL)) {
2102 				goto fail_promote;
2103 			}
2104 
2105 			/*
2106 			 * We need to be careful not to deadlock
2107 			 * with another thread in page_lookup().
2108 			 * The page_lookup() thread could be holding
2109 			 * the same phm that we need if the two
2110 			 * pages happen to hash to the same phm lock.
2111 			 * At this point we have locked the entire
2112 			 * freelist and page_lookup() could be trying
2113 			 * to grab a freelist lock.
2114 			 */
2115 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2116 			phm = PAGE_HASH_MUTEX(index);
2117 			if (!mutex_tryenter(phm)) {
2118 				page_unlock_nocapture(pp);
2119 				goto fail_promote;
2120 			}
2121 
2122 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2123 			page_hashout(pp, phm);
2124 			mutex_exit(phm);
2125 			PP_SETAGED(pp);
2126 			page_unlock_nocapture(pp);
2127 			which_list = PG_CACHE_LIST;
2128 		}
2129 		page_ctr_sub(mnode, mtype, pp, which_list);
2130 
2131 		/*
2132 		 * Concatenate the smaller page(s) onto
2133 		 * the large page list.
2134 		 */
2135 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2136 		pages_left -= npgs;
2137 		tpp = pp;
2138 		while (npgs--) {
2139 			tpp->p_szc = new_szc;
2140 			tpp = tpp->p_next;
2141 		}
2142 		page_list_concat(&pplist, &pp);
2143 		pp += tmpnpgs;
2144 	}
2145 	CHK_LPG(pplist, new_szc);
2146 
2147 	/*
2148 	 * return the page to the user if requested
2149 	 * in the properly locked state.
2150 	 */
2151 	if ((flags & PC_ALLOC) && (page_trylock_cons(pplist, SE_EXCL))) {
2152 		return (pplist);
2153 	}
2154 
2155 	/*
2156 	 * If the PC_KFLT_EXPORT flag is set, kflt_export() is just going to
2157 	 * return this large page to the user page freelist, so there is no
2158 	 * need to lock it.
2159 	 */
2160 	if (flags & PC_KFLT_EXPORT) {
2161 		return (pplist);
2162 	}
2163 
2164 	/*
2165 	 * Otherwise place the new large page on the freelist
2166 	 */
2167 	bin = PP_2_BIN(pplist);
2168 	mnode = PP_2_MEM_NODE(pplist);
2169 	mtype = PP_2_MTYPE(pplist);
2170 	page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode, new_szc,
2171 	    bin, mtype), pplist);
2172 
2173 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2174 	return (NULL);
2175 
2176 fail_promote:
2177 	/*
2178 	 * A thread must have still been freeing or
2179 	 * reclaiming the page on the cachelist.
2180 	 * To prevent a deadlock undo what we have
2181 	 * done so far and return failure. This
2182 	 * situation can only happen while promoting
2183 	 * PAGESIZE pages.
2184 	 */
2185 	page_promote_err++;
2186 	while (pplist) {
2187 		pp = pplist;
2188 		mach_page_sub(&pplist, pp);
2189 		pp->p_szc = 0;
2190 		bin = PP_2_BIN(pp);
2191 		mtype = PP_2_MTYPE(pp);
2192 		ASSERT(!PP_ISKFLT(pp));
2193 		mach_page_add(PAGE_FREELISTP(PFLT_USER, mnode,
2194 		    0, bin, mtype), pp);
2195 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2196 	}
2197 	return (NULL);
2198 
2199 }
2200 
2201 /*
2202  * Break up a large page into smaller size pages.
2203  * Pages involved are on the freelist before the call and may
2204  * be returned to the caller if requested, otherwise they will
2205  * be placed back on the freelist.
2206  * The caller is responsible for locking the freelist as well as any other
2207  * accounting which needs to be done for a returned page.
2208  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2209  * technically, any value may be passed in but PC_NO_COLOR is the standard
2210  * which should be followed for clarity's sake.
2211  * Returns a page whose pfn is < pfnmax
2212  */
2213 page_t *
2214 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2215     uchar_t new_szc, int color, int flags)
2216 {
2217 	page_t	*pp, *pplist, *npplist;
2218 	pgcnt_t	npgs, n;
2219 	uint_t	bin;
2220 	uint_t	mtype;
2221 	page_t	*ret_pp = NULL;
2222 
2223 	ASSERT(cur_szc != 0);
2224 	ASSERT(new_szc < cur_szc);
2225 
2226 	pplist = page_numtopp_nolock(pfnum);
2227 	ASSERT(pplist != NULL);
2228 
2229 	ASSERT(pplist->p_szc == cur_szc);
2230 	ASSERT(!PP_ISKFLT(pplist));
2231 
2232 	bin = PP_2_BIN(pplist);
2233 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2234 	mtype = PP_2_MTYPE(pplist);
2235 	page_vpsub(PAGE_FREELISTP(PFLT_USER, mnode, cur_szc,
2236 	    bin, mtype), pplist);
2237 
2238 	CHK_LPG(pplist, cur_szc);
2239 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2240 
2241 	/*
2242 	 * Number of PAGESIZE pages for smaller new_szc
2243 	 * page.
2244 	 */
2245 	npgs = page_get_pagecnt(new_szc);
2246 
2247 	while (pplist) {
2248 		pp = pplist;
2249 
2250 		ASSERT(pp->p_szc == cur_szc);
2251 
2252 		/*
2253 		 * We either break it up into PAGESIZE pages or larger.
2254 		 */
2255 		if (npgs == 1) {	/* PAGESIZE case */
2256 			mach_page_sub(&pplist, pp);
2257 			ASSERT(pp->p_szc == cur_szc);
2258 			ASSERT(new_szc == 0);
2259 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2260 			pp->p_szc = new_szc;
2261 			bin = PP_2_BIN(pp);
2262 			if ((bin == color) && (flags == PC_ALLOC) &&
2263 			    (ret_pp == NULL) && (pfnmax == 0 ||
2264 			    pp->p_pagenum < pfnmax) &&
2265 			    page_trylock_cons(pp, SE_EXCL)) {
2266 				ret_pp = pp;
2267 			} else {
2268 				mtype = PP_2_MTYPE(pp);
2269 				ASSERT(!PP_ISKFLT(pp));
2270 				mach_page_add(PAGE_FREELISTP(PFLT_USER, mnode,
2271 				    0, bin, mtype), pp);
2272 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2273 			}
2274 		} else {
2275 			page_t *try_to_return_this_page = NULL;
2276 			int count = 0;
2277 
2278 			/*
2279 			 * Break down into smaller lists of pages.
2280 			 */
2281 			page_list_break(&pplist, &npplist, npgs);
2282 
2283 			pp = pplist;
2284 			n = npgs;
2285 			while (n--) {
2286 				ASSERT(pp->p_szc == cur_szc);
2287 				/*
2288 				 * Check whether all the pages in this list
2289 				 * fit the request criteria.
2290 				 */
2291 				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2292 					count++;
2293 				}
2294 				pp->p_szc = new_szc;
2295 				pp = pp->p_next;
2296 			}
2297 
2298 			if (count == npgs &&
2299 			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2300 				try_to_return_this_page = pp;
2301 			}
2302 
2303 			CHK_LPG(pplist, new_szc);
2304 
2305 			bin = PP_2_BIN(pplist);
2306 			if (try_to_return_this_page)
2307 				ASSERT(mnode ==
2308 				    PP_2_MEM_NODE(try_to_return_this_page));
2309 			if ((bin == color) && (flags == PC_ALLOC) &&
2310 			    (ret_pp == NULL) && try_to_return_this_page &&
2311 			    page_trylock_cons(try_to_return_this_page,
2312 			    SE_EXCL)) {
2313 				ret_pp = try_to_return_this_page;
2314 			} else {
2315 				mtype = PP_2_MTYPE(pp);
2316 				page_vpadd(PAGE_FREELISTP(PFLT_USER, mnode,
2317 				    new_szc, bin, mtype), pplist);
2318 
2319 				page_ctr_add(mnode, mtype, pplist,
2320 				    PG_FREE_LIST);
2321 			}
2322 			pplist = npplist;
2323 		}
2324 	}
2325 	return (ret_pp);
2326 }
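/*
 * Illustrative sketch, not part of the original code: page_list_sub() above
 * uses page_demote() purely to break up a free large page in place,
 *
 *	(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
 *	    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
 *
 * whereas page_freelist_split() passes PC_ALLOC and a real color so that a
 * smaller page of that color is handed back locked instead of being
 * returned to the freelist.
 */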
2327 
2328 int mpss_coalesce_disable = 0;
2329 
2330 /*
2331  * Coalesce free pages into a page of the given szc and color if possible.
2332  * Return the pointer to the page created, otherwise, return NULL.
2333  *
2334  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2335  */
2336 page_t *
2337 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2338     int mtype, pfn_t pfnhi)
2339 {
2340 	int 	r = szc;		/* region size */
2341 	int	mrange;
2342 	uint_t 	full, bin, color_mask, wrap = 0;
2343 	pfn_t	pfnum, lo, hi;
2344 	size_t	len, idx, idx0;
2345 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
2346 	page_t	*ret_pp;
2347 	MEM_NODE_ITERATOR_DECL(it);
2348 #if defined(__sparc)
2349 	pfn_t pfnum0, nlo, nhi;
2350 #endif
2351 	if (mpss_coalesce_disable) {
2352 		ASSERT(szc < MMU_PAGE_SIZES);
2353 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2354 		return (NULL);
2355 	}
2356 
2357 	ASSERT(szc < mmu_page_sizes);
2358 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2359 	ASSERT(ceq_mask <= color_mask);
2360 	ASSERT(color <= color_mask);
2361 	color &= ceq_mask;
2362 
2363 	/* Prevent page_counters dynamic memory from being freed */
2364 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2365 
2366 	mrange = MTYPE_2_MRANGE(mnode, mtype);
2367 	ASSERT(mrange < mnode_nranges[mnode]);
2368 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2369 
2370 	/* get pfn range for mtype */
2371 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
2372 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2373 	hi++;
2374 
2375 	/* clamp hi to the pfnhi limit, if one was given */
2376 	if (pfnhi != PFNNULL && pfnhi < hi)
2377 		hi = pfnhi;
2378 
2379 	/* round to szcpgcnt boundaries */
2380 	lo = P2ROUNDUP(lo, szcpgcnt);
2381 	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2382 	if (lo == (pfn_t)-1) {
2383 		rw_exit(&page_ctrs_rwlock[mnode]);
2384 		return (NULL);
2385 	}
2386 	hi = hi & ~(szcpgcnt - 1);
2387 
2388 	/* set lo to the closest pfn of the right color */
2389 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2390 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2391 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2392 		    &it);
2393 	}
2394 
2395 	if (hi <= lo) {
2396 		rw_exit(&page_ctrs_rwlock[mnode]);
2397 		return (NULL);
2398 	}
2399 
2400 	full = FULL_REGION_CNT(r);
2401 
2402 	/* calculate the number of page candidates and initial search index */
2403 	bin = color;
2404 	idx0 = (size_t)(-1);
2405 	do {
2406 		pgcnt_t acand;
2407 
2408 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2409 		if (acand) {
2410 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2411 			    r, bin, mrange);
2412 			idx0 = MIN(idx0, idx);
2413 			cands += acand;
2414 		}
2415 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2416 	} while (bin != color);
2417 
2418 	if (cands == 0) {
2419 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2420 		rw_exit(&page_ctrs_rwlock[mnode]);
2421 		return (NULL);
2422 	}
2423 
2424 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
2425 	if (pfnum < lo || pfnum >= hi) {
2426 		pfnum = lo;
2427 	} else {
2428 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2429 		if (pfnum == (pfn_t)-1) {
2430 			pfnum = lo;
2431 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2432 			ASSERT(pfnum != (pfn_t)-1);
2433 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2434 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2435 			/* invalid color, get the closest correct pfn */
2436 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2437 			    color_mask, &it);
2438 			if (pfnum >= hi) {
2439 				pfnum = lo;
2440 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2441 			}
2442 		}
2443 	}
2444 
2445 	/* set starting index */
2446 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2447 	ASSERT(idx0 < len);
2448 
2449 #if defined(__sparc)
2450 	pfnum0 = pfnum;		/* page corresponding to idx0 */
2451 	nhi = 0;		/* search kcage ranges */
2452 #endif
2453 
2454 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2455 
2456 #if defined(__sparc)
2457 		/*
2458 		 * Find lowest intersection of kcage ranges and mnode.
2459 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
2460 		 */
2461 		if (nhi <= pfnum) {
2462 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2463 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2464 				goto wrapit;
2465 
2466 			/* jump to the next page in the range */
2467 			if (pfnum < nlo) {
2468 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2469 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2470 				idx = PNUM_TO_IDX(mnode, r, pfnum);
2471 				if (idx >= len || pfnum >= hi)
2472 					goto wrapit;
2473 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2474 				    ceq_mask)
2475 					goto next;
2476 				if (interleaved_mnodes &&
2477 				    PFN_2_MEM_NODE(pfnum) != mnode)
2478 					goto next;
2479 			}
2480 		}
2481 #endif
2482 
2483 		if (PAGE_COUNTERS(mnode, r, idx) != full)
2484 			goto next;
2485 
2486 		/*
2487 		 * RFE: For performance maybe we can do something less
2488 		 *	brutal than locking the entire freelist. So far
2489 		 * 	this doesn't seem to be a performance problem?
2490 		 */
2491 		page_freelist_lock(mnode);
2492 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
2493 			ret_pp =
2494 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2495 			if (ret_pp != NULL) {
2496 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2497 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2498 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2499 				page_freelist_unlock(mnode);
2500 				rw_exit(&page_ctrs_rwlock[mnode]);
2501 #if defined(__sparc)
2502 				if (PP_ISNORELOC(ret_pp)) {
2503 					pgcnt_t npgs;
2504 
2505 					npgs = page_get_pagecnt(ret_pp->p_szc);
2506 					kcage_freemem_sub(npgs);
2507 				}
2508 #elif defined(__amd64) && !defined(__xpv)
2509 				/*
2510 				 * Only a single page size is supported on
2511 				 * the kernel freelist. This will need to
2512 				 * be changed to increase the availability
2513 				 * of more than one large page size.
2514 				 */
2515 				ASSERT(!PP_ISKFLT(ret_pp));
2516 #endif /* __sparc */
2517 				return (ret_pp);
2518 			}
2519 #ifdef VM_STATS
2520 		} else {
2521 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2522 #endif
2523 		}
2524 
2525 		page_freelist_unlock(mnode);
2526 		/*
2527 		 * No point looking for another page if we've
2528 		 * already tried all of the ones that
2529 		 * page_ctr_cands indicated.  Stash off where we left
2530 		 * off.
2531 		 * Note: this is not exact since we don't hold the
2532 		 * page_freelist_locks before we initially get the
2533 		 * value of cands for performance reasons, but should
2534 		 * be a decent approximation.
2535 		 */
2536 		if (--cands == 0) {
2537 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2538 			    idx;
2539 			break;
2540 		}
2541 next:
2542 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2543 		    color_mask, &it);
2544 		idx = PNUM_TO_IDX(mnode, r, pfnum);
2545 		if (idx >= len || pfnum >= hi) {
2546 wrapit:
2547 			pfnum = lo;
2548 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2549 			idx = PNUM_TO_IDX(mnode, r, pfnum);
2550 			wrap++;
2551 #if defined(__sparc)
2552 			nhi = 0;	/* search kcage ranges */
2553 #endif
2554 		}
2555 	}
2556 
2557 	rw_exit(&page_ctrs_rwlock[mnode]);
2558 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2559 	return (NULL);
2560 }
2561 
2562 /*
2563  * For the given mnode, promote as many small pages to large pages as possible.
2564  * mnode can be -1, which means do them all
2565  */
2566 void
2567 page_freelist_coalesce_all(int mnode)
2568 {
2569 	int 	r;		/* region size */
2570 	int 	idx, full;
2571 	size_t	len;
2572 	int doall = interleaved_mnodes || mnode < 0;
2573 	int mlo = doall ? 0 : mnode;
2574 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2575 
2576 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2577 
2578 	if (mpss_coalesce_disable) {
2579 		return;
2580 	}
2581 
2582 	/*
2583 	 * Lock the entire freelist and coalesce what we can.
2584 	 *
2585 	 * Always promote to the largest page possible
2586 	 * first to reduce the number of page promotions.
2587 	 */
2588 	for (mnode = mlo; mnode < mhi; mnode++) {
2589 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2590 		page_freelist_lock(mnode);
2591 	}
2592 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2593 		for (mnode = mlo; mnode < mhi; mnode++) {
2594 			pgcnt_t cands = 0;
2595 			int mrange, nranges = mnode_nranges[mnode];
2596 
2597 			for (mrange = 0; mrange < nranges; mrange++) {
2598 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2599 				if (cands != 0)
2600 					break;
2601 			}
2602 			if (cands == 0) {
2603 				VM_STAT_ADD(vmm_vmstats.
2604 				    page_ctrs_cands_skip_all);
2605 				continue;
2606 			}
2607 
2608 			full = FULL_REGION_CNT(r);
2609 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2610 
2611 			for (idx = 0; idx < len; idx++) {
2612 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2613 					pfn_t pfnum =
2614 					    IDX_TO_PNUM(mnode, r, idx);
2615 					int tmnode = interleaved_mnodes ?
2616 					    PFN_2_MEM_NODE(pfnum) : mnode;
2617 
2618 					ASSERT(pfnum >=
2619 					    mem_node_config[tmnode].physbase &&
2620 					    pfnum <
2621 					    mem_node_config[tmnode].physmax);
2622 
2623 					(void) page_promote(tmnode,
2624 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2625 				}
2626 			}
2627 			/* shared hpm_counters covers all mnodes, so we quit */
2628 			if (interleaved_mnodes)
2629 				break;
2630 		}
2631 	}
2632 	for (mnode = mlo; mnode < mhi; mnode++) {
2633 		page_freelist_unlock(mnode);
2634 		rw_exit(&page_ctrs_rwlock[mnode]);
2635 	}
2636 }
2637 
2638 /*
2639  * This is where all policies for moving pages around
2640  * to different page size free lists are implemented.
2641  * Returns a page of the requested size on success, NULL on failure.
2642  *
2643  * So far these are the priorities for this algorithm in descending
2644  * order:
2645  *
2646  *	1) When servicing a request try to do so with a free page
2647  *	   from next size up. Helps defer fragmentation as long
2648  *	   as possible.
2649  *
2650  *	2) Page coalesce on demand. Only when a freelist
2651  *	   larger than PAGESIZE is empty and step 1
2652  *	   will not work since all larger size lists are
2653  *	   also empty.
2654  *
2655  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2656  */
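/*
 * Illustrative note, not part of the original code: page_get_mnode_freelist()
 * below applies these priorities by calling page_freelist_split() first
 * (step 1) and falling back to page_freelist_coalesce() (step 2) only once
 * all equivalent-color bins of the requested size are empty.
 */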
2657 
2658 page_t *
2659 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2660     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2661 {
2662 	uchar_t nszc = szc + 1;
2663 	uint_t 	bin, sbin, bin_prev;
2664 	page_t	*pp, *firstpp;
2665 	page_t	*ret_pp = NULL;
2666 	uint_t  color_mask;
2667 
2668 	if (nszc == mmu_page_sizes)
2669 		return (NULL);
2670 
2671 	ASSERT(nszc < mmu_page_sizes);
2672 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2673 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2674 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2675 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2676 
2677 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2678 	/*
2679 	 * First try to break up a larger page to fill current size freelist.
2680 	 */
2681 	while (plw->plw_bins[nszc] != 0) {
2682 
2683 		ASSERT(nszc < mmu_page_sizes);
2684 
2685 		/*
2686 		 * If page found then demote it.
2687 		 */
2688 		if (PAGE_FREELISTS(PFLT_USER, mnode, nszc, bin, mtype)) {
2689 			page_freelist_lock(mnode);
2690 			firstpp = pp = PAGE_FREELISTS(PFLT_USER, mnode,
2691 			    nszc, bin, mtype);
2692 
2693 			/*
2694 			 * If pfnhi is not PFNNULL, look for large page below
2695 			 * pfnhi. PFNNULL signifies no pfn requirement.
2696 			 */
2697 			if (pp &&
2698 			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2699 			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2700 				do {
2701 					pp = pp->p_vpnext;
2702 					if (pp == firstpp) {
2703 						pp = NULL;
2704 						break;
2705 					}
2706 				} while ((pfnhi != PFNNULL &&
2707 				    pp->p_pagenum >= pfnhi) ||
2708 				    (pfnlo != PFNNULL &&
2709 				    pp->p_pagenum < pfnlo));
2710 
2711 				if (pfnhi != PFNNULL && pp != NULL)
2712 					ASSERT(pp->p_pagenum < pfnhi);
2713 
2714 				if (pfnlo != PFNNULL && pp != NULL)
2715 					ASSERT(pp->p_pagenum >= pfnlo);
2716 			}
2717 			if (pp) {
2718 				uint_t ccolor = page_correct_color(szc, nszc,
2719 				    color, bin, plw->plw_ceq_mask[szc]);
2720 
2721 				ASSERT(pp->p_szc == nszc);
2722 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2723 				ret_pp = page_demote(mnode, pp->p_pagenum,
2724 				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2725 				if (ret_pp) {
2726 					page_freelist_unlock(mnode);
2727 #if defined(__sparc)
2728 					if (PP_ISNORELOC(ret_pp)) {
2729 						pgcnt_t npgs;
2730 
2731 						npgs = page_get_pagecnt(
2732 						    ret_pp->p_szc);
2733 						kcage_freemem_sub(npgs);
2734 					}
2735 #elif defined(__amd64) && !defined(__xpv)
2736 					ASSERT(!PP_ISKFLT(pp));
2737 #endif /* __sparc */
2738 					return (ret_pp);
2739 				}
2740 			}
2741 			page_freelist_unlock(mnode);
2742 		}
2743 
2744 		/* loop through next size bins */
2745 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2746 		plw->plw_bins[nszc]--;
2747 
2748 		if (bin == sbin) {
2749 			uchar_t nnszc = nszc + 1;
2750 
2751 			/* we are done with this page size - check next */
2752 			if (plw->plw_bins[nnszc] == 0)
2753 				/* we have already checked next size bins */
2754 				break;
2755 
2756 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2757 			if (bin_prev != INVALID_COLOR) {
2758 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2759 				if (!((bin ^ bin_prev) &
2760 				    plw->plw_ceq_mask[nnszc]))
2761 					break;
2762 			}
2763 			ASSERT(nnszc < mmu_page_sizes);
2764 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2765 			nszc = nnszc;
2766 			ASSERT(nszc < mmu_page_sizes);
2767 		}
2768 	}
2769 
2770 	return (ret_pp);
2771 }
2772 
2773 /*
2774  * Helper routine used only by the freelist code to lock
2775  * a page. If the page is a large page then it succeeds in
2776  * locking all the constituent pages or none at all.
2777  * Returns 1 on success, 0 on failure.
2778  */
2779 static int
2780 page_trylock_cons(page_t *pp, se_t se)
2781 {
2782 	page_t	*tpp, *first_pp = pp;
2783 
2784 	/*
2785 	 * Fail if can't lock first or only page.
2786 	 */
2787 	if (!page_trylock(pp, se)) {
2788 		return (0);
2789 	}
2790 
2791 	/*
2792 	 * PAGESIZE: common case.
2793 	 */
2794 	if (pp->p_szc == 0) {
2795 		return (1);
2796 	}
2797 
2798 	/*
2799 	 * Large page case.
2800 	 */
2801 	tpp = pp->p_next;
2802 	while (tpp != pp) {
2803 		if (!page_trylock(tpp, se)) {
2804 			/*
2805 			 * On failure unlock what we have locked so far.
2806 			 * We want to avoid attempting to capture these
2807 			 * pages as the pcm mutex may be held which could
2808 			 * lead to a recursive mutex panic.
2809 			 */
2810 			while (first_pp != tpp) {
2811 				page_unlock_nocapture(first_pp);
2812 				first_pp = first_pp->p_next;
2813 			}
2814 			return (0);
2815 		}
2816 		tpp = tpp->p_next;
2817 	}
2818 	return (1);
2819 }
2820 
2821 /*
2822  * init context for walking page lists
2823  * Called when a page of the given szc is unavailable. Sets markers
2824  * for the beginning of the search to detect when the search has
2825  * completed a full cycle. Sets flags for splitting larger pages
2826  * and coalescing smaller pages. Page walking proceeds until a page
2827  * of the desired equivalent color is found.
2828  */
2829 void
2830 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2831     int use_ceq, page_list_walker_t *plw)
2832 {
2833 	uint_t  nszc, ceq_mask, colors;
2834 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2835 
2836 	ASSERT(szc < mmu_page_sizes);
2837 	colors = PAGE_GET_PAGECOLORS(szc);
2838 
2839 	plw->plw_colors = colors;
2840 	plw->plw_color_mask = colors - 1;
2841 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2842 	plw->plw_bin_split_prev = bin;
2843 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2844 
2845 	/*
2846 	 * if vac aliasing is possible make sure lower order color
2847 	 * bits are never ignored
2848 	 */
2849 	if (vac_colors > 1)
2850 		ceq &= 0xf0;
2851 
2852 	/*
2853 	 * calculate the number of non-equivalent colors and
2854 	 * color equivalency mask
2855 	 */
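	/*
	 * For illustration, with hypothetical values: assuming vac_colors is
	 * 1 (so the low nibble of ceq survives), colors == 32 and ceq == 0x02,
	 * plw_ceq_dif becomes 32 >> 2 == 8 equivalence classes and
	 * plw_ceq_mask[szc] becomes (8 - 1) << 2 == 0x1c, i.e. only color
	 * bits 2..4 are treated as significant.
	 */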
2856 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2857 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2858 	ASSERT(plw->plw_ceq_dif > 0);
2859 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2860 
2861 	if (flags & PG_MATCH_COLOR) {
2862 		if (cpu_page_colors <  0) {
2863 			/*
2864 			 * this is a heterogeneous machine with different CPUs
2865 			 * having different size e$ (not supported for ni2/rock)
2866 			 */
2867 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2868 			cpucolors = MAX(cpucolors, 1);
2869 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2870 			plw->plw_ceq_mask[szc] =
2871 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2872 		}
2873 		plw->plw_ceq_dif = 1;
2874 	}
2875 
2876 	/* we can split pages in the freelist, but not the cachelist */
2877 	if (can_split) {
2878 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2879 
2880 		/* set next szc color masks and number of free list bins */
2881 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2882 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2883 			    plw->plw_ceq_mask[szc]);
2884 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2885 		}
2886 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2887 		plw->plw_bins[nszc] = 0;
2888 
2889 	} else {
2890 		ASSERT(szc == 0);
2891 		plw->plw_do_split = 0;
2892 		plw->plw_bins[1] = 0;
2893 		plw->plw_ceq_mask[1] = INVALID_MASK;
2894 	}
2895 	ASSERT(bin < plw->plw_colors);
2896 }
2897 
2898 /*
2899  * Walker variables for the kernel freelist are initialized so that all
2900  * kernel page colors are treated as equivalent. This minimizes the amount
2901  * of memory used by the kernel freelist.
2902  */
2903 /* ARGSUSED */
2904 void
2905 page_kflt_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2906     int use_ceq, page_list_walker_t *plw)
2907 {
2908 	/*
2909 	 * Note that the following values are only valid for pages with
2910 	 * szc == 0.
2911 	 */
2912 	ASSERT(szc == 0);
2913 
2914 	/* The number of colors for kernel pages */
2915 	plw->plw_colors = KFLT_PAGE_COLORS;
2916 	plw->plw_color_mask = KFLT_PAGE_COLORS - 1;
2917 
2918 	/* The marker indicates when all the bins have been processed */
2919 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2920 	plw->plw_bin_split_prev = bin;
2921 
2922 	/* Add plw_bin_step to get the next bin to process */
2923 	plw->plw_bin_step = vac_colors;
2924 
2925 	/* There is only 1 color group i.e. all colors are equivalent */
2926 	plw->plw_ceq_dif = 1;
2927 	plw->plw_ceq_mask[0] = 0;
2928 	plw->plw_do_split = 0;
2929 
2930 	ASSERT(bin < plw->plw_colors);
2931 }
2932 
2933 /*
2934  * set mark to flag where next split should occur
2935  */
2936 #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
2937 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
2938 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
2939 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
2940 	plw->plw_split_next =						     \
2941 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
2942 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2943 		plw->plw_split_next =					     \
2944 		INC_MASKED(plw->plw_split_next,				     \
2945 		    neq_mask, plw->plw_color_mask);			     \
2946 	}								     \
2947 }
2948 
2949 uint_t
2950 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2951 {
2952 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2953 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2954 	uchar_t nszc = szc + 1;
2955 
2956 	nbin = ADD_MASKED(bin,
2957 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2958 
2959 	if (plw->plw_do_split) {
2960 		plw->plw_bin_split_prev = bin;
2961 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2962 		plw->plw_do_split = 0;
2963 	}
2964 
2965 	if (szc == 0) {
2966 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2967 			if (nbin == plw->plw_bin0 &&
2968 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2969 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2970 				    neq_mask, plw->plw_color_mask);
2971 				plw->plw_bin_split_prev = plw->plw_bin0;
2972 			}
2973 
2974 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2975 				plw->plw_bin_marker =
2976 				    nbin = INC_MASKED(nbin, neq_mask,
2977 				    plw->plw_color_mask);
2978 				plw->plw_bin_split_prev = plw->plw_bin0;
2979 				/*
2980 				 * large pages all have the same vac color
2981 				 * so by now we should be done with next
2982 				 * size page splitting process
2983 				 */
2984 				ASSERT(plw->plw_bins[1] == 0);
2985 				plw->plw_do_split = 0;
2986 				return (nbin);
2987 			}
2988 
2989 		} else {
2990 			uint_t bin_jump = (vac_colors == 1) ?
2991 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2992 
2993 			bin_jump &= ~(vac_colors - 1);
2994 
2995 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2996 			    plw->plw_color_mask);
2997 
2998 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2999 
3000 				plw->plw_bin_marker = nbin = nbin0;
3001 
3002 				if (plw->plw_bins[nszc] != 0) {
3003 					/*
3004 					 * check if next page size bin is the
3005 					 * same as the next page size bin for
3006 					 * bin0
3007 					 */
3008 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
3009 					    nbin);
3010 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
3011 					    plw->plw_bin0);
3012 
3013 					if ((bin0_nsz ^ nbin_nsz) &
3014 					    plw->plw_ceq_mask[nszc])
3015 						plw->plw_do_split = 1;
3016 				}
3017 				return (nbin);
3018 			}
3019 		}
3020 	}
3021 
3022 	if (plw->plw_bins[nszc] != 0) {
3023 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
3024 		if (!((plw->plw_split_next ^ nbin_nsz) &
3025 		    plw->plw_ceq_mask[nszc]))
3026 			plw->plw_do_split = 1;
3027 	}
3028 
3029 	return (nbin);
3030 }
3031 
3032 page_t *
3033 page_get_mnode_freelist(page_freelist_type_t *fp, int mnode, uint_t bin,
3034     int mtype, uchar_t szc, uint_t flags)
3035 {
3036 	kmutex_t		*pcm;
3037 	page_t			*pp, *first_pp;
3038 	uint_t			sbin;
3039 	int			plw_initialized;
3040 	page_list_walker_t	plw;
3041 
3042 	ASSERT(szc < mmu_page_sizes);
3043 
3044 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
3045 
3046 	MTYPE_START(mnode, mtype, flags);
3047 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3048 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
3049 		return (NULL);
3050 	}
3051 try_again:
3052 	plw_initialized = 0;
3053 	plw.plw_ceq_dif = 1;
3054 
3055 	/*
3056 	 * Only hold one freelist lock at a time, that way we
3057 	 * can start anywhere and not have to worry about lock
3058 	 * ordering.
3059 	 */
3060 	for (plw.plw_count = 0;
3061 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3062 		sbin = bin;
3063 		do {
3064 			if (!PAGE_FREELISTS(PC_ISKFLT(fp), mnode, szc,
3065 			    bin, mtype)) {
3066 				goto bin_empty_1;
3067 			}
3068 
3069 			pcm = PC_BIN_MUTEX(PC_ISKFLT(fp), mnode, bin,
3070 			    PG_FREE_LIST);
3071 			mutex_enter(pcm);
3072 			pp = PAGE_FREELISTS(PC_ISKFLT(fp), mnode, szc,
3073 			    bin, mtype);
3074 			if (pp == NULL) {
3075 				goto bin_empty_0;
3076 			}
3077 
3078 			/*
3079 			 * These were set before the page
3080 			 * was put on the free list,
3081 			 * they must still be set.
3082 			 */
3083 			ASSERT(PP_ISFREE(pp));
3084 			ASSERT(PP_ISAGED(pp));
3085 			ASSERT(pp->p_vnode == NULL);
3086 			ASSERT(pp->p_hash == NULL);
3087 			ASSERT(pp->p_offset == (u_offset_t)-1);
3088 			ASSERT(pp->p_szc == szc);
3089 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3090 
3091 			/*
3092 			 * Walk down the hash chain.
3093 			 * 8k pages are linked on p_next
3094 			 * and p_prev fields. Large pages
3095 			 * are a contiguous group of
3096 			 * constituent pages linked together
3097 			 * on their p_next and p_prev fields.
3098 			 * The large pages are linked together
3099 			 * on the hash chain using the p_vpnext and
3100 			 * p_vpprev fields of the base constituent
3101 			 * page of each large page.
3102 			 */
3103 			first_pp = pp;
3104 			while (!page_trylock_cons(pp, SE_EXCL) ||
3105 			    IS_DUMP_PAGE(pp)) {
3106 				if (szc == 0) {
3107 					pp = pp->p_next;
3108 				} else {
3109 					pp = pp->p_vpnext;
3110 				}
3111 
3112 				ASSERT(PP_ISFREE(pp));
3113 				ASSERT(PP_ISAGED(pp));
3114 				ASSERT(pp->p_vnode == NULL);
3115 				ASSERT(pp->p_hash == NULL);
3116 				ASSERT(pp->p_offset == (u_offset_t)-1);
3117 				ASSERT(pp->p_szc == szc);
3118 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3119 
3120 				if (pp == first_pp)
3121 					goto bin_empty_0;
3122 			}
3123 
3124 			ASSERT(pp != NULL);
3125 			ASSERT(mtype == PP_2_MTYPE(pp));
3126 			ASSERT(pp->p_szc == szc);
3127 			if (szc == 0) {
3128 				page_sub(PAGE_FREELISTP(PC_ISKFLT(fp), mnode,
3129 				    szc, bin, mtype), pp);
3130 			} else {
3131 				page_vpsub(PAGE_FREELISTP(PC_ISKFLT(fp), mnode,
3132 				    szc, bin, mtype), pp);
3133 				CHK_LPG(pp, szc);
3134 			}
3135 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3136 
3137 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
3138 				panic("free page is not. pp %p", (void *)pp);
3139 			mutex_exit(pcm);
3140 
3141 #if defined(__sparc)
3142 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
3143 			    (flags & PG_NORELOC) == 0);
3144 
3145 			if (PP_ISNORELOC(pp))
3146 				kcage_freemem_sub(page_get_pagecnt(szc));
3147 #elif defined(__amd64) && !defined(__xpv)
3148 			if (PP_ISKFLT(pp)) {
3149 				ASSERT(szc == 0);
3150 				kflt_freemem_sub(1);
3151 			}
3152 #endif /* __sparc */
3153 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
3154 			return (pp);
3155 
3156 bin_empty_0:
3157 			mutex_exit(pcm);
3158 bin_empty_1:
3159 			if (plw_initialized == 0) {
3160 				PAGE_LIST_WALK_INIT(fp, szc, flags, bin, 1, 1,
3161 				    &plw);
3162 				plw_initialized = 1;
3163 				ASSERT(plw.plw_colors <=
3164 				    PAGE_GET_PAGECOLORS(szc));
3165 				ASSERT(plw.plw_colors > 0);
3166 				ASSERT((plw.plw_colors &
3167 				    (plw.plw_colors - 1)) == 0);
3168 				ASSERT(bin < plw.plw_colors);
3169 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3170 			}
3171 			/* calculate the next bin with equivalent color */
3172 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3173 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
3174 
3175 		} while (sbin != bin);
3176 
3177 		/*
3178 		 * color bins are all empty if color match. Try and
3179 		 * All bins of equivalent color are empty. Try to
3180 		 * pages from a different size freelist of the correct
3181 		 * color that satisfies the ORIGINAL color requested.
3182 		 * If that fails then try pages of the same size but
3183 		 * different colors assuming we are not called with
3184 		 * PG_MATCH_COLOR.
3185 		 */
3186 		if (plw.plw_do_split &&
3187 		    (pp = page_freelist_split(szc, bin, mnode,
3188 		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3189 			return (pp);
3190 
3191 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3192 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
3193 			return (pp);
3194 
3195 		if (plw.plw_ceq_dif > 1)
3196 			bin = PAGE_LIST_WALK_NEXT(fp, szc, bin, &plw);
3197 	}
3198 
3199 	/* if allowed, cycle through additional mtypes */
3200 	MTYPE_NEXT(mnode, mtype, flags);
3201 	if (mtype >= 0)
3202 		goto try_again;
3203 
3204 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3205 
3206 	return (NULL);
3207 }
3208 
3209 /*
3210  * Returns the count of free PAGESIZE pages in the 'szc' region at 'pp'.
3211  * Note: This function does not return an exact value as the page freelist
3212  * locks are not held and thus the values in the page_counters may be
3213  * changing as we walk through the data.
3214  */
3215 static int
3216 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3217 {
3218 	pgcnt_t	pgfree;
3219 	pgcnt_t cnt;
3220 	ssize_t	r = szc;	/* region size */
3221 	ssize_t	idx;
3222 	int	i;
3223 	int	full, range;
3224 
3225 	/* Make sure pagenum passed in is aligned properly */
3226 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3227 	ASSERT(szc > 0);
3228 
3229 	/* Prevent page_counters dynamic memory from being freed */
3230 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3231 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3232 	cnt = PAGE_COUNTERS(mnode, r, idx);
3233 	pgfree = cnt << PNUM_SHIFT(r - 1);
3234 	range = FULL_REGION_CNT(szc);
3235 
3236 	/* Check for completely full region */
3237 	if (cnt == range) {
3238 		rw_exit(&page_ctrs_rwlock[mnode]);
3239 		return (pgfree);
3240 	}
3241 
3242 	while (--r > 0) {
3243 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3244 		full = FULL_REGION_CNT(r);
3245 		for (i = 0; i < range; i++, idx++) {
3246 			cnt = PAGE_COUNTERS(mnode, r, idx);
3247 			/*
3248 			 * If cnt here is full, that means we have already
3249 			 * accounted for these pages earlier.
3250 			 */
3251 			if (cnt != full) {
3252 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3253 			}
3254 		}
3255 		range *= full;
3256 	}
3257 	rw_exit(&page_ctrs_rwlock[mnode]);
3258 	return (pgfree);
3259 }
3260 
3261 /*
3262  * Called from page_geti_contig_pages to exclusively lock constituent pages
3263  * starting from 'spp' for page size code 'szc'.
3264  *
3265  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3266  * region needs to be greater than or equal to the threshold.
3267  */
3268 static int
3269 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3270 {
3271 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3272 	pgcnt_t pgfree, i;
3273 	page_t *pp;
3274 
3275 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3276 
3277 
3278 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3279 		goto skipptcpcheck;
3280 	/*
3281 	 * check if there are sufficient free pages available before attempting
3282 	 * to trylock. Count is approximate as page counters can change.
3283 	 */
3284 	pgfree = page_freecnt(mnode, spp, szc);
3285 
3286 	/* attempt to trylock if there are sufficient already free pages */
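	/*
	 * For illustration, with a hypothetical ptcpthreshold of 2: at least
	 * pgcnt / 2 of the constituent pages must already be free
	 * (pgfree >= pgcnt / ptcpthreshold) before the trylock pass below
	 * is attempted.
	 */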
3287 	if (pgfree < pgcnt/ptcpthreshold) {
3288 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3289 		return (0);
3290 	}
3291 
3292 skipptcpcheck:
3293 
3294 	for (i = 0; i < pgcnt; i++) {
3295 		pp = &spp[i];
3296 		if (!page_trylock(pp, SE_EXCL)) {
3297 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3298 			while (--i != (pgcnt_t)-1) {
3299 				pp = &spp[i];
3300 				ASSERT(PAGE_EXCL(pp));
3301 				page_unlock_nocapture(pp);
3302 			}
3303 			return (0);
3304 		}
3305 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3306 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3307 		    !PP_ISFREE(pp)) {
3308 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3309 			ASSERT(i == 0);
3310 			page_unlock_nocapture(pp);
3311 			return (0);
3312 		}
3313 		if (PP_ISNORELOC(pp)) {
3314 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3315 			while (i != (pgcnt_t)-1) {
3316 				pp = &spp[i];
3317 				ASSERT(PAGE_EXCL(pp));
3318 				page_unlock_nocapture(pp);
3319 				i--;
3320 			}
3321 			return (0);
3322 		}
3323 		if (PP_ISKFLT(pp)) {
3324 			VM_STAT_ADD(vmm_vmstats.ptcpfailkflt[szc]);
3325 			ASSERT(i == 0);
3326 			while (i != (pgcnt_t)-1) {
3327 				pp = &spp[i];
3328 				ASSERT(PAGE_EXCL(pp));
3329 				page_unlock_nocapture(pp);
3330 				i--;
3331 			}
3332 			return (0);
3333 		}
3334 	}
3335 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3336 	return (1);
3337 }
3338 
3339 /*
3340  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3341  * of 'szc' constituent pages that had been locked exclusively previously.
3342  * Will attempt to relocate constituent pages in use.
3343  */
3344 static page_t *
3345 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3346 {
3347 	spgcnt_t pgcnt, npgs, i;
3348 	page_t *targpp, *rpp, *hpp;
3349 	page_t *replpp = NULL;
3350 	page_t *pplist = NULL;
3351 
3352 	ASSERT(pp != NULL);
3353 
3354 	pgcnt = page_get_pagecnt(szc);
3355 	while (pgcnt) {
3356 		ASSERT(PAGE_EXCL(pp));
3357 		ASSERT(!PP_ISNORELOC(pp));
3358 		ASSERT(!PP_ISKFLT(pp));
3359 		if (PP_ISFREE(pp)) {
3360 			/*
3361 			 * If this is a PG_FREE_LIST page then its
3362 			 * size code can change underneath us due to
3363 			 * page promotion or demotion. As an optimization
3364 			 * use page_list_sub_pages() instead of
3365 			 * page_list_sub().
3366 			 */
3367 			if (PP_ISAGED(pp)) {
3368 				page_list_sub_pages(pp, szc);
3369 				if (pp->p_szc == szc) {
3370 					return (pp);
3371 				}
3372 				ASSERT(pp->p_szc < szc);
3373 				npgs = page_get_pagecnt(pp->p_szc);
3374 				hpp = pp;
3375 				for (i = 0; i < npgs; i++, pp++) {
3376 					pp->p_szc = szc;
3377 				}
3378 				page_list_concat(&pplist, &hpp);
3379 				pgcnt -= npgs;
3380 				continue;
3381 			}
3382 			ASSERT(!PP_ISAGED(pp));
3383 			ASSERT(pp->p_szc == 0);
3384 			page_list_sub(pp, PG_CACHE_LIST);
3385 			page_hashout(pp, NULL);
3386 			PP_SETAGED(pp);
3387 			pp->p_szc = szc;
3388 			page_list_concat(&pplist, &pp);
3389 			pp++;
3390 			pgcnt--;
3391 			continue;
3392 		}
3393 		npgs = page_get_pagecnt(pp->p_szc);
3394 
3395 		/*
3396 		 * page_create_wait freemem accounting done by caller of
3397 		 * page_get_freelist and not necessary to call it prior to
3398 		 * calling page_get_replacement_page.
3399 		 *
3400 		 * page_get_replacement_page can call page_get_contig_pages
3401 		 * to acquire a large page (szc > 0); the replacement must be
3402 		 * smaller than the contig page size to avoid looping or
3403 		 * szc == 0 and PGI_PGCPSZC0 is set.
3404 		 */
3405 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3406 			replpp = page_get_replacement_page(pp, NULL, 0);
3407 			if (replpp) {
3408 				npgs = page_get_pagecnt(pp->p_szc);
3409 				ASSERT(npgs <= pgcnt);
3410 				targpp = pp;
3411 			}
3412 		}
3413 
3414 		/*
3415 		 * If replacement is NULL or do_page_relocate fails, fail
3416 		 * coalescing of pages.
3417 		 */
3418 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3419 		    &npgs, NULL) != 0)) {
3420 			/*
3421 			 * Unlock un-processed target list
3422 			 */
3423 			while (pgcnt--) {
3424 				ASSERT(PAGE_EXCL(pp));
3425 				page_unlock_nocapture(pp);
3426 				pp++;
3427 			}
3428 			/*
3429 			 * Free the processed target list.
3430 			 */
3431 			while (pplist) {
3432 				pp = pplist;
3433 				page_sub(&pplist, pp);
3434 				ASSERT(PAGE_EXCL(pp));
3435 				ASSERT(pp->p_szc == szc);
3436 				ASSERT(PP_ISFREE(pp));
3437 				ASSERT(PP_ISAGED(pp));
3438 				pp->p_szc = 0;
3439 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3440 				page_unlock_nocapture(pp);
3441 			}
3442 
3443 			if (replpp != NULL)
3444 				page_free_replacement_page(replpp);
3445 
3446 			return (NULL);
3447 		}
3448 		ASSERT(pp == targpp);
3449 
3450 		/* LINTED */
3451 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3452 
3453 		pp += npgs;
3454 		pgcnt -= npgs;
3455 
3456 		while (npgs--) {
3457 			ASSERT(PAGE_EXCL(targpp));
3458 			ASSERT(!PP_ISFREE(targpp));
3459 			ASSERT(!PP_ISNORELOC(targpp));
3460 			ASSERT(!PP_ISKFLT(targpp));
3461 			PP_SETFREE(targpp);
3462 			ASSERT(PP_ISAGED(targpp));
3463 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3464 			    (flags & PGI_PGCPSZC0)));
3465 			targpp->p_szc = szc;
3466 			targpp = targpp->p_next;
3467 
3468 			rpp = replpp;
3469 			ASSERT(rpp != NULL);
3470 			page_sub(&replpp, rpp);
3471 			ASSERT(PAGE_EXCL(rpp));
3472 			ASSERT(!PP_ISFREE(rpp));
3473 			page_unlock_nocapture(rpp);
3474 		}
3475 		ASSERT(targpp == hpp);
3476 		ASSERT(replpp == NULL);
3477 		page_list_concat(&pplist, &targpp);
3478 	}
3479 	CHK_LPG(pplist, szc);
3480 	return (pplist);
3481 }
3482 
3483 /*
3484  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3485  * of 0 means nothing left after trim.
3486  */
3487 /* LINTED */
3488 int
3489 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3490 {
3491 	pfn_t	kcagepfn;
3492 	int	decr;
3493 	int	rc = 0;
3494 
3495 	if (PP_ISNORELOC(mseg->pages)) {
3496 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3497 
3498 			/* lower part of this mseg inside kernel cage */
3499 			decr = kcage_current_pfn(&kcagepfn);
3500 
3501 			/* kernel cage may have transitioned past mseg */
3502 			if (kcagepfn >= mseg->pages_base &&
3503 			    kcagepfn < mseg->pages_end) {
3504 				ASSERT(decr == 0);
3505 				*lo = MAX(kcagepfn, pfnlo);
3506 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
3507 				rc = 1;
3508 			}
3509 		}
3510 		/* else entire mseg in the cage */
3511 	} else {
3512 		if (PP_ISNORELOC(mseg->epages - 1)) {
3513 
3514 			/* upper part of this mseg inside kernel cage */
3515 			decr = kcage_current_pfn(&kcagepfn);
3516 
3517 			/* kernel cage may have transitioned past mseg */
3518 			if (kcagepfn >= mseg->pages_base &&
3519 			    kcagepfn < mseg->pages_end) {
3520 				ASSERT(decr);
3521 				*hi = MIN(kcagepfn, pfnhi);
3522 				*lo = MAX(pfnlo, mseg->pages_base);
3523 				rc = 1;
3524 			}
3525 		} else {
3526 			/* entire mseg outside of kernel cage */
3527 			*lo = MAX(pfnlo, mseg->pages_base);
3528 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3529 			rc = 1;
3530 		}
3531 	}
3532 	return (rc);
3533 }
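/*
 * Illustrative note, not part of the original code: when the lower part of
 * the mseg lies inside the kernel cage, the returned [*lo, *hi] range is the
 * portion of [pfnlo, pfnhi] at or above the current cage frontier
 * (kcagepfn); when the upper part is caged, the range is capped at the
 * frontier; an entirely uncaged mseg yields the full intersection of
 * [pfnlo, pfnhi] with the mseg.
 */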
3534 
3535 /*
3536  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3537  * page with size code 'szc'. Claiming such a page requires acquiring
3538  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3539  * relocating pages in use and concatenating these constituent pages into a
3540  * large page.
3541  *
3542  * The page lists do not have such a large page and page_freelist_split has
3543  * already failed to demote larger pages and/or coalesce smaller free pages.
3544  *
3545  * 'flags' may specify PG_MATCH_COLOR which would limit the search to large
3546  * pages with the same color as 'bin'.
3547  *
3548  * 'pfnflag' specifies the subset of the pfn range to search.
3549  */
3550 static page_t *
3551 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3552     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3553 {
3554 	struct memseg *mseg;
3555 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3556 	pfn_t	randpfn;
3557 	page_t *pp, *randpp, *endpp;
3558 	uint_t colors, ceq_mask;
3559 	/* LINTED : set but not used in function */
3560 	uint_t color_mask;
3561 	pfn_t hi, lo;
3562 	uint_t skip;
3563 	MEM_NODE_ITERATOR_DECL(it);
3564 #ifdef DEBUG
3565 	pgcnt_t szcpgmask = szcpgcnt - 1;
3566 #endif
3567 
3568 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3569 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3570 
3571 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) {
3572 		return (NULL);
3573 	}
3574 
3575 	ASSERT(szc < mmu_page_sizes);
3576 
3577 	colors = PAGE_GET_PAGECOLORS(szc);
3578 	color_mask = colors - 1;
3579 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3580 		uchar_t ceq = colorequivszc[szc];
3581 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3582 
3583 		ASSERT(ceq_dif > 0);
3584 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3585 	} else {
3586 		ceq_mask = 0;
3587 	}
3588 
3589 	ASSERT(bin < colors);
3590 
3591 	/* clear "non-significant" color bits */
3592 	bin &= ceq_mask;
3593 
3594 	/*
3595 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3596 	 * when there have been previous page_get_contig_page failures to
3597 	 * limit the search.
3598 	 *
3599 	 * The high bit in pfnflag specifies the number of 'slots' in the
3600 	 * pfn range and the remainder of pfnflag specifies which slot.
3601 	 * For example, a value of 1010b selects slot id 2 of a pfn range
3602 	 * that has been divided into 8 slots.
3603 	 */
3604 	if (pfnflag > 1) {
3605 		int	slots = 1 << (highbit(pfnflag) - 1);
3606 		int	slotid = pfnflag & (slots - 1);
3607 		pgcnt_t	szcpages;
3608 		int	slotlen;
3609 
3610 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3611 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3612 		slotlen = howmany(szcpages, slots);
3613 		/* skip if 'slotid' slot is empty */
3614 		if (slotid * slotlen >= szcpages) {
3615 			return (NULL);
3616 		}
3617 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3618 		ASSERT(pfnlo < pfnhi);
3619 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3620 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3621 	}
3622 
3623 	/*
3624 	 * This routine can be called recursively, so we shouldn't
3625 	 * acquire a reader lock if a write request is pending. This
3626 	 * could lead to a deadlock with the DR thread.
3627 	 *
3628 	 * Returning NULL informs the caller that we could not get
3629 	 * a contig page with the required characteristics.
3630 	 */
3631 
3632 	if (!memsegs_trylock(0))
3633 		return (NULL);
3634 
3635 	/*
3636 	 * loop through memsegs to look for contig page candidates
3637 	 */
3638 
3639 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3640 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3641 			/* no overlap */
3642 			continue;
3643 		}
3644 
3645 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3646 			/* mseg too small */
3647 			continue;
3648 
3649 		/*
3650 		 * trim kernel cage pages off the pfn range and skip this
3651 		 * memseg if the trimmed range does not span the desired
3652 		 * large page size.
3653 		 */
3654 		if (kcage_on) {
3655 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3656 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3657 				continue;
3658 		} else {
3659 			lo = MAX(pfnlo, mseg->pages_base);
3660 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3661 		}
3662 
3663 		/* round to szcpgcnt boundaries */
3664 		lo = P2ROUNDUP(lo, szcpgcnt);
3665 
3666 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3667 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3668 
3669 		if (hi <= lo)
3670 			continue;
3671 
3672 		/*
3673 		 * set lo to point to the pfn for the desired bin. Large
3674 		 * page sizes may only have a single page color
3675 		 */
3676 		skip = szcpgcnt;
3677 		if (ceq_mask > 0 || interleaved_mnodes) {
3678 			/* set lo to point at appropriate color */
3679 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3680 			    (interleaved_mnodes &&
3681 			    PFN_2_MEM_NODE(lo) != mnode)) {
3682 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3683 				    color_mask, &it);
3684 			}
3685 			if (hi <= lo)
3686 				/* mseg cannot satisfy color request */
3687 				continue;
3688 		}
3689 
3690 		/* randomly choose a point between lo and hi to begin search */
3691 
3692 		randpfn = (pfn_t)GETTICK();
3693 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3694 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3695 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3696 			if (randpfn != (pfn_t)-1) {
3697 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3698 				    ceq_mask, color_mask, &it);
3699 			}
3700 			if (randpfn >= hi) {
3701 				randpfn = lo;
3702 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3703 				    &it);
3704 			}
3705 		}
3706 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3707 
3708 		ASSERT(randpp->p_pagenum == randpfn);
3709 
3710 		pp = randpp;
3711 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
3712 
3713 		ASSERT(randpp + szcpgcnt <= endpp);
3714 
3715 		do {
3716 			ASSERT(!(pp->p_pagenum & szcpgmask));
3717 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3718 
3719 			/* Skip over pages on the kernel freelist */
3720 			if (PP_ISKFLT(pp)) {
3721 				pp += skip;
3722 				goto skip_contig;
3723 			}
3724 
3725 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3726 				/* pages unlocked by page_claim on failure */
3727 				if (page_claim_contig_pages(pp, szc, flags)) {
3728 					memsegs_unlock(0);
3729 					return (pp);
3730 				}
3731 			}
3732 
3733 			if (ceq_mask == 0 && !interleaved_mnodes) {
3734 				pp += skip;
3735 			} else {
3736 				pfn_t pfn = pp->p_pagenum;
3737 
3738 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3739 				    ceq_mask, color_mask, &it);
3740 				if (pfn == (pfn_t)-1) {
3741 					pp = endpp;
3742 				} else {
3743 					pp = mseg->pages +
3744 					    (pfn - mseg->pages_base);
3745 				}
3746 			}
3747 skip_contig:
3748 			if (pp >= endpp) {
3749 				/* start from the beginning */
3750 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3751 				pp = mseg->pages + (lo - mseg->pages_base);
3752 				ASSERT(pp->p_pagenum == lo);
3753 				ASSERT(pp + szcpgcnt <= endpp);
3754 			}
3755 		} while (pp != randpp);
3756 	}
3757 	memsegs_unlock(0);
3758 	return (NULL);
3759 }
3760 
3761 /*
3762  * controlling routine that searches through physical memory in an attempt
3763  * to claim a large page, based on the input parameters, when one is not
3764  * available on the page free lists.
3765  *
3766  * calls page_geti_contig_pages with an initial pfn range from the mnode
3767  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3768  * that overlaps with the kernel cage or does not match the requested page
3769  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3770  * page_geti_contig_pages may further limit the search range based on
3771  * previous failure counts (pgcpfailcnt[]).
3772  *
3773  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3774  * pagesize page that satisfies mtype.
3775  */
3776 /* ARGSUSED */
3777 page_t *
3778 page_get_contig_pages(page_freelist_type_t *fp, int mnode, uint_t bin,
3779     int mtype, uchar_t szc, uint_t flags)
3780 {
3781 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3782 	page_t		*pp;
3783 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3784 
3785 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3786 
3787 	/* no allocations from cage */
3788 	flags |= PGI_NOCAGE;
3789 
3790 	/* LINTED */
3791 	MTYPE_START(mnode, mtype, flags);
3792 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3793 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3794 		return (NULL);
3795 	}
3796 
3797 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3798 
3799 	/* do not limit search and ignore color if hi pri */
3800 
3801 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3802 		pfnflag = pgcpfailcnt[szc];
3803 
3804 	/* remove color match to improve chances */
3805 
3806 	if (flags & PGI_PGCPHIPRI || pfnflag)
3807 		flags &= ~PG_MATCH_COLOR;
3808 
3809 	do {
3810 		/* get pfn range based on mnode and mtype */
3811 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3812 		ASSERT(pfnhi >= pfnlo);
3813 
3814 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3815 		    pfnlo, pfnhi, pfnflag);
3816 
3817 		if (pp != NULL) {
3818 			pfnflag = pgcpfailcnt[szc];
3819 			if (pfnflag) {
3820 				/* double the search size */
3821 				pgcpfailcnt[szc] = pfnflag >> 1;
3822 			}
3823 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3824 			return (pp);
3825 		}
3826 		MTYPE_NEXT(mnode, mtype, flags);
3827 	} while (mtype >= 0);
3828 
3829 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3830 	return (NULL);
3831 }
3832 
3833 #if defined(__i386) || defined(__amd64)
3834 /*
3835  * Determine the likelihood of finding/coalescing a szc page.
3836  * Return 0 if the likelihood is small; otherwise return 1.
3837  *
3838  * For now, be conservative and check only 1g pages and return 0
3839  * if there had been previous coalescing failures and the szc pages
3840  * needed to satisfy the request would exhaust most of freemem.
3841  */
3842 int
3843 page_chk_freelist(uint_t szc)
3844 {
3845 	pgcnt_t		pgcnt;
3846 
3847 	if (szc <= 1)
3848 		return (1);
3849 
3850 	pgcnt = page_get_pagecnt(szc);
3851 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3852 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3853 		return (0);
3854 	}
3855 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3856 	return (1);
3857 }
3858 #endif
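/*
 * Illustrative worked example (assuming the usual amd64 sizes, where szc 2
 * is a 1g page built from 262144 4k base pages): if pgcpfailcnt[2] is
 * nonzero and freemem <= throttlefree + 262144, page_chk_freelist(2)
 * returns 0 and the caller skips the expensive coalescing attempt.
 */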
3859 
3860 /*
3861  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3862  *
3863  * Does its own locking and accounting.
3864  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3865  * pages of the proper color even if there are pages of a different color.
3866  *
3867  * Finds a page, removes it, THEN locks it.
3868  */
3869 
3870 /*ARGSUSED*/
3871 page_t *
3872 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3873 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3874 {
3875 	page_t *pp;
3876 
3877 	PAGE_GET_FREELISTS(pp, vp, off, seg, vaddr, size, flags, lgrp);
3878 	return (pp);
3879 }
3880 
3881 /*
3882  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3883  *
3884  * Does its own locking.
3885  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3886  * pages of the proper color even if there are pages of a different color.
3887  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3888  * try to lock one of them.  If no page can be locked, try the
3889  * next bin.  Return NULL if a page can not be found and locked.
3890  *
3891  * Finds a page, tries to lock it, then removes it.
3892  */
3893 
3894 /*ARGSUSED*/
3895 page_t *
3896 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3897     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3898 {
3899 	page_t		*pp;
3900 	struct as	*as = seg->s_as;
3901 	ulong_t		bin;
3902 	/*LINTED*/
3903 	int		mnode;
3904 	int		mtype;
3905 	lgrp_mnode_cookie_t	lgrp_cookie;
3906 
3907 	/*
3908 	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
3909 	 * assume we wish to allocate near to the current thread's home.
3910 	 */
3911 	if (!LGRP_EXISTS(lgrp))
3912 		lgrp = lgrp_home_lgrp();
3913 
3914 	if (!kcage_on) {
3915 		flags &= ~PG_NORELOC;
3916 		flags |= PGI_NOCAGE;
3917 	}
3918 
3919 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3920 	    kcage_freemem <= kcage_throttlefree) {
3921 		/*
3922 		 * Reserve kcage_throttlefree pages for critical kernel
3923 		 * threads.
3924 		 *
3925 		 * Everybody else has to go to page_create_get_something()
3926 		 * to get a cage page, so we don't deadlock cageout.
3927 		 */
3928 		return (NULL);
3929 	}
3930 
3931 	/* LINTED */
3932 	AS_2_BIN(PFLT_USER, as, seg, vp, vaddr, bin, 0);
3933 
3934 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3935 
3936 	/* LINTED */
3937 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3938 
3939 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3940 
3941 	/*
3942 	 * Try local cachelists first
3943 	 */
3944 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3945 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3946 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3947 		if (pp != NULL) {
3948 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3949 			DTRACE_PROBE4(page__get,
3950 			    lgrp_t *, lgrp,
3951 			    int, mnode,
3952 			    ulong_t, bin,
3953 			    uint_t, flags);
3954 			return (pp);
3955 		}
3956 	}
3957 
3958 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3959 
3960 	/*
3961 	 * Try freelists/cachelists that are farther away
3962 	 * This is our only chance to allocate remote pages for PAGESIZE
3963 	 * requests.
3964 	 */
3965 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3966 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3967 		pp = page_get_mnode_freelist(ufltp, mnode, bin, mtype,
3968 		    0, flags);
3969 		if (pp != NULL) {
3970 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3971 			DTRACE_PROBE4(page__get,
3972 			    lgrp_t *, lgrp,
3973 			    int, mnode,
3974 			    ulong_t, bin,
3975 			    uint_t, flags);
3976 			return (pp);
3977 		}
3978 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3979 		if (pp != NULL) {
3980 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3981 			DTRACE_PROBE4(page__get,
3982 			    lgrp_t *, lgrp,
3983 			    int, mnode,
3984 			    ulong_t, bin,
3985 			    uint_t, flags);
3986 			return (pp);
3987 		}
3988 	}
3989 
3990 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3991 	return (NULL);
3992 }
3993 
3994 page_t *
3995 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3996 {
3997 	kmutex_t		*pcm;
3998 	page_t			*pp, *first_pp;
3999 	uint_t			sbin;
4000 	int			plw_initialized;
4001 	page_list_walker_t	plw;
4002 
4003 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
4004 
4005 	/* LINTED */
4006 	MTYPE_START(mnode, mtype, flags);
4007 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
4008 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
4009 		return (NULL);
4010 	}
4011 
4012 try_again:
4013 
4014 	plw_initialized = 0;
4015 	plw.plw_ceq_dif = 1;
4016 
4017 	/*
4018 	 * Only hold one cachelist lock at a time; that way we
4019 	 * can start anywhere and not have to worry about lock
4020 	 * ordering.
4021 	 */
4022 
4023 	for (plw.plw_count = 0;
4024 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
4025 		sbin = bin;
4026 		do {
4027 
4028 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
4029 				goto bin_empty_1;
4030 			/*
4031 			 * The first parameter is irrelevant here as the flags
4032 			 * parameter to this macro decides which mutex to lock.
4033 			 * With the PG_CACHE_LIST flag, we lock the cpc_mutex[].
4034 			 *
4035 			 * User pages from the kernel page freelist may be
4036 			 * on the cachelist.
4037 			 */
4038 			pcm = PC_BIN_MUTEX(PFLT_USER, mnode, bin,
4039 			    PG_CACHE_LIST);
4040 			mutex_enter(pcm);
4041 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
4042 			if (pp == NULL)
4043 				goto bin_empty_0;
4044 
4045 			first_pp = pp;
4046 			ASSERT(pp->p_vnode);
4047 			ASSERT(PP_ISAGED(pp) == 0);
4048 			ASSERT(pp->p_szc == 0);
4049 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
4050 			while (!page_trylock(pp, SE_EXCL)) {
4051 				pp = pp->p_next;
4052 				ASSERT(pp->p_szc == 0);
4053 				if (pp == first_pp) {
4054 					/*
4055 					 * We have searched the complete list!
4056 					 * And all of them (might only be one)
4057 					 * are locked. This can happen since
4058 					 * these pages can also be found via
4059 					 * the hash list. When found via the
4060 					 * hash list, they are locked first,
4061 					 * then removed. We give up to let the
4062 					 * other thread run.
4063 					 */
4064 					pp = NULL;
4065 					break;
4066 				}
4067 				ASSERT(pp->p_vnode);
4068 				ASSERT(PP_ISFREE(pp));
4069 				ASSERT(PP_ISAGED(pp) == 0);
4070 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4071 				    mnode);
4072 			}
4073 
4074 			if (pp) {
4075 				page_t	**ppp;
4076 				/*
4077 				 * Found and locked a page.
4078 				 * Pull it off the list.
4079 				 */
4080 				ASSERT(mtype == PP_2_MTYPE(pp));
4081 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4082 				page_sub(ppp, pp);
4083 				/*
4084 				 * Subtract counters before releasing pcm mutex
4085 				 * to avoid a race with page_freelist_coalesce
4086 				 * and page_freelist_split.
4087 				 */
4088 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4089 				mutex_exit(pcm);
4090 				ASSERT(pp->p_vnode);
4091 				ASSERT(PP_ISAGED(pp) == 0);
4092 #if defined(__sparc)
4093 				ASSERT(!kcage_on ||
4094 				    (flags & PG_NORELOC) == 0 ||
4095 				    PP_ISNORELOC(pp));
4096 				if (PP_ISNORELOC(pp)) {
4097 					kcage_freemem_sub(1);
4098 				}
4099 #elif defined(__amd64) && !defined(__xpv)
4100 				if (PP_ISKFLT(pp)) {
4101 					kflt_freemem_sub(1);
4102 				}
4103 #endif /* __sparc */
4104 				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
4105 				return (pp);
4106 			}
4107 bin_empty_0:
4108 			mutex_exit(pcm);
4109 bin_empty_1:
4110 			if (plw_initialized == 0) {
4111 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
4112 				plw_initialized = 1;
4113 			}
4114 			/* calculate the next bin with equivalent color */
4115 			bin = ADD_MASKED(bin, plw.plw_bin_step,
4116 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
4117 		} while (sbin != bin);
4118 
4119 		if (plw.plw_ceq_dif > 1)
4120 			bin = page_list_walk_next_bin(0, bin, &plw);
4121 	}
4122 
4123 	MTYPE_NEXT(mnode, mtype, flags);
4124 	if (mtype >= 0)
4125 		goto try_again;
4126 
4127 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4128 	return (NULL);
4129 }
4130 
4131 #ifdef DEBUG
4132 #define	REPL_PAGE_STATS
4133 #endif /* DEBUG */
4134 
4135 #ifdef REPL_PAGE_STATS
4136 struct repl_page_stats {
4137 	uint_t	ngets;
4138 	uint_t	ngets_noreloc;
4139 	uint_t	npgr_noreloc;
4140 	uint_t	nnopage_first;
4141 	uint_t	nnopage;
4142 	uint_t	nhashout;
4143 	uint_t	nnofree;
4144 	uint_t	nnext_pp;
4145 } repl_page_stats;
4146 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
4147 #else /* REPL_PAGE_STATS */
4148 #define	REPL_STAT_INCR(v)
4149 #endif /* REPL_PAGE_STATS */
4150 
4151 int	pgrppgcp;
4152 
4153 /*
4154  * The freemem accounting must be done by the caller.
4155  * First we try to get a replacement page of the same size as like_pp;
4156  * if that is not possible, then we just get a set of discontiguous
4157  * PAGESIZE pages.
4158  */
4159 page_t *
4160 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4161     uint_t pgrflags)
4162 {
4163 	page_t		*like_pp;
4164 	page_t		*pp, *pplist;
4165 	page_t		*pl = NULL;
4166 	ulong_t		bin;
4167 	int		mnode, page_mnode;
4168 	int		szc;
4169 	spgcnt_t	npgs, pg_cnt;
4170 	pfn_t		pfnum;
4171 	int		mtype;
4172 	int		flags = 0;
4173 	lgrp_mnode_cookie_t	lgrp_cookie;
4174 	lgrp_t		*lgrp;
4175 
4176 	REPL_STAT_INCR(ngets);
4177 	like_pp = orig_like_pp;
4178 	ASSERT(PAGE_EXCL(like_pp));
4179 
4180 	szc = like_pp->p_szc;
4181 	npgs = page_get_pagecnt(szc);
4182 	/*
4183 	 * Now we reset like_pp to the base page_t.
4184 	 * That way, we won't walk past the end of this 'szc' page.
4185 	 */
4186 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4187 	like_pp = page_numtopp_nolock(pfnum);
4188 	ASSERT(like_pp->p_szc == szc);
4189 
4190 	if (PP_ISNORELOC(like_pp)) {
4191 		ASSERT(kcage_on);
4192 		REPL_STAT_INCR(ngets_noreloc);
4193 		flags = PGI_RELOCONLY;
4194 	} else if (pgrflags & PGR_NORELOC) {
4195 		ASSERT(kcage_on);
4196 		REPL_STAT_INCR(npgr_noreloc);
4197 		flags = PG_NORELOC;
4198 	}
4199 
4200 	/*
4201 	 * Kernel pages must always be replaced with the same size
4202 	 * pages, since we cannot properly handle demotion of kernel
4203 	 * pages.
4204 	 */
4205 	if (PP_ISKAS(like_pp))
4206 		pgrflags |= PGR_SAMESZC;
4207 
4208 	/* LINTED */
4209 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4210 
4211 	while (npgs) {
4212 		pplist = NULL;
4213 		for (;;) {
4214 			pg_cnt = page_get_pagecnt(szc);
4215 			bin = PP_2_BIN(like_pp);
4216 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4217 			ASSERT(pg_cnt <= npgs);
4218 
4219 			/*
4220 			 * If an lgroup was specified, try to get the
4221 			 * page from that lgroup.
4222 			 * NOTE: Must be careful with code below because
4223 			 *	 lgroup may disappear and reappear since there
4224 			 *	 is no locking for lgroup here.
4225 			 */
4226 			if (LGRP_EXISTS(lgrp_target)) {
4227 				/*
4228 				 * Keep local variable for lgroup separate
4229 				 * from lgroup argument since this code should
4230 				 * only be exercised when lgroup argument
4231 				 * exists....
4232 				 */
4233 				lgrp = lgrp_target;
4234 
4235 				/* Try the lgroup's freelists first */
4236 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4237 				    LGRP_SRCH_LOCAL);
4238 				while ((pplist == NULL) &&
4239 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4240 				    != -1) {
4241 					pplist =
4242 					    page_get_mnode_freelist(ufltp,
4243 					    mnode, bin, mtype, szc, flags);
4244 				}
4245 
4246 				/*
4247 				 * Now try its cachelists if this is a
4248 				 * small page. Don't need to do it for
4249 				 * larger ones since page_freelist_coalesce()
4250 				 * already failed.
4251 				 */
4252 				if (pplist != NULL || szc != 0)
4253 					break;
4254 
4255 				/* Now try its cachelists */
4256 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4257 				    LGRP_SRCH_LOCAL);
4258 
4259 				while ((pplist == NULL) &&
4260 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4261 				    != -1) {
4262 					pplist =
4263 					    page_get_mnode_cachelist(bin, flags,
4264 					    mnode, mtype);
4265 				}
4266 				if (pplist != NULL) {
4267 					page_hashout(pplist, NULL);
4268 					PP_SETAGED(pplist);
4269 					REPL_STAT_INCR(nhashout);
4270 					break;
4271 				}
4272 				/* Done looking in this lgroup. Bail out. */
4273 				break;
4274 			}
4275 
4276 			/*
4277 			 * No lgroup was specified (or the lgroup was removed
4278 			 * by DR), so just try to get the page as close to
4279 			 * like_pp's mnode as possible.
4280 			 * First try the local freelist...
4281 			 */
4282 			mnode = PP_2_MEM_NODE(like_pp);
4283 			pplist = page_get_mnode_freelist(ufltp, mnode, bin,
4284 			    mtype, szc, flags);
4285 			if (pplist != NULL)
4286 				break;
4287 
4288 			REPL_STAT_INCR(nnofree);
4289 
4290 			/*
4291 			 * ...then the local cachelist. Don't need to do it for
4292 			 * larger pages because page_freelist_coalesce() already
4293 			 * failed there anyway.
4294 			 */
4295 			if (szc == 0) {
4296 				pplist = page_get_mnode_cachelist(bin, flags,
4297 				    mnode, mtype);
4298 				if (pplist != NULL) {
4299 					page_hashout(pplist, NULL);
4300 					PP_SETAGED(pplist);
4301 					REPL_STAT_INCR(nhashout);
4302 					break;
4303 				}
4304 			}
4305 
4306 			/* Now try remote freelists */
4307 			page_mnode = mnode;
4308 			lgrp =
4309 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4310 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4311 			    LGRP_SRCH_HIER);
4312 			while (pplist == NULL &&
4313 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4314 			    != -1) {
4315 				/*
4316 				 * Skip local mnode.
4317 				 */
4318 				if ((mnode == page_mnode) ||
4319 				    (mem_node_config[mnode].exists == 0))
4320 					continue;
4321 
4322 				pplist = page_get_mnode_freelist(ufltp, mnode,
4323 				    bin, mtype, szc, flags);
4324 			}
4325 
4326 			if (pplist != NULL)
4327 				break;
4328 
4329 			/* Now try remote cachelists */
4330 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4331 			    LGRP_SRCH_HIER);
4332 			while (pplist == NULL && szc == 0) {
4333 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4334 				if (mnode == -1)
4335 					break;
4336 				/*
4337 				 * Skip local mnode.
4338 				 */
4339 				if ((mnode == page_mnode) ||
4340 				    (mem_node_config[mnode].exists == 0))
4341 					continue;
4342 
4343 				pplist = page_get_mnode_cachelist(bin,
4344 				    flags, mnode, mtype);
4345 
4346 				if (pplist != NULL) {
4347 					page_hashout(pplist, NULL);
4348 					PP_SETAGED(pplist);
4349 					REPL_STAT_INCR(nhashout);
4350 					break;
4351 				}
4352 			}
4353 
4354 			/*
4355 			 * Break out of while loop under the following cases:
4356 			 * - If we successfully got a page.
4357 			 * - If pgrflags specified only returning a specific
4358 			 *   page size and we could not find that page size.
4359 			 * - If we could not satisfy the request with PAGESIZE
4360 			 *   or larger pages.
4361 			 */
4362 			if (pplist != NULL || szc == 0)
4363 				break;
4364 
4365 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4366 				/* try to find contig page */
4367 
4368 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4369 				    LGRP_SRCH_HIER);
4370 
4371 				while ((pplist == NULL) &&
4372 				    (mnode =
4373 				    lgrp_memnode_choose(&lgrp_cookie))
4374 				    != -1) {
4375 					pplist = page_get_contig_pages(
4376 					    ufltp, mnode, bin, mtype, szc,
4377 					    flags | PGI_PGCPHIPRI);
4378 				}
4379 				break;
4380 			}
4381 
4382 			/*
4383 			 * The correct thing to do here is try the next
4384 			 * page size down using szc--. Due to a bug
4385 			 * with the processing of HAT_RELOAD_SHARE
4386 			 * where the sfmmu_ttecnt arrays of all
4387 			 * hats sharing an ISM segment don't get updated,
4388 			 * using intermediate size pages for relocation
4389 			 * can lead to continuous page faults.
4390 			 */
4391 			szc = 0;
4392 		}
4393 
4394 		if (pplist != NULL) {
4395 			DTRACE_PROBE4(page__get,
4396 			    lgrp_t *, lgrp,
4397 			    int, mnode,
4398 			    ulong_t, bin,
4399 			    uint_t, flags);
4400 
4401 			while (pplist != NULL && pg_cnt--) {
4402 				ASSERT(pplist != NULL);
4403 				pp = pplist;
4404 				page_sub(&pplist, pp);
4405 				PP_CLRFREE(pp);
4406 				PP_CLRAGED(pp);
4407 				page_list_concat(&pl, &pp);
4408 				npgs--;
4409 				like_pp = like_pp + 1;
4410 				REPL_STAT_INCR(nnext_pp);
4411 			}
4412 			ASSERT(pg_cnt == 0);
4413 		} else {
4414 			break;
4415 		}
4416 	}
4417 
4418 	if (npgs) {
4419 		/*
4420 		 * We were unable to allocate the necessary number
4421 		 * of pages.
4422 		 * We need to free up any pl.
4423 		 */
4424 		REPL_STAT_INCR(nnopage);
4425 		page_free_replacement_page(pl);
4426 		return (NULL);
4427 	} else {
4428 		return (pl);
4429 	}
4430 }
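/*
 * Hedged usage sketch (illustrative only; names marked hypothetical are not
 * from this file): a relocation caller typically holds the original page
 * SE_EXCL, obtains replacements and hands them back if the move fails:
 *
 *	repl = page_get_replacement_page(targ, NULL, 0);
 *	if (repl == NULL)
 *		return (ENOMEM);
 *	if (relocate_one(targ, repl) != 0)	(hypothetical helper)
 *		page_free_replacement_page(repl);
 */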
4431 
4432 /*
4433  * demote a free large page to its constituent pages
4434  */
4435 void
4436 page_demote_free_pages(page_t *pp)
4437 {
4438 
4439 	int mnode;
4440 
4441 	ASSERT(pp != NULL);
4442 	ASSERT(PAGE_LOCKED(pp));
4443 	ASSERT(PP_ISFREE(pp));
4444 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4445 
4446 	mnode = PP_2_MEM_NODE(pp);
4447 	page_freelist_lock(mnode);
4448 	if (pp->p_szc != 0) {
4449 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4450 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4451 	}
4452 	page_freelist_unlock(mnode);
4453 	ASSERT(pp->p_szc == 0);
4454 }
4455 
4456 /*
4457  * Factor in colorequiv to check additional 'equivalent' bins.
4458  * colorequiv may be set in /etc/system
4459  */
4460 void
4461 page_set_colorequiv_arr(void)
4462 {
4463 	if (colorequiv > 1) {
4464 		int i;
4465 		uint_t sv_a = lowbit(colorequiv) - 1;
4466 
4467 		if (sv_a > 15)
4468 			sv_a = 15;
4469 
4470 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4471 			uint_t colors;
4472 			uint_t a = sv_a;
4473 
4474 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4475 				continue;
4476 			}
4477 			while ((colors >> a) == 0)
4478 				a--;
4479 			if ((a << 4) > colorequivszc[i]) {
4480 				colorequivszc[i] = (a << 4);
4481 			}
4482 		}
4483 	}
4484 }
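/*
 * Illustrative worked example (not part of the original source): with
 * colorequiv set to 4 in /etc/system, lowbit(4) is 3, so sv_a = 2; for any
 * page size with hp_colors >= 4 the loop leaves a at 2 and records
 * colorequivszc[i] = 0x20, the value used in the ceq_mask computation in
 * page_geti_contig_pages() above.
 */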
4485 
4486 /*
4487  * The freelist type data structures allow freelist type specific allocation
4488  * and policy routines to be configured.  There are two freelist types currently
4489  * defined, one for kernel memory allocation and the other for user memory.
4490  * The page_get_uflt() routine is called by the PAGE_GET_FREELISTS() macro to
4491  * allocate memory from the user freelist type.
4492  */
4493 
4494 /* ARGSUSED */
4495 page_t *
4496 page_get_uflt(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t vaddr,
4497     size_t size, uint_t flags, struct lgrp *lgrp)
4498 {
4499 	struct as	*as = seg->s_as;
4500 	ulong_t		bin;
4501 	uchar_t		szc;
4502 	int		mtype;
4503 
4504 	/*
4505 	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
4506 	 * assume we wish to allocate near the current thread's home.
4507 	 */
4508 	if (!LGRP_EXISTS(lgrp))
4509 		lgrp = lgrp_home_lgrp();
4510 
4511 	if (kcage_on) {
4512 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
4513 		    kcage_freemem < kcage_throttlefree + btop(size) &&
4514 		    curthread != kcage_cageout_thread) {
4515 			/*
4516 			 * Set a "reserve" of kcage_throttlefree pages for
4517 			 * PG_PANIC and cageout thread allocations.
4518 			 *
4519 			 * Everybody else has to serialize in
4520 			 * page_create_get_something() to get a cage page, so
4521 			 * that we don't deadlock cageout!
4522 			 */
4523 			return (NULL);
4524 		}
4525 	} else {
4526 		flags &= ~PG_NORELOC;
4527 		flags |= PGI_NOCAGE;
4528 	}
4529 
4530 	/* LINTED */
4531 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
4532 
4533 	/*
4534 	 * Convert size to page size code.
4535 	 */
4536 	if ((szc = page_szc(size)) == (uchar_t)-1)
4537 		panic("page_get_uflt: illegal page size request");
4538 	ASSERT(szc < mmu_page_sizes);
4539 
4540 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc][ufltp->pflt_type]);
4541 
4542 	/* LINTED */
4543 	AS_2_BIN(PFLT_USER, as, seg, vp, vaddr, bin, szc);
4544 
4545 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
4546 
4547 	return (page_get_flist(ufltp, bin, mtype, szc, flags, lgrp));
4548 }
4549 
4550 /*
4551  * This routine is passed a page color and initial mtype, and calls the page
4552  * freelist type policy routines which actually do the allocations, first
4553  * trying the local and then remote lgroups. The policy routines for user
4554  * page allocations are currently configured to be:
4555  *
4556  *  x64 systems support two freelist types, user and kernel.
4557  *
4558  * The user freelist has 3 policy routines.
4559  *
4560  *  1. page_get_mnode_freelist to allocate a page from the user freelists.
4561  *  2. page_user_alloc_kflt to allocate a page from the kernel freelists
4562  *  3. page_get_contig_pages to search for a large page in physical memory.
4563  *
4564  * The kernel freelist has only 1 policy routine.
4565  *
4566  * 1. page_get_mnode_freelist to allocate a page from the kernel freelists.
4567  *
4568  *  Sparc, x32 and Xen systems support only the user freelist type.
4569  *
4570  * The user freelist has 2 policy routines.
4571  *
4572  *  1. page_get_mnode_freelist to allocate a page from the user freelists.
4573  *  2. page_get_contig_pages to search for a large page in physical memory.
4574  *
4575  */
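/*
 * Purely illustrative sketch (conceptual; the real structure layout lives
 * in vm_dep.h): a freelist type is essentially an ordered table of policy
 * routines that page_get_flist() walks via PAGE_GET_FREELISTS_POLICY().
 * For the amd64 user freelist type described above the table is,
 * conceptually:
 *
 *	policy[0] = page_get_mnode_freelist
 *	policy[1] = page_user_alloc_kflt
 *	policy[2] = page_get_contig_pages
 */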
4576 page_t *
4577 page_get_flist(page_freelist_type_t *fltp, uint_t bin, int mtype,
4578     uchar_t szc, uint_t flags, struct lgrp *lgrp)
4579 {
4580 	page_t		*pp = NULL;
4581 	page_t		*(*page_get_func)(page_freelist_type_t *,
4582 	    int, uint_t, int, uchar_t, uint_t);
4583 	lgrp_mnode_cookie_t	lgrp_cookie;
4584 	int 		i;
4585 	int		mnode;
4586 
4587 	for (i = 0; i < fltp->pflt_num_policies; i++) {
4588 		page_get_func =  PAGE_GET_FREELISTS_POLICY(fltp, i);
4589 
4590 		/*
4591 		 * When the cage and the kernel freelist are off, chances are
4592 		 * that page_get_contig_pages() will fail to lock a large
4593 		 * page chunk; therefore it is not called by default in this
4594 		 * case. This can be changed via /etc/system.
4595 		 *
4596 		 * page_get_contig_pages() is also called to acquire a base
4597 		 * pagesize page for page_create_get_something().
4598 		 */
4599 		if (page_get_func == page_get_contig_pages) {
4600 			if ((flags & PG_NORELOC) ||
4601 			    (pg_contig_disable != 0) ||
4602 			    (!kcage_on && !kflt_on &&
4603 			    !pg_lpgcreate_nocage && szc != 0)) {
4604 				continue;
4605 #ifdef VM_STATS
4606 			} else {
4607 				VM_STAT_ADD(
4608 				    vmm_vmstats.
4609 				    pgf_allocretry[szc][fltp->pflt_type]);
4610 #endif
4611 			}
4612 		}
4613 
4614 		/*
4615 		 * Try to get a local page first, but try remote if we can't
4616 		 * get a page of the right color.
4617 		 */
4618 		LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
4619 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
4620 
4621 			pp = page_get_func(fltp, mnode, bin, mtype, szc,
4622 			    flags);
4623 			if (pp != NULL) {
4624 #ifdef VM_STATS
4625 				VM_STAT_ADD(
4626 				    vmm_vmstats.
4627 				    pgf_allocok[szc][fltp->pflt_type]);
4628 #endif
4629 				DTRACE_PROBE4(page__get__page,
4630 				    lgrp_t *, lgrp,
4631 				    int, mnode,
4632 				    ulong_t, bin,
4633 				    uint_t, flags);
4634 				return (pp);
4635 			}
4636 		}
4637 		ASSERT(pp == NULL);
4638 
4639 		/*
4640 		 * for non-PGI_PGCPSZC0 PAGESIZE requests, check cachelist
4641 		 * before checking remote free lists. Caller expected to call
4642 		 * page_get_cachelist which will check local cache lists
4643 		 * and remote free lists.
4644 		 */
4645 		if (!PC_ISKFLT(fltp) && szc == 0 &&
4646 		    ((flags & PGI_PGCPSZC0) == 0)) {
4647 			VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
4648 			return (NULL);
4649 		}
4650 
4651 		ASSERT(PC_ISKFLT(fltp) || szc > 0 || (flags & PGI_PGCPSZC0));
4652 
4653 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
4654 
4655 		if (!(flags & PG_LOCAL)) {
4656 			/*
4657 			 * Try to get a non-local freelist page.
4658 			 */
4659 			LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
4660 			while ((mnode =
4661 			    lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
4662 				pp = page_get_func(fltp, mnode, bin, mtype,
4663 				    szc, flags);
4664 				if (pp != NULL) {
4665 					DTRACE_PROBE4(page__get,
4666 					    lgrp_t *, lgrp,
4667 					    int, mnode,
4668 					    ulong_t, bin,
4669 					    uint_t, flags);
4670 #ifdef VM_STATS
4671 					VM_STAT_ADD(vmm_vmstats.
4672 					    pgf_allocokrem[szc]
4673 					    [fltp->pflt_type]);
4674 #endif
4675 					return (pp);
4676 				}
4677 			}
4678 			ASSERT(pp == NULL);
4679 		}
4680 
4681 		if (!(flags & PG_LOCAL) && pgcplimitsearch &&
4682 		    page_get_func == page_get_contig_pages)
4683 			SETPGCPFAILCNT(szc);
4684 	}
4685 
4686 #ifdef VM_STATS
4687 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc][fltp->pflt_type]);
4688 #endif
4689 
4690 	return (NULL);
4691 }
4692 #if defined(__amd64) && !defined(__xpv)
4693 /*
4694  * The page_get_kflt() routine is called by the PAGE_GET_FREELISTS() macro to
4695  * allocate memory from the kernel freelist type.
4696  */
4697 /* ARGSUSED */
4698 page_t *
4699 page_get_kflt(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t vaddr,
4700     size_t size, uint_t flags, struct lgrp *lgrp)
4701 {
4702 	struct as	*as = seg->s_as;
4703 	page_t		*pp = NULL;
4704 	ulong_t		bin;
4705 	uchar_t		szc;
4706 	int		mtype;
4707 
4708 	ASSERT(!kcage_on);
4709 	ASSERT(kflt_on);
4710 	ASSERT((flags & PG_KFLT) == PG_KFLT);
4711 
4712 	flags &= ~PG_NORELOC;
4713 	flags |= PGI_NOCAGE;
4714 
4715 	if ((flags & PG_PANIC) == 0 &&
4716 	    kflt_freemem < kflt_throttlefree + btop(size) &&
4717 	    curthread != kflt_evict_thread) {
4718 		return (NULL);
4719 	}
4720 
4721 	/* LINTED */
4722 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
4723 
4724 	/*
4725 	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
4726 	 * assume we wish to allocate near to the current thread's home.
4727 	 */
4728 	if (!LGRP_EXISTS(lgrp))
4729 		lgrp = lgrp_home_lgrp();
4730 
4731 	/*
4732 	 * Convert size to page size code.
4733 	 */
4734 	if ((szc = page_szc(size)) == (uchar_t)-1)
4735 		panic("page_get_kflt: illegal page size request");
4736 	ASSERT(szc == 0);
4737 	ASSERT(!(flags & PG_LOCAL));
4738 
4739 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc][kfltp->pflt_type]);
4740 
4741 	/* LINTED */
4742 	AS_2_BIN(PFLT_KMEM, as, seg, vp, vaddr, bin, szc);
4743 
4744 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
4745 	ASSERT(bin < KFLT_PAGE_COLORS);
4746 
4747 retry:
4748 	pp = page_get_flist(kfltp, bin, mtype, szc, flags, lgrp);
4749 
4750 	if (pp != NULL) {
4751 		return (pp);
4752 	}
4753 
4754 #if defined(__amd64)
4755 	if (kernel_page_update_flags_x86(&flags)) {
4756 		goto retry;
4757 	}
4758 #endif
4759 	/*
4760 	 * Import memory from user page freelists.
4761 	 */
4762 
4763 	/* LINTED: constant in conditional context */
4764 	AS_2_BIN(PFLT_USER, as, seg, vp, vaddr, bin, KFLT_PAGESIZE);
4765 
4766 	ASSERT(bin < PAGE_GET_PAGECOLORS(KFLT_PAGESIZE));
4767 
4768 	if ((pp = page_import_kflt(kfltp, bin, mtype, szc,
4769 	    flags | PGI_NOPGALLOC | PGI_PGCPHIPRI, NULL)) != NULL) {
4770 		VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc][kfltp->pflt_type]);
4771 		return (pp);
4772 	}
4773 
4774 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc][kfltp->pflt_type]);
4775 	return (NULL);
4776 }
4777 
4778 /*
4779  * This is the policy routine used to allocate user memory on the kernel
4780  * freelist.
4781  */
4782 /* ARGSUSED */
4783 page_t *
4784 page_user_alloc_kflt(page_freelist_type_t *fp, int mnode, uint_t bin, int mtype,
4785     uchar_t szc, uint_t flags)
4786 {
4787 	page_t *pp;
4788 
4789 	if (szc != 0)
4790 		return (NULL);
4791 
4792 	if (kflt_freemem < kflt_desfree) {
4793 		kflt_evict_wakeup();
4794 	}
4795 	flags &= ~PG_MATCH_COLOR;
4796 
4797 	bin = USER_2_KMEM_BIN(bin);
4798 
4799 	if ((pp = page_get_mnode_freelist(kfltp, mnode,
4800 	    bin, mtype, szc, flags)) != NULL) {
4801 		VM_STAT_ADD(vmm_vmstats.puak_allocok);
4802 		atomic_add_long(&kflt_user_alloc, 1);
4803 		PP_SETUSERKFLT(pp);
4804 		return (pp);
4805 	}
4806 
4807 	VM_STAT_ADD(vmm_vmstats.puak_allocfailed);
4808 	return (NULL);
4809 }
4810 
4811 /*
4812  * This routine is called in order to allocate a large page from the user page
4813  * freelist and split this into small pages which are then placed on the kernel
4814  * freelist. If it is called from the kflt_expand() routine, the PGI_NOPGALLOC
4815  * flag is set to indicate that all pages should be placed on the freelist,
4816  * otherwise a page of the requested type and color will be returned.
4817  */
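/*
 * Illustrative worked example (assuming KFLT_PAGESIZE is the 2m size code
 * on amd64 with 4k base pages): a successful import splits one large page
 * into 512 PAGESIZE pages; if PGI_NOPGALLOC is clear and a constituent
 * page matches the requested bin and mtype, that page is returned and
 * kflt_freemem is credited with the other 511, otherwise all 512 go onto
 * the kernel freelist.
 */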
4818 /* ARGSUSED */
4819 page_t *
4820 page_import_kflt(page_freelist_type_t *fp, uint_t bin, int mtype,
4821     uchar_t szc, uint_t flags, int *np)
4822 {
4823 	page_t *pp, *pplist;
4824 	uint_t alloc_szc = KFLT_PAGESIZE;
4825 	kmutex_t *pcm;
4826 	page_t	*ret_pp = NULL;
4827 	uint_t	req_bin = bin;
4828 	int	req_mtype = mtype;
4829 	int	pgcnt = 0;
4830 	int	pgalloc;
4831 	int	mnode;
4832 	struct lgrp *lgrp;
4833 
4834 	ASSERT(szc == 0);
4835 
4836 	flags &= ~(PG_LOCAL|PG_MATCH_COLOR);
4837 	lgrp = lgrp_home_lgrp();
4838 
4839 	pgalloc = ((flags & PGI_NOPGALLOC) == 0);
4840 
4841 	/* Allocate a large page from the user pagelist */
4842 	if ((pplist = page_get_flist(ufltp, bin, mtype, alloc_szc,
4843 	    flags, lgrp)) != NULL) {
4844 
4845 		VM_STAT_ADD(vmm_vmstats.pgik_allocok);
4846 		CHK_LPG(pplist, alloc_szc);
4847 		mnode = PP_2_MEM_NODE(pplist);
4848 		/*
4849 		 * Split up the large page and put the constituent pages
4850 		 * on the kernel freelist.
4851 		 */
4852 		while (pplist) {
4853 			pgcnt++;
4854 			pp = pplist;
4855 			ASSERT(pp->p_szc == alloc_szc);
4856 			ASSERT(PP_ISFREE(pp));
4857 			mach_page_sub(&pplist, pp);
4858 
4859 			pp->p_szc = 0;
4860 			PP_SETKFLT(pp);
4861 			mtype = PP_2_MTYPE(pp);
4862 			bin = PP_2_BIN(pp);
4863 			if (pgalloc && (ret_pp == NULL) &&
4864 			    ((bin == req_bin && mtype == req_mtype))) {
4865 				ret_pp = pp;
4866 			} else {
4867 				pcm = PC_BIN_MUTEX(PFLT_KMEM, mnode, bin,
4868 				    PG_FREE_LIST);
4869 				ASSERT(mtype == PP_2_MTYPE(pp));
4870 				mutex_enter(pcm);
4871 				mach_page_add(PAGE_FREELISTP(PFLT_KMEM, mnode,
4872 				    0, bin, mtype), pp);
4873 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
4874 				mutex_exit(pcm);
4875 				page_unlock(pp);
4876 			}
4877 		}
4878 
4879 		if (np != NULL)
4880 			*np = pgcnt;
4881 
4882 		if (ret_pp == NULL) {
4883 			kflt_freemem_add(pgcnt);
4884 		} else {
4885 			kflt_freemem_add(pgcnt - 1);
4886 		}
4887 		return (ret_pp);
4888 
4889 	} else {
4890 
4891 		VM_STAT_ADD(vmm_vmstats.pgik_allocfailed);
4892 		return (NULL);
4893 	}
4894 }
4895 
4896 /*
4897  * This routine is called from the kflt_user_evict() thread when kernel
4898  * memory is low and the thread has not managed to increase it by freeing up
4899  * user pages.
4900  */
4901 void
4902 kflt_expand()
4903 {
4904 	ulong_t		bin;
4905 	int		mtype;
4906 	uint_t		flags;
4907 	spgcnt_t 	wanted;
4908 	caddr_t		vaddr;
4909 	int		np;
4910 	int		lpallocated  = 0;
4911 	int		retries;
4912 
4913 	ASSERT(kflt_on);
4914 	vaddr = 0;
4915 	flags = PGI_NOPGALLOC | PGI_PGCPHIPRI;
4916 
4917 	wanted = MAX(kflt_lotsfree, kflt_throttlefree + kflt_needfree)
4918 	    - kflt_freemem;
4919 
4920 	if (wanted <= 0) {
4921 		return;
4922 	}
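	/*
	 * Illustrative arithmetic (hypothetical tunable values, not from
	 * this file): with kflt_lotsfree = 4096, kflt_throttlefree = 1024,
	 * kflt_needfree = 0 and kflt_freemem = 1000, wanted is
	 * 4096 - 1000 = 3096 pages, i.e. roughly seven 512-page imports
	 * assuming 2m large pages and 4k base pages.
	 */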
4923 
4924 	/* LINTED */
4925 	MTYPE_INIT(mtype, &kvp, vaddr, flags, KFLT_PAGESIZE);
4926 
4927 #if defined(__amd64)
4928 	(void) kernel_page_update_flags_x86(&flags);
4929 #endif
4930 	/* LINTED */
4931 	AS_2_BIN(PFLT_USER, &kas, NULL, &kvp, vaddr, bin, 1);
4932 
4933 	retries = 0;
4934 	while (kflt_on && wanted > 0) {
4935 		(void) page_import_kflt(kfltp, bin, mtype, 0,
4936 		    flags, &np);
4937 
4938 		if (np == 0) {
4939 			if (lpallocated == 0 &&
4940 			    retries < KFLT_EXPAND_RETRIES) {
4941 				retries++;
4942 				ASSERT((flags & (PGI_NOPGALLOC | PGI_PGCPHIPRI))
4943 				    == (PGI_NOPGALLOC | PGI_PGCPHIPRI));
4944 				continue;
4945 			}
4946 			break;
4947 		} else {
4948 			wanted -= np;
4949 			lpallocated = 1;
4950 		}
4951 
4952 	}
4953 
4954 #ifdef DEBUG
4955 	if (lpallocated) {
4956 		VM_STAT_ADD(vmm_vmstats.pgkx_allocok);
4957 	} else {
4958 		VM_STAT_ADD(vmm_vmstats.pgkx_allocfailed);
4959 	}
4960 #endif
4961 }
4962 #endif /* __amd64 && !__xpv */
4963