/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	All Rights Reserved   */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */


/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/vmsystm.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>
#include <sys/dumphdr.h>
#include <sys/swap.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * Number of page colors equivalent to the requested color in the page_get
 * routines.  If set, this keeps large pages intact longer and keeps MPO
 * allocation from the local mnode rather than acquiring the 'correct' page
 * color from a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * Color equivalency mask for each page size.
 * The mask is computed based on cpu L2$ way sizes and the colorequiv global.
 * The high 4 bits determine the number of high order bits of the color to
 * ignore.  The low 4 bits determine the number of low order bits of the
 * color to ignore (only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];
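
/*
 * Illustrative sketch (not compiled): decoding a colorequivszc[] entry
 * into its two 4-bit fields per the nibble layout documented above.
 * The helper name and variables are hypothetical.
 */
#if 0
static void
colorequiv_decode(uchar_t eqv, uint_t *hi_shift, uint_t *lo_shift)
{
	*hi_shift = (eqv >> 4) & 0xf;	/* high order color bits to ignore */
	*lo_shift = eqv & 0xf;		/* low order bits (hashed coloring) */
}
#endif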

/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure counts in
 * pgcpfailcnt[].  Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *    - all large page 'slots' will be searched over time
 *    - at least one large page candidate is considered on each pgcp call
 *    - the count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
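
/*
 * Illustrative sketch (not compiled): after repeated failures the
 * counter saturates below PGCPFAILMAX instead of wrapping to 0, which
 * is the invariant the comment above relies on.
 */
#if 0
static void
pgcpfail_demo(int szc)
{
	int i;

	for (i = 0; i < 1000; i++)
		SETPGCPFAILCNT(szc);
	ASSERT(pgcpfailcnt[szc] > 0 && pgcpfailcnt[szc] < PGCPFAILMAX);
}
#endif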

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;
#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
	uint_t	pad[12];
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
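
/*
 * Illustrative sketch (not compiled): summing coalescing candidates
 * for one mnode/mrange/region/color using the macros above.  The
 * helper name is hypothetical.
 */
#if 0
static pgcnt_t
pgctrs_cands_demo(int mnode, int mrange, int szc, uint_t color)
{
	pgcnt_t total, per_color;

	PGCTRS_CANDS_GETVALUE(mnode, mrange, szc, total);
	PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, szc, color, per_color);

	/* no point searching page_counters when nothing can coalesce */
	return (total == 0 ? 0 : per_color);
}
#endif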

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
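
/*
 * Illustrative sketch (not compiled): picking the ctr_mutex slot for a
 * page, mirroring the locking done in page_ctr_add()/page_ctr_sub().
 */
#if 0
static kmutex_t *
pp_ctr_lock(page_t *pp, int mnode)
{
	return (&ctr_mutex[PP_CTR_LOCK_INDX(pp)][mnode]);
}
#endif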

#define	INVALID_COLOR 0xffffffff
#define	INVALID_MASK  0xffffffff

/*
 * Local function prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want.  We calculate the index and look at a specific
 *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be created
 *	from 8K pages in order to make a single free 512k page at the given
 *	index.  Note that when a region is full, it will contribute to the
 *	counts in the region above it.  Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
#if defined(__sparc)
	uint_t		pad[4];
#endif
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call on x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
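
/*
 * Illustrative sketch (not compiled): a pfn maps to a counter index
 * and back (rounded down to the region boundary), and a counter value
 * of FULL_REGION_CNT(r) means the whole region is free or creatable,
 * as in the sparc example above.  The helper name is hypothetical.
 */
#if 0
static int
region_is_full(int mnode, int r, pfn_t pfn)
{
	size_t idx = PNUM_TO_IDX(mnode, r, pfn);

	ASSERT(IDX_TO_PNUM(mnode, r, idx) <= pfn);
	return (PAGE_COUNTERS(mnode, r, idx) == FULL_REGION_CNT(r));
}
#endif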

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];

/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}
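
/*
 * Illustrative sketch (not compiled): the over-allocate-and-align
 * pattern used above.  Allocating P2ROUNDUP(len, align) + align bytes
 * guarantees an align-boundary address with len usable bytes inside
 * the buffer; 'len' and 'align' are hypothetical stand-ins here.
 */
#if 0
size_t	sz = P2ROUNDUP(len, align) + align;
void	*kmptr = kmem_zalloc(sz, KM_SLEEP);
void	*aligned = (void *)P2ROUNDUP((uintptr_t)kmptr, align);

ASSERT((uintptr_t)aligned + len <= (uintptr_t)kmptr + sz);
#endif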

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}


/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}
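
/*
 * Illustrative usage (not compiled): translate a page size to its
 * size code; -1 indicates the size is unsupported on this platform.
 */
#if 0
int szc = page_szc(4 * 1024 * 1024);	/* e.g. 4M; -1 if unsupported */
#endif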

/*
 * page size to page size code with the restriction that it be a supported
 * user page size.  If it's not a supported user page size, -1 will be
 * returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use.  This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 *
 * If legacy is non-zero, return the number of pagesizes available to legacy
 * applications. The number of legacy page sizes might be less than the
 * exported user page sizes. This is to prevent legacy applications that
 * use the largest page size returned from getpagesizes(3c) from inadvertently
 * using the 'new' large pagesizes.
 */
uint_t
page_num_user_pagesizes(int legacy)
{
	if (legacy)
		return (mmu_legacy_page_sizes);
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}
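
/*
 * Illustrative sketch (not compiled): a size code's page size is its
 * base page count scaled by the base page size.
 */
#if 0
ASSERT(page_get_pagesize(szc) == page_get_pagecnt(szc) * MMU_PAGESIZE);
#endif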

/*
 * Return the size of a page based upon the index passed in.  An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (PAGE_GET_SHIFT(szc));
}

uint_t
page_get_pagecolors(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecolors: out of range %d", szc);
	return (PAGE_GET_PAGECOLORS(szc));
}

/*
 * this assigns the desired equivalent color after a split
 */
uint_t
page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    uint_t ncolor, uint_t ceq_mask)
{
	ASSERT(nszc > szc);
	ASSERT(szc < mmu_page_sizes);
	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));

	color &= ceq_mask;
	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
	return (color | (ncolor & ~ceq_mask));
}
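
/*
 * Illustrative sketch (not compiled, hypothetical values): when a
 * szc 1 page of color 2 is split down to szc 0 and the caller only
 * cares about the low two color bits (ceq_mask 0x3) of requested
 * color 5, the requested bits are kept and the remaining bits come
 * from the parent's converted color.
 */
#if 0
uint_t c = page_correct_color(0, 1, 5, 2, 0x3);
/* c == (5 & 0x3) | (PAGE_CONVERT_COLOR(2, 0, 1) & ~0x3) */
#endif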

/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 */
int interleaved_mnodes = 0;

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * If interleaved_mnodes is set we need to find the first mnode that
 * exists. hpm_counters for the first mnode will then be shared by
 * all other mnodes. If interleaved_mnodes is not set, just set
 * first=mnode each time. That means there will be no sharing.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	firstmn;	/* first mnode that exists */
	int	nranges;
	pfn_t	physbase;
	pfn_t	physmax;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;

			if (firstmn != mnode)
				continue;

			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);

			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}

caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t *pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(mnode, r, pfnum);
					idx = (idx >= r_pgcnt) ? 0 : idx;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * The caller is responsible for acquiring the ctr_mutex lock if necessary,
 * and thus these can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}
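
/*
 * Illustrative sketch (not compiled): freeing one page can ripple
 * upward; each region that reaches FULL_REGION_CNT(r) free pages bumps
 * the counter one size up and records a new coalescing candidate.
 */
#if 0
page_ctr_add(PP_2_MEM_NODE(pp), PP_2_MTYPE(pp), pp, PG_FREE_LIST);
#endif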

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;
	int oldmnode;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	/*
	 * For interleaved mnodes, find the first mnode
	 * with valid page counters since the current
	 * mnode may have just been added and not have
	 * valid page counters.
	 */
	if (interleaved_mnodes) {
		for (i = 0; i < max_mem_nodes; i++)
			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
				break;
		ASSERT(i < max_mem_nodes);
		oldmnode = i;
	} else
		oldmnode = mnode;

	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if ((i == mnode) ||
				    (mem_node_config[i].exists == 0))
					continue;
				ASSERT(
				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				pfnum = newbase;
				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(m, r, pfnum);
					idx = (idx < pcsz) ? idx : 0;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
					    r, mrange) != NULL)
						PAGE_COUNTERS_CURRENT_COLOR(m,
						    r, i, mrange) = idx;
				}
			}
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
12192961Sdp78419 				pi += new_maxmrange - old_maxmrange;
12202961Sdp78419 			} else if (new_maxmrange < old_maxmrange) {
12212961Sdp78419 				old_pi += old_maxmrange - new_maxmrange;
12222961Sdp78419 			}
12232961Sdp78419 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
12242961Sdp78419 				pcc_info_t tmp = *pi;
12252961Sdp78419 				*pi = *old_pi;
12262961Sdp78419 				*old_pi = tmp;
12272961Sdp78419 			}
12282961Sdp78419 		}
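		/*
		 * Illustrative case for the range-index alignment above
		 * (hypothetical values): if old_maxmrange == 1 and
		 * new_maxmrange == 2, pi is advanced by one entry before
		 * the swap loop, so old_pi[0] exchanges with pi[1], and
		 * so on, keeping equivalent entries paired up.
		 */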
12290Sstevel@tonic-gate 	}
12304769Sdp78419 	PAGE_CTRS_WRITE_UNLOCK(mnode);
12310Sstevel@tonic-gate 
12320Sstevel@tonic-gate 	/*
12330Sstevel@tonic-gate 	 * Now that we have dropped the write lock, it is safe to free all
12340Sstevel@tonic-gate 	 * of the memory we have cached above.
12352961Sdp78419 	 * We come through here to free memory when pre-alloc fails, and
12362961Sdp78419 	 * also to free old pointers which were recorded while locked.
12370Sstevel@tonic-gate 	 */
12382961Sdp78419 cleanup:
12390Sstevel@tonic-gate 	for (r = 1; r < mmu_page_sizes; r++) {
12400Sstevel@tonic-gate 		if (ctr_cache[r] != NULL) {
12410Sstevel@tonic-gate 			kmem_free(ctr_cache[r],
12420Sstevel@tonic-gate 			    size_cache[r] * sizeof (hpmctr_t));
12430Sstevel@tonic-gate 		}
12442961Sdp78419 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
12452961Sdp78419 			if (color_cache[r][mrange] != NULL) {
12462961Sdp78419 				kmem_free(color_cache[r][mrange],
12472961Sdp78419 				    colors_per_szc[r] * sizeof (size_t));
12482961Sdp78419 			}
12492961Sdp78419 		}
12502961Sdp78419 		for (i = 0; i < NPC_MUTEX; i++) {
12512961Sdp78419 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
12522961Sdp78419 			if (pi == NULL)
12532961Sdp78419 				continue;
12542961Sdp78419 			nr = cands_cache_nranges;
12552961Sdp78419 			for (mrange = 0; mrange < nr; mrange++, pi++) {
12562961Sdp78419 				pgcntp = pi->pcc_color_free;
12572961Sdp78419 				if (pgcntp == NULL)
12582961Sdp78419 					continue;
12592961Sdp78419 				if ((caddr_t)pgcntp >= kernelheap &&
12602961Sdp78419 				    (caddr_t)pgcntp < ekernelheap) {
12612961Sdp78419 					kmem_free(pgcntp,
12622961Sdp78419 					    colors_per_szc[r] *
12632961Sdp78419 					    sizeof (pgcnt_t));
12642961Sdp78419 				}
12652961Sdp78419 			}
12662961Sdp78419 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
12672961Sdp78419 			if ((caddr_t)pi >= kernelheap &&
12682961Sdp78419 			    (caddr_t)pi < ekernelheap) {
12692961Sdp78419 				kmem_free(pi, nr * sizeof (pcc_info_t));
12702961Sdp78419 			}
12710Sstevel@tonic-gate 		}
12720Sstevel@tonic-gate 	}
12732961Sdp78419 
12742961Sdp78419 	kmem_free(cands_cache,
12752961Sdp78419 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
12762961Sdp78419 	return (rc);
12770Sstevel@tonic-gate }
12780Sstevel@tonic-gate 
127911185SSean.McEnroe@Sun.COM /*
128011185SSean.McEnroe@Sun.COM  * Clean up the hpm_counters field in the page counters
128111185SSean.McEnroe@Sun.COM  * array.
128211185SSean.McEnroe@Sun.COM  */
128311185SSean.McEnroe@Sun.COM void
128411185SSean.McEnroe@Sun.COM page_ctrs_cleanup(void)
128511185SSean.McEnroe@Sun.COM {
128611185SSean.McEnroe@Sun.COM 	int r;	/* region size */
128711185SSean.McEnroe@Sun.COM 	int i;	/* mnode index */
128811185SSean.McEnroe@Sun.COM 
128911185SSean.McEnroe@Sun.COM 	/*
129011185SSean.McEnroe@Sun.COM 	 * Get the page counters write lock while we are
129111185SSean.McEnroe@Sun.COM 	 * setting the page hpm_counters field to NULL
129211185SSean.McEnroe@Sun.COM 	 * for non-existent mnodes.
129311185SSean.McEnroe@Sun.COM 	 */
129411185SSean.McEnroe@Sun.COM 	for (i = 0; i < max_mem_nodes; i++) {
129511185SSean.McEnroe@Sun.COM 		PAGE_CTRS_WRITE_LOCK(i);
129611185SSean.McEnroe@Sun.COM 		if (mem_node_config[i].exists) {
129711185SSean.McEnroe@Sun.COM 			PAGE_CTRS_WRITE_UNLOCK(i);
129811185SSean.McEnroe@Sun.COM 			continue;
129911185SSean.McEnroe@Sun.COM 		}
130011185SSean.McEnroe@Sun.COM 		for (r = 1; r < mmu_page_sizes; r++) {
130111185SSean.McEnroe@Sun.COM 			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
130211185SSean.McEnroe@Sun.COM 		}
130311185SSean.McEnroe@Sun.COM 		PAGE_CTRS_WRITE_UNLOCK(i);
130411185SSean.McEnroe@Sun.COM 	}
130511185SSean.McEnroe@Sun.COM }
13060Sstevel@tonic-gate 
13070Sstevel@tonic-gate #ifdef DEBUG
13080Sstevel@tonic-gate 
13090Sstevel@tonic-gate /*
13100Sstevel@tonic-gate  * confirm pp is a large page corresponding to szc
13110Sstevel@tonic-gate  */
13120Sstevel@tonic-gate void
13130Sstevel@tonic-gate chk_lpg(page_t *pp, uchar_t szc)
13140Sstevel@tonic-gate {
13150Sstevel@tonic-gate 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
13160Sstevel@tonic-gate 	uint_t noreloc;
13170Sstevel@tonic-gate 
13180Sstevel@tonic-gate 	if (npgs == 1) {
13190Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
13200Sstevel@tonic-gate 		ASSERT(pp->p_next == pp);
13210Sstevel@tonic-gate 		ASSERT(pp->p_prev == pp);
13220Sstevel@tonic-gate 		return;
13230Sstevel@tonic-gate 	}
13240Sstevel@tonic-gate 
13250Sstevel@tonic-gate 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
13260Sstevel@tonic-gate 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
13270Sstevel@tonic-gate 
13280Sstevel@tonic-gate 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
13290Sstevel@tonic-gate 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
13300Sstevel@tonic-gate 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
13310Sstevel@tonic-gate 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
13320Sstevel@tonic-gate 
13330Sstevel@tonic-gate 	/*
13340Sstevel@tonic-gate 	 * Check list of pages.
13350Sstevel@tonic-gate 	 */
13360Sstevel@tonic-gate 	noreloc = PP_ISNORELOC(pp);
13370Sstevel@tonic-gate 	while (npgs--) {
13380Sstevel@tonic-gate 		if (npgs != 0) {
13390Sstevel@tonic-gate 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
13400Sstevel@tonic-gate 			ASSERT(pp->p_next == (pp + 1));
13410Sstevel@tonic-gate 		}
13420Sstevel@tonic-gate 		ASSERT(pp->p_szc == szc);
13430Sstevel@tonic-gate 		ASSERT(PP_ISFREE(pp));
13440Sstevel@tonic-gate 		ASSERT(PP_ISAGED(pp));
13450Sstevel@tonic-gate 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
13460Sstevel@tonic-gate 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
13470Sstevel@tonic-gate 		ASSERT(pp->p_vnode  == NULL);
13480Sstevel@tonic-gate 		ASSERT(PP_ISNORELOC(pp) == noreloc);
13490Sstevel@tonic-gate 
13500Sstevel@tonic-gate 		pp = pp->p_next;
13510Sstevel@tonic-gate 	}
13520Sstevel@tonic-gate }
13530Sstevel@tonic-gate #endif /* DEBUG */
13540Sstevel@tonic-gate 
13550Sstevel@tonic-gate void
13560Sstevel@tonic-gate page_freelist_lock(int mnode)
13570Sstevel@tonic-gate {
13580Sstevel@tonic-gate 	int i;
13590Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
13600Sstevel@tonic-gate 		mutex_enter(FPC_MUTEX(mnode, i));
13610Sstevel@tonic-gate 		mutex_enter(CPC_MUTEX(mnode, i));
13620Sstevel@tonic-gate 	}
13630Sstevel@tonic-gate }
13640Sstevel@tonic-gate 
13650Sstevel@tonic-gate void
13660Sstevel@tonic-gate page_freelist_unlock(int mnode)
13670Sstevel@tonic-gate {
13680Sstevel@tonic-gate 	int i;
13690Sstevel@tonic-gate 	for (i = 0; i < NPC_MUTEX; i++) {
13700Sstevel@tonic-gate 		mutex_exit(FPC_MUTEX(mnode, i));
13710Sstevel@tonic-gate 		mutex_exit(CPC_MUTEX(mnode, i));
13720Sstevel@tonic-gate 	}
13730Sstevel@tonic-gate }
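
/*
 * Typical usage, as in page_promote_size() below: hold every free/cache
 * list lock for the mnode so that p_szc cannot change while pages are
 * promoted or demoted:
 *
 *	page_freelist_lock(mnode);
 *	(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
 *	page_freelist_unlock(mnode);
 */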
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate /*
13760Sstevel@tonic-gate  * add pp to the specified page list. Defaults to head of the page list
13770Sstevel@tonic-gate  * unless PG_LIST_TAIL is specified.
13780Sstevel@tonic-gate  */
13790Sstevel@tonic-gate void
13800Sstevel@tonic-gate page_list_add(page_t *pp, int flags)
13810Sstevel@tonic-gate {
13820Sstevel@tonic-gate 	page_t		**ppp;
13830Sstevel@tonic-gate 	kmutex_t	*pcm;
13840Sstevel@tonic-gate 	uint_t		bin, mtype;
13850Sstevel@tonic-gate 	int		mnode;
13860Sstevel@tonic-gate 
13870Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
13880Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
13890Sstevel@tonic-gate 	ASSERT(!hat_page_is_mapped(pp));
13900Sstevel@tonic-gate 	ASSERT(hat_page_getshare(pp) == 0);
13910Sstevel@tonic-gate 
13920Sstevel@tonic-gate 	/*
13930Sstevel@tonic-gate 	 * Large pages should be freed via page_list_add_pages().
13940Sstevel@tonic-gate 	 */
13950Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
13960Sstevel@tonic-gate 
13970Sstevel@tonic-gate 	/*
13980Sstevel@tonic-gate 	 * Don't need to lock the freelist first here
13990Sstevel@tonic-gate 	 * because the page isn't on the freelist yet.
14000Sstevel@tonic-gate 	 * This means p_szc can't change on us.
14010Sstevel@tonic-gate 	 */
14020Sstevel@tonic-gate 
14030Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
14040Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
14050Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
14060Sstevel@tonic-gate 
14070Sstevel@tonic-gate 	if (flags & PG_LIST_ISINIT) {
14080Sstevel@tonic-gate 		/*
14090Sstevel@tonic-gate 		 * PG_LIST_ISINIT is set during system startup (i.e. single
14100Sstevel@tonic-gate 		 * threaded), so add the page to the free list and to the
14110Sstevel@tonic-gate 		 * free region counters w/o any locking
14120Sstevel@tonic-gate 		 */
141312293SJames.McPherson@Sun.COM 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
14140Sstevel@tonic-gate 
14150Sstevel@tonic-gate 		/* inline version of page_add() */
14160Sstevel@tonic-gate 		if (*ppp != NULL) {
14170Sstevel@tonic-gate 			pp->p_next = *ppp;
14180Sstevel@tonic-gate 			pp->p_prev = (*ppp)->p_prev;
14190Sstevel@tonic-gate 			(*ppp)->p_prev = pp;
14200Sstevel@tonic-gate 			pp->p_prev->p_next = pp;
14210Sstevel@tonic-gate 		} else
14220Sstevel@tonic-gate 			*ppp = pp;
14230Sstevel@tonic-gate 
1424414Skchow 		page_ctr_add_internal(mnode, mtype, pp, flags);
1425414Skchow 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
14260Sstevel@tonic-gate 	} else {
142712293SJames.McPherson@Sun.COM 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
14280Sstevel@tonic-gate 
14290Sstevel@tonic-gate 		if (flags & PG_FREE_LIST) {
1430414Skchow 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
14310Sstevel@tonic-gate 			ASSERT(PP_ISAGED(pp));
143212293SJames.McPherson@Sun.COM 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
143312293SJames.McPherson@Sun.COM 
14340Sstevel@tonic-gate 		} else {
1435414Skchow 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
14360Sstevel@tonic-gate 			ASSERT(pp->p_vnode);
14370Sstevel@tonic-gate 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
14380Sstevel@tonic-gate 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
14390Sstevel@tonic-gate 		}
14400Sstevel@tonic-gate 		mutex_enter(pcm);
14410Sstevel@tonic-gate 		page_add(ppp, pp);
14420Sstevel@tonic-gate 
14430Sstevel@tonic-gate 		if (flags & PG_LIST_TAIL)
14440Sstevel@tonic-gate 			*ppp = (*ppp)->p_next;
14450Sstevel@tonic-gate 		/*
14460Sstevel@tonic-gate 		 * Add counters before releasing pcm mutex to avoid a race with
14472961Sdp78419 		 * page_freelist_coalesce and page_freelist_split.
14480Sstevel@tonic-gate 		 */
1449414Skchow 		page_ctr_add(mnode, mtype, pp, flags);
14500Sstevel@tonic-gate 		mutex_exit(pcm);
14510Sstevel@tonic-gate 	}
14520Sstevel@tonic-gate 
14530Sstevel@tonic-gate 
14540Sstevel@tonic-gate #if defined(__sparc)
14550Sstevel@tonic-gate 	if (PP_ISNORELOC(pp)) {
14560Sstevel@tonic-gate 		kcage_freemem_add(1);
14570Sstevel@tonic-gate 	}
145812293SJames.McPherson@Sun.COM #endif
14590Sstevel@tonic-gate 	/*
14600Sstevel@tonic-gate 	 * It is up to the caller to unlock the page!
14610Sstevel@tonic-gate 	 */
14620Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
14630Sstevel@tonic-gate }
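
/*
 * Illustrative caller sequence (a sketch, not an actual call site in this
 * file): return a PAGESIZE page held SE_EXCL to the tail of the freelist:
 *
 *	PP_SETFREE(pp);
 *	PP_SETAGED(pp);
 *	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
 *	page_unlock(pp);
 */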
14640Sstevel@tonic-gate 
14650Sstevel@tonic-gate 
14660Sstevel@tonic-gate #ifdef __sparc
14670Sstevel@tonic-gate /*
14680Sstevel@tonic-gate  * This routine is only used by kcage_init during system startup.
14690Sstevel@tonic-gate  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
14700Sstevel@tonic-gate  * without the overhead of taking locks and updating counters.
14710Sstevel@tonic-gate  */
14720Sstevel@tonic-gate void
14730Sstevel@tonic-gate page_list_noreloc_startup(page_t *pp)
14740Sstevel@tonic-gate {
14750Sstevel@tonic-gate 	page_t		**ppp;
14760Sstevel@tonic-gate 	uint_t		bin;
14770Sstevel@tonic-gate 	int		mnode;
14780Sstevel@tonic-gate 	int		mtype;
14791373Skchow 	int		flags = 0;
14800Sstevel@tonic-gate 
14810Sstevel@tonic-gate 	/*
14820Sstevel@tonic-gate 	 * If this is a large page on the freelist then
14830Sstevel@tonic-gate 	 * break it up into smaller pages.
14840Sstevel@tonic-gate 	 */
14850Sstevel@tonic-gate 	if (pp->p_szc != 0)
14860Sstevel@tonic-gate 		page_boot_demote(pp);
14870Sstevel@tonic-gate 
14880Sstevel@tonic-gate 	/*
14890Sstevel@tonic-gate 	 * Get list page is currently on.
14900Sstevel@tonic-gate 	 * Get the list the page is currently on.
14910Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
14920Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
14930Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
14940Sstevel@tonic-gate 	ASSERT(mtype == MTYPE_RELOC);
14950Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
14960Sstevel@tonic-gate 
14970Sstevel@tonic-gate 	if (PP_ISAGED(pp)) {
149812293SJames.McPherson@Sun.COM 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
14990Sstevel@tonic-gate 		flags |= PG_FREE_LIST;
15000Sstevel@tonic-gate 	} else {
15010Sstevel@tonic-gate 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
15020Sstevel@tonic-gate 		flags |= PG_CACHE_LIST;
15030Sstevel@tonic-gate 	}
15040Sstevel@tonic-gate 
15050Sstevel@tonic-gate 	ASSERT(*ppp != NULL);
15060Sstevel@tonic-gate 
15070Sstevel@tonic-gate 	/*
15080Sstevel@tonic-gate 	 * Delete page from current list.
15090Sstevel@tonic-gate 	 */
15100Sstevel@tonic-gate 	if (*ppp == pp)
15110Sstevel@tonic-gate 		*ppp = pp->p_next;		/* go to next page */
15120Sstevel@tonic-gate 	if (*ppp == pp) {
15130Sstevel@tonic-gate 		*ppp = NULL;			/* page list is gone */
15140Sstevel@tonic-gate 	} else {
15150Sstevel@tonic-gate 		pp->p_prev->p_next = pp->p_next;
15160Sstevel@tonic-gate 		pp->p_next->p_prev = pp->p_prev;
15170Sstevel@tonic-gate 	}
15180Sstevel@tonic-gate 
15192961Sdp78419 	/*
15202961Sdp78419 	 * Decrement page counters
15212961Sdp78419 	 */
15222961Sdp78419 	page_ctr_sub_internal(mnode, mtype, pp, flags);
15230Sstevel@tonic-gate 
15240Sstevel@tonic-gate 	/*
15250Sstevel@tonic-gate 	 * Set no reloc for cage initted pages.
15260Sstevel@tonic-gate 	 */
15270Sstevel@tonic-gate 	PP_SETNORELOC(pp);
15280Sstevel@tonic-gate 
15290Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
15300Sstevel@tonic-gate 	ASSERT(mtype == MTYPE_NORELOC);
15310Sstevel@tonic-gate 
15320Sstevel@tonic-gate 	/*
15330Sstevel@tonic-gate 	 * Get new list for page.
15340Sstevel@tonic-gate 	 */
15350Sstevel@tonic-gate 	if (PP_ISAGED(pp)) {
153612293SJames.McPherson@Sun.COM 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
15370Sstevel@tonic-gate 	} else {
15380Sstevel@tonic-gate 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
15390Sstevel@tonic-gate 	}
15400Sstevel@tonic-gate 
15410Sstevel@tonic-gate 	/*
15420Sstevel@tonic-gate 	 * Insert page on new list.
15430Sstevel@tonic-gate 	 */
15440Sstevel@tonic-gate 	if (*ppp == NULL) {
15450Sstevel@tonic-gate 		*ppp = pp;
15460Sstevel@tonic-gate 		pp->p_next = pp->p_prev = pp;
15470Sstevel@tonic-gate 	} else {
15480Sstevel@tonic-gate 		pp->p_next = *ppp;
15490Sstevel@tonic-gate 		pp->p_prev = (*ppp)->p_prev;
15500Sstevel@tonic-gate 		(*ppp)->p_prev = pp;
15510Sstevel@tonic-gate 		pp->p_prev->p_next = pp;
15520Sstevel@tonic-gate 	}
15530Sstevel@tonic-gate 
15542961Sdp78419 	/*
15552961Sdp78419 	 * Increment page counters
15562961Sdp78419 	 */
15572961Sdp78419 	page_ctr_add_internal(mnode, mtype, pp, flags);
15580Sstevel@tonic-gate 
15590Sstevel@tonic-gate 	/*
15600Sstevel@tonic-gate 	 * Update cage freemem counter
15610Sstevel@tonic-gate 	 */
15620Sstevel@tonic-gate 	atomic_add_long(&kcage_freemem, 1);
15630Sstevel@tonic-gate }
15640Sstevel@tonic-gate #else	/* __sparc */
15650Sstevel@tonic-gate 
15660Sstevel@tonic-gate /* ARGSUSED */
15670Sstevel@tonic-gate void
15680Sstevel@tonic-gate page_list_noreloc_startup(page_t *pp)
15690Sstevel@tonic-gate {
15700Sstevel@tonic-gate 	panic("page_list_noreloc_startup: should be here only for sparc");
15710Sstevel@tonic-gate }
15720Sstevel@tonic-gate #endif
15730Sstevel@tonic-gate 
15740Sstevel@tonic-gate void
15750Sstevel@tonic-gate page_list_add_pages(page_t *pp, int flags)
15760Sstevel@tonic-gate {
15770Sstevel@tonic-gate 	kmutex_t *pcm;
15780Sstevel@tonic-gate 	pgcnt_t	pgcnt;
15790Sstevel@tonic-gate 	uint_t	bin, mtype, i;
15800Sstevel@tonic-gate 	int	mnode;
15810Sstevel@tonic-gate 
15820Sstevel@tonic-gate 	/* default to freelist/head */
15830Sstevel@tonic-gate 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
15840Sstevel@tonic-gate 
15850Sstevel@tonic-gate 	CHK_LPG(pp, pp->p_szc);
1586414Skchow 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
15870Sstevel@tonic-gate 
15880Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
15890Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
15900Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
15910Sstevel@tonic-gate 
15920Sstevel@tonic-gate 	if (flags & PG_LIST_ISINIT) {
15930Sstevel@tonic-gate 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
159412293SJames.McPherson@Sun.COM 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
15950Sstevel@tonic-gate 		ASSERT(!PP_ISNORELOC(pp));
1596414Skchow 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
15970Sstevel@tonic-gate 	} else {
15980Sstevel@tonic-gate 
15990Sstevel@tonic-gate 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
16000Sstevel@tonic-gate 
160112293SJames.McPherson@Sun.COM 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
16020Sstevel@tonic-gate 
16030Sstevel@tonic-gate 		mutex_enter(pcm);
160412293SJames.McPherson@Sun.COM 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1605414Skchow 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
16060Sstevel@tonic-gate 		mutex_exit(pcm);
16070Sstevel@tonic-gate 
16080Sstevel@tonic-gate 		pgcnt = page_get_pagecnt(pp->p_szc);
16090Sstevel@tonic-gate #if defined(__sparc)
161012293SJames.McPherson@Sun.COM 		if (PP_ISNORELOC(pp))
16110Sstevel@tonic-gate 			kcage_freemem_add(pgcnt);
161212293SJames.McPherson@Sun.COM #endif
16130Sstevel@tonic-gate 		for (i = 0; i < pgcnt; i++, pp++)
16143253Smec 			page_unlock_nocapture(pp);
16150Sstevel@tonic-gate 	}
16160Sstevel@tonic-gate }
16170Sstevel@tonic-gate 
16180Sstevel@tonic-gate /*
16190Sstevel@tonic-gate  * During boot, need to demote a large page to base
16200Sstevel@tonic-gate  * During boot, we need to demote a large page to base
16210Sstevel@tonic-gate  */
16220Sstevel@tonic-gate void
16230Sstevel@tonic-gate page_boot_demote(page_t *pp)
16240Sstevel@tonic-gate {
16250Sstevel@tonic-gate 	ASSERT(pp->p_szc != 0);
16260Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
16270Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
16280Sstevel@tonic-gate 
16290Sstevel@tonic-gate 	(void) page_demote(PP_2_MEM_NODE(pp),
16307656SSherry.Moore@Sun.COM 	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
16310Sstevel@tonic-gate 	    PC_FREE);
16320Sstevel@tonic-gate 
16330Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
16340Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
16350Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
16360Sstevel@tonic-gate }
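
/*
 * PFN_BASE(pfn, szc) above rounds a pfn down to the first pfn of its
 * large page. Worked example (illustrative szc): if
 * page_get_pagecnt(szc) == 512, pfn 0x1234 yields base pfn 0x1200,
 * i.e. 0x1234 & ~(512 - 1).
 */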
16370Sstevel@tonic-gate 
16380Sstevel@tonic-gate /*
16390Sstevel@tonic-gate  * Take a particular page off of whatever freelist the page
16400Sstevel@tonic-gate  * is claimed to be on.
16410Sstevel@tonic-gate  *
16420Sstevel@tonic-gate  * NOTE: Only used for PAGESIZE pages.
16430Sstevel@tonic-gate  */
16440Sstevel@tonic-gate void
16450Sstevel@tonic-gate page_list_sub(page_t *pp, int flags)
16460Sstevel@tonic-gate {
16470Sstevel@tonic-gate 	int		bin;
16480Sstevel@tonic-gate 	uint_t		mtype;
16490Sstevel@tonic-gate 	int		mnode;
16500Sstevel@tonic-gate 	kmutex_t	*pcm;
16510Sstevel@tonic-gate 	page_t		**ppp;
16520Sstevel@tonic-gate 
16530Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp));
16540Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
16550Sstevel@tonic-gate 
16560Sstevel@tonic-gate 	/*
16570Sstevel@tonic-gate 	 * The p_szc field can only be changed by page_promote()
16580Sstevel@tonic-gate 	 * and page_demote(). Only free pages can be promoted and
16590Sstevel@tonic-gate 	 * demoted and the free list MUST be locked during these
16600Sstevel@tonic-gate 	 * operations. So to prevent a race in page_list_sub()
16610Sstevel@tonic-gate 	 * between computing which bin of the freelist lock to
16620Sstevel@tonic-gate 	 * grab and actually grabbing the lock, we check again that
16630Sstevel@tonic-gate 	 * the bin we locked is still the correct one. Notice that
16640Sstevel@tonic-gate 	 * the p_szc field could have actually changed on us but
16650Sstevel@tonic-gate 	 * if the bin happens to still be the same we are safe.
16660Sstevel@tonic-gate 	 */
16670Sstevel@tonic-gate try_again:
16680Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
16690Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
167012293SJames.McPherson@Sun.COM 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
16710Sstevel@tonic-gate 	mutex_enter(pcm);
16720Sstevel@tonic-gate 	if (PP_2_BIN(pp) != bin) {
16730Sstevel@tonic-gate 		mutex_exit(pcm);
16740Sstevel@tonic-gate 		goto try_again;
16750Sstevel@tonic-gate 	}
16760Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
16770Sstevel@tonic-gate 
16780Sstevel@tonic-gate 	if (flags & PG_FREE_LIST) {
1679414Skchow 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
16800Sstevel@tonic-gate 		ASSERT(PP_ISAGED(pp));
168112293SJames.McPherson@Sun.COM 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
16820Sstevel@tonic-gate 	} else {
1683414Skchow 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
16840Sstevel@tonic-gate 		ASSERT(!PP_ISAGED(pp));
16850Sstevel@tonic-gate 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
16860Sstevel@tonic-gate 	}
16870Sstevel@tonic-gate 
16880Sstevel@tonic-gate 	/*
16890Sstevel@tonic-gate 	 * Common PAGESIZE case.
16900Sstevel@tonic-gate 	 *
16910Sstevel@tonic-gate 	 * Note that we locked the freelist. This prevents
16920Sstevel@tonic-gate 	 * any page promotion/demotion operations. Therefore
16930Sstevel@tonic-gate 	 * the p_szc will not change until we drop pcm mutex.
16940Sstevel@tonic-gate 	 */
16950Sstevel@tonic-gate 	if (pp->p_szc == 0) {
16960Sstevel@tonic-gate 		page_sub(ppp, pp);
16970Sstevel@tonic-gate 		/*
16980Sstevel@tonic-gate 		 * Subtract counters before releasing pcm mutex
16990Sstevel@tonic-gate 		 * to avoid race with page_freelist_coalesce.
17000Sstevel@tonic-gate 		 */
1701414Skchow 		page_ctr_sub(mnode, mtype, pp, flags);
17020Sstevel@tonic-gate 		mutex_exit(pcm);
17030Sstevel@tonic-gate 
17040Sstevel@tonic-gate #if defined(__sparc)
17050Sstevel@tonic-gate 		if (PP_ISNORELOC(pp)) {
17060Sstevel@tonic-gate 			kcage_freemem_sub(1);
17070Sstevel@tonic-gate 		}
170812293SJames.McPherson@Sun.COM #endif
17090Sstevel@tonic-gate 		return;
17100Sstevel@tonic-gate 	}
17110Sstevel@tonic-gate 
17120Sstevel@tonic-gate 	/*
17130Sstevel@tonic-gate 	 * Large pages on the cache list are not supported.
17140Sstevel@tonic-gate 	 */
17150Sstevel@tonic-gate 	if (flags & PG_CACHE_LIST)
17160Sstevel@tonic-gate 		panic("page_list_sub: large page on cachelist");
17170Sstevel@tonic-gate 
17180Sstevel@tonic-gate 	/*
17190Sstevel@tonic-gate 	 * Slow but rare.
17200Sstevel@tonic-gate 	 *
17210Sstevel@tonic-gate 	 * Somebody wants this particular page which is part
17220Sstevel@tonic-gate 	 * of a large page. In this case we just demote the page
17230Sstevel@tonic-gate 	 * if it's on the freelist.
17240Sstevel@tonic-gate 	 *
17250Sstevel@tonic-gate 	 * We have to drop pcm before locking the entire freelist.
17260Sstevel@tonic-gate 	 * Once we have re-locked the freelist check to make sure
17270Sstevel@tonic-gate 	 * the page hasn't already been demoted or completely
17280Sstevel@tonic-gate 	 * freed.
17290Sstevel@tonic-gate 	 */
17300Sstevel@tonic-gate 	mutex_exit(pcm);
17310Sstevel@tonic-gate 	page_freelist_lock(mnode);
17320Sstevel@tonic-gate 	if (pp->p_szc != 0) {
17330Sstevel@tonic-gate 		/*
17340Sstevel@tonic-gate 		 * Large page is on freelist.
17350Sstevel@tonic-gate 		 */
17360Sstevel@tonic-gate 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
17377656SSherry.Moore@Sun.COM 		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
17380Sstevel@tonic-gate 	}
17390Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
17400Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
17410Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
17420Sstevel@tonic-gate 
17430Sstevel@tonic-gate 	/*
17440Sstevel@tonic-gate 	 * Subtract counters before releasing pcm mutex
17450Sstevel@tonic-gate 	 * to avoid race with page_freelist_coalesce.
17460Sstevel@tonic-gate 	 */
17470Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
17480Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
174912293SJames.McPherson@Sun.COM 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
175012293SJames.McPherson@Sun.COM 
17510Sstevel@tonic-gate 	page_sub(ppp, pp);
1752414Skchow 	page_ctr_sub(mnode, mtype, pp, flags);
17530Sstevel@tonic-gate 	page_freelist_unlock(mnode);
17540Sstevel@tonic-gate 
17550Sstevel@tonic-gate #if defined(__sparc)
17560Sstevel@tonic-gate 	if (PP_ISNORELOC(pp)) {
17570Sstevel@tonic-gate 		kcage_freemem_sub(1);
17580Sstevel@tonic-gate 	}
175912293SJames.McPherson@Sun.COM #endif
17600Sstevel@tonic-gate }
17610Sstevel@tonic-gate 
17620Sstevel@tonic-gate void
17630Sstevel@tonic-gate page_list_sub_pages(page_t *pp, uint_t szc)
17640Sstevel@tonic-gate {
17650Sstevel@tonic-gate 	kmutex_t *pcm;
17660Sstevel@tonic-gate 	uint_t	bin, mtype;
17670Sstevel@tonic-gate 	int	mnode;
17680Sstevel@tonic-gate 
17690Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp));
17700Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
17710Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
17720Sstevel@tonic-gate 
17730Sstevel@tonic-gate 	/*
17740Sstevel@tonic-gate 	 * See comment in page_list_sub().
17750Sstevel@tonic-gate 	 */
17760Sstevel@tonic-gate try_again:
17770Sstevel@tonic-gate 	bin = PP_2_BIN(pp);
17780Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
177912293SJames.McPherson@Sun.COM 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
17800Sstevel@tonic-gate 	mutex_enter(pcm);
17810Sstevel@tonic-gate 	if (PP_2_BIN(pp) != bin) {
17820Sstevel@tonic-gate 		mutex_exit(pcm);
17830Sstevel@tonic-gate 		goto	try_again;
17840Sstevel@tonic-gate 	}
17850Sstevel@tonic-gate 
17860Sstevel@tonic-gate 	/*
17870Sstevel@tonic-gate 	 * If we're called with a page larger than szc or it got
17880Sstevel@tonic-gate 	 * If we're called with a page larger than szc, or it got
17890Sstevel@tonic-gate 	 * promoted above szc before we locked the freelist, then
17900Sstevel@tonic-gate 	 * drop pcm and re-lock the entire freelist. If the page is
17910Sstevel@tonic-gate 	 * still larger than szc, demote it.
17920Sstevel@tonic-gate 	if (pp->p_szc > szc) {
17930Sstevel@tonic-gate 		mutex_exit(pcm);
17940Sstevel@tonic-gate 		pcm = NULL;
17950Sstevel@tonic-gate 		page_freelist_lock(mnode);
17960Sstevel@tonic-gate 		if (pp->p_szc > szc) {
1797414Skchow 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
17980Sstevel@tonic-gate 			(void) page_demote(mnode,
17997656SSherry.Moore@Sun.COM 			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
18000Sstevel@tonic-gate 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
18010Sstevel@tonic-gate 		}
18020Sstevel@tonic-gate 		bin = PP_2_BIN(pp);
18030Sstevel@tonic-gate 	}
18040Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
18050Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp));
18060Sstevel@tonic-gate 	ASSERT(pp->p_szc <= szc);
18070Sstevel@tonic-gate 	ASSERT(pp == PP_PAGEROOT(pp));
18080Sstevel@tonic-gate 
1809414Skchow 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1810414Skchow 
18110Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pp);
18120Sstevel@tonic-gate 	if (pp->p_szc != 0) {
181312293SJames.McPherson@Sun.COM 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
18140Sstevel@tonic-gate 		CHK_LPG(pp, pp->p_szc);
18150Sstevel@tonic-gate 	} else {
1816414Skchow 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
181712293SJames.McPherson@Sun.COM 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
18180Sstevel@tonic-gate 	}
1819414Skchow 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
18200Sstevel@tonic-gate 
18210Sstevel@tonic-gate 	if (pcm != NULL) {
18220Sstevel@tonic-gate 		mutex_exit(pcm);
18230Sstevel@tonic-gate 	} else {
18240Sstevel@tonic-gate 		page_freelist_unlock(mnode);
18250Sstevel@tonic-gate 	}
18260Sstevel@tonic-gate 
18270Sstevel@tonic-gate #if defined(__sparc)
18280Sstevel@tonic-gate 	if (PP_ISNORELOC(pp)) {
18290Sstevel@tonic-gate 		pgcnt_t	pgcnt;
18300Sstevel@tonic-gate 
18310Sstevel@tonic-gate 		pgcnt = page_get_pagecnt(pp->p_szc);
18320Sstevel@tonic-gate 		kcage_freemem_sub(pgcnt);
18330Sstevel@tonic-gate 	}
183412293SJames.McPherson@Sun.COM #endif
18350Sstevel@tonic-gate }
18360Sstevel@tonic-gate 
18370Sstevel@tonic-gate /*
18380Sstevel@tonic-gate  * Add the page to the front of a linked list of pages
18390Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
18400Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
18410Sstevel@tonic-gate  */
18420Sstevel@tonic-gate void
18430Sstevel@tonic-gate mach_page_add(page_t **ppp, page_t *pp)
18440Sstevel@tonic-gate {
18450Sstevel@tonic-gate 	if (*ppp == NULL) {
18460Sstevel@tonic-gate 		pp->p_next = pp->p_prev = pp;
18470Sstevel@tonic-gate 	} else {
18480Sstevel@tonic-gate 		pp->p_next = *ppp;
18490Sstevel@tonic-gate 		pp->p_prev = (*ppp)->p_prev;
18500Sstevel@tonic-gate 		(*ppp)->p_prev = pp;
18510Sstevel@tonic-gate 		pp->p_prev->p_next = pp;
18520Sstevel@tonic-gate 	}
18530Sstevel@tonic-gate 	*ppp = pp;
18540Sstevel@tonic-gate }
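
/*
 * Worked example: inserting page C at the front of the circular list
 * A <-> B (with *ppp == A) yields C <-> A <-> B and *ppp == C;
 * inserting into an empty list leaves C pointing at itself in both
 * directions.
 */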
18550Sstevel@tonic-gate 
18560Sstevel@tonic-gate /*
18570Sstevel@tonic-gate  * Remove this page from a linked list of pages
18580Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
18590Sstevel@tonic-gate  *
18600Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
18610Sstevel@tonic-gate  */
18620Sstevel@tonic-gate void
18630Sstevel@tonic-gate mach_page_sub(page_t **ppp, page_t *pp)
18640Sstevel@tonic-gate {
18650Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
18660Sstevel@tonic-gate 
18670Sstevel@tonic-gate 	if (*ppp == NULL || pp == NULL)
18680Sstevel@tonic-gate 		panic("mach_page_sub");
18690Sstevel@tonic-gate 
18700Sstevel@tonic-gate 	if (*ppp == pp)
18710Sstevel@tonic-gate 		*ppp = pp->p_next;		/* go to next page */
18720Sstevel@tonic-gate 
18730Sstevel@tonic-gate 	if (*ppp == pp)
18740Sstevel@tonic-gate 		*ppp = NULL;			/* page list is gone */
18750Sstevel@tonic-gate 	else {
18760Sstevel@tonic-gate 		pp->p_prev->p_next = pp->p_next;
18770Sstevel@tonic-gate 		pp->p_next->p_prev = pp->p_prev;
18780Sstevel@tonic-gate 	}
18790Sstevel@tonic-gate 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
18800Sstevel@tonic-gate }
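
/*
 * Worked example: removing A from the two-element list A <-> B (with
 * *ppp == A) first advances *ppp to B, then unlinks A and leaves it as
 * a self-referencing list of one; removing the only remaining page
 * sets *ppp to NULL.
 */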
18810Sstevel@tonic-gate 
18820Sstevel@tonic-gate /*
18830Sstevel@tonic-gate  * Routine fsflush uses to gradually coalesce the free list into larger pages.
18840Sstevel@tonic-gate  */
18850Sstevel@tonic-gate void
18860Sstevel@tonic-gate page_promote_size(page_t *pp, uint_t cur_szc)
18870Sstevel@tonic-gate {
18880Sstevel@tonic-gate 	pfn_t pfn;
18890Sstevel@tonic-gate 	int mnode;
18900Sstevel@tonic-gate 	int idx;
18910Sstevel@tonic-gate 	int new_szc = cur_szc + 1;
18920Sstevel@tonic-gate 	int full = FULL_REGION_CNT(new_szc);
18930Sstevel@tonic-gate 
18940Sstevel@tonic-gate 	pfn = page_pptonum(pp);
18950Sstevel@tonic-gate 	mnode = PFN_2_MEM_NODE(pfn);
18960Sstevel@tonic-gate 
18970Sstevel@tonic-gate 	page_freelist_lock(mnode);
18980Sstevel@tonic-gate 
18990Sstevel@tonic-gate 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
19000Sstevel@tonic-gate 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
19012961Sdp78419 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
19020Sstevel@tonic-gate 
19030Sstevel@tonic-gate 	page_freelist_unlock(mnode);
19040Sstevel@tonic-gate }
19050Sstevel@tonic-gate 
19060Sstevel@tonic-gate static uint_t page_promote_err;
19070Sstevel@tonic-gate static uint_t page_promote_noreloc_err;
190812293SJames.McPherson@Sun.COM 
19090Sstevel@tonic-gate /*
19100Sstevel@tonic-gate  * Create a single larger page (of szc new_szc) from smaller contiguous pages
19110Sstevel@tonic-gate  * for the given mnode starting at pfnum. Pages involved are on the freelist
19120Sstevel@tonic-gate  * before the call and may be returned to the caller if requested, otherwise
19130Sstevel@tonic-gate  * they will be placed back on the freelist.
19140Sstevel@tonic-gate  * If flags is PC_ALLOC, then the large page will be returned to the user in
19150Sstevel@tonic-gate  * a state which is consistent with a page being taken off the freelist.  If
19160Sstevel@tonic-gate  * we failed to lock the new large page, then we will return NULL to the
19170Sstevel@tonic-gate  * caller and put the large page on the freelist instead.
19180Sstevel@tonic-gate  * If flags is PC_FREE, then the large page will be placed on the freelist,
19190Sstevel@tonic-gate  * and NULL will be returned.
19200Sstevel@tonic-gate  * The caller is responsible for locking the freelist as well as any other
19210Sstevel@tonic-gate  * accounting which needs to be done for a returned page.
19220Sstevel@tonic-gate  *
19230Sstevel@tonic-gate  * RFE: For performance pass in pp instead of pfnum so
19240Sstevel@tonic-gate  * RFE: For performance, pass in pp instead of pfnum so
19250Sstevel@tonic-gate  *	This would depend on an assumption that all contiguous
19260Sstevel@tonic-gate  *	pages are in the same memseg so we can just add/dec
19270Sstevel@tonic-gate  *	our pp.
19280Sstevel@tonic-gate  *
19290Sstevel@tonic-gate  * Lock ordering:
19300Sstevel@tonic-gate  *
19310Sstevel@tonic-gate  *	There is a potential but rare deadlock situation
19320Sstevel@tonic-gate  *	for page promotion and demotion operations. The problem
19330Sstevel@tonic-gate  *	is there are two paths into the freelist manager and
19340Sstevel@tonic-gate  *	they have different lock orders:
19350Sstevel@tonic-gate  *
19360Sstevel@tonic-gate  *	page_create()
19370Sstevel@tonic-gate  *		lock freelist
19380Sstevel@tonic-gate  *		page_lock(EXCL)
19390Sstevel@tonic-gate  *		unlock freelist
19400Sstevel@tonic-gate  *		return
19410Sstevel@tonic-gate  *		caller drops page_lock
19420Sstevel@tonic-gate  *
19430Sstevel@tonic-gate  *	page_free() and page_reclaim()
19440Sstevel@tonic-gate  *		caller grabs page_lock(EXCL)
19450Sstevel@tonic-gate  *
19460Sstevel@tonic-gate  *		lock freelist
19470Sstevel@tonic-gate  *		unlock freelist
19480Sstevel@tonic-gate  *		drop page_lock
19490Sstevel@tonic-gate  *
19500Sstevel@tonic-gate  *	What prevents a thread in page_create() from deadlocking
19510Sstevel@tonic-gate  *	with a thread freeing or reclaiming the same page is the
19520Sstevel@tonic-gate  *	page_trylock() in page_get_freelist(). If the trylock fails
19530Sstevel@tonic-gate  *	it skips the page.
19540Sstevel@tonic-gate  *
19550Sstevel@tonic-gate  *	The lock ordering for promotion and demotion is the same as
19560Sstevel@tonic-gate  *	for page_create(). Since the same deadlock could occur during
19570Sstevel@tonic-gate  *	page promotion and freeing or reclaiming of a page on the
19580Sstevel@tonic-gate  *	cache list, we might have to fail the operation and undo
19590Sstevel@tonic-gate  *	what we have done so far. Again, this is rare.
19600Sstevel@tonic-gate  */
19610Sstevel@tonic-gate page_t *
19622961Sdp78419 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
19630Sstevel@tonic-gate {
19640Sstevel@tonic-gate 	page_t		*pp, *pplist, *tpp, *start_pp;
19650Sstevel@tonic-gate 	pgcnt_t		new_npgs, npgs;
19660Sstevel@tonic-gate 	uint_t		bin;
19670Sstevel@tonic-gate 	pgcnt_t		tmpnpgs, pages_left;
19680Sstevel@tonic-gate 	uint_t		noreloc;
19690Sstevel@tonic-gate 	int 		which_list;
19700Sstevel@tonic-gate 	ulong_t		index;
19710Sstevel@tonic-gate 	kmutex_t	*phm;
19720Sstevel@tonic-gate 
19730Sstevel@tonic-gate 	/*
19740Sstevel@tonic-gate 	 * General algorithm:
19750Sstevel@tonic-gate 	 * Find the starting page
19760Sstevel@tonic-gate 	 * Walk each page struct removing it from the freelist,
19770Sstevel@tonic-gate 	 * and linking it to all the other pages removed.
19780Sstevel@tonic-gate 	 * Once all pages are off the freelist,
19790Sstevel@tonic-gate 	 * walk the list, modifying p_szc to new_szc and what
19800Sstevel@tonic-gate 	 * ever other info needs to be done to create a large free page.
19810Sstevel@tonic-gate 	 * According to the flags, either return the page or put it
19820Sstevel@tonic-gate 	 * on the freelist.
19830Sstevel@tonic-gate 	 */
19840Sstevel@tonic-gate 
19850Sstevel@tonic-gate 	start_pp = page_numtopp_nolock(pfnum);
19860Sstevel@tonic-gate 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
19870Sstevel@tonic-gate 	new_npgs = page_get_pagecnt(new_szc);
19880Sstevel@tonic-gate 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
19890Sstevel@tonic-gate 
19902961Sdp78419 	/* don't return page of the wrong mtype */
19912961Sdp78419 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
19922961Sdp78419 		return (NULL);
19932961Sdp78419 
19940Sstevel@tonic-gate 	/*
19950Sstevel@tonic-gate 	 * Loop through smaller pages to confirm that all pages
19960Sstevel@tonic-gate 	 * give the same result for PP_ISNORELOC().
19970Sstevel@tonic-gate 	 * We can check this reliably here as the protocol for setting
19980Sstevel@tonic-gate 	 * P_NORELOC requires pages to be taken off the free list first.
19990Sstevel@tonic-gate 	 */
20002961Sdp78419 	noreloc = PP_ISNORELOC(start_pp);
20012961Sdp78419 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
20022961Sdp78419 		if (noreloc != PP_ISNORELOC(pp)) {
20030Sstevel@tonic-gate 			page_promote_noreloc_err++;
20040Sstevel@tonic-gate 			page_promote_err++;
20050Sstevel@tonic-gate 			return (NULL);
20060Sstevel@tonic-gate 		}
20070Sstevel@tonic-gate 	}
20080Sstevel@tonic-gate 
20090Sstevel@tonic-gate 	pages_left = new_npgs;
20100Sstevel@tonic-gate 	pplist = NULL;
20110Sstevel@tonic-gate 	pp = start_pp;
20120Sstevel@tonic-gate 
20130Sstevel@tonic-gate 	/* Loop around coalescing the smaller pages into a big page. */
20140Sstevel@tonic-gate 	while (pages_left) {
20150Sstevel@tonic-gate 		/*
20160Sstevel@tonic-gate 		 * Remove from the freelist.
20170Sstevel@tonic-gate 		 */
20180Sstevel@tonic-gate 		ASSERT(PP_ISFREE(pp));
20190Sstevel@tonic-gate 		bin = PP_2_BIN(pp);
20200Sstevel@tonic-gate 		ASSERT(mnode == PP_2_MEM_NODE(pp));
20210Sstevel@tonic-gate 		mtype = PP_2_MTYPE(pp);
20220Sstevel@tonic-gate 		if (PP_ISAGED(pp)) {
20230Sstevel@tonic-gate 
20240Sstevel@tonic-gate 			/*
20250Sstevel@tonic-gate 			 * PG_FREE_LIST
20260Sstevel@tonic-gate 			 */
20270Sstevel@tonic-gate 			if (pp->p_szc) {
202812293SJames.McPherson@Sun.COM 				page_vpsub(&PAGE_FREELISTS(mnode,
20290Sstevel@tonic-gate 				    pp->p_szc, bin, mtype), pp);
20300Sstevel@tonic-gate 			} else {
203112293SJames.McPherson@Sun.COM 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
203212293SJames.McPherson@Sun.COM 				    bin, mtype), pp);
20330Sstevel@tonic-gate 			}
20340Sstevel@tonic-gate 			which_list = PG_FREE_LIST;
20350Sstevel@tonic-gate 		} else {
20360Sstevel@tonic-gate 			ASSERT(pp->p_szc == 0);
20370Sstevel@tonic-gate 
20380Sstevel@tonic-gate 			/*
20390Sstevel@tonic-gate 			 * PG_CACHE_LIST
20400Sstevel@tonic-gate 			 *
20410Sstevel@tonic-gate 			 * Since this page comes from the
20420Sstevel@tonic-gate 			 * cachelist, we must destroy the
20430Sstevel@tonic-gate 			 * vnode association.
20440Sstevel@tonic-gate 			 */
20450Sstevel@tonic-gate 			if (!page_trylock(pp, SE_EXCL)) {
20460Sstevel@tonic-gate 				goto fail_promote;
20470Sstevel@tonic-gate 			}
20480Sstevel@tonic-gate 
20490Sstevel@tonic-gate 			/*
20500Sstevel@tonic-gate 			 * We need to be careful not to deadlock
20510Sstevel@tonic-gate 			 * with another thread in page_lookup().
20520Sstevel@tonic-gate 			 * The page_lookup() thread could be holding
20530Sstevel@tonic-gate 			 * the same phm that we need if the two
20540Sstevel@tonic-gate 			 * pages happen to hash to the same phm lock.
20550Sstevel@tonic-gate 			 * At this point we have locked the entire
20560Sstevel@tonic-gate 			 * freelist and page_lookup() could be trying
20570Sstevel@tonic-gate 			 * to grab a freelist lock.
20580Sstevel@tonic-gate 			 */
20590Sstevel@tonic-gate 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
20600Sstevel@tonic-gate 			phm = PAGE_HASH_MUTEX(index);
20610Sstevel@tonic-gate 			if (!mutex_tryenter(phm)) {
20623253Smec 				page_unlock_nocapture(pp);
20630Sstevel@tonic-gate 				goto fail_promote;
20640Sstevel@tonic-gate 			}
20650Sstevel@tonic-gate 
20660Sstevel@tonic-gate 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
20670Sstevel@tonic-gate 			page_hashout(pp, phm);
20680Sstevel@tonic-gate 			mutex_exit(phm);
20690Sstevel@tonic-gate 			PP_SETAGED(pp);
20703253Smec 			page_unlock_nocapture(pp);
20710Sstevel@tonic-gate 			which_list = PG_CACHE_LIST;
20720Sstevel@tonic-gate 		}
2073414Skchow 		page_ctr_sub(mnode, mtype, pp, which_list);
20740Sstevel@tonic-gate 
20750Sstevel@tonic-gate 		/*
20760Sstevel@tonic-gate 		 * Concatenate the smaller page(s) onto
20770Sstevel@tonic-gate 		 * the large page list.
20780Sstevel@tonic-gate 		 */
20790Sstevel@tonic-gate 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
20800Sstevel@tonic-gate 		pages_left -= npgs;
20810Sstevel@tonic-gate 		tpp = pp;
20820Sstevel@tonic-gate 		while (npgs--) {
20830Sstevel@tonic-gate 			tpp->p_szc = new_szc;
20840Sstevel@tonic-gate 			tpp = tpp->p_next;
20850Sstevel@tonic-gate 		}
20860Sstevel@tonic-gate 		page_list_concat(&pplist, &pp);
20870Sstevel@tonic-gate 		pp += tmpnpgs;
20880Sstevel@tonic-gate 	}
20890Sstevel@tonic-gate 	CHK_LPG(pplist, new_szc);
20900Sstevel@tonic-gate 
20910Sstevel@tonic-gate 	/*
20920Sstevel@tonic-gate 	 * return the page to the user if requested
20930Sstevel@tonic-gate 	 * in the properly locked state.
20940Sstevel@tonic-gate 	 */
209512293SJames.McPherson@Sun.COM 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
20960Sstevel@tonic-gate 		return (pplist);
20970Sstevel@tonic-gate 	}
20980Sstevel@tonic-gate 
20990Sstevel@tonic-gate 	/*
21000Sstevel@tonic-gate 	 * Otherwise place the new large page on the freelist
21010Sstevel@tonic-gate 	 */
21020Sstevel@tonic-gate 	bin = PP_2_BIN(pplist);
21030Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pplist);
21040Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pplist);
210512293SJames.McPherson@Sun.COM 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
21060Sstevel@tonic-gate 
2107414Skchow 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
21080Sstevel@tonic-gate 	return (NULL);
21090Sstevel@tonic-gate 
21100Sstevel@tonic-gate fail_promote:
21110Sstevel@tonic-gate 	/*
21120Sstevel@tonic-gate 	 * A thread must have still been freeing or
21130Sstevel@tonic-gate 	 * reclaiming the page on the cachelist.
21140Sstevel@tonic-gate 	 * To prevent a deadlock undo what we have
21150Sstevel@tonic-gate 	 * done so far and return failure. This
21160Sstevel@tonic-gate 	 * situation can only happen while promoting
21170Sstevel@tonic-gate 	 * PAGESIZE pages.
21180Sstevel@tonic-gate 	 */
21190Sstevel@tonic-gate 	page_promote_err++;
21200Sstevel@tonic-gate 	while (pplist) {
21210Sstevel@tonic-gate 		pp = pplist;
21220Sstevel@tonic-gate 		mach_page_sub(&pplist, pp);
21230Sstevel@tonic-gate 		pp->p_szc = 0;
21240Sstevel@tonic-gate 		bin = PP_2_BIN(pp);
21250Sstevel@tonic-gate 		mtype = PP_2_MTYPE(pp);
212612293SJames.McPherson@Sun.COM 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2127414Skchow 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
21280Sstevel@tonic-gate 	}
21290Sstevel@tonic-gate 	return (NULL);
21300Sstevel@tonic-gate 
21310Sstevel@tonic-gate }
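
/*
 * Illustrative use (see page_promote_size() above): with the freelist
 * locked and a full region of free pages, coalesce in place and leave
 * the large page on the freelist:
 *
 *	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
 *		(void) page_promote(mnode, pfn, new_szc, PC_FREE,
 *		    PC_MTYPE_ANY);
 */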
21320Sstevel@tonic-gate 
21330Sstevel@tonic-gate /*
21340Sstevel@tonic-gate  * Break up a large page into smaller size pages.
21350Sstevel@tonic-gate  * Pages involved are on the freelist before the call and may
21360Sstevel@tonic-gate  * be returned to the caller if requested, otherwise they will
21370Sstevel@tonic-gate  * be placed back on the freelist.
21380Sstevel@tonic-gate  * The caller is responsible for locking the freelist as well as any other
21390Sstevel@tonic-gate  * accounting which needs to be done for a returned page.
21400Sstevel@tonic-gate  * If flags is not PC_ALLOC, the color argument is ignored, and thus
21410Sstevel@tonic-gate  * technically, any value may be passed in, but PC_NO_COLOR is the standard
21420Sstevel@tonic-gate  * that should be followed for clarity's sake.
21437656SSherry.Moore@Sun.COM  * If pfnmax is non-zero, any page returned will have a pfn < pfnmax.
21440Sstevel@tonic-gate  */
21450Sstevel@tonic-gate page_t *
21467656SSherry.Moore@Sun.COM page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
21477656SSherry.Moore@Sun.COM     uchar_t new_szc, int color, int flags)
21480Sstevel@tonic-gate {
21490Sstevel@tonic-gate 	page_t	*pp, *pplist, *npplist;
21500Sstevel@tonic-gate 	pgcnt_t	npgs, n;
21510Sstevel@tonic-gate 	uint_t	bin;
21520Sstevel@tonic-gate 	uint_t	mtype;
21530Sstevel@tonic-gate 	page_t	*ret_pp = NULL;
21540Sstevel@tonic-gate 
21550Sstevel@tonic-gate 	ASSERT(cur_szc != 0);
21560Sstevel@tonic-gate 	ASSERT(new_szc < cur_szc);
21570Sstevel@tonic-gate 
21580Sstevel@tonic-gate 	pplist = page_numtopp_nolock(pfnum);
21590Sstevel@tonic-gate 	ASSERT(pplist != NULL);
21600Sstevel@tonic-gate 
21610Sstevel@tonic-gate 	ASSERT(pplist->p_szc == cur_szc);
21620Sstevel@tonic-gate 
21630Sstevel@tonic-gate 	bin = PP_2_BIN(pplist);
21640Sstevel@tonic-gate 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
21650Sstevel@tonic-gate 	mtype = PP_2_MTYPE(pplist);
216612293SJames.McPherson@Sun.COM 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
21670Sstevel@tonic-gate 
21680Sstevel@tonic-gate 	CHK_LPG(pplist, cur_szc);
2169414Skchow 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
21700Sstevel@tonic-gate 
21710Sstevel@tonic-gate 	/*
21720Sstevel@tonic-gate 	 * Number of PAGESIZE pages for smaller new_szc
21730Sstevel@tonic-gate 	 * page.
21740Sstevel@tonic-gate 	 */
21750Sstevel@tonic-gate 	npgs = page_get_pagecnt(new_szc);
21760Sstevel@tonic-gate 
21770Sstevel@tonic-gate 	while (pplist) {
21780Sstevel@tonic-gate 		pp = pplist;
21790Sstevel@tonic-gate 
21800Sstevel@tonic-gate 		ASSERT(pp->p_szc == cur_szc);
21810Sstevel@tonic-gate 
21820Sstevel@tonic-gate 		/*
21830Sstevel@tonic-gate 		 * We either break it up into PAGESIZE pages or larger.
21840Sstevel@tonic-gate 		 */
21850Sstevel@tonic-gate 		if (npgs == 1) {	/* PAGESIZE case */
21860Sstevel@tonic-gate 			mach_page_sub(&pplist, pp);
21870Sstevel@tonic-gate 			ASSERT(pp->p_szc == cur_szc);
21880Sstevel@tonic-gate 			ASSERT(new_szc == 0);
21890Sstevel@tonic-gate 			ASSERT(mnode == PP_2_MEM_NODE(pp));
21900Sstevel@tonic-gate 			pp->p_szc = new_szc;
21910Sstevel@tonic-gate 			bin = PP_2_BIN(pp);
21920Sstevel@tonic-gate 			if ((bin == color) && (flags == PC_ALLOC) &&
21937656SSherry.Moore@Sun.COM 			    (ret_pp == NULL) && (pfnmax == 0 ||
21947656SSherry.Moore@Sun.COM 			    pp->p_pagenum < pfnmax) &&
21950Sstevel@tonic-gate 			    page_trylock_cons(pp, SE_EXCL)) {
21960Sstevel@tonic-gate 				ret_pp = pp;
21970Sstevel@tonic-gate 			} else {
21980Sstevel@tonic-gate 				mtype = PP_2_MTYPE(pp);
219912293SJames.McPherson@Sun.COM 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
220012293SJames.McPherson@Sun.COM 				    mtype), pp);
2201414Skchow 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
22020Sstevel@tonic-gate 			}
22030Sstevel@tonic-gate 		} else {
22047656SSherry.Moore@Sun.COM 			page_t *try_to_return_this_page = NULL;
22057656SSherry.Moore@Sun.COM 			int count = 0;
22060Sstevel@tonic-gate 
22070Sstevel@tonic-gate 			/*
22080Sstevel@tonic-gate 			 * Break down into smaller lists of pages.
22090Sstevel@tonic-gate 			 */
22100Sstevel@tonic-gate 			page_list_break(&pplist, &npplist, npgs);
22110Sstevel@tonic-gate 
22120Sstevel@tonic-gate 			pp = pplist;
22130Sstevel@tonic-gate 			n = npgs;
22140Sstevel@tonic-gate 			while (n--) {
22150Sstevel@tonic-gate 				ASSERT(pp->p_szc == cur_szc);
22167656SSherry.Moore@Sun.COM 				/*
22177656SSherry.Moore@Sun.COM 				 * Check whether all the pages in this list
22187656SSherry.Moore@Sun.COM 				 * fit the request criteria.
22197656SSherry.Moore@Sun.COM 				 */
22207656SSherry.Moore@Sun.COM 				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
22217656SSherry.Moore@Sun.COM 					count++;
22227656SSherry.Moore@Sun.COM 				}
22230Sstevel@tonic-gate 				pp->p_szc = new_szc;
22240Sstevel@tonic-gate 				pp = pp->p_next;
22250Sstevel@tonic-gate 			}
22260Sstevel@tonic-gate 
22277656SSherry.Moore@Sun.COM 			if (count == npgs &&
22287656SSherry.Moore@Sun.COM 			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
22297656SSherry.Moore@Sun.COM 				try_to_return_this_page = pp;
22307656SSherry.Moore@Sun.COM 			}
22317656SSherry.Moore@Sun.COM 
22320Sstevel@tonic-gate 			CHK_LPG(pplist, new_szc);
22330Sstevel@tonic-gate 
22340Sstevel@tonic-gate 			bin = PP_2_BIN(pplist);
22357656SSherry.Moore@Sun.COM 			if (try_to_return_this_page)
22367656SSherry.Moore@Sun.COM 				ASSERT(mnode ==
22377656SSherry.Moore@Sun.COM 				    PP_2_MEM_NODE(try_to_return_this_page));
22380Sstevel@tonic-gate 			if ((bin == color) && (flags == PC_ALLOC) &&
22397656SSherry.Moore@Sun.COM 			    (ret_pp == NULL) && try_to_return_this_page &&
22407656SSherry.Moore@Sun.COM 			    page_trylock_cons(try_to_return_this_page,
22417656SSherry.Moore@Sun.COM 			    SE_EXCL)) {
22427656SSherry.Moore@Sun.COM 				ret_pp = try_to_return_this_page;
22430Sstevel@tonic-gate 			} else {
22440Sstevel@tonic-gate 				mtype = PP_2_MTYPE(pp);
224512293SJames.McPherson@Sun.COM 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
224612293SJames.McPherson@Sun.COM 				    bin, mtype), pplist);
22470Sstevel@tonic-gate 
2248414Skchow 				page_ctr_add(mnode, mtype, pplist,
2249414Skchow 				    PG_FREE_LIST);
22500Sstevel@tonic-gate 			}
22510Sstevel@tonic-gate 			pplist = npplist;
22520Sstevel@tonic-gate 		}
22530Sstevel@tonic-gate 	}
22540Sstevel@tonic-gate 	return (ret_pp);
22550Sstevel@tonic-gate }
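
/*
 * Illustrative use (see page_boot_demote() above): break a free large
 * page all the way down to PAGESIZE pages and leave them on the
 * freelist:
 *
 *	(void) page_demote(PP_2_MEM_NODE(pp),
 *	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0,
 *	    PC_NO_COLOR, PC_FREE);
 */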
22560Sstevel@tonic-gate 
22570Sstevel@tonic-gate int mpss_coalesce_disable = 0;
22580Sstevel@tonic-gate 
22590Sstevel@tonic-gate /*
22600Sstevel@tonic-gate  * Coalesce free pages into a page of the given szc and color if possible.
22610Sstevel@tonic-gate  * Return the pointer to the page created, otherwise, return NULL.
22622961Sdp78419  *
22632961Sdp78419  * If pfnhi is non-zero, search for a large page whose pfn range lies below pfnhi.
22640Sstevel@tonic-gate  */
22652961Sdp78419 page_t *
22662961Sdp78419 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
22672961Sdp78419     int mtype, pfn_t pfnhi)
22680Sstevel@tonic-gate {
22692961Sdp78419 	int 	r = szc;		/* region size */
22702961Sdp78419 	int	mrange;
22712961Sdp78419 	uint_t 	full, bin, color_mask, wrap = 0;
22722961Sdp78419 	pfn_t	pfnum, lo, hi;
22732961Sdp78419 	size_t	len, idx, idx0;
22742961Sdp78419 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
22750Sstevel@tonic-gate 	page_t	*ret_pp;
22764769Sdp78419 	MEM_NODE_ITERATOR_DECL(it);
22772961Sdp78419 #if defined(__sparc)
22782961Sdp78419 	pfn_t pfnum0, nlo, nhi;
22792961Sdp78419 #endif
228012293SJames.McPherson@Sun.COM 
22810Sstevel@tonic-gate 	if (mpss_coalesce_disable) {
22822961Sdp78419 		ASSERT(szc < MMU_PAGE_SIZES);
22832961Sdp78419 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
22840Sstevel@tonic-gate 		return (NULL);
22850Sstevel@tonic-gate 	}
22860Sstevel@tonic-gate 
22872961Sdp78419 	ASSERT(szc < mmu_page_sizes);
22882961Sdp78419 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
22892961Sdp78419 	ASSERT(ceq_mask <= color_mask);
22902961Sdp78419 	ASSERT(color <= color_mask);
22912961Sdp78419 	color &= ceq_mask;
22920Sstevel@tonic-gate 
22930Sstevel@tonic-gate 	/* Prevent page_counters dynamic memory from being freed */
22940Sstevel@tonic-gate 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
22952961Sdp78419 
22962961Sdp78419 	mrange = MTYPE_2_MRANGE(mnode, mtype);
22972961Sdp78419 	ASSERT(mrange < mnode_nranges[mnode]);
22982961Sdp78419 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
22992961Sdp78419 
23002961Sdp78419 	/* get pfn range for mtype */
23012961Sdp78419 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
23022961Sdp78419 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
23032961Sdp78419 	hi++;
23042961Sdp78419 
23052961Sdp78419 	/* use lower limit if given */
23062961Sdp78419 	if (pfnhi != PFNNULL && pfnhi < hi)
23072961Sdp78419 		hi = pfnhi;
23082961Sdp78419 
23092961Sdp78419 	/* round to szcpgcnt boundaries */
23102961Sdp78419 	lo = P2ROUNDUP(lo, szcpgcnt);
23116041Sdp78419 	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
23126041Sdp78419 	if (lo == (pfn_t)-1) {
23136041Sdp78419 		rw_exit(&page_ctrs_rwlock[mnode]);
23146041Sdp78419 		return (NULL);
23156041Sdp78419 	}
23162961Sdp78419 	hi = hi & ~(szcpgcnt - 1);
23172961Sdp78419 
23182961Sdp78419 	/* set lo to the closest pfn of the right color */
23194769Sdp78419 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
23204769Sdp78419 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
23214769Sdp78419 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
23224769Sdp78419 		    &it);
23232961Sdp78419 	}
23242961Sdp78419 
23252961Sdp78419 	if (hi <= lo) {
23262961Sdp78419 		rw_exit(&page_ctrs_rwlock[mnode]);
23272961Sdp78419 		return (NULL);
23282961Sdp78419 	}
23292961Sdp78419 
23302961Sdp78419 	full = FULL_REGION_CNT(r);
23312961Sdp78419 
23322961Sdp78419 	/* calculate the number of page candidates and initial search index */
23332961Sdp78419 	bin = color;
23342961Sdp78419 	idx0 = (size_t)(-1);
23352961Sdp78419 	do {
23362961Sdp78419 		pgcnt_t acand;
23372961Sdp78419 
23382961Sdp78419 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
23392961Sdp78419 		if (acand) {
23402961Sdp78419 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
23412961Sdp78419 			    r, bin, mrange);
23422961Sdp78419 			idx0 = MIN(idx0, idx);
23432961Sdp78419 			cands += acand;
23442961Sdp78419 		}
23452961Sdp78419 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
23462961Sdp78419 	} while (bin != color);
23472961Sdp78419 
23482961Sdp78419 	if (cands == 0) {
23492961Sdp78419 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
23502961Sdp78419 		rw_exit(&page_ctrs_rwlock[mnode]);
23512961Sdp78419 		return (NULL);
23522961Sdp78419 	}
23532961Sdp78419 
23542961Sdp78419 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
23552961Sdp78419 	if (pfnum < lo || pfnum >= hi) {
23562961Sdp78419 		pfnum = lo;
23574769Sdp78419 	} else {
23586041Sdp78419 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
23594769Sdp78419 		if (pfnum == (pfn_t)-1) {
23604769Sdp78419 			pfnum = lo;
23616041Sdp78419 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
23624769Sdp78419 			ASSERT(pfnum != (pfn_t)-1);
23634769Sdp78419 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
23644769Sdp78419 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
23654769Sdp78419 			/* invalid color, get the closest correct pfn */
23664769Sdp78419 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
23674769Sdp78419 			    color_mask, &it);
23684769Sdp78419 			if (pfnum >= hi) {
23694769Sdp78419 				pfnum = lo;
23706041Sdp78419 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
23714769Sdp78419 			}
23724769Sdp78419 		}
23732961Sdp78419 	}
23742961Sdp78419 
23752961Sdp78419 	/* set starting index */
23762961Sdp78419 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
23772961Sdp78419 	ASSERT(idx0 < len);
23782961Sdp78419 
23792961Sdp78419 #if defined(__sparc)
23802961Sdp78419 	pfnum0 = pfnum;		/* page corresponding to idx0 */
23812961Sdp78419 	nhi = 0;		/* search kcage ranges */
23822961Sdp78419 #endif
23832961Sdp78419 
23842961Sdp78419 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
23852961Sdp78419 
23862961Sdp78419 #if defined(__sparc)
23872961Sdp78419 		/*
23882961Sdp78419 		 * Find lowest intersection of kcage ranges and mnode.
23892961Sdp78419 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
23902961Sdp78419 		 */
23912961Sdp78419 		if (nhi <= pfnum) {
23922961Sdp78419 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
23932961Sdp78419 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
23942961Sdp78419 				goto wrapit;
23952961Sdp78419 
23962961Sdp78419 			/* jump to the next page in the range */
23972961Sdp78419 			if (pfnum < nlo) {
23982961Sdp78419 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
23996041Sdp78419 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
24002961Sdp78419 				idx = PNUM_TO_IDX(mnode, r, pfnum);
24012961Sdp78419 				if (idx >= len || pfnum >= hi)
24022961Sdp78419 					goto wrapit;
24034769Sdp78419 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
24042961Sdp78419 				    ceq_mask)
24052961Sdp78419 					goto next;
24064769Sdp78419 				if (interleaved_mnodes &&
24074769Sdp78419 				    PFN_2_MEM_NODE(pfnum) != mnode)
24084769Sdp78419 					goto next;
24092961Sdp78419 			}
24102961Sdp78419 		}
24112961Sdp78419 #endif
24122961Sdp78419 
24132961Sdp78419 		if (PAGE_COUNTERS(mnode, r, idx) != full)
24142961Sdp78419 			goto next;
24152961Sdp78419 
24162961Sdp78419 		/*
24172961Sdp78419 		 * RFE: For performance maybe we can do something less
24182961Sdp78419 		 *	brutal than locking the entire freelist; so far
24192961Sdp78419 		 *	this has not shown up as a performance problem.
24202961Sdp78419 		 */
24212961Sdp78419 		page_freelist_lock(mnode);
24220Sstevel@tonic-gate 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
24232961Sdp78419 			ret_pp =
24242961Sdp78419 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
24250Sstevel@tonic-gate 			if (ret_pp != NULL) {
24262961Sdp78419 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
24272961Sdp78419 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
24284769Sdp78419 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
24290Sstevel@tonic-gate 				page_freelist_unlock(mnode);
24300Sstevel@tonic-gate 				rw_exit(&page_ctrs_rwlock[mnode]);
24310Sstevel@tonic-gate #if defined(__sparc)
24320Sstevel@tonic-gate 				if (PP_ISNORELOC(ret_pp)) {
24330Sstevel@tonic-gate 					pgcnt_t npgs;
24340Sstevel@tonic-gate 
24350Sstevel@tonic-gate 					npgs = page_get_pagecnt(ret_pp->p_szc);
24360Sstevel@tonic-gate 					kcage_freemem_sub(npgs);
24370Sstevel@tonic-gate 				}
243812293SJames.McPherson@Sun.COM #endif
24390Sstevel@tonic-gate 				return (ret_pp);
24400Sstevel@tonic-gate 			}
24412961Sdp78419 		} else {
24422961Sdp78419 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
24430Sstevel@tonic-gate 		}
24442961Sdp78419 
24452961Sdp78419 		page_freelist_unlock(mnode);
24462961Sdp78419 		/*
24472961Sdp78419 		 * No point looking for another page if we've
24482961Sdp78419 		 * already tried all of the ones that
24492961Sdp78419 		 * page_ctr_cands indicated.  Stash off where we left
24502961Sdp78419 		 * off.
24512961Sdp78419 		 * Note: this is not exact since, for performance
24522961Sdp78419 		 * reasons, we don't hold the page_freelist locks
24532961Sdp78419 		 * when cands is initially read, but it should be
24542961Sdp78419 		 * a decent approximation.
24552961Sdp78419 		 */
24562961Sdp78419 		if (--cands == 0) {
24572961Sdp78419 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
24582961Sdp78419 			    idx;
24592961Sdp78419 			break;
24602961Sdp78419 		}
24612961Sdp78419 next:
24622961Sdp78419 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
24634769Sdp78419 		    color_mask, &it);
24642961Sdp78419 		idx = PNUM_TO_IDX(mnode, r, pfnum);
24652961Sdp78419 		if (idx >= len || pfnum >= hi) {
24662961Sdp78419 wrapit:
24672961Sdp78419 			pfnum = lo;
24686041Sdp78419 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
24692961Sdp78419 			idx = PNUM_TO_IDX(mnode, r, pfnum);
24702961Sdp78419 			wrap++;
24712961Sdp78419 #if defined(__sparc)
24722961Sdp78419 			nhi = 0;	/* search kcage ranges */
24732961Sdp78419 #endif
24742961Sdp78419 		}
24750Sstevel@tonic-gate 	}
24762961Sdp78419 
24770Sstevel@tonic-gate 	rw_exit(&page_ctrs_rwlock[mnode]);
24782961Sdp78419 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
24790Sstevel@tonic-gate 	return (NULL);
24800Sstevel@tonic-gate }
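
/*
 * Call-site sketch (condensed from page_get_mnode_freelist() later in
 * this file; not a verbatim copy): the coalescer is invoked only for
 * large sizes and, on this path, without a pfn ceiling:
 *
 *	if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
 *	    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
 *		return (pp);
 */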
24810Sstevel@tonic-gate 
24820Sstevel@tonic-gate /*
24830Sstevel@tonic-gate  * For the given mnode, promote as many small pages to large pages as possible.
24844769Sdp78419  * mnode can be -1, which means do all memory nodes.
24850Sstevel@tonic-gate  */
24860Sstevel@tonic-gate void
24870Sstevel@tonic-gate page_freelist_coalesce_all(int mnode)
24880Sstevel@tonic-gate {
24890Sstevel@tonic-gate 	int 	r;		/* region size */
24900Sstevel@tonic-gate 	int 	idx, full;
24910Sstevel@tonic-gate 	size_t	len;
24924769Sdp78419 	int doall = interleaved_mnodes || mnode < 0;
24934769Sdp78419 	int mlo = doall ? 0 : mnode;
24944769Sdp78419 	int mhi = doall ? max_mem_nodes : (mnode + 1);
24950Sstevel@tonic-gate 
24960Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
24970Sstevel@tonic-gate 
24980Sstevel@tonic-gate 	if (mpss_coalesce_disable) {
24990Sstevel@tonic-gate 		return;
25000Sstevel@tonic-gate 	}
25010Sstevel@tonic-gate 
25020Sstevel@tonic-gate 	/*
25030Sstevel@tonic-gate 	 * Lock the entire freelist and coalesce what we can.
25040Sstevel@tonic-gate 	 *
25050Sstevel@tonic-gate 	 * Always promote to the largest page possible
25060Sstevel@tonic-gate 	 * first to reduce the number of page promotions.
25070Sstevel@tonic-gate 	 */
25084769Sdp78419 	for (mnode = mlo; mnode < mhi; mnode++) {
25094769Sdp78419 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
25104769Sdp78419 		page_freelist_lock(mnode);
25114769Sdp78419 	}
25120Sstevel@tonic-gate 	for (r = mmu_page_sizes - 1; r > 0; r--) {
25134769Sdp78419 		for (mnode = mlo; mnode < mhi; mnode++) {
25144769Sdp78419 			pgcnt_t cands = 0;
25154769Sdp78419 			int mrange, nranges = mnode_nranges[mnode];
25164769Sdp78419 
25174769Sdp78419 			for (mrange = 0; mrange < nranges; mrange++) {
25184769Sdp78419 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
25194769Sdp78419 				if (cands != 0)
25204769Sdp78419 					break;
25214769Sdp78419 			}
25224769Sdp78419 			if (cands == 0) {
25234769Sdp78419 				VM_STAT_ADD(vmm_vmstats.
25244769Sdp78419 				    page_ctrs_cands_skip_all);
25254769Sdp78419 				continue;
25264769Sdp78419 			}
25274769Sdp78419 
25284769Sdp78419 			full = FULL_REGION_CNT(r);
25294769Sdp78419 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
25304769Sdp78419 
25314769Sdp78419 			for (idx = 0; idx < len; idx++) {
25324769Sdp78419 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
25334769Sdp78419 					pfn_t pfnum =
25344769Sdp78419 					    IDX_TO_PNUM(mnode, r, idx);
25354769Sdp78419 					int tmnode = interleaved_mnodes ?
25364769Sdp78419 					    PFN_2_MEM_NODE(pfnum) : mnode;
25374769Sdp78419 
25384769Sdp78419 					ASSERT(pfnum >=
25394769Sdp78419 					    mem_node_config[tmnode].physbase &&
25404769Sdp78419 					    pfnum <
25414769Sdp78419 					    mem_node_config[tmnode].physmax);
25424769Sdp78419 
25434769Sdp78419 					(void) page_promote(tmnode,
25444769Sdp78419 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
25454769Sdp78419 				}
25464769Sdp78419 			}
25474769Sdp78419 			/* shared hpm_counters covers all mnodes, so we quit */
25484769Sdp78419 			if (interleaved_mnodes)
25492961Sdp78419 				break;
25502961Sdp78419 		}
25510Sstevel@tonic-gate 	}
25524769Sdp78419 	for (mnode = mlo; mnode < mhi; mnode++) {
25534769Sdp78419 		page_freelist_unlock(mnode);
25544769Sdp78419 		rw_exit(&page_ctrs_rwlock[mnode]);
25554769Sdp78419 	}
25560Sstevel@tonic-gate }
25570Sstevel@tonic-gate 
25580Sstevel@tonic-gate /*
25590Sstevel@tonic-gate  * This is where all policies for moving pages around
25600Sstevel@tonic-gate  * to different page size free lists are implemented.
25610Sstevel@tonic-gate  * Returns a pointer to the page obtained, or NULL on failure.
25620Sstevel@tonic-gate  *
25630Sstevel@tonic-gate  * So far these are the priorities for this algorithm in descending
25640Sstevel@tonic-gate  * order:
25650Sstevel@tonic-gate  *
25660Sstevel@tonic-gate  *	1) When servicing a request try to do so with a free page
25670Sstevel@tonic-gate  *	   from next size up. Helps defer fragmentation as long
25680Sstevel@tonic-gate  *	   as possible.
25690Sstevel@tonic-gate  *
25700Sstevel@tonic-gate  *	2) Page coalesce on demand. Only when a freelist
25710Sstevel@tonic-gate  *	   larger than PAGESIZE is empty and step 1
25720Sstevel@tonic-gate  *	   will not work since all larger size lists are
25730Sstevel@tonic-gate  *	   also empty.
25740Sstevel@tonic-gate  *
25750Sstevel@tonic-gate  * If pfnlo/pfnhi are not PFNNULL, only consider large pages in [pfnlo, pfnhi).
25760Sstevel@tonic-gate  */
25772961Sdp78419 
25780Sstevel@tonic-gate page_t *
25792961Sdp78419 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
25807656SSherry.Moore@Sun.COM     pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
25810Sstevel@tonic-gate {
25820Sstevel@tonic-gate 	uchar_t nszc = szc + 1;
25832961Sdp78419 	uint_t 	bin, sbin, bin_prev;
25840Sstevel@tonic-gate 	page_t	*pp, *firstpp;
25850Sstevel@tonic-gate 	page_t	*ret_pp = NULL;
25862961Sdp78419 	uint_t  color_mask;
25872961Sdp78419 
25882961Sdp78419 	if (nszc == mmu_page_sizes)
25892961Sdp78419 		return (NULL);
25902961Sdp78419 
25912961Sdp78419 	ASSERT(nszc < mmu_page_sizes);
25922961Sdp78419 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
25932961Sdp78419 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
25942961Sdp78419 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
25952961Sdp78419 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
25962961Sdp78419 
25972961Sdp78419 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
25980Sstevel@tonic-gate 	/*
25992961Sdp78419 	 * First try to break up a larger page to fill current size freelist.
26000Sstevel@tonic-gate 	 */
26012961Sdp78419 	while (plw->plw_bins[nszc] != 0) {
26022961Sdp78419 
26032961Sdp78419 		ASSERT(nszc < mmu_page_sizes);
26042961Sdp78419 
26050Sstevel@tonic-gate 		/*
26060Sstevel@tonic-gate 		 * If page found then demote it.
26070Sstevel@tonic-gate 		 */
260812293SJames.McPherson@Sun.COM 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
26090Sstevel@tonic-gate 			page_freelist_lock(mnode);
261012293SJames.McPherson@Sun.COM 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
26110Sstevel@tonic-gate 
26120Sstevel@tonic-gate 			/*
26130Sstevel@tonic-gate 			 * If pfnhi/pfnlo is not PFNNULL, look for a large page
26140Sstevel@tonic-gate 			 * in [pfnlo, pfnhi). PFNNULL signifies no pfn requirement.
26150Sstevel@tonic-gate 			 */
26169278SSherry.Moore@Sun.COM 			if (pp &&
26179278SSherry.Moore@Sun.COM 			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
26189278SSherry.Moore@Sun.COM 			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
26190Sstevel@tonic-gate 				do {
26200Sstevel@tonic-gate 					pp = pp->p_vpnext;
26210Sstevel@tonic-gate 					if (pp == firstpp) {
26220Sstevel@tonic-gate 						pp = NULL;
26230Sstevel@tonic-gate 						break;
26240Sstevel@tonic-gate 					}
26257656SSherry.Moore@Sun.COM 				} while ((pfnhi != PFNNULL &&
26267656SSherry.Moore@Sun.COM 				    pp->p_pagenum >= pfnhi) ||
26277656SSherry.Moore@Sun.COM 				    (pfnlo != PFNNULL &&
26287656SSherry.Moore@Sun.COM 				    pp->p_pagenum < pfnlo));
26297656SSherry.Moore@Sun.COM 
26307656SSherry.Moore@Sun.COM 				if (pfnhi != PFNNULL && pp != NULL)
26317656SSherry.Moore@Sun.COM 					ASSERT(pp->p_pagenum < pfnhi);
26327656SSherry.Moore@Sun.COM 
26337656SSherry.Moore@Sun.COM 				if (pfnlo != PFNNULL && pp != NULL)
26347656SSherry.Moore@Sun.COM 					ASSERT(pp->p_pagenum >= pfnlo);
26350Sstevel@tonic-gate 			}
26360Sstevel@tonic-gate 			if (pp) {
26372961Sdp78419 				uint_t ccolor = page_correct_color(szc, nszc,
26382961Sdp78419 				    color, bin, plw->plw_ceq_mask[szc]);
26392961Sdp78419 
26400Sstevel@tonic-gate 				ASSERT(pp->p_szc == nszc);
26412961Sdp78419 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
26420Sstevel@tonic-gate 				ret_pp = page_demote(mnode, pp->p_pagenum,
26437656SSherry.Moore@Sun.COM 				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
26440Sstevel@tonic-gate 				if (ret_pp) {
26450Sstevel@tonic-gate 					page_freelist_unlock(mnode);
26460Sstevel@tonic-gate #if defined(__sparc)
26470Sstevel@tonic-gate 					if (PP_ISNORELOC(ret_pp)) {
26480Sstevel@tonic-gate 						pgcnt_t npgs;
26490Sstevel@tonic-gate 
26500Sstevel@tonic-gate 						npgs = page_get_pagecnt(
26510Sstevel@tonic-gate 						    ret_pp->p_szc);
26520Sstevel@tonic-gate 						kcage_freemem_sub(npgs);
26530Sstevel@tonic-gate 					}
265412293SJames.McPherson@Sun.COM #endif
26550Sstevel@tonic-gate 					return (ret_pp);
26560Sstevel@tonic-gate 				}
26570Sstevel@tonic-gate 			}
26580Sstevel@tonic-gate 			page_freelist_unlock(mnode);
26590Sstevel@tonic-gate 		}
26602961Sdp78419 
26612961Sdp78419 		/* loop through next size bins */
26622961Sdp78419 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
26632961Sdp78419 		plw->plw_bins[nszc]--;
26642961Sdp78419 
26652961Sdp78419 		if (bin == sbin) {
26662961Sdp78419 			uchar_t nnszc = nszc + 1;
26672961Sdp78419 
26682961Sdp78419 			/* we are done with this page size - check next */
26692961Sdp78419 			if (plw->plw_bins[nnszc] == 0)
26702961Sdp78419 				/* we have already checked next size bins */
26712961Sdp78419 				break;
26722961Sdp78419 
26732961Sdp78419 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
26742961Sdp78419 			if (bin_prev != INVALID_COLOR) {
26752961Sdp78419 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
26762961Sdp78419 				if (!((bin ^ bin_prev) &
26772961Sdp78419 				    plw->plw_ceq_mask[nnszc]))
26782961Sdp78419 					break;
26792961Sdp78419 			}
26802961Sdp78419 			ASSERT(nnszc < mmu_page_sizes);
26812961Sdp78419 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
26822961Sdp78419 			nszc = nnszc;
26832961Sdp78419 			ASSERT(nszc < mmu_page_sizes);
26842961Sdp78419 		}
26850Sstevel@tonic-gate 	}
26860Sstevel@tonic-gate 
26870Sstevel@tonic-gate 	return (ret_pp);
26880Sstevel@tonic-gate }
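
/*
 * Illustrative flow (hypothetical page sizes): a request for a 64K page
 * finds the 64K freelist empty, so page_freelist_split() scans the
 * next-size (e.g. 512K) bins of equivalent color. When a page is found,
 * page_demote() breaks it up: one 64K piece of the requested color is
 * returned and the remaining pieces go back on the freelists, deferring
 * further fragmentation.
 */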
26890Sstevel@tonic-gate 
26900Sstevel@tonic-gate /*
26910Sstevel@tonic-gate  * Helper routine used only by the freelist code to lock
26920Sstevel@tonic-gate  * a page. If the page is a large page then it succeeds in
26930Sstevel@tonic-gate  * locking all the constituent pages or none at all.
26940Sstevel@tonic-gate  * Returns 1 on sucess, 0 on failure.
26950Sstevel@tonic-gate  */
26960Sstevel@tonic-gate static int
26970Sstevel@tonic-gate page_trylock_cons(page_t *pp, se_t se)
26980Sstevel@tonic-gate {
26990Sstevel@tonic-gate 	page_t	*tpp, *first_pp = pp;
27000Sstevel@tonic-gate 
27010Sstevel@tonic-gate 	/*
27020Sstevel@tonic-gate 	 * Fail if can't lock first or only page.
27030Sstevel@tonic-gate 	 */
27040Sstevel@tonic-gate 	if (!page_trylock(pp, se)) {
27050Sstevel@tonic-gate 		return (0);
27060Sstevel@tonic-gate 	}
27070Sstevel@tonic-gate 
27080Sstevel@tonic-gate 	/*
27090Sstevel@tonic-gate 	 * PAGESIZE: common case.
27100Sstevel@tonic-gate 	 */
27110Sstevel@tonic-gate 	if (pp->p_szc == 0) {
27120Sstevel@tonic-gate 		return (1);
27130Sstevel@tonic-gate 	}
27140Sstevel@tonic-gate 
27150Sstevel@tonic-gate 	/*
27160Sstevel@tonic-gate 	 * Large page case.
27170Sstevel@tonic-gate 	 */
27180Sstevel@tonic-gate 	tpp = pp->p_next;
27190Sstevel@tonic-gate 	while (tpp != pp) {
27200Sstevel@tonic-gate 		if (!page_trylock(tpp, se)) {
27210Sstevel@tonic-gate 			/*
27223253Smec 			 * On failure unlock what we have locked so far.
27233253Smec 			 * We want to avoid attempting to capture these
27243253Smec 			 * pages as the pcm mutex may be held which could
27253253Smec 			 * lead to a recursive mutex panic.
27260Sstevel@tonic-gate 			 */
27270Sstevel@tonic-gate 			while (first_pp != tpp) {
27283253Smec 				page_unlock_nocapture(first_pp);
27290Sstevel@tonic-gate 				first_pp = first_pp->p_next;
27300Sstevel@tonic-gate 			}
27310Sstevel@tonic-gate 			return (0);
27320Sstevel@tonic-gate 		}
27330Sstevel@tonic-gate 		tpp = tpp->p_next;
27340Sstevel@tonic-gate 	}
27350Sstevel@tonic-gate 	return (1);
27360Sstevel@tonic-gate }
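
/*
 * Call-site sketch (condensed from page_get_mnode_freelist() below; not
 * a verbatim copy): freelist candidates that cannot be locked whole are
 * simply skipped in favor of the next page on the list:
 *
 *	while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp, SE_EXCL)) {
 *		pp = (szc == 0) ? pp->p_next : pp->p_vpnext;
 *		...
 *	}
 */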
27370Sstevel@tonic-gate 
27382961Sdp78419 /*
27392961Sdp78419  * Init context for walking page lists.
27402961Sdp78419  * Called when a page of the given szc is unavailable. Sets markers
27412961Sdp78419  * for the beginning of the search to detect when the search has
27422961Sdp78419  * completed a full cycle. Sets flags for splitting larger pages
27432961Sdp78419  * and coalescing smaller pages. Page walking proceeds until a page
27442961Sdp78419  * of the desired equivalent color is found.
27452961Sdp78419  */
27462961Sdp78419 void
27472961Sdp78419 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
27482961Sdp78419     int use_ceq, page_list_walker_t *plw)
27492961Sdp78419 {
27502961Sdp78419 	uint_t  nszc, ceq_mask, colors;
27512961Sdp78419 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
27522961Sdp78419 
27532961Sdp78419 	ASSERT(szc < mmu_page_sizes);
27542961Sdp78419 	colors = PAGE_GET_PAGECOLORS(szc);
27552961Sdp78419 
27562961Sdp78419 	plw->plw_colors = colors;
27572961Sdp78419 	plw->plw_color_mask = colors - 1;
27582961Sdp78419 	plw->plw_bin_marker = plw->plw_bin0 = bin;
27592961Sdp78419 	plw->plw_bin_split_prev = bin;
27602961Sdp78419 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
27612961Sdp78419 
27622961Sdp78419 	/*
27632961Sdp78419 	 * if vac aliasing is possible, make sure the lower-order color
27642961Sdp78419 	 * bits are never ignored
27652961Sdp78419 	 */
27662961Sdp78419 	if (vac_colors > 1)
27672961Sdp78419 		ceq &= 0xf0;
27682961Sdp78419 
27692961Sdp78419 	/*
27702961Sdp78419 	 * calculate the number of non-equivalent colors and
27712961Sdp78419 	 * color equivalency mask
27722961Sdp78419 	 */
27732961Sdp78419 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
27742961Sdp78419 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
27752961Sdp78419 	ASSERT(plw->plw_ceq_dif > 0);
27762961Sdp78419 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
27772961Sdp78419 
27782961Sdp78419 	if (flags & PG_MATCH_COLOR) {
27792961Sdp78419 		if (cpu_page_colors <  0) {
27802961Sdp78419 			/*
27812961Sdp78419 			 * this is a heterogeneous machine with different CPUs
27822961Sdp78419 			 * having different size e$ (not supported for ni2/rock
27832961Sdp78419 			 */
27842961Sdp78419 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
27852961Sdp78419 			cpucolors = MAX(cpucolors, 1);
27862961Sdp78419 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
27872961Sdp78419 			plw->plw_ceq_mask[szc] =
27882961Sdp78419 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
27892961Sdp78419 		}
27902961Sdp78419 		plw->plw_ceq_dif = 1;
27912961Sdp78419 	}
27922961Sdp78419 
27932961Sdp78419 	/* we can split pages in the freelist, but not the cachelist */
27942961Sdp78419 	if (can_split) {
27954769Sdp78419 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
27964769Sdp78419 
27974769Sdp78419 		/* set next szc color masks and number of free list bins */
27984769Sdp78419 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
27994769Sdp78419 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
28004769Sdp78419 			    plw->plw_ceq_mask[szc]);
28014769Sdp78419 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
28024769Sdp78419 		}
28034769Sdp78419 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
28044769Sdp78419 		plw->plw_bins[nszc] = 0;
28052961Sdp78419 
28062961Sdp78419 	} else {
28074769Sdp78419 		ASSERT(szc == 0);
28084769Sdp78419 		plw->plw_do_split = 0;
28094769Sdp78419 		plw->plw_bins[1] = 0;
28104769Sdp78419 		plw->plw_ceq_mask[1] = INVALID_MASK;
28112961Sdp78419 	}
28122961Sdp78419 }
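
/*
 * Usage sketch (condensed from page_get_mnode_freelist() below; not a
 * verbatim copy): walk every bin equivalent to the starting bin, then
 * advance to the next equivalence class until all plw_ceq_dif classes
 * have been visited.
 *
 *	page_list_walk_init(szc, flags, bin, 1, 1, &plw);
 *	for (plw.plw_count = 0;
 *	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
 *		sbin = bin;
 *		do {
 *			... try PAGE_FREELISTS(mnode, szc, bin, mtype) ...
 *			bin = ADD_MASKED(bin, plw.plw_bin_step,
 *			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
 *		} while (sbin != bin);
 *		if (plw.plw_ceq_dif > 1)
 *			bin = page_list_walk_next_bin(szc, bin, &plw);
 *	}
 */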
28132961Sdp78419 
28142961Sdp78419 /*
28152961Sdp78419  * set mark to flag where next split should occur
28162961Sdp78419  */
28172961Sdp78419 #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
28182961Sdp78419 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
28192961Sdp78419 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
28202961Sdp78419 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
28212961Sdp78419 	plw->plw_split_next =						     \
28222961Sdp78419 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
28232961Sdp78419 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
28242961Sdp78419 		plw->plw_split_next =					     \
28252961Sdp78419 		INC_MASKED(plw->plw_split_next,				     \
28262961Sdp78419 		    neq_mask, plw->plw_color_mask);			     \
28272961Sdp78419 	}								     \
28282961Sdp78419 }
28292961Sdp78419 
28302961Sdp78419 uint_t
28312961Sdp78419 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
28322961Sdp78419 {
28332961Sdp78419 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
28342961Sdp78419 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
28352961Sdp78419 	uchar_t nszc = szc + 1;
28362961Sdp78419 
28372961Sdp78419 	nbin = ADD_MASKED(bin,
28382961Sdp78419 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
28392961Sdp78419 
28402961Sdp78419 	if (plw->plw_do_split) {
28412961Sdp78419 		plw->plw_bin_split_prev = bin;
28422961Sdp78419 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
28432961Sdp78419 		plw->plw_do_split = 0;
28442961Sdp78419 	}
28452961Sdp78419 
28462961Sdp78419 	if (szc == 0) {
28472961Sdp78419 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
28482961Sdp78419 			if (nbin == plw->plw_bin0 &&
28492961Sdp78419 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
28502961Sdp78419 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
28512961Sdp78419 				    neq_mask, plw->plw_color_mask);
28522961Sdp78419 				plw->plw_bin_split_prev = plw->plw_bin0;
28532961Sdp78419 			}
28542961Sdp78419 
28552961Sdp78419 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
28562961Sdp78419 				plw->plw_bin_marker =
28572961Sdp78419 				    nbin = INC_MASKED(nbin, neq_mask,
28584769Sdp78419 				    plw->plw_color_mask);
28592961Sdp78419 				plw->plw_bin_split_prev = plw->plw_bin0;
28602961Sdp78419 				/*
28612961Sdp78419 				 * large pages all have the same vac color
28622961Sdp78419 				 * so by now we should be done with the
28632961Sdp78419 				 * next-size page splitting process
28642961Sdp78419 				 */
28652961Sdp78419 				ASSERT(plw->plw_bins[1] == 0);
28662961Sdp78419 				plw->plw_do_split = 0;
28672961Sdp78419 				return (nbin);
28682961Sdp78419 			}
28692961Sdp78419 
28702961Sdp78419 		} else {
28712961Sdp78419 			uint_t bin_jump = (vac_colors == 1) ?
28722961Sdp78419 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
28732961Sdp78419 
28742961Sdp78419 			bin_jump &= ~(vac_colors - 1);
28752961Sdp78419 
28762961Sdp78419 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
28772961Sdp78419 			    plw->plw_color_mask);
28782961Sdp78419 
28792961Sdp78419 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
28802961Sdp78419 
28812961Sdp78419 				plw->plw_bin_marker = nbin = nbin0;
28822961Sdp78419 
28832961Sdp78419 				if (plw->plw_bins[nszc] != 0) {
28842961Sdp78419 					/*
28852961Sdp78419 					 * check if next page size bin is the
28862961Sdp78419 					 * same as the next page size bin for
28872961Sdp78419 					 * bin0
28882961Sdp78419 					 */
28892961Sdp78419 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
28902961Sdp78419 					    nbin);
28912961Sdp78419 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
28922961Sdp78419 					    plw->plw_bin0);
28932961Sdp78419 
28942961Sdp78419 					if ((bin0_nsz ^ nbin_nsz) &
28952961Sdp78419 					    plw->plw_ceq_mask[nszc])
28962961Sdp78419 						plw->plw_do_split = 1;
28972961Sdp78419 				}
28982961Sdp78419 				return (nbin);
28992961Sdp78419 			}
29002961Sdp78419 		}
29012961Sdp78419 	}
29022961Sdp78419 
29032961Sdp78419 	if (plw->plw_bins[nszc] != 0) {
29044769Sdp78419 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
29054769Sdp78419 		if (!((plw->plw_split_next ^ nbin_nsz) &
29064769Sdp78419 		    plw->plw_ceq_mask[nszc]))
29074769Sdp78419 			plw->plw_do_split = 1;
29082961Sdp78419 	}
29092961Sdp78419 
29102961Sdp78419 	return (nbin);
29112961Sdp78419 }
29122961Sdp78419 
29130Sstevel@tonic-gate page_t *
291412293SJames.McPherson@Sun.COM page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
291512293SJames.McPherson@Sun.COM     uint_t flags)
29160Sstevel@tonic-gate {
29172961Sdp78419 	kmutex_t		*pcm;
29182961Sdp78419 	page_t			*pp, *first_pp;
29192961Sdp78419 	uint_t			sbin;
29202961Sdp78419 	int			plw_initialized;
29212961Sdp78419 	page_list_walker_t	plw;
29220Sstevel@tonic-gate 
29230Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
29240Sstevel@tonic-gate 
29250Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
29260Sstevel@tonic-gate 
29270Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
29282961Sdp78419 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
29290Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
29300Sstevel@tonic-gate 		return (NULL);
29310Sstevel@tonic-gate 	}
29322961Sdp78419 try_again:
293312293SJames.McPherson@Sun.COM 
29342961Sdp78419 	plw_initialized = 0;
29352961Sdp78419 	plw.plw_ceq_dif = 1;
29360Sstevel@tonic-gate 
29370Sstevel@tonic-gate 	/*
29380Sstevel@tonic-gate 	 * Only hold one freelist lock at a time, that way we
29390Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
29400Sstevel@tonic-gate 	 * ordering.
29410Sstevel@tonic-gate 	 */
29422961Sdp78419 	for (plw.plw_count = 0;
29432961Sdp78419 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
29442961Sdp78419 		sbin = bin;
29452961Sdp78419 		do {
294612293SJames.McPherson@Sun.COM 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
29472961Sdp78419 				goto bin_empty_1;
294812293SJames.McPherson@Sun.COM 
294912293SJames.McPherson@Sun.COM 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
29500Sstevel@tonic-gate 			mutex_enter(pcm);
295112293SJames.McPherson@Sun.COM 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
295212293SJames.McPherson@Sun.COM 			if (pp == NULL)
29532961Sdp78419 				goto bin_empty_0;
29542961Sdp78419 
29552961Sdp78419 			/*
29562961Sdp78419 			 * These were set before the page
29572961Sdp78419 			 * was put on the free list;
29582961Sdp78419 			 * they must still be set.
29592961Sdp78419 			 */
29602961Sdp78419 			ASSERT(PP_ISFREE(pp));
29612961Sdp78419 			ASSERT(PP_ISAGED(pp));
29622961Sdp78419 			ASSERT(pp->p_vnode == NULL);
29632961Sdp78419 			ASSERT(pp->p_hash == NULL);
29642961Sdp78419 			ASSERT(pp->p_offset == (u_offset_t)-1);
29652961Sdp78419 			ASSERT(pp->p_szc == szc);
29662961Sdp78419 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
29672961Sdp78419 
29682961Sdp78419 			/*
29692961Sdp78419 			 * Walk down the hash chain.
29702961Sdp78419 			 * 8k pages are linked on p_next
29712961Sdp78419 			 * and p_prev fields. Large pages
29722961Sdp78419 			 * are a contiguous group of
29732961Sdp78419 			 * constituent pages linked together
29742961Sdp78419 			 * on their p_next and p_prev fields.
29752961Sdp78419 			 * The large pages are linked together
29762961Sdp78419 			 * on the hash chain using p_vpnext
29772961Sdp78419 			 * p_vpprev of the base constituent
29782961Sdp78419 			 * page of each large page.
29792961Sdp78419 			 */
29802961Sdp78419 			first_pp = pp;
2981*12342SDave.Plauger@Sun.COM 			while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp,
2982*12342SDave.Plauger@Sun.COM 			    SE_EXCL)) {
29832961Sdp78419 				if (szc == 0) {
29842961Sdp78419 					pp = pp->p_next;
29852961Sdp78419 				} else {
29862961Sdp78419 					pp = pp->p_vpnext;
29872961Sdp78419 				}
29882961Sdp78419 
29890Sstevel@tonic-gate 				ASSERT(PP_ISFREE(pp));
29900Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp));
29910Sstevel@tonic-gate 				ASSERT(pp->p_vnode == NULL);
29920Sstevel@tonic-gate 				ASSERT(pp->p_hash == NULL);
29930Sstevel@tonic-gate 				ASSERT(pp->p_offset == (u_offset_t)-1);
29940Sstevel@tonic-gate 				ASSERT(pp->p_szc == szc);
29950Sstevel@tonic-gate 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
29960Sstevel@tonic-gate 
29972961Sdp78419 				if (pp == first_pp)
29982961Sdp78419 					goto bin_empty_0;
29992961Sdp78419 			}
30002961Sdp78419 
30012961Sdp78419 			ASSERT(pp != NULL);
30022961Sdp78419 			ASSERT(mtype == PP_2_MTYPE(pp));
30032961Sdp78419 			ASSERT(pp->p_szc == szc);
30042961Sdp78419 			if (szc == 0) {
300512293SJames.McPherson@Sun.COM 				page_sub(&PAGE_FREELISTS(mnode,
30062961Sdp78419 				    szc, bin, mtype), pp);
30072961Sdp78419 			} else {
300812293SJames.McPherson@Sun.COM 				page_vpsub(&PAGE_FREELISTS(mnode,
30092961Sdp78419 				    szc, bin, mtype), pp);
30102961Sdp78419 				CHK_LPG(pp, szc);
30112961Sdp78419 			}
30122961Sdp78419 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
30132961Sdp78419 
30142961Sdp78419 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
30152961Sdp78419 				panic("free page is not. pp %p", (void *)pp);
30162961Sdp78419 			mutex_exit(pcm);
30170Sstevel@tonic-gate 
30180Sstevel@tonic-gate #if defined(__sparc)
30192961Sdp78419 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
30202961Sdp78419 			    (flags & PG_NORELOC) == 0);
30212961Sdp78419 
30222961Sdp78419 			if (PP_ISNORELOC(pp))
30232961Sdp78419 				kcage_freemem_sub(page_get_pagecnt(szc));
302412293SJames.McPherson@Sun.COM #endif
30252961Sdp78419 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
30262961Sdp78419 			return (pp);
30272961Sdp78419 
30282961Sdp78419 bin_empty_0:
30292961Sdp78419 			mutex_exit(pcm);
30302961Sdp78419 bin_empty_1:
30312961Sdp78419 			if (plw_initialized == 0) {
303212293SJames.McPherson@Sun.COM 				page_list_walk_init(szc, flags, bin, 1, 1,
30332961Sdp78419 				    &plw);
30342961Sdp78419 				plw_initialized = 1;
30352961Sdp78419 				ASSERT(plw.plw_colors <=
30362961Sdp78419 				    PAGE_GET_PAGECOLORS(szc));
30372961Sdp78419 				ASSERT(plw.plw_colors > 0);
30382961Sdp78419 				ASSERT((plw.plw_colors &
30392961Sdp78419 				    (plw.plw_colors - 1)) == 0);
30402961Sdp78419 				ASSERT(bin < plw.plw_colors);
30412961Sdp78419 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
30420Sstevel@tonic-gate 			}
30432961Sdp78419 			/* calculate the next bin with equivalent color */
30442961Sdp78419 			bin = ADD_MASKED(bin, plw.plw_bin_step,
30452961Sdp78419 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
30462961Sdp78419 		} while (sbin != bin);
30470Sstevel@tonic-gate 
30480Sstevel@tonic-gate 		/*
30492961Sdp78419 		 * All bins of equivalent color are empty. Try to
30502961Sdp78419 		 * satisfy the request by breaking up or coalescing
30512961Sdp78419 		 * pages from a different size freelist of the correct
30522961Sdp78419 		 * color that satisfies the ORIGINAL color requested.
30532961Sdp78419 		 * If that fails, then try pages of the same size but
30542961Sdp78419 		 * different colors, assuming we are not called with
30550Sstevel@tonic-gate 		 * PG_MATCH_COLOR.
30560Sstevel@tonic-gate 		 */
30572961Sdp78419 		if (plw.plw_do_split &&
30582961Sdp78419 		    (pp = page_freelist_split(szc, bin, mnode,
30597656SSherry.Moore@Sun.COM 		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
30604769Sdp78419 			return (pp);
30612961Sdp78419 
30622961Sdp78419 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
30632961Sdp78419 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
30642961Sdp78419 			return (pp);
30652961Sdp78419 
30662961Sdp78419 		if (plw.plw_ceq_dif > 1)
306712293SJames.McPherson@Sun.COM 			bin = page_list_walk_next_bin(szc, bin, &plw);
30680Sstevel@tonic-gate 	}
30690Sstevel@tonic-gate 
3070414Skchow 	/* if allowed, cycle through additional mtypes */
3071414Skchow 	MTYPE_NEXT(mnode, mtype, flags);
3072414Skchow 	if (mtype >= 0)
30732961Sdp78419 		goto try_again;
3074414Skchow 
30750Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
30760Sstevel@tonic-gate 
30770Sstevel@tonic-gate 	return (NULL);
30780Sstevel@tonic-gate }
30790Sstevel@tonic-gate 
30800Sstevel@tonic-gate /*
30810Sstevel@tonic-gate  * Returns the count of free pages for 'pp' with size code 'szc'.
30820Sstevel@tonic-gate  * Note: This function does not return an exact value as the page freelist
30830Sstevel@tonic-gate  * locks are not held and thus the values in the page_counters may be
30840Sstevel@tonic-gate  * changing as we walk through the data.
30850Sstevel@tonic-gate  */
30860Sstevel@tonic-gate static int
30870Sstevel@tonic-gate page_freecnt(int mnode, page_t *pp, uchar_t szc)
30880Sstevel@tonic-gate {
30890Sstevel@tonic-gate 	pgcnt_t	pgfree;
30900Sstevel@tonic-gate 	pgcnt_t cnt;
30910Sstevel@tonic-gate 	ssize_t	r = szc;	/* region size */
30920Sstevel@tonic-gate 	ssize_t	idx;
30930Sstevel@tonic-gate 	int	i;
30940Sstevel@tonic-gate 	int	full, range;
30950Sstevel@tonic-gate 
30960Sstevel@tonic-gate 	/* Make sure pagenum passed in is aligned properly */
30970Sstevel@tonic-gate 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
30980Sstevel@tonic-gate 	ASSERT(szc > 0);
30990Sstevel@tonic-gate 
31000Sstevel@tonic-gate 	/* Prevent page_counters dynamic memory from being freed */
31010Sstevel@tonic-gate 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
31020Sstevel@tonic-gate 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
31030Sstevel@tonic-gate 	cnt = PAGE_COUNTERS(mnode, r, idx);
31040Sstevel@tonic-gate 	pgfree = cnt << PNUM_SHIFT(r - 1);
31050Sstevel@tonic-gate 	range = FULL_REGION_CNT(szc);
31060Sstevel@tonic-gate 
31070Sstevel@tonic-gate 	/* Check for completely full region */
31080Sstevel@tonic-gate 	if (cnt == range) {
31090Sstevel@tonic-gate 		rw_exit(&page_ctrs_rwlock[mnode]);
31100Sstevel@tonic-gate 		return (pgfree);
31110Sstevel@tonic-gate 	}
31120Sstevel@tonic-gate 
31130Sstevel@tonic-gate 	while (--r > 0) {
31140Sstevel@tonic-gate 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
31150Sstevel@tonic-gate 		full = FULL_REGION_CNT(r);
31160Sstevel@tonic-gate 		for (i = 0; i < range; i++, idx++) {
31170Sstevel@tonic-gate 			cnt = PAGE_COUNTERS(mnode, r, idx);
31180Sstevel@tonic-gate 			/*
31190Sstevel@tonic-gate 			 * If cnt here is full, that means we have already
31200Sstevel@tonic-gate 			 * accounted for these pages earlier.
31210Sstevel@tonic-gate 			 */
31220Sstevel@tonic-gate 			if (cnt != full) {
31230Sstevel@tonic-gate 				pgfree += (cnt << PNUM_SHIFT(r - 1));
31240Sstevel@tonic-gate 			}
31250Sstevel@tonic-gate 		}
31260Sstevel@tonic-gate 		range *= full;
31270Sstevel@tonic-gate 	}
31280Sstevel@tonic-gate 	rw_exit(&page_ctrs_rwlock[mnode]);
31290Sstevel@tonic-gate 	return (pgfree);
31300Sstevel@tonic-gate }
31310Sstevel@tonic-gate 
31320Sstevel@tonic-gate /*
31330Sstevel@tonic-gate  * Called from page_geti_contig_pages to exclusively lock constituent pages
31340Sstevel@tonic-gate  * starting from 'spp' for page size code 'szc'.
31350Sstevel@tonic-gate  *
31360Sstevel@tonic-gate  * If 'ptcpthreshold' is set, the number of free pages in the 'szc'
31370Sstevel@tonic-gate  * region must be at least pgcnt / ptcpthreshold before locking is tried.
31380Sstevel@tonic-gate  */
31390Sstevel@tonic-gate static int
31400Sstevel@tonic-gate page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
31410Sstevel@tonic-gate {
31420Sstevel@tonic-gate 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
31430Sstevel@tonic-gate 	pgcnt_t pgfree, i;
31440Sstevel@tonic-gate 	page_t *pp;
31450Sstevel@tonic-gate 
31460Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
31470Sstevel@tonic-gate 
31480Sstevel@tonic-gate 
31490Sstevel@tonic-gate 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
31500Sstevel@tonic-gate 		goto skipptcpcheck;
31510Sstevel@tonic-gate 	/*
31520Sstevel@tonic-gate 	 * check if there are sufficient free pages available before attempting
31530Sstevel@tonic-gate 	 * to trylock. Count is approximate as page counters can change.
31540Sstevel@tonic-gate 	 */
31550Sstevel@tonic-gate 	pgfree = page_freecnt(mnode, spp, szc);
31560Sstevel@tonic-gate 
31570Sstevel@tonic-gate 	/* only attempt the trylock pass if enough pages are already free */
31580Sstevel@tonic-gate 	if (pgfree < pgcnt/ptcpthreshold) {
31590Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
31600Sstevel@tonic-gate 		return (0);
31610Sstevel@tonic-gate 	}
31620Sstevel@tonic-gate 
31630Sstevel@tonic-gate skipptcpcheck:
31640Sstevel@tonic-gate 
31650Sstevel@tonic-gate 	for (i = 0; i < pgcnt; i++) {
31660Sstevel@tonic-gate 		pp = &spp[i];
31670Sstevel@tonic-gate 		if (!page_trylock(pp, SE_EXCL)) {
31680Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
31690Sstevel@tonic-gate 			while (--i != (pgcnt_t)-1) {
31700Sstevel@tonic-gate 				pp = &spp[i];
31710Sstevel@tonic-gate 				ASSERT(PAGE_EXCL(pp));
31723253Smec 				page_unlock_nocapture(pp);
31730Sstevel@tonic-gate 			}
31740Sstevel@tonic-gate 			return (0);
31750Sstevel@tonic-gate 		}
31760Sstevel@tonic-gate 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
31770Sstevel@tonic-gate 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
31780Sstevel@tonic-gate 		    !PP_ISFREE(pp)) {
31790Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
31800Sstevel@tonic-gate 			ASSERT(i == 0);
31813253Smec 			page_unlock_nocapture(pp);
31820Sstevel@tonic-gate 			return (0);
31830Sstevel@tonic-gate 		}
31840Sstevel@tonic-gate 		if (PP_ISNORELOC(pp)) {
31850Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
31860Sstevel@tonic-gate 			while (i != (pgcnt_t)-1) {
31870Sstevel@tonic-gate 				pp = &spp[i];
31880Sstevel@tonic-gate 				ASSERT(PAGE_EXCL(pp));
31893253Smec 				page_unlock_nocapture(pp);
31900Sstevel@tonic-gate 				i--;
31910Sstevel@tonic-gate 			}
31920Sstevel@tonic-gate 			return (0);
31930Sstevel@tonic-gate 		}
31940Sstevel@tonic-gate 	}
31950Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
31960Sstevel@tonic-gate 	return (1);
31970Sstevel@tonic-gate }
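
/*
 * Threshold example (illustrative numbers): for a 4M large page built
 * from 512 8K constituents (pgcnt == 512) and a tunable ptcpthreshold
 * of 2, the trylock pass above is only attempted when page_freecnt()
 * reports at least 512 / 2 == 256 pages already free in the region.
 */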
31980Sstevel@tonic-gate 
31990Sstevel@tonic-gate /*
32000Sstevel@tonic-gate  * Claim large page pointed to by 'pp'. 'pp' is the starting set
32010Sstevel@tonic-gate  * of 'szc' constituent pages that had been locked exclusively previously.
32020Sstevel@tonic-gate  * Will attempt to relocate constituent pages in use.
32030Sstevel@tonic-gate  */
32040Sstevel@tonic-gate static page_t *
32050Sstevel@tonic-gate page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
32060Sstevel@tonic-gate {
32070Sstevel@tonic-gate 	spgcnt_t pgcnt, npgs, i;
32080Sstevel@tonic-gate 	page_t *targpp, *rpp, *hpp;
32090Sstevel@tonic-gate 	page_t *replpp = NULL;
32100Sstevel@tonic-gate 	page_t *pplist = NULL;
32110Sstevel@tonic-gate 
32120Sstevel@tonic-gate 	ASSERT(pp != NULL);
32130Sstevel@tonic-gate 
32140Sstevel@tonic-gate 	pgcnt = page_get_pagecnt(szc);
32150Sstevel@tonic-gate 	while (pgcnt) {
32160Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(pp));
32170Sstevel@tonic-gate 		ASSERT(!PP_ISNORELOC(pp));
32180Sstevel@tonic-gate 		if (PP_ISFREE(pp)) {
32190Sstevel@tonic-gate 			/*
32200Sstevel@tonic-gate 			 * If this is a PG_FREE_LIST page then its
32210Sstevel@tonic-gate 			 * size code can change underneath us due to
32220Sstevel@tonic-gate 			 * page promotion or demotion. As an optimization
32230Sstevel@tonic-gate 			 * use page_list_sub_pages() instead of
32240Sstevel@tonic-gate 			 * page_list_sub().
32250Sstevel@tonic-gate 			 */
32260Sstevel@tonic-gate 			if (PP_ISAGED(pp)) {
32270Sstevel@tonic-gate 				page_list_sub_pages(pp, szc);
32280Sstevel@tonic-gate 				if (pp->p_szc == szc) {
32290Sstevel@tonic-gate 					return (pp);
32300Sstevel@tonic-gate 				}
32310Sstevel@tonic-gate 				ASSERT(pp->p_szc < szc);
32320Sstevel@tonic-gate 				npgs = page_get_pagecnt(pp->p_szc);
32330Sstevel@tonic-gate 				hpp = pp;
32340Sstevel@tonic-gate 				for (i = 0; i < npgs; i++, pp++) {
32350Sstevel@tonic-gate 					pp->p_szc = szc;
32360Sstevel@tonic-gate 				}
32370Sstevel@tonic-gate 				page_list_concat(&pplist, &hpp);
32380Sstevel@tonic-gate 				pgcnt -= npgs;
32390Sstevel@tonic-gate 				continue;
32400Sstevel@tonic-gate 			}
32410Sstevel@tonic-gate 			ASSERT(!PP_ISAGED(pp));
32420Sstevel@tonic-gate 			ASSERT(pp->p_szc == 0);
32430Sstevel@tonic-gate 			page_list_sub(pp, PG_CACHE_LIST);
32440Sstevel@tonic-gate 			page_hashout(pp, NULL);
32450Sstevel@tonic-gate 			PP_SETAGED(pp);
32460Sstevel@tonic-gate 			pp->p_szc = szc;
32470Sstevel@tonic-gate 			page_list_concat(&pplist, &pp);
32480Sstevel@tonic-gate 			pp++;
32490Sstevel@tonic-gate 			pgcnt--;
32500Sstevel@tonic-gate 			continue;
32510Sstevel@tonic-gate 		}
32520Sstevel@tonic-gate 		npgs = page_get_pagecnt(pp->p_szc);
32530Sstevel@tonic-gate 
32540Sstevel@tonic-gate 		/*
32550Sstevel@tonic-gate 		 * page_create_wait freemem accounting done by caller of
32560Sstevel@tonic-gate 		 * page_get_freelist and not necessary to call it prior to
32570Sstevel@tonic-gate 		 * calling page_get_replacement_page.
32580Sstevel@tonic-gate 		 *
32590Sstevel@tonic-gate 		 * page_get_replacement_page can call page_get_contig_pages
32600Sstevel@tonic-gate 		 * to acquire a large page (szc > 0); the replacement must be
32610Sstevel@tonic-gate 		 * smaller than the contig page size to avoid looping or
32620Sstevel@tonic-gate 		 * szc == 0 and PGI_PGCPSZC0 is set.
32630Sstevel@tonic-gate 		 */
32640Sstevel@tonic-gate 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
32650Sstevel@tonic-gate 			replpp = page_get_replacement_page(pp, NULL, 0);
32660Sstevel@tonic-gate 			if (replpp) {
32670Sstevel@tonic-gate 				npgs = page_get_pagecnt(pp->p_szc);
32680Sstevel@tonic-gate 				ASSERT(npgs <= pgcnt);
32690Sstevel@tonic-gate 				targpp = pp;
32700Sstevel@tonic-gate 			}
32710Sstevel@tonic-gate 		}
32720Sstevel@tonic-gate 
32730Sstevel@tonic-gate 		/*
32740Sstevel@tonic-gate 		 * If replacement is NULL or do_page_relocate fails, fail
32750Sstevel@tonic-gate 		 * coalescing of pages.
32760Sstevel@tonic-gate 		 */
32770Sstevel@tonic-gate 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
32780Sstevel@tonic-gate 		    &npgs, NULL) != 0)) {
32790Sstevel@tonic-gate 			/*
32800Sstevel@tonic-gate 			 * Unlock un-processed target list
32810Sstevel@tonic-gate 			 */
32820Sstevel@tonic-gate 			while (pgcnt--) {
32830Sstevel@tonic-gate 				ASSERT(PAGE_EXCL(pp));
32843253Smec 				page_unlock_nocapture(pp);
32850Sstevel@tonic-gate 				pp++;
32860Sstevel@tonic-gate 			}
32870Sstevel@tonic-gate 			/*
32880Sstevel@tonic-gate 			 * Free the processed target list.
32890Sstevel@tonic-gate 			 */
32900Sstevel@tonic-gate 			while (pplist) {
32910Sstevel@tonic-gate 				pp = pplist;
32920Sstevel@tonic-gate 				page_sub(&pplist, pp);
32930Sstevel@tonic-gate 				ASSERT(PAGE_EXCL(pp));
32940Sstevel@tonic-gate 				ASSERT(pp->p_szc == szc);
32950Sstevel@tonic-gate 				ASSERT(PP_ISFREE(pp));
32960Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp));
32970Sstevel@tonic-gate 				pp->p_szc = 0;
32980Sstevel@tonic-gate 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
32993253Smec 				page_unlock_nocapture(pp);
33000Sstevel@tonic-gate 			}
33010Sstevel@tonic-gate 
33020Sstevel@tonic-gate 			if (replpp != NULL)
33030Sstevel@tonic-gate 				page_free_replacement_page(replpp);
33040Sstevel@tonic-gate 
33050Sstevel@tonic-gate 			return (NULL);
33060Sstevel@tonic-gate 		}
33070Sstevel@tonic-gate 		ASSERT(pp == targpp);
33080Sstevel@tonic-gate 
33090Sstevel@tonic-gate 		/* LINTED */
33100Sstevel@tonic-gate 		ASSERT(hpp = pp); /* That's right, it's an assignment */
33110Sstevel@tonic-gate 
33120Sstevel@tonic-gate 		pp += npgs;
33130Sstevel@tonic-gate 		pgcnt -= npgs;
33140Sstevel@tonic-gate 
33150Sstevel@tonic-gate 		while (npgs--) {
33160Sstevel@tonic-gate 			ASSERT(PAGE_EXCL(targpp));
33170Sstevel@tonic-gate 			ASSERT(!PP_ISFREE(targpp));
33180Sstevel@tonic-gate 			ASSERT(!PP_ISNORELOC(targpp));
33190Sstevel@tonic-gate 			PP_SETFREE(targpp);
33200Sstevel@tonic-gate 			ASSERT(PP_ISAGED(targpp));
33210Sstevel@tonic-gate 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
33220Sstevel@tonic-gate 			    (flags & PGI_PGCPSZC0)));
33230Sstevel@tonic-gate 			targpp->p_szc = szc;
33240Sstevel@tonic-gate 			targpp = targpp->p_next;
33250Sstevel@tonic-gate 
33260Sstevel@tonic-gate 			rpp = replpp;
33270Sstevel@tonic-gate 			ASSERT(rpp != NULL);
33280Sstevel@tonic-gate 			page_sub(&replpp, rpp);
33290Sstevel@tonic-gate 			ASSERT(PAGE_EXCL(rpp));
33300Sstevel@tonic-gate 			ASSERT(!PP_ISFREE(rpp));
33313253Smec 			page_unlock_nocapture(rpp);
33320Sstevel@tonic-gate 		}
33330Sstevel@tonic-gate 		ASSERT(targpp == hpp);
33340Sstevel@tonic-gate 		ASSERT(replpp == NULL);
33350Sstevel@tonic-gate 		page_list_concat(&pplist, &targpp);
33360Sstevel@tonic-gate 	}
33370Sstevel@tonic-gate 	CHK_LPG(pplist, szc);
33380Sstevel@tonic-gate 	return (pplist);
33390Sstevel@tonic-gate }
33400Sstevel@tonic-gate 
33410Sstevel@tonic-gate /*
33420Sstevel@tonic-gate  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
33430Sstevel@tonic-gate  * of 0 means nothing left after trim.
33440Sstevel@tonic-gate  */
33450Sstevel@tonic-gate int
33460Sstevel@tonic-gate trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
33470Sstevel@tonic-gate {
33480Sstevel@tonic-gate 	pfn_t	kcagepfn;
33490Sstevel@tonic-gate 	int	decr;
33500Sstevel@tonic-gate 	int	rc = 0;
33510Sstevel@tonic-gate 
33520Sstevel@tonic-gate 	if (PP_ISNORELOC(mseg->pages)) {
33530Sstevel@tonic-gate 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
33540Sstevel@tonic-gate 
33550Sstevel@tonic-gate 			/* lower part of this mseg inside kernel cage */
33560Sstevel@tonic-gate 			decr = kcage_current_pfn(&kcagepfn);
33570Sstevel@tonic-gate 
33580Sstevel@tonic-gate 			/* kernel cage may have transitioned past mseg */
33590Sstevel@tonic-gate 			if (kcagepfn >= mseg->pages_base &&
33600Sstevel@tonic-gate 			    kcagepfn < mseg->pages_end) {
33610Sstevel@tonic-gate 				ASSERT(decr == 0);
33625466Skchow 				*lo = MAX(kcagepfn, pfnlo);
33635466Skchow 				*hi = MIN(pfnhi, (mseg->pages_end - 1));
33640Sstevel@tonic-gate 				rc = 1;
33650Sstevel@tonic-gate 			}
33660Sstevel@tonic-gate 		}
33670Sstevel@tonic-gate 		/* else entire mseg in the cage */
33680Sstevel@tonic-gate 	} else {
33690Sstevel@tonic-gate 		if (PP_ISNORELOC(mseg->epages - 1)) {
33700Sstevel@tonic-gate 
33710Sstevel@tonic-gate 			/* upper part of this mseg inside kernel cage */
33720Sstevel@tonic-gate 			decr = kcage_current_pfn(&kcagepfn);
33730Sstevel@tonic-gate 
33740Sstevel@tonic-gate 			/* kernel cage may have transitioned past mseg */
33750Sstevel@tonic-gate 			if (kcagepfn >= mseg->pages_base &&
33760Sstevel@tonic-gate 			    kcagepfn < mseg->pages_end) {
33770Sstevel@tonic-gate 				ASSERT(decr);
33785466Skchow 				*hi = MIN(kcagepfn, pfnhi);
33790Sstevel@tonic-gate 				*lo = MAX(pfnlo, mseg->pages_base);
33800Sstevel@tonic-gate 				rc = 1;
33810Sstevel@tonic-gate 			}
33820Sstevel@tonic-gate 		} else {
33830Sstevel@tonic-gate 			/* entire mseg outside of kernel cage */
33840Sstevel@tonic-gate 			*lo = MAX(pfnlo, mseg->pages_base);
33850Sstevel@tonic-gate 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
33860Sstevel@tonic-gate 			rc = 1;
33870Sstevel@tonic-gate 		}
33880Sstevel@tonic-gate 	}
33890Sstevel@tonic-gate 	return (rc);
33900Sstevel@tonic-gate }
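
/*
 * Worked example (hypothetical pfns): suppose mseg spans
 * [0x10000, 0x20000), its low pages sit inside the kernel cage, and
 * kcage_current_pfn() reports 0x14000. A request for [0x12000, 0x1f000]
 * is then trimmed to *lo = 0x14000, *hi = 0x1f000, i.e. the portion of
 * the span that lies outside the cage.
 */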
33910Sstevel@tonic-gate 
33920Sstevel@tonic-gate /*
33932961Sdp78419  * Called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
33940Sstevel@tonic-gate  * page with size code 'szc'. Claiming such a page requires acquiring
33950Sstevel@tonic-gate  * exclusive locks on all constituent pages (page_trylock_contig_pages),
33960Sstevel@tonic-gate  * relocating pages in use and concatenating these constituent pages into a
33970Sstevel@tonic-gate  * large page.
33980Sstevel@tonic-gate  *
33992961Sdp78419  * The page lists do not have such a large page and page_freelist_split has
34000Sstevel@tonic-gate  * already failed to demote larger pages and/or coalesce smaller free pages.
34010Sstevel@tonic-gate  *
34020Sstevel@tonic-gate  * 'flags' may specify PG_MATCH_COLOR, which limits the search to large
34030Sstevel@tonic-gate  * pages with the same color as 'bin'.
34040Sstevel@tonic-gate  *
34050Sstevel@tonic-gate  * 'pfnflag' specifies the subset of the pfn range to search.
34060Sstevel@tonic-gate  */
340712293SJames.McPherson@Sun.COM 
34080Sstevel@tonic-gate static page_t *
34090Sstevel@tonic-gate page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3410841Skchow     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
34110Sstevel@tonic-gate {
34120Sstevel@tonic-gate 	struct memseg *mseg;
34130Sstevel@tonic-gate 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
341412293SJames.McPherson@Sun.COM 	pgcnt_t szcpgmask = szcpgcnt - 1;
34150Sstevel@tonic-gate 	pfn_t	randpfn;
34160Sstevel@tonic-gate 	page_t *pp, *randpp, *endpp;
34172961Sdp78419 	uint_t colors, ceq_mask;
34182961Sdp78419 	/* LINTED : set but not used in function */
34192961Sdp78419 	uint_t color_mask;
34200Sstevel@tonic-gate 	pfn_t hi, lo;
34210Sstevel@tonic-gate 	uint_t skip;
34224769Sdp78419 	MEM_NODE_ITERATOR_DECL(it);
34230Sstevel@tonic-gate 
34240Sstevel@tonic-gate 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
342512293SJames.McPherson@Sun.COM 
34265349Skchow 	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
34275349Skchow 
342812293SJames.McPherson@Sun.COM 	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
34290Sstevel@tonic-gate 		return (NULL);
34300Sstevel@tonic-gate 
34310Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
34320Sstevel@tonic-gate 
34332961Sdp78419 	colors = PAGE_GET_PAGECOLORS(szc);
34342961Sdp78419 	color_mask = colors - 1;
34352961Sdp78419 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
34362961Sdp78419 		uchar_t ceq = colorequivszc[szc];
34372961Sdp78419 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
34382961Sdp78419 
34392961Sdp78419 		ASSERT(ceq_dif > 0);
34402961Sdp78419 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
34412961Sdp78419 	} else {
34422961Sdp78419 		ceq_mask = 0;
34432961Sdp78419 	}
34440Sstevel@tonic-gate 
34450Sstevel@tonic-gate 	ASSERT(bin < colors);
34460Sstevel@tonic-gate 
34472961Sdp78419 	/* clear "non-significant" color bits */
34482961Sdp78419 	bin &= ceq_mask;
34492961Sdp78419 
34500Sstevel@tonic-gate 	/*
34510Sstevel@tonic-gate 	 * trim the pfn range to search based on pfnflag. pfnflag is set
34520Sstevel@tonic-gate 	 * when there have been previous page_get_contig_pages failures to
34530Sstevel@tonic-gate 	 * limit the search.
34540Sstevel@tonic-gate 	 *
34550Sstevel@tonic-gate 	 * The highest set bit in pfnflag encodes the number of 'slots' the
34560Sstevel@tonic-gate 	 * pfn range is divided into; the remaining low bits select a slot.
34570Sstevel@tonic-gate 	 * For example, a value of 1010b means zero-based slot 2 of a
34580Sstevel@tonic-gate 	 * pfn range that has been divided into 8 slots.
34590Sstevel@tonic-gate 	 */
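	/*
	 * Worked arithmetic for the example above: highbit(1010b) is 4,
	 * so slots = 1 << (4 - 1) = 8 and slotid = 1010b & 0111b = 2.
	 */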
34600Sstevel@tonic-gate 	if (pfnflag > 1) {
34610Sstevel@tonic-gate 		int	slots = 1 << (highbit(pfnflag) - 1);
34620Sstevel@tonic-gate 		int	slotid = pfnflag & (slots - 1);
34630Sstevel@tonic-gate 		pgcnt_t	szcpages;
34640Sstevel@tonic-gate 		int	slotlen;
34650Sstevel@tonic-gate 
34665349Skchow 		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
34670Sstevel@tonic-gate 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
34680Sstevel@tonic-gate 		slotlen = howmany(szcpages, slots);
34695349Skchow 		/* skip if 'slotid' slot is empty */
347012293SJames.McPherson@Sun.COM 		if (slotid * slotlen >= szcpages)
34715349Skchow 			return (NULL);
34720Sstevel@tonic-gate 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
34730Sstevel@tonic-gate 		ASSERT(pfnlo < pfnhi);
34740Sstevel@tonic-gate 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
34755349Skchow 			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
34760Sstevel@tonic-gate 	}
34770Sstevel@tonic-gate 
347811185SSean.McEnroe@Sun.COM 	/*
347911185SSean.McEnroe@Sun.COM 	 * This routine can be called recursively, so we shouldn't
348011185SSean.McEnroe@Sun.COM 	 * acquire a reader lock if a write request is pending; doing
348111185SSean.McEnroe@Sun.COM 	 * so could lead to a deadlock with the DR thread.
348211185SSean.McEnroe@Sun.COM 	 *
348311185SSean.McEnroe@Sun.COM 	 * Returning NULL informs the caller that we could not get
348411185SSean.McEnroe@Sun.COM 	 * a contig page with the required characteristics.
348511185SSean.McEnroe@Sun.COM 	 */
348611185SSean.McEnroe@Sun.COM 
348711185SSean.McEnroe@Sun.COM 	if (!memsegs_trylock(0))
348811185SSean.McEnroe@Sun.COM 		return (NULL);
34890Sstevel@tonic-gate 
34900Sstevel@tonic-gate 	/*
34910Sstevel@tonic-gate 	 * loop through memsegs to look for contig page candidates
34920Sstevel@tonic-gate 	 */
34930Sstevel@tonic-gate 
34940Sstevel@tonic-gate 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
34950Sstevel@tonic-gate 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
34960Sstevel@tonic-gate 			/* no overlap */
34970Sstevel@tonic-gate 			continue;
34980Sstevel@tonic-gate 		}
34990Sstevel@tonic-gate 
35000Sstevel@tonic-gate 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
35010Sstevel@tonic-gate 			/* mseg too small */
35020Sstevel@tonic-gate 			continue;
35030Sstevel@tonic-gate 
35045466Skchow 		/*
35055466Skchow 		 * trim off kernel cage pages from pfn range and check for
35065466Skchow 		 * a trimmed pfn range returned that does not span the
35075466Skchow 		 * desired large page size.
35085466Skchow 		 */
35090Sstevel@tonic-gate 		if (kcage_on) {
35105466Skchow 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
35115725Skchow 			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
35120Sstevel@tonic-gate 				continue;
35130Sstevel@tonic-gate 		} else {
35140Sstevel@tonic-gate 			lo = MAX(pfnlo, mseg->pages_base);
35150Sstevel@tonic-gate 			hi = MIN(pfnhi, (mseg->pages_end - 1));
35160Sstevel@tonic-gate 		}
35170Sstevel@tonic-gate 
35180Sstevel@tonic-gate 		/* round to szcpgcnt boundaries */
35190Sstevel@tonic-gate 		lo = P2ROUNDUP(lo, szcpgcnt);
35205349Skchow 
35216041Sdp78419 		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
35225349Skchow 		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
35230Sstevel@tonic-gate 
35240Sstevel@tonic-gate 		if (hi <= lo)
35250Sstevel@tonic-gate 			continue;
35260Sstevel@tonic-gate 
35270Sstevel@tonic-gate 		/*
35280Sstevel@tonic-gate 		 * set lo to point to the pfn for the desired bin. Large
35290Sstevel@tonic-gate 		 * page sizes may only have a single page color
35300Sstevel@tonic-gate 		 */
35312961Sdp78419 		skip = szcpgcnt;
35324769Sdp78419 		if (ceq_mask > 0 || interleaved_mnodes) {
35332961Sdp78419 			/* set lo to point at appropriate color */
35344769Sdp78419 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
35354769Sdp78419 			    (interleaved_mnodes &&
35364769Sdp78419 			    PFN_2_MEM_NODE(lo) != mnode)) {
35374769Sdp78419 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
35384769Sdp78419 				    color_mask, &it);
35394769Sdp78419 			}
35402961Sdp78419 			if (hi <= lo)
35412961Sdp78419 				/* mseg cannot satisfy color request */
35422961Sdp78419 				continue;
35430Sstevel@tonic-gate 		}
35440Sstevel@tonic-gate 
35450Sstevel@tonic-gate 		/* randomly choose a point between lo and hi to begin search */
35460Sstevel@tonic-gate 
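		/*
		 * (GETTICK() is only a cheap pseudo-random seed; masking
		 * with ~(skip - 1) keeps the starting pfn szcpgcnt-aligned,
		 * and it cannot drop below lo because lo was rounded up to
		 * that alignment above.)
		 */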
35470Sstevel@tonic-gate 		randpfn = (pfn_t)GETTICK();
35480Sstevel@tonic-gate 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
35496041Sdp78419 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
35506041Sdp78419 		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
35515847Sdp78419 			if (randpfn != (pfn_t)-1) {
35524769Sdp78419 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
35534769Sdp78419 				    ceq_mask, color_mask, &it);
35545847Sdp78419 			}
35554769Sdp78419 			if (randpfn >= hi) {
35564769Sdp78419 				randpfn = lo;
35576041Sdp78419 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
35586041Sdp78419 				    &it);
35594769Sdp78419 			}
35602961Sdp78419 		}
35610Sstevel@tonic-gate 		randpp = mseg->pages + (randpfn - mseg->pages_base);
35620Sstevel@tonic-gate 
35630Sstevel@tonic-gate 		ASSERT(randpp->p_pagenum == randpfn);
35640Sstevel@tonic-gate 
35650Sstevel@tonic-gate 		pp = randpp;
35665349Skchow 		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;
35670Sstevel@tonic-gate 
35680Sstevel@tonic-gate 		ASSERT(randpp + szcpgcnt <= endpp);
35690Sstevel@tonic-gate 
35700Sstevel@tonic-gate 		do {
35710Sstevel@tonic-gate 			ASSERT(!(pp->p_pagenum & szcpgmask));
35722961Sdp78419 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
35732961Sdp78419 
35740Sstevel@tonic-gate 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
35750Sstevel@tonic-gate 				/* pages unlocked by page_claim on failure */
35760Sstevel@tonic-gate 				if (page_claim_contig_pages(pp, szc, flags)) {
35770Sstevel@tonic-gate 					memsegs_unlock(0);
35780Sstevel@tonic-gate 					return (pp);
35790Sstevel@tonic-gate 				}
35800Sstevel@tonic-gate 			}
35810Sstevel@tonic-gate 
35824769Sdp78419 			if (ceq_mask == 0 && !interleaved_mnodes) {
35832961Sdp78419 				pp += skip;
35842961Sdp78419 			} else {
35852961Sdp78419 				pfn_t pfn = pp->p_pagenum;
35862961Sdp78419 
35872961Sdp78419 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
35884769Sdp78419 				    ceq_mask, color_mask, &it);
35894769Sdp78419 				if (pfn == (pfn_t)-1) {
35904769Sdp78419 					pp = endpp;
35914769Sdp78419 				} else {
35924769Sdp78419 					pp = mseg->pages +
35934769Sdp78419 					    (pfn - mseg->pages_base);
35944769Sdp78419 				}
35952961Sdp78419 			}
35960Sstevel@tonic-gate 			if (pp >= endpp) {
35970Sstevel@tonic-gate 				/* start from the beginning */
35986041Sdp78419 				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
35990Sstevel@tonic-gate 				pp = mseg->pages + (lo - mseg->pages_base);
36000Sstevel@tonic-gate 				ASSERT(pp->p_pagenum == lo);
36010Sstevel@tonic-gate 				ASSERT(pp + szcpgcnt <= endpp);
36020Sstevel@tonic-gate 			}
36030Sstevel@tonic-gate 		} while (pp != randpp);
36040Sstevel@tonic-gate 	}
36050Sstevel@tonic-gate 	memsegs_unlock(0);
36060Sstevel@tonic-gate 	return (NULL);
36070Sstevel@tonic-gate }
36080Sstevel@tonic-gate 
360912293SJames.McPherson@Sun.COM 
36100Sstevel@tonic-gate /*
36110Sstevel@tonic-gate  * controlling routine that searches through physical memory in an attempt to
36120Sstevel@tonic-gate  * claim a large page based on the input parameters from contiguous
36130Sstevel@tonic-gate  * pages on the page free lists.
36140Sstevel@tonic-gate  *
36150Sstevel@tonic-gate  * calls page_geti_contig_pages with an initial pfn range from the mnode
36160Sstevel@tonic-gate  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
36170Sstevel@tonic-gate  * that overlaps with the kernel cage or does not match the requested page
36180Sstevel@tonic-gate  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
36190Sstevel@tonic-gate  * page_geti_contig_pages may further limit the search range based on
36200Sstevel@tonic-gate  * previous failure counts (pgcpfailcnt[]).
36210Sstevel@tonic-gate  *
36220Sstevel@tonic-gate  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
36230Sstevel@tonic-gate  * pagesize page that satisfies mtype.
36240Sstevel@tonic-gate  */
36250Sstevel@tonic-gate page_t *
362612293SJames.McPherson@Sun.COM page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
362712293SJames.McPherson@Sun.COM     uint_t flags)
36280Sstevel@tonic-gate {
36290Sstevel@tonic-gate 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
36300Sstevel@tonic-gate 	page_t		*pp;
3631841Skchow 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
36320Sstevel@tonic-gate 
36330Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
36340Sstevel@tonic-gate 
36352124Smec 	/* no allocations from cage */
36362124Smec 	flags |= PGI_NOCAGE;
36372124Smec 
36380Sstevel@tonic-gate 	/* LINTED */
36390Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
36400Sstevel@tonic-gate 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
36410Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
36420Sstevel@tonic-gate 		return (NULL);
36430Sstevel@tonic-gate 	}
36440Sstevel@tonic-gate 
36450Sstevel@tonic-gate 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
36460Sstevel@tonic-gate 
36470Sstevel@tonic-gate 	/* do not limit search and ignore color if hi pri */
36480Sstevel@tonic-gate 
36490Sstevel@tonic-gate 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
36500Sstevel@tonic-gate 		pfnflag = pgcpfailcnt[szc];
36510Sstevel@tonic-gate 
36520Sstevel@tonic-gate 	/* remove color match to improve chances */
36530Sstevel@tonic-gate 
36540Sstevel@tonic-gate 	if (flags & PGI_PGCPHIPRI || pfnflag)
36550Sstevel@tonic-gate 		flags &= ~PG_MATCH_COLOR;
36560Sstevel@tonic-gate 
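	/*
	 * Walk every memory type range belonging to this mnode (e.g.
	 * DMA-reachable vs. general memory on x86) until a contig chunk
	 * is claimed or the ranges are exhausted.
	 */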
36570Sstevel@tonic-gate 	do {
36580Sstevel@tonic-gate 		/* get pfn range based on mnode and mtype */
36590Sstevel@tonic-gate 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
366012293SJames.McPherson@Sun.COM 
36610Sstevel@tonic-gate 		ASSERT(pfnhi >= pfnlo);
36620Sstevel@tonic-gate 
36630Sstevel@tonic-gate 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
36640Sstevel@tonic-gate 		    pfnlo, pfnhi, pfnflag);
36650Sstevel@tonic-gate 
36660Sstevel@tonic-gate 		if (pp != NULL) {
36670Sstevel@tonic-gate 			pfnflag = pgcpfailcnt[szc];
36680Sstevel@tonic-gate 			if (pfnflag) {
36690Sstevel@tonic-gate 				/* double the search size */
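				/*
				 * (The high bit of pfnflag encodes the
				 * slot count, so shifting it right halves
				 * the slots and doubles each slot's pfn
				 * span; see page_geti_contig_pages().)
				 */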
36700Sstevel@tonic-gate 				pgcpfailcnt[szc] = pfnflag >> 1;
36710Sstevel@tonic-gate 			}
36720Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
36730Sstevel@tonic-gate 			return (pp);
36740Sstevel@tonic-gate 		}
3675414Skchow 		MTYPE_NEXT(mnode, mtype, flags);
3676414Skchow 	} while (mtype >= 0);
36770Sstevel@tonic-gate 
36780Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
36790Sstevel@tonic-gate 	return (NULL);
36800Sstevel@tonic-gate }
36810Sstevel@tonic-gate 
36825466Skchow #if defined(__i386) || defined(__amd64)
36835466Skchow /*
36845466Skchow  * Determine the likelihood of finding/coalescing a szc page.
36855466Skchow  * Return 0 if the likelihood is small otherwise return 1.
36865466Skchow  *
36875466Skchow  * For now, be conservative and check only 1g pages and return 0
36885466Skchow  * if there had been previous coalescing failures and the szc pages
36895466Skchow  * if there have been previous coalescing failures and the szc pages
36905466Skchow  * needed to satisfy the request would exhaust most of freemem.
36915466Skchow int
36925466Skchow page_chk_freelist(uint_t szc)
36935466Skchow {
36945466Skchow 	pgcnt_t		pgcnt;
36955466Skchow 
36965466Skchow 	if (szc <= 1)
36975466Skchow 		return (1);
36985466Skchow 
36995466Skchow 	pgcnt = page_get_pagecnt(szc);
37005466Skchow 	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
37015466Skchow 		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
37025466Skchow 		return (0);
37035466Skchow 	}
37045466Skchow 	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
37055466Skchow 	return (1);
37065466Skchow }
37075466Skchow #endif
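/*
 * Illustrative caller pattern (a sketch, not from this file): a large-page
 * allocation path can probe the odds before paying for an expensive
 * coalesce attempt, e.g.
 *
 *	if (!page_chk_freelist(szc))
 *		szc = 0;	(fall back to base pages)
 */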
37080Sstevel@tonic-gate 
37090Sstevel@tonic-gate /*
37100Sstevel@tonic-gate  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
37110Sstevel@tonic-gate  *
37120Sstevel@tonic-gate  * Does its own locking and accounting.
37130Sstevel@tonic-gate  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
37140Sstevel@tonic-gate  * pages of the proper color even if there are pages of a different color.
37150Sstevel@tonic-gate  *
37160Sstevel@tonic-gate  * Finds a page, removes it, THEN locks it.
37170Sstevel@tonic-gate  */
37180Sstevel@tonic-gate 
37190Sstevel@tonic-gate /*ARGSUSED*/
37200Sstevel@tonic-gate page_t *
37210Sstevel@tonic-gate page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
37220Sstevel@tonic-gate 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
37230Sstevel@tonic-gate {
372412293SJames.McPherson@Sun.COM 	struct as	*as = seg->s_as;
372512293SJames.McPherson@Sun.COM 	page_t		*pp = NULL;
372612293SJames.McPherson@Sun.COM 	ulong_t		bin;
372712293SJames.McPherson@Sun.COM 	uchar_t		szc;
372812293SJames.McPherson@Sun.COM 	int		mnode;
372912293SJames.McPherson@Sun.COM 	int		mtype;
373012293SJames.McPherson@Sun.COM 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
373112293SJames.McPherson@Sun.COM 	lgrp_mnode_cookie_t	lgrp_cookie;
373212293SJames.McPherson@Sun.COM 
373312293SJames.McPherson@Sun.COM 	page_get_func = page_get_mnode_freelist;
373412293SJames.McPherson@Sun.COM 
373512293SJames.McPherson@Sun.COM 	/*
373612293SJames.McPherson@Sun.COM 	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
373712293SJames.McPherson@Sun.COM 	 * assume we wish to allocate near to the current thread's home.
373812293SJames.McPherson@Sun.COM 	 */
373912293SJames.McPherson@Sun.COM 	if (!LGRP_EXISTS(lgrp))
374012293SJames.McPherson@Sun.COM 		lgrp = lgrp_home_lgrp();
374112293SJames.McPherson@Sun.COM 
374212293SJames.McPherson@Sun.COM 	if (kcage_on) {
374312293SJames.McPherson@Sun.COM 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
374412293SJames.McPherson@Sun.COM 		    kcage_freemem < kcage_throttlefree + btop(size) &&
374512293SJames.McPherson@Sun.COM 		    curthread != kcage_cageout_thread) {
374612293SJames.McPherson@Sun.COM 			/*
374712293SJames.McPherson@Sun.COM 			 * Set a "reserve" of kcage_throttlefree pages for
374812293SJames.McPherson@Sun.COM 			 * PG_PANIC and cageout thread allocations.
374912293SJames.McPherson@Sun.COM 			 *
375012293SJames.McPherson@Sun.COM 			 * Everybody else has to serialize in
375112293SJames.McPherson@Sun.COM 			 * page_create_get_something() to get a cage page, so
375212293SJames.McPherson@Sun.COM 			 * that we don't deadlock cageout!
375312293SJames.McPherson@Sun.COM 			 */
375412293SJames.McPherson@Sun.COM 			return (NULL);
375512293SJames.McPherson@Sun.COM 		}
375612293SJames.McPherson@Sun.COM 	} else {
375712293SJames.McPherson@Sun.COM 		flags &= ~PG_NORELOC;
375812293SJames.McPherson@Sun.COM 		flags |= PGI_NOCAGE;
375912293SJames.McPherson@Sun.COM 	}
376012293SJames.McPherson@Sun.COM 
376112293SJames.McPherson@Sun.COM 	/* LINTED */
376212293SJames.McPherson@Sun.COM 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
376312293SJames.McPherson@Sun.COM 
376412293SJames.McPherson@Sun.COM 	/*
376512293SJames.McPherson@Sun.COM 	 * Convert size to page size code.
376612293SJames.McPherson@Sun.COM 	 */
376712293SJames.McPherson@Sun.COM 	if ((szc = page_szc(size)) == (uchar_t)-1)
376812293SJames.McPherson@Sun.COM 		panic("page_get_freelist: illegal page size request");
376912293SJames.McPherson@Sun.COM 	ASSERT(szc < mmu_page_sizes);
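	/*
	 * (For instance, with 8K base pages on sun4u, page_szc(8K) is 0
	 * and page_szc(4M) is 3; an unsupported size returns (uchar_t)-1,
	 * which is fatal here.)
	 */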
377012293SJames.McPherson@Sun.COM 
377112293SJames.McPherson@Sun.COM 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
377212293SJames.McPherson@Sun.COM 
377312293SJames.McPherson@Sun.COM 	/* LINTED */
377412293SJames.McPherson@Sun.COM 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
377512293SJames.McPherson@Sun.COM 
377612293SJames.McPherson@Sun.COM 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
377712293SJames.McPherson@Sun.COM 
377812293SJames.McPherson@Sun.COM 	/*
377912293SJames.McPherson@Sun.COM 	 * Try to get a local page first, but try remote if we can't
378012293SJames.McPherson@Sun.COM 	 * get a page of the right color.
378112293SJames.McPherson@Sun.COM 	 */
378212293SJames.McPherson@Sun.COM pgretry:
378312293SJames.McPherson@Sun.COM 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
378412293SJames.McPherson@Sun.COM 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
378512293SJames.McPherson@Sun.COM 		pp = page_get_func(mnode, bin, mtype, szc, flags);
378612293SJames.McPherson@Sun.COM 		if (pp != NULL) {
378712293SJames.McPherson@Sun.COM 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
378812293SJames.McPherson@Sun.COM 			DTRACE_PROBE4(page__get,
378912293SJames.McPherson@Sun.COM 			    lgrp_t *, lgrp,
379012293SJames.McPherson@Sun.COM 			    int, mnode,
379112293SJames.McPherson@Sun.COM 			    ulong_t, bin,
379212293SJames.McPherson@Sun.COM 			    uint_t, flags);
379312293SJames.McPherson@Sun.COM 			return (pp);
379412293SJames.McPherson@Sun.COM 		}
379512293SJames.McPherson@Sun.COM 	}
379612293SJames.McPherson@Sun.COM 	ASSERT(pp == NULL);
379712293SJames.McPherson@Sun.COM 
379812293SJames.McPherson@Sun.COM 	/*
379912293SJames.McPherson@Sun.COM 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
380012293SJames.McPherson@Sun.COM 	 * remote free lists.  The caller is expected to call page_get_cachelist(),
380112293SJames.McPherson@Sun.COM 	 * which will check local cache lists and remote free lists.
380212293SJames.McPherson@Sun.COM 	 */
380312293SJames.McPherson@Sun.COM 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
380412293SJames.McPherson@Sun.COM 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
380512293SJames.McPherson@Sun.COM 		return (NULL);
380612293SJames.McPherson@Sun.COM 	}
380712293SJames.McPherson@Sun.COM 
380812293SJames.McPherson@Sun.COM 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
380912293SJames.McPherson@Sun.COM 
381012293SJames.McPherson@Sun.COM 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
381112293SJames.McPherson@Sun.COM 
381212293SJames.McPherson@Sun.COM 	if (!(flags & PG_LOCAL)) {
381312293SJames.McPherson@Sun.COM 		/*
381412293SJames.McPherson@Sun.COM 		 * Try to get a non-local freelist page.
381512293SJames.McPherson@Sun.COM 		 */
381612293SJames.McPherson@Sun.COM 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
381712293SJames.McPherson@Sun.COM 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
381812293SJames.McPherson@Sun.COM 			pp = page_get_func(mnode, bin, mtype, szc, flags);
381912293SJames.McPherson@Sun.COM 			if (pp != NULL) {
382012293SJames.McPherson@Sun.COM 				DTRACE_PROBE4(page__get,
382112293SJames.McPherson@Sun.COM 				    lgrp_t *, lgrp,
382212293SJames.McPherson@Sun.COM 				    int, mnode,
382312293SJames.McPherson@Sun.COM 				    ulong_t, bin,
382412293SJames.McPherson@Sun.COM 				    uint_t, flags);
382512293SJames.McPherson@Sun.COM 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
382612293SJames.McPherson@Sun.COM 				return (pp);
382712293SJames.McPherson@Sun.COM 			}
382812293SJames.McPherson@Sun.COM 		}
382912293SJames.McPherson@Sun.COM 		ASSERT(pp == NULL);
383012293SJames.McPherson@Sun.COM 	}
383112293SJames.McPherson@Sun.COM 
383212293SJames.McPherson@Sun.COM 	/*
383312293SJames.McPherson@Sun.COM 	 * when the cage is off, chances are page_get_contig_pages() will fail
383412293SJames.McPherson@Sun.COM 	 * to lock a large page chunk; therefore it's not called by default
383512293SJames.McPherson@Sun.COM 	 * when the cage is off.  this can be changed via /etc/system.
383612293SJames.McPherson@Sun.COM 	 *
383712293SJames.McPherson@Sun.COM 	 * page_get_contig_pages() also called to acquire a base pagesize page
383812293SJames.McPherson@Sun.COM 	 * for page_create_get_something().
383912293SJames.McPherson@Sun.COM 	 */
384012293SJames.McPherson@Sun.COM 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
384112293SJames.McPherson@Sun.COM 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
384212293SJames.McPherson@Sun.COM 	    (page_get_func != page_get_contig_pages)) {
384312293SJames.McPherson@Sun.COM 
384412293SJames.McPherson@Sun.COM 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
384512293SJames.McPherson@Sun.COM 		page_get_func = page_get_contig_pages;
384612293SJames.McPherson@Sun.COM 		goto pgretry;
384712293SJames.McPherson@Sun.COM 	}
384812293SJames.McPherson@Sun.COM 
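	/*
	 * Record the failure: SETPGCPFAILCNT() bumps pgcpfailcnt[szc]
	 * (capped by the platform definition) so the next contig search
	 * is trimmed to a smaller pfn slot in page_geti_contig_pages().
	 */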
384912293SJames.McPherson@Sun.COM 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
385012293SJames.McPherson@Sun.COM 	    page_get_func == page_get_contig_pages)
385112293SJames.McPherson@Sun.COM 		SETPGCPFAILCNT(szc);
385212293SJames.McPherson@Sun.COM 
385312293SJames.McPherson@Sun.COM 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
385412293SJames.McPherson@Sun.COM 	return (NULL);
38550Sstevel@tonic-gate }
38560Sstevel@tonic-gate 
38570Sstevel@tonic-gate /*
38580Sstevel@tonic-gate  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
38590Sstevel@tonic-gate  *
38600Sstevel@tonic-gate  * Does its own locking.
38610Sstevel@tonic-gate  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
38620Sstevel@tonic-gate  * pages of the proper color even if there are pages of a different color.
38630Sstevel@tonic-gate  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
38640Sstevel@tonic-gate  * try to lock one of them.  If no page can be locked, try the
38650Sstevel@tonic-gate  * next bin.  Return NULL if a page cannot be found and locked.
38660Sstevel@tonic-gate  *
38670Sstevel@tonic-gate  * Finds a pages, trys to lock it, then removes it.
38680Sstevel@tonic-gate  * Finds a page, tries to lock it, then removes it.
38690Sstevel@tonic-gate 
38700Sstevel@tonic-gate /*ARGSUSED*/
38710Sstevel@tonic-gate page_t *
38720Sstevel@tonic-gate page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
38730Sstevel@tonic-gate     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
38740Sstevel@tonic-gate {
38750Sstevel@tonic-gate 	page_t		*pp;
38760Sstevel@tonic-gate 	struct as	*as = seg->s_as;
38770Sstevel@tonic-gate 	ulong_t		bin;
38780Sstevel@tonic-gate 	/*LINTED*/
38790Sstevel@tonic-gate 	int		mnode;
38800Sstevel@tonic-gate 	int		mtype;
38810Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
38820Sstevel@tonic-gate 
38830Sstevel@tonic-gate 	/*
38840Sstevel@tonic-gate 	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
38850Sstevel@tonic-gate 	 * assume we wish to allocate near to the current thread's home.
38860Sstevel@tonic-gate 	 */
38870Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
38880Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
38890Sstevel@tonic-gate 
38900Sstevel@tonic-gate 	if (!kcage_on) {
38910Sstevel@tonic-gate 		flags &= ~PG_NORELOC;
38920Sstevel@tonic-gate 		flags |= PGI_NOCAGE;
38930Sstevel@tonic-gate 	}
38940Sstevel@tonic-gate 
38950Sstevel@tonic-gate 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
38960Sstevel@tonic-gate 	    kcage_freemem <= kcage_throttlefree) {
38970Sstevel@tonic-gate 		/*
38980Sstevel@tonic-gate 		 * Reserve kcage_throttlefree pages for critical kernel
38990Sstevel@tonic-gate 		 * threads.
39000Sstevel@tonic-gate 		 *
39010Sstevel@tonic-gate 		 * Everybody else has to go to page_create_get_something()
39020Sstevel@tonic-gate 		 * to get a cage page, so we don't deadlock cageout.
39030Sstevel@tonic-gate 		 */
39040Sstevel@tonic-gate 		return (NULL);
39050Sstevel@tonic-gate 	}
39060Sstevel@tonic-gate 
39070Sstevel@tonic-gate 	/* LINTED */
390812293SJames.McPherson@Sun.COM 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
39092961Sdp78419 
39102961Sdp78419 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
39110Sstevel@tonic-gate 
39120Sstevel@tonic-gate 	/* LINTED */
39131385Skchow 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
39140Sstevel@tonic-gate 
39150Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
39160Sstevel@tonic-gate 
39170Sstevel@tonic-gate 	/*
39180Sstevel@tonic-gate 	 * Try local cachelists first
39190Sstevel@tonic-gate 	 */
39200Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
39210Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
39220Sstevel@tonic-gate 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
39230Sstevel@tonic-gate 		if (pp != NULL) {
39240Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
39250Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
39260Sstevel@tonic-gate 			    lgrp_t *, lgrp,
39270Sstevel@tonic-gate 			    int, mnode,
39280Sstevel@tonic-gate 			    ulong_t, bin,
39290Sstevel@tonic-gate 			    uint_t, flags);
39300Sstevel@tonic-gate 			return (pp);
39310Sstevel@tonic-gate 		}
39320Sstevel@tonic-gate 	}
39330Sstevel@tonic-gate 
39340Sstevel@tonic-gate 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
39350Sstevel@tonic-gate 
39360Sstevel@tonic-gate 	/*
39370Sstevel@tonic-gate 	 * Try freelists/cachelists that are farther away
39380Sstevel@tonic-gate 	 * This is our only chance to allocate remote pages for PAGESIZE
39390Sstevel@tonic-gate 	 * requests.
39400Sstevel@tonic-gate 	 */
39410Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
39420Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
394312293SJames.McPherson@Sun.COM 		pp = page_get_mnode_freelist(mnode, bin, mtype,
39440Sstevel@tonic-gate 		    0, flags);
39450Sstevel@tonic-gate 		if (pp != NULL) {
39460Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
39470Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
39480Sstevel@tonic-gate 			    lgrp_t *, lgrp,
39490Sstevel@tonic-gate 			    int, mnode,
39500Sstevel@tonic-gate 			    ulong_t, bin,
39510Sstevel@tonic-gate 			    uint_t, flags);
39520Sstevel@tonic-gate 			return (pp);
39530Sstevel@tonic-gate 		}
39540Sstevel@tonic-gate 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
39550Sstevel@tonic-gate 		if (pp != NULL) {
39560Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
39570Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
39580Sstevel@tonic-gate 			    lgrp_t *, lgrp,
39590Sstevel@tonic-gate 			    int, mnode,
39600Sstevel@tonic-gate 			    ulong_t, bin,
39610Sstevel@tonic-gate 			    uint_t, flags);
39620Sstevel@tonic-gate 			return (pp);
39630Sstevel@tonic-gate 		}
39640Sstevel@tonic-gate 	}
39650Sstevel@tonic-gate 
39660Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
39670Sstevel@tonic-gate 	return (NULL);
39680Sstevel@tonic-gate }
39690Sstevel@tonic-gate 
39700Sstevel@tonic-gate page_t *
39710Sstevel@tonic-gate page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
39720Sstevel@tonic-gate {
39732961Sdp78419 	kmutex_t		*pcm;
39742961Sdp78419 	page_t			*pp, *first_pp;
39752961Sdp78419 	uint_t			sbin;
39762961Sdp78419 	int			plw_initialized;
39772961Sdp78419 	page_list_walker_t	plw;
39780Sstevel@tonic-gate 
39790Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
39800Sstevel@tonic-gate 
39810Sstevel@tonic-gate 	/* LINTED */
39820Sstevel@tonic-gate 	MTYPE_START(mnode, mtype, flags);
39830Sstevel@tonic-gate 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
39840Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
39850Sstevel@tonic-gate 		return (NULL);
39860Sstevel@tonic-gate 	}
39870Sstevel@tonic-gate 
39882961Sdp78419 try_again:
39892961Sdp78419 
39902961Sdp78419 	plw_initialized = 0;
39912961Sdp78419 	plw.plw_ceq_dif = 1;
39920Sstevel@tonic-gate 
39930Sstevel@tonic-gate 	/*
39940Sstevel@tonic-gate 	 * Only hold one cachelist lock at a time, that way we
39950Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
39960Sstevel@tonic-gate 	 * ordering.
39970Sstevel@tonic-gate 	 */
39980Sstevel@tonic-gate 
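	/*
	 * The outer loop steps across color equivalence classes (there are
	 * plw_ceq_dif of them once the walker is initialized); the inner
	 * do-while visits every bin equivalent to 'bin' within the current
	 * class.
	 */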
39992961Sdp78419 	for (plw.plw_count = 0;
40002961Sdp78419 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
40012961Sdp78419 		sbin = bin;
40022961Sdp78419 		do {
40032961Sdp78419 
40042961Sdp78419 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
40052961Sdp78419 				goto bin_empty_1;
400612293SJames.McPherson@Sun.COM 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
40070Sstevel@tonic-gate 			mutex_enter(pcm);
40080Sstevel@tonic-gate 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
40092961Sdp78419 			if (pp == NULL)
40102961Sdp78419 				goto bin_empty_0;
40112961Sdp78419 
40122961Sdp78419 			first_pp = pp;
40132961Sdp78419 			ASSERT(pp->p_vnode);
40142961Sdp78419 			ASSERT(PP_ISAGED(pp) == 0);
40152961Sdp78419 			ASSERT(pp->p_szc == 0);
40162961Sdp78419 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
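			/*
			 * Skip pages held by the crash dump subsystem as
			 * well as pages we cannot lock exclusively.
			 */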
4017*12342SDave.Plauger@Sun.COM 			while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
40182961Sdp78419 				pp = pp->p_next;
40192961Sdp78419 				ASSERT(pp->p_szc == 0);
40202961Sdp78419 				if (pp == first_pp) {
40212961Sdp78419 					/*
40222961Sdp78419 					 * We have searched the complete list!
40232961Sdp78419 					 * And all of them (might only be one)
40242961Sdp78419 					 * are locked. This can happen since
40252961Sdp78419 					 * these pages can also be found via
40262961Sdp78419 					 * the hash list. When found via the
40272961Sdp78419 					 * hash list, they are locked first,
40282961Sdp78419 					 * then removed. We give up to let the
40292961Sdp78419 					 * other thread run.
40302961Sdp78419 					 */
40312961Sdp78419 					pp = NULL;
40322961Sdp78419 					break;
40332961Sdp78419 				}
40342961Sdp78419 				ASSERT(pp->p_vnode);
40352961Sdp78419 				ASSERT(PP_ISFREE(pp));
40362961Sdp78419 				ASSERT(PP_ISAGED(pp) == 0);
40372961Sdp78419 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
40382961Sdp78419 				    mnode);
40392961Sdp78419 			}
40402961Sdp78419 
40412961Sdp78419 			if (pp) {
40422961Sdp78419 				page_t	**ppp;
40432961Sdp78419 				/*
40442961Sdp78419 				 * Found and locked a page.
40452961Sdp78419 				 * Pull it off the list.
40462961Sdp78419 				 */
40472961Sdp78419 				ASSERT(mtype == PP_2_MTYPE(pp));
40482961Sdp78419 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
40492961Sdp78419 				page_sub(ppp, pp);
40502961Sdp78419 				/*
40512961Sdp78419 				 * Subtract counters before releasing pcm mutex
40522961Sdp78419 				 * to avoid a race with page_freelist_coalesce
40532961Sdp78419 				 * and page_freelist_split.
40542961Sdp78419 				 */
40552961Sdp78419 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
40562961Sdp78419 				mutex_exit(pcm);
40570Sstevel@tonic-gate 				ASSERT(pp->p_vnode);
40580Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
40592961Sdp78419 #if defined(__sparc)
40602961Sdp78419 				ASSERT(!kcage_on ||
40612961Sdp78419 				    (flags & PG_NORELOC) == 0 ||
40622961Sdp78419 				    PP_ISNORELOC(pp));
40632961Sdp78419 				if (PP_ISNORELOC(pp)) {
40642961Sdp78419 					kcage_freemem_sub(1);
40650Sstevel@tonic-gate 				}
406612293SJames.McPherson@Sun.COM #endif
40672961Sdp78419 				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
40682961Sdp78419 				return (pp);
40690Sstevel@tonic-gate 			}
40702961Sdp78419 bin_empty_0:
40710Sstevel@tonic-gate 			mutex_exit(pcm);
40722961Sdp78419 bin_empty_1:
40732961Sdp78419 			if (plw_initialized == 0) {
40742961Sdp78419 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
40752961Sdp78419 				plw_initialized = 1;
40760Sstevel@tonic-gate 			}
40772961Sdp78419 			/* calculate the next bin with equivalent color */
40782961Sdp78419 			bin = ADD_MASKED(bin, plw.plw_bin_step,
40792961Sdp78419 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
40802961Sdp78419 		} while (sbin != bin);
40812961Sdp78419 
40822961Sdp78419 		if (plw.plw_ceq_dif > 1)
40832961Sdp78419 			bin = page_list_walk_next_bin(0, bin, &plw);
40840Sstevel@tonic-gate 	}
40850Sstevel@tonic-gate 
4086414Skchow 	MTYPE_NEXT(mnode, mtype, flags);
4087414Skchow 	if (mtype >= 0)
40882961Sdp78419 		goto try_again;
4089414Skchow 
40900Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
40910Sstevel@tonic-gate 	return (NULL);
40920Sstevel@tonic-gate }
40930Sstevel@tonic-gate 
40940Sstevel@tonic-gate #ifdef DEBUG
40950Sstevel@tonic-gate #define	REPL_PAGE_STATS
40960Sstevel@tonic-gate #endif /* DEBUG */
40970Sstevel@tonic-gate 
40980Sstevel@tonic-gate #ifdef REPL_PAGE_STATS
40990Sstevel@tonic-gate struct repl_page_stats {
41000Sstevel@tonic-gate 	uint_t	ngets;
41010Sstevel@tonic-gate 	uint_t	ngets_noreloc;
41020Sstevel@tonic-gate 	uint_t	npgr_noreloc;
41030Sstevel@tonic-gate 	uint_t	nnopage_first;
41040Sstevel@tonic-gate 	uint_t	nnopage;
41050Sstevel@tonic-gate 	uint_t	nhashout;
41060Sstevel@tonic-gate 	uint_t	nnofree;
41070Sstevel@tonic-gate 	uint_t	nnext_pp;
41080Sstevel@tonic-gate } repl_page_stats;
41090Sstevel@tonic-gate #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
41100Sstevel@tonic-gate #else /* REPL_PAGE_STATS */
41110Sstevel@tonic-gate #define	REPL_STAT_INCR(v)
41120Sstevel@tonic-gate #endif /* REPL_PAGE_STATS */
41130Sstevel@tonic-gate 
41140Sstevel@tonic-gate int	pgrppgcp;
41150Sstevel@tonic-gate 
41160Sstevel@tonic-gate /*
41170Sstevel@tonic-gate  * The freemem accounting must be done by the caller.
41180Sstevel@tonic-gate  * First we try to get a replacement page of the same size as like_pp,
41190Sstevel@tonic-gate  * First we try to get a replacement page of the same size as like_pp;
41200Sstevel@tonic-gate  * PAGESIZE pages.
41210Sstevel@tonic-gate  */
41220Sstevel@tonic-gate page_t *
412350Sjjc page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
41240Sstevel@tonic-gate     uint_t pgrflags)
41250Sstevel@tonic-gate {
41260Sstevel@tonic-gate 	page_t		*like_pp;
41270Sstevel@tonic-gate 	page_t		*pp, *pplist;
41280Sstevel@tonic-gate 	page_t		*pl = NULL;
41290Sstevel@tonic-gate 	ulong_t		bin;
41300Sstevel@tonic-gate 	int		mnode, page_mnode;
41310Sstevel@tonic-gate 	int		szc;
41320Sstevel@tonic-gate 	spgcnt_t	npgs, pg_cnt;
41330Sstevel@tonic-gate 	pfn_t		pfnum;
41340Sstevel@tonic-gate 	int		mtype;
41350Sstevel@tonic-gate 	int		flags = 0;
41360Sstevel@tonic-gate 	lgrp_mnode_cookie_t	lgrp_cookie;
413750Sjjc 	lgrp_t		*lgrp;
41380Sstevel@tonic-gate 
41390Sstevel@tonic-gate 	REPL_STAT_INCR(ngets);
41400Sstevel@tonic-gate 	like_pp = orig_like_pp;
41410Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(like_pp));
41420Sstevel@tonic-gate 
41430Sstevel@tonic-gate 	szc = like_pp->p_szc;
41440Sstevel@tonic-gate 	npgs = page_get_pagecnt(szc);
41450Sstevel@tonic-gate 	/*
41460Sstevel@tonic-gate 	 * Now we reset like_pp to the base page_t.
41470Sstevel@tonic-gate 	 * That way, we won't walk past the end of this 'szc' page.
41480Sstevel@tonic-gate 	 */
41490Sstevel@tonic-gate 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
41500Sstevel@tonic-gate 	like_pp = page_numtopp_nolock(pfnum);
41510Sstevel@tonic-gate 	ASSERT(like_pp->p_szc == szc);
41520Sstevel@tonic-gate 
41530Sstevel@tonic-gate 	if (PP_ISNORELOC(like_pp)) {
41540Sstevel@tonic-gate 		ASSERT(kcage_on);
41550Sstevel@tonic-gate 		REPL_STAT_INCR(ngets_noreloc);
41560Sstevel@tonic-gate 		flags = PGI_RELOCONLY;
41570Sstevel@tonic-gate 	} else if (pgrflags & PGR_NORELOC) {
41580Sstevel@tonic-gate 		ASSERT(kcage_on);
41590Sstevel@tonic-gate 		REPL_STAT_INCR(npgr_noreloc);
41600Sstevel@tonic-gate 		flags = PG_NORELOC;
41610Sstevel@tonic-gate 	}
41620Sstevel@tonic-gate 
41630Sstevel@tonic-gate 	/*
41640Sstevel@tonic-gate 	 * Kernel pages must always be replaced with the same size
41650Sstevel@tonic-gate 	 * pages, since we cannot properly handle demotion of kernel
41660Sstevel@tonic-gate 	 * pages.
41670Sstevel@tonic-gate 	 */
41683290Sjohansen 	if (PP_ISKAS(like_pp))
41690Sstevel@tonic-gate 		pgrflags |= PGR_SAMESZC;
41700Sstevel@tonic-gate 
41710Sstevel@tonic-gate 	/* LINTED */
41721385Skchow 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
41730Sstevel@tonic-gate 
41740Sstevel@tonic-gate 	while (npgs) {
41750Sstevel@tonic-gate 		pplist = NULL;
41760Sstevel@tonic-gate 		for (;;) {
41770Sstevel@tonic-gate 			pg_cnt = page_get_pagecnt(szc);
41780Sstevel@tonic-gate 			bin = PP_2_BIN(like_pp);
41790Sstevel@tonic-gate 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
41800Sstevel@tonic-gate 			ASSERT(pg_cnt <= npgs);
41810Sstevel@tonic-gate 
41820Sstevel@tonic-gate 			/*
41830Sstevel@tonic-gate 			 * If an lgroup was specified, try to get the
41840Sstevel@tonic-gate 			 * page from that lgroup.
418550Sjjc 			 * NOTE: Must be careful with code below because
418650Sjjc 			 *	 lgroup may disappear and reappear since there
418750Sjjc 			 *	 is no locking for lgroup here.
41880Sstevel@tonic-gate 			 */
418950Sjjc 			if (LGRP_EXISTS(lgrp_target)) {
419050Sjjc 				/*
419150Sjjc 				 * Keep local variable for lgroup separate
419250Sjjc 				 * from lgroup argument since this code should
419350Sjjc 				 * only be exercised when lgroup argument
419450Sjjc 				 * exists....
419550Sjjc 				 */
419650Sjjc 				lgrp = lgrp_target;
419750Sjjc 
41980Sstevel@tonic-gate 				/* Try the lgroup's freelists first */
41990Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
42000Sstevel@tonic-gate 				    LGRP_SRCH_LOCAL);
42010Sstevel@tonic-gate 				while ((pplist == NULL) &&
42020Sstevel@tonic-gate 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
42030Sstevel@tonic-gate 				    != -1) {
42044769Sdp78419 					pplist =
420512293SJames.McPherson@Sun.COM 					    page_get_mnode_freelist(mnode, bin,
420612293SJames.McPherson@Sun.COM 					    mtype, szc, flags);
42070Sstevel@tonic-gate 				}
42080Sstevel@tonic-gate 
42090Sstevel@tonic-gate 				/*
42100Sstevel@tonic-gate 				 * Now try it's cachelists if this is a
42100Sstevel@tonic-gate 				 * Now try its cachelists if this is a
42120Sstevel@tonic-gate 				 * larger ones since page_freelist_coalesce()
42130Sstevel@tonic-gate 				 * already failed.
42140Sstevel@tonic-gate 				 */
42150Sstevel@tonic-gate 				if (pplist != NULL || szc != 0)
42160Sstevel@tonic-gate 					break;
42170Sstevel@tonic-gate 
42180Sstevel@tonic-gate 				/* Now try it's cachelists */
42180Sstevel@tonic-gate 				/* Now try its cachelists */
42200Sstevel@tonic-gate 				    LGRP_SRCH_LOCAL);
42210Sstevel@tonic-gate 
42220Sstevel@tonic-gate 				while ((pplist == NULL) &&
42230Sstevel@tonic-gate 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
42240Sstevel@tonic-gate 				    != -1) {
42254769Sdp78419 					pplist =
42264769Sdp78419 					    page_get_mnode_cachelist(bin, flags,
42274769Sdp78419 					    mnode, mtype);
42280Sstevel@tonic-gate 				}
42290Sstevel@tonic-gate 				if (pplist != NULL) {
42300Sstevel@tonic-gate 					page_hashout(pplist, NULL);
42310Sstevel@tonic-gate 					PP_SETAGED(pplist);
42320Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
42330Sstevel@tonic-gate 					break;
42340Sstevel@tonic-gate 				}
42350Sstevel@tonic-gate 				/* Done looking in this lgroup. Bail out. */
42360Sstevel@tonic-gate 				break;
42370Sstevel@tonic-gate 			}
42380Sstevel@tonic-gate 
42390Sstevel@tonic-gate 			/*
424050Sjjc 			 * No lgroup was specified (or lgroup was removed by
424150Sjjc 			 * DR), so just try to get the page as close to
424250Sjjc 			 * like_pp's mnode as possible.
42430Sstevel@tonic-gate 			 * First try the local freelist...
42440Sstevel@tonic-gate 			 */
42450Sstevel@tonic-gate 			mnode = PP_2_MEM_NODE(like_pp);
424612293SJames.McPherson@Sun.COM 			pplist = page_get_mnode_freelist(mnode, bin,
42470Sstevel@tonic-gate 			    mtype, szc, flags);
42480Sstevel@tonic-gate 			if (pplist != NULL)
42490Sstevel@tonic-gate 				break;
42500Sstevel@tonic-gate 
42510Sstevel@tonic-gate 			REPL_STAT_INCR(nnofree);
42520Sstevel@tonic-gate 
42530Sstevel@tonic-gate 			/*
42540Sstevel@tonic-gate 			 * ...then the local cachelist. Don't need to do it for
42550Sstevel@tonic-gate 			 * larger pages cause page_freelist_coalesce() already
42560Sstevel@tonic-gate 			 * failed there anyway.
42570Sstevel@tonic-gate 			 */
42580Sstevel@tonic-gate 			if (szc == 0) {
42590Sstevel@tonic-gate 				pplist = page_get_mnode_cachelist(bin, flags,
42600Sstevel@tonic-gate 				    mnode, mtype);
42610Sstevel@tonic-gate 				if (pplist != NULL) {
42620Sstevel@tonic-gate 					page_hashout(pplist, NULL);
42630Sstevel@tonic-gate 					PP_SETAGED(pplist);
42640Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
42650Sstevel@tonic-gate 					break;
42660Sstevel@tonic-gate 				}
42670Sstevel@tonic-gate 			}
42680Sstevel@tonic-gate 
42690Sstevel@tonic-gate 			/* Now try remote freelists */
42700Sstevel@tonic-gate 			page_mnode = mnode;
42710Sstevel@tonic-gate 			lgrp =
42720Sstevel@tonic-gate 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
42730Sstevel@tonic-gate 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
42740Sstevel@tonic-gate 			    LGRP_SRCH_HIER);
42750Sstevel@tonic-gate 			while (pplist == NULL &&
42760Sstevel@tonic-gate 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
42770Sstevel@tonic-gate 			    != -1) {
42780Sstevel@tonic-gate 				/*
42790Sstevel@tonic-gate 				 * Skip local mnode.
42800Sstevel@tonic-gate 				 */
42810Sstevel@tonic-gate 				if ((mnode == page_mnode) ||
42820Sstevel@tonic-gate 				    (mem_node_config[mnode].exists == 0))
42830Sstevel@tonic-gate 					continue;
42840Sstevel@tonic-gate 
428512293SJames.McPherson@Sun.COM 				pplist = page_get_mnode_freelist(mnode,
42860Sstevel@tonic-gate 				    bin, mtype, szc, flags);
42870Sstevel@tonic-gate 			}
42880Sstevel@tonic-gate 
42890Sstevel@tonic-gate 			if (pplist != NULL)
42900Sstevel@tonic-gate 				break;
42910Sstevel@tonic-gate 
429212293SJames.McPherson@Sun.COM 
42930Sstevel@tonic-gate 			/* Now try remote cachelists */
42940Sstevel@tonic-gate 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
42950Sstevel@tonic-gate 			    LGRP_SRCH_HIER);
42960Sstevel@tonic-gate 			while (pplist == NULL && szc == 0) {
42970Sstevel@tonic-gate 				mnode = lgrp_memnode_choose(&lgrp_cookie);
42980Sstevel@tonic-gate 				if (mnode == -1)
42990Sstevel@tonic-gate 					break;
43000Sstevel@tonic-gate 				/*
43010Sstevel@tonic-gate 				 * Skip local mnode.
43020Sstevel@tonic-gate 				 */
43030Sstevel@tonic-gate 				if ((mnode == page_mnode) ||
43040Sstevel@tonic-gate 				    (mem_node_config[mnode].exists == 0))
43050Sstevel@tonic-gate 					continue;
43060Sstevel@tonic-gate 
43070Sstevel@tonic-gate 				pplist = page_get_mnode_cachelist(bin,
43080Sstevel@tonic-gate 				    flags, mnode, mtype);
43090Sstevel@tonic-gate 
43100Sstevel@tonic-gate 				if (pplist != NULL) {
43110Sstevel@tonic-gate 					page_hashout(pplist, NULL);
43120Sstevel@tonic-gate 					PP_SETAGED(pplist);
43130Sstevel@tonic-gate 					REPL_STAT_INCR(nhashout);
43140Sstevel@tonic-gate 					break;
43150Sstevel@tonic-gate 				}
43160Sstevel@tonic-gate 			}
43170Sstevel@tonic-gate 
43180Sstevel@tonic-gate 			/*
43190Sstevel@tonic-gate 			 * Break out of while loop under the following cases:
43200Sstevel@tonic-gate 			 * - If we successfully got a page.
43210Sstevel@tonic-gate 			 * - If pgrflags specified only returning a specific
43220Sstevel@tonic-gate 			 *   page size and we could not find that page size.
43230Sstevel@tonic-gate 			 * - If we could not satisfy the request with PAGESIZE
43240Sstevel@tonic-gate 			 *   or larger pages.
43250Sstevel@tonic-gate 			 */
43260Sstevel@tonic-gate 			if (pplist != NULL || szc == 0)
43270Sstevel@tonic-gate 				break;
43280Sstevel@tonic-gate 
43290Sstevel@tonic-gate 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
43300Sstevel@tonic-gate 				/* try to find contig page */
43310Sstevel@tonic-gate 
43320Sstevel@tonic-gate 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
43330Sstevel@tonic-gate 				    LGRP_SRCH_HIER);
43340Sstevel@tonic-gate 
43350Sstevel@tonic-gate 				while ((pplist == NULL) &&
43360Sstevel@tonic-gate 				    (mnode =
43374769Sdp78419 				    lgrp_memnode_choose(&lgrp_cookie))
43380Sstevel@tonic-gate 				    != -1) {
43390Sstevel@tonic-gate 					pplist = page_get_contig_pages(
434012293SJames.McPherson@Sun.COM 					    mnode, bin, mtype, szc,
43414769Sdp78419 					    flags | PGI_PGCPHIPRI);
43420Sstevel@tonic-gate 				}
43430Sstevel@tonic-gate 				break;
43440Sstevel@tonic-gate 			}
43450Sstevel@tonic-gate 
43460Sstevel@tonic-gate 			/*
43470Sstevel@tonic-gate 			 * The correct thing to do here is try the next
43480Sstevel@tonic-gate 			 * page size down using szc--. Due to a bug
43490Sstevel@tonic-gate 			 * with the processing of HAT_RELOAD_SHARE
43500Sstevel@tonic-gate 			 * where the sfmmu_ttecnt arrays of all
43510Sstevel@tonic-gate 			 * hats sharing an ISM segment don't get updated,
43520Sstevel@tonic-gate 			 * using intermediate size pages for relocation
43530Sstevel@tonic-gate 			 * can lead to continuous page faults.
43540Sstevel@tonic-gate 			 */
43550Sstevel@tonic-gate 			szc = 0;
43560Sstevel@tonic-gate 		}
43570Sstevel@tonic-gate 
43580Sstevel@tonic-gate 		if (pplist != NULL) {
43590Sstevel@tonic-gate 			DTRACE_PROBE4(page__get,
43600Sstevel@tonic-gate 			    lgrp_t *, lgrp,
43610Sstevel@tonic-gate 			    int, mnode,
43620Sstevel@tonic-gate 			    ulong_t, bin,
43630Sstevel@tonic-gate 			    uint_t, flags);
43640Sstevel@tonic-gate 
43650Sstevel@tonic-gate 			while (pplist != NULL && pg_cnt--) {
43660Sstevel@tonic-gate 				ASSERT(pplist != NULL);
43670Sstevel@tonic-gate 				pp = pplist;
43680Sstevel@tonic-gate 				page_sub(&pplist, pp);
43690Sstevel@tonic-gate 				PP_CLRFREE(pp);
43700Sstevel@tonic-gate 				PP_CLRAGED(pp);
43710Sstevel@tonic-gate 				page_list_concat(&pl, &pp);
43720Sstevel@tonic-gate 				npgs--;
43730Sstevel@tonic-gate 				like_pp = like_pp + 1;
43740Sstevel@tonic-gate 				REPL_STAT_INCR(nnext_pp);
43750Sstevel@tonic-gate 			}
43760Sstevel@tonic-gate 			ASSERT(pg_cnt == 0);
43770Sstevel@tonic-gate 		} else {
43780Sstevel@tonic-gate 			break;
43790Sstevel@tonic-gate 		}
43800Sstevel@tonic-gate 	}
43810Sstevel@tonic-gate 
43820Sstevel@tonic-gate 	if (npgs) {
43830Sstevel@tonic-gate 		/*
43840Sstevel@tonic-gate 		 * We were unable to allocate the necessary number
43850Sstevel@tonic-gate 		 * of pages.
43860Sstevel@tonic-gate 		 * We need to free up any pages already collected on pl.
43870Sstevel@tonic-gate 		 */
43880Sstevel@tonic-gate 		REPL_STAT_INCR(nnopage);
43890Sstevel@tonic-gate 		page_free_replacement_page(pl);
43900Sstevel@tonic-gate 		return (NULL);
43910Sstevel@tonic-gate 	} else {
43920Sstevel@tonic-gate 		return (pl);
43930Sstevel@tonic-gate 	}
43940Sstevel@tonic-gate }
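/*
 * Illustrative usage (a sketch, not from this file): relocation paths such
 * as memory DR pair this with page_relocate(), roughly
 *
 *	repl = page_get_replacement_page(targ, NULL, PGR_SAMESZC);
 *	if (repl != NULL)
 *		(void) page_relocate(&targ, &repl, 0, 1, &nreloc, NULL);
 *
 * where a NULL return means the caller must bail out or retry later.
 */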
43950Sstevel@tonic-gate 
43960Sstevel@tonic-gate /*
43970Sstevel@tonic-gate  * demote a free large page to it's constituent pages
43970Sstevel@tonic-gate  * demote a free large page to its constituent pages
43990Sstevel@tonic-gate void
44000Sstevel@tonic-gate page_demote_free_pages(page_t *pp)
44010Sstevel@tonic-gate {
44020Sstevel@tonic-gate 
44030Sstevel@tonic-gate 	int mnode;
44040Sstevel@tonic-gate 
44050Sstevel@tonic-gate 	ASSERT(pp != NULL);
44060Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
44070Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
44080Sstevel@tonic-gate 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
44090Sstevel@tonic-gate 
44100Sstevel@tonic-gate 	mnode = PP_2_MEM_NODE(pp);
44110Sstevel@tonic-gate 	page_freelist_lock(mnode);
44120Sstevel@tonic-gate 	if (pp->p_szc != 0) {
44130Sstevel@tonic-gate 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
44147656SSherry.Moore@Sun.COM 		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
44150Sstevel@tonic-gate 	}
44160Sstevel@tonic-gate 	page_freelist_unlock(mnode);
44170Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
44180Sstevel@tonic-gate }
44193717Sdp78419 
44203717Sdp78419 /*
44213717Sdp78419  * Factor in colorequiv to check additional 'equivalent' bins.
44223717Sdp78419  * colorequiv may be set in /etc/system
44233717Sdp78419  */
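/*
 * Worked example (illustrative numbers, not from the source): with
 * colorequiv = 4, lowbit(4) - 1 = 2, so a page size with 16 hardware
 * colors keeps a = 2 and colorequivszc[] becomes 2 << 4 = 0x20, i.e.
 * colors are treated as equivalent in groups of 4.  A page size with
 * only 2 colors forces a down to 1 (since 2 >> 2 == 0), grouping its
 * colors in pairs.
 */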
44243717Sdp78419 void
44253717Sdp78419 page_set_colorequiv_arr(void)
44263717Sdp78419 {
44273717Sdp78419 	if (colorequiv > 1) {
44283717Sdp78419 		int i;
44293733Sdp78419 		uint_t sv_a = lowbit(colorequiv) - 1;
44303733Sdp78419 
44313733Sdp78419 		if (sv_a > 15)
44323733Sdp78419 			sv_a = 15;
44333717Sdp78419 
44343717Sdp78419 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
44353733Sdp78419 			uint_t colors;
44363733Sdp78419 			uint_t a = sv_a;
44373717Sdp78419 
44383717Sdp78419 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
44393717Sdp78419 				continue;
44403717Sdp78419 			}
44413717Sdp78419 			while ((colors >> a) == 0)
44423717Sdp78419 				a--;
44433717Sdp78419 			if ((a << 4) > colorequivszc[i]) {
44443717Sdp78419 				colorequivszc[i] = (a << 4);
44453717Sdp78419 			}
44463717Sdp78419 		}
44473717Sdp78419 	}
44483717Sdp78419 }
4449