/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - physical page management.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vtrace.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/tuneable.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/callb.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/condvar_impl.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/strlog.h>
#include <sys/mman.h>
#include <sys/ontrap.h>
#include <sys/lgrp.h>
#include <sys/vfs.h>

#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>

#include <fs/fs_subr.h>

static int nopageage = 0;

static pgcnt_t max_page_get;	/* max page_get request size in pages */
pgcnt_t total_pages = 0;	/* total number of pages (used by /proc) */

/*
 * vnode for all pages which are retired from the VM system,
 * such as pages with Uncorrectable Errors.
 */
struct vnode retired_ppages;

static void	page_retired_init(void);
static void	retired_dispose(vnode_t *vp, page_t *pp, int flag,
			int dn, cred_t *cr);
static void	retired_inactive(vnode_t *vp, cred_t *cr);
static void	page_retired(page_t *pp);
static void	retired_page_removed(page_t *pp);
void		page_unretire_pages(void);

/*
 * The maximum number of pages that will be unretired in one iteration.
 * This number is totally arbitrary.
 */
#define	UNRETIRE_PAGES		256

/*
 * We limit the number of pages that may be retired to
 * a percentage of the total physical memory. Note that
 * the percentage values are stored as 'basis points',
 * i.e., 100 basis points is 1%.
 */
#define	MAX_PAGES_RETIRED_BPS_DEFAULT	10	/* .1% */

uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT;

static int	pages_retired_limit_exceeded(void);
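
/*
 * For example (illustrative arithmetic only), with the default limit of
 * 10 basis points (.1%), a system with 1,000,000 physical pages could
 * retire at most
 *
 *	(1,000,000 * 10) / 10,000 = 1,000 pages
 *
 * before the limit is exceeded; the actual check is made by
 * pages_retired_limit_exceeded(), declared above.
 */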

/*
 * Operations vector for the vnode with retired pages. Only VOP_DISPOSE
 * and VOP_INACTIVE are intercepted.
 */
struct vnodeops retired_vnodeops = {
	"retired_vnodeops",
	fs_nosys,	/* open */
	fs_nosys,	/* close */
	fs_nosys,	/* read */
	fs_nosys,	/* write */
	fs_nosys,	/* ioctl */
	fs_nosys,	/* setfl */
	fs_nosys,	/* getattr */
	fs_nosys,	/* setattr */
	fs_nosys,	/* access */
	fs_nosys,	/* lookup */
	fs_nosys,	/* create */
	fs_nosys,	/* remove */
	fs_nosys,	/* link */
	fs_nosys,	/* rename */
	fs_nosys,	/* mkdir */
	fs_nosys,	/* rmdir */
	fs_nosys,	/* readdir */
	fs_nosys,	/* symlink */
	fs_nosys,	/* readlink */
	fs_nosys,	/* fsync */
	retired_inactive,
	fs_nosys,	/* fid */
	fs_rwlock,	/* rwlock */
	fs_rwunlock,	/* rwunlock */
	fs_nosys,	/* seek */
	fs_nosys,	/* cmp */
	fs_nosys,	/* frlock */
	fs_nosys,	/* space */
	fs_nosys,	/* realvp */
	fs_nosys,	/* getpage */
	fs_nosys,	/* putpage */
	fs_nosys_map,
	fs_nosys_addmap,
	fs_nosys,	/* delmap */
	fs_nosys_poll,
	fs_nosys,	/* dump */
	fs_nosys,	/* l_pathconf */
	fs_nosys,	/* pageio */
	fs_nosys,	/* dumpctl */
	retired_dispose,
	fs_nosys,	/* setsecattr */
	fs_nosys,	/* getsecattr */
	fs_nosys,	/* shrlock */
	fs_vnevent_nosupport	/* vnevent */
};

/*
 * freemem_lock protects all freemem variables:
 * availrmem. This lock also protects the globals which track the
 * availrmem changes for accurate kernel footprint calculation.
 * See below for an explanation of these globals.
 */
kmutex_t freemem_lock;
pgcnt_t availrmem;
pgcnt_t availrmem_initial;

/*
 * These globals track availrmem changes to get a more accurate
 * estimate of the kernel size. Historically pp_kernel is used for
 * kernel size and is based on availrmem. But availrmem is adjusted for
 * locked pages in the system, not just for kernel locked pages.
 * These new counters will track the pages locked through segvn and
 * by explicit user locking.
 *
 * segvn_pages_locked : Tracks, on a global basis, how many pages
 * are currently locked because of I/O.
 *
 * pages_locked : How many pages are locked because of user specified
 * locking through mlock or plock.
 *
 * pages_useclaim, pages_claimed : These two variables track the
 * claim adjustments because of the protection changes on a segvn segment.
 *
 * All these globals are protected by the same lock which protects availrmem.
 */
pgcnt_t segvn_pages_locked;
pgcnt_t pages_locked;
pgcnt_t pages_useclaim;
pgcnt_t pages_claimed;


/*
 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 */
static kmutex_t	new_freemem_lock;
static uint_t	freemem_wait;	/* someone waiting for freemem */
static kcondvar_t freemem_cv;

/*
 * The logical page free list is maintained as two lists, the 'free'
 * and the 'cache' lists.
 * The free list contains those pages that should be reused first.
 *
 * The implementation of the lists is machine dependent.
 * page_get_freelist(), page_get_cachelist(),
 * page_list_sub(), and page_list_add()
 * form the interface to the machine dependent implementation.
 *
 * Pages with p_free set are on the cache list.
 * Pages with p_free and p_age set are on the free list.
 *
 * A page may be locked while on either list.
 */
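
/*
 * For example (illustrative only), a page's list membership follows
 * from the PP_ISFREE()/PP_ISAGED() predicates on those bits:
 *
 *	if (PP_ISFREE(pp)) {
 *		if (PP_ISAGED(pp))
 *			... on the free list (no identity worth keeping)
 *		else
 *			... on the cache list (retains [vp, off] identity)
 *	}
 */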

/*
 * Free list accounting stuff.
 *
 * Spread out the value for the number of pages on the
 * page free and page cache lists.  If there is just one
 * value, then it must be under just one lock.
 * The lock contention and cache traffic are a real bother.
 *
 * When we acquire and then drop a single pcf lock
 * we can start in the middle of the array of pcf structures.
 * If we acquire more than one pcf lock at a time, we need to
 * start at the front to avoid deadlocking.
 *
 * pcf_count holds the number of pages in each pool.
 *
 * pcf_block is set when page_create_get_something() has asked the
 * PSM page freelist and page cachelist routines without specifying
 * a color and nothing came back.  This is used to block anything
 * else from moving pages from one list to the other while the
 * lists are searched again.  If a page is freed while pcf_block is
 * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
 * of clearing pcf_block, doing the wakeups, etc.
 */

#if NCPU <= 4
#define	PAD	1
#define	PCF_FANOUT	4
static	uint_t	pcf_mask = PCF_FANOUT - 1;
#else
#define	PAD	9
#ifdef sun4v
#define	PCF_FANOUT	32
#else
#define	PCF_FANOUT	128
#endif
static	uint_t	pcf_mask = PCF_FANOUT - 1;
#endif

struct pcf {
	uint_t		pcf_touch;	/* just to help the cache */
	uint_t		pcf_count;	/* page count */
	kmutex_t	pcf_lock;	/* protects the structure */
	uint_t		pcf_wait;	/* number of waiters */
	uint_t		pcf_block;	/* pcgs flag to page_free() */
	uint_t		pcf_reserve;	/* pages freed after pcf_block set */
	uint_t		pcf_fill[PAD];	/* to line up on the caches */
};

static struct	pcf	pcf[PCF_FANOUT];
#define	PCF_INDEX()	((CPU->cpu_id) & (pcf_mask))
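
/*
 * A minimal sketch (illustrative only) of the single-lock protocol
 * described above: start at this CPU's slot and wrap around, taking
 * only one pcf_lock at a time.
 *
 *	struct pcf *p = &pcf[PCF_INDEX()];
 *
 *	mutex_enter(&p->pcf_lock);
 *	if (p->pcf_count != 0) {
 *		p->pcf_count--;			(claim one page's worth)
 *		mutex_exit(&p->pcf_lock);
 *	} else {
 *		mutex_exit(&p->pcf_lock);
 *		... advance p, wrapping at &pcf[PCF_FANOUT], and retry;
 *		... a caller holding multiple pcf locks at once would
 *		... instead have to begin at &pcf[0] to avoid deadlock.
 *	}
 */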

kmutex_t	pcgs_lock;		/* serializes page_create_get_ */
kmutex_t	pcgs_cagelock;		/* serializes NOSLEEP cage allocs */
kmutex_t	pcgs_wait_lock;		/* used for delay in pcgs */
static kcondvar_t	pcgs_cv;	/* cv for delay in pcgs */

#define	PAGE_LOCK_MAXIMUM \
	((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)
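
/*
 * For example, if p_lckcnt were a 2-byte field and NBBY is 8,
 * PAGE_LOCK_MAXIMUM would work out to (1 << 16) - 1 = 65535, the
 * largest lock count the field can represent.  (Illustrative
 * arithmetic only; the actual width of p_lckcnt is whatever
 * <vm/page.h> defines.)
 */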

/*
 * Control over the verbosity of page retirement.  When set to zero, no
 * messages will be printed.  A value of one will trigger messages for
 * retirement operations, and is intended for processors which don't yet
 * support FMA (spitfire).  Two will cause verbose messages to be printed
 * when retirements complete, and is intended only for debugging purposes.
 */
int page_retire_messages = 0;

#ifdef VM_STATS

/*
 * No locks, but so what, they are only statistics.
 */

static struct page_tcnt {
	int	pc_free_cache;		/* frees into cache list */
	int	pc_free_dontneed;	/* frees with dontneed */
	int	pc_free_pageout;	/* frees from pageout */
	int	pc_free_free;		/* frees into free list */
	int	pc_free_pages;		/* frees into large page free list */
	int	pc_destroy_pages;	/* large page destroys */
	int	pc_get_cache;		/* gets from cache list */
	int	pc_get_free;		/* gets from free list */
	int	pc_reclaim;		/* reclaims */
	int	pc_abortfree;		/* aborts of free pages */
	int	pc_find_hit;		/* finds that find the page */
	int	pc_find_miss;		/* finds that don't find the page */
	int	pc_destroy_free;	/* # of free pages destroyed */
#define	PC_HASH_CNT	(4*PAGE_HASHAVELEN)
	int	pc_find_hashlen[PC_HASH_CNT+1];
	int	pc_addclaim_pages;
	int	pc_subclaim_pages;
	int	pc_free_replacement_page[2];
	int	pc_try_demote_pages[6];
	int	pc_demote_pages[2];
} pagecnt;

uint_t	hashin_count;
uint_t	hashin_not_held;
uint_t	hashin_already;

uint_t	hashout_count;
uint_t	hashout_not_held;

uint_t	page_create_count;
uint_t	page_create_not_enough;
uint_t	page_create_not_enough_again;
uint_t	page_create_zero;
uint_t	page_create_hashout;
uint_t	page_create_page_lock_failed;
uint_t	page_create_trylock_failed;
uint_t	page_create_found_one;
uint_t	page_create_hashin_failed;
uint_t	page_create_dropped_phm;

uint_t	page_create_new;
uint_t	page_create_exists;
uint_t	page_create_putbacks;
uint_t	page_create_overshoot;

uint_t	page_reclaim_zero;
uint_t	page_reclaim_zero_locked;

uint_t	page_rename_exists;
uint_t	page_rename_count;

uint_t	page_lookup_cnt[20];
uint_t	page_lookup_nowait_cnt[10];
uint_t	page_find_cnt;
uint_t	page_exists_cnt;
uint_t	page_exists_forreal_cnt;
uint_t	page_lookup_dev_cnt;
uint_t	get_cachelist_cnt;
uint_t	page_create_cnt[10];
uint_t	alloc_pages[8];
uint_t	page_exphcontg[19];
uint_t	page_create_large_cnt[10];

/*
 * Collects statistics.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	uint_t	mylen = 0; \
			\
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
	if ((pp) != NULL) \
		pagecnt.pc_find_hit++; \
	else \
		pagecnt.pc_find_miss++; \
	if (mylen > PC_HASH_CNT) \
		mylen = PC_HASH_CNT; \
	pagecnt.pc_find_hashlen[mylen]++; \
}

#else	/* VM_STATS */

/*
 * Don't collect statistics.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}

#endif	/* VM_STATS */

#ifdef DEBUG
#define	MEMSEG_SEARCH_STATS
#endif

#ifdef MEMSEG_SEARCH_STATS
struct memseg_stats {
    uint_t nsearch;
    uint_t nlastwon;
    uint_t nhashwon;
    uint_t nnotfound;
} memseg_stats;

#define	MEMSEG_STAT_INCR(v) \
	atomic_add_32(&memseg_stats.v, 1)
#else
#define	MEMSEG_STAT_INCR(x)
#endif

struct memseg *memsegs;		/* list of memory segments */

static void page_init_mem_config(void);
static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
static void page_do_hashout(page_t *);

static void page_demote_vp_pages(page_t *);

/*
 * vm subsystem related initialization
 */
void
vm_init(void)
{
	boolean_t callb_vm_cpr(void *, int);

	(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
	page_init_mem_config();

	/*
	 * initialize the vnode for retired pages
	 */
	page_retired_init();
}

/*
 * This function is called at startup and when memory is added or deleted.
 */
void
init_pages_pp_maximum()
{
	static pgcnt_t p_min;
	static pgcnt_t pages_pp_maximum_startup;
	static pgcnt_t avrmem_delta;
	static int init_done;
	static int user_set;	/* true if set in /etc/system */

	if (init_done == 0) {

		/* If the user specified a value, save it */
		if (pages_pp_maximum != 0) {
			user_set = 1;
			pages_pp_maximum_startup = pages_pp_maximum;
		}

		/*
		 * The first time through, the setting of pages_pp_maximum
		 * is based on the value of availrmem just after the
		 * start-up allocations. To preserve this relationship at
		 * run time, use a delta from availrmem_initial.
		 */
		ASSERT(availrmem_initial >= availrmem);
		avrmem_delta = availrmem_initial - availrmem;

		/* The allowable floor of pages_pp_maximum */
		p_min = tune.t_minarmem + 100;

		/* Make sure we don't come through here again. */
		init_done = 1;
	}
	/*
	 * Determine pages_pp_maximum, the number of currently available
	 * pages (availrmem) that can't be `locked'. If not set by
	 * the user, we set it to 4% of the currently available memory
	 * plus 4MB.
	 * But we also insist that it be greater than tune.t_minarmem;
	 * otherwise a process could lock down a lot of memory, get swapped
	 * out, and never have enough to get swapped back in.
	 */
	if (user_set)
		pages_pp_maximum = pages_pp_maximum_startup;
	else
		pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
		    + btop(4 * 1024 * 1024);

	if (pages_pp_maximum <= p_min) {
		pages_pp_maximum = p_min;
	}
}
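
/*
 * Worked example (illustrative only): assuming 8K pages and 1 GB of
 * available memory (131072 pages), the default comes to
 *
 *	131072 / 25 + btop(4MB) = 5242 + 512 = 5754 pages
 *
 * i.e., roughly 4% of available memory plus 4MB, as described above.
 */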

void
set_max_page_get(pgcnt_t target_total_pages)
{
	max_page_get = target_total_pages / 2;
}

static pgcnt_t pending_delete;

/*ARGSUSED*/
static void
page_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	set_max_page_get(total_pages - pending_delete);
	init_pages_pp_maximum();
}

/*ARGSUSED*/
static int
page_mem_config_pre_del(
	void *arg,
	pgcnt_t delta_pages)
{
	pgcnt_t nv;

	nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
	set_max_page_get(total_pages - nv);
	return (0);
}

/*ARGSUSED*/
static void
page_mem_config_post_del(
	void *arg,
	pgcnt_t delta_pages,
	int cancelled)
{
	pgcnt_t nv;

	nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
	set_max_page_get(total_pages - nv);
	if (!cancelled)
		init_pages_pp_maximum();
}

static kphysm_setup_vector_t page_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	page_mem_config_post_add,
	page_mem_config_pre_del,
	page_mem_config_post_del,
};

static void
page_init_mem_config(void)
{
	int ret;

	ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
	ASSERT(ret == 0);
}

/*
 * Evenly spread out the PCF counters for large free pages.
 */
static void
page_free_large_ctr(pgcnt_t npages)
{
	static struct pcf	*p = pcf;
	pgcnt_t			lump;

	freemem += npages;

	lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT;

	while (npages > 0) {

		ASSERT(!p->pcf_block);

		if (lump < npages) {
			p->pcf_count += (uint_t)lump;
			npages -= lump;
		} else {
			p->pcf_count += (uint_t)npages;
			npages = 0;
		}

		ASSERT(!p->pcf_wait);

		if (++p > &pcf[PCF_FANOUT - 1])
			p = pcf;
	}

	ASSERT(npages == 0);
}
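
/*
 * For example (illustrative only), with PCF_FANOUT == 32 and
 * npages == 100:
 *
 *	lump = roundup(100, 32) / 32 = 128 / 32 = 4
 *
 * so 25 consecutive pcf entries each absorb 4 pages.  Since `p' is
 * static and wraps around, successive calls continue where the last
 * one stopped, keeping the counters roughly even over time.
 */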

/*
 * Add a physical chunk of memory to the system free lists during startup.
 * Platform specific startup() allocates the memory for the page structs.
 *
 * num	- number of page structures
 * base - page number (pfn) to be associated with the first page.
 *
 * Since we are doing this during startup (i.e. single threaded), we will
 * use shortcut routines to avoid any locking overhead while putting all
 * these pages on the freelists.
 *
 * NOTE: Any changes performed to page_free() must also be performed to
 *	 add_physmem() since this is how we initialize all page_t's at
 *	 boot time.
 */
void
add_physmem(
	page_t	*pp,
	pgcnt_t	num,
	pfn_t	pnum)
{
	page_t	*root = NULL;
	uint_t	szc = page_num_pagesizes() - 1;
	pgcnt_t	large = page_get_pagecnt(szc);
	pgcnt_t	cnt = 0;

	TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
		"add_physmem:pp %p num %lu", pp, num);

	/*
	 * Arbitrarily limit the max page_get request
	 * to 1/2 of the page structs we have.
	 */
	total_pages += num;
	set_max_page_get(total_pages);

	/*
	 * The physical space for the pages array
	 * representing ram pages has already been
	 * allocated.  Here we initialize each lock
	 * in the page structure, and put each on
	 * the free list
	 */
	for (; num; pp = page_next_raw(pp), pnum++, num--) {

		/*
		 * this needs to fill in the page number
		 * and do any other arch specific initialization
		 */
		add_physmem_cb(pp, pnum);

		/*
		 * Initialize the page lock as unlocked, since nobody
		 * can see or access this page yet.
		 */
		pp->p_selock = 0;

		/*
		 * Initialize IO lock
		 */
		page_iolock_init(pp);

		/*
		 * initialize other fields in the page_t
		 */
		PP_SETFREE(pp);
		page_clr_all_props(pp);
		PP_SETAGED(pp);
		pp->p_offset = (u_offset_t)-1;
		pp->p_next = pp;
		pp->p_prev = pp;

		/*
		 * Simple case: System doesn't support large pages.
		 */
		if (szc == 0) {
			pp->p_szc = 0;
			page_free_at_startup(pp);
			continue;
		}

		/*
		 * Handle unaligned pages: we collect them up onto
		 * the root page until we have a full large page.
		 */
		if (!IS_P2ALIGNED(pnum, large)) {

			/*
			 * If not in a large page,
			 * just free as a small page.
			 */
			if (root == NULL) {
				pp->p_szc = 0;
				page_free_at_startup(pp);
				continue;
			}

			/*
			 * Link a constituent page into the large page.
			 */
			pp->p_szc = szc;
			page_list_concat(&root, &pp);

			/*
			 * When the large page is fully formed, free it.
			 */
			if (++cnt == large) {
				page_free_large_ctr(cnt);
				page_list_add_pages(root, PG_LIST_ISINIT);
				root = NULL;
				cnt = 0;
			}
			continue;
		}

		/*
		 * At this point we have a page number which
		 * is aligned. We assert that we aren't already
		 * in a different large page.
		 */
		ASSERT(IS_P2ALIGNED(pnum, large));
		ASSERT(root == NULL && cnt == 0);

		/*
		 * If an insufficient number of pages is left to form
		 * a large page, just free the small page.
		 */
		if (num < large) {
			pp->p_szc = 0;
			page_free_at_startup(pp);
			continue;
		}

		/*
		 * Otherwise start a new large page.
		 */
		pp->p_szc = szc;
		cnt++;
		root = pp;
	}
	ASSERT(root == NULL && cnt == 0);
}
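
/*
 * For example (illustrative only), suppose the largest page size spans
 * large == 256 constituent pages and add_physmem() is handed a chunk
 * starting at pfn 250.  Pfns 250-255 are unaligned with no root, so
 * each is freed as a small page; pfn 256 is aligned and becomes the
 * root of a new large page; pfns 257-511 are concatenated onto it; and
 * once all 256 constituents are collected, the whole large page is
 * freed with page_list_add_pages().
 */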

/*
 * Find a page representing the specified [vp, offset].
 * If we find the page but it is in transit coming in,
 * it will have an "exclusive" lock and we wait for
 * the i/o to complete.  A page found on the free list
 * is always reclaimed and then locked.  On success, the page
 * is locked, its data is valid and it isn't on the free
 * list, while a NULL is returned if the page doesn't exist.
 */
page_t *
page_lookup(vnode_t *vp, u_offset_t off, se_t se)
{
	return (page_lookup_create(vp, off, se, NULL, NULL, 0));
}

/*
 * Find a page representing the specified [vp, offset].
 * We either return the one we found or, if passed in,
 * create one with the identity of [vp, offset] of the
 * pre-allocated page. If we find an existing page but it is
 * in transit coming in, it will have an "exclusive" lock
 * and we wait for the i/o to complete.  A page found on
 * the free list is always reclaimed and then locked.
 * On success, the page is locked, its data is valid and
 * it isn't on the free list, while a NULL is returned
 * if the page doesn't exist and newpp is NULL.
 */
page_t *
page_lookup_create(
	vnode_t *vp,
	u_offset_t off,
	se_t se,
	page_t *newpp,
	spgcnt_t *nrelocp,
	int flags)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;
	uint_t		hash_locked;
	uint_t		es;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	VM_STAT_ADD(page_lookup_cnt[0]);
	ASSERT(newpp ? PAGE_EXCL(newpp) : 1);

	/*
	 * Acquire the appropriate page hash lock since
	 * we have to search the hash list.  Pages that
	 * hash to this list can't change identity while
	 * this lock is held.
	 */
	hash_locked = 0;
	index = PAGE_HASH_FUNC(vp, off);
	phm = NULL;
top:
	PAGE_HASH_SEARCH(index, pp, vp, off);
	if (pp != NULL) {
		VM_STAT_ADD(page_lookup_cnt[1]);
		es = (newpp != NULL) ? 1 : 0;
		es |= flags;
		if (!hash_locked) {
			VM_STAT_ADD(page_lookup_cnt[2]);
			if (!page_try_reclaim_lock(pp, se, es)) {
				/*
				 * On a miss, acquire the phm.  Then
				 * next time, page_lock() will be called,
				 * causing a wait if the page is busy.
				 * Just looping with page_trylock() would
				 * get pretty boring.
				 */
				VM_STAT_ADD(page_lookup_cnt[3]);
				phm = PAGE_HASH_MUTEX(index);
				mutex_enter(phm);
				hash_locked = 1;
				goto top;
			}
		} else {
			VM_STAT_ADD(page_lookup_cnt[4]);
			if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
				VM_STAT_ADD(page_lookup_cnt[5]);
				goto top;
			}
		}

		/*
		 * Since `pp' is locked it cannot change identity now.
		 * Reconfirm we locked the correct page.
		 *
		 * Both the p_vnode and p_offset *must* be cast volatile
		 * to force a reload of their values: The PAGE_HASH_SEARCH
		 * macro will have stuffed p_vnode and p_offset into
		 * registers before calling page_trylock(); another thread,
		 * actually holding the hash lock, could have changed the
		 * page's identity in memory, but our registers would not
		 * be changed, fooling the reconfirmation.  If the hash
		 * lock was held during the search, the casting would
		 * not be needed.
		 */
		VM_STAT_ADD(page_lookup_cnt[6]);
		if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
		    ((volatile u_offset_t)(pp->p_offset) != off)) {
			VM_STAT_ADD(page_lookup_cnt[7]);
			if (hash_locked) {
				panic("page_lookup_create: lost page %p",
				    (void *)pp);
				/*NOTREACHED*/
			}
			page_unlock(pp);
			phm = PAGE_HASH_MUTEX(index);
			mutex_enter(phm);
			hash_locked = 1;
			goto top;
		}

		/*
		 * If page_trylock() was called, then pp may still be on
		 * the cachelist (can't be on the free list, it would not
		 * have been found in the search).  If it is on the
		 * cachelist it must be pulled now. To pull the page from
		 * the cachelist, it must be exclusively locked.
		 *
		 * The other big difference between page_trylock() and
		 * page_lock() is that page_lock() will pull the
		 * page from whatever free list (the cache list in this
		 * case) the page is on.  If page_trylock() was used
		 * above, then we have to do the reclaim ourselves.
		 */
		if ((!hash_locked) && (PP_ISFREE(pp))) {
			ASSERT(PP_ISAGED(pp) == 0);
			VM_STAT_ADD(page_lookup_cnt[8]);

			/*
			 * page_reclaim() will ensure that we
			 * have this page exclusively
			 */

			if (!page_reclaim(pp, NULL)) {
				/*
				 * page_reclaim() dropped whatever lock
				 * we held.
				 */
				VM_STAT_ADD(page_lookup_cnt[9]);
				phm = PAGE_HASH_MUTEX(index);
				mutex_enter(phm);
				hash_locked = 1;
				goto top;
			} else if (se == SE_SHARED && newpp == NULL) {
				VM_STAT_ADD(page_lookup_cnt[10]);
				page_downgrade(pp);
			}
		}

		if (hash_locked) {
			mutex_exit(phm);
		}

		if (newpp != NULL && pp->p_szc < newpp->p_szc &&
		    PAGE_EXCL(pp) && nrelocp != NULL) {
			ASSERT(nrelocp != NULL);
			(void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
			    NULL);
			if (*nrelocp > 0) {
				VM_STAT_COND_ADD(*nrelocp == 1,
				    page_lookup_cnt[11]);
				VM_STAT_COND_ADD(*nrelocp > 1,
				    page_lookup_cnt[12]);
				pp = newpp;
				se = SE_EXCL;
			} else {
				if (se == SE_SHARED) {
					page_downgrade(pp);
				}
				VM_STAT_ADD(page_lookup_cnt[13]);
			}
		} else if (newpp != NULL && nrelocp != NULL) {
			if (PAGE_EXCL(pp) && se == SE_SHARED) {
				page_downgrade(pp);
			}
			VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
			    page_lookup_cnt[14]);
			VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
			    page_lookup_cnt[15]);
			VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
			    page_lookup_cnt[16]);
		} else if (newpp != NULL && PAGE_EXCL(pp)) {
			se = SE_EXCL;
		}
	} else if (!hash_locked) {
		VM_STAT_ADD(page_lookup_cnt[17]);
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		hash_locked = 1;
		goto top;
	} else if (newpp != NULL) {
		/*
		 * If we have a preallocated page then
		 * insert it now and basically behave like
		 * page_create.
		 */
		VM_STAT_ADD(page_lookup_cnt[18]);
		/*
		 * Since we hold the page hash mutex and
		 * just searched for this page, page_hashin
		 * had better not fail.  If it does, that
		 * means some thread did not follow the
		 * page hash mutex rules.  Panic now and
		 * get it over with.  As usual, go down
		 * holding all the locks.
		 */
		ASSERT(MUTEX_HELD(phm));
		if (!page_hashin(newpp, vp, off, phm)) {
			ASSERT(MUTEX_HELD(phm));
			panic("page_lookup_create: hashin failed %p %p %llx %p",
			    (void *)newpp, (void *)vp, off, (void *)phm);
			/*NOTREACHED*/
		}
		ASSERT(MUTEX_HELD(phm));
		mutex_exit(phm);
		phm = NULL;
		page_set_props(newpp, P_REF);
		page_io_lock(newpp);
		pp = newpp;
		se = SE_EXCL;
	} else {
		VM_STAT_ADD(page_lookup_cnt[19]);
		mutex_exit(phm);
	}

	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);

	ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);

	return (pp);
}

/*
 * Search the hash list for the page representing the
 * specified [vp, offset] and return it locked.  Skip
 * free pages and pages that cannot be locked as requested.
 * Used while attempting to kluster pages.
 */
page_t *
page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;
	uint_t		locked;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	VM_STAT_ADD(page_lookup_nowait_cnt[0]);

	index = PAGE_HASH_FUNC(vp, off);
	PAGE_HASH_SEARCH(index, pp, vp, off);
	locked = 0;
	if (pp == NULL) {
top:
		VM_STAT_ADD(page_lookup_nowait_cnt[1]);
		locked = 1;
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
	}

	if (pp == NULL || PP_ISFREE(pp)) {
		VM_STAT_ADD(page_lookup_nowait_cnt[2]);
		pp = NULL;
	} else {
		if (!page_trylock(pp, se)) {
			VM_STAT_ADD(page_lookup_nowait_cnt[3]);
			pp = NULL;
		} else {
			VM_STAT_ADD(page_lookup_nowait_cnt[4]);
			/*
			 * See the comment in page_lookup()
			 */
			if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
			    ((u_offset_t)(pp->p_offset) != off)) {
				VM_STAT_ADD(page_lookup_nowait_cnt[5]);
				if (locked) {
					panic("page_lookup_nowait %p",
					    (void *)pp);
					/*NOTREACHED*/
				}
				page_unlock(pp);
				goto top;
			}
			if (PP_ISFREE(pp)) {
				VM_STAT_ADD(page_lookup_nowait_cnt[6]);
				page_unlock(pp);
				pp = NULL;
			}
		}
	}
	if (locked) {
		VM_STAT_ADD(page_lookup_nowait_cnt[7]);
		mutex_exit(phm);
	}

	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);

	return (pp);
}

/*
 * Search the hash list for a page with the specified [vp, off]
 * that is known to exist and is already locked.  This routine
 * is typically used by segment SOFTUNLOCK routines.
 */
page_t *
page_find(vnode_t *vp, u_offset_t off)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	VM_STAT_ADD(page_find_cnt);

	index = PAGE_HASH_FUNC(vp, off);
	phm = PAGE_HASH_MUTEX(index);

	mutex_enter(phm);
	PAGE_HASH_SEARCH(index, pp, vp, off);
	mutex_exit(phm);

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp) || panicstr);
	return (pp);
}

/*
 * Determine whether a page with the specified [vp, off]
 * currently exists in the system.  Obviously this should
 * only be considered as a hint since nothing prevents the
 * page from disappearing or appearing immediately after
 * the return from this routine. Consequently, we don't
 * even bother to lock the list.
 */
page_t *
page_exists(vnode_t *vp, u_offset_t off)
{
	page_t	*pp;
	ulong_t	index;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	VM_STAT_ADD(page_exists_cnt);

	index = PAGE_HASH_FUNC(vp, off);
	PAGE_HASH_SEARCH(index, pp, vp, off);

	return (pp);
}

/*
 * Determine if physically contiguous pages exist for the
 * [vp, off] - [vp, off + page_size(szc)) range.  If they exist and
 * ppa is not NULL, fill the ppa array with these pages locked SHARED.
 * If necessary, reclaim pages from the freelist. Return 1 if contiguous
 * pages exist and 0 otherwise.
 *
 * If we fail to lock the pages, still return 1 if the pages exist and
 * are contiguous.  But in this case the return value is just a hint and
 * the ppa array won't be filled.  The caller should initialize ppa[0]
 * as NULL to distinguish the two cases by the return value.
 *
 * Returns 0 if the pages don't exist or are not physically contiguous.
 *
 * This routine doesn't work for anonymous (swapfs) pages.
 */
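
/*
 * A caller-side sketch (illustrative only) of the hint protocol
 * described above:
 *
 *	page_t *ppa[LARGE_PAGES];	(>= page_get_pagecnt(szc) entries)
 *
 *	ppa[0] = NULL;
 *	if (page_exists_physcontig(vp, off, szc, ppa)) {
 *		if (ppa[0] != NULL)
 *			... pages are locked SE_SHARED in ppa[]
 *		else
 *			... pages exist and are contiguous, hint only
 *	}
 *
 * (LARGE_PAGES is a hypothetical caller-defined bound, not a kernel
 * symbol.)
 */
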
1113*0Sstevel@tonic-gate int
1114*0Sstevel@tonic-gate page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1115*0Sstevel@tonic-gate {
1116*0Sstevel@tonic-gate 	pgcnt_t pages;
1117*0Sstevel@tonic-gate 	pfn_t pfn;
1118*0Sstevel@tonic-gate 	page_t *rootpp;
1119*0Sstevel@tonic-gate 	pgcnt_t i;
1120*0Sstevel@tonic-gate 	pgcnt_t j;
1121*0Sstevel@tonic-gate 	u_offset_t save_off = off;
1122*0Sstevel@tonic-gate 	ulong_t index;
1123*0Sstevel@tonic-gate 	kmutex_t *phm;
1124*0Sstevel@tonic-gate 	page_t *pp;
1125*0Sstevel@tonic-gate 	uint_t pszc;
1126*0Sstevel@tonic-gate 	int loopcnt = 0;
1127*0Sstevel@tonic-gate 
1128*0Sstevel@tonic-gate 	ASSERT(szc != 0);
1129*0Sstevel@tonic-gate 	ASSERT(vp != NULL);
1130*0Sstevel@tonic-gate 	ASSERT(!IS_SWAPFSVP(vp));
1131*0Sstevel@tonic-gate 	ASSERT(vp != &kvp);
1132*0Sstevel@tonic-gate 
1133*0Sstevel@tonic-gate again:
1134*0Sstevel@tonic-gate 	if (++loopcnt > 3) {
1135*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[0]);
1136*0Sstevel@tonic-gate 		return (0);
1137*0Sstevel@tonic-gate 	}
1138*0Sstevel@tonic-gate 
1139*0Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
1140*0Sstevel@tonic-gate 	phm = PAGE_HASH_MUTEX(index);
1141*0Sstevel@tonic-gate 
1142*0Sstevel@tonic-gate 	mutex_enter(phm);
1143*0Sstevel@tonic-gate 	PAGE_HASH_SEARCH(index, pp, vp, off);
1144*0Sstevel@tonic-gate 	mutex_exit(phm);
1145*0Sstevel@tonic-gate 
1146*0Sstevel@tonic-gate 	VM_STAT_ADD(page_exphcontg[1]);
1147*0Sstevel@tonic-gate 
1148*0Sstevel@tonic-gate 	if (pp == NULL) {
1149*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[2]);
1150*0Sstevel@tonic-gate 		return (0);
1151*0Sstevel@tonic-gate 	}
1152*0Sstevel@tonic-gate 
1153*0Sstevel@tonic-gate 	pages = page_get_pagecnt(szc);
1154*0Sstevel@tonic-gate 	rootpp = pp;
1155*0Sstevel@tonic-gate 	pfn = rootpp->p_pagenum;
1156*0Sstevel@tonic-gate 
1157*0Sstevel@tonic-gate 	if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1158*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[3]);
1159*0Sstevel@tonic-gate 		if (!page_trylock(pp, SE_SHARED)) {
1160*0Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[4]);
1161*0Sstevel@tonic-gate 			return (1);
1162*0Sstevel@tonic-gate 		}
1163*0Sstevel@tonic-gate 		if (pp->p_szc != pszc || pp->p_vnode != vp ||
1164*0Sstevel@tonic-gate 		    pp->p_offset != off) {
1165*0Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[5]);
1166*0Sstevel@tonic-gate 			page_unlock(pp);
1167*0Sstevel@tonic-gate 			off = save_off;
1168*0Sstevel@tonic-gate 			goto again;
1169*0Sstevel@tonic-gate 		}
1170*0Sstevel@tonic-gate 		/*
1171*0Sstevel@tonic-gate 		 * Since szc was non-zero and the vnode and offset matched
1172*0Sstevel@tonic-gate 		 * after we locked the page, it can't become free on us.
1173*0Sstevel@tonic-gate 		 */
1174*0Sstevel@tonic-gate 		ASSERT(!PP_ISFREE(pp));
1175*0Sstevel@tonic-gate 		if (!IS_P2ALIGNED(pfn, pages)) {
1176*0Sstevel@tonic-gate 			page_unlock(pp);
1177*0Sstevel@tonic-gate 			return (0);
1178*0Sstevel@tonic-gate 		}
1179*0Sstevel@tonic-gate 		ppa[0] = pp;
1180*0Sstevel@tonic-gate 		pp++;
1181*0Sstevel@tonic-gate 		off += PAGESIZE;
1182*0Sstevel@tonic-gate 		pfn++;
1183*0Sstevel@tonic-gate 		for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1184*0Sstevel@tonic-gate 			if (!page_trylock(pp, SE_SHARED)) {
1185*0Sstevel@tonic-gate 				VM_STAT_ADD(page_exphcontg[6]);
1186*0Sstevel@tonic-gate 				pp--;
1187*0Sstevel@tonic-gate 				while (i-- > 0) {
1188*0Sstevel@tonic-gate 					page_unlock(pp);
1189*0Sstevel@tonic-gate 					pp--;
1190*0Sstevel@tonic-gate 				}
1191*0Sstevel@tonic-gate 				ppa[0] = NULL;
1192*0Sstevel@tonic-gate 				return (1);
1193*0Sstevel@tonic-gate 			}
1194*0Sstevel@tonic-gate 			if (pp->p_szc != pszc) {
1195*0Sstevel@tonic-gate 				VM_STAT_ADD(page_exphcontg[7]);
1196*0Sstevel@tonic-gate 				page_unlock(pp);
1197*0Sstevel@tonic-gate 				pp--;
1198*0Sstevel@tonic-gate 				while (i-- > 0) {
1199*0Sstevel@tonic-gate 					page_unlock(pp);
1200*0Sstevel@tonic-gate 					pp--;
1201*0Sstevel@tonic-gate 				}
1202*0Sstevel@tonic-gate 				ppa[0] = NULL;
1203*0Sstevel@tonic-gate 				off = save_off;
1204*0Sstevel@tonic-gate 				goto again;
1205*0Sstevel@tonic-gate 			}
1206*0Sstevel@tonic-gate 			/*
1207*0Sstevel@tonic-gate 			 * szc is the same as for the previously locked pages
1208*0Sstevel@tonic-gate 			 * with the right identity. Since this page had the
1209*0Sstevel@tonic-gate 			 * correct szc after we locked it, it can't get freed
1210*0Sstevel@tonic-gate 			 * or destroyed and therefore must have the expected identity.
1211*0Sstevel@tonic-gate 			 */
1212*0Sstevel@tonic-gate 			ASSERT(!PP_ISFREE(pp));
1213*0Sstevel@tonic-gate 			if (pp->p_vnode != vp ||
1214*0Sstevel@tonic-gate 			    pp->p_offset != off) {
1215*0Sstevel@tonic-gate 				panic("page_exists_physcontig: "
1216*0Sstevel@tonic-gate 				    "large page identity doesn't match");
1217*0Sstevel@tonic-gate 			}
1218*0Sstevel@tonic-gate 			ppa[i] = pp;
1219*0Sstevel@tonic-gate 			ASSERT(pp->p_pagenum == pfn);
1220*0Sstevel@tonic-gate 		}
1221*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[8]);
1222*0Sstevel@tonic-gate 		ppa[pages] = NULL;
1223*0Sstevel@tonic-gate 		return (1);
1224*0Sstevel@tonic-gate 	} else if (pszc >= szc) {
1225*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[9]);
1226*0Sstevel@tonic-gate 		if (!IS_P2ALIGNED(pfn, pages)) {
1227*0Sstevel@tonic-gate 			return (0);
1228*0Sstevel@tonic-gate 		}
1229*0Sstevel@tonic-gate 		return (1);
1230*0Sstevel@tonic-gate 	}
1231*0Sstevel@tonic-gate 
1232*0Sstevel@tonic-gate 	if (!IS_P2ALIGNED(pfn, pages)) {
1233*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[10]);
1234*0Sstevel@tonic-gate 		return (0);
1235*0Sstevel@tonic-gate 	}
1236*0Sstevel@tonic-gate 
1237*0Sstevel@tonic-gate 	if (page_numtomemseg_nolock(pfn) !=
1238*0Sstevel@tonic-gate 	    page_numtomemseg_nolock(pfn + pages - 1)) {
1239*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[11]);
1240*0Sstevel@tonic-gate 		return (0);
1241*0Sstevel@tonic-gate 	}
1242*0Sstevel@tonic-gate 
1243*0Sstevel@tonic-gate 	/*
1244*0Sstevel@tonic-gate 	 * We make up to 4 passes across the pages to promote the page size.
1245*0Sstevel@tonic-gate 	 * We're extra cautious to promote the page size atomically with
1246*0Sstevel@tonic-gate 	 * respect to everybody else.  But we could probably optimize this
1247*0Sstevel@tonic-gate 	 * into a single pass if it becomes an issue.
1248*0Sstevel@tonic-gate 	 */
1249*0Sstevel@tonic-gate 
1250*0Sstevel@tonic-gate 	for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1251*0Sstevel@tonic-gate 		ASSERT(pp->p_pagenum == pfn);
1252*0Sstevel@tonic-gate 		if (!page_trylock(pp, SE_EXCL)) {
1253*0Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[12]);
1254*0Sstevel@tonic-gate 			break;
1255*0Sstevel@tonic-gate 		}
1256*0Sstevel@tonic-gate 		if (pp->p_vnode != vp ||
1257*0Sstevel@tonic-gate 		    pp->p_offset != off) {
1258*0Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[13]);
1259*0Sstevel@tonic-gate 			page_unlock(pp);
1260*0Sstevel@tonic-gate 			break;
1261*0Sstevel@tonic-gate 		}
1262*0Sstevel@tonic-gate 		if (pp->p_szc >= szc) {
1263*0Sstevel@tonic-gate 			ASSERT(i == 0);
1264*0Sstevel@tonic-gate 			page_unlock(pp);
1265*0Sstevel@tonic-gate 			off = save_off;
1266*0Sstevel@tonic-gate 			goto again;
1267*0Sstevel@tonic-gate 		}
1268*0Sstevel@tonic-gate 	}
1269*0Sstevel@tonic-gate 
1270*0Sstevel@tonic-gate 	if (i != pages) {
1271*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[14]);
1272*0Sstevel@tonic-gate 		--pp;
1273*0Sstevel@tonic-gate 		while (i-- > 0) {
1274*0Sstevel@tonic-gate 			page_unlock(pp);
1275*0Sstevel@tonic-gate 			--pp;
1276*0Sstevel@tonic-gate 		}
1277*0Sstevel@tonic-gate 		return (0);
1278*0Sstevel@tonic-gate 	}
1279*0Sstevel@tonic-gate 
1280*0Sstevel@tonic-gate 	pp = rootpp;
1281*0Sstevel@tonic-gate 	for (i = 0; i < pages; i++, pp++) {
1282*0Sstevel@tonic-gate 		if (PP_ISFREE(pp)) {
1283*0Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[15]);
1284*0Sstevel@tonic-gate 			ASSERT(!PP_ISAGED(pp));
1285*0Sstevel@tonic-gate 			ASSERT(pp->p_szc == 0);
1286*0Sstevel@tonic-gate 			if (!page_reclaim(pp, NULL)) {
1287*0Sstevel@tonic-gate 				break;
1288*0Sstevel@tonic-gate 			}
1289*0Sstevel@tonic-gate 		} else {
1290*0Sstevel@tonic-gate 			ASSERT(pp->p_szc < szc);
1291*0Sstevel@tonic-gate 			VM_STAT_ADD(page_exphcontg[16]);
1292*0Sstevel@tonic-gate 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1293*0Sstevel@tonic-gate 		}
1294*0Sstevel@tonic-gate 	}
1295*0Sstevel@tonic-gate 	if (i < pages) {
1296*0Sstevel@tonic-gate 		VM_STAT_ADD(page_exphcontg[17]);
1297*0Sstevel@tonic-gate 		/*
1298*0Sstevel@tonic-gate 		 * page_reclaim() failed because we were out of memory.
1299*0Sstevel@tonic-gate 		 * Drop the rest of the locks and return; this page must
1300*0Sstevel@tonic-gate 		 * already have been reallocated anyway.
1301*0Sstevel@tonic-gate 		 */
1302*0Sstevel@tonic-gate 		pp = rootpp;
1303*0Sstevel@tonic-gate 		for (j = 0; j < pages; j++, pp++) {
1304*0Sstevel@tonic-gate 			if (j != i) {
1305*0Sstevel@tonic-gate 				page_unlock(pp);
1306*0Sstevel@tonic-gate 			}
1307*0Sstevel@tonic-gate 		}
1308*0Sstevel@tonic-gate 		return (0);
1309*0Sstevel@tonic-gate 	}
1310*0Sstevel@tonic-gate 
1311*0Sstevel@tonic-gate 	off = save_off;
1312*0Sstevel@tonic-gate 	pp = rootpp;
1313*0Sstevel@tonic-gate 	for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1314*0Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(pp));
1315*0Sstevel@tonic-gate 		ASSERT(!PP_ISFREE(pp));
1316*0Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(pp));
1317*0Sstevel@tonic-gate 		ASSERT(pp->p_vnode == vp);
1318*0Sstevel@tonic-gate 		ASSERT(pp->p_offset == off);
1319*0Sstevel@tonic-gate 		pp->p_szc = szc;
1320*0Sstevel@tonic-gate 	}
1321*0Sstevel@tonic-gate 	pp = rootpp;
1322*0Sstevel@tonic-gate 	for (i = 0; i < pages; i++, pp++) {
1323*0Sstevel@tonic-gate 		if (ppa == NULL) {
1324*0Sstevel@tonic-gate 			page_unlock(pp);
1325*0Sstevel@tonic-gate 		} else {
1326*0Sstevel@tonic-gate 			ppa[i] = pp;
1327*0Sstevel@tonic-gate 			page_downgrade(ppa[i]);
1328*0Sstevel@tonic-gate 		}
1329*0Sstevel@tonic-gate 	}
1330*0Sstevel@tonic-gate 	if (ppa != NULL) {
1331*0Sstevel@tonic-gate 		ppa[pages] = NULL;
1332*0Sstevel@tonic-gate 	}
1333*0Sstevel@tonic-gate 	VM_STAT_ADD(page_exphcontg[18]);
1334*0Sstevel@tonic-gate 	ASSERT(vp->v_pages != NULL);
1335*0Sstevel@tonic-gate 	return (1);
1336*0Sstevel@tonic-gate }
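
/*
 * A minimal illustrative sketch of a hypothetical caller using the
 * ppa[0] convention described above (MAX_PGCNT is a stand-in for
 * page_get_pagecnt(szc)):
 *
 *	page_t *ppa[MAX_PGCNT + 1];
 *
 *	ppa[0] = NULL;
 *	if (page_exists_physcontig(vp, off, szc, ppa)) {
 *		if (ppa[0] != NULL) {
 *			contiguous pages exist and are held SHARED in ppa[];
 *		} else {
 *			pages existed but could not be locked; hint only;
 *		}
 *	}
 */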
1337*0Sstevel@tonic-gate 
1338*0Sstevel@tonic-gate /*
1339*0Sstevel@tonic-gate  * Determine whether a page with the specified [vp, off]
1340*0Sstevel@tonic-gate  * currently exists in the system and, if so, return its
1341*0Sstevel@tonic-gate  * size code.  Obviously this should only be considered
1342*0Sstevel@tonic-gate  * a hint since nothing prevents the page from disappearing
1343*0Sstevel@tonic-gate  * or appearing immediately after the return from this routine.
1344*0Sstevel@tonic-gate  */
1345*0Sstevel@tonic-gate int
1346*0Sstevel@tonic-gate page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1347*0Sstevel@tonic-gate {
1348*0Sstevel@tonic-gate 	page_t		*pp;
1349*0Sstevel@tonic-gate 	kmutex_t	*phm;
1350*0Sstevel@tonic-gate 	ulong_t		index;
1351*0Sstevel@tonic-gate 	int		rc = 0;
1352*0Sstevel@tonic-gate 
1353*0Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1354*0Sstevel@tonic-gate 	ASSERT(szc != NULL);
1355*0Sstevel@tonic-gate 	VM_STAT_ADD(page_exists_forreal_cnt);
1356*0Sstevel@tonic-gate 
1357*0Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
1358*0Sstevel@tonic-gate 	phm = PAGE_HASH_MUTEX(index);
1359*0Sstevel@tonic-gate 
1360*0Sstevel@tonic-gate 	mutex_enter(phm);
1361*0Sstevel@tonic-gate 	PAGE_HASH_SEARCH(index, pp, vp, off);
1362*0Sstevel@tonic-gate 	if (pp != NULL) {
1363*0Sstevel@tonic-gate 		*szc = pp->p_szc;
1364*0Sstevel@tonic-gate 		rc = 1;
1365*0Sstevel@tonic-gate 	}
1366*0Sstevel@tonic-gate 	mutex_exit(phm);
1367*0Sstevel@tonic-gate 	return (rc);
1368*0Sstevel@tonic-gate }
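
/*
 * A minimal illustrative sketch (hypothetical caller) of using the
 * returned size code purely as a hint:
 *
 *	uint_t szc;
 *
 *	if (page_exists_forreal(vp, off, &szc)) {
 *		a page existed at [vp, off] with size code szc at
 *		lookup time; it may already be gone;
 *	}
 */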
1369*0Sstevel@tonic-gate 
1370*0Sstevel@tonic-gate /* Wake up threads waiting for pages in page_create_get_something() */
1371*0Sstevel@tonic-gate void
1372*0Sstevel@tonic-gate wakeup_pcgs(void)
1373*0Sstevel@tonic-gate {
1374*0Sstevel@tonic-gate 	if (!CV_HAS_WAITERS(&pcgs_cv))
1375*0Sstevel@tonic-gate 		return;
1376*0Sstevel@tonic-gate 	cv_broadcast(&pcgs_cv);
1377*0Sstevel@tonic-gate }
1378*0Sstevel@tonic-gate 
1379*0Sstevel@tonic-gate /*
1380*0Sstevel@tonic-gate  * 'freemem' is used all over the kernel as an indication of how many
1381*0Sstevel@tonic-gate  * pages are free (either on the cache list or on the free page list)
1382*0Sstevel@tonic-gate  * in the system.  In very few places is a really accurate 'freemem'
1383*0Sstevel@tonic-gate  * needed.  To avoid contention on the lock protecting a single
1384*0Sstevel@tonic-gate  * freemem counter, it was spread out into NCPU buckets.  set_freemem()
1385*0Sstevel@tonic-gate  * sets freemem to the total of all NCPU buckets.  It is called from
1386*0Sstevel@tonic-gate  * clock() on each tick.
1387*0Sstevel@tonic-gate  */
1388*0Sstevel@tonic-gate void
1389*0Sstevel@tonic-gate set_freemem()
1390*0Sstevel@tonic-gate {
1391*0Sstevel@tonic-gate 	struct pcf	*p;
1392*0Sstevel@tonic-gate 	ulong_t		t;
1393*0Sstevel@tonic-gate 	uint_t		i;
1394*0Sstevel@tonic-gate 
1395*0Sstevel@tonic-gate 	t = 0;
1396*0Sstevel@tonic-gate 	p = pcf;
1397*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
1398*0Sstevel@tonic-gate 		t += p->pcf_count;
1399*0Sstevel@tonic-gate 		p++;
1400*0Sstevel@tonic-gate 	}
1401*0Sstevel@tonic-gate 	freemem = t;
1402*0Sstevel@tonic-gate 
1403*0Sstevel@tonic-gate 	/*
1404*0Sstevel@tonic-gate 	 * Don't worry about grabbing the mutex.  It's not that
1405*0Sstevel@tonic-gate 	 * critical if we miss a tick or two.  This is
1406*0Sstevel@tonic-gate 	 * where we wake up possible delayers in
1407*0Sstevel@tonic-gate 	 * page_create_get_something().
1408*0Sstevel@tonic-gate 	 */
1409*0Sstevel@tonic-gate 	wakeup_pcgs();
1410*0Sstevel@tonic-gate }
1411*0Sstevel@tonic-gate 
1412*0Sstevel@tonic-gate ulong_t
1413*0Sstevel@tonic-gate get_freemem()
1414*0Sstevel@tonic-gate {
1415*0Sstevel@tonic-gate 	struct pcf	*p;
1416*0Sstevel@tonic-gate 	ulong_t		t;
1417*0Sstevel@tonic-gate 	uint_t		i;
1418*0Sstevel@tonic-gate 
1419*0Sstevel@tonic-gate 	t = 0;
1420*0Sstevel@tonic-gate 	p = pcf;
1421*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
1422*0Sstevel@tonic-gate 		t += p->pcf_count;
1423*0Sstevel@tonic-gate 		p++;
1424*0Sstevel@tonic-gate 	}
1425*0Sstevel@tonic-gate 	/*
1426*0Sstevel@tonic-gate 	 * We just calculated it, might as well set it.
1427*0Sstevel@tonic-gate 	 */
1428*0Sstevel@tonic-gate 	freemem = t;
1429*0Sstevel@tonic-gate 	return (t);
1430*0Sstevel@tonic-gate }
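
/*
 * A minimal illustrative sketch of the per-bucket pattern used
 * throughout this file: pick a bucket with PCF_INDEX(), then adjust
 * its count under its own lock instead of serializing on one global
 * counter:
 *
 *	struct pcf *p = &pcf[PCF_INDEX()];
 *
 *	mutex_enter(&p->pcf_lock);
 *	if (p->pcf_count != 0)
 *		p->pcf_count--;		take one page from this bucket;
 *	mutex_exit(&p->pcf_lock);
 */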
1431*0Sstevel@tonic-gate 
1432*0Sstevel@tonic-gate /*
1433*0Sstevel@tonic-gate  * Acquire all of the page cache & free (pcf) locks.
1434*0Sstevel@tonic-gate  */
1435*0Sstevel@tonic-gate void
1436*0Sstevel@tonic-gate pcf_acquire_all()
1437*0Sstevel@tonic-gate {
1438*0Sstevel@tonic-gate 	struct pcf	*p;
1439*0Sstevel@tonic-gate 	uint_t		i;
1440*0Sstevel@tonic-gate 
1441*0Sstevel@tonic-gate 	p = pcf;
1442*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
1443*0Sstevel@tonic-gate 		p->pcf_touch = 1;
1444*0Sstevel@tonic-gate 		mutex_enter(&p->pcf_lock);
1445*0Sstevel@tonic-gate 		p++;
1446*0Sstevel@tonic-gate 	}
1447*0Sstevel@tonic-gate }
1448*0Sstevel@tonic-gate 
1449*0Sstevel@tonic-gate /*
1450*0Sstevel@tonic-gate  * Release all the pcf_locks.
1451*0Sstevel@tonic-gate  */
1452*0Sstevel@tonic-gate void
1453*0Sstevel@tonic-gate pcf_release_all()
1454*0Sstevel@tonic-gate {
1455*0Sstevel@tonic-gate 	struct pcf	*p;
1456*0Sstevel@tonic-gate 	uint_t		i;
1457*0Sstevel@tonic-gate 
1458*0Sstevel@tonic-gate 	p = pcf;
1459*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
1460*0Sstevel@tonic-gate 		mutex_exit(&p->pcf_lock);
1461*0Sstevel@tonic-gate 		p++;
1462*0Sstevel@tonic-gate 	}
1463*0Sstevel@tonic-gate }
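
/*
 * A minimal illustrative sketch: the two routines above are used as a
 * bracket around code that must see a stable set of pcf buckets:
 *
 *	pcf_acquire_all();
 *	examine or adjust every pcf bucket;
 *	pcf_release_all();
 */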
1464*0Sstevel@tonic-gate 
1465*0Sstevel@tonic-gate /*
1466*0Sstevel@tonic-gate  * Inform the VM system that we need some pages freed up.
1467*0Sstevel@tonic-gate  * Calls must be symmetric, e.g.:
1468*0Sstevel@tonic-gate  *
1469*0Sstevel@tonic-gate  *	page_needfree(100);
1470*0Sstevel@tonic-gate  *	wait a bit;
1471*0Sstevel@tonic-gate  *	page_needfree(-100);
1472*0Sstevel@tonic-gate  */
1473*0Sstevel@tonic-gate void
1474*0Sstevel@tonic-gate page_needfree(spgcnt_t npages)
1475*0Sstevel@tonic-gate {
1476*0Sstevel@tonic-gate 	mutex_enter(&new_freemem_lock);
1477*0Sstevel@tonic-gate 	needfree += npages;
1478*0Sstevel@tonic-gate 	mutex_exit(&new_freemem_lock);
1479*0Sstevel@tonic-gate }
1480*0Sstevel@tonic-gate 
1481*0Sstevel@tonic-gate /*
1482*0Sstevel@tonic-gate  * Throttle for page_create(): try to prevent freemem from dropping
1483*0Sstevel@tonic-gate  * below throttlefree.  We can't provide a 100% guarantee because
1484*0Sstevel@tonic-gate  * KM_NOSLEEP allocations, page_reclaim(), and various other things
1485*0Sstevel@tonic-gate  * nibble away at the freelist.  However, we can block all PG_WAIT
1486*0Sstevel@tonic-gate  * allocations until memory becomes available.  The motivation is
1487*0Sstevel@tonic-gate  * that several things can fall apart when there's no free memory:
1488*0Sstevel@tonic-gate  *
1489*0Sstevel@tonic-gate  * (1) If pageout() needs memory to push a page, the system deadlocks.
1490*0Sstevel@tonic-gate  *
1491*0Sstevel@tonic-gate  * (2) By (broken) specification, timeout(9F) can neither fail nor
1492*0Sstevel@tonic-gate  *     block, so it has no choice but to panic the system if it
1493*0Sstevel@tonic-gate  *     cannot allocate a callout structure.
1494*0Sstevel@tonic-gate  *
1495*0Sstevel@tonic-gate  * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1496*0Sstevel@tonic-gate  *     it panics if it cannot allocate a callback structure.
1497*0Sstevel@tonic-gate  *
1498*0Sstevel@tonic-gate  * (4) Untold numbers of third-party drivers have not yet been hardened
1499*0Sstevel@tonic-gate  *     against KM_NOSLEEP and/or allocb() failures; they simply assume
1500*0Sstevel@tonic-gate  *     success and panic the system with a data fault on failure.
1501*0Sstevel@tonic-gate  *     (The long-term solution to this particular problem is to ship
1502*0Sstevel@tonic-gate  *     hostile fault-injecting DEBUG kernels with the DDK.)
1503*0Sstevel@tonic-gate  *
1504*0Sstevel@tonic-gate  * It is theoretically impossible to guarantee success of non-blocking
1505*0Sstevel@tonic-gate  * allocations, but in practice, this throttle is very hard to break.
1506*0Sstevel@tonic-gate  */
1507*0Sstevel@tonic-gate static int
1508*0Sstevel@tonic-gate page_create_throttle(pgcnt_t npages, int flags)
1509*0Sstevel@tonic-gate {
1510*0Sstevel@tonic-gate 	ulong_t	fm;
1511*0Sstevel@tonic-gate 	uint_t	i;
1512*0Sstevel@tonic-gate 	pgcnt_t tf;	/* effective value of throttlefree */
1513*0Sstevel@tonic-gate 
1514*0Sstevel@tonic-gate 	/*
1515*0Sstevel@tonic-gate 	 * Never deny pages when:
1516*0Sstevel@tonic-gate 	 * - it's a thread that cannot block [NOMEMWAIT()]
1517*0Sstevel@tonic-gate 	 * - the allocation cannot block and must not fail
1518*0Sstevel@tonic-gate 	 * - the allocation cannot block and has a pageout dispensation (PG_PUSHPAGE)
1519*0Sstevel@tonic-gate 	 */
1520*0Sstevel@tonic-gate 	if (NOMEMWAIT() ||
1521*0Sstevel@tonic-gate 	    ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1522*0Sstevel@tonic-gate 	    ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1523*0Sstevel@tonic-gate 		return (1);
1524*0Sstevel@tonic-gate 
1525*0Sstevel@tonic-gate 	/*
1526*0Sstevel@tonic-gate 	 * If the allocation can't block, we look favorably upon it
1527*0Sstevel@tonic-gate 	 * unless we're below pageout_reserve.  In that case we fail
1528*0Sstevel@tonic-gate 	 * the allocation because we want to make sure there are a few
1529*0Sstevel@tonic-gate 	 * pages available for pageout.
1530*0Sstevel@tonic-gate 	 */
1531*0Sstevel@tonic-gate 	if ((flags & PG_WAIT) == 0)
1532*0Sstevel@tonic-gate 		return (freemem >= npages + pageout_reserve);
1533*0Sstevel@tonic-gate 
1534*0Sstevel@tonic-gate 	/* Calculate the effective throttlefree value */
1535*0Sstevel@tonic-gate 	tf = throttlefree -
1536*0Sstevel@tonic-gate 	    ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1537*0Sstevel@tonic-gate 
1538*0Sstevel@tonic-gate 	cv_signal(&proc_pageout->p_cv);
1539*0Sstevel@tonic-gate 
1540*0Sstevel@tonic-gate 	while (freemem < npages + tf) {
1541*0Sstevel@tonic-gate 		pcf_acquire_all();
1542*0Sstevel@tonic-gate 		mutex_enter(&new_freemem_lock);
1543*0Sstevel@tonic-gate 		fm = 0;
1544*0Sstevel@tonic-gate 		for (i = 0; i < PCF_FANOUT; i++) {
1545*0Sstevel@tonic-gate 			fm += pcf[i].pcf_count;
1546*0Sstevel@tonic-gate 			pcf[i].pcf_wait++;
1547*0Sstevel@tonic-gate 			mutex_exit(&pcf[i].pcf_lock);
1548*0Sstevel@tonic-gate 		}
1549*0Sstevel@tonic-gate 		freemem = fm;
1550*0Sstevel@tonic-gate 		needfree += npages;
1551*0Sstevel@tonic-gate 		freemem_wait++;
1552*0Sstevel@tonic-gate 		cv_wait(&freemem_cv, &new_freemem_lock);
1553*0Sstevel@tonic-gate 		freemem_wait--;
1554*0Sstevel@tonic-gate 		needfree -= npages;
1555*0Sstevel@tonic-gate 		mutex_exit(&new_freemem_lock);
1556*0Sstevel@tonic-gate 	}
1557*0Sstevel@tonic-gate 	return (1);
1558*0Sstevel@tonic-gate }
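
/*
 * A minimal illustrative sketch of how the throttle above classifies
 * a request.  A non-blocking allocation (no PG_WAIT) succeeds only
 * while it would leave pageout_reserve pages free:
 *
 *	page_create_throttle(npages, 0)
 *		== (freemem >= npages + pageout_reserve)
 *
 * whereas a PG_WAIT request sleeps in the loop above until
 * freemem >= npages + throttlefree (less pageout_reserve when
 * PG_PUSHPAGE is set) and then returns 1.
 */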
1559*0Sstevel@tonic-gate 
1560*0Sstevel@tonic-gate /*
1561*0Sstevel@tonic-gate  * page_create_wait() is called to either coalesce pages from the
1562*0Sstevel@tonic-gate  * different pcf buckets or to wait because there simply are not
1563*0Sstevel@tonic-gate  * enough pages to satisfy the caller's request.
1564*0Sstevel@tonic-gate  *
1565*0Sstevel@tonic-gate  * Sadly, this is called from platform/vm/vm_machdep.c.
1566*0Sstevel@tonic-gate  */
1567*0Sstevel@tonic-gate int
1568*0Sstevel@tonic-gate page_create_wait(size_t npages, uint_t flags)
1569*0Sstevel@tonic-gate {
1570*0Sstevel@tonic-gate 	pgcnt_t		total;
1571*0Sstevel@tonic-gate 	uint_t		i;
1572*0Sstevel@tonic-gate 	struct pcf	*p;
1573*0Sstevel@tonic-gate 
1574*0Sstevel@tonic-gate 	/*
1575*0Sstevel@tonic-gate 	 * Wait until there are enough free pages to satisfy our
1576*0Sstevel@tonic-gate 	 * entire request.
1577*0Sstevel@tonic-gate 	 * We set needfree += npages before prodding pageout, to make sure
1578*0Sstevel@tonic-gate 	 * it does real work when npages > lotsfree > freemem.
1579*0Sstevel@tonic-gate 	 */
1580*0Sstevel@tonic-gate 	VM_STAT_ADD(page_create_not_enough);
1581*0Sstevel@tonic-gate 
1582*0Sstevel@tonic-gate 	ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1583*0Sstevel@tonic-gate checkagain:
1584*0Sstevel@tonic-gate 	if ((flags & PG_NORELOC) &&
1585*0Sstevel@tonic-gate 	    kcage_freemem < kcage_throttlefree + npages)
1586*0Sstevel@tonic-gate 		(void) kcage_create_throttle(npages, flags);
1587*0Sstevel@tonic-gate 
1588*0Sstevel@tonic-gate 	if (freemem < npages + throttlefree)
1589*0Sstevel@tonic-gate 		if (!page_create_throttle(npages, flags))
1590*0Sstevel@tonic-gate 			return (0);
1591*0Sstevel@tonic-gate 
1592*0Sstevel@tonic-gate 	/*
1593*0Sstevel@tonic-gate 	 * Since page_create_va() looked at every
1594*0Sstevel@tonic-gate 	 * bucket, assume we are going to have to wait.
1595*0Sstevel@tonic-gate 	 * Get all of the pcf locks.
1596*0Sstevel@tonic-gate 	 */
1597*0Sstevel@tonic-gate 	total = 0;
1598*0Sstevel@tonic-gate 	p = pcf;
1599*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
1600*0Sstevel@tonic-gate 		p->pcf_touch = 1;
1601*0Sstevel@tonic-gate 		mutex_enter(&p->pcf_lock);
1602*0Sstevel@tonic-gate 		total += p->pcf_count;
1603*0Sstevel@tonic-gate 		if (total >= npages) {
1604*0Sstevel@tonic-gate 			/*
1605*0Sstevel@tonic-gate 			 * Wow!  There are enough pages lying around
1606*0Sstevel@tonic-gate 			 * to satisfy the request.  Do the accounting,
1607*0Sstevel@tonic-gate 			 * drop the locks we acquired, and go back.
1608*0Sstevel@tonic-gate 			 *
1609*0Sstevel@tonic-gate 			 * freemem is not protected by any lock. So,
1610*0Sstevel@tonic-gate 			 * we cannot have any assertion containing
1611*0Sstevel@tonic-gate 			 * freemem.
1612*0Sstevel@tonic-gate 			 */
1613*0Sstevel@tonic-gate 			freemem -= npages;
1614*0Sstevel@tonic-gate 
1615*0Sstevel@tonic-gate 			while (p >= pcf) {
1616*0Sstevel@tonic-gate 				if (p->pcf_count <= npages) {
1617*0Sstevel@tonic-gate 					npages -= p->pcf_count;
1618*0Sstevel@tonic-gate 					p->pcf_count = 0;
1619*0Sstevel@tonic-gate 				} else {
1620*0Sstevel@tonic-gate 					p->pcf_count -= (uint_t)npages;
1621*0Sstevel@tonic-gate 					npages = 0;
1622*0Sstevel@tonic-gate 				}
1623*0Sstevel@tonic-gate 				mutex_exit(&p->pcf_lock);
1624*0Sstevel@tonic-gate 				p--;
1625*0Sstevel@tonic-gate 			}
1626*0Sstevel@tonic-gate 			ASSERT(npages == 0);
1627*0Sstevel@tonic-gate 			return (1);
1628*0Sstevel@tonic-gate 		}
1629*0Sstevel@tonic-gate 		p++;
1630*0Sstevel@tonic-gate 	}
1631*0Sstevel@tonic-gate 
1632*0Sstevel@tonic-gate 	/*
1633*0Sstevel@tonic-gate 	 * All of the pcf locks are held and there are not enough pages
1634*0Sstevel@tonic-gate 	 * to satisfy the request (total < npages).
1635*0Sstevel@tonic-gate 	 * Be sure to acquire the new_freemem_lock before dropping
1636*0Sstevel@tonic-gate 	 * the pcf locks.  This prevents dropping wakeups in page_free().
1637*0Sstevel@tonic-gate 	 * The order is always pcf_lock then new_freemem_lock.
1638*0Sstevel@tonic-gate 	 *
1639*0Sstevel@tonic-gate 	 * Since we hold all the pcf locks, it is a good time to set freemem.
1640*0Sstevel@tonic-gate 	 *
1641*0Sstevel@tonic-gate 	 * If the caller does not want to wait, return now.
1642*0Sstevel@tonic-gate 	 * Else turn the pageout daemon loose to find something
1643*0Sstevel@tonic-gate 	 * and wait till it does.
1644*0Sstevel@tonic-gate 	 */
1646*0Sstevel@tonic-gate 	freemem = total;
1647*0Sstevel@tonic-gate 
1648*0Sstevel@tonic-gate 	if ((flags & PG_WAIT) == 0) {
1649*0Sstevel@tonic-gate 		pcf_release_all();
1650*0Sstevel@tonic-gate 
1651*0Sstevel@tonic-gate 		TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1652*0Sstevel@tonic-gate 		    "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1653*0Sstevel@tonic-gate 		return (0);
1654*0Sstevel@tonic-gate 	}
1655*0Sstevel@tonic-gate 
1656*0Sstevel@tonic-gate 	ASSERT(proc_pageout != NULL);
1657*0Sstevel@tonic-gate 	cv_signal(&proc_pageout->p_cv);
1658*0Sstevel@tonic-gate 
1659*0Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1660*0Sstevel@tonic-gate 	    "page_create_sleep_start: freemem %ld needfree %ld",
1661*0Sstevel@tonic-gate 	    freemem, needfree);
1662*0Sstevel@tonic-gate 
1663*0Sstevel@tonic-gate 	/*
1664*0Sstevel@tonic-gate 	 * We are going to wait.
1665*0Sstevel@tonic-gate 	 * We currently hold all of the pcf_locks; get the
1666*0Sstevel@tonic-gate 	 * new_freemem_lock (it protects freemem_wait) before
1667*0Sstevel@tonic-gate 	 * dropping the pcf_locks.
1668*0Sstevel@tonic-gate 	 */
1669*0Sstevel@tonic-gate 	mutex_enter(&new_freemem_lock);
1670*0Sstevel@tonic-gate 
1671*0Sstevel@tonic-gate 	p = pcf;
1672*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
1673*0Sstevel@tonic-gate 		p->pcf_wait++;
1674*0Sstevel@tonic-gate 		mutex_exit(&p->pcf_lock);
1675*0Sstevel@tonic-gate 		p++;
1676*0Sstevel@tonic-gate 	}
1677*0Sstevel@tonic-gate 
1678*0Sstevel@tonic-gate 	needfree += npages;
1679*0Sstevel@tonic-gate 	freemem_wait++;
1680*0Sstevel@tonic-gate 
1681*0Sstevel@tonic-gate 	cv_wait(&freemem_cv, &new_freemem_lock);
1682*0Sstevel@tonic-gate 
1683*0Sstevel@tonic-gate 	freemem_wait--;
1684*0Sstevel@tonic-gate 	needfree -= npages;
1685*0Sstevel@tonic-gate 
1686*0Sstevel@tonic-gate 	mutex_exit(&new_freemem_lock);
1687*0Sstevel@tonic-gate 
1688*0Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1689*0Sstevel@tonic-gate 	    "page_create_sleep_end: freemem %ld needfree %ld",
1690*0Sstevel@tonic-gate 	    freemem, needfree);
1691*0Sstevel@tonic-gate 
1692*0Sstevel@tonic-gate 	VM_STAT_ADD(page_create_not_enough_again);
1693*0Sstevel@tonic-gate 	goto checkagain;
1694*0Sstevel@tonic-gate }
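
/*
 * A minimal illustrative sketch of the reservation protocol paired
 * with page_create_putback() below, as used by page_alloc_pages():
 *
 *	(void) page_create_wait(npgs, PG_WAIT);	reserve npgs pages;
 *	pull pages off the freelists;
 *	if (some pages could not be obtained)
 *		page_create_putback(left);	return the unused part;
 */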
1695*0Sstevel@tonic-gate 
1696*0Sstevel@tonic-gate /*
1697*0Sstevel@tonic-gate  * A routine to do the opposite of page_create_wait().
1698*0Sstevel@tonic-gate  */
1699*0Sstevel@tonic-gate void
1700*0Sstevel@tonic-gate page_create_putback(spgcnt_t npages)
1701*0Sstevel@tonic-gate {
1702*0Sstevel@tonic-gate 	struct pcf	*p;
1703*0Sstevel@tonic-gate 	pgcnt_t		lump;
1704*0Sstevel@tonic-gate 	uint_t		*which;
1705*0Sstevel@tonic-gate 
1706*0Sstevel@tonic-gate 	/*
1707*0Sstevel@tonic-gate 	 * When a contiguous lump is broken up, we have to
1708*0Sstevel@tonic-gate 	 * deal with lots of pages (min 64) so let's spread
1709*0Sstevel@tonic-gate 	 * the wealth around.
1710*0Sstevel@tonic-gate 	 */
1711*0Sstevel@tonic-gate 	lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT;
1712*0Sstevel@tonic-gate 	freemem += npages;
1713*0Sstevel@tonic-gate 
1714*0Sstevel@tonic-gate 	for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) {
1715*0Sstevel@tonic-gate 		which = &p->pcf_count;
1716*0Sstevel@tonic-gate 
1717*0Sstevel@tonic-gate 		mutex_enter(&p->pcf_lock);
1718*0Sstevel@tonic-gate 
1719*0Sstevel@tonic-gate 		if (p->pcf_block) {
1720*0Sstevel@tonic-gate 			which = &p->pcf_reserve;
1721*0Sstevel@tonic-gate 		}
1722*0Sstevel@tonic-gate 
1723*0Sstevel@tonic-gate 		if (lump < npages) {
1724*0Sstevel@tonic-gate 			*which += (uint_t)lump;
1725*0Sstevel@tonic-gate 			npages -= lump;
1726*0Sstevel@tonic-gate 		} else {
1727*0Sstevel@tonic-gate 			*which += (uint_t)npages;
1728*0Sstevel@tonic-gate 			npages = 0;
1729*0Sstevel@tonic-gate 		}
1730*0Sstevel@tonic-gate 
1731*0Sstevel@tonic-gate 		if (p->pcf_wait) {
1732*0Sstevel@tonic-gate 			mutex_enter(&new_freemem_lock);
1733*0Sstevel@tonic-gate 			/*
1734*0Sstevel@tonic-gate 			 * Check to see if some other thread
1735*0Sstevel@tonic-gate 			 * is actually waiting.  Another bucket
1736*0Sstevel@tonic-gate 			 * may have woken it up by now.  If there
1737*0Sstevel@tonic-gate 			 * are no waiters, then set our pcf_wait
1738*0Sstevel@tonic-gate 			 * count to zero to avoid coming in here
1739*0Sstevel@tonic-gate 			 * next time.
1740*0Sstevel@tonic-gate 			 */
1741*0Sstevel@tonic-gate 			if (freemem_wait) {
1742*0Sstevel@tonic-gate 				if (npages > 1) {
1743*0Sstevel@tonic-gate 					cv_broadcast(&freemem_cv);
1744*0Sstevel@tonic-gate 				} else {
1745*0Sstevel@tonic-gate 					cv_signal(&freemem_cv);
1746*0Sstevel@tonic-gate 				}
1747*0Sstevel@tonic-gate 				p->pcf_wait--;
1748*0Sstevel@tonic-gate 			} else {
1749*0Sstevel@tonic-gate 				p->pcf_wait = 0;
1750*0Sstevel@tonic-gate 			}
1751*0Sstevel@tonic-gate 			mutex_exit(&new_freemem_lock);
1752*0Sstevel@tonic-gate 		}
1753*0Sstevel@tonic-gate 		mutex_exit(&p->pcf_lock);
1754*0Sstevel@tonic-gate 	}
1755*0Sstevel@tonic-gate 	ASSERT(npages == 0);
1756*0Sstevel@tonic-gate }
1757*0Sstevel@tonic-gate 
1758*0Sstevel@tonic-gate /*
1759*0Sstevel@tonic-gate  * A helper routine for page_create_get_something.
1760*0Sstevel@tonic-gate  * The indenting got too deep down there.
1761*0Sstevel@tonic-gate  * Unblock the pcf counters.  Any pages freed after
1762*0Sstevel@tonic-gate  * pcf_block got set are moved to pcf_count and
1763*0Sstevel@tonic-gate  * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1764*0Sstevel@tonic-gate  */
1765*0Sstevel@tonic-gate static void
1766*0Sstevel@tonic-gate pcgs_unblock(void)
1767*0Sstevel@tonic-gate {
1768*0Sstevel@tonic-gate 	int		i;
1769*0Sstevel@tonic-gate 	struct pcf	*p;
1770*0Sstevel@tonic-gate 
1771*0Sstevel@tonic-gate 	/* Update freemem while we're here. */
1772*0Sstevel@tonic-gate 	freemem = 0;
1773*0Sstevel@tonic-gate 	p = pcf;
1774*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
1775*0Sstevel@tonic-gate 		mutex_enter(&p->pcf_lock);
1776*0Sstevel@tonic-gate 		ASSERT(p->pcf_count == 0);
1777*0Sstevel@tonic-gate 		p->pcf_count = p->pcf_reserve;
1778*0Sstevel@tonic-gate 		p->pcf_block = 0;
1779*0Sstevel@tonic-gate 		freemem += p->pcf_count;
1780*0Sstevel@tonic-gate 		if (p->pcf_wait) {
1781*0Sstevel@tonic-gate 			mutex_enter(&new_freemem_lock);
1782*0Sstevel@tonic-gate 			if (freemem_wait) {
1783*0Sstevel@tonic-gate 				if (p->pcf_reserve > 1) {
1784*0Sstevel@tonic-gate 					cv_broadcast(&freemem_cv);
1785*0Sstevel@tonic-gate 					p->pcf_wait = 0;
1786*0Sstevel@tonic-gate 				} else {
1787*0Sstevel@tonic-gate 					cv_signal(&freemem_cv);
1788*0Sstevel@tonic-gate 					p->pcf_wait--;
1789*0Sstevel@tonic-gate 				}
1790*0Sstevel@tonic-gate 			} else {
1791*0Sstevel@tonic-gate 				p->pcf_wait = 0;
1792*0Sstevel@tonic-gate 			}
1793*0Sstevel@tonic-gate 			mutex_exit(&new_freemem_lock);
1794*0Sstevel@tonic-gate 		}
1795*0Sstevel@tonic-gate 		p->pcf_reserve = 0;
1796*0Sstevel@tonic-gate 		mutex_exit(&p->pcf_lock);
1797*0Sstevel@tonic-gate 		p++;
1798*0Sstevel@tonic-gate 	}
1799*0Sstevel@tonic-gate }
1800*0Sstevel@tonic-gate 
1801*0Sstevel@tonic-gate /*
1802*0Sstevel@tonic-gate  * Called from page_create_va() when both the cache and free lists
1803*0Sstevel@tonic-gate  * have been checked once.
1804*0Sstevel@tonic-gate  *
1805*0Sstevel@tonic-gate  * Either returns a page or panics since the accounting was done
1806*0Sstevel@tonic-gate  * way before we got here.
1807*0Sstevel@tonic-gate  *
1808*0Sstevel@tonic-gate  * We don't come here often, so leave the accounting on permanently.
1809*0Sstevel@tonic-gate  */
1810*0Sstevel@tonic-gate 
1811*0Sstevel@tonic-gate #define	MAX_PCGS	100
1812*0Sstevel@tonic-gate 
1813*0Sstevel@tonic-gate #ifdef	DEBUG
1814*0Sstevel@tonic-gate #define	PCGS_TRIES	100
1815*0Sstevel@tonic-gate #else	/* DEBUG */
1816*0Sstevel@tonic-gate #define	PCGS_TRIES	10
1817*0Sstevel@tonic-gate #endif	/* DEBUG */
1818*0Sstevel@tonic-gate 
1819*0Sstevel@tonic-gate #ifdef	VM_STATS
1820*0Sstevel@tonic-gate uint_t	pcgs_counts[PCGS_TRIES];
1821*0Sstevel@tonic-gate uint_t	pcgs_too_many;
1822*0Sstevel@tonic-gate uint_t	pcgs_entered;
1823*0Sstevel@tonic-gate uint_t	pcgs_entered_noreloc;
1824*0Sstevel@tonic-gate uint_t	pcgs_locked;
1825*0Sstevel@tonic-gate uint_t	pcgs_cagelocked;
1826*0Sstevel@tonic-gate #endif	/* VM_STATS */
1827*0Sstevel@tonic-gate 
1828*0Sstevel@tonic-gate static page_t *
1829*0Sstevel@tonic-gate page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1830*0Sstevel@tonic-gate     caddr_t vaddr, uint_t flags)
1831*0Sstevel@tonic-gate {
1832*0Sstevel@tonic-gate 	uint_t		count;
1833*0Sstevel@tonic-gate 	page_t		*pp;
1834*0Sstevel@tonic-gate 	uint_t		locked, i;
1835*0Sstevel@tonic-gate 	struct	pcf	*p;
1836*0Sstevel@tonic-gate 	lgrp_t		*lgrp;
1837*0Sstevel@tonic-gate 	int		cagelocked = 0;
1838*0Sstevel@tonic-gate 
1839*0Sstevel@tonic-gate 	VM_STAT_ADD(pcgs_entered);
1840*0Sstevel@tonic-gate 
1841*0Sstevel@tonic-gate 	/*
1842*0Sstevel@tonic-gate 	 * Tap any reserve freelists: if we fail now, we'll die
1843*0Sstevel@tonic-gate 	 * since the page(s) we're looking for have already been
1844*0Sstevel@tonic-gate 	 * accounted for.
1845*0Sstevel@tonic-gate 	 */
1846*0Sstevel@tonic-gate 	flags |= PG_PANIC;
1847*0Sstevel@tonic-gate 
1848*0Sstevel@tonic-gate 	if ((flags & PG_NORELOC) != 0) {
1849*0Sstevel@tonic-gate 		VM_STAT_ADD(pcgs_entered_noreloc);
1850*0Sstevel@tonic-gate 		/*
1851*0Sstevel@tonic-gate 		 * Requests for free pages from critical threads
1852*0Sstevel@tonic-gate 		 * such as pageout still won't throttle here, but
1853*0Sstevel@tonic-gate 		 * we must try again, to give the cageout thread
1854*0Sstevel@tonic-gate 		 * another chance to catch up. Since we already
1855*0Sstevel@tonic-gate 		 * accounted for the pages, we had better get them
1856*0Sstevel@tonic-gate 		 * this time.
1857*0Sstevel@tonic-gate 		 *
1858*0Sstevel@tonic-gate 		 * N.B. All non-critical threads acquire the pcgs_cagelock
1859*0Sstevel@tonic-gate 		 * to serialize access to the freelists. This implements a
1860*0Sstevel@tonic-gate 		 * turnstile-type synchronization to avoid starvation of
1861*0Sstevel@tonic-gate 		 * critical requests for PG_NORELOC memory by non-critical
1862*0Sstevel@tonic-gate 		 * threads: all non-critical threads must acquire a 'ticket'
1863*0Sstevel@tonic-gate 		 * before passing through, which entails making sure
1864*0Sstevel@tonic-gate 		 * kcage_freemem won't fall below minfree prior to grabbing
1865*0Sstevel@tonic-gate 		 * pages from the freelists.
1866*0Sstevel@tonic-gate 		 */
1867*0Sstevel@tonic-gate 		if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1868*0Sstevel@tonic-gate 			mutex_enter(&pcgs_cagelock);
1869*0Sstevel@tonic-gate 			cagelocked = 1;
1870*0Sstevel@tonic-gate 			VM_STAT_ADD(pcgs_cagelocked);
1871*0Sstevel@tonic-gate 		}
1872*0Sstevel@tonic-gate 	}
1873*0Sstevel@tonic-gate 
1874*0Sstevel@tonic-gate 	/*
1875*0Sstevel@tonic-gate 	 * Time to get serious.
1876*0Sstevel@tonic-gate 	 * We failed to get a `correctly colored' page from both the
1877*0Sstevel@tonic-gate 	 * free and cache lists.
1878*0Sstevel@tonic-gate 	 * We escalate in stages.
1879*0Sstevel@tonic-gate 	 *
1880*0Sstevel@tonic-gate 	 * First try both lists without worrying about color.
1881*0Sstevel@tonic-gate 	 *
1882*0Sstevel@tonic-gate 	 * Then, grab all page accounting locks (i.e. pcf[]) and
1883*0Sstevel@tonic-gate 	 * steal any pages that they have and set the pcf_block flag to
1884*0Sstevel@tonic-gate 	 * stop deletions from the lists.  This will help because
1885*0Sstevel@tonic-gate 	 * a page can get added to the free list while we are looking
1886*0Sstevel@tonic-gate 	 * at the cache list, then another page could be added to the cache
1887*0Sstevel@tonic-gate 	 * list allowing the page on the free list to be removed as we
1888*0Sstevel@tonic-gate 	 * move from looking at the cache list to the free list. This
1889*0Sstevel@tonic-gate 	 * could happen over and over. We would never find the page
1890*0Sstevel@tonic-gate 	 * we have accounted for.
1891*0Sstevel@tonic-gate 	 *
1892*0Sstevel@tonic-gate 	 * Noreloc pages are a subset of the global (relocatable) page pool.
1893*0Sstevel@tonic-gate 	 * They are not tracked separately in the pcf bins, so it is
1894*0Sstevel@tonic-gate 	 * impossible to know when doing pcf accounting if the available
1895*0Sstevel@tonic-gate 	 * page(s) are noreloc pages or not. When looking for a noreloc page
1896*0Sstevel@tonic-gate 	 * it is quite easy to end up here even if the global (relocatable)
1897*0Sstevel@tonic-gate 	 * page pool has plenty of free pages but the noreloc pool is empty.
1898*0Sstevel@tonic-gate 	 *
1899*0Sstevel@tonic-gate 	 * When the noreloc pool is empty (or low), additional noreloc pages
1900*0Sstevel@tonic-gate 	 * are created by converting pages from the global page pool. This
1901*0Sstevel@tonic-gate 	 * process will stall during pcf accounting if the pcf bins are
1902*0Sstevel@tonic-gate 	 * already locked. Such is the case when a noreloc allocation is
1903*0Sstevel@tonic-gate 	 * looping here in page_create_get_something waiting for more noreloc
1904*0Sstevel@tonic-gate 	 * pages to appear.
1905*0Sstevel@tonic-gate 	 *
1906*0Sstevel@tonic-gate 	 * Short of adding a new field to the pcf bins to accurately track
1907*0Sstevel@tonic-gate 	 * the number of free noreloc pages, we instead do not grab the
1908*0Sstevel@tonic-gate 	 * pcgs_lock, do not set the pcf blocks and do not timeout when
1909*0Sstevel@tonic-gate 	 * allocating a noreloc page. This allows noreloc allocations to
1910*0Sstevel@tonic-gate 	 * loop without blocking global page pool allocations.
1911*0Sstevel@tonic-gate 	 *
1912*0Sstevel@tonic-gate 	 * NOTE: the behaviour of page_create_get_something has not changed
1913*0Sstevel@tonic-gate 	 * for the case of global page pool allocations.
1914*0Sstevel@tonic-gate 	 */
1915*0Sstevel@tonic-gate 
1916*0Sstevel@tonic-gate 	flags &= ~PG_MATCH_COLOR;
1917*0Sstevel@tonic-gate 	locked = 0;
1918*0Sstevel@tonic-gate #ifndef __sparc
1919*0Sstevel@tonic-gate 	/*
1920*0Sstevel@tonic-gate 	 * page_create_get_something may be called because 4g memory may be
1921*0Sstevel@tonic-gate 	 * depleted. Set flags to allow for relocation of base page below
1922*0Sstevel@tonic-gate 	 * 4g if necessary.
1923*0Sstevel@tonic-gate 	 */
1924*0Sstevel@tonic-gate 	if (physmax4g)
1925*0Sstevel@tonic-gate 		flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
1926*0Sstevel@tonic-gate #endif
1927*0Sstevel@tonic-gate 
1928*0Sstevel@tonic-gate 	lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1929*0Sstevel@tonic-gate 
1930*0Sstevel@tonic-gate 	for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1931*0Sstevel@tonic-gate 		pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1932*0Sstevel@tonic-gate 		    flags, lgrp);
1933*0Sstevel@tonic-gate 		if (pp == NULL) {
1934*0Sstevel@tonic-gate 			pp = page_get_cachelist(vp, off, seg, vaddr,
1935*0Sstevel@tonic-gate 			    flags, lgrp);
1936*0Sstevel@tonic-gate 		}
1937*0Sstevel@tonic-gate 		if (pp == NULL) {
1938*0Sstevel@tonic-gate 			/*
1939*0Sstevel@tonic-gate 			 * Serialize.  Don't fight with other pcgs().
1940*0Sstevel@tonic-gate 			 */
1941*0Sstevel@tonic-gate 			if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1942*0Sstevel@tonic-gate 				mutex_enter(&pcgs_lock);
1943*0Sstevel@tonic-gate 				VM_STAT_ADD(pcgs_locked);
1944*0Sstevel@tonic-gate 				locked = 1;
1945*0Sstevel@tonic-gate 				p = pcf;
1946*0Sstevel@tonic-gate 				for (i = 0; i < PCF_FANOUT; i++) {
1947*0Sstevel@tonic-gate 					mutex_enter(&p->pcf_lock);
1948*0Sstevel@tonic-gate 					ASSERT(p->pcf_block == 0);
1949*0Sstevel@tonic-gate 					p->pcf_block = 1;
1950*0Sstevel@tonic-gate 					p->pcf_reserve = p->pcf_count;
1951*0Sstevel@tonic-gate 					p->pcf_count = 0;
1952*0Sstevel@tonic-gate 					mutex_exit(&p->pcf_lock);
1953*0Sstevel@tonic-gate 					p++;
1954*0Sstevel@tonic-gate 				}
1955*0Sstevel@tonic-gate 				freemem = 0;
1956*0Sstevel@tonic-gate 			}
1957*0Sstevel@tonic-gate 
1958*0Sstevel@tonic-gate 			if (count) {
1959*0Sstevel@tonic-gate 				/*
1960*0Sstevel@tonic-gate 				 * Since page_free() puts pages on
1961*0Sstevel@tonic-gate 				 * a list then accounts for it, we
1962*0Sstevel@tonic-gate 				 * just have to wait for page_free()
1963*0Sstevel@tonic-gate 				 * to unlock any page it was working
1964*0Sstevel@tonic-gate 				 * with. The page_lock()-page_reclaim()
1965*0Sstevel@tonic-gate 				 * path falls in the same boat.
1966*0Sstevel@tonic-gate 				 *
1967*0Sstevel@tonic-gate 				 * We don't need to check on the
1968*0Sstevel@tonic-gate 				 * PG_WAIT flag, we have already
1969*0Sstevel@tonic-gate 				 * accounted for the page we are
1970*0Sstevel@tonic-gate 				 * looking for in page_create_va().
1971*0Sstevel@tonic-gate 				 *
1972*0Sstevel@tonic-gate 				 * We just wait a moment to let any
1973*0Sstevel@tonic-gate 				 * locked pages on the lists free up,
1974*0Sstevel@tonic-gate 				 * then continue around and try again.
1975*0Sstevel@tonic-gate 				 *
1976*0Sstevel@tonic-gate 				 * Will be awakened by set_freemem().
1977*0Sstevel@tonic-gate 				 */
1978*0Sstevel@tonic-gate 				mutex_enter(&pcgs_wait_lock);
1979*0Sstevel@tonic-gate 				cv_wait(&pcgs_cv, &pcgs_wait_lock);
1980*0Sstevel@tonic-gate 				mutex_exit(&pcgs_wait_lock);
1981*0Sstevel@tonic-gate 			}
1982*0Sstevel@tonic-gate 		} else {
1983*0Sstevel@tonic-gate #ifdef VM_STATS
1984*0Sstevel@tonic-gate 			if (count >= PCGS_TRIES) {
1985*0Sstevel@tonic-gate 				VM_STAT_ADD(pcgs_too_many);
1986*0Sstevel@tonic-gate 			} else {
1987*0Sstevel@tonic-gate 				VM_STAT_ADD(pcgs_counts[count]);
1988*0Sstevel@tonic-gate 			}
1989*0Sstevel@tonic-gate #endif
1990*0Sstevel@tonic-gate 			if (locked) {
1991*0Sstevel@tonic-gate 				pcgs_unblock();
1992*0Sstevel@tonic-gate 				mutex_exit(&pcgs_lock);
1993*0Sstevel@tonic-gate 			}
1994*0Sstevel@tonic-gate 			if (cagelocked)
1995*0Sstevel@tonic-gate 				mutex_exit(&pcgs_cagelock);
1996*0Sstevel@tonic-gate 			return (pp);
1997*0Sstevel@tonic-gate 		}
1998*0Sstevel@tonic-gate 	}
1999*0Sstevel@tonic-gate 	/*
2000*0Sstevel@tonic-gate 	 * We go down holding the pcf locks.
2001*0Sstevel@tonic-gate 	 */
2002*0Sstevel@tonic-gate 	panic("no %spage found %d",
2003*0Sstevel@tonic-gate 	    ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
2004*0Sstevel@tonic-gate 	/*NOTREACHED*/
2005*0Sstevel@tonic-gate }
2006*0Sstevel@tonic-gate 
2007*0Sstevel@tonic-gate /*
2008*0Sstevel@tonic-gate  * Create enough pages for "bytes" worth of data starting at
2009*0Sstevel@tonic-gate  * "off" in "vp".
2010*0Sstevel@tonic-gate  *
2011*0Sstevel@tonic-gate  *	Where flags must be one of:
2012*0Sstevel@tonic-gate  *
2013*0Sstevel@tonic-gate  *		PG_EXCL:	Exclusive create (fail if any page already
2014*0Sstevel@tonic-gate  *				exists in the page cache) which does not
2015*0Sstevel@tonic-gate  *				wait for memory to become available.
2016*0Sstevel@tonic-gate  *
2017*0Sstevel@tonic-gate  *		PG_WAIT:	Non-exclusive create which can wait for
2018*0Sstevel@tonic-gate  *				memory to become available.
2019*0Sstevel@tonic-gate  *
2020*0Sstevel@tonic-gate  *		PG_PHYSCONTIG:	Allocate physically contiguous pages.
2021*0Sstevel@tonic-gate  *				(Not Supported)
2022*0Sstevel@tonic-gate  *
2023*0Sstevel@tonic-gate  * A doubly linked list of pages is returned to the caller.  Each page
2024*0Sstevel@tonic-gate  * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
2025*0Sstevel@tonic-gate  * lock.
2026*0Sstevel@tonic-gate  *
2027*0Sstevel@tonic-gate  * Unable to change the parameters to page_create() in a minor release,
2028*0Sstevel@tonic-gate  * we renamed page_create() to page_create_va(), changed all known calls
2029*0Sstevel@tonic-gate  * from page_create() to page_create_va(), and created this wrapper.
2030*0Sstevel@tonic-gate  *
2031*0Sstevel@tonic-gate  * Upon a major release, we should break compatibility by deleting this
2032*0Sstevel@tonic-gate  * wrapper and replacing all occurrences of "page_create_va" with "page_create".
2033*0Sstevel@tonic-gate  *
2034*0Sstevel@tonic-gate  * NOTE: There is a copy of this interface as page_create_io() in
2035*0Sstevel@tonic-gate  *	 i86/vm/vm_machdep.c. Any bugs fixed here should be applied
2036*0Sstevel@tonic-gate  *	 there.
2037*0Sstevel@tonic-gate  */
2038*0Sstevel@tonic-gate page_t *
2039*0Sstevel@tonic-gate page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
2040*0Sstevel@tonic-gate {
2041*0Sstevel@tonic-gate 	caddr_t random_vaddr;
2042*0Sstevel@tonic-gate 	struct seg kseg;
2043*0Sstevel@tonic-gate 
2044*0Sstevel@tonic-gate #ifdef DEBUG
2045*0Sstevel@tonic-gate 	cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
2046*0Sstevel@tonic-gate 	    (void *)caller());
2047*0Sstevel@tonic-gate #endif
2048*0Sstevel@tonic-gate 
2049*0Sstevel@tonic-gate 	random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
2050*0Sstevel@tonic-gate 	    (uintptr_t)(off >> PAGESHIFT));
2051*0Sstevel@tonic-gate 	kseg.s_as = &kas;
2052*0Sstevel@tonic-gate 
2053*0Sstevel@tonic-gate 	return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
2054*0Sstevel@tonic-gate }
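
/*
 * A minimal illustrative sketch: new callers should invoke
 * page_create_va() directly with a real seg/vaddr hint rather than
 * going through the deprecated wrapper above:
 *
 *	pp = page_create_va(vp, off, bytes, flags, seg, vaddr);
 */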
2055*0Sstevel@tonic-gate 
2056*0Sstevel@tonic-gate #ifdef DEBUG
2057*0Sstevel@tonic-gate uint32_t pg_alloc_pgs_mtbf = 0;
2058*0Sstevel@tonic-gate #endif
2059*0Sstevel@tonic-gate 
2060*0Sstevel@tonic-gate /*
2061*0Sstevel@tonic-gate  * Used for large page support. It will attempt to allocate
2062*0Sstevel@tonic-gate  * one or more large pages off the freelist.
2063*0Sstevel@tonic-gate  *
2064*0Sstevel@tonic-gate  * Returns non-zero on failure.
2065*0Sstevel@tonic-gate  */
2066*0Sstevel@tonic-gate int
2067*0Sstevel@tonic-gate page_alloc_pages(struct seg *seg, caddr_t addr, page_t **basepp,
2068*0Sstevel@tonic-gate     page_t *ppa[], uint_t szc, int anypgsz)
2069*0Sstevel@tonic-gate {
2070*0Sstevel@tonic-gate 	pgcnt_t		npgs, curnpgs, totpgs;
2071*0Sstevel@tonic-gate 	size_t		pgsz;
2072*0Sstevel@tonic-gate 	page_t		*pplist = NULL, *pp;
2073*0Sstevel@tonic-gate 	int		err = 0;
2074*0Sstevel@tonic-gate 	lgrp_t		*lgrp;
2075*0Sstevel@tonic-gate 
2076*0Sstevel@tonic-gate 	ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2077*0Sstevel@tonic-gate 
2078*0Sstevel@tonic-gate 	VM_STAT_ADD(alloc_pages[0]);
2079*0Sstevel@tonic-gate 
2080*0Sstevel@tonic-gate #ifdef DEBUG
2081*0Sstevel@tonic-gate 	if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2082*0Sstevel@tonic-gate 		return (ENOMEM);
2083*0Sstevel@tonic-gate 	}
2084*0Sstevel@tonic-gate #endif
2085*0Sstevel@tonic-gate 
2086*0Sstevel@tonic-gate 	pgsz = page_get_pagesize(szc);
2087*0Sstevel@tonic-gate 	totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2088*0Sstevel@tonic-gate 
2089*0Sstevel@tonic-gate 	ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2090*0Sstevel@tonic-gate 	/*
2091*0Sstevel@tonic-gate 	 * Exactly one of basepp and ppa must be NULL,
2092*0Sstevel@tonic-gate 	 * and exactly one must be non-NULL.
2093*0Sstevel@tonic-gate 	 */
2094*0Sstevel@tonic-gate 	ASSERT(basepp != NULL || ppa != NULL);
2095*0Sstevel@tonic-gate 	ASSERT(basepp == NULL || ppa == NULL);
2096*0Sstevel@tonic-gate 
2097*0Sstevel@tonic-gate 	(void) page_create_wait(npgs, PG_WAIT);
2098*0Sstevel@tonic-gate 
2099*0Sstevel@tonic-gate 	while (npgs && szc) {
2100*0Sstevel@tonic-gate 		lgrp = lgrp_mem_choose(seg, addr, pgsz);
2101*0Sstevel@tonic-gate 		pp = page_get_freelist(NULL, 0, seg, addr, pgsz, 0, lgrp);
2102*0Sstevel@tonic-gate 		if (pp != NULL) {
2103*0Sstevel@tonic-gate 			VM_STAT_ADD(alloc_pages[1]);
2104*0Sstevel@tonic-gate 			page_list_concat(&pplist, &pp);
2105*0Sstevel@tonic-gate 			ASSERT(npgs >= curnpgs);
2106*0Sstevel@tonic-gate 			npgs -= curnpgs;
2107*0Sstevel@tonic-gate 		} else if (anypgsz) {
2108*0Sstevel@tonic-gate 			VM_STAT_ADD(alloc_pages[2]);
2109*0Sstevel@tonic-gate 			szc--;
2110*0Sstevel@tonic-gate 			pgsz = page_get_pagesize(szc);
2111*0Sstevel@tonic-gate 			curnpgs = pgsz >> PAGESHIFT;
2112*0Sstevel@tonic-gate 		} else {
2113*0Sstevel@tonic-gate 			VM_STAT_ADD(alloc_pages[3]);
2114*0Sstevel@tonic-gate 			ASSERT(npgs == totpgs);
2115*0Sstevel@tonic-gate 			page_create_putback(npgs);
2116*0Sstevel@tonic-gate 			return (ENOMEM);
2117*0Sstevel@tonic-gate 		}
2118*0Sstevel@tonic-gate 	}
2119*0Sstevel@tonic-gate 	if (szc == 0) {
2120*0Sstevel@tonic-gate 		VM_STAT_ADD(alloc_pages[4]);
2121*0Sstevel@tonic-gate 		ASSERT(npgs != 0);
2122*0Sstevel@tonic-gate 		page_create_putback(npgs);
2123*0Sstevel@tonic-gate 		err = ENOMEM;
2124*0Sstevel@tonic-gate 	} else if (basepp != NULL) {
2125*0Sstevel@tonic-gate 		ASSERT(npgs == 0);
2126*0Sstevel@tonic-gate 		ASSERT(ppa == NULL);
2127*0Sstevel@tonic-gate 		*basepp = pplist;
2128*0Sstevel@tonic-gate 	}
2129*0Sstevel@tonic-gate 
2130*0Sstevel@tonic-gate 	npgs = totpgs - npgs;
2131*0Sstevel@tonic-gate 	pp = pplist;
2132*0Sstevel@tonic-gate 
2133*0Sstevel@tonic-gate 	/*
2134*0Sstevel@tonic-gate 	 * Clear the free and age bits.  If we were passed in a ppa, then
2135*0Sstevel@tonic-gate 	 * fill it in with all the constituent pages from the large page.  But
2136*0Sstevel@tonic-gate 	 * if we failed to allocate all the pages, just free what we got.
2137*0Sstevel@tonic-gate 	 */
2138*0Sstevel@tonic-gate 	while (npgs != 0) {
2139*0Sstevel@tonic-gate 		ASSERT(PP_ISFREE(pp));
2140*0Sstevel@tonic-gate 		ASSERT(PP_ISAGED(pp));
2141*0Sstevel@tonic-gate 		if (ppa != NULL || err != 0) {
2142*0Sstevel@tonic-gate 			if (err == 0) {
2143*0Sstevel@tonic-gate 				VM_STAT_ADD(alloc_pages[5]);
2144*0Sstevel@tonic-gate 				PP_CLRFREE(pp);
2145*0Sstevel@tonic-gate 				PP_CLRAGED(pp);
2146*0Sstevel@tonic-gate 				page_sub(&pplist, pp);
2147*0Sstevel@tonic-gate 				*ppa++ = pp;
2148*0Sstevel@tonic-gate 				npgs--;
2149*0Sstevel@tonic-gate 			} else {
2150*0Sstevel@tonic-gate 				VM_STAT_ADD(alloc_pages[6]);
2151*0Sstevel@tonic-gate 				ASSERT(pp->p_szc != 0);
2152*0Sstevel@tonic-gate 				curnpgs = page_get_pagecnt(pp->p_szc);
2153*0Sstevel@tonic-gate 				page_list_break(&pp, &pplist, curnpgs);
2154*0Sstevel@tonic-gate 				page_list_add_pages(pp, 0);
2155*0Sstevel@tonic-gate 				page_create_putback(curnpgs);
2156*0Sstevel@tonic-gate 				ASSERT(npgs >= curnpgs);
2157*0Sstevel@tonic-gate 				npgs -= curnpgs;
2158*0Sstevel@tonic-gate 			}
2159*0Sstevel@tonic-gate 			pp = pplist;
2160*0Sstevel@tonic-gate 		} else {
2161*0Sstevel@tonic-gate 			VM_STAT_ADD(alloc_pages[7]);
2162*0Sstevel@tonic-gate 			PP_CLRFREE(pp);
2163*0Sstevel@tonic-gate 			PP_CLRAGED(pp);
2164*0Sstevel@tonic-gate 			pp = pp->p_next;
2165*0Sstevel@tonic-gate 			npgs--;
2166*0Sstevel@tonic-gate 		}
2167*0Sstevel@tonic-gate 	}
2168*0Sstevel@tonic-gate 	return (err);
2169*0Sstevel@tonic-gate }
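
/*
 * A minimal illustrative sketch (hypothetical caller) of requesting a
 * single szc-sized large page with its constituent pages returned
 * through a caller-supplied array (exactly one of basepp and ppa may
 * be non-NULL, as asserted above):
 *
 *	page_t *ppa[N];		N >= page_get_pagecnt(szc);
 *
 *	if (page_alloc_pages(seg, addr, NULL, ppa, szc, 0) == 0) {
 *		success: ppa[] holds the constituent pages;
 *	}
 */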
2170*0Sstevel@tonic-gate 
2171*0Sstevel@tonic-gate /*
2172*0Sstevel@tonic-gate  * Get a single large page off of the freelists, and set it up for use.
2173*0Sstevel@tonic-gate  * Number of bytes requested must be a supported page size.
2174*0Sstevel@tonic-gate  *
2175*0Sstevel@tonic-gate  * Note that this call may fail even if there is sufficient
2176*0Sstevel@tonic-gate  * memory available or PG_WAIT is set, so the caller must
2177*0Sstevel@tonic-gate  * be willing to fall back on page_create_va(), block and retry,
2178*0Sstevel@tonic-gate  * or fail the requester.
2179*0Sstevel@tonic-gate  */
2180*0Sstevel@tonic-gate page_t *
2181*0Sstevel@tonic-gate page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2182*0Sstevel@tonic-gate     struct seg *seg, caddr_t vaddr, void *arg)
2183*0Sstevel@tonic-gate {
2184*0Sstevel@tonic-gate 	pgcnt_t		npages, pcftotal;
2185*0Sstevel@tonic-gate 	page_t		*pp;
2186*0Sstevel@tonic-gate 	page_t		*rootpp;
2187*0Sstevel@tonic-gate 	lgrp_t		*lgrp;
2188*0Sstevel@tonic-gate 	uint_t		enough;
2189*0Sstevel@tonic-gate 	uint_t		pcf_index;
2190*0Sstevel@tonic-gate 	uint_t		i;
2191*0Sstevel@tonic-gate 	struct pcf	*p;
2192*0Sstevel@tonic-gate 	struct pcf	*q;
2193*0Sstevel@tonic-gate 	lgrp_id_t	*lgrpid = (lgrp_id_t *)arg;
2194*0Sstevel@tonic-gate 
2195*0Sstevel@tonic-gate 	ASSERT(vp != NULL);
2196*0Sstevel@tonic-gate 
2197*0Sstevel@tonic-gate 	ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2198*0Sstevel@tonic-gate 	    PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
2199*0Sstevel@tonic-gate 	/* but no others */
2200*0Sstevel@tonic-gate 
2201*0Sstevel@tonic-gate 	ASSERT((flags & PG_EXCL) == PG_EXCL);
2202*0Sstevel@tonic-gate 
2203*0Sstevel@tonic-gate 	npages = btop(bytes);
2204*0Sstevel@tonic-gate 
2205*0Sstevel@tonic-gate 	if (!kcage_on || panicstr) {
2206*0Sstevel@tonic-gate 		/*
2207*0Sstevel@tonic-gate 		 * Cage is OFF, or we are single threaded in
2208*0Sstevel@tonic-gate 		 * panic, so make everything a RELOC request.
2209*0Sstevel@tonic-gate 		 */
2210*0Sstevel@tonic-gate 		flags &= ~PG_NORELOC;
2211*0Sstevel@tonic-gate 	}
2212*0Sstevel@tonic-gate 
2213*0Sstevel@tonic-gate 	/*
2214*0Sstevel@tonic-gate 	 * Make sure there's adequate physical memory available.
2215*0Sstevel@tonic-gate 	 * Note: PG_WAIT is ignored here.
2216*0Sstevel@tonic-gate 	 */
2217*0Sstevel@tonic-gate 	if (freemem <= throttlefree + npages) {
2218*0Sstevel@tonic-gate 		VM_STAT_ADD(page_create_large_cnt[1]);
2219*0Sstevel@tonic-gate 		return (NULL);
2220*0Sstevel@tonic-gate 	}
2221*0Sstevel@tonic-gate 
2222*0Sstevel@tonic-gate 	/*
2223*0Sstevel@tonic-gate 	 * If cage is on, dampen draw from cage when available
2224*0Sstevel@tonic-gate 	 * cage space is low.
2225*0Sstevel@tonic-gate 	 */
2226*0Sstevel@tonic-gate 	if ((flags & (PG_NORELOC | PG_WAIT)) ==  (PG_NORELOC | PG_WAIT) &&
2227*0Sstevel@tonic-gate 	    kcage_freemem < kcage_throttlefree + npages) {
2228*0Sstevel@tonic-gate 
2229*0Sstevel@tonic-gate 		/*
2230*0Sstevel@tonic-gate 		 * The cage is on, the caller wants PG_NORELOC
2231*0Sstevel@tonic-gate 		 * pages and available cage memory is very low.
2232*0Sstevel@tonic-gate 		 * Call kcage_create_throttle() to attempt to
2233*0Sstevel@tonic-gate 		 * control demand on the cage.
2234*0Sstevel@tonic-gate 		 */
2235*0Sstevel@tonic-gate 		if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2236*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_large_cnt[2]);
2237*0Sstevel@tonic-gate 			return (NULL);
2238*0Sstevel@tonic-gate 		}
2239*0Sstevel@tonic-gate 	}
2240*0Sstevel@tonic-gate 
2241*0Sstevel@tonic-gate 	enough = 0;
2242*0Sstevel@tonic-gate 	pcf_index = PCF_INDEX();
2243*0Sstevel@tonic-gate 	p = &pcf[pcf_index];
2244*0Sstevel@tonic-gate 	p->pcf_touch = 1;
2245*0Sstevel@tonic-gate 	q = &pcf[PCF_FANOUT];
2246*0Sstevel@tonic-gate 	for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) {
2247*0Sstevel@tonic-gate 		if (p->pcf_count > npages) {
2248*0Sstevel@tonic-gate 			/*
2249*0Sstevel@tonic-gate 			 * a good one to try.
2250*0Sstevel@tonic-gate 			 */
2251*0Sstevel@tonic-gate 			mutex_enter(&p->pcf_lock);
2252*0Sstevel@tonic-gate 			if (p->pcf_count > npages) {
2253*0Sstevel@tonic-gate 				p->pcf_count -= (uint_t)npages;
2254*0Sstevel@tonic-gate 				/*
2255*0Sstevel@tonic-gate 				 * freemem is not protected by any lock.
2256*0Sstevel@tonic-gate 				 * Thus, we cannot have any assertion
2257*0Sstevel@tonic-gate 				 * containing freemem here.
2258*0Sstevel@tonic-gate 				 */
2259*0Sstevel@tonic-gate 				freemem -= npages;
2260*0Sstevel@tonic-gate 				enough = 1;
2261*0Sstevel@tonic-gate 				mutex_exit(&p->pcf_lock);
2262*0Sstevel@tonic-gate 				break;
2263*0Sstevel@tonic-gate 			}
2264*0Sstevel@tonic-gate 			mutex_exit(&p->pcf_lock);
2265*0Sstevel@tonic-gate 		}
2266*0Sstevel@tonic-gate 		pcftotal += p->pcf_count;
2267*0Sstevel@tonic-gate 		p++;
2268*0Sstevel@tonic-gate 		if (p >= q) {
2269*0Sstevel@tonic-gate 			p = pcf;
2270*0Sstevel@tonic-gate 		}
2271*0Sstevel@tonic-gate 		p->pcf_touch = 1;
2272*0Sstevel@tonic-gate 	}
2273*0Sstevel@tonic-gate 
2274*0Sstevel@tonic-gate 	if (!enough) {
2275*0Sstevel@tonic-gate 		/* If there isn't enough memory available, give up. */
2276*0Sstevel@tonic-gate 		if (pcftotal < npages) {
2277*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_large_cnt[3]);
2278*0Sstevel@tonic-gate 			return (NULL);
2279*0Sstevel@tonic-gate 		}
2280*0Sstevel@tonic-gate 
2281*0Sstevel@tonic-gate 		/* try to collect pages from several pcf bins */
2282*0Sstevel@tonic-gate 		for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) {
2283*0Sstevel@tonic-gate 			p->pcf_touch = 1;
2284*0Sstevel@tonic-gate 			mutex_enter(&p->pcf_lock);
2285*0Sstevel@tonic-gate 			pcftotal += p->pcf_count;
2286*0Sstevel@tonic-gate 			if (pcftotal >= npages) {
2287*0Sstevel@tonic-gate 				/*
2288*0Sstevel@tonic-gate 				 * Wow!  There are enough pages lying around
2289*0Sstevel@tonic-gate 				 * to satisfy the request.  Do the accounting,
2290*0Sstevel@tonic-gate 				 * drop the locks we acquired, and go back.
2291*0Sstevel@tonic-gate 				 *
2292*0Sstevel@tonic-gate 				 * freemem is not protected by any lock. So,
2293*0Sstevel@tonic-gate 				 * we cannot have any assertion containing
2294*0Sstevel@tonic-gate 				 * freemem.
2295*0Sstevel@tonic-gate 				 */
2296*0Sstevel@tonic-gate 				pgcnt_t	tpages = npages;
2297*0Sstevel@tonic-gate 				freemem -= npages;
2298*0Sstevel@tonic-gate 				while (p >= pcf) {
2299*0Sstevel@tonic-gate 					if (p->pcf_count <= tpages) {
2300*0Sstevel@tonic-gate 						tpages -= p->pcf_count;
2301*0Sstevel@tonic-gate 						p->pcf_count = 0;
2302*0Sstevel@tonic-gate 					} else {
2303*0Sstevel@tonic-gate 						p->pcf_count -= (uint_t)tpages;
2304*0Sstevel@tonic-gate 						tpages = 0;
2305*0Sstevel@tonic-gate 					}
2306*0Sstevel@tonic-gate 					mutex_exit(&p->pcf_lock);
2307*0Sstevel@tonic-gate 					p--;
2308*0Sstevel@tonic-gate 				}
2309*0Sstevel@tonic-gate 				ASSERT(tpages == 0);
2310*0Sstevel@tonic-gate 				break;
2311*0Sstevel@tonic-gate 			}
2312*0Sstevel@tonic-gate 			p++;
2313*0Sstevel@tonic-gate 		}
2314*0Sstevel@tonic-gate 		if (i == PCF_FANOUT) {
2315*0Sstevel@tonic-gate 			/* failed to collect pages - release the locks */
2316*0Sstevel@tonic-gate 			while (--p >= pcf) {
2317*0Sstevel@tonic-gate 				mutex_exit(&p->pcf_lock);
2318*0Sstevel@tonic-gate 			}
2319*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_large_cnt[4]);
2320*0Sstevel@tonic-gate 			return (NULL);
2321*0Sstevel@tonic-gate 		}
2322*0Sstevel@tonic-gate 	}
2323*0Sstevel@tonic-gate 
2324*0Sstevel@tonic-gate 	/*
2325*0Sstevel@tonic-gate 	 * This is where this function behaves fundamentally differently
2326*0Sstevel@tonic-gate 	 * than page_create_va(); since we're intending to map the page
2327*0Sstevel@tonic-gate 	 * with a single TTE, we have to get it as a physically contiguous
2328*0Sstevel@tonic-gate 	 * hardware pagesize chunk.  If we can't, we fail.
2329*0Sstevel@tonic-gate 	 */
2330*0Sstevel@tonic-gate 	if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2331*0Sstevel@tonic-gate 		LGRP_EXISTS(lgrp_table[*lgrpid]))
2332*0Sstevel@tonic-gate 		lgrp = lgrp_table[*lgrpid];
2333*0Sstevel@tonic-gate 	else
2334*0Sstevel@tonic-gate 		lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2335*0Sstevel@tonic-gate 
2336*0Sstevel@tonic-gate 	if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2337*0Sstevel@tonic-gate 	    bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2338*0Sstevel@tonic-gate 		page_create_putback(npages);
2339*0Sstevel@tonic-gate 		VM_STAT_ADD(page_create_large_cnt[5]);
2340*0Sstevel@tonic-gate 		return (NULL);
2341*0Sstevel@tonic-gate 	}
2342*0Sstevel@tonic-gate 
2343*0Sstevel@tonic-gate 	/*
2344*0Sstevel@tonic-gate 	 * If we got the page with the wrong mtype, give it back; this is a
2345*0Sstevel@tonic-gate 	 * workaround for CR 6249718. Once CR 6249718 is fixed we will never
2346*0Sstevel@tonic-gate 	 * get inside this "if" and the workaround becomes just a nop.
2347*0Sstevel@tonic-gate 	 */
2348*0Sstevel@tonic-gate 	if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2349*0Sstevel@tonic-gate 		page_list_add_pages(rootpp, 0);
2350*0Sstevel@tonic-gate 		page_create_putback(npages);
2351*0Sstevel@tonic-gate 		VM_STAT_ADD(page_create_large_cnt[6]);
2352*0Sstevel@tonic-gate 		return (NULL);
2353*0Sstevel@tonic-gate 	}
2354*0Sstevel@tonic-gate 
2355*0Sstevel@tonic-gate 	/*
2356*0Sstevel@tonic-gate 	 * If satisfying this request has left us with too little
2357*0Sstevel@tonic-gate 	 * memory, start the wheels turning to get some back.  The
2358*0Sstevel@tonic-gate 	 * first clause of the test prevents waking up the pageout
2359*0Sstevel@tonic-gate 	 * daemon in situations where it would decide that there's
2360*0Sstevel@tonic-gate 	 * nothing to do.
2361*0Sstevel@tonic-gate 	 */
2362*0Sstevel@tonic-gate 	if (nscan < desscan && freemem < minfree) {
2363*0Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2364*0Sstevel@tonic-gate 		    "pageout_cv_signal:freemem %ld", freemem);
2365*0Sstevel@tonic-gate 		cv_signal(&proc_pageout->p_cv);
2366*0Sstevel@tonic-gate 	}
2367*0Sstevel@tonic-gate 
2368*0Sstevel@tonic-gate 	pp = rootpp;
2369*0Sstevel@tonic-gate 	while (npages--) {
2370*0Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(pp));
2371*0Sstevel@tonic-gate 		ASSERT(pp->p_vnode == NULL);
2372*0Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(pp));
2373*0Sstevel@tonic-gate 		PP_CLRFREE(pp);
2374*0Sstevel@tonic-gate 		PP_CLRAGED(pp);
2375*0Sstevel@tonic-gate 		if (!page_hashin(pp, vp, off, NULL))
2376*0Sstevel@tonic-gate 			panic("page_create_large: hashin failed: page %p",
2377*0Sstevel@tonic-gate 			    (void *)pp);
2378*0Sstevel@tonic-gate 		page_io_lock(pp);
2379*0Sstevel@tonic-gate 		off += PAGESIZE;
2380*0Sstevel@tonic-gate 		pp = pp->p_next;
2381*0Sstevel@tonic-gate 	}
2382*0Sstevel@tonic-gate 
2383*0Sstevel@tonic-gate 	VM_STAT_ADD(page_create_large_cnt[0]);
2384*0Sstevel@tonic-gate 	return (rootpp);
2385*0Sstevel@tonic-gate }
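
/*
 * Illustrative sketch (not part of this file's build): the pcf loops in
 * page_create_large() above and page_create_va() below spread the free-page
 * count over PCF_FANOUT buckets, each behind its own lock, so concurrent
 * allocators rarely collide on a single global counter.  The minimal
 * user-space model below (hypothetical names; standard C and pthreads only)
 * shows the same two-phase approach: try one "home" bucket first, and only
 * if that fails, sweep all buckets while holding their locks until enough
 * has been reserved or the sweep comes up short.
 */
#if 0	/* example only -- never compiled */
#include <pthread.h>

#define	FANOUT	4			/* stands in for PCF_FANOUT */

struct bucket {
	pthread_mutex_t	lock;		/* init with pthread_mutex_init() */
	unsigned	count;		/* free pages tracked by this bucket */
};

static struct bucket bkt[FANOUT];

/* Reserve npages; returns 1 on success, 0 if not enough pages exist. */
static int
reserve_pages(unsigned home, unsigned npages)
{
	int i;
	unsigned have = 0;
	struct bucket *b = &bkt[home % FANOUT];

	/* Phase 1: fast path, hope a single bucket covers the request. */
	pthread_mutex_lock(&b->lock);
	if (b->count >= npages) {
		b->count -= npages;
		pthread_mutex_unlock(&b->lock);
		return (1);
	}
	pthread_mutex_unlock(&b->lock);

	/* Phase 2: sweep every bucket, keeping the locks until we know. */
	for (i = 0; i < FANOUT; i++) {
		pthread_mutex_lock(&bkt[i].lock);
		have += bkt[i].count;
		if (have >= npages) {
			unsigned left = npages;

			/* Enough found: drain buckets i..0, unlocking each. */
			for (; i >= 0; i--) {
				if (bkt[i].count <= left) {
					left -= bkt[i].count;
					bkt[i].count = 0;
				} else {
					bkt[i].count -= left;
					left = 0;
				}
				pthread_mutex_unlock(&bkt[i].lock);
			}
			return (1);
		}
	}
	for (i = FANOUT - 1; i >= 0; i--)	/* failed: release all locks */
		pthread_mutex_unlock(&bkt[i].lock);
	return (0);
}
#endif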
2386*0Sstevel@tonic-gate 
2387*0Sstevel@tonic-gate page_t *
2388*0Sstevel@tonic-gate page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2389*0Sstevel@tonic-gate     struct seg *seg, caddr_t vaddr)
2390*0Sstevel@tonic-gate {
2391*0Sstevel@tonic-gate 	page_t		*plist = NULL;
2392*0Sstevel@tonic-gate 	pgcnt_t		npages;
2393*0Sstevel@tonic-gate 	pgcnt_t		found_on_free = 0;
2394*0Sstevel@tonic-gate 	pgcnt_t		pages_req;
2395*0Sstevel@tonic-gate 	page_t		*npp = NULL;
2396*0Sstevel@tonic-gate 	uint_t		enough;
2397*0Sstevel@tonic-gate 	uint_t		i;
2398*0Sstevel@tonic-gate 	uint_t		pcf_index;
2399*0Sstevel@tonic-gate 	struct pcf	*p;
2400*0Sstevel@tonic-gate 	struct pcf	*q;
2401*0Sstevel@tonic-gate 	lgrp_t		*lgrp;
2402*0Sstevel@tonic-gate 
2403*0Sstevel@tonic-gate 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2404*0Sstevel@tonic-gate 		"page_create_start:vp %p off %llx bytes %lu flags %x",
2405*0Sstevel@tonic-gate 		vp, off, bytes, flags);
2406*0Sstevel@tonic-gate 
2407*0Sstevel@tonic-gate 	ASSERT(bytes != 0 && vp != NULL);
2408*0Sstevel@tonic-gate 
2409*0Sstevel@tonic-gate 	if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2410*0Sstevel@tonic-gate 		panic("page_create: invalid flags");
2411*0Sstevel@tonic-gate 		/*NOTREACHED*/
2412*0Sstevel@tonic-gate 	}
2413*0Sstevel@tonic-gate 	ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2414*0Sstevel@tonic-gate 	    PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
2415*0Sstevel@tonic-gate 	    /* but no others */
2416*0Sstevel@tonic-gate 
2417*0Sstevel@tonic-gate 	pages_req = npages = btopr(bytes);
2418*0Sstevel@tonic-gate 	/*
2419*0Sstevel@tonic-gate 	 * Try to see whether the request is too large to *ever* be
2420*0Sstevel@tonic-gate 	 * satisfied, in order to prevent deadlock.  We arbitrarily
2421*0Sstevel@tonic-gate 	 * decide to limit maximum size requests to max_page_get.
2422*0Sstevel@tonic-gate 	 */
2423*0Sstevel@tonic-gate 	if (npages >= max_page_get) {
2424*0Sstevel@tonic-gate 		if ((flags & PG_WAIT) == 0) {
2425*0Sstevel@tonic-gate 			TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2426*0Sstevel@tonic-gate 			    "page_create_toobig:vp %p off %llx npages "
2427*0Sstevel@tonic-gate 			    "%lu max_page_get %lu",
2428*0Sstevel@tonic-gate 			    vp, off, npages, max_page_get);
2429*0Sstevel@tonic-gate 			return (NULL);
2430*0Sstevel@tonic-gate 		} else {
2431*0Sstevel@tonic-gate 			cmn_err(CE_WARN,
2432*0Sstevel@tonic-gate 			    "Request for too much kernel memory "
2433*0Sstevel@tonic-gate 			    "(%lu bytes), will hang forever", bytes);
2434*0Sstevel@tonic-gate 			for (;;)
2435*0Sstevel@tonic-gate 				delay(1000000000);
2436*0Sstevel@tonic-gate 		}
2437*0Sstevel@tonic-gate 	}
2438*0Sstevel@tonic-gate 
2439*0Sstevel@tonic-gate 	if (!kcage_on || panicstr) {
2440*0Sstevel@tonic-gate 		/*
2441*0Sstevel@tonic-gate 		 * Cage is OFF, or we are single threaded in
2442*0Sstevel@tonic-gate 		 * panic, so make everything a RELOC request.
2443*0Sstevel@tonic-gate 		 */
2444*0Sstevel@tonic-gate 		flags &= ~PG_NORELOC;
2445*0Sstevel@tonic-gate 	}
2446*0Sstevel@tonic-gate 
2447*0Sstevel@tonic-gate 	if (freemem <= throttlefree + npages)
2448*0Sstevel@tonic-gate 		if (!page_create_throttle(npages, flags))
2449*0Sstevel@tonic-gate 			return (NULL);
2450*0Sstevel@tonic-gate 
2451*0Sstevel@tonic-gate 	/*
2452*0Sstevel@tonic-gate 	 * If cage is on, dampen draw from cage when available
2453*0Sstevel@tonic-gate 	 * cage space is low.
2454*0Sstevel@tonic-gate 	 */
2455*0Sstevel@tonic-gate 	if ((flags & PG_NORELOC) &&
2456*0Sstevel@tonic-gate 		kcage_freemem < kcage_throttlefree + npages) {
2457*0Sstevel@tonic-gate 
2458*0Sstevel@tonic-gate 		/*
2459*0Sstevel@tonic-gate 		 * The cage is on, the caller wants PG_NORELOC
2460*0Sstevel@tonic-gate 		 * pages and available cage memory is very low.
2461*0Sstevel@tonic-gate 		 * Call kcage_create_throttle() to attempt to
2462*0Sstevel@tonic-gate 		 * control demand on the cage.
2463*0Sstevel@tonic-gate 		 */
2464*0Sstevel@tonic-gate 		if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2465*0Sstevel@tonic-gate 			return (NULL);
2466*0Sstevel@tonic-gate 	}
2467*0Sstevel@tonic-gate 
2468*0Sstevel@tonic-gate 	VM_STAT_ADD(page_create_cnt[0]);
2469*0Sstevel@tonic-gate 
2470*0Sstevel@tonic-gate 	enough = 0;
2471*0Sstevel@tonic-gate 	pcf_index = PCF_INDEX();
2472*0Sstevel@tonic-gate 
2473*0Sstevel@tonic-gate 	p = &pcf[pcf_index];
2474*0Sstevel@tonic-gate 	p->pcf_touch = 1;
2475*0Sstevel@tonic-gate 	q = &pcf[PCF_FANOUT];
2476*0Sstevel@tonic-gate 	for (i = 0; i < PCF_FANOUT; i++) {
2477*0Sstevel@tonic-gate 		if (p->pcf_count > npages) {
2478*0Sstevel@tonic-gate 			/*
2479*0Sstevel@tonic-gate 			 * a good one to try.
2480*0Sstevel@tonic-gate 			 */
2481*0Sstevel@tonic-gate 			mutex_enter(&p->pcf_lock);
2482*0Sstevel@tonic-gate 			if (p->pcf_count > npages) {
2483*0Sstevel@tonic-gate 				p->pcf_count -= (uint_t)npages;
2484*0Sstevel@tonic-gate 				/*
2485*0Sstevel@tonic-gate 				 * freemem is not protected by any lock.
2486*0Sstevel@tonic-gate 				 * Thus, we cannot have any assertion
2487*0Sstevel@tonic-gate 				 * containing freemem here.
2488*0Sstevel@tonic-gate 				 */
2489*0Sstevel@tonic-gate 				freemem -= npages;
2490*0Sstevel@tonic-gate 				enough = 1;
2491*0Sstevel@tonic-gate 				mutex_exit(&p->pcf_lock);
2492*0Sstevel@tonic-gate 				break;
2493*0Sstevel@tonic-gate 			}
2494*0Sstevel@tonic-gate 			mutex_exit(&p->pcf_lock);
2495*0Sstevel@tonic-gate 		}
2496*0Sstevel@tonic-gate 		p++;
2497*0Sstevel@tonic-gate 		if (p >= q) {
2498*0Sstevel@tonic-gate 			p = pcf;
2499*0Sstevel@tonic-gate 		}
2500*0Sstevel@tonic-gate 		p->pcf_touch = 1;
2501*0Sstevel@tonic-gate 	}
2502*0Sstevel@tonic-gate 
2503*0Sstevel@tonic-gate 	if (!enough) {
2504*0Sstevel@tonic-gate 		/*
2505*0Sstevel@tonic-gate 		 * Have to look harder.  If npages is greater than
2506*0Sstevel@tonic-gate 		 * one, then we might have to coalesce the counters.
2507*0Sstevel@tonic-gate 		 *
2508*0Sstevel@tonic-gate 		 * Go wait.  We come back having accounted
2509*0Sstevel@tonic-gate 		 * for the memory.
2510*0Sstevel@tonic-gate 		 */
2511*0Sstevel@tonic-gate 		VM_STAT_ADD(page_create_cnt[1]);
2512*0Sstevel@tonic-gate 		if (!page_create_wait(npages, flags)) {
2513*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_cnt[2]);
2514*0Sstevel@tonic-gate 			return (NULL);
2515*0Sstevel@tonic-gate 		}
2516*0Sstevel@tonic-gate 	}
2517*0Sstevel@tonic-gate 
2518*0Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2519*0Sstevel@tonic-gate 		"page_create_success:vp %p off %llx", vp, off);
2520*0Sstevel@tonic-gate 
2521*0Sstevel@tonic-gate 	/*
2522*0Sstevel@tonic-gate 	 * If satisfying this request has left us with too little
2523*0Sstevel@tonic-gate 	 * memory, start the wheels turning to get some back.  The
2524*0Sstevel@tonic-gate 	 * first clause of the test prevents waking up the pageout
2525*0Sstevel@tonic-gate 	 * daemon in situations where it would decide that there's
2526*0Sstevel@tonic-gate 	 * nothing to do.
2527*0Sstevel@tonic-gate 	 */
2528*0Sstevel@tonic-gate 	if (nscan < desscan && freemem < minfree) {
2529*0Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2530*0Sstevel@tonic-gate 			"pageout_cv_signal:freemem %ld", freemem);
2531*0Sstevel@tonic-gate 		cv_signal(&proc_pageout->p_cv);
2532*0Sstevel@tonic-gate 	}
2533*0Sstevel@tonic-gate 
2534*0Sstevel@tonic-gate 	/*
2535*0Sstevel@tonic-gate 	 * Loop around collecting the requested number of pages.
2536*0Sstevel@tonic-gate 	 * Most of the time, we have to `create' a new page. With
2537*0Sstevel@tonic-gate 	 * this in mind, pull the page off the free list before
2538*0Sstevel@tonic-gate 	 * getting the hash lock.  This will minimize the hash
2539*0Sstevel@tonic-gate 	 * lock hold time, nesting, and the like.  If it turns
2540*0Sstevel@tonic-gate 	 * out we don't need the page, we put it back at the end.
2541*0Sstevel@tonic-gate 	 */
2542*0Sstevel@tonic-gate 	while (npages--) {
2543*0Sstevel@tonic-gate 		page_t		*pp;
2544*0Sstevel@tonic-gate 		kmutex_t	*phm = NULL;
2545*0Sstevel@tonic-gate 		ulong_t		index;
2546*0Sstevel@tonic-gate 
2547*0Sstevel@tonic-gate 		index = PAGE_HASH_FUNC(vp, off);
2548*0Sstevel@tonic-gate top:
2549*0Sstevel@tonic-gate 		ASSERT(phm == NULL);
2550*0Sstevel@tonic-gate 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
2551*0Sstevel@tonic-gate 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2552*0Sstevel@tonic-gate 
2553*0Sstevel@tonic-gate 		if (npp == NULL) {
2554*0Sstevel@tonic-gate 			/*
2555*0Sstevel@tonic-gate 			 * Try to get a page from the freelist (i.e.,
2556*0Sstevel@tonic-gate 			 * a page with no [vp, off] tag).  If that
2557*0Sstevel@tonic-gate 			 * fails, use the cachelist.
2558*0Sstevel@tonic-gate 			 *
2559*0Sstevel@tonic-gate 			 * During the first attempt at both the free
2560*0Sstevel@tonic-gate 			 * and cache lists we try for the correct color.
2561*0Sstevel@tonic-gate 			 */
2562*0Sstevel@tonic-gate 			/*
2563*0Sstevel@tonic-gate 			 * XXXX-how do we deal with virtually indexed
2564*0Sstevel@tonic-gate 			 * caches and colors?
2565*0Sstevel@tonic-gate 			 */
2566*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_cnt[4]);
2567*0Sstevel@tonic-gate 			/*
2568*0Sstevel@tonic-gate 			 * Get lgroup to allocate next page of shared memory
2569*0Sstevel@tonic-gate 			 * from and use it to specify where to allocate
2570*0Sstevel@tonic-gate 			 * the physical memory
2571*0Sstevel@tonic-gate 			 */
2572*0Sstevel@tonic-gate 			lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2573*0Sstevel@tonic-gate 			npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2574*0Sstevel@tonic-gate 			    flags | PG_MATCH_COLOR, lgrp);
2575*0Sstevel@tonic-gate 			if (npp == NULL) {
2576*0Sstevel@tonic-gate 				npp = page_get_cachelist(vp, off, seg,
2577*0Sstevel@tonic-gate 				    vaddr, flags | PG_MATCH_COLOR, lgrp);
2578*0Sstevel@tonic-gate 				if (npp == NULL) {
2579*0Sstevel@tonic-gate 					npp = page_create_get_something(vp,
2580*0Sstevel@tonic-gate 					    off, seg, vaddr,
2581*0Sstevel@tonic-gate 					    flags & ~PG_MATCH_COLOR);
2582*0Sstevel@tonic-gate 				}
2583*0Sstevel@tonic-gate 
2584*0Sstevel@tonic-gate 				if (PP_ISAGED(npp) == 0) {
2585*0Sstevel@tonic-gate 					/*
2586*0Sstevel@tonic-gate 					 * Since this page came from the
2587*0Sstevel@tonic-gate 					 * cachelist, we must destroy the
2588*0Sstevel@tonic-gate 					 * old vnode association.
2589*0Sstevel@tonic-gate 					 */
2590*0Sstevel@tonic-gate 					page_hashout(npp, NULL);
2591*0Sstevel@tonic-gate 				}
2592*0Sstevel@tonic-gate 			}
2593*0Sstevel@tonic-gate 		}
2594*0Sstevel@tonic-gate 
2595*0Sstevel@tonic-gate 		/*
2596*0Sstevel@tonic-gate 		 * We own this page!
2597*0Sstevel@tonic-gate 		 */
2598*0Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(npp));
2599*0Sstevel@tonic-gate 		ASSERT(npp->p_vnode == NULL);
2600*0Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(npp));
2601*0Sstevel@tonic-gate 		PP_CLRFREE(npp);
2602*0Sstevel@tonic-gate 		PP_CLRAGED(npp);
2603*0Sstevel@tonic-gate 
2604*0Sstevel@tonic-gate 		/*
2605*0Sstevel@tonic-gate 		 * Here we have a page in our hot little mitts and are
2606*0Sstevel@tonic-gate 		 * just waiting to stuff it on the appropriate lists.
2607*0Sstevel@tonic-gate 		 * Get the mutex and check to see if it really does
2608*0Sstevel@tonic-gate 		 * not exist.
2609*0Sstevel@tonic-gate 		 */
2610*0Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(index);
2611*0Sstevel@tonic-gate 		mutex_enter(phm);
2612*0Sstevel@tonic-gate 		PAGE_HASH_SEARCH(index, pp, vp, off);
2613*0Sstevel@tonic-gate 		if (pp == NULL) {
2614*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
2615*0Sstevel@tonic-gate 			pp = npp;
2616*0Sstevel@tonic-gate 			npp = NULL;
2617*0Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, phm)) {
2618*0Sstevel@tonic-gate 				/*
2619*0Sstevel@tonic-gate 				 * Since we hold the page hash mutex and
2620*0Sstevel@tonic-gate 				 * just searched for this page, page_hashin
2621*0Sstevel@tonic-gate 				 * had better not fail.  If it does, that
2622*0Sstevel@tonic-gate 				 * means some thread did not follow the
2623*0Sstevel@tonic-gate 				 * page hash mutex rules.  Panic now and
2624*0Sstevel@tonic-gate 				 * get it over with.  As usual, go down
2625*0Sstevel@tonic-gate 				 * holding all the locks.
2626*0Sstevel@tonic-gate 				 */
2627*0Sstevel@tonic-gate 				ASSERT(MUTEX_HELD(phm));
2628*0Sstevel@tonic-gate 				panic("page_create: "
2629*0Sstevel@tonic-gate 				    "hashin failed %p %p %llx %p",
2630*0Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off, (void *)phm);
2631*0Sstevel@tonic-gate 				/*NOTREACHED*/
2632*0Sstevel@tonic-gate 			}
2633*0Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
2634*0Sstevel@tonic-gate 			mutex_exit(phm);
2635*0Sstevel@tonic-gate 			phm = NULL;
2636*0Sstevel@tonic-gate 
2637*0Sstevel@tonic-gate 			/*
2638*0Sstevel@tonic-gate 			 * Hat layer locking need not be done to set
2639*0Sstevel@tonic-gate 			 * the following bits since the page is not hashed
2640*0Sstevel@tonic-gate 			 * and was on the free list (i.e., had no mappings).
2641*0Sstevel@tonic-gate 			 *
2642*0Sstevel@tonic-gate 			 * Set the reference bit to protect
2643*0Sstevel@tonic-gate 			 * against immediate pageout
2644*0Sstevel@tonic-gate 			 *
2645*0Sstevel@tonic-gate 			 * XXXmh modify freelist code to set reference
2646*0Sstevel@tonic-gate 			 * bit so we don't have to do it here.
2647*0Sstevel@tonic-gate 			 */
2648*0Sstevel@tonic-gate 			page_set_props(pp, P_REF);
2649*0Sstevel@tonic-gate 			found_on_free++;
2650*0Sstevel@tonic-gate 		} else {
2651*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_exists);
2652*0Sstevel@tonic-gate 			if (flags & PG_EXCL) {
2653*0Sstevel@tonic-gate 				/*
2654*0Sstevel@tonic-gate 				 * Found an existing page, and the caller
2655*0Sstevel@tonic-gate 				 * wanted all new pages.  Undo all of the work
2656*0Sstevel@tonic-gate 				 * we have done.
2657*0Sstevel@tonic-gate 				 */
2658*0Sstevel@tonic-gate 				mutex_exit(phm);
2659*0Sstevel@tonic-gate 				phm = NULL;
2660*0Sstevel@tonic-gate 				while (plist != NULL) {
2661*0Sstevel@tonic-gate 					pp = plist;
2662*0Sstevel@tonic-gate 					page_sub(&plist, pp);
2663*0Sstevel@tonic-gate 					page_io_unlock(pp);
2664*0Sstevel@tonic-gate 					/* large pages should not end up here */
2665*0Sstevel@tonic-gate 					ASSERT(pp->p_szc == 0);
2666*0Sstevel@tonic-gate 					/*LINTED: constant in conditional ctx*/
2667*0Sstevel@tonic-gate 					VN_DISPOSE(pp, B_INVAL, 0, kcred);
2668*0Sstevel@tonic-gate 				}
2669*0Sstevel@tonic-gate 				VM_STAT_ADD(page_create_found_one);
2670*0Sstevel@tonic-gate 				goto fail;
2671*0Sstevel@tonic-gate 			}
2672*0Sstevel@tonic-gate 			ASSERT(flags & PG_WAIT);
2673*0Sstevel@tonic-gate 			if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2674*0Sstevel@tonic-gate 				/*
2675*0Sstevel@tonic-gate 				 * Start all over again if we blocked trying
2676*0Sstevel@tonic-gate 				 * to lock the page.
2677*0Sstevel@tonic-gate 				 */
2678*0Sstevel@tonic-gate 				mutex_exit(phm);
2679*0Sstevel@tonic-gate 				VM_STAT_ADD(page_create_page_lock_failed);
2680*0Sstevel@tonic-gate 				phm = NULL;
2681*0Sstevel@tonic-gate 				goto top;
2682*0Sstevel@tonic-gate 			}
2683*0Sstevel@tonic-gate 			mutex_exit(phm);
2684*0Sstevel@tonic-gate 			phm = NULL;
2685*0Sstevel@tonic-gate 
2686*0Sstevel@tonic-gate 			if (PP_ISFREE(pp)) {
2687*0Sstevel@tonic-gate 				ASSERT(PP_ISAGED(pp) == 0);
2688*0Sstevel@tonic-gate 				VM_STAT_ADD(pagecnt.pc_get_cache);
2689*0Sstevel@tonic-gate 				page_list_sub(pp, PG_CACHE_LIST);
2690*0Sstevel@tonic-gate 				PP_CLRFREE(pp);
2691*0Sstevel@tonic-gate 				found_on_free++;
2692*0Sstevel@tonic-gate 			}
2693*0Sstevel@tonic-gate 		}
2694*0Sstevel@tonic-gate 
2695*0Sstevel@tonic-gate 		/*
2696*0Sstevel@tonic-gate 		 * Got a page!  It is locked.  Acquire the i/o
2697*0Sstevel@tonic-gate 		 * lock since we are going to use the p_next and
2698*0Sstevel@tonic-gate 		 * p_prev fields to link the requested pages together.
2699*0Sstevel@tonic-gate 		 */
2700*0Sstevel@tonic-gate 		page_io_lock(pp);
2701*0Sstevel@tonic-gate 		page_add(&plist, pp);
2702*0Sstevel@tonic-gate 		plist = plist->p_next;
2703*0Sstevel@tonic-gate 		off += PAGESIZE;
2704*0Sstevel@tonic-gate 		vaddr += PAGESIZE;
2705*0Sstevel@tonic-gate 	}
2706*0Sstevel@tonic-gate 
2707*0Sstevel@tonic-gate 	ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2708*0Sstevel@tonic-gate fail:
2709*0Sstevel@tonic-gate 	if (npp != NULL) {
2710*0Sstevel@tonic-gate 		/*
2711*0Sstevel@tonic-gate 		 * Did not need this page after all.
2712*0Sstevel@tonic-gate 		 * Put it back on the free list.
2713*0Sstevel@tonic-gate 		 */
2714*0Sstevel@tonic-gate 		VM_STAT_ADD(page_create_putbacks);
2715*0Sstevel@tonic-gate 		PP_SETFREE(npp);
2716*0Sstevel@tonic-gate 		PP_SETAGED(npp);
2717*0Sstevel@tonic-gate 		npp->p_offset = (u_offset_t)-1;
2718*0Sstevel@tonic-gate 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2719*0Sstevel@tonic-gate 		page_unlock(npp);
2720*0Sstevel@tonic-gate 
2721*0Sstevel@tonic-gate 	}
2722*0Sstevel@tonic-gate 
2723*0Sstevel@tonic-gate 	ASSERT(pages_req >= found_on_free);
2724*0Sstevel@tonic-gate 
2725*0Sstevel@tonic-gate 	{
2726*0Sstevel@tonic-gate 		uint_t overshoot = (uint_t)(pages_req - found_on_free);
2727*0Sstevel@tonic-gate 
2728*0Sstevel@tonic-gate 		if (overshoot) {
2729*0Sstevel@tonic-gate 			VM_STAT_ADD(page_create_overshoot);
2730*0Sstevel@tonic-gate 			p = &pcf[pcf_index];
2731*0Sstevel@tonic-gate 			p->pcf_touch = 1;
2732*0Sstevel@tonic-gate 			mutex_enter(&p->pcf_lock);
2733*0Sstevel@tonic-gate 			if (p->pcf_block) {
2734*0Sstevel@tonic-gate 				p->pcf_reserve += overshoot;
2735*0Sstevel@tonic-gate 			} else {
2736*0Sstevel@tonic-gate 				p->pcf_count += overshoot;
2737*0Sstevel@tonic-gate 				if (p->pcf_wait) {
2738*0Sstevel@tonic-gate 					mutex_enter(&new_freemem_lock);
2739*0Sstevel@tonic-gate 					if (freemem_wait) {
2740*0Sstevel@tonic-gate 						cv_signal(&freemem_cv);
2741*0Sstevel@tonic-gate 						p->pcf_wait--;
2742*0Sstevel@tonic-gate 					} else {
2743*0Sstevel@tonic-gate 						p->pcf_wait = 0;
2744*0Sstevel@tonic-gate 					}
2745*0Sstevel@tonic-gate 					mutex_exit(&new_freemem_lock);
2746*0Sstevel@tonic-gate 				}
2747*0Sstevel@tonic-gate 			}
2748*0Sstevel@tonic-gate 			mutex_exit(&p->pcf_lock);
2749*0Sstevel@tonic-gate 			/* freemem is approximate, so this test is OK */
2750*0Sstevel@tonic-gate 			if (!p->pcf_block)
2751*0Sstevel@tonic-gate 				freemem += overshoot;
2752*0Sstevel@tonic-gate 		}
2753*0Sstevel@tonic-gate 	}
2754*0Sstevel@tonic-gate 
2755*0Sstevel@tonic-gate 	return (plist);
2756*0Sstevel@tonic-gate }
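
/*
 * Illustrative sketch (not part of this file's build): the allocate-before-
 * lock pattern used by page_create_va() above, which pulls a page off the
 * free list before taking the hash bucket mutex to keep lock hold time
 * short, and puts the page back if an existing entry wins.  The user-space
 * model below (hypothetical names; standard C and pthreads only) shows the
 * same shape with a malloc'd node standing in for the preallocated page.
 */
#if 0	/* example only -- never compiled */
#include <stdlib.h>
#include <pthread.h>

struct node {
	unsigned long	key;
	struct node	*next;
};

#define	NBUCKET	64
static struct node *table[NBUCKET];
static pthread_mutex_t bucket_lock[NBUCKET];	/* init at startup */

/* Find key in its bucket, inserting a fresh node if it is absent. */
static struct node *
lookup_or_insert(unsigned long key)
{
	unsigned h = (unsigned)(key % NBUCKET);
	struct node *np = malloc(sizeof (*np));	/* allocate first ... */
	struct node *pp;

	pthread_mutex_lock(&bucket_lock[h]);	/* ... then lock briefly */
	for (pp = table[h]; pp != NULL; pp = pp->next)
		if (pp->key == key)
			break;
	if (pp == NULL && np != NULL) {		/* absent: insert ours */
		np->key = key;
		np->next = table[h];
		table[h] = np;
		pp = np;
		np = NULL;
	}
	pthread_mutex_unlock(&bucket_lock[h]);

	free(np);	/* existing entry won: put our node back */
	return (pp);
}
#endif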
2757*0Sstevel@tonic-gate 
2758*0Sstevel@tonic-gate /*
2759*0Sstevel@tonic-gate  * One or more constituent pages of this large page have been marked
2760*0Sstevel@tonic-gate  * toxic. Simply demote the large page to PAGESIZE pages and let
2761*0Sstevel@tonic-gate  * page_free() handle it. This routine should only be called by
2762*0Sstevel@tonic-gate  * large page free routines (page_free_pages() and page_destroy_pages()).
2763*0Sstevel@tonic-gate  * All pages are locked SE_EXCL and have already been marked free.
2764*0Sstevel@tonic-gate  */
2765*0Sstevel@tonic-gate static void
2766*0Sstevel@tonic-gate page_free_toxic_pages(page_t *rootpp)
2767*0Sstevel@tonic-gate {
2768*0Sstevel@tonic-gate 	page_t	*tpp;
2769*0Sstevel@tonic-gate 	pgcnt_t	i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2770*0Sstevel@tonic-gate 	uint_t	szc = rootpp->p_szc;
2771*0Sstevel@tonic-gate 
2772*0Sstevel@tonic-gate 	for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2773*0Sstevel@tonic-gate 		ASSERT(tpp->p_szc == szc);
2774*0Sstevel@tonic-gate 		ASSERT((PAGE_EXCL(tpp) &&
2775*0Sstevel@tonic-gate 		    !page_iolock_assert(tpp)) || panicstr);
2776*0Sstevel@tonic-gate 		tpp->p_szc = 0;
2777*0Sstevel@tonic-gate 	}
2778*0Sstevel@tonic-gate 
2779*0Sstevel@tonic-gate 	while (rootpp != NULL) {
2780*0Sstevel@tonic-gate 		tpp = rootpp;
2781*0Sstevel@tonic-gate 		page_sub(&rootpp, tpp);
2782*0Sstevel@tonic-gate 		ASSERT(PP_ISFREE(tpp));
2783*0Sstevel@tonic-gate 		PP_CLRFREE(tpp);
2784*0Sstevel@tonic-gate 		page_free(tpp, 1);
2785*0Sstevel@tonic-gate 	}
2786*0Sstevel@tonic-gate }
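
/*
 * Illustrative sketch (not part of this file's build): page_sub()-style
 * removal from a circular, doubly linked list, as used by the demote-and-
 * free loop above.  The head pointer advances when the head itself is
 * removed and becomes NULL once the last element goes.  Names are
 * hypothetical.
 */
#if 0	/* example only -- never compiled */
#include <stddef.h>

struct elem {
	struct elem	*next;
	struct elem	*prev;
};

static void
list_sub(struct elem **head, struct elem *e)
{
	if (e->next == e) {		/* last element on the list */
		*head = NULL;
	} else {
		e->prev->next = e->next;
		e->next->prev = e->prev;
		if (*head == e)
			*head = e->next;
	}
	e->next = e->prev = e;		/* leave e self-linked */
}
#endif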
2787*0Sstevel@tonic-gate 
2788*0Sstevel@tonic-gate /*
2789*0Sstevel@tonic-gate  * Put page on the "free" list.
2790*0Sstevel@tonic-gate  * The free list is really two lists maintained by
2791*0Sstevel@tonic-gate  * the PSM of whatever machine we happen to be on.
2792*0Sstevel@tonic-gate  */
2793*0Sstevel@tonic-gate void
2794*0Sstevel@tonic-gate page_free(page_t *pp, int dontneed)
2795*0Sstevel@tonic-gate {
2796*0Sstevel@tonic-gate 	struct pcf	*p;
2797*0Sstevel@tonic-gate 	uint_t		pcf_index;
2798*0Sstevel@tonic-gate 
2799*0Sstevel@tonic-gate 	ASSERT((PAGE_EXCL(pp) &&
2800*0Sstevel@tonic-gate 	    !page_iolock_assert(pp)) || panicstr);
2801*0Sstevel@tonic-gate 
2802*0Sstevel@tonic-gate 	if (page_deteriorating(pp)) {
2803*0Sstevel@tonic-gate 		volatile int i = 0;
2804*0Sstevel@tonic-gate 		char *kaddr;
2805*0Sstevel@tonic-gate 		volatile int rb, wb;
2806*0Sstevel@tonic-gate 		uint64_t pa;
2807*0Sstevel@tonic-gate 		volatile int ue = 0;
2808*0Sstevel@tonic-gate 		on_trap_data_t otd;
2809*0Sstevel@tonic-gate 
2810*0Sstevel@tonic-gate 		if (pp->p_vnode != NULL) {
2811*0Sstevel@tonic-gate 			/*
2812*0Sstevel@tonic-gate 			 * Let page_destroy() do its bean counting and
2813*0Sstevel@tonic-gate 			 * hash out the page; it will then call back
2814*0Sstevel@tonic-gate 			 * into page_free() with pp->p_vnode == NULL.
2815*0Sstevel@tonic-gate 			 */
2816*0Sstevel@tonic-gate 			page_destroy(pp, 0);
2817*0Sstevel@tonic-gate 			return;
2818*0Sstevel@tonic-gate 		}
2819*0Sstevel@tonic-gate 
2820*0Sstevel@tonic-gate 		if (page_isfailing(pp)) {
2821*0Sstevel@tonic-gate 			/*
2822*0Sstevel@tonic-gate 			 * If we have already exceeded the limit for
2823*0Sstevel@tonic-gate 			 * pages retired, we will treat this page as
2824*0Sstevel@tonic-gate 			 * 'toxic' rather than failing. That will ensure
2825*0Sstevel@tonic-gate 			 * that the page is at least cleaned, and if
2826*0Sstevel@tonic-gate 			 * a UE is detected, the page will be retired
2827*0Sstevel@tonic-gate 			 * anyway.
2828*0Sstevel@tonic-gate 			 */
2829*0Sstevel@tonic-gate 			if (pages_retired_limit_exceeded()) {
2830*0Sstevel@tonic-gate 				/*
2831*0Sstevel@tonic-gate 				 * clear the flag and reset to toxic
2832*0Sstevel@tonic-gate 				 */
2833*0Sstevel@tonic-gate 				page_clrtoxic(pp);
2834*0Sstevel@tonic-gate 				page_settoxic(pp, PAGE_IS_TOXIC);
2835*0Sstevel@tonic-gate 			} else {
2836*0Sstevel@tonic-gate 				pa = ptob((uint64_t)page_pptonum(pp));
2837*0Sstevel@tonic-gate 				if (page_retire_messages) {
2838*0Sstevel@tonic-gate 					cmn_err(CE_NOTE, "Page 0x%08x.%08x "
2839*0Sstevel@tonic-gate 					    "removed from service",
2840*0Sstevel@tonic-gate 					    (uint32_t)(pa >> 32), (uint32_t)pa);
2841*0Sstevel@tonic-gate 				}
2842*0Sstevel@tonic-gate 				goto page_failed;
2843*0Sstevel@tonic-gate 			}
2844*0Sstevel@tonic-gate 		}
2845*0Sstevel@tonic-gate 
2846*0Sstevel@tonic-gate 		pagescrub(pp, 0, PAGESIZE);
2847*0Sstevel@tonic-gate 
2848*0Sstevel@tonic-gate 		/*
2849*0Sstevel@tonic-gate 		 * We want to determine whether the error that occurred on
2850*0Sstevel@tonic-gate 		 * this page is transient or persistent, so we get a mapping
2851*0Sstevel@tonic-gate 		 * to the page and try every possible bit pattern to compare
2852*0Sstevel@tonic-gate 		 * what we write with what we read back.  A smaller number
2853*0Sstevel@tonic-gate 		 * of bit patterns might suffice, but there's no point in
2854*0Sstevel@tonic-gate 		 * getting fancy.  If this is the hot path on your system,
2855*0Sstevel@tonic-gate 		 * you've got bigger problems.
2856*0Sstevel@tonic-gate 		 */
2857*0Sstevel@tonic-gate 		kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
2858*0Sstevel@tonic-gate 		for (wb = 0xff; wb >= 0; wb--) {
2859*0Sstevel@tonic-gate 			if (on_trap(&otd, OT_DATA_EC)) {
2860*0Sstevel@tonic-gate 				pa = ptob((uint64_t)page_pptonum(pp)) + i;
2861*0Sstevel@tonic-gate 				page_settoxic(pp, PAGE_IS_FAILING);
2862*0Sstevel@tonic-gate 
2863*0Sstevel@tonic-gate 				if (page_retire_messages) {
2864*0Sstevel@tonic-gate 					cmn_err(CE_WARN, "Uncorrectable Error "
2865*0Sstevel@tonic-gate 					    "occurred at PA 0x%08x.%08x while "
2866*0Sstevel@tonic-gate 					    "attempting to clear previously "
2867*0Sstevel@tonic-gate 					    "reported error; page removed from "
2868*0Sstevel@tonic-gate 					    "service", (uint32_t)(pa >> 32),
2869*0Sstevel@tonic-gate 					    (uint32_t)pa);
2870*0Sstevel@tonic-gate 				}
2871*0Sstevel@tonic-gate 
2872*0Sstevel@tonic-gate 				ue++;
2873*0Sstevel@tonic-gate 				break;
2874*0Sstevel@tonic-gate 			}
2875*0Sstevel@tonic-gate 
2876*0Sstevel@tonic-gate 			/*
2877*0Sstevel@tonic-gate 			 * Write out the bit pattern, flush it to memory, and
2878*0Sstevel@tonic-gate 			 * read it back while under on_trap() protection.
2879*0Sstevel@tonic-gate 			 */
2880*0Sstevel@tonic-gate 			for (i = 0; i < PAGESIZE; i++)
2881*0Sstevel@tonic-gate 				kaddr[i] = wb;
2882*0Sstevel@tonic-gate 
2883*0Sstevel@tonic-gate 			sync_data_memory(kaddr, PAGESIZE);
2884*0Sstevel@tonic-gate 
2885*0Sstevel@tonic-gate 			for (i = 0; i < PAGESIZE; i++) {
2886*0Sstevel@tonic-gate 				if ((rb = (uchar_t)kaddr[i]) != wb) {
2887*0Sstevel@tonic-gate 					page_settoxic(pp, PAGE_IS_FAILING);
2888*0Sstevel@tonic-gate 					goto out;
2889*0Sstevel@tonic-gate 				}
2890*0Sstevel@tonic-gate 			}
2891*0Sstevel@tonic-gate 		}
2892*0Sstevel@tonic-gate out:
2893*0Sstevel@tonic-gate 		no_trap();
2894*0Sstevel@tonic-gate 		ppmapout(kaddr);
2895*0Sstevel@tonic-gate 
2896*0Sstevel@tonic-gate 		if (wb >= 0 && !ue) {
2897*0Sstevel@tonic-gate 			pa = ptob((uint64_t)page_pptonum(pp)) + i;
2898*0Sstevel@tonic-gate 			if (page_retire_messages) {
2899*0Sstevel@tonic-gate 				cmn_err(CE_WARN, "Data Mismatch occurred at PA "
2900*0Sstevel@tonic-gate 				    "0x%08x.%08x [ 0x%x != 0x%x ] while "
2901*0Sstevel@tonic-gate 				    "attempting to clear previously reported "
2902*0Sstevel@tonic-gate 				    "error; page removed from service",
2903*0Sstevel@tonic-gate 				    (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb);
2904*0Sstevel@tonic-gate 			}
2905*0Sstevel@tonic-gate 		}
2906*0Sstevel@tonic-gate page_failed:
2907*0Sstevel@tonic-gate 		/*
2908*0Sstevel@tonic-gate 		 * DR operations change the association between a page_t
2909*0Sstevel@tonic-gate 		 * and the physical page it represents. Check if the
2910*0Sstevel@tonic-gate 		 * page is still bad. If it is, then retire it.
2911*0Sstevel@tonic-gate 		 */
2912*0Sstevel@tonic-gate 		if (page_isfaulty(pp) && page_isfailing(pp)) {
2913*0Sstevel@tonic-gate 			/*
2914*0Sstevel@tonic-gate 			 * In the future, it might be useful to have a platform
2915*0Sstevel@tonic-gate 			 * callback here to tell the hardware to fence off this
2916*0Sstevel@tonic-gate 			 * page during the next reboot.
2917*0Sstevel@tonic-gate 			 *
2918*0Sstevel@tonic-gate 			 * We move the page to the retired_vnode here
2919*0Sstevel@tonic-gate 			 * We move the page to the retired_ppages vnode here.
2920*0Sstevel@tonic-gate 			(void) page_hashin(pp, &retired_ppages,
2921*0Sstevel@tonic-gate 			    (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL);
2922*0Sstevel@tonic-gate 			mutex_enter(&freemem_lock);
2923*0Sstevel@tonic-gate 			availrmem--;
2924*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
2925*0Sstevel@tonic-gate 			page_retired(pp);
2926*0Sstevel@tonic-gate 			page_downgrade(pp);
2927*0Sstevel@tonic-gate 
2928*0Sstevel@tonic-gate 			/*
2929*0Sstevel@tonic-gate 			 * If DR raced with the above page retirement code,
2930*0Sstevel@tonic-gate 			 * we might have retired a good page. If so, unretire
2931*0Sstevel@tonic-gate 			 * the page.
2932*0Sstevel@tonic-gate 			 */
2933*0Sstevel@tonic-gate 			if (!page_isfaulty(pp))
2934*0Sstevel@tonic-gate 				page_unretire_pages();
2935*0Sstevel@tonic-gate 			return;
2936*0Sstevel@tonic-gate 		}
2937*0Sstevel@tonic-gate 
2938*0Sstevel@tonic-gate 		pa = ptob((uint64_t)page_pptonum(pp));
2939*0Sstevel@tonic-gate 
2940*0Sstevel@tonic-gate 		if (page_retire_messages) {
2941*0Sstevel@tonic-gate 			cmn_err(CE_NOTE, "Previously reported error on page "
2942*0Sstevel@tonic-gate 			    "0x%08x.%08x cleared", (uint32_t)(pa >> 32),
2943*0Sstevel@tonic-gate 			    (uint32_t)pa);
2944*0Sstevel@tonic-gate 		}
2945*0Sstevel@tonic-gate 
2946*0Sstevel@tonic-gate 		page_clrtoxic(pp);
2947*0Sstevel@tonic-gate 	}
2948*0Sstevel@tonic-gate 
2949*0Sstevel@tonic-gate 	if (PP_ISFREE(pp)) {
2950*0Sstevel@tonic-gate 		panic("page_free: page %p is free", (void *)pp);
2951*0Sstevel@tonic-gate 	}
2952*0Sstevel@tonic-gate 
2953*0Sstevel@tonic-gate 	if (pp->p_szc != 0) {
2954*0Sstevel@tonic-gate 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2955*0Sstevel@tonic-gate 		    pp->p_vnode == &kvp) {
2956*0Sstevel@tonic-gate 			panic("page_free: anon or kernel "
2957*0Sstevel@tonic-gate 			    "or no vnode large page %p", (void *)pp);
2958*0Sstevel@tonic-gate 		}
2959*0Sstevel@tonic-gate 		page_demote_vp_pages(pp);
2960*0Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
2961*0Sstevel@tonic-gate 	}
2962*0Sstevel@tonic-gate 
2963*0Sstevel@tonic-gate 	/*
2964*0Sstevel@tonic-gate 	 * The page_struct_lock need not be acquired to examine these
2965*0Sstevel@tonic-gate 	 * fields since the page has an "exclusive" lock.
2966*0Sstevel@tonic-gate 	 */
2967*0Sstevel@tonic-gate 	if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2968*0Sstevel@tonic-gate 		panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d",
2969*0Sstevel@tonic-gate 		    (void *)pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt);
2970*0Sstevel@tonic-gate 		/*NOTREACHED*/
2971*0Sstevel@tonic-gate 	}
2972*0Sstevel@tonic-gate 
2973*0Sstevel@tonic-gate 	ASSERT(!hat_page_getshare(pp));
2974*0Sstevel@tonic-gate 
2975*0Sstevel@tonic-gate 	PP_SETFREE(pp);
2976*0Sstevel@tonic-gate 	ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2977*0Sstevel@tonic-gate 	    !hat_ismod(pp));
2978*0Sstevel@tonic-gate 	page_clr_all_props(pp);
2979*0Sstevel@tonic-gate 	ASSERT(!hat_page_getshare(pp));
2980*0Sstevel@tonic-gate 
2981*0Sstevel@tonic-gate 	/*
2982*0Sstevel@tonic-gate 	 * Now we add the page to the head of the free list.
2983*0Sstevel@tonic-gate 	 * But if this page is associated with a paged vnode
2984*0Sstevel@tonic-gate 	 * then we adjust the head forward so that the page is
2985*0Sstevel@tonic-gate 	 * effectively at the end of the list.
2986*0Sstevel@tonic-gate 	 */
2987*0Sstevel@tonic-gate 	if (pp->p_vnode == NULL) {
2988*0Sstevel@tonic-gate 		/*
2989*0Sstevel@tonic-gate 		 * Page has no identity, put it on the free list.
2990*0Sstevel@tonic-gate 		 */
2991*0Sstevel@tonic-gate 		PP_SETAGED(pp);
2992*0Sstevel@tonic-gate 		pp->p_offset = (u_offset_t)-1;
2993*0Sstevel@tonic-gate 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2994*0Sstevel@tonic-gate 		VM_STAT_ADD(pagecnt.pc_free_free);
2995*0Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2996*0Sstevel@tonic-gate 		    "page_free_free:pp %p", pp);
2997*0Sstevel@tonic-gate 	} else {
2998*0Sstevel@tonic-gate 		PP_CLRAGED(pp);
2999*0Sstevel@tonic-gate 
3000*0Sstevel@tonic-gate 		if (!dontneed || nopageage) {
3001*0Sstevel@tonic-gate 			/* move it to the tail of the list */
3002*0Sstevel@tonic-gate 			page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
3003*0Sstevel@tonic-gate 
3004*0Sstevel@tonic-gate 			VM_STAT_ADD(pagecnt.pc_free_cache);
3005*0Sstevel@tonic-gate 			TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
3006*0Sstevel@tonic-gate 			    "page_free_cache_tail:pp %p", pp);
3007*0Sstevel@tonic-gate 		} else {
3008*0Sstevel@tonic-gate 			page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
3009*0Sstevel@tonic-gate 
3010*0Sstevel@tonic-gate 			VM_STAT_ADD(pagecnt.pc_free_dontneed);
3011*0Sstevel@tonic-gate 			TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
3012*0Sstevel@tonic-gate 			    "page_free_cache_head:pp %p", pp);
3013*0Sstevel@tonic-gate 		}
3014*0Sstevel@tonic-gate 	}
3015*0Sstevel@tonic-gate 	page_unlock(pp);
3016*0Sstevel@tonic-gate 
3017*0Sstevel@tonic-gate 	/*
3018*0Sstevel@tonic-gate 	 * Now do the `freemem' accounting.
3019*0Sstevel@tonic-gate 	 */
3020*0Sstevel@tonic-gate 	pcf_index = PCF_INDEX();
3021*0Sstevel@tonic-gate 	p = &pcf[pcf_index];
3022*0Sstevel@tonic-gate 	p->pcf_touch = 1;
3023*0Sstevel@tonic-gate 
3024*0Sstevel@tonic-gate 	mutex_enter(&p->pcf_lock);
3025*0Sstevel@tonic-gate 	if (p->pcf_block) {
3026*0Sstevel@tonic-gate 		p->pcf_reserve += 1;
3027*0Sstevel@tonic-gate 	} else {
3028*0Sstevel@tonic-gate 		p->pcf_count += 1;
3029*0Sstevel@tonic-gate 		if (p->pcf_wait) {
3030*0Sstevel@tonic-gate 			mutex_enter(&new_freemem_lock);
3031*0Sstevel@tonic-gate 			/*
3032*0Sstevel@tonic-gate 			 * Check to see if some other thread
3033*0Sstevel@tonic-gate 			 * is actually waiting.  Another bucket
3034*0Sstevel@tonic-gate 			 * may have woken it up by now.  If there
3035*0Sstevel@tonic-gate 			 * are no waiters, then set our pcf_wait
3036*0Sstevel@tonic-gate 			 * count to zero to avoid coming in here
3037*0Sstevel@tonic-gate 			 * next time.  Also, since only one page
3038*0Sstevel@tonic-gate 			 * was put on the free list, just wake
3039*0Sstevel@tonic-gate 			 * up one waiter.
3040*0Sstevel@tonic-gate 			 */
3041*0Sstevel@tonic-gate 			if (freemem_wait) {
3042*0Sstevel@tonic-gate 				cv_signal(&freemem_cv);
3043*0Sstevel@tonic-gate 				p->pcf_wait--;
3044*0Sstevel@tonic-gate 			} else {
3045*0Sstevel@tonic-gate 				p->pcf_wait = 0;
3046*0Sstevel@tonic-gate 			}
3047*0Sstevel@tonic-gate 			mutex_exit(&new_freemem_lock);
3048*0Sstevel@tonic-gate 		}
3049*0Sstevel@tonic-gate 	}
3050*0Sstevel@tonic-gate 	mutex_exit(&p->pcf_lock);
3051*0Sstevel@tonic-gate 
3052*0Sstevel@tonic-gate 	/* freemem is approximate, so this test is OK */
3053*0Sstevel@tonic-gate 	if (!p->pcf_block)
3054*0Sstevel@tonic-gate 		freemem += 1;
3055*0Sstevel@tonic-gate }
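
/*
 * Illustrative sketch (not part of this file's build): the transient-vs-
 * persistent test in page_free() above writes every byte pattern from 0xff
 * down to 0x00 across the page, forces it to memory, and reads it back.
 * The user-space model below shows just the compare loop; it has
 * hypothetical names and no trap protection, which only the kernel's
 * on_trap() machinery can provide.
 */
#if 0	/* example only -- never compiled */
#include <stddef.h>

/* Returns 0 if buf holds every pattern faithfully, -1 on a mismatch. */
static int
pattern_test(volatile unsigned char *buf, size_t len)
{
	int wb;
	size_t i;

	for (wb = 0xff; wb >= 0; wb--) {
		for (i = 0; i < len; i++)
			buf[i] = (unsigned char)wb;
		/* A real test flushes caches here (cf. sync_data_memory()). */
		for (i = 0; i < len; i++)
			if (buf[i] != (unsigned char)wb)
				return (-1);	/* persistent bit damage */
	}
	return (0);
}
#endif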
3056*0Sstevel@tonic-gate 
3057*0Sstevel@tonic-gate /*
3058*0Sstevel@tonic-gate  * Put page on the "free" list during initial startup.
3059*0Sstevel@tonic-gate  * This happens during initial single-threaded execution.
3060*0Sstevel@tonic-gate  */
3061*0Sstevel@tonic-gate void
3062*0Sstevel@tonic-gate page_free_at_startup(page_t *pp)
3063*0Sstevel@tonic-gate {
3064*0Sstevel@tonic-gate 	struct pcf	*p;
3065*0Sstevel@tonic-gate 	uint_t		pcf_index;
3066*0Sstevel@tonic-gate 
3067*0Sstevel@tonic-gate 	page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
3068*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_free_free);
3069*0Sstevel@tonic-gate 
3070*0Sstevel@tonic-gate 	/*
3071*0Sstevel@tonic-gate 	 * Now do the `freemem' accounting.
3072*0Sstevel@tonic-gate 	 */
3073*0Sstevel@tonic-gate 	pcf_index = PCF_INDEX();
3074*0Sstevel@tonic-gate 	p = &pcf[pcf_index];
3075*0Sstevel@tonic-gate 	p->pcf_touch = 1;
3076*0Sstevel@tonic-gate 
3077*0Sstevel@tonic-gate 	ASSERT(p->pcf_block == 0);
3078*0Sstevel@tonic-gate 	ASSERT(p->pcf_wait == 0);
3079*0Sstevel@tonic-gate 	p->pcf_count += 1;
3080*0Sstevel@tonic-gate 
3081*0Sstevel@tonic-gate 	/* freemem is approximate, so this is OK */
3082*0Sstevel@tonic-gate 	freemem += 1;
3083*0Sstevel@tonic-gate }
3084*0Sstevel@tonic-gate 
3085*0Sstevel@tonic-gate void
3086*0Sstevel@tonic-gate page_free_pages(page_t *pp)
3087*0Sstevel@tonic-gate {
3088*0Sstevel@tonic-gate 	page_t	*tpp, *rootpp = NULL;
3089*0Sstevel@tonic-gate 	pgcnt_t	pgcnt = page_get_pagecnt(pp->p_szc);
3090*0Sstevel@tonic-gate 	pgcnt_t	i;
3091*0Sstevel@tonic-gate 	uint_t	szc = pp->p_szc;
3092*0Sstevel@tonic-gate 	int	toxic = 0;
3093*0Sstevel@tonic-gate 
3094*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_free_pages);
3095*0Sstevel@tonic-gate 	TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
3096*0Sstevel@tonic-gate 	    "page_free_free:pp %p", pp);
3097*0Sstevel@tonic-gate 
3098*0Sstevel@tonic-gate 	ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3099*0Sstevel@tonic-gate 	if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3100*0Sstevel@tonic-gate 		panic("page_free_pages: not root page %p", (void *)pp);
3101*0Sstevel@tonic-gate 		/*NOTREACHED*/
3102*0Sstevel@tonic-gate 	}
3103*0Sstevel@tonic-gate 
3104*0Sstevel@tonic-gate 	for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) {
3105*0Sstevel@tonic-gate 		ASSERT((PAGE_EXCL(tpp) &&
3106*0Sstevel@tonic-gate 		    !page_iolock_assert(tpp)) || panicstr);
3107*0Sstevel@tonic-gate 		if (PP_ISFREE(tpp)) {
3108*0Sstevel@tonic-gate 			panic("page_free_pages: page %p is free", (void *)tpp);
3109*0Sstevel@tonic-gate 			/*NOTREACHED*/
3110*0Sstevel@tonic-gate 		}
3111*0Sstevel@tonic-gate 		if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
3112*0Sstevel@tonic-gate 		    tpp->p_cowcnt != 0) {
3113*0Sstevel@tonic-gate 			panic("page_free_pages %p", (void *)tpp);
3114*0Sstevel@tonic-gate 			/*NOTREACHED*/
3115*0Sstevel@tonic-gate 		}
3116*0Sstevel@tonic-gate 
3117*0Sstevel@tonic-gate 		ASSERT(!hat_page_getshare(tpp));
3118*0Sstevel@tonic-gate 		ASSERT(tpp->p_vnode == NULL);
3119*0Sstevel@tonic-gate 		ASSERT(tpp->p_szc == szc);
3120*0Sstevel@tonic-gate 
3121*0Sstevel@tonic-gate 		if (page_deteriorating(tpp))
3122*0Sstevel@tonic-gate 			toxic = 1;
3123*0Sstevel@tonic-gate 
3124*0Sstevel@tonic-gate 		PP_SETFREE(tpp);
3125*0Sstevel@tonic-gate 		page_clr_all_props(tpp);
3126*0Sstevel@tonic-gate 		PP_SETAGED(tpp);
3127*0Sstevel@tonic-gate 		tpp->p_offset = (u_offset_t)-1;
3128*0Sstevel@tonic-gate 		ASSERT(tpp->p_next == tpp);
3129*0Sstevel@tonic-gate 		ASSERT(tpp->p_prev == tpp);
3130*0Sstevel@tonic-gate 		page_list_concat(&rootpp, &tpp);
3131*0Sstevel@tonic-gate 	}
3132*0Sstevel@tonic-gate 	ASSERT(rootpp == pp);
3133*0Sstevel@tonic-gate 
3134*0Sstevel@tonic-gate 	if (toxic) {
3135*0Sstevel@tonic-gate 		page_free_toxic_pages(rootpp);
3136*0Sstevel@tonic-gate 		return;
3137*0Sstevel@tonic-gate 	}
3138*0Sstevel@tonic-gate 	page_list_add_pages(rootpp, 0);
3139*0Sstevel@tonic-gate 	page_create_putback(pgcnt);
3140*0Sstevel@tonic-gate }
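
/*
 * Illustrative sketch (not part of this file's build): the root-page check
 * above relies on pgcnt being a power of two, so "pfn & (pgcnt - 1)"
 * isolates the offset of pfn within its naturally aligned pgcnt-page
 * block; a nonzero result means pp is not the first (root) constituent
 * page.  Hypothetical names.
 */
#if 0	/* example only -- never compiled */
/* Returns nonzero iff pfn is naturally aligned to a block of n pages. */
static int
is_root_pfn(unsigned long pfn, unsigned long n)	/* n must be 2^k */
{
	return ((pfn & (n - 1)) == 0);
}
#endif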
3141*0Sstevel@tonic-gate 
3142*0Sstevel@tonic-gate int free_pages = 1;
3143*0Sstevel@tonic-gate 
3144*0Sstevel@tonic-gate /*
3145*0Sstevel@tonic-gate  * This routine attempts to return pages to the cachelist via page_release().
3146*0Sstevel@tonic-gate  * It does not *have* to be successful in all cases, since the pageout scanner
3147*0Sstevel@tonic-gate  * will catch any pages it misses.  It does need to be fast and not introduce
3148*0Sstevel@tonic-gate  * too much overhead.
3149*0Sstevel@tonic-gate  *
3150*0Sstevel@tonic-gate  * If a page isn't found on the unlocked sweep of the page_hash bucket, we
3151*0Sstevel@tonic-gate  * don't lock and retry.  This is ok, since the page scanner will eventually
3152*0Sstevel@tonic-gate  * find any page we miss in free_vp_pages().
3153*0Sstevel@tonic-gate  */
3154*0Sstevel@tonic-gate void
3155*0Sstevel@tonic-gate free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
3156*0Sstevel@tonic-gate {
3157*0Sstevel@tonic-gate 	page_t *pp;
3158*0Sstevel@tonic-gate 	u_offset_t eoff;
3159*0Sstevel@tonic-gate 	extern int swap_in_range(vnode_t *, u_offset_t, size_t);
3160*0Sstevel@tonic-gate 
3161*0Sstevel@tonic-gate 	eoff = off + len;
3162*0Sstevel@tonic-gate 
3163*0Sstevel@tonic-gate 	if (free_pages == 0)
3164*0Sstevel@tonic-gate 		return;
3165*0Sstevel@tonic-gate 	if (swap_in_range(vp, off, len))
3166*0Sstevel@tonic-gate 		return;
3167*0Sstevel@tonic-gate 
3168*0Sstevel@tonic-gate 	for (; off < eoff; off += PAGESIZE) {
3169*0Sstevel@tonic-gate 
3170*0Sstevel@tonic-gate 		/*
3171*0Sstevel@tonic-gate 		 * find the page using a fast, but inexact search. It'll be OK
3172*0Sstevel@tonic-gate 		 * if a few pages slip through the cracks here.
3173*0Sstevel@tonic-gate 		 */
3174*0Sstevel@tonic-gate 		pp = page_exists(vp, off);
3175*0Sstevel@tonic-gate 
3176*0Sstevel@tonic-gate 		/*
3177*0Sstevel@tonic-gate 		 * If we didn't find the page (it may not exist), if it
3178*0Sstevel@tonic-gate 		 * is free or still appears to be in use (shared), or if
3179*0Sstevel@tonic-gate 		 * we can't lock it, just give up.
3180*0Sstevel@tonic-gate 		 */
3181*0Sstevel@tonic-gate 		if (pp == NULL ||
3182*0Sstevel@tonic-gate 		    PP_ISFREE(pp) ||
3183*0Sstevel@tonic-gate 		    page_share_cnt(pp) > 0 ||
3184*0Sstevel@tonic-gate 		    !page_trylock(pp, SE_EXCL))
3185*0Sstevel@tonic-gate 			continue;
3186*0Sstevel@tonic-gate 
3187*0Sstevel@tonic-gate 		/*
3188*0Sstevel@tonic-gate 		 * Once we have locked pp, verify that it's still the
3189*0Sstevel@tonic-gate 		 * correct page and not already free
3190*0Sstevel@tonic-gate 		 */
3191*0Sstevel@tonic-gate 		ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
3192*0Sstevel@tonic-gate 		if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
3193*0Sstevel@tonic-gate 			page_unlock(pp);
3194*0Sstevel@tonic-gate 			continue;
3195*0Sstevel@tonic-gate 		}
3196*0Sstevel@tonic-gate 
3197*0Sstevel@tonic-gate 		/*
3198*0Sstevel@tonic-gate 		 * try to release the page...
3199*0Sstevel@tonic-gate 		 */
3200*0Sstevel@tonic-gate 		(void) page_release(pp, 1);
3201*0Sstevel@tonic-gate 	}
3202*0Sstevel@tonic-gate }
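
/*
 * Illustrative sketch (not part of this file's build): the lock-free
 * lookup followed by trylock-and-revalidate used in free_vp_pages()
 * above.  Because the lookup runs unlocked, the object can change
 * identity before we lock it, so every field checked before the lock
 * must be re-checked after.  Hypothetical names; pthreads only.
 */
#if 0	/* example only -- never compiled */
#include <stddef.h>
#include <pthread.h>

struct obj {
	pthread_mutex_t	lock;
	int		id;	/* identity, may change while unlocked */
	int		busy;
};

static void
try_release(struct obj *op, int want_id)
{
	if (op == NULL || op->busy)		/* inexact, unlocked peek */
		return;
	if (pthread_mutex_trylock(&op->lock) != 0)
		return;				/* contended: just give up */

	/* Revalidate under the lock: is it still the object we wanted? */
	if (op->id != want_id || op->busy) {
		pthread_mutex_unlock(&op->lock);
		return;
	}
	/* ... safe to release/recycle op here ... */
	pthread_mutex_unlock(&op->lock);
}
#endif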
3203*0Sstevel@tonic-gate 
3204*0Sstevel@tonic-gate /*
3205*0Sstevel@tonic-gate  * Reclaim the given page from the free list.
3206*0Sstevel@tonic-gate  * Returns 1 on success or 0 on failure.
3207*0Sstevel@tonic-gate  *
3208*0Sstevel@tonic-gate  * The page is unlocked if it can't be reclaimed (when freemem == 0).
3209*0Sstevel@tonic-gate  * If `lock' is non-null, it will be dropped and re-acquired if
3210*0Sstevel@tonic-gate  * the routine must wait while freemem is 0.
3211*0Sstevel@tonic-gate  *
3212*0Sstevel@tonic-gate  * As it turns out, boot_getpages() does this.  It picks a page,
3213*0Sstevel@tonic-gate  * based on where OBP mapped in some address, gets its pfn, searches
3214*0Sstevel@tonic-gate  * the memsegs, locks the page, then pulls it off the free list!
3215*0Sstevel@tonic-gate  */
3216*0Sstevel@tonic-gate int
3217*0Sstevel@tonic-gate page_reclaim(page_t *pp, kmutex_t *lock)
3218*0Sstevel@tonic-gate {
3219*0Sstevel@tonic-gate 	struct pcf	*p;
3220*0Sstevel@tonic-gate 	uint_t		pcf_index;
3221*0Sstevel@tonic-gate 	struct cpu	*cpup;
3222*0Sstevel@tonic-gate 	int		enough;
3223*0Sstevel@tonic-gate 	uint_t		i;
3224*0Sstevel@tonic-gate 
3225*0Sstevel@tonic-gate 	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
3226*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
3227*0Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
3228*0Sstevel@tonic-gate 
3229*0Sstevel@tonic-gate 	/*
3230*0Sstevel@tonic-gate 	 * If `freemem' is 0, we cannot reclaim this page from the
3231*0Sstevel@tonic-gate 	 * freelist, so release every lock we might hold: the page,
3232*0Sstevel@tonic-gate 	 * and the `lock' before blocking.
3233*0Sstevel@tonic-gate 	 *
3234*0Sstevel@tonic-gate 	 * The only way `freemem' can become 0 while there are pages
3235*0Sstevel@tonic-gate 	 * marked free (have their p->p_free bit set) is when the
3236*0Sstevel@tonic-gate 	 * system is low on memory and doing a page_create().  Since
3237*0Sstevel@tonic-gate 	 * page_create() decreased `freemem' by the requested amount up
3238*0Sstevel@tonic-gate 	 * front, it is guaranteed to get every page it needs once it
3239*0Sstevel@tonic-gate 	 * starts acquiring them.  So, we need to release this page,
3240*0Sstevel@tonic-gate 	 * and let page_create() have it.
3241*0Sstevel@tonic-gate 	 *
3242*0Sstevel@tonic-gate 	 * Since `freemem' being zero is not supposed to happen, just
3243*0Sstevel@tonic-gate 	 * use the usual hash stuff as a starting point.  If that bucket
3244*0Sstevel@tonic-gate 	 * is empty, then assume the worst, and start at the beginning
3245*0Sstevel@tonic-gate 	 * of the pcf array.  If we always start at the beginning
3246*0Sstevel@tonic-gate 	 * when acquiring more than one pcf lock, there won't be any
3247*0Sstevel@tonic-gate 	 * deadlock problems.
3248*0Sstevel@tonic-gate 	 */
3249*0Sstevel@tonic-gate 
3250*0Sstevel@tonic-gate 	/* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
3251*0Sstevel@tonic-gate 
3252*0Sstevel@tonic-gate 	if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
3253*0Sstevel@tonic-gate 		pcf_acquire_all();
3254*0Sstevel@tonic-gate 		goto page_reclaim_nomem;
3255*0Sstevel@tonic-gate 	}
3256*0Sstevel@tonic-gate 
3257*0Sstevel@tonic-gate 	enough = 0;
3258*0Sstevel@tonic-gate 	pcf_index = PCF_INDEX();
3259*0Sstevel@tonic-gate 	p = &pcf[pcf_index];
3260*0Sstevel@tonic-gate 	p->pcf_touch = 1;
3261*0Sstevel@tonic-gate 	mutex_enter(&p->pcf_lock);
3262*0Sstevel@tonic-gate 	if (p->pcf_count >= 1) {
3263*0Sstevel@tonic-gate 		enough = 1;
3264*0Sstevel@tonic-gate 		p->pcf_count--;
3265*0Sstevel@tonic-gate 	}
3266*0Sstevel@tonic-gate 	mutex_exit(&p->pcf_lock);
3267*0Sstevel@tonic-gate 
3268*0Sstevel@tonic-gate 	if (!enough) {
3269*0Sstevel@tonic-gate 		VM_STAT_ADD(page_reclaim_zero);
3270*0Sstevel@tonic-gate 		/*
3271*0Sstevel@tonic-gate 		 * Check again. It's possible that some other thread
3272*0Sstevel@tonic-gate 		 * could have been right behind us, and added one
3273*0Sstevel@tonic-gate 		 * to a list somewhere.  Acquire each of the pcf locks
3274*0Sstevel@tonic-gate 		 * until we find a page.
3275*0Sstevel@tonic-gate 		 */
3276*0Sstevel@tonic-gate 		p = pcf;
3277*0Sstevel@tonic-gate 		for (i = 0; i < PCF_FANOUT; i++) {
3278*0Sstevel@tonic-gate 			p->pcf_touch = 1;
3279*0Sstevel@tonic-gate 			mutex_enter(&p->pcf_lock);
3280*0Sstevel@tonic-gate 			if (p->pcf_count >= 1) {
3281*0Sstevel@tonic-gate 				p->pcf_count -= 1;
3282*0Sstevel@tonic-gate 				enough = 1;
3283*0Sstevel@tonic-gate 				break;
3284*0Sstevel@tonic-gate 			}
3285*0Sstevel@tonic-gate 			p++;
3286*0Sstevel@tonic-gate 		}
3287*0Sstevel@tonic-gate 
3288*0Sstevel@tonic-gate 		if (!enough) {
3289*0Sstevel@tonic-gate page_reclaim_nomem:
3290*0Sstevel@tonic-gate 			/*
3291*0Sstevel@tonic-gate 			 * We really can't have page `pp'.
3292*0Sstevel@tonic-gate 			 * Time for the no-memory dance with
3293*0Sstevel@tonic-gate 			 * page_free().  This is just like
3294*0Sstevel@tonic-gate 			 * page_create_wait().  Plus the added
3295*0Sstevel@tonic-gate 			 * attraction of releasing whatever mutex
3296*0Sstevel@tonic-gate 			 * we were passed in `lock'.
3297*0Sstevel@tonic-gate 			 * page_unlock() will wake up any thread
3298*0Sstevel@tonic-gate 			 * waiting around for this page.
3299*0Sstevel@tonic-gate 			 */
3300*0Sstevel@tonic-gate 			if (lock) {
3301*0Sstevel@tonic-gate 				VM_STAT_ADD(page_reclaim_zero_locked);
3302*0Sstevel@tonic-gate 				mutex_exit(lock);
3303*0Sstevel@tonic-gate 			}
3304*0Sstevel@tonic-gate 			page_unlock(pp);
3305*0Sstevel@tonic-gate 
3306*0Sstevel@tonic-gate 			/*
3307*0Sstevel@tonic-gate 			 * get this before we drop all the pcf locks.
3308*0Sstevel@tonic-gate 			 */
3309*0Sstevel@tonic-gate 			mutex_enter(&new_freemem_lock);
3310*0Sstevel@tonic-gate 
3311*0Sstevel@tonic-gate 			p = pcf;
3312*0Sstevel@tonic-gate 			for (i = 0; i < PCF_FANOUT; i++) {
3313*0Sstevel@tonic-gate 				p->pcf_wait++;
3314*0Sstevel@tonic-gate 				mutex_exit(&p->pcf_lock);
3315*0Sstevel@tonic-gate 				p++;
3316*0Sstevel@tonic-gate 			}
3317*0Sstevel@tonic-gate 
3318*0Sstevel@tonic-gate 			freemem_wait++;
3319*0Sstevel@tonic-gate 			cv_wait(&freemem_cv, &new_freemem_lock);
3320*0Sstevel@tonic-gate 			freemem_wait--;
3321*0Sstevel@tonic-gate 
3322*0Sstevel@tonic-gate 			mutex_exit(&new_freemem_lock);
3323*0Sstevel@tonic-gate 
3324*0Sstevel@tonic-gate 			if (lock) {
3325*0Sstevel@tonic-gate 				mutex_enter(lock);
3326*0Sstevel@tonic-gate 			}
3327*0Sstevel@tonic-gate 			return (0);
3328*0Sstevel@tonic-gate 		}
3329*0Sstevel@tonic-gate 
3330*0Sstevel@tonic-gate 		/*
3331*0Sstevel@tonic-gate 		 * There was a page to be found.
3332*0Sstevel@tonic-gate 		 * The pcf accounting has been done and
3333*0Sstevel@tonic-gate 		 * none of the pcf_wait flags were set, so
3334*0Sstevel@tonic-gate 		 * drop the locks and continue on.
3335*0Sstevel@tonic-gate 		 */
3336*0Sstevel@tonic-gate 		while (p >= pcf) {
3337*0Sstevel@tonic-gate 			mutex_exit(&p->pcf_lock);
3338*0Sstevel@tonic-gate 			p--;
3339*0Sstevel@tonic-gate 		}
3340*0Sstevel@tonic-gate 	}
3341*0Sstevel@tonic-gate 
3342*0Sstevel@tonic-gate 	/*
3343*0Sstevel@tonic-gate 	 * freemem is not protected by any lock. Thus, we cannot
3344*0Sstevel@tonic-gate 	 * have any assertion containing freemem here.
3345*0Sstevel@tonic-gate 	 */
3346*0Sstevel@tonic-gate 	freemem -= 1;
3347*0Sstevel@tonic-gate 
3348*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_reclaim);
3349*0Sstevel@tonic-gate 	if (PP_ISAGED(pp)) {
3350*0Sstevel@tonic-gate 		page_list_sub(pp, PG_FREE_LIST);
3351*0Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3352*0Sstevel@tonic-gate 		    "page_reclaim_free:pp %p", pp);
3353*0Sstevel@tonic-gate 	} else {
3354*0Sstevel@tonic-gate 		page_list_sub(pp, PG_CACHE_LIST);
3355*0Sstevel@tonic-gate 		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3356*0Sstevel@tonic-gate 		    "page_reclaim_cache:pp %p", pp);
3357*0Sstevel@tonic-gate 	}
3358*0Sstevel@tonic-gate 
3359*0Sstevel@tonic-gate 	/*
3360*0Sstevel@tonic-gate 	 * clear the p_free & p_age bits since this page is no longer
3361*0Sstevel@tonic-gate 	 * on the free list.  Notice that there was a brief window
3362*0Sstevel@tonic-gate 	 * during which the page was marked free but was not on the list.
3363*0Sstevel@tonic-gate 	 *
3364*0Sstevel@tonic-gate 	 * Set the reference bit to protect against immediate pageout.
3365*0Sstevel@tonic-gate 	 */
3366*0Sstevel@tonic-gate 	PP_CLRFREE(pp);
3367*0Sstevel@tonic-gate 	PP_CLRAGED(pp);
3368*0Sstevel@tonic-gate 	page_set_props(pp, P_REF);
3369*0Sstevel@tonic-gate 
3370*0Sstevel@tonic-gate 	CPU_STATS_ENTER_K();
3371*0Sstevel@tonic-gate 	cpup = CPU;	/* get cpup now that CPU cannot change */
3372*0Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3373*0Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3374*0Sstevel@tonic-gate 	CPU_STATS_EXIT_K();
3375*0Sstevel@tonic-gate 
3376*0Sstevel@tonic-gate 	return (1);
3377*0Sstevel@tonic-gate }
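
/*
 * Illustrative sketch (not part of this file's build): the sleep/wakeup
 * protocol of page_reclaim_nomem above.  The waiter takes the wakeup lock
 * *before* dropping the per-bucket locks and bumping each bucket's wait
 * count, so a producer that frees a page after a bucket lock is dropped
 * cannot signal before the waiter is on the condition variable.  The
 * producer side mirrors the page_free() accounting.  Hypothetical names.
 */
#if 0	/* example only -- never compiled */
#include <pthread.h>

#define	FANOUT	4
struct bucket {
	pthread_mutex_t	lock;	/* init with pthread_mutex_init() */
	unsigned	count;
	unsigned	wait;	/* producers should signal if nonzero */
};
static struct bucket bkt[FANOUT];
static pthread_mutex_t wakeup_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wakeup_cv = PTHREAD_COND_INITIALIZER;
static unsigned waiters;

/* Caller holds every bucket lock and found no free pages anywhere. */
static void
wait_for_free(void)
{
	int i;

	pthread_mutex_lock(&wakeup_lock);	/* before dropping buckets */
	for (i = 0; i < FANOUT; i++) {
		bkt[i].wait++;
		pthread_mutex_unlock(&bkt[i].lock);
	}
	waiters++;
	pthread_cond_wait(&wakeup_cv, &wakeup_lock);
	waiters--;
	pthread_mutex_unlock(&wakeup_lock);
}

/* Producer side: free one unit into bucket b and wake one waiter. */
static void
free_one(struct bucket *b)
{
	pthread_mutex_lock(&b->lock);
	b->count++;
	if (b->wait) {
		pthread_mutex_lock(&wakeup_lock);
		if (waiters) {
			pthread_cond_signal(&wakeup_cv);
			b->wait--;
		} else {
			b->wait = 0;	/* stale: nobody is waiting now */
		}
		pthread_mutex_unlock(&wakeup_lock);
	}
	pthread_mutex_unlock(&b->lock);
}
#endif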
3378*0Sstevel@tonic-gate 
3381*0Sstevel@tonic-gate /*
3382*0Sstevel@tonic-gate  * Destroy identity of the page and put it back on
3383*0Sstevel@tonic-gate  * the page free list.  Assumes that the caller has
3384*0Sstevel@tonic-gate  * acquired the "exclusive" lock on the page.
3385*0Sstevel@tonic-gate  */
3386*0Sstevel@tonic-gate void
3387*0Sstevel@tonic-gate page_destroy(page_t *pp, int dontfree)
3388*0Sstevel@tonic-gate {
3389*0Sstevel@tonic-gate 	ASSERT((PAGE_EXCL(pp) &&
3390*0Sstevel@tonic-gate 	    !page_iolock_assert(pp)) || panicstr);
3391*0Sstevel@tonic-gate 
3392*0Sstevel@tonic-gate 	if (pp->p_szc != 0) {
3393*0Sstevel@tonic-gate 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3394*0Sstevel@tonic-gate 		    pp->p_vnode == &kvp) {
3395*0Sstevel@tonic-gate 			panic("page_destroy: anon or kernel or no vnode "
3396*0Sstevel@tonic-gate 			    "large page %p", (void *)pp);
3397*0Sstevel@tonic-gate 		}
3398*0Sstevel@tonic-gate 		page_demote_vp_pages(pp);
3399*0Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
3400*0Sstevel@tonic-gate 	}
3401*0Sstevel@tonic-gate 
3402*0Sstevel@tonic-gate 	TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3403*0Sstevel@tonic-gate 
3404*0Sstevel@tonic-gate 	/*
3405*0Sstevel@tonic-gate 	 * Unload translations, if any, then hash out the
3406*0Sstevel@tonic-gate 	 * page to erase its identity.
3407*0Sstevel@tonic-gate 	 */
3408*0Sstevel@tonic-gate 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3409*0Sstevel@tonic-gate 	page_hashout(pp, NULL);
3410*0Sstevel@tonic-gate 
3411*0Sstevel@tonic-gate 	if (!dontfree) {
3412*0Sstevel@tonic-gate 		/*
3413*0Sstevel@tonic-gate 		 * Acquire the "freemem_lock" for availrmem.
3414*0Sstevel@tonic-gate 		 * The page_struct_lock need not be acquired for lckcnt
3415*0Sstevel@tonic-gate 		 * and cowcnt since the page has an "exclusive" lock.
3416*0Sstevel@tonic-gate 		 */
3417*0Sstevel@tonic-gate 		if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3418*0Sstevel@tonic-gate 			mutex_enter(&freemem_lock);
3419*0Sstevel@tonic-gate 			if (pp->p_lckcnt != 0) {
3420*0Sstevel@tonic-gate 				availrmem++;
3421*0Sstevel@tonic-gate 				pp->p_lckcnt = 0;
3422*0Sstevel@tonic-gate 			}
3423*0Sstevel@tonic-gate 			if (pp->p_cowcnt != 0) {
3424*0Sstevel@tonic-gate 				availrmem += pp->p_cowcnt;
3425*0Sstevel@tonic-gate 				pp->p_cowcnt = 0;
3426*0Sstevel@tonic-gate 			}
3427*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
3428*0Sstevel@tonic-gate 		}
3429*0Sstevel@tonic-gate 		/*
3430*0Sstevel@tonic-gate 		 * Put the page on the "free" list.
3431*0Sstevel@tonic-gate 		 */
3432*0Sstevel@tonic-gate 		page_free(pp, 0);
3433*0Sstevel@tonic-gate 	}
3434*0Sstevel@tonic-gate }
3435*0Sstevel@tonic-gate 
3436*0Sstevel@tonic-gate void
3437*0Sstevel@tonic-gate page_destroy_pages(page_t *pp)
3438*0Sstevel@tonic-gate {
3439*0Sstevel@tonic-gate 
3440*0Sstevel@tonic-gate 	page_t	*tpp, *rootpp = NULL;
3441*0Sstevel@tonic-gate 	pgcnt_t	pgcnt = page_get_pagecnt(pp->p_szc);
3442*0Sstevel@tonic-gate 	pgcnt_t	i, pglcks = 0;
3443*0Sstevel@tonic-gate 	uint_t	szc = pp->p_szc;
3444*0Sstevel@tonic-gate 	int	toxic = 0;
3445*0Sstevel@tonic-gate 
3446*0Sstevel@tonic-gate 	ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3447*0Sstevel@tonic-gate 
3448*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_destroy_pages);
3449*0Sstevel@tonic-gate 
3450*0Sstevel@tonic-gate 	TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3451*0Sstevel@tonic-gate 
3452*0Sstevel@tonic-gate 	if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3453*0Sstevel@tonic-gate 		panic("page_destroy_pages: not root page %p", (void *)pp);
3454*0Sstevel@tonic-gate 		/*NOTREACHED*/
3455*0Sstevel@tonic-gate 	}
3456*0Sstevel@tonic-gate 
3457*0Sstevel@tonic-gate 	for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) {
3458*0Sstevel@tonic-gate 		ASSERT((PAGE_EXCL(tpp) &&
3459*0Sstevel@tonic-gate 		    !page_iolock_assert(tpp)) || panicstr);
3460*0Sstevel@tonic-gate 		(void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3461*0Sstevel@tonic-gate 		page_hashout(tpp, NULL);
3462*0Sstevel@tonic-gate 		ASSERT(tpp->p_offset == (u_offset_t)-1);
3463*0Sstevel@tonic-gate 		if (tpp->p_lckcnt != 0) {
3464*0Sstevel@tonic-gate 			pglcks++;
3465*0Sstevel@tonic-gate 			tpp->p_lckcnt = 0;
3466*0Sstevel@tonic-gate 		} else if (tpp->p_cowcnt != 0) {
3467*0Sstevel@tonic-gate 			pglcks += tpp->p_cowcnt;
3468*0Sstevel@tonic-gate 			tpp->p_cowcnt = 0;
3469*0Sstevel@tonic-gate 		}
3470*0Sstevel@tonic-gate 		ASSERT(!hat_page_getshare(tpp));
3471*0Sstevel@tonic-gate 		ASSERT(tpp->p_vnode == NULL);
3472*0Sstevel@tonic-gate 		ASSERT(tpp->p_szc == szc);
3473*0Sstevel@tonic-gate 
3474*0Sstevel@tonic-gate 		if (page_deteriorating(tpp))
3475*0Sstevel@tonic-gate 			toxic = 1;
3476*0Sstevel@tonic-gate 
3477*0Sstevel@tonic-gate 		PP_SETFREE(tpp);
3478*0Sstevel@tonic-gate 		page_clr_all_props(tpp);
3479*0Sstevel@tonic-gate 		PP_SETAGED(tpp);
3480*0Sstevel@tonic-gate 		ASSERT(tpp->p_next == tpp);
3481*0Sstevel@tonic-gate 		ASSERT(tpp->p_prev == tpp);
3482*0Sstevel@tonic-gate 		page_list_concat(&rootpp, &tpp);
3483*0Sstevel@tonic-gate 	}
3484*0Sstevel@tonic-gate 
3485*0Sstevel@tonic-gate 	ASSERT(rootpp == pp);
3486*0Sstevel@tonic-gate 	if (pglcks != 0) {
3487*0Sstevel@tonic-gate 		mutex_enter(&freemem_lock);
3488*0Sstevel@tonic-gate 		availrmem += pglcks;
3489*0Sstevel@tonic-gate 		mutex_exit(&freemem_lock);
3490*0Sstevel@tonic-gate 	}
3491*0Sstevel@tonic-gate 
3492*0Sstevel@tonic-gate 	if (toxic) {
3493*0Sstevel@tonic-gate 		page_free_toxic_pages(rootpp);
3494*0Sstevel@tonic-gate 		return;
3495*0Sstevel@tonic-gate 	}
3496*0Sstevel@tonic-gate 	page_list_add_pages(rootpp, 0);
3497*0Sstevel@tonic-gate 	page_create_putback(pgcnt);
3498*0Sstevel@tonic-gate }
3499*0Sstevel@tonic-gate 
3500*0Sstevel@tonic-gate /*
3501*0Sstevel@tonic-gate  * Similar to page_destroy(), but destroys pages which are
3502*0Sstevel@tonic-gate  * locked and known to be on the page free list.  Since
3503*0Sstevel@tonic-gate  * the page is known to be free and locked, no one can access
3504*0Sstevel@tonic-gate  * it.
3505*0Sstevel@tonic-gate  *
3506*0Sstevel@tonic-gate  * Also, the number of free pages does not change.
3507*0Sstevel@tonic-gate  */
3508*0Sstevel@tonic-gate void
3509*0Sstevel@tonic-gate page_destroy_free(page_t *pp)
3510*0Sstevel@tonic-gate {
3511*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp));
3512*0Sstevel@tonic-gate 	ASSERT(PP_ISFREE(pp));
3513*0Sstevel@tonic-gate 	ASSERT(pp->p_vnode);
3514*0Sstevel@tonic-gate 	ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3515*0Sstevel@tonic-gate 	ASSERT(!hat_page_is_mapped(pp));
3516*0Sstevel@tonic-gate 	ASSERT(PP_ISAGED(pp) == 0);
3517*0Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
3518*0Sstevel@tonic-gate 
3519*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_destroy_free);
3520*0Sstevel@tonic-gate 	page_list_sub(pp, PG_CACHE_LIST);
3521*0Sstevel@tonic-gate 
3522*0Sstevel@tonic-gate 	page_hashout(pp, NULL);
3523*0Sstevel@tonic-gate 	ASSERT(pp->p_vnode == NULL);
3524*0Sstevel@tonic-gate 	ASSERT(pp->p_offset == (u_offset_t)-1);
3525*0Sstevel@tonic-gate 	ASSERT(pp->p_hash == NULL);
3526*0Sstevel@tonic-gate 
3527*0Sstevel@tonic-gate 	PP_SETAGED(pp);
3528*0Sstevel@tonic-gate 	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3529*0Sstevel@tonic-gate 	page_unlock(pp);
3530*0Sstevel@tonic-gate 
3531*0Sstevel@tonic-gate 	mutex_enter(&new_freemem_lock);
3532*0Sstevel@tonic-gate 	if (freemem_wait) {
3533*0Sstevel@tonic-gate 		cv_signal(&freemem_cv);
3534*0Sstevel@tonic-gate 	}
3535*0Sstevel@tonic-gate 	mutex_exit(&new_freemem_lock);
3536*0Sstevel@tonic-gate }
3537*0Sstevel@tonic-gate 
3538*0Sstevel@tonic-gate /*
3539*0Sstevel@tonic-gate  * Rename the page "opp" to have an identity specified
3540*0Sstevel@tonic-gate  * by [vp, off].  If a page already exists with this name
3541*0Sstevel@tonic-gate  * it is locked and destroyed.  Note that the page's
3542*0Sstevel@tonic-gate  * translations are not unloaded during the rename.
3543*0Sstevel@tonic-gate  *
3544*0Sstevel@tonic-gate  * This routine is used by the anon layer to "steal" the
3545*0Sstevel@tonic-gate  * original page and is not unlike destroying a page and
3546*0Sstevel@tonic-gate  * creating a new page using the same page frame.
3547*0Sstevel@tonic-gate  *
3548*0Sstevel@tonic-gate  * XXX -- Could deadlock if caller 1 tries to rename A to B while
3549*0Sstevel@tonic-gate  * caller 2 tries to rename B to A.
3550*0Sstevel@tonic-gate  */
3551*0Sstevel@tonic-gate void
3552*0Sstevel@tonic-gate page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3553*0Sstevel@tonic-gate {
3554*0Sstevel@tonic-gate 	page_t		*pp;
3555*0Sstevel@tonic-gate 	int		olckcnt = 0;
3556*0Sstevel@tonic-gate 	int		ocowcnt = 0;
3557*0Sstevel@tonic-gate 	kmutex_t	*phm;
3558*0Sstevel@tonic-gate 	ulong_t		index;
3559*0Sstevel@tonic-gate 
3560*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3561*0Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3562*0Sstevel@tonic-gate 	ASSERT(PP_ISFREE(opp) == 0);
3563*0Sstevel@tonic-gate 
3564*0Sstevel@tonic-gate 	VM_STAT_ADD(page_rename_count);
3565*0Sstevel@tonic-gate 
3566*0Sstevel@tonic-gate 	TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3567*0Sstevel@tonic-gate 		"page_rename:pp %p vp %p off %llx", opp, vp, off);
3568*0Sstevel@tonic-gate 
3569*0Sstevel@tonic-gate 	page_hashout(opp, NULL);
3570*0Sstevel@tonic-gate 	PP_CLRAGED(opp);
3571*0Sstevel@tonic-gate 
3572*0Sstevel@tonic-gate 	/*
3573*0Sstevel@tonic-gate 	 * Acquire the appropriate page hash lock, since
3574*0Sstevel@tonic-gate 	 * we're going to rename the page.
3575*0Sstevel@tonic-gate 	 */
3576*0Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, off);
3577*0Sstevel@tonic-gate 	phm = PAGE_HASH_MUTEX(index);
3578*0Sstevel@tonic-gate 	mutex_enter(phm);
3579*0Sstevel@tonic-gate top:
3580*0Sstevel@tonic-gate 	/*
3581*0Sstevel@tonic-gate 	 * Look for an existing page with this name and destroy it if found.
3582*0Sstevel@tonic-gate 	 * By holding the page hash lock all the way to the page_hashin()
3583*0Sstevel@tonic-gate 	 * call, we are assured that no page can be created with this
3584*0Sstevel@tonic-gate 	 * identity.  In the case when the phm lock is dropped to undo any
3585*0Sstevel@tonic-gate 	 * hat layer mappings, the existing page is held with an "exclusive"
3586*0Sstevel@tonic-gate 	 * lock, again preventing another page from being created with
3587*0Sstevel@tonic-gate 	 * this identity.
3588*0Sstevel@tonic-gate 	 */
3589*0Sstevel@tonic-gate 	PAGE_HASH_SEARCH(index, pp, vp, off);
3590*0Sstevel@tonic-gate 	if (pp != NULL) {
3591*0Sstevel@tonic-gate 		VM_STAT_ADD(page_rename_exists);
3592*0Sstevel@tonic-gate 
3593*0Sstevel@tonic-gate 		/*
3594*0Sstevel@tonic-gate 		 * As it turns out, this is one of only two places where
3595*0Sstevel@tonic-gate 		 * page_lock() needs to hold the passed in lock in the
3596*0Sstevel@tonic-gate 		 * successful case.  In all of the others, the lock could
3597*0Sstevel@tonic-gate 		 * be dropped as soon as the attempt is made to lock
3598*0Sstevel@tonic-gate 		 * the page.  It is tempting to add yet another argument,
3599*0Sstevel@tonic-gate 		 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3600*0Sstevel@tonic-gate 		 */
3601*0Sstevel@tonic-gate 		if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3602*0Sstevel@tonic-gate 			/*
3603*0Sstevel@tonic-gate 			 * Went to sleep because the page could not
3604*0Sstevel@tonic-gate 			 * be locked.  We were woken up when the page
3605*0Sstevel@tonic-gate 			 * was unlocked, or when the page was destroyed.
3606*0Sstevel@tonic-gate 			 * In either case, `phm' was dropped while we
3607*0Sstevel@tonic-gate 			 * slept.  Hence we should not just roar through
3608*0Sstevel@tonic-gate 			 * this loop.
3609*0Sstevel@tonic-gate 			 */
3610*0Sstevel@tonic-gate 			goto top;
3611*0Sstevel@tonic-gate 		}
3612*0Sstevel@tonic-gate 
3613*0Sstevel@tonic-gate 		if (hat_page_is_mapped(pp)) {
3614*0Sstevel@tonic-gate 			/*
3615*0Sstevel@tonic-gate 			 * Unload translations.  Since we hold the
3616*0Sstevel@tonic-gate 			 * exclusive lock on this page, the page
3617*0Sstevel@tonic-gate 			 * cannot be changed while we drop phm.
3618*0Sstevel@tonic-gate 			 * This is also not a lock protocol violation,
3619*0Sstevel@tonic-gate 			 * but rather the proper way to do things.
3620*0Sstevel@tonic-gate 			 */
3621*0Sstevel@tonic-gate 			mutex_exit(phm);
3622*0Sstevel@tonic-gate 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3623*0Sstevel@tonic-gate 			mutex_enter(phm);
3624*0Sstevel@tonic-gate 		}
3625*0Sstevel@tonic-gate 		page_hashout(pp, phm);
3626*0Sstevel@tonic-gate 	}
3627*0Sstevel@tonic-gate 	/*
3628*0Sstevel@tonic-gate 	 * Hash in the page with the new identity.
3629*0Sstevel@tonic-gate 	 */
3630*0Sstevel@tonic-gate 	if (!page_hashin(opp, vp, off, phm)) {
3631*0Sstevel@tonic-gate 		/*
3632*0Sstevel@tonic-gate 		 * We were holding phm while we searched for [vp, off]
3633*0Sstevel@tonic-gate 		 * and only dropped phm if we found and locked a page.
3634*0Sstevel@tonic-gate 		 * If we can't create this page now, then something
3635*0Sstevel@tonic-gate 		 * is really broken.
3636*0Sstevel@tonic-gate 		 */
3637*0Sstevel@tonic-gate 		panic("page_rename: Can't hash in page: %p", (void *)pp);
3638*0Sstevel@tonic-gate 		/*NOTREACHED*/
3639*0Sstevel@tonic-gate 	}
3640*0Sstevel@tonic-gate 
3641*0Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(phm));
3642*0Sstevel@tonic-gate 	mutex_exit(phm);
3643*0Sstevel@tonic-gate 
3644*0Sstevel@tonic-gate 	/*
3645*0Sstevel@tonic-gate 	 * Now that we have dropped phm, let's get around to finishing up
3646*0Sstevel@tonic-gate 	 * with pp.
3647*0Sstevel@tonic-gate 	 */
3648*0Sstevel@tonic-gate 	if (pp != NULL) {
3649*0Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(pp));
3650*0Sstevel@tonic-gate 		/* for now large pages should not end up here */
3651*0Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
3652*0Sstevel@tonic-gate 		/*
3653*0Sstevel@tonic-gate 		 * Save the locks for transfer to the new page and then
3654*0Sstevel@tonic-gate 		 * clear them so page_free doesn't think they're important.
3655*0Sstevel@tonic-gate 		 * The page_struct_lock need not be acquired for lckcnt and
3656*0Sstevel@tonic-gate 		 * cowcnt since the page has an "exclusive" lock.
3657*0Sstevel@tonic-gate 		 */
3658*0Sstevel@tonic-gate 		olckcnt = pp->p_lckcnt;
3659*0Sstevel@tonic-gate 		ocowcnt = pp->p_cowcnt;
3660*0Sstevel@tonic-gate 		pp->p_lckcnt = pp->p_cowcnt = 0;
3661*0Sstevel@tonic-gate 
3662*0Sstevel@tonic-gate 		/*
3663*0Sstevel@tonic-gate 		 * Put the page on the "free" list after we drop
3664*0Sstevel@tonic-gate 		 * the lock.  The less work under the lock the better.
3665*0Sstevel@tonic-gate 		 */
3666*0Sstevel@tonic-gate 		/*LINTED: constant in conditional context*/
3667*0Sstevel@tonic-gate 		VN_DISPOSE(pp, B_FREE, 0, kcred);
3668*0Sstevel@tonic-gate 	}
3669*0Sstevel@tonic-gate 
3670*0Sstevel@tonic-gate 	/*
3671*0Sstevel@tonic-gate 	 * Transfer the lock count from the old page (if any).
3672*0Sstevel@tonic-gate 	 * The page_struct_lock need not be acquired for lckcnt and
3673*0Sstevel@tonic-gate 	 * cowcnt since the page has an "exclusive" lock.
3674*0Sstevel@tonic-gate 	 */
3675*0Sstevel@tonic-gate 	opp->p_lckcnt += olckcnt;
3676*0Sstevel@tonic-gate 	opp->p_cowcnt += ocowcnt;
3677*0Sstevel@tonic-gate }
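
/*
 * For illustration (hypothetical callers): the XXX deadlock noted above
 * page_rename() arises from the following interleaving.  Thread 1 holds
 * the "exclusive" lock on page A and calls page_rename(A, vpB, offB);
 * thread 2 holds the "exclusive" lock on page B and calls
 * page_rename(B, vpA, offA).  Thread 1 finds B in the hash and sleeps
 * in page_lock(B); thread 2 finds A in the hash and sleeps in
 * page_lock(A).  Neither can ever be woken.
 */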
3678*0Sstevel@tonic-gate 
3679*0Sstevel@tonic-gate /*
3680*0Sstevel@tonic-gate  * Low-level routine to add page `pp' to the hash and vp chains for [vp, offset].
3681*0Sstevel@tonic-gate  *
3682*0Sstevel@tonic-gate  * Pages are normally inserted at the start of a vnode's v_pages list.
3683*0Sstevel@tonic-gate  * If the vnode is VMODSORT and the page is modified, it goes at the end.
3684*0Sstevel@tonic-gate  * This can happen when a modified page is relocated for DR.
3685*0Sstevel@tonic-gate  *
3686*0Sstevel@tonic-gate  * Returns 1 on success and 0 on failure.
3687*0Sstevel@tonic-gate  */
3688*0Sstevel@tonic-gate static int
3689*0Sstevel@tonic-gate page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3690*0Sstevel@tonic-gate {
3691*0Sstevel@tonic-gate 	page_t		**listp;
3692*0Sstevel@tonic-gate 	page_t		*tp;
3693*0Sstevel@tonic-gate 	ulong_t		index;
3694*0Sstevel@tonic-gate 
3695*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp));
3696*0Sstevel@tonic-gate 	ASSERT(vp != NULL);
3697*0Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3698*0Sstevel@tonic-gate 
3699*0Sstevel@tonic-gate 	/*
3700*0Sstevel@tonic-gate 	 * Be sure to set these up before the page is inserted on the hash
3701*0Sstevel@tonic-gate 	 * list.  As soon as the page is placed on the list some other
3702*0Sstevel@tonic-gate 	 * thread might get confused and wonder how this page could
3703*0Sstevel@tonic-gate 	 * possibly hash to this list.
3704*0Sstevel@tonic-gate 	 */
3705*0Sstevel@tonic-gate 	pp->p_vnode = vp;
3706*0Sstevel@tonic-gate 	pp->p_offset = offset;
3707*0Sstevel@tonic-gate 
3708*0Sstevel@tonic-gate 	/*
3709*0Sstevel@tonic-gate 	 * record if this page is on a swap vnode
3710*0Sstevel@tonic-gate 	 */
3711*0Sstevel@tonic-gate 	if ((vp->v_flag & VISSWAP) != 0)
3712*0Sstevel@tonic-gate 		PP_SETSWAP(pp);
3713*0Sstevel@tonic-gate 
3714*0Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, offset);
3715*0Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3716*0Sstevel@tonic-gate 	listp = &page_hash[index];
3717*0Sstevel@tonic-gate 
3718*0Sstevel@tonic-gate 	/*
3719*0Sstevel@tonic-gate 	 * If this page is already hashed in, fail this attempt to add it.
3720*0Sstevel@tonic-gate 	 */
3721*0Sstevel@tonic-gate 	for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3722*0Sstevel@tonic-gate 		if (tp->p_vnode == vp && tp->p_offset == offset) {
3723*0Sstevel@tonic-gate 			pp->p_vnode = NULL;
3724*0Sstevel@tonic-gate 			pp->p_offset = (u_offset_t)(-1);
3725*0Sstevel@tonic-gate 			return (0);
3726*0Sstevel@tonic-gate 		}
3727*0Sstevel@tonic-gate 	}
3728*0Sstevel@tonic-gate 	pp->p_hash = *listp;
3729*0Sstevel@tonic-gate 	*listp = pp;
3730*0Sstevel@tonic-gate 
3731*0Sstevel@tonic-gate 	/*
3732*0Sstevel@tonic-gate 	 * Add the page to the vnode's list of pages
3733*0Sstevel@tonic-gate 	 */
3734*0Sstevel@tonic-gate 	if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3735*0Sstevel@tonic-gate 		listp = &vp->v_pages->p_vpprev->p_vpnext;
3736*0Sstevel@tonic-gate 	else
3737*0Sstevel@tonic-gate 		listp = &vp->v_pages;
3738*0Sstevel@tonic-gate 
3739*0Sstevel@tonic-gate 	page_vpadd(listp, pp);
3740*0Sstevel@tonic-gate 
3741*0Sstevel@tonic-gate 	return (1);
3742*0Sstevel@tonic-gate }
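
/*
 * For illustration only (hypothetical names, not part of the kernel):
 * the check-for-duplicate-then-push-front idiom that page_do_hashin()
 * applies to the page hash, shown as a standalone sketch over a
 * simplified singly-linked hash bucket.  Returns 0 if an entry with
 * the same key is already present, mirroring the failure return above.
 *
 *	struct hnode { struct hnode *hash; int key; };
 *
 *	static int
 *	bucket_insert(struct hnode **bucket, struct hnode *n)
 *	{
 *		struct hnode *t;
 *
 *		for (t = *bucket; t != NULL; t = t->hash)
 *			if (t->key == n->key)
 *				return (0);
 *		n->hash = *bucket;
 *		*bucket = n;
 *		return (1);
 *	}
 */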
3743*0Sstevel@tonic-gate 
3744*0Sstevel@tonic-gate /*
3745*0Sstevel@tonic-gate  * Add page `pp' to both the hash and vp chains for [vp, offset].
3746*0Sstevel@tonic-gate  *
3747*0Sstevel@tonic-gate  * Returns 1 on success and 0 on failure.
3748*0Sstevel@tonic-gate  * If hold is passed in, it is not dropped.
3749*0Sstevel@tonic-gate  */
3750*0Sstevel@tonic-gate int
3751*0Sstevel@tonic-gate page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3752*0Sstevel@tonic-gate {
3753*0Sstevel@tonic-gate 	kmutex_t	*phm = NULL;
3754*0Sstevel@tonic-gate 	kmutex_t	*vphm;
3755*0Sstevel@tonic-gate 	int		rc;
3756*0Sstevel@tonic-gate 
3757*0Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3758*0Sstevel@tonic-gate 
3759*0Sstevel@tonic-gate 	TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3760*0Sstevel@tonic-gate 		"page_hashin:pp %p vp %p offset %llx",
3761*0Sstevel@tonic-gate 		pp, vp, offset);
3762*0Sstevel@tonic-gate 
3763*0Sstevel@tonic-gate 	VM_STAT_ADD(hashin_count);
3764*0Sstevel@tonic-gate 
3765*0Sstevel@tonic-gate 	if (hold != NULL)
3766*0Sstevel@tonic-gate 		phm = hold;
3767*0Sstevel@tonic-gate 	else {
3768*0Sstevel@tonic-gate 		VM_STAT_ADD(hashin_not_held);
3769*0Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3770*0Sstevel@tonic-gate 		mutex_enter(phm);
3771*0Sstevel@tonic-gate 	}
3772*0Sstevel@tonic-gate 
3773*0Sstevel@tonic-gate 	vphm = page_vnode_mutex(vp);
3774*0Sstevel@tonic-gate 	mutex_enter(vphm);
3775*0Sstevel@tonic-gate 	rc = page_do_hashin(pp, vp, offset);
3776*0Sstevel@tonic-gate 	mutex_exit(vphm);
3777*0Sstevel@tonic-gate 	if (hold == NULL)
3778*0Sstevel@tonic-gate 		mutex_exit(phm);
3779*0Sstevel@tonic-gate 	if (rc == 0)
3780*0Sstevel@tonic-gate 		VM_STAT_ADD(hashin_already);
3781*0Sstevel@tonic-gate 	return (rc);
3782*0Sstevel@tonic-gate }
3783*0Sstevel@tonic-gate 
3784*0Sstevel@tonic-gate /*
3785*0Sstevel@tonic-gate  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3786*0Sstevel@tonic-gate  * All mutexes must be held
3787*0Sstevel@tonic-gate  */
3788*0Sstevel@tonic-gate static void
3789*0Sstevel@tonic-gate page_do_hashout(page_t *pp)
3790*0Sstevel@tonic-gate {
3791*0Sstevel@tonic-gate 	page_t	**hpp;
3792*0Sstevel@tonic-gate 	page_t	*hp;
3793*0Sstevel@tonic-gate 	vnode_t	*vp = pp->p_vnode;
3794*0Sstevel@tonic-gate 
3795*0Sstevel@tonic-gate 	ASSERT(vp != NULL);
3796*0Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3797*0Sstevel@tonic-gate 
3798*0Sstevel@tonic-gate 	/*
3799*0Sstevel@tonic-gate 	 * First, take pp off of its hash chain.
3800*0Sstevel@tonic-gate 	 */
3801*0Sstevel@tonic-gate 	hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3802*0Sstevel@tonic-gate 
3803*0Sstevel@tonic-gate 	for (;;) {
3804*0Sstevel@tonic-gate 		hp = *hpp;
3805*0Sstevel@tonic-gate 		if (hp == pp)
3806*0Sstevel@tonic-gate 			break;
3807*0Sstevel@tonic-gate 		if (hp == NULL) {
3808*0Sstevel@tonic-gate 			panic("page_do_hashout");
3809*0Sstevel@tonic-gate 			/*NOTREACHED*/
3810*0Sstevel@tonic-gate 		}
3811*0Sstevel@tonic-gate 		hpp = &hp->p_hash;
3812*0Sstevel@tonic-gate 	}
3813*0Sstevel@tonic-gate 	*hpp = pp->p_hash;
3814*0Sstevel@tonic-gate 
3815*0Sstevel@tonic-gate 	/*
3816*0Sstevel@tonic-gate 	 * Now remove it from its associated vnode.
3817*0Sstevel@tonic-gate 	 */
3818*0Sstevel@tonic-gate 	if (vp->v_pages)
3819*0Sstevel@tonic-gate 		page_vpsub(&vp->v_pages, pp);
3820*0Sstevel@tonic-gate 
3821*0Sstevel@tonic-gate 	pp->p_hash = NULL;
3822*0Sstevel@tonic-gate 	page_clr_all_props(pp);
3823*0Sstevel@tonic-gate 	PP_CLRSWAP(pp);
3824*0Sstevel@tonic-gate 	pp->p_vnode = NULL;
3825*0Sstevel@tonic-gate 	pp->p_offset = (u_offset_t)-1;
3826*0Sstevel@tonic-gate }
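
/*
 * For illustration only (hypothetical names): the pointer-to-pointer
 * chain walk that page_do_hashout() uses to unlink a page, shown as a
 * standalone sketch.  Advancing a (struct hnode **) through the chain
 * means the head of the bucket needs no special case; the sketch, like
 * page_do_hashout(), assumes the node is present.
 *
 *	struct hnode { struct hnode *hash; };
 *
 *	static void
 *	bucket_remove(struct hnode **bucket, struct hnode *n)
 *	{
 *		struct hnode **hpp;
 *
 *		for (hpp = bucket; *hpp != n; hpp = &(*hpp)->hash)
 *			;
 *		*hpp = n->hash;
 *	}
 */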
3827*0Sstevel@tonic-gate 
3828*0Sstevel@tonic-gate /*
3829*0Sstevel@tonic-gate  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3830*0Sstevel@tonic-gate  *
3831*0Sstevel@tonic-gate  * When `phm' is non-NULL it contains the address of the mutex protecting the
3832*0Sstevel@tonic-gate  * hash list pp is on.  It is not dropped.
3833*0Sstevel@tonic-gate  */
3834*0Sstevel@tonic-gate void
3835*0Sstevel@tonic-gate page_hashout(page_t *pp, kmutex_t *phm)
3836*0Sstevel@tonic-gate {
3837*0Sstevel@tonic-gate 	vnode_t		*vp;
3838*0Sstevel@tonic-gate 	ulong_t		index;
3839*0Sstevel@tonic-gate 	kmutex_t	*nphm;
3840*0Sstevel@tonic-gate 	kmutex_t	*vphm;
3841*0Sstevel@tonic-gate 	kmutex_t	*sep;
3842*0Sstevel@tonic-gate 
3843*0Sstevel@tonic-gate 	ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3844*0Sstevel@tonic-gate 	ASSERT(pp->p_vnode != NULL);
3845*0Sstevel@tonic-gate 	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3846*0Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3847*0Sstevel@tonic-gate 
3848*0Sstevel@tonic-gate 	vp = pp->p_vnode;
3849*0Sstevel@tonic-gate 
3850*0Sstevel@tonic-gate 	TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3851*0Sstevel@tonic-gate 		"page_hashout:pp %p vp %p", pp, vp);
3852*0Sstevel@tonic-gate 
3853*0Sstevel@tonic-gate 	/* Kernel probe */
3854*0Sstevel@tonic-gate 	TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3855*0Sstevel@tonic-gate 	    tnf_opaque, vnode, vp,
3856*0Sstevel@tonic-gate 	    tnf_offset, offset, pp->p_offset);
3857*0Sstevel@tonic-gate 
3861*0Sstevel@tonic-gate 	VM_STAT_ADD(hashout_count);
3862*0Sstevel@tonic-gate 	index = PAGE_HASH_FUNC(vp, pp->p_offset);
3863*0Sstevel@tonic-gate 	if (phm == NULL) {
3864*0Sstevel@tonic-gate 		VM_STAT_ADD(hashout_not_held);
3865*0Sstevel@tonic-gate 		nphm = PAGE_HASH_MUTEX(index);
3866*0Sstevel@tonic-gate 		mutex_enter(nphm);
3867*0Sstevel@tonic-gate 	}
3868*0Sstevel@tonic-gate 	ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3869*0Sstevel@tonic-gate 
3871*0Sstevel@tonic-gate 	/*
3872*0Sstevel@tonic-gate 	 * grab page vnode mutex and remove it...
3873*0Sstevel@tonic-gate 	 */
3874*0Sstevel@tonic-gate 	vphm = page_vnode_mutex(vp);
3875*0Sstevel@tonic-gate 	mutex_enter(vphm);
3876*0Sstevel@tonic-gate 
3877*0Sstevel@tonic-gate 	page_do_hashout(pp);
3878*0Sstevel@tonic-gate 
3879*0Sstevel@tonic-gate 	mutex_exit(vphm);
3880*0Sstevel@tonic-gate 	if (phm == NULL)
3881*0Sstevel@tonic-gate 		mutex_exit(nphm);
3882*0Sstevel@tonic-gate 
3883*0Sstevel@tonic-gate 	/*
3884*0Sstevel@tonic-gate 	 * If the page was retired, update the pages_retired
3885*0Sstevel@tonic-gate 	 * total and clear the page flag
3886*0Sstevel@tonic-gate 	 */
3887*0Sstevel@tonic-gate 	if (page_isretired(pp)) {
3888*0Sstevel@tonic-gate 		retired_page_removed(pp);
3889*0Sstevel@tonic-gate 	}
3890*0Sstevel@tonic-gate 
3891*0Sstevel@tonic-gate 	/*
3892*0Sstevel@tonic-gate 	 * Wake up processes waiting for this page.  The page's
3893*0Sstevel@tonic-gate 	 * identity has been changed, and is probably not the
3894*0Sstevel@tonic-gate 	 * desired page any longer.
3895*0Sstevel@tonic-gate 	 */
3896*0Sstevel@tonic-gate 	sep = page_se_mutex(pp);
3897*0Sstevel@tonic-gate 	mutex_enter(sep);
3898*0Sstevel@tonic-gate 	if (CV_HAS_WAITERS(&pp->p_cv))
3899*0Sstevel@tonic-gate 		cv_broadcast(&pp->p_cv);
3900*0Sstevel@tonic-gate 	mutex_exit(sep);
3901*0Sstevel@tonic-gate }
3902*0Sstevel@tonic-gate 
3903*0Sstevel@tonic-gate /*
3904*0Sstevel@tonic-gate  * Add the page to the front of a linked list of pages
3905*0Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
3906*0Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
3907*0Sstevel@tonic-gate  */
3908*0Sstevel@tonic-gate void
3909*0Sstevel@tonic-gate page_add(page_t **ppp, page_t *pp)
3910*0Sstevel@tonic-gate {
3911*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3912*0Sstevel@tonic-gate 
3913*0Sstevel@tonic-gate 	page_add_common(ppp, pp);
3914*0Sstevel@tonic-gate }
3915*0Sstevel@tonic-gate 
3918*0Sstevel@tonic-gate /*
3919*0Sstevel@tonic-gate  *  Common code for page_add() and mach_page_add()
3920*0Sstevel@tonic-gate  */
3921*0Sstevel@tonic-gate void
3922*0Sstevel@tonic-gate page_add_common(page_t **ppp, page_t *pp)
3923*0Sstevel@tonic-gate {
3924*0Sstevel@tonic-gate 	if (*ppp == NULL) {
3925*0Sstevel@tonic-gate 		pp->p_next = pp->p_prev = pp;
3926*0Sstevel@tonic-gate 	} else {
3927*0Sstevel@tonic-gate 		pp->p_next = *ppp;
3928*0Sstevel@tonic-gate 		pp->p_prev = (*ppp)->p_prev;
3929*0Sstevel@tonic-gate 		(*ppp)->p_prev = pp;
3930*0Sstevel@tonic-gate 		pp->p_prev->p_next = pp;
3931*0Sstevel@tonic-gate 	}
3932*0Sstevel@tonic-gate 	*ppp = pp;
3933*0Sstevel@tonic-gate }
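
/*
 * For illustration only (hypothetical node type, not part of the
 * kernel): the circular doubly-linked insert performed by
 * page_add_common(), as a minimal standalone sketch.  An empty list is
 * NULL and a list of one points to itself in both directions, so no
 * NULL checks are needed on the neighbor pointers.
 *
 *	struct node { struct node *next, *prev; };
 *
 *	static void
 *	node_add(struct node **headp, struct node *n)
 *	{
 *		if (*headp == NULL) {
 *			n->next = n->prev = n;
 *		} else {
 *			n->next = *headp;
 *			n->prev = (*headp)->prev;
 *			(*headp)->prev = n;
 *			n->prev->next = n;
 *		}
 *		*headp = n;
 *	}
 */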
3934*0Sstevel@tonic-gate 
3935*0Sstevel@tonic-gate 
3936*0Sstevel@tonic-gate /*
3937*0Sstevel@tonic-gate  * Remove this page from a linked list of pages
3938*0Sstevel@tonic-gate  * using the p_next & p_prev pointers for the list.
3939*0Sstevel@tonic-gate  *
3940*0Sstevel@tonic-gate  * The caller is responsible for protecting the list pointers.
3941*0Sstevel@tonic-gate  */
3942*0Sstevel@tonic-gate void
3943*0Sstevel@tonic-gate page_sub(page_t **ppp, page_t *pp)
3944*0Sstevel@tonic-gate {
3945*0Sstevel@tonic-gate 	ASSERT((PP_ISFREE(pp)) ? 1 :
3946*0Sstevel@tonic-gate 	    (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3947*0Sstevel@tonic-gate 
3948*0Sstevel@tonic-gate 	if (*ppp == NULL || pp == NULL) {
3949*0Sstevel@tonic-gate 		panic("page_sub: bad arg(s): pp %p, *ppp %p",
3950*0Sstevel@tonic-gate 		    (void *)pp, (void *)(*ppp));
3951*0Sstevel@tonic-gate 		/*NOTREACHED*/
3952*0Sstevel@tonic-gate 	}
3953*0Sstevel@tonic-gate 
3954*0Sstevel@tonic-gate 	page_sub_common(ppp, pp);
3955*0Sstevel@tonic-gate }
3956*0Sstevel@tonic-gate 
3957*0Sstevel@tonic-gate 
3958*0Sstevel@tonic-gate /*
3959*0Sstevel@tonic-gate  *  Common code for page_sub() and mach_page_sub()
3960*0Sstevel@tonic-gate  */
3961*0Sstevel@tonic-gate void
3962*0Sstevel@tonic-gate page_sub_common(page_t **ppp, page_t *pp)
3963*0Sstevel@tonic-gate {
3964*0Sstevel@tonic-gate 	if (*ppp == pp)
3965*0Sstevel@tonic-gate 		*ppp = pp->p_next;		/* go to next page */
3966*0Sstevel@tonic-gate 
3967*0Sstevel@tonic-gate 	if (*ppp == pp)
3968*0Sstevel@tonic-gate 		*ppp = NULL;			/* page list is gone */
3969*0Sstevel@tonic-gate 	else {
3970*0Sstevel@tonic-gate 		pp->p_prev->p_next = pp->p_next;
3971*0Sstevel@tonic-gate 		pp->p_next->p_prev = pp->p_prev;
3972*0Sstevel@tonic-gate 	}
3973*0Sstevel@tonic-gate 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
3974*0Sstevel@tonic-gate }
3975*0Sstevel@tonic-gate 
3976*0Sstevel@tonic-gate 
3977*0Sstevel@tonic-gate /*
3978*0Sstevel@tonic-gate  * Break page list oppp into two lists with npages in the first list.
3979*0Sstevel@tonic-gate  * The tail is returned in nppp.
3980*0Sstevel@tonic-gate  */
3981*0Sstevel@tonic-gate void
3982*0Sstevel@tonic-gate page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3983*0Sstevel@tonic-gate {
3984*0Sstevel@tonic-gate 	page_t *s1pp = *oppp;
3985*0Sstevel@tonic-gate 	page_t *s2pp;
3986*0Sstevel@tonic-gate 	page_t *e1pp, *e2pp;
3987*0Sstevel@tonic-gate 	long n = 0;
3988*0Sstevel@tonic-gate 
3989*0Sstevel@tonic-gate 	if (s1pp == NULL) {
3990*0Sstevel@tonic-gate 		*nppp = NULL;
3991*0Sstevel@tonic-gate 		return;
3992*0Sstevel@tonic-gate 	}
3993*0Sstevel@tonic-gate 	if (npages == 0) {
3994*0Sstevel@tonic-gate 		*nppp = s1pp;
3995*0Sstevel@tonic-gate 		*oppp = NULL;
3996*0Sstevel@tonic-gate 		return;
3997*0Sstevel@tonic-gate 	}
3998*0Sstevel@tonic-gate 	for (n = 0, s2pp = *oppp; n < npages; n++) {
3999*0Sstevel@tonic-gate 		s2pp = s2pp->p_next;
4000*0Sstevel@tonic-gate 	}
4001*0Sstevel@tonic-gate 	/* Fix head and tail of new lists */
4002*0Sstevel@tonic-gate 	e1pp = s2pp->p_prev;
4003*0Sstevel@tonic-gate 	e2pp = s1pp->p_prev;
4004*0Sstevel@tonic-gate 	s1pp->p_prev = e1pp;
4005*0Sstevel@tonic-gate 	e1pp->p_next = s1pp;
4006*0Sstevel@tonic-gate 	s2pp->p_prev = e2pp;
4007*0Sstevel@tonic-gate 	e2pp->p_next = s2pp;
4008*0Sstevel@tonic-gate 
4009*0Sstevel@tonic-gate 	/* second list empty */
4010*0Sstevel@tonic-gate 	if (s2pp == s1pp) {
4011*0Sstevel@tonic-gate 		*oppp = s1pp;
4012*0Sstevel@tonic-gate 		*nppp = NULL;
4013*0Sstevel@tonic-gate 	} else {
4014*0Sstevel@tonic-gate 		*oppp = s1pp;
4015*0Sstevel@tonic-gate 		*nppp = s2pp;
4016*0Sstevel@tonic-gate 	}
4017*0Sstevel@tonic-gate }
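
/*
 * A worked example (illustrative only): given the circular list
 * A-B-C-D-E with *oppp == A and npages == 2, the walk above leaves
 * s2pp == C, so e1pp == B and e2pp == E.  Re-closing the seams yields
 * the circle A-B in *oppp and the circle C-D-E in *nppp.  If npages
 * equals the list length, the walk wraps all the way back to s1pp and
 * *nppp is set to NULL.
 */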
4018*0Sstevel@tonic-gate 
4019*0Sstevel@tonic-gate /*
4020*0Sstevel@tonic-gate  * Concatenate page list nppp onto the end of list ppp.
4021*0Sstevel@tonic-gate  */
4022*0Sstevel@tonic-gate void
4023*0Sstevel@tonic-gate page_list_concat(page_t **ppp, page_t **nppp)
4024*0Sstevel@tonic-gate {
4025*0Sstevel@tonic-gate 	page_t *s1pp, *s2pp, *e1pp, *e2pp;
4026*0Sstevel@tonic-gate 
4027*0Sstevel@tonic-gate 	if (*nppp == NULL) {
4028*0Sstevel@tonic-gate 		return;
4029*0Sstevel@tonic-gate 	}
4030*0Sstevel@tonic-gate 	if (*ppp == NULL) {
4031*0Sstevel@tonic-gate 		*ppp = *nppp;
4032*0Sstevel@tonic-gate 		return;
4033*0Sstevel@tonic-gate 	}
4034*0Sstevel@tonic-gate 	s1pp = *ppp;
4035*0Sstevel@tonic-gate 	e1pp =  s1pp->p_prev;
4036*0Sstevel@tonic-gate 	s2pp = *nppp;
4037*0Sstevel@tonic-gate 	e2pp = s2pp->p_prev;
4038*0Sstevel@tonic-gate 	s1pp->p_prev = e2pp;
4039*0Sstevel@tonic-gate 	e2pp->p_next = s1pp;
4040*0Sstevel@tonic-gate 	e1pp->p_next = s2pp;
4041*0Sstevel@tonic-gate 	s2pp->p_prev = e1pp;
4042*0Sstevel@tonic-gate }
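
/*
 * A worked example (illustrative only): concatenating *nppp == X-Y onto
 * *ppp == A-B-C rewires just four pointers -- A->prev = Y, Y->next = A,
 * C->next = X, X->prev = C -- yielding the single circle A-B-C-X-Y with
 * *ppp still pointing at A.  Circularity is what makes the splice O(1):
 * both tails are reachable from the heads via the prev pointers.
 */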
4043*0Sstevel@tonic-gate 
4044*0Sstevel@tonic-gate /*
4045*0Sstevel@tonic-gate  * Return the next page in the page list.
4046*0Sstevel@tonic-gate  */
4047*0Sstevel@tonic-gate page_t *
4048*0Sstevel@tonic-gate page_list_next(page_t *pp)
4049*0Sstevel@tonic-gate {
4050*0Sstevel@tonic-gate 	return (pp->p_next);
4051*0Sstevel@tonic-gate }
4052*0Sstevel@tonic-gate 
4053*0Sstevel@tonic-gate 
4054*0Sstevel@tonic-gate /*
4055*0Sstevel@tonic-gate  * Add the page to the front of the linked list of pages
4056*0Sstevel@tonic-gate  * using p_vpnext/p_vpprev pointers for the list.
4057*0Sstevel@tonic-gate  *
4058*0Sstevel@tonic-gate  * The caller is responsible for protecting the lists.
4059*0Sstevel@tonic-gate  */
4060*0Sstevel@tonic-gate void
4061*0Sstevel@tonic-gate page_vpadd(page_t **ppp, page_t *pp)
4062*0Sstevel@tonic-gate {
4063*0Sstevel@tonic-gate 	if (*ppp == NULL) {
4064*0Sstevel@tonic-gate 		pp->p_vpnext = pp->p_vpprev = pp;
4065*0Sstevel@tonic-gate 	} else {
4066*0Sstevel@tonic-gate 		pp->p_vpnext = *ppp;
4067*0Sstevel@tonic-gate 		pp->p_vpprev = (*ppp)->p_vpprev;
4068*0Sstevel@tonic-gate 		(*ppp)->p_vpprev = pp;
4069*0Sstevel@tonic-gate 		pp->p_vpprev->p_vpnext = pp;
4070*0Sstevel@tonic-gate 	}
4071*0Sstevel@tonic-gate 	*ppp = pp;
4072*0Sstevel@tonic-gate }
4073*0Sstevel@tonic-gate 
4074*0Sstevel@tonic-gate /*
4075*0Sstevel@tonic-gate  * Remove this page from the linked list of pages
4076*0Sstevel@tonic-gate  * using p_vpnext/p_vpprev pointers for the list.
4077*0Sstevel@tonic-gate  *
4078*0Sstevel@tonic-gate  * The caller is responsible for protecting the lists.
4079*0Sstevel@tonic-gate  */
4080*0Sstevel@tonic-gate void
4081*0Sstevel@tonic-gate page_vpsub(page_t **ppp, page_t *pp)
4082*0Sstevel@tonic-gate {
4083*0Sstevel@tonic-gate 	if (*ppp == NULL || pp == NULL) {
4084*0Sstevel@tonic-gate 		panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
4085*0Sstevel@tonic-gate 		    (void *)pp, (void *)(*ppp));
4086*0Sstevel@tonic-gate 		/*NOTREACHED*/
4087*0Sstevel@tonic-gate 	}
4088*0Sstevel@tonic-gate 
4089*0Sstevel@tonic-gate 	if (*ppp == pp)
4090*0Sstevel@tonic-gate 		*ppp = pp->p_vpnext;		/* go to next page */
4091*0Sstevel@tonic-gate 
4092*0Sstevel@tonic-gate 	if (*ppp == pp)
4093*0Sstevel@tonic-gate 		*ppp = NULL;			/* page list is gone */
4094*0Sstevel@tonic-gate 	else {
4095*0Sstevel@tonic-gate 		pp->p_vpprev->p_vpnext = pp->p_vpnext;
4096*0Sstevel@tonic-gate 		pp->p_vpnext->p_vpprev = pp->p_vpprev;
4097*0Sstevel@tonic-gate 	}
4098*0Sstevel@tonic-gate 	pp->p_vpprev = pp->p_vpnext = pp;	/* make pp a list of one */
4099*0Sstevel@tonic-gate }
4100*0Sstevel@tonic-gate 
4101*0Sstevel@tonic-gate /*
4102*0Sstevel@tonic-gate  * Lock a physical page into memory "long term".  Used to support "lock
4103*0Sstevel@tonic-gate  * in memory" functions.  Accepts the page to be locked, and a cow variable
4104*0Sstevel@tonic-gate  * to indicate whether the lock will travel to the new page during
4105*0Sstevel@tonic-gate  * a potential copy-on-write.
4106*0Sstevel@tonic-gate  */
4107*0Sstevel@tonic-gate int
4108*0Sstevel@tonic-gate page_pp_lock(
4109*0Sstevel@tonic-gate 	page_t *pp,			/* page to be locked */
4110*0Sstevel@tonic-gate 	int cow,			/* cow lock */
4111*0Sstevel@tonic-gate 	int kernel)			/* must succeed -- ignore checking */
4112*0Sstevel@tonic-gate {
4113*0Sstevel@tonic-gate 	int r = 0;			/* result -- assume failure */
4114*0Sstevel@tonic-gate 
4115*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
4116*0Sstevel@tonic-gate 
4117*0Sstevel@tonic-gate 	page_struct_lock(pp);
4118*0Sstevel@tonic-gate 	/*
4119*0Sstevel@tonic-gate 	 * Acquire the "freemem_lock" for availrmem.
4120*0Sstevel@tonic-gate 	 */
4121*0Sstevel@tonic-gate 	if (cow) {
4122*0Sstevel@tonic-gate 		mutex_enter(&freemem_lock);
4123*0Sstevel@tonic-gate 		if ((availrmem > pages_pp_maximum) &&
4124*0Sstevel@tonic-gate 		    (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4125*0Sstevel@tonic-gate 			availrmem--;
4126*0Sstevel@tonic-gate 			pages_locked++;
4127*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4128*0Sstevel@tonic-gate 			r = 1;
4129*0Sstevel@tonic-gate 			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4130*0Sstevel@tonic-gate 				cmn_err(CE_WARN,
4131*0Sstevel@tonic-gate 				    "COW lock limit reached on pfn 0x%lx",
4132*0Sstevel@tonic-gate 				    page_pptonum(pp));
4133*0Sstevel@tonic-gate 			}
4134*0Sstevel@tonic-gate 		} else
4135*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4136*0Sstevel@tonic-gate 	} else {
4137*0Sstevel@tonic-gate 		if (pp->p_lckcnt) {
4138*0Sstevel@tonic-gate 			if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4139*0Sstevel@tonic-gate 				r = 1;
4140*0Sstevel@tonic-gate 				if (++pp->p_lckcnt ==
4141*0Sstevel@tonic-gate 				    (ushort_t)PAGE_LOCK_MAXIMUM) {
4142*0Sstevel@tonic-gate 					cmn_err(CE_WARN, "Page lock limit "
4143*0Sstevel@tonic-gate 					    "reached on pfn 0x%lx",
4144*0Sstevel@tonic-gate 					    page_pptonum(pp));
4145*0Sstevel@tonic-gate 				}
4146*0Sstevel@tonic-gate 			}
4147*0Sstevel@tonic-gate 		} else {
4148*0Sstevel@tonic-gate 			if (kernel) {
4149*0Sstevel@tonic-gate 				/* availrmem accounting done by caller */
4150*0Sstevel@tonic-gate 				++pp->p_lckcnt;
4151*0Sstevel@tonic-gate 				r = 1;
4152*0Sstevel@tonic-gate 			} else {
4153*0Sstevel@tonic-gate 				mutex_enter(&freemem_lock);
4154*0Sstevel@tonic-gate 				if (availrmem > pages_pp_maximum) {
4155*0Sstevel@tonic-gate 					availrmem--;
4156*0Sstevel@tonic-gate 					pages_locked++;
4157*0Sstevel@tonic-gate 					++pp->p_lckcnt;
4158*0Sstevel@tonic-gate 					r = 1;
4159*0Sstevel@tonic-gate 				}
4160*0Sstevel@tonic-gate 				mutex_exit(&freemem_lock);
4161*0Sstevel@tonic-gate 			}
4162*0Sstevel@tonic-gate 		}
4163*0Sstevel@tonic-gate 	}
4164*0Sstevel@tonic-gate 	page_struct_unlock(pp);
4165*0Sstevel@tonic-gate 	return (r);
4166*0Sstevel@tonic-gate }
4167*0Sstevel@tonic-gate 
4168*0Sstevel@tonic-gate /*
4169*0Sstevel@tonic-gate  * Decommit a lock on a physical page frame.  Account for cow locks if
4170*0Sstevel@tonic-gate  * appropriate.
4171*0Sstevel@tonic-gate  */
4172*0Sstevel@tonic-gate void
4173*0Sstevel@tonic-gate page_pp_unlock(
4174*0Sstevel@tonic-gate 	page_t *pp,			/* page to be unlocked */
4175*0Sstevel@tonic-gate 	int cow,			/* expect cow lock */
4176*0Sstevel@tonic-gate 	int kernel)			/* this was a kernel lock */
4177*0Sstevel@tonic-gate {
4178*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
4179*0Sstevel@tonic-gate 
4180*0Sstevel@tonic-gate 	page_struct_lock(pp);
4181*0Sstevel@tonic-gate 	/*
4182*0Sstevel@tonic-gate 	 * Acquire the "freemem_lock" for availrmem.
4183*0Sstevel@tonic-gate 	 * If cowcnt or lckcnt is already 0, do nothing; i.e., we
4184*0Sstevel@tonic-gate 	 * could be called to unlock even if nothing is locked. This could
4185*0Sstevel@tonic-gate 	 * happen if locked file pages were truncated (removing the lock)
4186*0Sstevel@tonic-gate 	 * and the file was grown again and new pages faulted in; the new
4187*0Sstevel@tonic-gate 	 * pages are unlocked but the segment still thinks they're locked.
4188*0Sstevel@tonic-gate 	 */
4189*0Sstevel@tonic-gate 	if (cow) {
4190*0Sstevel@tonic-gate 		if (pp->p_cowcnt) {
4191*0Sstevel@tonic-gate 			mutex_enter(&freemem_lock);
4192*0Sstevel@tonic-gate 			pp->p_cowcnt--;
4193*0Sstevel@tonic-gate 			availrmem++;
4194*0Sstevel@tonic-gate 			pages_locked--;
4195*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4196*0Sstevel@tonic-gate 		}
4197*0Sstevel@tonic-gate 	} else {
4198*0Sstevel@tonic-gate 		if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
4199*0Sstevel@tonic-gate 			if (!kernel) {
4200*0Sstevel@tonic-gate 				mutex_enter(&freemem_lock);
4201*0Sstevel@tonic-gate 				availrmem++;
4202*0Sstevel@tonic-gate 				pages_locked--;
4203*0Sstevel@tonic-gate 				mutex_exit(&freemem_lock);
4204*0Sstevel@tonic-gate 			}
4205*0Sstevel@tonic-gate 		}
4206*0Sstevel@tonic-gate 	}
4207*0Sstevel@tonic-gate 	page_struct_unlock(pp);
4208*0Sstevel@tonic-gate }
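
/*
 * Illustrative usage (hypothetical caller, not part of this file):
 * page_pp_lock() and page_pp_unlock() are meant to be called in
 * matched pairs with the same 'cow' and 'kernel' arguments, e.g. to
 * pin a user page that is already PAGE_LOCKED:
 *
 *	if (!page_pp_lock(pp, 0, 0))
 *		return (EAGAIN);
 *	(use the pinned page)
 *	page_pp_unlock(pp, 0, 0);
 */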
4209*0Sstevel@tonic-gate 
4210*0Sstevel@tonic-gate /*
4211*0Sstevel@tonic-gate  * This routine reserves availrmem for npages;
4212*0Sstevel@tonic-gate  * 	flags: KM_NOSLEEP or KM_SLEEP
4213*0Sstevel@tonic-gate  * 	returns 1 on success or 0 on failure
4214*0Sstevel@tonic-gate  */
4215*0Sstevel@tonic-gate int
4216*0Sstevel@tonic-gate page_resv(pgcnt_t npages, uint_t flags)
4217*0Sstevel@tonic-gate {
4218*0Sstevel@tonic-gate 	mutex_enter(&freemem_lock);
4219*0Sstevel@tonic-gate 	while (availrmem < tune.t_minarmem + npages) {
4220*0Sstevel@tonic-gate 		if (flags & KM_NOSLEEP) {
4221*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4222*0Sstevel@tonic-gate 			return (0);
4223*0Sstevel@tonic-gate 		}
4224*0Sstevel@tonic-gate 		mutex_exit(&freemem_lock);
4225*0Sstevel@tonic-gate 		page_needfree(npages);
4226*0Sstevel@tonic-gate 		kmem_reap();
4227*0Sstevel@tonic-gate 		delay(hz >> 2);
4228*0Sstevel@tonic-gate 		page_needfree(-(spgcnt_t)npages);
4229*0Sstevel@tonic-gate 		mutex_enter(&freemem_lock);
4230*0Sstevel@tonic-gate 	}
4231*0Sstevel@tonic-gate 	availrmem -= npages;
4232*0Sstevel@tonic-gate 	mutex_exit(&freemem_lock);
4233*0Sstevel@tonic-gate 	return (1);
4234*0Sstevel@tonic-gate }
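
/*
 * Illustrative usage (hypothetical caller): a subsystem that needs to
 * set aside npages for a long-lived purpose would bracket the work with
 * page_resv()/page_unresv():
 *
 *	if (!page_resv(npages, KM_NOSLEEP))
 *		return (ENOMEM);
 *	(allocate and use the pages)
 *	page_unresv(npages);
 *
 * With KM_SLEEP, page_resv() instead loops -- advertising the need via
 * page_needfree(), reaping the kmem caches, and pausing a quarter of a
 * second -- until enough of availrmem can be reserved.
 */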
4235*0Sstevel@tonic-gate 
4236*0Sstevel@tonic-gate /*
4237*0Sstevel@tonic-gate  * This routine unreserves availrmem for npages;
4238*0Sstevel@tonic-gate  */
4239*0Sstevel@tonic-gate void
4240*0Sstevel@tonic-gate page_unresv(pgcnt_t npages)
4241*0Sstevel@tonic-gate {
4242*0Sstevel@tonic-gate 	mutex_enter(&freemem_lock);
4243*0Sstevel@tonic-gate 	availrmem += npages;
4244*0Sstevel@tonic-gate 	mutex_exit(&freemem_lock);
4245*0Sstevel@tonic-gate }
4246*0Sstevel@tonic-gate 
4247*0Sstevel@tonic-gate /*
4248*0Sstevel@tonic-gate  * See the statement at the beginning of segvn_lockop() regarding
4249*0Sstevel@tonic-gate  * the way we handle cowcnts and lckcnts.
4250*0Sstevel@tonic-gate  *
4251*0Sstevel@tonic-gate  * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
4252*0Sstevel@tonic-gate  * that breaks COW has PROT_WRITE.
4253*0Sstevel@tonic-gate  *
4254*0Sstevel@tonic-gate  * Note that we may also break COW in case we are softlocking
4255*0Sstevel@tonic-gate  * on read access during physio;
4256*0Sstevel@tonic-gate  * in this softlock case, the vpage may not have PROT_WRITE.
4257*0Sstevel@tonic-gate  * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
4258*0Sstevel@tonic-gate  * if the vpage doesn't have PROT_WRITE.
4259*0Sstevel@tonic-gate  *
4260*0Sstevel@tonic-gate  * This routine is never called if we are stealing a page
4261*0Sstevel@tonic-gate  * in anon_private.
4262*0Sstevel@tonic-gate  *
4263*0Sstevel@tonic-gate  * The caller subtracted from availrmem for a read-only mapping;
4264*0Sstevel@tonic-gate  * if lckcnt is 1, increment availrmem.
4265*0Sstevel@tonic-gate  */
4266*0Sstevel@tonic-gate void
4267*0Sstevel@tonic-gate page_pp_useclaim(
4268*0Sstevel@tonic-gate 	page_t *opp,		/* original page frame losing lock */
4269*0Sstevel@tonic-gate 	page_t *npp,		/* new page frame gaining lock */
4270*0Sstevel@tonic-gate 	uint_t	write_perm) 	/* set if vpage has PROT_WRITE */
4271*0Sstevel@tonic-gate {
4272*0Sstevel@tonic-gate 	int payback = 0;
4273*0Sstevel@tonic-gate 
4274*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(opp));
4275*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(npp));
4276*0Sstevel@tonic-gate 
4277*0Sstevel@tonic-gate 	page_struct_lock(opp);
4278*0Sstevel@tonic-gate 
4279*0Sstevel@tonic-gate 	ASSERT(npp->p_cowcnt == 0);
4280*0Sstevel@tonic-gate 	ASSERT(npp->p_lckcnt == 0);
4281*0Sstevel@tonic-gate 
4282*0Sstevel@tonic-gate 	/* Don't use claim if nothing is locked (see page_pp_unlock above) */
4283*0Sstevel@tonic-gate 	if ((write_perm && opp->p_cowcnt != 0) ||
4284*0Sstevel@tonic-gate 	    (!write_perm && opp->p_lckcnt != 0)) {
4285*0Sstevel@tonic-gate 
4286*0Sstevel@tonic-gate 		if (write_perm) {
4287*0Sstevel@tonic-gate 			npp->p_cowcnt++;
4288*0Sstevel@tonic-gate 			ASSERT(opp->p_cowcnt != 0);
4289*0Sstevel@tonic-gate 			opp->p_cowcnt--;
4290*0Sstevel@tonic-gate 		} else {
4291*0Sstevel@tonic-gate 
4292*0Sstevel@tonic-gate 			ASSERT(opp->p_lckcnt != 0);
4293*0Sstevel@tonic-gate 
4294*0Sstevel@tonic-gate 			/*
4295*0Sstevel@tonic-gate 			 * We didn't need availrmem decremented if p_lckcnt on
4296*0Sstevel@tonic-gate 			 * the original page is 1.  Here, we are unlocking the
4297*0Sstevel@tonic-gate 			 * read-only copy belonging to the original page and
4298*0Sstevel@tonic-gate 			 * locking a copy belonging to the new page.
4299*0Sstevel@tonic-gate 			 */
4300*0Sstevel@tonic-gate 			if (opp->p_lckcnt == 1)
4301*0Sstevel@tonic-gate 				payback = 1;
4302*0Sstevel@tonic-gate 
4303*0Sstevel@tonic-gate 			npp->p_lckcnt++;
4304*0Sstevel@tonic-gate 			opp->p_lckcnt--;
4305*0Sstevel@tonic-gate 		}
4306*0Sstevel@tonic-gate 	}
4307*0Sstevel@tonic-gate 	if (payback) {
4308*0Sstevel@tonic-gate 		mutex_enter(&freemem_lock);
4309*0Sstevel@tonic-gate 		availrmem++;
4310*0Sstevel@tonic-gate 		pages_useclaim--;
4311*0Sstevel@tonic-gate 		mutex_exit(&freemem_lock);
4312*0Sstevel@tonic-gate 	}
4313*0Sstevel@tonic-gate 	page_struct_unlock(opp);
4314*0Sstevel@tonic-gate }
4315*0Sstevel@tonic-gate 
4316*0Sstevel@tonic-gate /*
4317*0Sstevel@tonic-gate  * Simple claim adjust functions -- used to support changes in
4318*0Sstevel@tonic-gate  * claims due to changes in access permissions.  Used by segvn_setprot().
4319*0Sstevel@tonic-gate  */
4320*0Sstevel@tonic-gate int
4321*0Sstevel@tonic-gate page_addclaim(page_t *pp)
4322*0Sstevel@tonic-gate {
4323*0Sstevel@tonic-gate 	int r = 0;			/* result */
4324*0Sstevel@tonic-gate 
4325*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
4326*0Sstevel@tonic-gate 
4327*0Sstevel@tonic-gate 	page_struct_lock(pp);
4328*0Sstevel@tonic-gate 	ASSERT(pp->p_lckcnt != 0);
4329*0Sstevel@tonic-gate 
4330*0Sstevel@tonic-gate 	if (pp->p_lckcnt == 1) {
4331*0Sstevel@tonic-gate 		if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4332*0Sstevel@tonic-gate 			--pp->p_lckcnt;
4333*0Sstevel@tonic-gate 			r = 1;
4334*0Sstevel@tonic-gate 			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4335*0Sstevel@tonic-gate 				cmn_err(CE_WARN,
4336*0Sstevel@tonic-gate 				    "COW lock limit reached on pfn 0x%lx",
4337*0Sstevel@tonic-gate 				    page_pptonum(pp));
4338*0Sstevel@tonic-gate 			}
4339*0Sstevel@tonic-gate 		}
4340*0Sstevel@tonic-gate 	} else {
4341*0Sstevel@tonic-gate 		mutex_enter(&freemem_lock);
4342*0Sstevel@tonic-gate 		if ((availrmem > pages_pp_maximum) &&
4343*0Sstevel@tonic-gate 		    (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4344*0Sstevel@tonic-gate 			--availrmem;
4345*0Sstevel@tonic-gate 			++pages_claimed;
4346*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4347*0Sstevel@tonic-gate 			--pp->p_lckcnt;
4348*0Sstevel@tonic-gate 			r = 1;
4349*0Sstevel@tonic-gate 			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4350*0Sstevel@tonic-gate 				cmn_err(CE_WARN,
4351*0Sstevel@tonic-gate 				    "COW lock limit reached on pfn 0x%lx",
4352*0Sstevel@tonic-gate 				    page_pptonum(pp));
4353*0Sstevel@tonic-gate 			}
4354*0Sstevel@tonic-gate 		} else
4355*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4356*0Sstevel@tonic-gate 	}
4357*0Sstevel@tonic-gate 	page_struct_unlock(pp);
4358*0Sstevel@tonic-gate 	return (r);
4359*0Sstevel@tonic-gate }
4360*0Sstevel@tonic-gate 
4361*0Sstevel@tonic-gate int
4362*0Sstevel@tonic-gate page_subclaim(page_t *pp)
4363*0Sstevel@tonic-gate {
4364*0Sstevel@tonic-gate 	int r = 0;
4365*0Sstevel@tonic-gate 
4366*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
4367*0Sstevel@tonic-gate 
4368*0Sstevel@tonic-gate 	page_struct_lock(pp);
4369*0Sstevel@tonic-gate 	ASSERT(pp->p_cowcnt != 0);
4370*0Sstevel@tonic-gate 
4371*0Sstevel@tonic-gate 	if (pp->p_lckcnt) {
4372*0Sstevel@tonic-gate 		if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4373*0Sstevel@tonic-gate 			r = 1;
4374*0Sstevel@tonic-gate 			/*
4375*0Sstevel@tonic-gate 			 * for availrmem
4376*0Sstevel@tonic-gate 			 */
4377*0Sstevel@tonic-gate 			mutex_enter(&freemem_lock);
4378*0Sstevel@tonic-gate 			availrmem++;
4379*0Sstevel@tonic-gate 			pages_claimed--;
4380*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4381*0Sstevel@tonic-gate 
4382*0Sstevel@tonic-gate 			pp->p_cowcnt--;
4383*0Sstevel@tonic-gate 
4384*0Sstevel@tonic-gate 			if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4385*0Sstevel@tonic-gate 				cmn_err(CE_WARN,
4386*0Sstevel@tonic-gate 				    "Page lock limit reached on pfn 0x%lx",
4387*0Sstevel@tonic-gate 				    page_pptonum(pp));
4388*0Sstevel@tonic-gate 			}
4389*0Sstevel@tonic-gate 		}
4390*0Sstevel@tonic-gate 	} else {
4391*0Sstevel@tonic-gate 		r = 1;
4392*0Sstevel@tonic-gate 		pp->p_cowcnt--;
4393*0Sstevel@tonic-gate 		pp->p_lckcnt++;
4394*0Sstevel@tonic-gate 	}
4395*0Sstevel@tonic-gate 	page_struct_unlock(pp);
4396*0Sstevel@tonic-gate 	return (r);
4397*0Sstevel@tonic-gate }
4398*0Sstevel@tonic-gate 
4399*0Sstevel@tonic-gate int
4400*0Sstevel@tonic-gate page_addclaim_pages(page_t  **ppa)
4401*0Sstevel@tonic-gate {
4402*0Sstevel@tonic-gate 
4403*0Sstevel@tonic-gate 	pgcnt_t	lckpgs = 0, pg_idx;
4404*0Sstevel@tonic-gate 
4405*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4406*0Sstevel@tonic-gate 
4407*0Sstevel@tonic-gate 	mutex_enter(&page_llock);
4408*0Sstevel@tonic-gate 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4409*0Sstevel@tonic-gate 
4410*0Sstevel@tonic-gate 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4411*0Sstevel@tonic-gate 		ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4412*0Sstevel@tonic-gate 		if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4413*0Sstevel@tonic-gate 			mutex_exit(&page_llock);
4414*0Sstevel@tonic-gate 			return (0);
4415*0Sstevel@tonic-gate 		}
4416*0Sstevel@tonic-gate 		if (ppa[pg_idx]->p_lckcnt > 1)
4417*0Sstevel@tonic-gate 			lckpgs++;
4418*0Sstevel@tonic-gate 	}
4419*0Sstevel@tonic-gate 
4420*0Sstevel@tonic-gate 	if (lckpgs != 0) {
4421*0Sstevel@tonic-gate 		mutex_enter(&freemem_lock);
4422*0Sstevel@tonic-gate 		if (availrmem >= pages_pp_maximum + lckpgs) {
4423*0Sstevel@tonic-gate 			availrmem -= lckpgs;
4424*0Sstevel@tonic-gate 			pages_claimed += lckpgs;
4425*0Sstevel@tonic-gate 		} else {
4426*0Sstevel@tonic-gate 			mutex_exit(&freemem_lock);
4427*0Sstevel@tonic-gate 			mutex_exit(&page_llock);
4428*0Sstevel@tonic-gate 			return (0);
4429*0Sstevel@tonic-gate 		}
4430*0Sstevel@tonic-gate 		mutex_exit(&freemem_lock);
4431*0Sstevel@tonic-gate 	}
4432*0Sstevel@tonic-gate 
4433*0Sstevel@tonic-gate 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4434*0Sstevel@tonic-gate 		ppa[pg_idx]->p_lckcnt--;
4435*0Sstevel@tonic-gate 		ppa[pg_idx]->p_cowcnt++;
4436*0Sstevel@tonic-gate 	}
4437*0Sstevel@tonic-gate 	mutex_exit(&page_llock);
4438*0Sstevel@tonic-gate 	return (1);
4439*0Sstevel@tonic-gate }
4440*0Sstevel@tonic-gate 
4441*0Sstevel@tonic-gate int
4442*0Sstevel@tonic-gate page_subclaim_pages(page_t  **ppa)
4443*0Sstevel@tonic-gate {
4444*0Sstevel@tonic-gate 	pgcnt_t	ulckpgs = 0, pg_idx;
4445*0Sstevel@tonic-gate 
4446*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4447*0Sstevel@tonic-gate 
4448*0Sstevel@tonic-gate 	mutex_enter(&page_llock);
4449*0Sstevel@tonic-gate 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4450*0Sstevel@tonic-gate 
4451*0Sstevel@tonic-gate 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4452*0Sstevel@tonic-gate 		ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4453*0Sstevel@tonic-gate 		if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4454*0Sstevel@tonic-gate 			mutex_exit(&page_llock);
4455*0Sstevel@tonic-gate 			return (0);
4456*0Sstevel@tonic-gate 		}
4457*0Sstevel@tonic-gate 		if (ppa[pg_idx]->p_lckcnt != 0)
4458*0Sstevel@tonic-gate 			ulckpgs++;
4459*0Sstevel@tonic-gate 	}
4460*0Sstevel@tonic-gate 
4461*0Sstevel@tonic-gate 	if (ulckpgs != 0) {
4462*0Sstevel@tonic-gate 		mutex_enter(&freemem_lock);
4463*0Sstevel@tonic-gate 		availrmem += ulckpgs;
4464*0Sstevel@tonic-gate 		pages_claimed -= ulckpgs;
4465*0Sstevel@tonic-gate 		mutex_exit(&freemem_lock);
4466*0Sstevel@tonic-gate 	}
4467*0Sstevel@tonic-gate 
4468*0Sstevel@tonic-gate 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4469*0Sstevel@tonic-gate 		ppa[pg_idx]->p_cowcnt--;
4470*0Sstevel@tonic-gate 		ppa[pg_idx]->p_lckcnt++;
4471*0Sstevel@tonic-gate 
4472*0Sstevel@tonic-gate 	}
4473*0Sstevel@tonic-gate 	mutex_exit(&page_llock);
4474*0Sstevel@tonic-gate 	return (1);
4475*0Sstevel@tonic-gate }
4476*0Sstevel@tonic-gate 
4477*0Sstevel@tonic-gate page_t *
4478*0Sstevel@tonic-gate page_numtopp(pfn_t pfnum, se_t se)
4479*0Sstevel@tonic-gate {
4480*0Sstevel@tonic-gate 	page_t *pp;
4481*0Sstevel@tonic-gate 
4482*0Sstevel@tonic-gate retry:
4483*0Sstevel@tonic-gate 	pp = page_numtopp_nolock(pfnum);
4484*0Sstevel@tonic-gate 	if (pp == NULL) {
4485*0Sstevel@tonic-gate 		return ((page_t *)NULL);
4486*0Sstevel@tonic-gate 	}
4487*0Sstevel@tonic-gate 
4488*0Sstevel@tonic-gate 	/*
4489*0Sstevel@tonic-gate 	 * Acquire the appropriate lock on the page.
4490*0Sstevel@tonic-gate 	 */
4491*0Sstevel@tonic-gate 	while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4492*0Sstevel@tonic-gate 		if (page_pptonum(pp) != pfnum)
4493*0Sstevel@tonic-gate 			goto retry;
4494*0Sstevel@tonic-gate 		continue;
4495*0Sstevel@tonic-gate 	}
4496*0Sstevel@tonic-gate 
4497*0Sstevel@tonic-gate 	if (page_pptonum(pp) != pfnum) {
4498*0Sstevel@tonic-gate 		page_unlock(pp);
4499*0Sstevel@tonic-gate 		goto retry;
4500*0Sstevel@tonic-gate 	}
4501*0Sstevel@tonic-gate 
4502*0Sstevel@tonic-gate 	return (pp);
4503*0Sstevel@tonic-gate }
4504*0Sstevel@tonic-gate 
4505*0Sstevel@tonic-gate page_t *
4506*0Sstevel@tonic-gate page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4507*0Sstevel@tonic-gate {
4508*0Sstevel@tonic-gate 	page_t *pp;
4509*0Sstevel@tonic-gate 
4510*0Sstevel@tonic-gate retry:
4511*0Sstevel@tonic-gate 	pp = page_numtopp_nolock(pfnum);
4512*0Sstevel@tonic-gate 	if (pp == NULL) {
4513*0Sstevel@tonic-gate 		return ((page_t *)NULL);
4514*0Sstevel@tonic-gate 	}
4515*0Sstevel@tonic-gate 
4516*0Sstevel@tonic-gate 	/*
4517*0Sstevel@tonic-gate 	 * Acquire the appropriate lock on the page.
4518*0Sstevel@tonic-gate 	 */
4519*0Sstevel@tonic-gate 	while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4520*0Sstevel@tonic-gate 		if (page_pptonum(pp) != pfnum)
4521*0Sstevel@tonic-gate 			goto retry;
4522*0Sstevel@tonic-gate 		continue;
4523*0Sstevel@tonic-gate 	}
4524*0Sstevel@tonic-gate 
4525*0Sstevel@tonic-gate 	if (page_pptonum(pp) != pfnum) {
4526*0Sstevel@tonic-gate 		page_unlock(pp);
4527*0Sstevel@tonic-gate 		goto retry;
4528*0Sstevel@tonic-gate 	}
4529*0Sstevel@tonic-gate 
4530*0Sstevel@tonic-gate 	return (pp);
4531*0Sstevel@tonic-gate }
4532*0Sstevel@tonic-gate 
4533*0Sstevel@tonic-gate /*
4534*0Sstevel@tonic-gate  * This routine is like page_numtopp, but it only returns page structs
4535*0Sstevel@tonic-gate  * for pages that are safe to load into hardware using the page struct.
4536*0Sstevel@tonic-gate  */
4537*0Sstevel@tonic-gate page_t *
4538*0Sstevel@tonic-gate page_numtopp_nowait(pfn_t pfnum, se_t se)
4539*0Sstevel@tonic-gate {
4540*0Sstevel@tonic-gate 	page_t *pp;
4541*0Sstevel@tonic-gate 
4542*0Sstevel@tonic-gate retry:
4543*0Sstevel@tonic-gate 	pp = page_numtopp_nolock(pfnum);
4544*0Sstevel@tonic-gate 	if (pp == NULL) {
4545*0Sstevel@tonic-gate 		return ((page_t *)NULL);
4546*0Sstevel@tonic-gate 	}
4547*0Sstevel@tonic-gate 
4548*0Sstevel@tonic-gate 	/*
4549*0Sstevel@tonic-gate 	 * Try to acquire the appropriate lock on the page.
4550*0Sstevel@tonic-gate 	 */
4551*0Sstevel@tonic-gate 	if (PP_ISFREE(pp))
4552*0Sstevel@tonic-gate 		pp = NULL;
4553*0Sstevel@tonic-gate 	else {
4554*0Sstevel@tonic-gate 		if (!page_trylock(pp, se))
4555*0Sstevel@tonic-gate 			pp = NULL;
4556*0Sstevel@tonic-gate 		else {
4557*0Sstevel@tonic-gate 			if (page_pptonum(pp) != pfnum) {
4558*0Sstevel@tonic-gate 				page_unlock(pp);
4559*0Sstevel@tonic-gate 				goto retry;
4560*0Sstevel@tonic-gate 			}
4561*0Sstevel@tonic-gate 			if (PP_ISFREE(pp)) {
4562*0Sstevel@tonic-gate 				page_unlock(pp);
4563*0Sstevel@tonic-gate 				pp = NULL;
4564*0Sstevel@tonic-gate 			}
4565*0Sstevel@tonic-gate 		}
4566*0Sstevel@tonic-gate 	}
4567*0Sstevel@tonic-gate 	return (pp);
4568*0Sstevel@tonic-gate }
4569*0Sstevel@tonic-gate 
4570*0Sstevel@tonic-gate /*
4571*0Sstevel@tonic-gate  * Returns a count of dirty pages that are in the process
4572*0Sstevel@tonic-gate  * of being written out.  If 'cleanit' is set, try to push each such page.
4573*0Sstevel@tonic-gate  */
4574*0Sstevel@tonic-gate pgcnt_t
4575*0Sstevel@tonic-gate page_busy(int cleanit)
4576*0Sstevel@tonic-gate {
4577*0Sstevel@tonic-gate 	page_t *page0 = page_first();
4578*0Sstevel@tonic-gate 	page_t *pp = page0;
4579*0Sstevel@tonic-gate 	pgcnt_t nppbusy = 0;
4580*0Sstevel@tonic-gate 	u_offset_t off;
4581*0Sstevel@tonic-gate 
4582*0Sstevel@tonic-gate 	do {
4583*0Sstevel@tonic-gate 		vnode_t *vp = pp->p_vnode;
4584*0Sstevel@tonic-gate 
4585*0Sstevel@tonic-gate 		/*
4586*0Sstevel@tonic-gate 		 * A page is a candidate for syncing if it is:
4587*0Sstevel@tonic-gate 		 *
4588*0Sstevel@tonic-gate 		 * (a)	On neither the freelist nor the cachelist
4589*0Sstevel@tonic-gate 		 * (b)	Hashed onto a vnode
4590*0Sstevel@tonic-gate 		 * (c)	Not a kernel page
4591*0Sstevel@tonic-gate 		 * (d)	Dirty
4592*0Sstevel@tonic-gate 		 * (e)	Not part of a swapfile
4593*0Sstevel@tonic-gate 		 * (f)	Backed by a real vnode, i.e., one with a non-null
4594*0Sstevel@tonic-gate 		 *	v_vfsp pointer
4595*0Sstevel@tonic-gate 		 * (g)	Backed by a filesystem which doesn't have a
4596*0Sstevel@tonic-gate 		 *	stubbed-out sync operation
4597*0Sstevel@tonic-gate 		 */
4598*0Sstevel@tonic-gate 		if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp &&
4599*0Sstevel@tonic-gate 		    hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4600*0Sstevel@tonic-gate 		    vfs_can_sync(vp->v_vfsp)) {
4601*0Sstevel@tonic-gate 			nppbusy++;
4602*0Sstevel@tonic-gate 			vfs_syncprogress();
4603*0Sstevel@tonic-gate 
4604*0Sstevel@tonic-gate 			if (!cleanit)
4605*0Sstevel@tonic-gate 				continue;
4606*0Sstevel@tonic-gate 			if (!page_trylock(pp, SE_EXCL))
4607*0Sstevel@tonic-gate 				continue;
4608*0Sstevel@tonic-gate 
4609*0Sstevel@tonic-gate 			if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4610*0Sstevel@tonic-gate 			    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4611*0Sstevel@tonic-gate 			    !(hat_pagesync(pp,
4612*0Sstevel@tonic-gate 			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4613*0Sstevel@tonic-gate 				page_unlock(pp);
4614*0Sstevel@tonic-gate 				continue;
4615*0Sstevel@tonic-gate 			}
4616*0Sstevel@tonic-gate 			off = pp->p_offset;
4617*0Sstevel@tonic-gate 			VN_HOLD(vp);
4618*0Sstevel@tonic-gate 			page_unlock(pp);
4619*0Sstevel@tonic-gate 			(void) VOP_PUTPAGE(vp, off, PAGESIZE,
4620*0Sstevel@tonic-gate 			    B_ASYNC | B_FREE, kcred);
4621*0Sstevel@tonic-gate 			VN_RELE(vp);
4622*0Sstevel@tonic-gate 		}
4623*0Sstevel@tonic-gate 	} while ((pp = page_next(pp)) != page0);
4624*0Sstevel@tonic-gate 
4625*0Sstevel@tonic-gate 	return (nppbusy);
4626*0Sstevel@tonic-gate }
4627*0Sstevel@tonic-gate 
4628*0Sstevel@tonic-gate void page_invalidate_pages(void);
4629*0Sstevel@tonic-gate 
4630*0Sstevel@tonic-gate /*
4631*0Sstevel@tonic-gate  * Callback handler to the VM sub-system.
4632*0Sstevel@tonic-gate  *
4633*0Sstevel@tonic-gate  * Callers must ensure that this function is not entered recursively.
4634*0Sstevel@tonic-gate  */
4635*0Sstevel@tonic-gate /*ARGSUSED*/
4636*0Sstevel@tonic-gate boolean_t
4637*0Sstevel@tonic-gate callb_vm_cpr(void *arg, int code)
4638*0Sstevel@tonic-gate {
4639*0Sstevel@tonic-gate 	if (code == CB_CODE_CPR_CHKPT)
4640*0Sstevel@tonic-gate 		page_invalidate_pages();
4641*0Sstevel@tonic-gate 	return (B_TRUE);
4642*0Sstevel@tonic-gate }
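
/*
 * A minimal sketch of how such a handler gets hooked up: register it in
 * the CPR VM callback class via callb_add() from <sys/callb.h>.  The
 * "vm" tag and the wrapper itself are illustrative assumptions.
 */
static callb_id_t
example_register_vm_cpr(void)
{
	return (callb_add(callb_vm_cpr, NULL, CB_CL_CPR_VM, "vm"));
}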
4643*0Sstevel@tonic-gate 
4644*0Sstevel@tonic-gate /*
4645*0Sstevel@tonic-gate  * Invalidate all pages of the system.
4646*0Sstevel@tonic-gate  * It should not be called until all user page activity has stopped.
4647*0Sstevel@tonic-gate  */
4648*0Sstevel@tonic-gate void
4649*0Sstevel@tonic-gate page_invalidate_pages()
4650*0Sstevel@tonic-gate {
4651*0Sstevel@tonic-gate 	page_t *pp;
4652*0Sstevel@tonic-gate 	page_t *page0;
4653*0Sstevel@tonic-gate 	pgcnt_t nbusypages;
4654*0Sstevel@tonic-gate 	int retry = 0;
4655*0Sstevel@tonic-gate 	const int MAXRETRIES = 4;
4656*0Sstevel@tonic-gate #if defined(__sparc)
4657*0Sstevel@tonic-gate 	extern struct vnode prom_ppages;
4658*0Sstevel@tonic-gate #endif /* __sparc */
4659*0Sstevel@tonic-gate 
4660*0Sstevel@tonic-gate top:
4661*0Sstevel@tonic-gate 	/*
4662*0Sstevel@tonic-gate 	 * Flush dirty pages and destroy the clean ones.
4663*0Sstevel@tonic-gate 	 */
4664*0Sstevel@tonic-gate 	nbusypages = 0;
4665*0Sstevel@tonic-gate 
4666*0Sstevel@tonic-gate 	pp = page0 = page_first();
4667*0Sstevel@tonic-gate 	do {
4668*0Sstevel@tonic-gate 		struct vnode	*vp;
4669*0Sstevel@tonic-gate 		u_offset_t	offset;
4670*0Sstevel@tonic-gate 		int		mod;
4671*0Sstevel@tonic-gate 
4672*0Sstevel@tonic-gate 		/*
4673*0Sstevel@tonic-gate 		 * Skip the page if it has no vnode, or if it is associated
4674*0Sstevel@tonic-gate 		 * with the kernel vnode or prom-allocated kernel memory.
4675*0Sstevel@tonic-gate 		 */
4676*0Sstevel@tonic-gate #if defined(__sparc)
4677*0Sstevel@tonic-gate 		if ((vp = pp->p_vnode) == NULL || vp == &kvp ||
4678*0Sstevel@tonic-gate 		    vp == &prom_ppages)
4679*0Sstevel@tonic-gate #else /* x86 doesn't have prom or prom_ppages */
4680*0Sstevel@tonic-gate 		if ((vp = pp->p_vnode) == NULL || vp == &kvp)
4681*0Sstevel@tonic-gate #endif /* __sparc */
4682*0Sstevel@tonic-gate 			continue;
4683*0Sstevel@tonic-gate 
4684*0Sstevel@tonic-gate 		/*
4685*0Sstevel@tonic-gate 		 * Skip the page if it has already been freed and invalidated.
4686*0Sstevel@tonic-gate 		 */
4687*0Sstevel@tonic-gate 		if (PP_ISFREE(pp) && PP_ISAGED(pp))
4688*0Sstevel@tonic-gate 			continue;
4689*0Sstevel@tonic-gate 
4690*0Sstevel@tonic-gate 		/*
4691*0Sstevel@tonic-gate 		 * Skip pages that are already locked, can't be "exclusively"
4692*0Sstevel@tonic-gate 		 * locked, or are already free.  After we lock the page, check
4693*0Sstevel@tonic-gate 		 * the free and age bits again to be sure it hasn't been
4694*0Sstevel@tonic-gate 		 * destroyed yet.
4695*0Sstevel@tonic-gate 		 * To achieve maximum parallelism, we use page_trylock instead
4696*0Sstevel@tonic-gate 		 * of page_lock so that we don't block on individual pages
4697*0Sstevel@tonic-gate 		 * while we have thousands of other pages to process.
4698*0Sstevel@tonic-gate 		 */
4699*0Sstevel@tonic-gate 		if (!page_trylock(pp, SE_EXCL)) {
4700*0Sstevel@tonic-gate 			nbusypages++;
4701*0Sstevel@tonic-gate 			continue;
4702*0Sstevel@tonic-gate 		} else if (PP_ISFREE(pp)) {
4703*0Sstevel@tonic-gate 			if (!PP_ISAGED(pp)) {
4704*0Sstevel@tonic-gate 				page_destroy_free(pp);
4705*0Sstevel@tonic-gate 			} else {
4706*0Sstevel@tonic-gate 				page_unlock(pp);
4707*0Sstevel@tonic-gate 			}
4708*0Sstevel@tonic-gate 			continue;
4709*0Sstevel@tonic-gate 		}
4710*0Sstevel@tonic-gate 		/*
4711*0Sstevel@tonic-gate 		 * Is this page involved in some I/O? shared?
4712*0Sstevel@tonic-gate 		 *
4713*0Sstevel@tonic-gate 		 * The page_struct_lock need not be acquired to
4714*0Sstevel@tonic-gate 		 * examine these fields since the page has an
4715*0Sstevel@tonic-gate 		 * "exclusive" lock.
4716*0Sstevel@tonic-gate 		 */
4717*0Sstevel@tonic-gate 		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4718*0Sstevel@tonic-gate 			page_unlock(pp);
4719*0Sstevel@tonic-gate 			continue;
4720*0Sstevel@tonic-gate 		}
4721*0Sstevel@tonic-gate 
4722*0Sstevel@tonic-gate 		if (vp->v_type == VCHR) {
4723*0Sstevel@tonic-gate 			panic("vp->v_type == VCHR");
4724*0Sstevel@tonic-gate 			/*NOTREACHED*/
4725*0Sstevel@tonic-gate 		}
4726*0Sstevel@tonic-gate 
4727*0Sstevel@tonic-gate 		if (!page_try_demote_pages(pp)) {
4728*0Sstevel@tonic-gate 			page_unlock(pp);
4729*0Sstevel@tonic-gate 			continue;
4730*0Sstevel@tonic-gate 		}
4731*0Sstevel@tonic-gate 
4732*0Sstevel@tonic-gate 		/*
4733*0Sstevel@tonic-gate 		 * Check the modified bit. Leave the bits alone in hardware
4734*0Sstevel@tonic-gate 		 * (they will be modified if we do the putpage).
4735*0Sstevel@tonic-gate 		 */
4736*0Sstevel@tonic-gate 		mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4737*0Sstevel@tonic-gate 		    & P_MOD);
4738*0Sstevel@tonic-gate 		if (mod) {
4739*0Sstevel@tonic-gate 			offset = pp->p_offset;
4740*0Sstevel@tonic-gate 			/*
4741*0Sstevel@tonic-gate 			 * Hold the vnode before releasing the page lock
4742*0Sstevel@tonic-gate 			 * to prevent it from being freed and re-used by
4743*0Sstevel@tonic-gate 			 * some other thread.
4744*0Sstevel@tonic-gate 			 */
4745*0Sstevel@tonic-gate 			VN_HOLD(vp);
4746*0Sstevel@tonic-gate 			page_unlock(pp);
4747*0Sstevel@tonic-gate 			/*
4748*0Sstevel@tonic-gate 			 * No error return is checked here. Callers such as
4749*0Sstevel@tonic-gate 			 * cpr deal with the dirty pages at dump time
4750*0Sstevel@tonic-gate 			 * if this putpage fails.
4751*0Sstevel@tonic-gate 			 */
4752*0Sstevel@tonic-gate 			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4753*0Sstevel@tonic-gate 			    kcred);
4754*0Sstevel@tonic-gate 			VN_RELE(vp);
4755*0Sstevel@tonic-gate 		} else {
4756*0Sstevel@tonic-gate 			page_destroy(pp, 0);
4757*0Sstevel@tonic-gate 		}
4758*0Sstevel@tonic-gate 	} while ((pp = page_next(pp)) != page0);
4759*0Sstevel@tonic-gate 	if (nbusypages && retry++ < MAXRETRIES) {
4760*0Sstevel@tonic-gate 		delay(1);
4761*0Sstevel@tonic-gate 		goto top;
4762*0Sstevel@tonic-gate 	}
4763*0Sstevel@tonic-gate }
4764*0Sstevel@tonic-gate 
4765*0Sstevel@tonic-gate /*
4766*0Sstevel@tonic-gate  * Replace the page "old" with the page "new" on the page hash and vnode lists
4767*0Sstevel@tonic-gate  *
4768*0Sstevel@tonic-gate  * The replacement must be done in place, i.e., the equivalent sequence:
4769*0Sstevel@tonic-gate  *
4770*0Sstevel@tonic-gate  *	vp = old->p_vnode;
4771*0Sstevel@tonic-gate  *	off = old->p_offset;
4772*0Sstevel@tonic-gate  *	page_do_hashout(old)
4773*0Sstevel@tonic-gate  *	page_do_hashin(new, vp, off)
4774*0Sstevel@tonic-gate  *
4775*0Sstevel@tonic-gate  * doesn't work, since
4776*0Sstevel@tonic-gate  *  1) if old is the only page on the vnode, the v_pages list has a window
4777*0Sstevel@tonic-gate  *     where it looks empty. This will break file system assumptions.
4778*0Sstevel@tonic-gate  * and
4779*0Sstevel@tonic-gate  *  2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4780*0Sstevel@tonic-gate  */
4781*0Sstevel@tonic-gate static void
4782*0Sstevel@tonic-gate page_do_relocate_hash(page_t *new, page_t *old)
4783*0Sstevel@tonic-gate {
4784*0Sstevel@tonic-gate 	page_t	**hash_list;
4785*0Sstevel@tonic-gate 	vnode_t	*vp = old->p_vnode;
4786*0Sstevel@tonic-gate 	kmutex_t *sep;
4787*0Sstevel@tonic-gate 
4788*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(old));
4789*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(new));
4790*0Sstevel@tonic-gate 	ASSERT(vp != NULL);
4791*0Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4792*0Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4793*0Sstevel@tonic-gate 
4794*0Sstevel@tonic-gate 	/*
4795*0Sstevel@tonic-gate 	 * First find old page on the page hash list
4796*0Sstevel@tonic-gate 	 */
4797*0Sstevel@tonic-gate 	hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4798*0Sstevel@tonic-gate 
4799*0Sstevel@tonic-gate 	for (;;) {
4800*0Sstevel@tonic-gate 		if (*hash_list == old)
4801*0Sstevel@tonic-gate 			break;
4802*0Sstevel@tonic-gate 		if (*hash_list == NULL) {
4803*0Sstevel@tonic-gate 			panic("page_do_hashout");
4804*0Sstevel@tonic-gate 			/*NOTREACHED*/
4805*0Sstevel@tonic-gate 		}
4806*0Sstevel@tonic-gate 		hash_list = &(*hash_list)->p_hash;
4807*0Sstevel@tonic-gate 	}
4808*0Sstevel@tonic-gate 
4809*0Sstevel@tonic-gate 	/*
4810*0Sstevel@tonic-gate 	 * update new and replace old with new on the page hash list
4811*0Sstevel@tonic-gate 	 */
4812*0Sstevel@tonic-gate 	new->p_vnode = old->p_vnode;
4813*0Sstevel@tonic-gate 	new->p_offset = old->p_offset;
4814*0Sstevel@tonic-gate 	new->p_hash = old->p_hash;
4815*0Sstevel@tonic-gate 	*hash_list = new;
4816*0Sstevel@tonic-gate 
4817*0Sstevel@tonic-gate 	if ((new->p_vnode->v_flag & VISSWAP) != 0)
4818*0Sstevel@tonic-gate 		PP_SETSWAP(new);
4819*0Sstevel@tonic-gate 
4820*0Sstevel@tonic-gate 	/*
4821*0Sstevel@tonic-gate 	 * replace old with new on the vnode's page list
4822*0Sstevel@tonic-gate 	 */
4823*0Sstevel@tonic-gate 	if (old->p_vpnext == old) {
4824*0Sstevel@tonic-gate 		new->p_vpnext = new;
4825*0Sstevel@tonic-gate 		new->p_vpprev = new;
4826*0Sstevel@tonic-gate 	} else {
4827*0Sstevel@tonic-gate 		new->p_vpnext = old->p_vpnext;
4828*0Sstevel@tonic-gate 		new->p_vpprev = old->p_vpprev;
4829*0Sstevel@tonic-gate 		new->p_vpnext->p_vpprev = new;
4830*0Sstevel@tonic-gate 		new->p_vpprev->p_vpnext = new;
4831*0Sstevel@tonic-gate 	}
4832*0Sstevel@tonic-gate 	if (vp->v_pages == old)
4833*0Sstevel@tonic-gate 		vp->v_pages = new;
4834*0Sstevel@tonic-gate 
4835*0Sstevel@tonic-gate 	/*
4836*0Sstevel@tonic-gate 	 * clear out the old page
4837*0Sstevel@tonic-gate 	 */
4838*0Sstevel@tonic-gate 	old->p_hash = NULL;
4839*0Sstevel@tonic-gate 	old->p_vpnext = NULL;
4840*0Sstevel@tonic-gate 	old->p_vpprev = NULL;
4841*0Sstevel@tonic-gate 	old->p_vnode = NULL;
4842*0Sstevel@tonic-gate 	PP_CLRSWAP(old);
4843*0Sstevel@tonic-gate 	old->p_offset = (u_offset_t)-1;
4844*0Sstevel@tonic-gate 	page_clr_all_props(old);
4845*0Sstevel@tonic-gate 
4846*0Sstevel@tonic-gate 	/*
4847*0Sstevel@tonic-gate 	 * Wake up processes waiting for this page.  The page's
4848*0Sstevel@tonic-gate 	 * identity has been changed, and is probably not the
4849*0Sstevel@tonic-gate 	 * desired page any longer.
4850*0Sstevel@tonic-gate 	 */
4851*0Sstevel@tonic-gate 	sep = page_se_mutex(old);
4852*0Sstevel@tonic-gate 	mutex_enter(sep);
4853*0Sstevel@tonic-gate 	if (CV_HAS_WAITERS(&old->p_cv))
4854*0Sstevel@tonic-gate 		cv_broadcast(&old->p_cv);
4855*0Sstevel@tonic-gate 	mutex_exit(sep);
4856*0Sstevel@tonic-gate }
4857*0Sstevel@tonic-gate 
4858*0Sstevel@tonic-gate /*
4859*0Sstevel@tonic-gate  * This function moves the identity of page "pp_old" to page "pp_new".
4860*0Sstevel@tonic-gate  * Both pages must be locked on entry.  "pp_new" is free, has no identity,
4861*0Sstevel@tonic-gate  * and need not be hashed out from anywhere.
4862*0Sstevel@tonic-gate  */
4863*0Sstevel@tonic-gate void
4864*0Sstevel@tonic-gate page_relocate_hash(page_t *pp_new, page_t *pp_old)
4865*0Sstevel@tonic-gate {
4866*0Sstevel@tonic-gate 	vnode_t *vp = pp_old->p_vnode;
4867*0Sstevel@tonic-gate 	u_offset_t off = pp_old->p_offset;
4868*0Sstevel@tonic-gate 	kmutex_t *phm, *vphm;
4869*0Sstevel@tonic-gate 
4870*0Sstevel@tonic-gate 	/*
4871*0Sstevel@tonic-gate 	 * Rehash two pages
4872*0Sstevel@tonic-gate 	 */
4873*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp_old));
4874*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp_new));
4875*0Sstevel@tonic-gate 	ASSERT(vp != NULL);
4876*0Sstevel@tonic-gate 	ASSERT(pp_new->p_vnode == NULL);
4877*0Sstevel@tonic-gate 
4878*0Sstevel@tonic-gate 	/*
4879*0Sstevel@tonic-gate 	 * hashout then hashin while holding the mutexes
4880*0Sstevel@tonic-gate 	 */
4881*0Sstevel@tonic-gate 	phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4882*0Sstevel@tonic-gate 	mutex_enter(phm);
4883*0Sstevel@tonic-gate 	vphm = page_vnode_mutex(vp);
4884*0Sstevel@tonic-gate 	mutex_enter(vphm);
4885*0Sstevel@tonic-gate 
4886*0Sstevel@tonic-gate 	page_do_relocate_hash(pp_new, pp_old);
4887*0Sstevel@tonic-gate 
4888*0Sstevel@tonic-gate 	mutex_exit(vphm);
4889*0Sstevel@tonic-gate 	mutex_exit(phm);
4890*0Sstevel@tonic-gate 
4891*0Sstevel@tonic-gate 	/*
4892*0Sstevel@tonic-gate 	 * The page_struct_lock need not be acquired for lckcnt and
4893*0Sstevel@tonic-gate 	 * cowcnt since the page has an "exclusive" lock.
4894*0Sstevel@tonic-gate 	 */
4895*0Sstevel@tonic-gate 	ASSERT(pp_new->p_lckcnt == 0);
4896*0Sstevel@tonic-gate 	ASSERT(pp_new->p_cowcnt == 0);
4897*0Sstevel@tonic-gate 	pp_new->p_lckcnt = pp_old->p_lckcnt;
4898*0Sstevel@tonic-gate 	pp_new->p_cowcnt = pp_old->p_cowcnt;
4899*0Sstevel@tonic-gate 	pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4900*0Sstevel@tonic-gate 
4901*0Sstevel@tonic-gate 	/* The following comment preserved from page_flip(). */
4902*0Sstevel@tonic-gate 	/* XXX - Do we need to protect fsdata? */
4903*0Sstevel@tonic-gate 	pp_new->p_fsdata = pp_old->p_fsdata;
4904*0Sstevel@tonic-gate }
4905*0Sstevel@tonic-gate 
4906*0Sstevel@tonic-gate /*
4907*0Sstevel@tonic-gate  * Helper routine used to lock all remaining members of a
4908*0Sstevel@tonic-gate  * large page. The caller is responsible for passing in a locked
4909*0Sstevel@tonic-gate  * pp. If pp is a large page, then it succeeds in locking all the
4910*0Sstevel@tonic-gate  * remaining constituent pages or it returns with only the
4911*0Sstevel@tonic-gate  * original page locked.
4912*0Sstevel@tonic-gate  *
4913*0Sstevel@tonic-gate  * Returns 1 on success, 0 on failure.
4914*0Sstevel@tonic-gate  *
4915*0Sstevel@tonic-gate  * If success is returned this routine guarantees that p_szc for all
4916*0Sstevel@tonic-gate  * constituent pages of the large page pp belongs to can't change. To achieve this we
4917*0Sstevel@tonic-gate  * recheck szc of pp after locking all constituent pages and retry if szc
4918*0Sstevel@tonic-gate  * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4919*0Sstevel@tonic-gate  * lock on one of constituent pages it can't be running after all constituent
4920*0Sstevel@tonic-gate  * pages are locked.  hat_page_demote() with a lock on a constituent page
4921*0Sstevel@tonic-gate  * outside of this large page (i.e. pp belonged to a larger large page) is
4922*0Sstevel@tonic-gate  * already done with all constituent pages of pp since the root's p_szc is
4923*0Sstevel@tonic-gate  * changed last. Therefore no need to synchronize with hat_page_demote() that
4924*0Sstevel@tonic-gate  * locked a constituent page outside of pp's current large page.
4925*0Sstevel@tonic-gate  */
4926*0Sstevel@tonic-gate #ifdef DEBUG
4927*0Sstevel@tonic-gate uint32_t gpg_trylock_mtbf = 0;
4928*0Sstevel@tonic-gate #endif
4929*0Sstevel@tonic-gate 
4930*0Sstevel@tonic-gate int
4931*0Sstevel@tonic-gate group_page_trylock(page_t *pp, se_t se)
4932*0Sstevel@tonic-gate {
4933*0Sstevel@tonic-gate 	page_t  *tpp;
4934*0Sstevel@tonic-gate 	pgcnt_t	npgs, i, j;
4935*0Sstevel@tonic-gate 	uint_t pszc = pp->p_szc;
4936*0Sstevel@tonic-gate 
4937*0Sstevel@tonic-gate #ifdef DEBUG
4938*0Sstevel@tonic-gate 	if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4939*0Sstevel@tonic-gate 		return (0);
4940*0Sstevel@tonic-gate 	}
4941*0Sstevel@tonic-gate #endif
4942*0Sstevel@tonic-gate 
4943*0Sstevel@tonic-gate 	if (pp != PP_GROUPLEADER(pp, pszc)) {
4944*0Sstevel@tonic-gate 		return (0);
4945*0Sstevel@tonic-gate 	}
4946*0Sstevel@tonic-gate 
4947*0Sstevel@tonic-gate retry:
4948*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED_SE(pp, se));
4949*0Sstevel@tonic-gate 	ASSERT(!PP_ISFREE(pp));
4950*0Sstevel@tonic-gate 	if (pszc == 0) {
4951*0Sstevel@tonic-gate 		return (1);
4952*0Sstevel@tonic-gate 	}
4953*0Sstevel@tonic-gate 	npgs = page_get_pagecnt(pszc);
4954*0Sstevel@tonic-gate 	tpp = pp + 1;
4955*0Sstevel@tonic-gate 	for (i = 1; i < npgs; i++, tpp++) {
4956*0Sstevel@tonic-gate 		if (!page_trylock(tpp, se)) {
4957*0Sstevel@tonic-gate 			tpp = pp + 1;
4958*0Sstevel@tonic-gate 			for (j = 1; j < i; j++, tpp++) {
4959*0Sstevel@tonic-gate 				page_unlock(tpp);
4960*0Sstevel@tonic-gate 			}
4961*0Sstevel@tonic-gate 			return (0);
4962*0Sstevel@tonic-gate 		}
4963*0Sstevel@tonic-gate 	}
4964*0Sstevel@tonic-gate 	if (pp->p_szc != pszc) {
4965*0Sstevel@tonic-gate 		ASSERT(pp->p_szc < pszc);
4966*0Sstevel@tonic-gate 		ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp &&
4967*0Sstevel@tonic-gate 		    !IS_SWAPFSVP(pp->p_vnode));
4968*0Sstevel@tonic-gate 		tpp = pp + 1;
4969*0Sstevel@tonic-gate 		for (i = 1; i < npgs; i++, tpp++) {
4970*0Sstevel@tonic-gate 			page_unlock(tpp);
4971*0Sstevel@tonic-gate 		}
4972*0Sstevel@tonic-gate 		pszc = pp->p_szc;
4973*0Sstevel@tonic-gate 		goto retry;
4974*0Sstevel@tonic-gate 	}
4975*0Sstevel@tonic-gate 	return (1);
4976*0Sstevel@tonic-gate }
4977*0Sstevel@tonic-gate 
4978*0Sstevel@tonic-gate void
4979*0Sstevel@tonic-gate group_page_unlock(page_t *pp)
4980*0Sstevel@tonic-gate {
4981*0Sstevel@tonic-gate 	page_t *tpp;
4982*0Sstevel@tonic-gate 	pgcnt_t	npgs, i;
4983*0Sstevel@tonic-gate 
4984*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp));
4985*0Sstevel@tonic-gate 	ASSERT(!PP_ISFREE(pp));
4986*0Sstevel@tonic-gate 	ASSERT(pp == PP_PAGEROOT(pp));
4987*0Sstevel@tonic-gate 	npgs = page_get_pagecnt(pp->p_szc);
4988*0Sstevel@tonic-gate 	for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4989*0Sstevel@tonic-gate 		page_unlock(tpp);
4990*0Sstevel@tonic-gate 	}
4991*0Sstevel@tonic-gate }
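
/*
 * A minimal sketch of the intended pairing, assuming the caller already
 * holds the root page SE_EXCL: group_page_trylock() picks up the
 * remaining constituent pages and group_page_unlock() drops everything
 * except the root.
 */
static int
example_group_locked_op(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));			/* caller locked the root */
	if (!group_page_trylock(pp, SE_EXCL))
		return (EBUSY);
	/* ... operate on the whole large page here ... */
	group_page_unlock(pp);			/* root page stays locked */
	return (0);
}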
4992*0Sstevel@tonic-gate 
4993*0Sstevel@tonic-gate /*
4994*0Sstevel@tonic-gate  * returns
4995*0Sstevel@tonic-gate  * 0 		: on success and *nrelocp is number of relocated PAGESIZE pages
4996*0Sstevel@tonic-gate  * ERANGE	: this is not a base page
4997*0Sstevel@tonic-gate  * EBUSY	: failure to get locks on the page/pages
4998*0Sstevel@tonic-gate  * ENOMEM	: failure to obtain replacement pages
4999*0Sstevel@tonic-gate  * EAGAIN	: OBP has not yet completed its boot-time handoff to the kernel
5000*0Sstevel@tonic-gate  *
5001*0Sstevel@tonic-gate  * Return with all constituent members of target and replacement
5002*0Sstevel@tonic-gate  * SE_EXCL locked. It is the callers responsibility to drop the
5003*0Sstevel@tonic-gate  * locks.
5004*0Sstevel@tonic-gate  */
5005*0Sstevel@tonic-gate int
5006*0Sstevel@tonic-gate do_page_relocate(
5007*0Sstevel@tonic-gate 	page_t **target,
5008*0Sstevel@tonic-gate 	page_t **replacement,
5009*0Sstevel@tonic-gate 	int grouplock,
5010*0Sstevel@tonic-gate 	spgcnt_t *nrelocp,
5011*0Sstevel@tonic-gate 	lgrp_t *lgrp)
5012*0Sstevel@tonic-gate {
5013*0Sstevel@tonic-gate #ifdef DEBUG
5014*0Sstevel@tonic-gate 	page_t *first_repl;
5015*0Sstevel@tonic-gate #endif /* DEBUG */
5016*0Sstevel@tonic-gate 	page_t *repl;
5017*0Sstevel@tonic-gate 	page_t *targ;
5018*0Sstevel@tonic-gate 	page_t *pl = NULL;
5019*0Sstevel@tonic-gate 	uint_t ppattr;
5020*0Sstevel@tonic-gate 	pfn_t   pfn, repl_pfn;
5021*0Sstevel@tonic-gate 	uint_t	szc;
5022*0Sstevel@tonic-gate 	spgcnt_t npgs, i;
5023*0Sstevel@tonic-gate 	int repl_contig = 0;
5024*0Sstevel@tonic-gate 	uint_t flags = 0;
5025*0Sstevel@tonic-gate 	spgcnt_t dofree = 0;
5026*0Sstevel@tonic-gate 
5027*0Sstevel@tonic-gate 	*nrelocp = 0;
5028*0Sstevel@tonic-gate 
5029*0Sstevel@tonic-gate #if defined(__sparc)
5030*0Sstevel@tonic-gate 	/*
5031*0Sstevel@tonic-gate 	 * We need to wait till OBP has completed
5032*0Sstevel@tonic-gate 	 * its boot-time handoff of its resources to the kernel
5033*0Sstevel@tonic-gate 	 * before we allow page relocation
5034*0Sstevel@tonic-gate 	 */
5035*0Sstevel@tonic-gate 	if (page_relocate_ready == 0) {
5036*0Sstevel@tonic-gate 		return (EAGAIN);
5037*0Sstevel@tonic-gate 	}
5038*0Sstevel@tonic-gate #endif
5039*0Sstevel@tonic-gate 
5040*0Sstevel@tonic-gate 	/*
5041*0Sstevel@tonic-gate 	 * If this is not a base page,
5042*0Sstevel@tonic-gate 	 * just return with zero pages relocated.
5043*0Sstevel@tonic-gate 	 */
5044*0Sstevel@tonic-gate 	targ = *target;
5045*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(targ));
5046*0Sstevel@tonic-gate 	ASSERT(!PP_ISFREE(targ));
5047*0Sstevel@tonic-gate 	szc = targ->p_szc;
5048*0Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
5049*0Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
5050*0Sstevel@tonic-gate 	pfn = targ->p_pagenum;
5051*0Sstevel@tonic-gate 	if (pfn != PFN_BASE(pfn, szc)) {
5052*0Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
5053*0Sstevel@tonic-gate 		return (ERANGE);
5054*0Sstevel@tonic-gate 	}
5055*0Sstevel@tonic-gate 
5056*0Sstevel@tonic-gate 	if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
5057*0Sstevel@tonic-gate 		repl_pfn = repl->p_pagenum;
5058*0Sstevel@tonic-gate 		if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
5059*0Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
5060*0Sstevel@tonic-gate 			return (ERANGE);
5061*0Sstevel@tonic-gate 		}
5062*0Sstevel@tonic-gate 		repl_contig = 1;
5063*0Sstevel@tonic-gate 	}
5064*0Sstevel@tonic-gate 
5065*0Sstevel@tonic-gate 	/*
5066*0Sstevel@tonic-gate 	 * We must lock all members of this large page or we cannot
5067*0Sstevel@tonic-gate 	 * relocate any part of it.
5068*0Sstevel@tonic-gate 	 */
5069*0Sstevel@tonic-gate 	if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
5070*0Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
5071*0Sstevel@tonic-gate 		return (EBUSY);
5072*0Sstevel@tonic-gate 	}
5073*0Sstevel@tonic-gate 
5074*0Sstevel@tonic-gate 	/*
5075*0Sstevel@tonic-gate 	 * Reread szc; it could have been decreased before
5076*0Sstevel@tonic-gate 	 * group_page_trylock() was done.
5077*0Sstevel@tonic-gate 	 */
5078*0Sstevel@tonic-gate 	szc = targ->p_szc;
5079*0Sstevel@tonic-gate 	ASSERT(szc < mmu_page_sizes);
5080*0Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
5081*0Sstevel@tonic-gate 	ASSERT(pfn == PFN_BASE(pfn, szc));
5082*0Sstevel@tonic-gate 
5083*0Sstevel@tonic-gate 	npgs = page_get_pagecnt(targ->p_szc);
5084*0Sstevel@tonic-gate 
5085*0Sstevel@tonic-gate 	if (repl == NULL) {
5086*0Sstevel@tonic-gate 		dofree = npgs;		/* Size of target page in MMU pages */
5087*0Sstevel@tonic-gate 		if (!page_create_wait(dofree, 0)) {
5088*0Sstevel@tonic-gate 			if (grouplock != 0) {
5089*0Sstevel@tonic-gate 				group_page_unlock(targ);
5090*0Sstevel@tonic-gate 			}
5091*0Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
5092*0Sstevel@tonic-gate 			return (ENOMEM);
5093*0Sstevel@tonic-gate 		}
5094*0Sstevel@tonic-gate 
5095*0Sstevel@tonic-gate 		/*
5096*0Sstevel@tonic-gate 		 * seg kmem pages require that the target and replacement
5097*0Sstevel@tonic-gate 		 * page be the same pagesize.
5098*0Sstevel@tonic-gate 		 */
5099*0Sstevel@tonic-gate 		flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0;
5100*0Sstevel@tonic-gate 		repl = page_get_replacement_page(targ, lgrp, flags);
5101*0Sstevel@tonic-gate 		if (repl == NULL) {
5102*0Sstevel@tonic-gate 			if (grouplock != 0) {
5103*0Sstevel@tonic-gate 				group_page_unlock(targ);
5104*0Sstevel@tonic-gate 			}
5105*0Sstevel@tonic-gate 			page_create_putback(dofree);
5106*0Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
5107*0Sstevel@tonic-gate 			return (ENOMEM);
5108*0Sstevel@tonic-gate 		}
5109*0Sstevel@tonic-gate 	}
5110*0Sstevel@tonic-gate #ifdef DEBUG
5111*0Sstevel@tonic-gate 	else {
5112*0Sstevel@tonic-gate 		ASSERT(PAGE_LOCKED(repl));
5113*0Sstevel@tonic-gate 	}
5114*0Sstevel@tonic-gate #endif /* DEBUG */
5115*0Sstevel@tonic-gate 
5116*0Sstevel@tonic-gate #if defined(__sparc)
5117*0Sstevel@tonic-gate 	/*
5118*0Sstevel@tonic-gate 	 * Let hat_page_relocate() complete the relocation if it's a kernel page
5119*0Sstevel@tonic-gate 	 */
5120*0Sstevel@tonic-gate 	if (targ->p_vnode == &kvp) {
5121*0Sstevel@tonic-gate 		*replacement = repl;
5122*0Sstevel@tonic-gate 		if (hat_page_relocate(target, replacement, nrelocp) != 0) {
5123*0Sstevel@tonic-gate 			if (grouplock != 0) {
5124*0Sstevel@tonic-gate 				group_page_unlock(targ);
5125*0Sstevel@tonic-gate 			}
5126*0Sstevel@tonic-gate 			if (dofree) {
5127*0Sstevel@tonic-gate 				*replacement = NULL;
5128*0Sstevel@tonic-gate 				page_free_replacement_page(repl);
5129*0Sstevel@tonic-gate 				page_create_putback(dofree);
5130*0Sstevel@tonic-gate 			}
5131*0Sstevel@tonic-gate 			VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
5132*0Sstevel@tonic-gate 			return (EAGAIN);
5133*0Sstevel@tonic-gate 		}
5134*0Sstevel@tonic-gate 		VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
5135*0Sstevel@tonic-gate 		return (0);
5136*0Sstevel@tonic-gate 	}
5137*0Sstevel@tonic-gate #else
5138*0Sstevel@tonic-gate #if defined(lint)
5139*0Sstevel@tonic-gate 	dofree = dofree;
5140*0Sstevel@tonic-gate #endif
5141*0Sstevel@tonic-gate #endif
5142*0Sstevel@tonic-gate 
5143*0Sstevel@tonic-gate #ifdef DEBUG
5144*0Sstevel@tonic-gate 	first_repl = repl;
5145*0Sstevel@tonic-gate #endif /* DEBUG */
5146*0Sstevel@tonic-gate 
5147*0Sstevel@tonic-gate 	for (i = 0; i < npgs; i++) {
5148*0Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(targ));
5149*0Sstevel@tonic-gate 
5150*0Sstevel@tonic-gate 		(void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
5151*0Sstevel@tonic-gate 
5152*0Sstevel@tonic-gate 		ASSERT(hat_page_getshare(targ) == 0);
5153*0Sstevel@tonic-gate 		ASSERT(!PP_ISFREE(targ));
5154*0Sstevel@tonic-gate 		ASSERT(targ->p_pagenum == (pfn + i));
5155*0Sstevel@tonic-gate 		ASSERT(repl_contig == 0 ||
5156*0Sstevel@tonic-gate 		    repl->p_pagenum == (repl_pfn + i));
5157*0Sstevel@tonic-gate 
5158*0Sstevel@tonic-gate 		/*
5159*0Sstevel@tonic-gate 		 * Copy the page contents and attributes then
5160*0Sstevel@tonic-gate 		 * relocate the page in the page hash.
5161*0Sstevel@tonic-gate 		 */
5162*0Sstevel@tonic-gate 		ppcopy(targ, repl);
5163*0Sstevel@tonic-gate 		ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
5164*0Sstevel@tonic-gate 		page_clr_all_props(repl);
5165*0Sstevel@tonic-gate 		page_set_props(repl, ppattr);
5166*0Sstevel@tonic-gate 		page_relocate_hash(repl, targ);
5167*0Sstevel@tonic-gate 
5168*0Sstevel@tonic-gate 		ASSERT(hat_page_getshare(targ) == 0);
5169*0Sstevel@tonic-gate 		ASSERT(hat_page_getshare(repl) == 0);
5170*0Sstevel@tonic-gate 		/*
5171*0Sstevel@tonic-gate 		 * Now clear the props on targ, after the
5172*0Sstevel@tonic-gate 		 * page_relocate_hash(), they no longer
5173*0Sstevel@tonic-gate 		 * have any meaning.
5174*0Sstevel@tonic-gate 		 */
5175*0Sstevel@tonic-gate 		page_clr_all_props(targ);
5176*0Sstevel@tonic-gate 		ASSERT(targ->p_next == targ);
5177*0Sstevel@tonic-gate 		ASSERT(targ->p_prev == targ);
5178*0Sstevel@tonic-gate 		page_list_concat(&pl, &targ);
5179*0Sstevel@tonic-gate 
5180*0Sstevel@tonic-gate 		targ++;
5181*0Sstevel@tonic-gate 		if (repl_contig != 0) {
5182*0Sstevel@tonic-gate 			repl++;
5183*0Sstevel@tonic-gate 		} else {
5184*0Sstevel@tonic-gate 			repl = repl->p_next;
5185*0Sstevel@tonic-gate 		}
5186*0Sstevel@tonic-gate 	}
5187*0Sstevel@tonic-gate 	/* assert that we have come full circle with repl */
5188*0Sstevel@tonic-gate 	ASSERT(repl_contig == 1 || first_repl == repl);
5189*0Sstevel@tonic-gate 
5190*0Sstevel@tonic-gate 	*target = pl;
5191*0Sstevel@tonic-gate 	if (*replacement == NULL) {
5192*0Sstevel@tonic-gate 		ASSERT(first_repl == repl);
5193*0Sstevel@tonic-gate 		*replacement = repl;
5194*0Sstevel@tonic-gate 	}
5195*0Sstevel@tonic-gate 	VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
5196*0Sstevel@tonic-gate 	*nrelocp = npgs;
5197*0Sstevel@tonic-gate 	return (0);
5198*0Sstevel@tonic-gate }
5199*0Sstevel@tonic-gate /*
5200*0Sstevel@tonic-gate  * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
5201*0Sstevel@tonic-gate  */
5202*0Sstevel@tonic-gate int
5203*0Sstevel@tonic-gate page_relocate(
5204*0Sstevel@tonic-gate 	page_t **target,
5205*0Sstevel@tonic-gate 	page_t **replacement,
5206*0Sstevel@tonic-gate 	int grouplock,
5207*0Sstevel@tonic-gate 	int freetarget,
5208*0Sstevel@tonic-gate 	spgcnt_t *nrelocp,
5209*0Sstevel@tonic-gate 	lgrp_t *lgrp)
5210*0Sstevel@tonic-gate {
5211*0Sstevel@tonic-gate 	spgcnt_t ret;
5212*0Sstevel@tonic-gate 
5213*0Sstevel@tonic-gate 	/* do_page_relocate returns 0 on success or errno value */
5214*0Sstevel@tonic-gate 	ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
5215*0Sstevel@tonic-gate 
5216*0Sstevel@tonic-gate 	if (ret != 0 || freetarget == 0) {
5217*0Sstevel@tonic-gate 		return (ret);
5218*0Sstevel@tonic-gate 	}
5219*0Sstevel@tonic-gate 	if (*nrelocp == 1) {
5220*0Sstevel@tonic-gate 		ASSERT(*target != NULL);
5221*0Sstevel@tonic-gate 		page_free(*target, 1);
5222*0Sstevel@tonic-gate 	} else {
5223*0Sstevel@tonic-gate 		page_t *tpp = *target;
5224*0Sstevel@tonic-gate 		uint_t szc = tpp->p_szc;
5225*0Sstevel@tonic-gate 		pgcnt_t npgs = page_get_pagecnt(szc);
5226*0Sstevel@tonic-gate 		ASSERT(npgs > 1);
5227*0Sstevel@tonic-gate 		ASSERT(szc != 0);
5228*0Sstevel@tonic-gate 		do {
5229*0Sstevel@tonic-gate 			ASSERT(PAGE_EXCL(tpp));
5230*0Sstevel@tonic-gate 			ASSERT(!hat_page_is_mapped(tpp));
5231*0Sstevel@tonic-gate 			ASSERT(tpp->p_szc == szc);
5232*0Sstevel@tonic-gate 			PP_SETFREE(tpp);
5233*0Sstevel@tonic-gate 			PP_SETAGED(tpp);
5234*0Sstevel@tonic-gate 			npgs--;
5235*0Sstevel@tonic-gate 		} while ((tpp = tpp->p_next) != *target);
5236*0Sstevel@tonic-gate 		ASSERT(npgs == 0);
5237*0Sstevel@tonic-gate 		page_list_add_pages(*target, 0);
5238*0Sstevel@tonic-gate 		npgs = page_get_pagecnt(szc);
5239*0Sstevel@tonic-gate 		page_create_putback(npgs);
5240*0Sstevel@tonic-gate 	}
5241*0Sstevel@tonic-gate 	return (ret);
5242*0Sstevel@tonic-gate }
5243*0Sstevel@tonic-gate 
5244*0Sstevel@tonic-gate /*
5245*0Sstevel@tonic-gate  * it is up to the caller to deal with pcf accounting.
5246*0Sstevel@tonic-gate  */
5247*0Sstevel@tonic-gate void
5248*0Sstevel@tonic-gate page_free_replacement_page(page_t *pplist)
5249*0Sstevel@tonic-gate {
5250*0Sstevel@tonic-gate 	page_t *pp;
5251*0Sstevel@tonic-gate 
5252*0Sstevel@tonic-gate 	while (pplist != NULL) {
5253*0Sstevel@tonic-gate 		/*
5254*0Sstevel@tonic-gate 		 * pplist is a linked list.
5255*0Sstevel@tonic-gate 		 */
5256*0Sstevel@tonic-gate 		pp = pplist;
5257*0Sstevel@tonic-gate 		if (pp->p_szc == 0) {
5258*0Sstevel@tonic-gate 			page_sub(&pplist, pp);
5259*0Sstevel@tonic-gate 			page_clr_all_props(pp);
5260*0Sstevel@tonic-gate 			PP_SETFREE(pp);
5261*0Sstevel@tonic-gate 			PP_SETAGED(pp);
5262*0Sstevel@tonic-gate 			page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5263*0Sstevel@tonic-gate 			page_unlock(pp);
5264*0Sstevel@tonic-gate 			VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5265*0Sstevel@tonic-gate 		} else {
5266*0Sstevel@tonic-gate 			spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5267*0Sstevel@tonic-gate 			page_t *tpp;
5268*0Sstevel@tonic-gate 			page_list_break(&pp, &pplist, curnpgs);
5269*0Sstevel@tonic-gate 			tpp = pp;
5270*0Sstevel@tonic-gate 			do {
5271*0Sstevel@tonic-gate 				ASSERT(PAGE_EXCL(tpp));
5272*0Sstevel@tonic-gate 				ASSERT(!hat_page_is_mapped(tpp));
5273*0Sstevel@tonic-gate 				page_clr_all_props(tpp);
5274*0Sstevel@tonic-gate 				PP_SETFREE(tpp);
5275*0Sstevel@tonic-gate 				PP_SETAGED(tpp);
5276*0Sstevel@tonic-gate 			} while ((tpp = tpp->p_next) != pp);
5277*0Sstevel@tonic-gate 			page_list_add_pages(pp, 0);
5278*0Sstevel@tonic-gate 			VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5279*0Sstevel@tonic-gate 		}
5280*0Sstevel@tonic-gate 	}
5281*0Sstevel@tonic-gate }
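
/*
 * A minimal sketch of the error-path pairing used by callers (cf. the
 * failure paths in do_page_relocate() above): the replacement list is
 * freed here and the earlier page_create_wait() reservation is undone
 * with page_create_putback().
 */
static void
example_undo_replacement(page_t *repl, spgcnt_t npgs)
{
	page_free_replacement_page(repl);	/* free the whole repl list */
	page_create_putback(npgs);		/* return the reservation */
}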
5282*0Sstevel@tonic-gate 
5283*0Sstevel@tonic-gate /*
5284*0Sstevel@tonic-gate  * Relocate target to non-relocatable replacement page.
5285*0Sstevel@tonic-gate  */
5286*0Sstevel@tonic-gate int
5287*0Sstevel@tonic-gate page_relocate_cage(page_t **target, page_t **replacement)
5288*0Sstevel@tonic-gate {
5289*0Sstevel@tonic-gate 	page_t *tpp, *rpp;
5290*0Sstevel@tonic-gate 	spgcnt_t pgcnt, npgs;
5291*0Sstevel@tonic-gate 	int result;
5292*0Sstevel@tonic-gate 
5293*0Sstevel@tonic-gate 	tpp = *target;
5294*0Sstevel@tonic-gate 
5295*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(tpp));
5296*0Sstevel@tonic-gate 	ASSERT(tpp->p_szc == 0);
5297*0Sstevel@tonic-gate 
5298*0Sstevel@tonic-gate 	pgcnt = btop(page_get_pagesize(tpp->p_szc));
5299*0Sstevel@tonic-gate 
5300*0Sstevel@tonic-gate 	do {
5301*0Sstevel@tonic-gate 		(void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5302*0Sstevel@tonic-gate 		rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5303*0Sstevel@tonic-gate 		if (rpp == NULL) {
5304*0Sstevel@tonic-gate 			page_create_putback(pgcnt);
5305*0Sstevel@tonic-gate 			kcage_cageout_wakeup();
5306*0Sstevel@tonic-gate 		}
5307*0Sstevel@tonic-gate 	} while (rpp == NULL);
5308*0Sstevel@tonic-gate 
5309*0Sstevel@tonic-gate 	ASSERT(PP_ISNORELOC(rpp));
5310*0Sstevel@tonic-gate 
5311*0Sstevel@tonic-gate 	result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5312*0Sstevel@tonic-gate 
5313*0Sstevel@tonic-gate 	if (result == 0) {
5314*0Sstevel@tonic-gate 		*replacement = rpp;
5315*0Sstevel@tonic-gate 		if (pgcnt != npgs)
5316*0Sstevel@tonic-gate 			panic("page_relocate_cage: partial relocation");
5317*0Sstevel@tonic-gate 	}
5318*0Sstevel@tonic-gate 
5319*0Sstevel@tonic-gate 	return (result);
5320*0Sstevel@tonic-gate }
5321*0Sstevel@tonic-gate 
5322*0Sstevel@tonic-gate /*
5323*0Sstevel@tonic-gate  * Release the page lock on a page, place on cachelist
5324*0Sstevel@tonic-gate  * tail if no longer mapped. Caller can let us know if
5325*0Sstevel@tonic-gate  * the page is known to be clean.
5326*0Sstevel@tonic-gate  */
5327*0Sstevel@tonic-gate int
5328*0Sstevel@tonic-gate page_release(page_t *pp, int checkmod)
5329*0Sstevel@tonic-gate {
5330*0Sstevel@tonic-gate 	int status;
5331*0Sstevel@tonic-gate 
5332*0Sstevel@tonic-gate 	ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5333*0Sstevel@tonic-gate 	    (pp->p_vnode != NULL));
5334*0Sstevel@tonic-gate 
5335*0Sstevel@tonic-gate 	if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5336*0Sstevel@tonic-gate 	    ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5337*0Sstevel@tonic-gate 	    pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5338*0Sstevel@tonic-gate 	    !hat_page_is_mapped(pp)) {
5339*0Sstevel@tonic-gate 
5340*0Sstevel@tonic-gate 		/*
5341*0Sstevel@tonic-gate 		 * If page is modified, unlock it
5342*0Sstevel@tonic-gate 		 *
5343*0Sstevel@tonic-gate 		 * (p_nrm & P_MOD) bit has the latest stuff because:
5344*0Sstevel@tonic-gate 		 * (1) We found that this page doesn't have any mappings
5345*0Sstevel@tonic-gate 		 *	_after_ holding SE_EXCL and
5346*0Sstevel@tonic-gate 		 * (2) We didn't drop SE_EXCL lock after the check in (1)
5347*0Sstevel@tonic-gate 		 */
5348*0Sstevel@tonic-gate 		if (checkmod && hat_ismod(pp)) {
5349*0Sstevel@tonic-gate 			page_unlock(pp);
5350*0Sstevel@tonic-gate 			status = PGREL_MOD;
5351*0Sstevel@tonic-gate 		} else {
5352*0Sstevel@tonic-gate 			/*LINTED: constant in conditional context*/
5353*0Sstevel@tonic-gate 			VN_DISPOSE(pp, B_FREE, 0, kcred);
5354*0Sstevel@tonic-gate 			status = PGREL_CLEAN;
5355*0Sstevel@tonic-gate 		}
5356*0Sstevel@tonic-gate 	} else {
5357*0Sstevel@tonic-gate 		page_unlock(pp);
5358*0Sstevel@tonic-gate 		status = PGREL_NOTREL;
5359*0Sstevel@tonic-gate 	}
5360*0Sstevel@tonic-gate 	return (status);
5361*0Sstevel@tonic-gate }
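
/*
 * A sketch of a caller dispatching on page_release()'s return status;
 * what to do for each case is caller policy and only suggested here.
 * In every case the page lock has been dropped by the time we return.
 */
static void
example_release(page_t *pp)
{
	switch (page_release(pp, 1)) {
	case PGREL_CLEAN:
		/* page was freed to the cachelist */
		break;
	case PGREL_MOD:
		/* page is still dirty; a putpage could be scheduled */
		break;
	case PGREL_NOTREL:
		/* page was mapped, locked, or otherwise not releasable */
		break;
	}
}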
5362*0Sstevel@tonic-gate 
5363*0Sstevel@tonic-gate int
5364*0Sstevel@tonic-gate page_try_demote_pages(page_t *pp)
5365*0Sstevel@tonic-gate {
5366*0Sstevel@tonic-gate 	page_t *tpp, *rootpp = pp;
5367*0Sstevel@tonic-gate 	pfn_t	pfn = page_pptonum(pp);
5368*0Sstevel@tonic-gate 	spgcnt_t i, npgs;
5369*0Sstevel@tonic-gate 	uint_t	szc = pp->p_szc;
5370*0Sstevel@tonic-gate 	vnode_t *vp = pp->p_vnode;
5371*0Sstevel@tonic-gate 
5372*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(rootpp));
5373*0Sstevel@tonic-gate 
5374*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5375*0Sstevel@tonic-gate 
5376*0Sstevel@tonic-gate 	if (rootpp->p_szc == 0) {
5377*0Sstevel@tonic-gate 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5378*0Sstevel@tonic-gate 		return (1);
5379*0Sstevel@tonic-gate 	}
5380*0Sstevel@tonic-gate 
5381*0Sstevel@tonic-gate 	if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) {
5382*0Sstevel@tonic-gate 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5383*0Sstevel@tonic-gate 		page_demote_vp_pages(rootpp);
5384*0Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
5385*0Sstevel@tonic-gate 		return (1);
5386*0Sstevel@tonic-gate 	}
5387*0Sstevel@tonic-gate 
5388*0Sstevel@tonic-gate 	/*
5389*0Sstevel@tonic-gate 	 * Adjust rootpp if the page passed in is not the base
5390*0Sstevel@tonic-gate 	 * constituent page.
5391*0Sstevel@tonic-gate 	 */
5392*0Sstevel@tonic-gate 	npgs = page_get_pagecnt(rootpp->p_szc);
5393*0Sstevel@tonic-gate 	ASSERT(npgs > 1);
5394*0Sstevel@tonic-gate 	if (!IS_P2ALIGNED(pfn, npgs)) {
5395*0Sstevel@tonic-gate 		pfn = P2ALIGN(pfn, npgs);
5396*0Sstevel@tonic-gate 		rootpp = page_numtopp_nolock(pfn);
5397*0Sstevel@tonic-gate 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5398*0Sstevel@tonic-gate 		ASSERT(rootpp->p_vnode != NULL);
5399*0Sstevel@tonic-gate 		ASSERT(rootpp->p_szc == szc);
5400*0Sstevel@tonic-gate 	}
5401*0Sstevel@tonic-gate 
5402*0Sstevel@tonic-gate 	/*
5403*0Sstevel@tonic-gate 	 * We can't demote kernel pages since we can't hat_unload()
5404*0Sstevel@tonic-gate 	 * the mappings.
5405*0Sstevel@tonic-gate 	 */
5406*0Sstevel@tonic-gate 	if (rootpp->p_vnode == &kvp)
5407*0Sstevel@tonic-gate 		return (0);
5408*0Sstevel@tonic-gate 
5409*0Sstevel@tonic-gate 	/*
5410*0Sstevel@tonic-gate 	 * Attempt to lock all constituent pages except the page passed
5411*0Sstevel@tonic-gate 	 * in since it's already locked.
5412*0Sstevel@tonic-gate 	 */
5413*0Sstevel@tonic-gate 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) {
5414*0Sstevel@tonic-gate 		ASSERT(!PP_ISFREE(tpp));
5415*0Sstevel@tonic-gate 		ASSERT(tpp->p_vnode != NULL);
5416*0Sstevel@tonic-gate 
5417*0Sstevel@tonic-gate 		if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5418*0Sstevel@tonic-gate 			break;
5419*0Sstevel@tonic-gate 		ASSERT(tpp->p_szc == rootpp->p_szc);
5420*0Sstevel@tonic-gate 		ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5421*0Sstevel@tonic-gate 		(void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5422*0Sstevel@tonic-gate 	}
5423*0Sstevel@tonic-gate 
5424*0Sstevel@tonic-gate 	/*
5425*0Sstevel@tonic-gate 	 * If we failed to lock them all then unlock what we have locked
5426*0Sstevel@tonic-gate 	 * so far and bail.
5427*0Sstevel@tonic-gate 	 */
5428*0Sstevel@tonic-gate 	if (i < npgs) {
5429*0Sstevel@tonic-gate 		tpp = rootpp;
5430*0Sstevel@tonic-gate 		while (i-- > 0) {
5431*0Sstevel@tonic-gate 			if (tpp != pp)
5432*0Sstevel@tonic-gate 				page_unlock(tpp);
5433*0Sstevel@tonic-gate 			tpp = page_next(tpp);
5434*0Sstevel@tonic-gate 		}
5435*0Sstevel@tonic-gate 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5436*0Sstevel@tonic-gate 		return (0);
5437*0Sstevel@tonic-gate 	}
5438*0Sstevel@tonic-gate 
5439*0Sstevel@tonic-gate 	/*
5440*0Sstevel@tonic-gate 	 * XXX probably p_szc clearing and page unlocking can be done within
5441*0Sstevel@tonic-gate 	 * one loop but since this is rare code we can play it very safe.
5442*0Sstevel@tonic-gate 	 */
5443*0Sstevel@tonic-gate 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) {
5444*0Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(tpp));
5445*0Sstevel@tonic-gate 		tpp->p_szc = 0;
5446*0Sstevel@tonic-gate 	}
5447*0Sstevel@tonic-gate 
5448*0Sstevel@tonic-gate 	/*
5449*0Sstevel@tonic-gate 	 * Unlock all pages except the page passed in.
5450*0Sstevel@tonic-gate 	 */
5451*0Sstevel@tonic-gate 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) {
5452*0Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(tpp));
5453*0Sstevel@tonic-gate 		if (tpp != pp)
5454*0Sstevel@tonic-gate 			page_unlock(tpp);
5455*0Sstevel@tonic-gate 	}
5456*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5457*0Sstevel@tonic-gate 	return (1);
5458*0Sstevel@tonic-gate }
5459*0Sstevel@tonic-gate 
5460*0Sstevel@tonic-gate /*
5461*0Sstevel@tonic-gate  * Called by page_free() and page_destroy() to demote the page size code
5462*0Sstevel@tonic-gate  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non-zero
5463*0Sstevel@tonic-gate  * p_szc on the free list, nor can we just clear p_szc of a single page_t
5464*0Sstevel@tonic-gate  * within a large page since it will break other code that relies on p_szc
5465*0Sstevel@tonic-gate  * being the same for all page_t's of a large page). Anonymous pages should
5466*0Sstevel@tonic-gate  * never end up here because anon_map_getpages() cannot deal with p_szc
5467*0Sstevel@tonic-gate  * changes after a single constituent page is locked.  While anonymous or
5468*0Sstevel@tonic-gate  * kernel large pages are demoted or freed the entire large page at a time
5469*0Sstevel@tonic-gate  * with all constituent pages locked EXCL, for file system pages we
5470*0Sstevel@tonic-gate  * have to be able to demote a large page (i.e. decrease all constituent pages
5471*0Sstevel@tonic-gate  * p_szc) with just an EXCL lock on one of the constituent pages. The reason
5472*0Sstevel@tonic-gate  * we can easily deal with anonymous page demotion the entire large page at a
5473*0Sstevel@tonic-gate  * time is that those operations originate at address space level and concern
5474*0Sstevel@tonic-gate  * the entire large page region with actual demotion only done when pages are
5475*0Sstevel@tonic-gate  * not shared with any other processes (therefore we can always get EXCL lock
5476*0Sstevel@tonic-gate  * on all anonymous constituent pages after clearing segment page
5477*0Sstevel@tonic-gate  * cache). However, file system pages can be truncated or invalidated at a
5478*0Sstevel@tonic-gate  * PAGESIZE level from the file system side and end up in page_free() or
5479*0Sstevel@tonic-gate  * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5480*0Sstevel@tonic-gate  * and therefore pageout should be able to demote a large page by EXCL locking
5481*0Sstevel@tonic-gate  * any constituent page that is not under SOFTLOCK). In those cases we cannot
5482*0Sstevel@tonic-gate  * rely on being able to lock EXCL all constituent pages.
5483*0Sstevel@tonic-gate  *
5484*0Sstevel@tonic-gate  * To prevent szc changes on file system pages one has to lock all constituent
5485*0Sstevel@tonic-gate  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5486*0Sstevel@tonic-gate  * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5487*0Sstevel@tonic-gate  * prevent szc changes is the hat layer, which uses its own page level mlist
5488*0Sstevel@tonic-gate  * locks. hat assumes that szc doesn't change after mlist lock for a page is
5489*0Sstevel@tonic-gate  * taken. Therefore we need to change szc under hat level locks if we only
5490*0Sstevel@tonic-gate  * have an EXCL lock on a single constituent page and hat still references any
5491*0Sstevel@tonic-gate  * of the constituent pages.  (Note we can't "ignore" the hat layer by simply
5492*0Sstevel@tonic-gate  * calling hat_pageunload() on all constituent pages without having EXCL locks
5493*0Sstevel@tonic-gate  * on all of the constituent pages). We use the hat_page_demote() call to
5494*0Sstevel@tonic-gate  * safely demote the szc of all constituent pages under hat locks when we only
5495*0Sstevel@tonic-gate  * have an EXCL lock on one of the constituent pages.
5496*0Sstevel@tonic-gate  *
5497*0Sstevel@tonic-gate  * This routine calls page_szc_lock() before calling hat_page_demote() to
5498*0Sstevel@tonic-gate  * allow segvn in one special case not to lock all constituent pages SHARED
5499*0Sstevel@tonic-gate  * before calling hat_memload_array() that relies on p_szc not changing even
5500*0Sstevel@tonic-gate  * before hat level mlist lock is taken.  In that case segvn uses
5501*0Sstevel@tonic-gate  * page_szc_lock() to prevent hat_page_demote() from changing p_szc values.
5502*0Sstevel@tonic-gate  *
5503*0Sstevel@tonic-gate  * Anonymous or kernel page demotion still has to lock all pages exclusively
5504*0Sstevel@tonic-gate  * and do hat_pageunload() on all constituent pages before demoting the page;
5505*0Sstevel@tonic-gate  * therefore there's no need for anonymous or kernel page demotion to use
5506*0Sstevel@tonic-gate  * the hat_page_demote() mechanism.
5507*0Sstevel@tonic-gate  *
5508*0Sstevel@tonic-gate  * hat_page_demote() removes all large mappings that map pp and then decreases
5509*0Sstevel@tonic-gate  * p_szc starting from the last constituent page of the large page. Working
5510*0Sstevel@tonic-gate  * from the tail of a large page in decreasing pfn order allows one looking at
5511*0Sstevel@tonic-gate  * the root page to know that hat_page_demote() is done for the root's szc area.
5512*0Sstevel@tonic-gate  * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5513*0Sstevel@tonic-gate  * pages within szc 1 area to prevent szc changes because hat_page_demote()
5514*0Sstevel@tonic-gate  * that started on this page when it had szc > 1 is done for this szc 1 area.
5515*0Sstevel@tonic-gate  *
5516*0Sstevel@tonic-gate  * We are guaranteed that all constituent pages of pp's large page belong to
5517*0Sstevel@tonic-gate  * the same vnode with the consecutive offsets increasing in the direction of
5518*0Sstevel@tonic-gate  * the pfn i.e. the identity of constituent pages can't change until their
5519*0Sstevel@tonic-gate  * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5520*0Sstevel@tonic-gate  * large mappings to pp even though we don't lock any constituent page except
5521*0Sstevel@tonic-gate  * pp (i.e. we won't unload, e.g., a kernel locked page).
5522*0Sstevel@tonic-gate  */
5523*0Sstevel@tonic-gate static void
5524*0Sstevel@tonic-gate page_demote_vp_pages(page_t *pp)
5525*0Sstevel@tonic-gate {
5526*0Sstevel@tonic-gate 	kmutex_t *mtx;
5527*0Sstevel@tonic-gate 
5528*0Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(pp));
5529*0Sstevel@tonic-gate 	ASSERT(!PP_ISFREE(pp));
5530*0Sstevel@tonic-gate 	ASSERT(pp->p_vnode != NULL);
5531*0Sstevel@tonic-gate 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5532*0Sstevel@tonic-gate 	ASSERT(pp->p_vnode != &kvp);
5533*0Sstevel@tonic-gate 
5534*0Sstevel@tonic-gate 	VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5535*0Sstevel@tonic-gate 
5536*0Sstevel@tonic-gate 	mtx = page_szc_lock(pp);
5537*0Sstevel@tonic-gate 	if (mtx != NULL) {
5538*0Sstevel@tonic-gate 		hat_page_demote(pp);
5539*0Sstevel@tonic-gate 		mutex_exit(mtx);
5540*0Sstevel@tonic-gate 	}
5541*0Sstevel@tonic-gate 	ASSERT(pp->p_szc == 0);
5542*0Sstevel@tonic-gate }
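
/*
 * A minimal sketch of the page_szc_lock() protocol described above,
 * assuming a caller that holds one constituent page SE_EXCL and wants a
 * p_szc value that can't change underneath it.  page_szc_lock() may
 * return NULL when there is nothing to stabilize.
 */
static uint_t
example_stable_szc(page_t *pp)
{
	kmutex_t *mtx;
	uint_t szc;

	ASSERT(PAGE_EXCL(pp));
	mtx = page_szc_lock(pp);	/* keeps hat_page_demote() out */
	szc = pp->p_szc;		/* can't change while mtx is held */
	if (mtx != NULL)
		mutex_exit(mtx);
	return (szc);
}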
5543*0Sstevel@tonic-gate 
5544*0Sstevel@tonic-gate /*
5545*0Sstevel@tonic-gate  * Page retire operation.
5546*0Sstevel@tonic-gate  *
5547*0Sstevel@tonic-gate  * page_retire()
5548*0Sstevel@tonic-gate  * Attempt to retire (throw away) page pp.  We cannot do this if
5549*0Sstevel@tonic-gate  * the page is dirty; if the page is clean, we can try.  We return 0 on
5550*0Sstevel@tonic-gate  * success, -1 on failure.  This routine should be invoked by the platform's
5551*0Sstevel@tonic-gate  * memory error detection code.
5552*0Sstevel@tonic-gate  *
5553*0Sstevel@tonic-gate  * pages_retired_limit_exceeded()
5554*0Sstevel@tonic-gate  * We set a limit on the number of pages which may be retired. This
5555*0Sstevel@tonic-gate  * is set to a percentage of total physical memory. This limit is
5556*0Sstevel@tonic-gate  * enforced here.
5557*0Sstevel@tonic-gate  */
5558*0Sstevel@tonic-gate 
5559*0Sstevel@tonic-gate static pgcnt_t	retired_pgcnt = 0;
5560*0Sstevel@tonic-gate 
5561*0Sstevel@tonic-gate /*
5562*0Sstevel@tonic-gate  * routines to update the count of retired pages
5563*0Sstevel@tonic-gate  */
5564*0Sstevel@tonic-gate static void
5565*0Sstevel@tonic-gate page_retired(page_t *pp)
5566*0Sstevel@tonic-gate {
5567*0Sstevel@tonic-gate 	ASSERT(pp);
5568*0Sstevel@tonic-gate 
5569*0Sstevel@tonic-gate 	page_settoxic(pp, PAGE_IS_RETIRED);
5570*0Sstevel@tonic-gate 	atomic_add_long(&retired_pgcnt, 1);
5571*0Sstevel@tonic-gate }
5572*0Sstevel@tonic-gate 
5573*0Sstevel@tonic-gate static void
5574*0Sstevel@tonic-gate retired_page_removed(page_t *pp)
5575*0Sstevel@tonic-gate {
5576*0Sstevel@tonic-gate 	ASSERT(pp);
5577*0Sstevel@tonic-gate 	ASSERT(page_isretired(pp));
5578*0Sstevel@tonic-gate 	ASSERT(retired_pgcnt > 0);
5579*0Sstevel@tonic-gate 
5580*0Sstevel@tonic-gate 	page_clrtoxic(pp);
5581*0Sstevel@tonic-gate 	atomic_add_long(&retired_pgcnt, -1);
5582*0Sstevel@tonic-gate }
5583*0Sstevel@tonic-gate 
5584*0Sstevel@tonic-gate 
5585*0Sstevel@tonic-gate static int
5586*0Sstevel@tonic-gate pages_retired_limit_exceeded()
5587*0Sstevel@tonic-gate {
5588*0Sstevel@tonic-gate 	pgcnt_t	retired_max;
5589*0Sstevel@tonic-gate 
5590*0Sstevel@tonic-gate 	/*
5591*0Sstevel@tonic-gate 	 * If the percentage is zero or is not set correctly,
5592*0Sstevel@tonic-gate 	 * return TRUE so that pages are not retired.
5593*0Sstevel@tonic-gate 	 */
5594*0Sstevel@tonic-gate 	if (max_pages_retired_bps <= 0 ||
5595*0Sstevel@tonic-gate 	    max_pages_retired_bps >= 10000)
5596*0Sstevel@tonic-gate 		return (1);
5597*0Sstevel@tonic-gate 
5598*0Sstevel@tonic-gate 	/*
5599*0Sstevel@tonic-gate 	 * Calculate the maximum number of pages allowed to
5600*0Sstevel@tonic-gate 	 * be retired as a percentage of total physical memory
5601*0Sstevel@tonic-gate 	 * (Remember that we are using basis points, hence the 10000.)
5602*0Sstevel@tonic-gate 	 */
5603*0Sstevel@tonic-gate 	retired_max = (physmem * max_pages_retired_bps) / 10000;
5604*0Sstevel@tonic-gate 
5605*0Sstevel@tonic-gate 	/*
5606*0Sstevel@tonic-gate 	 * return 'TRUE' if we have already retired more
5607*0Sstevel@tonic-gate 	 * than the legal limit
5608*0Sstevel@tonic-gate 	 */
5609*0Sstevel@tonic-gate 	return (retired_pgcnt >= retired_max);
5610*0Sstevel@tonic-gate }
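
/*
 * A worked example of the basis-point arithmetic above (numbers are
 * illustrative only): with physmem = 1048576 pages and
 * max_pages_retired_bps = 10 (i.e. 0.1%), retired_max is
 * (1048576 * 10) / 10000 = 1048 pages, so retire attempts are refused
 * once retired_pgcnt reaches 1048.
 */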
5611*0Sstevel@tonic-gate 
5612*0Sstevel@tonic-gate #define	PAGE_RETIRE_SELOCK	0
5613*0Sstevel@tonic-gate #define	PAGE_RETIRE_NORECLAIM	1
5614*0Sstevel@tonic-gate #define	PAGE_RETIRE_LOCKED	2
5615*0Sstevel@tonic-gate #define	PAGE_RETIRE_COW		3
5616*0Sstevel@tonic-gate #define	PAGE_RETIRE_DIRTY	4
5617*0Sstevel@tonic-gate #define	PAGE_RETIRE_LPAGE	5
5618*0Sstevel@tonic-gate #define	PAGE_RETIRE_SUCCESS	6
5619*0Sstevel@tonic-gate #define	PAGE_RETIRE_LIMIT	7
5620*0Sstevel@tonic-gate #define	PAGE_RETIRE_NCODES	8
5621*0Sstevel@tonic-gate 
5622*0Sstevel@tonic-gate typedef struct page_retire_op {
5623*0Sstevel@tonic-gate 	int	pr_count;
5624*0Sstevel@tonic-gate 	short	pr_unlock;
5625*0Sstevel@tonic-gate 	short	pr_retval;
5626*0Sstevel@tonic-gate 	char	*pr_message;
5627*0Sstevel@tonic-gate } page_retire_op_t;
5628*0Sstevel@tonic-gate 
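/*
 * One entry per PAGE_RETIRE_* code above; row order must match those
 * codes, since page_retire_done() indexes this table by code.
 */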
5629*0Sstevel@tonic-gate page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = {
5630*0Sstevel@tonic-gate 	{	0,	0,	-1,	"cannot lock page"		},
5631*0Sstevel@tonic-gate 	{	0,	0,	-1,	"cannot reclaim cached page"	},
5632*0Sstevel@tonic-gate 	{	0,	1,	-1,	"page is locked"		},
5633*0Sstevel@tonic-gate 	{	0,	1,	-1,	"copy-on-write page"		},
5634*0Sstevel@tonic-gate 	{	0,	1,	-1,	"page is dirty"			},
5635*0Sstevel@tonic-gate 	{	0,	1,	-1,	"cannot demote large page"	},
5636*0Sstevel@tonic-gate 	{	0,	0,	0,	"page successfully retired"	},
5637*0Sstevel@tonic-gate 	{	0,	0,	-1,	"excess pages retired already"	},
5638*0Sstevel@tonic-gate };
5639*0Sstevel@tonic-gate 
5640*0Sstevel@tonic-gate static int
5641*0Sstevel@tonic-gate page_retire_done(page_t *pp, int code)
5642*0Sstevel@tonic-gate {
5643*0Sstevel@tonic-gate 	page_retire_op_t *prop = &page_retire_ops[code];
5644*0Sstevel@tonic-gate 
5645*0Sstevel@tonic-gate 	prop->pr_count++;
5646*0Sstevel@tonic-gate 
5647*0Sstevel@tonic-gate 	if (prop->pr_unlock)
5648*0Sstevel@tonic-gate 		page_unlock(pp);
5649*0Sstevel@tonic-gate 
5650*0Sstevel@tonic-gate 	if (page_retire_messages > 1) {
5651*0Sstevel@tonic-gate 		printf("page_retire(%p) pfn 0x%lx %s: %s\n",
5652*0Sstevel@tonic-gate 		    (void *)pp, page_pptonum(pp),
5653*0Sstevel@tonic-gate 		    prop->pr_retval == -1 ? "failed" : "succeeded",
5654*0Sstevel@tonic-gate 		    prop->pr_message);
5655*0Sstevel@tonic-gate 	}
5656*0Sstevel@tonic-gate 
5657*0Sstevel@tonic-gate 	return (prop->pr_retval);
5658*0Sstevel@tonic-gate }
5659*0Sstevel@tonic-gate 
5660*0Sstevel@tonic-gate int
5661*0Sstevel@tonic-gate page_retire(page_t *pp, uchar_t flag)
5662*0Sstevel@tonic-gate {
5663*0Sstevel@tonic-gate 	uint64_t pa = ptob((uint64_t)page_pptonum(pp));
5664*0Sstevel@tonic-gate 
5665*0Sstevel@tonic-gate 	ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC);
5666*0Sstevel@tonic-gate 
5667*0Sstevel@tonic-gate 	/*
5668*0Sstevel@tonic-gate 	 * DR operations change the association between a page_t
5669*0Sstevel@tonic-gate 	 * and the physical page it represents. Check if the
5670*0Sstevel@tonic-gate 	 * page is still bad.
5671*0Sstevel@tonic-gate 	 */
5672*0Sstevel@tonic-gate 	if (!page_isfaulty(pp)) {
5673*0Sstevel@tonic-gate 		page_clrtoxic(pp);
5674*0Sstevel@tonic-gate 		return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
5675*0Sstevel@tonic-gate 	}
5676*0Sstevel@tonic-gate 
5677*0Sstevel@tonic-gate 	/*
5678*0Sstevel@tonic-gate 	 * We set the flag here so that even if we fail due
5679*0Sstevel@tonic-gate 	 * to exceeding the limit for retired pages, the
5680*0Sstevel@tonic-gate 	 * page will still be checked and either cleared
5681*0Sstevel@tonic-gate 	 * or retired in page_free().
5682*0Sstevel@tonic-gate 	 */
5683*0Sstevel@tonic-gate 	page_settoxic(pp, flag);
5684*0Sstevel@tonic-gate 
5685*0Sstevel@tonic-gate 	if (flag == PAGE_IS_TOXIC) {
5686*0Sstevel@tonic-gate 		if (page_retire_messages) {
5687*0Sstevel@tonic-gate 			cmn_err(CE_NOTE, "Scheduling clearing of error on"
5688*0Sstevel@tonic-gate 			    " page 0x%08x.%08x",
5689*0Sstevel@tonic-gate 			    (uint32_t)(pa >> 32), (uint32_t)pa);
5690*0Sstevel@tonic-gate 		}
5691*0Sstevel@tonic-gate 
5692*0Sstevel@tonic-gate 	} else { /* PAGE_IS_FAILING */
5693*0Sstevel@tonic-gate 		if (pages_retired_limit_exceeded()) {
5694*0Sstevel@tonic-gate 			/*
5695*0Sstevel@tonic-gate 			 * Return as we have already exceeded the
5696*0Sstevel@tonic-gate 			 * maximum number of pages allowed to be
5697*0Sstevel@tonic-gate 			 * retired
5698*0Sstevel@tonic-gate 			 */
5699*0Sstevel@tonic-gate 			return (page_retire_done(pp, PAGE_RETIRE_LIMIT));
5700*0Sstevel@tonic-gate 		}
5701*0Sstevel@tonic-gate 
5702*0Sstevel@tonic-gate 		if (page_retire_messages) {
5703*0Sstevel@tonic-gate 			cmn_err(CE_NOTE, "Scheduling removal of "
5704*0Sstevel@tonic-gate 			    "page 0x%08x.%08x",
5705*0Sstevel@tonic-gate 			    (uint32_t)(pa >> 32), (uint32_t)pa);
5706*0Sstevel@tonic-gate 		}
5707*0Sstevel@tonic-gate 	}
5708*0Sstevel@tonic-gate 
5709*0Sstevel@tonic-gate 	if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL))
5710*0Sstevel@tonic-gate 		return (page_retire_done(pp, PAGE_RETIRE_SELOCK));
5711*0Sstevel@tonic-gate 
5712*0Sstevel@tonic-gate 	/*
5713*0Sstevel@tonic-gate 	 * If this is a large page we first try to demote it
5714*0Sstevel@tonic-gate 	 * to PAGESIZE pages and then dispose of the toxic page.
5715*0Sstevel@tonic-gate 	 * On failure we will let the page free/destroy
5716*0Sstevel@tonic-gate 	 * code handle it later since this is a mapped page.
5717*0Sstevel@tonic-gate 	 * Note that free large pages can always be demoted.
5718*0Sstevel@tonic-gate 	 *
5719*0Sstevel@tonic-gate 	 */
5720*0Sstevel@tonic-gate 	if (pp->p_szc != 0) {
5721*0Sstevel@tonic-gate 		if (PP_ISFREE(pp))
5722*0Sstevel@tonic-gate 			(void) page_demote_free_pages(pp);
5723*0Sstevel@tonic-gate 		else
5724*0Sstevel@tonic-gate 			(void) page_try_demote_pages(pp);
5725*0Sstevel@tonic-gate 
5726*0Sstevel@tonic-gate 		if (pp->p_szc != 0)
5727*0Sstevel@tonic-gate 			return (page_retire_done(pp, PAGE_RETIRE_LPAGE));
5728*0Sstevel@tonic-gate 	}
5729*0Sstevel@tonic-gate 
5730*0Sstevel@tonic-gate 	if (PP_ISFREE(pp)) {
5731*0Sstevel@tonic-gate 		if (!page_reclaim(pp, NULL))
5732*0Sstevel@tonic-gate 			return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM));
5733*0Sstevel@tonic-gate 		/*LINTED: constant in conditional context*/
5734*0Sstevel@tonic-gate 		VN_DISPOSE(pp, pp->p_vnode ? B_INVAL : B_FREE, 0, kcred);
5735*0Sstevel@tonic-gate 		return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
5736*0Sstevel@tonic-gate 	}
5737*0Sstevel@tonic-gate 
5738*0Sstevel@tonic-gate 	if (pp->p_lckcnt != 0)
5739*0Sstevel@tonic-gate 		return (page_retire_done(pp, PAGE_RETIRE_LOCKED));
5740*0Sstevel@tonic-gate 
5741*0Sstevel@tonic-gate 	if (pp->p_cowcnt != 0)
5742*0Sstevel@tonic-gate 		return (page_retire_done(pp, PAGE_RETIRE_COW));
5743*0Sstevel@tonic-gate 
5744*0Sstevel@tonic-gate 	/*
5745*0Sstevel@tonic-gate 	 * Unload all translations to this page.  No new translations
5746*0Sstevel@tonic-gate 	 * can be created while we hold the exclusive lock on the page.
5747*0Sstevel@tonic-gate 	 */
5748*0Sstevel@tonic-gate 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
5749*0Sstevel@tonic-gate 
5750*0Sstevel@tonic-gate 	if (hat_ismod(pp))
5751*0Sstevel@tonic-gate 		return (page_retire_done(pp, PAGE_RETIRE_DIRTY));
5752*0Sstevel@tonic-gate 
5753*0Sstevel@tonic-gate 	/*LINTED: constant in conditional context*/
5754*0Sstevel@tonic-gate 	VN_DISPOSE(pp, B_INVAL, 0, kcred);
5755*0Sstevel@tonic-gate 
5756*0Sstevel@tonic-gate 	return (page_retire_done(pp, PAGE_RETIRE_SUCCESS));
5757*0Sstevel@tonic-gate }
5758*0Sstevel@tonic-gate 
5759*0Sstevel@tonic-gate /*
5760*0Sstevel@tonic-gate  * Mark any existing pages for migration in the given range
5761*0Sstevel@tonic-gate  */
5762*0Sstevel@tonic-gate void
5763*0Sstevel@tonic-gate page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5764*0Sstevel@tonic-gate     struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5765*0Sstevel@tonic-gate     u_offset_t vnoff, int rflag)
5766*0Sstevel@tonic-gate {
5767*0Sstevel@tonic-gate 	struct anon	*ap;
5768*0Sstevel@tonic-gate 	vnode_t		*curvp;
5769*0Sstevel@tonic-gate 	lgrp_t		*from;
5770*0Sstevel@tonic-gate 	pgcnt_t		i;
5771*0Sstevel@tonic-gate 	pgcnt_t		nlocked;
5772*0Sstevel@tonic-gate 	u_offset_t	off;
5773*0Sstevel@tonic-gate 	pfn_t		pfn;
5774*0Sstevel@tonic-gate 	size_t		pgsz;
5775*0Sstevel@tonic-gate 	size_t		segpgsz;
5776*0Sstevel@tonic-gate 	pgcnt_t		pages;
5777*0Sstevel@tonic-gate 	uint_t		pszc;
5778*0Sstevel@tonic-gate 	page_t		**ppa;
5779*0Sstevel@tonic-gate 	pgcnt_t		ppa_nentries;
5780*0Sstevel@tonic-gate 	page_t		*pp;
5781*0Sstevel@tonic-gate 	caddr_t		va;
5782*0Sstevel@tonic-gate 	ulong_t		an_idx;
5783*0Sstevel@tonic-gate 	anon_sync_obj_t	cookie;
5784*0Sstevel@tonic-gate 
5785*0Sstevel@tonic-gate 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5786*0Sstevel@tonic-gate 
5787*0Sstevel@tonic-gate 	/*
5788*0Sstevel@tonic-gate 	 * Don't do anything if we don't need to do lgroup optimizations
5789*0Sstevel@tonic-gate 	 * on this system
5790*0Sstevel@tonic-gate 	 */
5791*0Sstevel@tonic-gate 	if (!lgrp_optimizations())
5792*0Sstevel@tonic-gate 		return;
5793*0Sstevel@tonic-gate 
5794*0Sstevel@tonic-gate 	/*
5795*0Sstevel@tonic-gate 	 * Align address and length to (potentially large) page boundary
5796*0Sstevel@tonic-gate 	 */
5797*0Sstevel@tonic-gate 	segpgsz = page_get_pagesize(seg->s_szc);
5798*0Sstevel@tonic-gate 	addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5799*0Sstevel@tonic-gate 	if (rflag)
5800*0Sstevel@tonic-gate 		len = P2ROUNDUP(len, segpgsz);
5801*0Sstevel@tonic-gate 
5802*0Sstevel@tonic-gate 	/*
5803*0Sstevel@tonic-gate 	 * Allocate page array to accommodate largest page size
5804*0Sstevel@tonic-gate 	 */
5805*0Sstevel@tonic-gate 	pgsz = page_get_pagesize(page_num_pagesizes() - 1);
5806*0Sstevel@tonic-gate 	ppa_nentries = btop(pgsz);
5807*0Sstevel@tonic-gate 	ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP);
5808*0Sstevel@tonic-gate 
5809*0Sstevel@tonic-gate 	/*
5810*0Sstevel@tonic-gate 	 * Do one (large) page at a time
5811*0Sstevel@tonic-gate 	 */
5812*0Sstevel@tonic-gate 	va = addr;
5813*0Sstevel@tonic-gate 	while (va < addr + len) {
5814*0Sstevel@tonic-gate 		/*
5815*0Sstevel@tonic-gate 		 * Look up the (root) page for the vnode and offset
5816*0Sstevel@tonic-gate 		 * corresponding to this virtual address.
5817*0Sstevel@tonic-gate 		 * Try the anonmap first since there may be copy-on-write
5818*0Sstevel@tonic-gate 		 * pages, but initialize vnode pointer and offset using
5819*0Sstevel@tonic-gate 		 * vnode arguments just in case there isn't an amp.
5820*0Sstevel@tonic-gate 		 */
5821*0Sstevel@tonic-gate 		curvp = vp;
5822*0Sstevel@tonic-gate 		off = vnoff + va - seg->s_base;
5823*0Sstevel@tonic-gate 		if (amp) {
5824*0Sstevel@tonic-gate 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5825*0Sstevel@tonic-gate 			an_idx = anon_index + seg_page(seg, va);
5826*0Sstevel@tonic-gate 			anon_array_enter(amp, an_idx, &cookie);
5827*0Sstevel@tonic-gate 			ap = anon_get_ptr(amp->ahp, an_idx);
5828*0Sstevel@tonic-gate 			if (ap)
5829*0Sstevel@tonic-gate 				swap_xlate(ap, &curvp, &off);
5830*0Sstevel@tonic-gate 			anon_array_exit(&cookie);
5831*0Sstevel@tonic-gate 			ANON_LOCK_EXIT(&amp->a_rwlock);
5832*0Sstevel@tonic-gate 		}
5833*0Sstevel@tonic-gate 
5834*0Sstevel@tonic-gate 		pp = NULL;
5835*0Sstevel@tonic-gate 		if (curvp)
5836*0Sstevel@tonic-gate 			pp = page_lookup(curvp, off, SE_SHARED);
5837*0Sstevel@tonic-gate 
5838*0Sstevel@tonic-gate 		/*
5839*0Sstevel@tonic-gate 		 * If there isn't a page at this virtual address,
5840*0Sstevel@tonic-gate 		 * skip to next page
5841*0Sstevel@tonic-gate 		 */
5842*0Sstevel@tonic-gate 		if (pp == NULL) {
5843*0Sstevel@tonic-gate 			va += PAGESIZE;
5844*0Sstevel@tonic-gate 			continue;
5845*0Sstevel@tonic-gate 		}
5846*0Sstevel@tonic-gate 
5847*0Sstevel@tonic-gate 		/*
5848*0Sstevel@tonic-gate 		 * Figure out which lgroup this page is in for kstats
5849*0Sstevel@tonic-gate 		 */
5850*0Sstevel@tonic-gate 		pfn = page_pptonum(pp);
5851*0Sstevel@tonic-gate 		from = lgrp_pfn_to_lgrp(pfn);
5852*0Sstevel@tonic-gate 
5853*0Sstevel@tonic-gate 		/*
5854*0Sstevel@tonic-gate 		 * Get the page size, then round up and skip to the next
5855*0Sstevel@tonic-gate 		 * page boundary if the address is unaligned
5856*0Sstevel@tonic-gate 		 */
5857*0Sstevel@tonic-gate 		pszc = pp->p_szc;
5858*0Sstevel@tonic-gate 		pgsz = page_get_pagesize(pszc);
5859*0Sstevel@tonic-gate 		pages = btop(pgsz);
5860*0Sstevel@tonic-gate 		if (!IS_P2ALIGNED(va, pgsz) ||
5861*0Sstevel@tonic-gate 		    !IS_P2ALIGNED(pfn, pages) ||
5862*0Sstevel@tonic-gate 		    pgsz > segpgsz) {
5863*0Sstevel@tonic-gate 			pgsz = MIN(pgsz, segpgsz);
5864*0Sstevel@tonic-gate 			page_unlock(pp);
5865*0Sstevel@tonic-gate 			i = btop(P2END((uintptr_t)va, pgsz) -
5866*0Sstevel@tonic-gate 			    (uintptr_t)va);
5867*0Sstevel@tonic-gate 			va = (caddr_t)P2END((uintptr_t)va, pgsz);
5868*0Sstevel@tonic-gate 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i);
5869*0Sstevel@tonic-gate 			continue;
5870*0Sstevel@tonic-gate 		}
5871*0Sstevel@tonic-gate 
5872*0Sstevel@tonic-gate 		/*
5873*0Sstevel@tonic-gate 		 * Upgrade to exclusive lock on page
5874*0Sstevel@tonic-gate 		 */
5875*0Sstevel@tonic-gate 		if (!page_tryupgrade(pp)) {
5876*0Sstevel@tonic-gate 			page_unlock(pp);
5877*0Sstevel@tonic-gate 			va += pgsz;
5878*0Sstevel@tonic-gate 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5879*0Sstevel@tonic-gate 			    btop(pgsz));
5880*0Sstevel@tonic-gate 			continue;
5881*0Sstevel@tonic-gate 		}
5882*0Sstevel@tonic-gate 
5883*0Sstevel@tonic-gate 		/*
5884*0Sstevel@tonic-gate 		 * Remember pages locked exclusively and how many
5885*0Sstevel@tonic-gate 		 */
5886*0Sstevel@tonic-gate 		ppa[0] = pp;
5887*0Sstevel@tonic-gate 		nlocked = 1;
5888*0Sstevel@tonic-gate 
5889*0Sstevel@tonic-gate 		/*
5890*0Sstevel@tonic-gate 		 * Lock the constituent pages if this is a large page
5891*0Sstevel@tonic-gate 		 */
5892*0Sstevel@tonic-gate 		if (pages > 1) {
5893*0Sstevel@tonic-gate 			/*
5894*0Sstevel@tonic-gate 			 * Lock all constituents except root page, since it
5895*0Sstevel@tonic-gate 			 * should be locked already.
5896*0Sstevel@tonic-gate 			 */
5897*0Sstevel@tonic-gate 			for (i = 1; i < pages; i++) {
5898*0Sstevel@tonic-gate 				pp = page_next(pp);
5899*0Sstevel@tonic-gate 				if (!page_trylock(pp, SE_EXCL)) {
5900*0Sstevel@tonic-gate 					break;
5901*0Sstevel@tonic-gate 				}
5902*0Sstevel@tonic-gate 				if (PP_ISFREE(pp) ||
5903*0Sstevel@tonic-gate 				    pp->p_szc != pszc) {
5904*0Sstevel@tonic-gate 					/*
5905*0Sstevel@tonic-gate 					 * hat_page_demote() raced in with us.
5906*0Sstevel@tonic-gate 					 */
5907*0Sstevel@tonic-gate 					ASSERT(!IS_SWAPFSVP(curvp));
5908*0Sstevel@tonic-gate 					page_unlock(pp);
5909*0Sstevel@tonic-gate 					break;
5910*0Sstevel@tonic-gate 				}
5911*0Sstevel@tonic-gate 				ppa[nlocked] = pp;
5912*0Sstevel@tonic-gate 				nlocked++;
5913*0Sstevel@tonic-gate 			}
5914*0Sstevel@tonic-gate 		}
5915*0Sstevel@tonic-gate 
5916*0Sstevel@tonic-gate 		/*
5917*0Sstevel@tonic-gate 		 * If not all constituent pages could be locked,
5918*0Sstevel@tonic-gate 		 * unlock the pages locked so far and skip to the next page.
5919*0Sstevel@tonic-gate 		 */
5920*0Sstevel@tonic-gate 		if (nlocked != pages) {
5921*0Sstevel@tonic-gate 			for (i = 0; i < nlocked; i++)
5922*0Sstevel@tonic-gate 				page_unlock(ppa[i]);
5923*0Sstevel@tonic-gate 			va += pgsz;
5924*0Sstevel@tonic-gate 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5925*0Sstevel@tonic-gate 			    btop(pgsz));
5926*0Sstevel@tonic-gate 			continue;
5927*0Sstevel@tonic-gate 		}
5928*0Sstevel@tonic-gate 
5929*0Sstevel@tonic-gate 		/*
5930*0Sstevel@tonic-gate 		 * hat_page_demote() can no longer happen
5931*0Sstevel@tonic-gate 		 * since the last constituent page had the right p_szc after
5932*0Sstevel@tonic-gate 		 * all constituent pages were locked. All constituent pages
5933*0Sstevel@tonic-gate 		 * should now have the same p_szc.
5934*0Sstevel@tonic-gate 		 */
5935*0Sstevel@tonic-gate 
5936*0Sstevel@tonic-gate 		/*
5937*0Sstevel@tonic-gate 		 * All constituent pages locked successfully, so mark
5938*0Sstevel@tonic-gate 		 * large page for migration and unload the mappings of
5939*0Sstevel@tonic-gate 		 * constituent pages, so a fault will occur on any part of the
5940*0Sstevel@tonic-gate 		 * large page
5941*0Sstevel@tonic-gate 		 */
5942*0Sstevel@tonic-gate 		PP_SETMIGRATE(ppa[0]);
5943*0Sstevel@tonic-gate 		for (i = 0; i < nlocked; i++) {
5944*0Sstevel@tonic-gate 			pp = ppa[i];
5945*0Sstevel@tonic-gate 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
5946*0Sstevel@tonic-gate 			ASSERT(hat_page_getshare(pp) == 0);
5947*0Sstevel@tonic-gate 			page_unlock(pp);
5948*0Sstevel@tonic-gate 		}
5949*0Sstevel@tonic-gate 		lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5950*0Sstevel@tonic-gate 
5951*0Sstevel@tonic-gate 		va += pgsz;
5952*0Sstevel@tonic-gate 	}
5953*0Sstevel@tonic-gate 	kmem_free(ppa, ppa_nentries * sizeof (page_t *));
5954*0Sstevel@tonic-gate }
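
/*
 * Editorial usage sketch (not part of the original source): a
 * hypothetical segment driver would mark a range for migration and
 * then, once the pages have been faulted in and their page_t pointers
 * collected in an array, migrate them in one pass.  All variable
 * names below are illustrative placeholders:
 *
 *	page_mark_migrate(seg, addr, len, amp, anon_index, vp, off, 1);
 *	... fault in pages for [addr, addr + len) and fill ppa[] ...
 *	page_migrate(seg, addr, ppa, npages);
 */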
5955*0Sstevel@tonic-gate 
5956*0Sstevel@tonic-gate /*
5957*0Sstevel@tonic-gate  * Migrate any pages that have been marked for migration in the given range
5958*0Sstevel@tonic-gate  */
5959*0Sstevel@tonic-gate void
5960*0Sstevel@tonic-gate page_migrate(
5961*0Sstevel@tonic-gate 	struct seg	*seg,
5962*0Sstevel@tonic-gate 	caddr_t		addr,
5963*0Sstevel@tonic-gate 	page_t		**ppa,
5964*0Sstevel@tonic-gate 	pgcnt_t		npages)
5965*0Sstevel@tonic-gate {
5966*0Sstevel@tonic-gate 	lgrp_t		*from;
5967*0Sstevel@tonic-gate 	lgrp_t		*to;
5968*0Sstevel@tonic-gate 	page_t		*newpp;
5969*0Sstevel@tonic-gate 	page_t		*pp;
5970*0Sstevel@tonic-gate 	pfn_t		pfn;
5971*0Sstevel@tonic-gate 	size_t		pgsz;
5972*0Sstevel@tonic-gate 	spgcnt_t	page_cnt;
5973*0Sstevel@tonic-gate 	spgcnt_t	i;
5974*0Sstevel@tonic-gate 	uint_t		pszc;
5975*0Sstevel@tonic-gate 
5976*0Sstevel@tonic-gate 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5977*0Sstevel@tonic-gate 
5978*0Sstevel@tonic-gate 	while (npages > 0) {
5979*0Sstevel@tonic-gate 		pp = *ppa;
5980*0Sstevel@tonic-gate 		pszc = pp->p_szc;
5981*0Sstevel@tonic-gate 		pgsz = page_get_pagesize(pszc);
5982*0Sstevel@tonic-gate 		page_cnt = btop(pgsz);
5983*0Sstevel@tonic-gate 
5984*0Sstevel@tonic-gate 		/*
5985*0Sstevel@tonic-gate 		 * Check to see whether this page is marked for migration
5986*0Sstevel@tonic-gate 		 *
5987*0Sstevel@tonic-gate 		 * Assume that root page of large page is marked for
5988*0Sstevel@tonic-gate 		 * migration and none of the other constituent pages
5989*0Sstevel@tonic-gate 		 * are marked.  This really simplifies clearing the
5990*0Sstevel@tonic-gate 		 * migrate bit by not having to clear it from each
5991*0Sstevel@tonic-gate 		 * constituent page.
5992*0Sstevel@tonic-gate 		 *
5993*0Sstevel@tonic-gate 		 * note we don't want to relocate an entire large page if
5994*0Sstevel@tonic-gate 		 * someone is only using one subpage.
5995*0Sstevel@tonic-gate 		 */
5996*0Sstevel@tonic-gate 		if (npages < page_cnt)
5997*0Sstevel@tonic-gate 			break;
5998*0Sstevel@tonic-gate 
5999*0Sstevel@tonic-gate 		/*
6000*0Sstevel@tonic-gate 		 * Is it marked for migration?
6001*0Sstevel@tonic-gate 		 */
6002*0Sstevel@tonic-gate 		if (!PP_ISMIGRATE(pp))
6003*0Sstevel@tonic-gate 			goto next;
6004*0Sstevel@tonic-gate 
6005*0Sstevel@tonic-gate 		/*
6006*0Sstevel@tonic-gate 		 * Determine lgroups that page is being migrated between
6007*0Sstevel@tonic-gate 		 */
6008*0Sstevel@tonic-gate 		pfn = page_pptonum(pp);
6009*0Sstevel@tonic-gate 		if (!IS_P2ALIGNED(pfn, page_cnt)) {
6010*0Sstevel@tonic-gate 			break;
6011*0Sstevel@tonic-gate 		}
6012*0Sstevel@tonic-gate 		from = lgrp_pfn_to_lgrp(pfn);
6013*0Sstevel@tonic-gate 		to = lgrp_mem_choose(seg, addr, pgsz);
6014*0Sstevel@tonic-gate 
6015*0Sstevel@tonic-gate 		/*
6016*0Sstevel@tonic-gate 		 * Check to see whether we are trying to migrate the page to
6017*0Sstevel@tonic-gate 		 * the lgroup where it is already allocated
6018*0Sstevel@tonic-gate 		 */
6019*0Sstevel@tonic-gate 		if (to == from) {
6020*0Sstevel@tonic-gate 			PP_CLRMIGRATE(pp);
6021*0Sstevel@tonic-gate 			goto next;
6022*0Sstevel@tonic-gate 		}
6023*0Sstevel@tonic-gate 
6024*0Sstevel@tonic-gate 		/*
6025*0Sstevel@tonic-gate 		 * Need to get exclusive locks to migrate
6026*0Sstevel@tonic-gate 		 */
6027*0Sstevel@tonic-gate 		for (i = 0; i < page_cnt; i++) {
6028*0Sstevel@tonic-gate 			ASSERT(PAGE_LOCKED(ppa[i]));
6029*0Sstevel@tonic-gate 			if (page_pptonum(ppa[i]) != pfn + i ||
6030*0Sstevel@tonic-gate 			    ppa[i]->p_szc != pszc) {
6031*0Sstevel@tonic-gate 				break;
6032*0Sstevel@tonic-gate 			}
6033*0Sstevel@tonic-gate 			if (!page_tryupgrade(ppa[i])) {
6034*0Sstevel@tonic-gate 				lgrp_stat_add(from->lgrp_id,
6035*0Sstevel@tonic-gate 				    LGRP_PM_FAIL_LOCK_PGS,
6036*0Sstevel@tonic-gate 				    page_cnt);
6037*0Sstevel@tonic-gate 				break;
6038*0Sstevel@tonic-gate 			}
6039*0Sstevel@tonic-gate 		}
6040*0Sstevel@tonic-gate 		if (i != page_cnt) {
6041*0Sstevel@tonic-gate 			while (--i != -1) {
6042*0Sstevel@tonic-gate 				page_downgrade(ppa[i]);
6043*0Sstevel@tonic-gate 			}
6044*0Sstevel@tonic-gate 			goto next;
6045*0Sstevel@tonic-gate 		}
6046*0Sstevel@tonic-gate 
6047*0Sstevel@tonic-gate 		(void) page_create_wait(page_cnt, PG_WAIT);
6048*0Sstevel@tonic-gate 		newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
6049*0Sstevel@tonic-gate 		if (newpp == NULL) {
6050*0Sstevel@tonic-gate 			page_create_putback(page_cnt);
6051*0Sstevel@tonic-gate 			for (i = 0; i < page_cnt; i++) {
6052*0Sstevel@tonic-gate 				page_downgrade(ppa[i]);
6053*0Sstevel@tonic-gate 			}
6054*0Sstevel@tonic-gate 			lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
6055*0Sstevel@tonic-gate 			    page_cnt);
6056*0Sstevel@tonic-gate 			goto next;
6057*0Sstevel@tonic-gate 		}
6058*0Sstevel@tonic-gate 		ASSERT(newpp->p_szc == pszc);
6059*0Sstevel@tonic-gate 		/*
6060*0Sstevel@tonic-gate 		 * Clear migrate bit and relocate page
6061*0Sstevel@tonic-gate 		 */
6062*0Sstevel@tonic-gate 		PP_CLRMIGRATE(pp);
6063*0Sstevel@tonic-gate 		if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
6064*0Sstevel@tonic-gate 			panic("page_migrate: page_relocate failed");
6065*0Sstevel@tonic-gate 		}
6066*0Sstevel@tonic-gate 		ASSERT(page_cnt * PAGESIZE == pgsz);
6067*0Sstevel@tonic-gate 
6068*0Sstevel@tonic-gate 		/*
6069*0Sstevel@tonic-gate 		 * Keep stats for number of pages migrated from and to
6070*0Sstevel@tonic-gate 		 * each lgroup
6071*0Sstevel@tonic-gate 		 */
6072*0Sstevel@tonic-gate 		lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
6073*0Sstevel@tonic-gate 		lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
6074*0Sstevel@tonic-gate 		/*
6075*0Sstevel@tonic-gate 		 * update the page_t array we were passed in and
6076*0Sstevel@tonic-gate 		 * unlink constituent pages of a large page.
6077*0Sstevel@tonic-gate 		 */
6078*0Sstevel@tonic-gate 		for (i = 0; i < page_cnt; ++i, ++pp) {
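			/*
			 * newpp heads the list of replacement pages: each
			 * pass records the current head in ppa[], unlinks it
			 * with page_sub(), and downgrades its exclusive lock
			 * back to shared for the caller.
			 */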
6079*0Sstevel@tonic-gate 			ASSERT(PAGE_EXCL(newpp));
6080*0Sstevel@tonic-gate 			ASSERT(newpp->p_szc == pszc);
6081*0Sstevel@tonic-gate 			ppa[i] = newpp;
6082*0Sstevel@tonic-gate 			pp = newpp;
6083*0Sstevel@tonic-gate 			page_sub(&newpp, pp);
6084*0Sstevel@tonic-gate 			page_downgrade(pp);
6085*0Sstevel@tonic-gate 		}
6086*0Sstevel@tonic-gate 		ASSERT(newpp == NULL);
6087*0Sstevel@tonic-gate next:
6088*0Sstevel@tonic-gate 		addr += pgsz;
6089*0Sstevel@tonic-gate 		ppa += page_cnt;
6090*0Sstevel@tonic-gate 		npages -= page_cnt;
6091*0Sstevel@tonic-gate 	}
6092*0Sstevel@tonic-gate }
6093*0Sstevel@tonic-gate 
6094*0Sstevel@tonic-gate /*
6095*0Sstevel@tonic-gate  * initialize the vnode for retired pages
6096*0Sstevel@tonic-gate  */
6097*0Sstevel@tonic-gate static void
6098*0Sstevel@tonic-gate page_retired_init(void)
6099*0Sstevel@tonic-gate {
6100*0Sstevel@tonic-gate 	vn_setops(&retired_ppages, &retired_vnodeops);
6101*0Sstevel@tonic-gate }
6102*0Sstevel@tonic-gate 
6103*0Sstevel@tonic-gate /* ARGSUSED */
6104*0Sstevel@tonic-gate static void
6105*0Sstevel@tonic-gate retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr)
6106*0Sstevel@tonic-gate {
6107*0Sstevel@tonic-gate 	panic("retired_dispose invoked");
6108*0Sstevel@tonic-gate }
6109*0Sstevel@tonic-gate 
6110*0Sstevel@tonic-gate /* ARGSUSED */
6111*0Sstevel@tonic-gate static void
6112*0Sstevel@tonic-gate retired_inactive(vnode_t *vp, cred_t *cr)
6113*0Sstevel@tonic-gate {}
6114*0Sstevel@tonic-gate 
6115*0Sstevel@tonic-gate void
6116*0Sstevel@tonic-gate page_unretire_pages(void)
6117*0Sstevel@tonic-gate {
6118*0Sstevel@tonic-gate 	page_t		*pp;
6119*0Sstevel@tonic-gate 	kmutex_t	*vphm;
6120*0Sstevel@tonic-gate 	vnode_t		*vp;
6121*0Sstevel@tonic-gate 	page_t		*rpages[UNRETIRE_PAGES];
6122*0Sstevel@tonic-gate 	pgcnt_t		i, npages, rmem;
6123*0Sstevel@tonic-gate 	uint64_t	pa;
6124*0Sstevel@tonic-gate 
6125*0Sstevel@tonic-gate 	rmem = 0;
6126*0Sstevel@tonic-gate 
6127*0Sstevel@tonic-gate 	for (;;) {
6128*0Sstevel@tonic-gate 		/*
6129*0Sstevel@tonic-gate 		 * We do this in two steps:
6130*0Sstevel@tonic-gate 		 *
6131*0Sstevel@tonic-gate 		 * 1. We walk the retired pages list and collect a list of
6132*0Sstevel@tonic-gate 		 *    pages that have the toxic field cleared.
6133*0Sstevel@tonic-gate 		 *
6134*0Sstevel@tonic-gate 		 * 2. We iterate through the page list and unretire each one.
6135*0Sstevel@tonic-gate 		 *
6136*0Sstevel@tonic-gate 		 * We have to do it in two steps on account of the mutexes that
6137*0Sstevel@tonic-gate 		 * we need to acquire.
6138*0Sstevel@tonic-gate 		 */
6139*0Sstevel@tonic-gate 
6140*0Sstevel@tonic-gate 		vp = &retired_ppages;
6141*0Sstevel@tonic-gate 		vphm = page_vnode_mutex(vp);
6142*0Sstevel@tonic-gate 		mutex_enter(vphm);
6143*0Sstevel@tonic-gate 
6144*0Sstevel@tonic-gate 		if ((pp = vp->v_pages) == NULL) {
6145*0Sstevel@tonic-gate 			mutex_exit(vphm);
6146*0Sstevel@tonic-gate 			break;
6147*0Sstevel@tonic-gate 		}
6148*0Sstevel@tonic-gate 
6149*0Sstevel@tonic-gate 		i = 0;
6150*0Sstevel@tonic-gate 		do {
6151*0Sstevel@tonic-gate 			ASSERT(pp != NULL);
6152*0Sstevel@tonic-gate 			ASSERT(pp->p_vnode == vp);
6153*0Sstevel@tonic-gate 
6154*0Sstevel@tonic-gate 			/*
6155*0Sstevel@tonic-gate 			 * DR operations change the association between a page_t
6156*0Sstevel@tonic-gate 			 * and the physical page it represents. Check if the
6157*0Sstevel@tonic-gate 			 * page is still bad. If not, unretire it.
6158*0Sstevel@tonic-gate 			 */
6159*0Sstevel@tonic-gate 			if (!page_isfaulty(pp))
6160*0Sstevel@tonic-gate 				rpages[i++] = pp;
6161*0Sstevel@tonic-gate 
6162*0Sstevel@tonic-gate 			pp = pp->p_vpnext;
6163*0Sstevel@tonic-gate 		} while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES));
6164*0Sstevel@tonic-gate 
6165*0Sstevel@tonic-gate 		mutex_exit(vphm);
6166*0Sstevel@tonic-gate 
6167*0Sstevel@tonic-gate 		npages = i;
6168*0Sstevel@tonic-gate 		for (i = 0; i < npages; i++) {
6169*0Sstevel@tonic-gate 			pp = rpages[i];
6170*0Sstevel@tonic-gate 			pa = ptob((uint64_t)page_pptonum(pp));
6171*0Sstevel@tonic-gate 
6172*0Sstevel@tonic-gate 			/*
6173*0Sstevel@tonic-gate 			 * Need to upgrade the shared lock to an exclusive
6174*0Sstevel@tonic-gate 			 * lock in order to hash out the page.
6175*0Sstevel@tonic-gate 			 *
6176*0Sstevel@tonic-gate 			 * The page could have been retired but the page lock
6177*0Sstevel@tonic-gate 			 * may not have been downgraded yet. If so, skip this
6178*0Sstevel@tonic-gate 			 * page. page_free() will call this function after the
6179*0Sstevel@tonic-gate 			 * lock is downgraded.
6180*0Sstevel@tonic-gate 			 */
6181*0Sstevel@tonic-gate 
6182*0Sstevel@tonic-gate 			if (!PAGE_SHARED(pp) || !page_tryupgrade(pp))
6183*0Sstevel@tonic-gate 				continue;
6184*0Sstevel@tonic-gate 
6185*0Sstevel@tonic-gate 			/*
6186*0Sstevel@tonic-gate 			 * Both page_free() and DR call this function. They
6187*0Sstevel@tonic-gate 			 * can potentially call this function at the same
6188*0Sstevel@tonic-gate 			 * time and race with each other.
6189*0Sstevel@tonic-gate 			 */
6190*0Sstevel@tonic-gate 			if (!page_isretired(pp) || page_isfaulty(pp)) {
6191*0Sstevel@tonic-gate 				page_downgrade(pp);
6192*0Sstevel@tonic-gate 				continue;
6193*0Sstevel@tonic-gate 			}
6194*0Sstevel@tonic-gate 
6195*0Sstevel@tonic-gate 			cmn_err(CE_NOTE,
6196*0Sstevel@tonic-gate 			    "unretiring retired page 0x%08x.%08x",
6197*0Sstevel@tonic-gate 			    (uint32_t)(pa >> 32), (uint32_t)pa);
6198*0Sstevel@tonic-gate 
6199*0Sstevel@tonic-gate 			/*
6200*0Sstevel@tonic-gate 			 * When a page is removed from the retired pages vnode,
6201*0Sstevel@tonic-gate 			 * its toxic field is also cleared. So, we do not have
6202*0Sstevel@tonic-gate 			 * to do that separately here.
6203*0Sstevel@tonic-gate 			 */
6204*0Sstevel@tonic-gate 			page_hashout(pp, (kmutex_t *)NULL);
6205*0Sstevel@tonic-gate 
6206*0Sstevel@tonic-gate 			/*
6207*0Sstevel@tonic-gate 			 * This is a good page. So, free it.
6208*0Sstevel@tonic-gate 			 */
6209*0Sstevel@tonic-gate 			pp->p_vnode = NULL;
6210*0Sstevel@tonic-gate 			page_free(pp, 1);
6211*0Sstevel@tonic-gate 			rmem++;
6212*0Sstevel@tonic-gate 		}
6213*0Sstevel@tonic-gate 
6214*0Sstevel@tonic-gate 		/*
6215*0Sstevel@tonic-gate 		 * If the rpages array was filled up, then there could be more
6216*0Sstevel@tonic-gate 		 * retired pages that are not faulty. We need to iterate
6217*0Sstevel@tonic-gate 		 * again and unretire them. Otherwise, we are done.
6218*0Sstevel@tonic-gate 		 */
6219*0Sstevel@tonic-gate 		if (npages < UNRETIRE_PAGES)
6220*0Sstevel@tonic-gate 			break;
6221*0Sstevel@tonic-gate 	}
6222*0Sstevel@tonic-gate 
6223*0Sstevel@tonic-gate 	mutex_enter(&freemem_lock);
6224*0Sstevel@tonic-gate 	availrmem += rmem;
6225*0Sstevel@tonic-gate 	mutex_exit(&freemem_lock);
6226*0Sstevel@tonic-gate }
6227*0Sstevel@tonic-gate 
6228*0Sstevel@tonic-gate ulong_t mem_waiters 	= 0;
6229*0Sstevel@tonic-gate ulong_t	max_count 	= 20;
6230*0Sstevel@tonic-gate #define	MAX_DELAY	0x1ff
6231*0Sstevel@tonic-gate 
6232*0Sstevel@tonic-gate /*
6233*0Sstevel@tonic-gate  * Check if enough memory is available to proceed.
6234*0Sstevel@tonic-gate  * Depending on the system configuration and how much memory is
6235*0Sstevel@tonic-gate  * reserved for swap, we need to check against two variables,
6236*0Sstevel@tonic-gate  * e.g. on systems with little physical swap, availrmem can be
6237*0Sstevel@tonic-gate  * a more reliable indicator of how much memory is available.
6238*0Sstevel@tonic-gate  * On systems with large physical swap, freemem can be a better one.
6239*0Sstevel@tonic-gate  * If freemem drops below the threshold level, don't return an error
6240*0Sstevel@tonic-gate  * immediately but wake up pageout to free memory and block.
6241*0Sstevel@tonic-gate  * This is done a number of times. If pageout is not able to free
6242*0Sstevel@tonic-gate  * memory within a certain time, return an error.
6243*0Sstevel@tonic-gate  * The same applies for availrmem, but kmem_reap() is used to
6244*0Sstevel@tonic-gate  * free memory.
6245*0Sstevel@tonic-gate  */
6246*0Sstevel@tonic-gate int
6247*0Sstevel@tonic-gate page_mem_avail(pgcnt_t npages)
6248*0Sstevel@tonic-gate {
6249*0Sstevel@tonic-gate 	ulong_t count;
6250*0Sstevel@tonic-gate 
6251*0Sstevel@tonic-gate #if defined(__i386)
6252*0Sstevel@tonic-gate 	if (freemem > desfree + npages &&
6253*0Sstevel@tonic-gate 	    availrmem > swapfs_reserve + npages &&
6254*0Sstevel@tonic-gate 	    btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem +
6255*0Sstevel@tonic-gate 	    npages)
6256*0Sstevel@tonic-gate 		return (1);
6257*0Sstevel@tonic-gate #else
6258*0Sstevel@tonic-gate 	if (freemem > desfree + npages &&
6259*0Sstevel@tonic-gate 	    availrmem > swapfs_reserve + npages)
6260*0Sstevel@tonic-gate 		return (1);
6261*0Sstevel@tonic-gate #endif
6262*0Sstevel@tonic-gate 
6263*0Sstevel@tonic-gate 	count = max_count;
6264*0Sstevel@tonic-gate 	atomic_add_long(&mem_waiters, 1);
6265*0Sstevel@tonic-gate 
6266*0Sstevel@tonic-gate 	while (freemem < desfree + npages && --count) {
6267*0Sstevel@tonic-gate 		cv_signal(&proc_pageout->p_cv);
6268*0Sstevel@tonic-gate 		if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
6269*0Sstevel@tonic-gate 			atomic_add_long(&mem_waiters, -1);
6270*0Sstevel@tonic-gate 			return (0);
6271*0Sstevel@tonic-gate 		}
6272*0Sstevel@tonic-gate 	}
6273*0Sstevel@tonic-gate 	if (count == 0) {
6274*0Sstevel@tonic-gate 		atomic_add_long(&mem_waiters, -1);
6275*0Sstevel@tonic-gate 		return (0);
6276*0Sstevel@tonic-gate 	}
6277*0Sstevel@tonic-gate 
6278*0Sstevel@tonic-gate 	count = max_count;
6279*0Sstevel@tonic-gate 	while (availrmem < swapfs_reserve + npages && --count) {
6280*0Sstevel@tonic-gate 		kmem_reap();
6281*0Sstevel@tonic-gate 		if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
6282*0Sstevel@tonic-gate 			atomic_add_long(&mem_waiters, -1);
6283*0Sstevel@tonic-gate 			return (0);
6284*0Sstevel@tonic-gate 		}
6285*0Sstevel@tonic-gate 	}
6286*0Sstevel@tonic-gate 	atomic_add_long(&mem_waiters, -1);
6287*0Sstevel@tonic-gate 	if (count == 0)
6288*0Sstevel@tonic-gate 		return (0);
6289*0Sstevel@tonic-gate 
6290*0Sstevel@tonic-gate #if defined(__i386)
6291*0Sstevel@tonic-gate 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
6292*0Sstevel@tonic-gate 	    tune.t_minarmem + npages)
6293*0Sstevel@tonic-gate 		return (0);
6294*0Sstevel@tonic-gate #endif
6295*0Sstevel@tonic-gate 	return (1);
6296*0Sstevel@tonic-gate }
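
/*
 * Illustrative caller sketch (editorial, not from the original
 * source): a hypothetical allocation path would check availability
 * first and fail softly when memory cannot be freed up in time:
 *
 *	if (!page_mem_avail(npages))
 *		return (ENOMEM);
 *	... proceed to reserve/create the npages pages ...
 */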
6297*0Sstevel@tonic-gate 
6298*0Sstevel@tonic-gate 
6299*0Sstevel@tonic-gate /*
6300*0Sstevel@tonic-gate  * Search the memory segments to locate the desired page.  Within a
6301*0Sstevel@tonic-gate  * segment, pages increase linearly with one page structure per
6302*0Sstevel@tonic-gate  * physical page frame (size PAGESIZE).  The search begins
6303*0Sstevel@tonic-gate  * with the segment that was accessed last, to take advantage of locality.
6304*0Sstevel@tonic-gate  * If the hint misses, we start from the beginning of the sorted memseg list.
6305*0Sstevel@tonic-gate  */
6306*0Sstevel@tonic-gate 
6307*0Sstevel@tonic-gate 
6308*0Sstevel@tonic-gate /*
6309*0Sstevel@tonic-gate  * Some data structures for pfn to pp lookup.
6310*0Sstevel@tonic-gate  */
6311*0Sstevel@tonic-gate ulong_t mhash_per_slot;
6312*0Sstevel@tonic-gate struct memseg *memseg_hash[N_MEM_SLOTS];
6313*0Sstevel@tonic-gate 
6314*0Sstevel@tonic-gate page_t *
6315*0Sstevel@tonic-gate page_numtopp_nolock(pfn_t pfnum)
6316*0Sstevel@tonic-gate {
6317*0Sstevel@tonic-gate 	static struct memseg *last_memseg_by_pfnum = NULL;
6318*0Sstevel@tonic-gate 	struct memseg *seg;
6319*0Sstevel@tonic-gate 	page_t *pp;
6320*0Sstevel@tonic-gate 
6321*0Sstevel@tonic-gate 	/*
6322*0Sstevel@tonic-gate 	 *	XXX - Since page_numtopp_nolock is called in many places where
6323*0Sstevel@tonic-gate 	 *	the search fails more than it succeeds, it may be worthwhile
6324*0Sstevel@tonic-gate 	 *	to put a check for pf_is_memory or a pfnum <= max_pfn (set at
6325*0Sstevel@tonic-gate 	 *	boot time).
6326*0Sstevel@tonic-gate 	 *
6327*0Sstevel@tonic-gate 	 *	if (!pf_is_memory(pfnum) || (pfnum > max_pfn))
6328*0Sstevel@tonic-gate 	 *		return (NULL);
6329*0Sstevel@tonic-gate 	 */
6330*0Sstevel@tonic-gate 
6331*0Sstevel@tonic-gate 	MEMSEG_STAT_INCR(nsearch);
6332*0Sstevel@tonic-gate 
6333*0Sstevel@tonic-gate 	/* Try last winner first */
6334*0Sstevel@tonic-gate 	if (((seg = last_memseg_by_pfnum) != NULL) &&
6335*0Sstevel@tonic-gate 	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
6336*0Sstevel@tonic-gate 		MEMSEG_STAT_INCR(nlastwon);
6337*0Sstevel@tonic-gate 		pp = seg->pages + (pfnum - seg->pages_base);
6338*0Sstevel@tonic-gate 		if (pp->p_pagenum == pfnum)
6339*0Sstevel@tonic-gate 			return ((page_t *)pp);
6340*0Sstevel@tonic-gate 	}
6341*0Sstevel@tonic-gate 
6342*0Sstevel@tonic-gate 	/* Else Try hash */
6343*0Sstevel@tonic-gate 	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
6344*0Sstevel@tonic-gate 	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
6345*0Sstevel@tonic-gate 		MEMSEG_STAT_INCR(nhashwon);
6346*0Sstevel@tonic-gate 		last_memseg_by_pfnum = seg;
6347*0Sstevel@tonic-gate 		pp = seg->pages + (pfnum - seg->pages_base);
6348*0Sstevel@tonic-gate 		if (pp->p_pagenum == pfnum)
6349*0Sstevel@tonic-gate 			return ((page_t *)pp);
6350*0Sstevel@tonic-gate 	}
6351*0Sstevel@tonic-gate 
6352*0Sstevel@tonic-gate 	/* Else Brute force */
6353*0Sstevel@tonic-gate 	for (seg = memsegs; seg != NULL; seg = seg->next) {
6354*0Sstevel@tonic-gate 		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
6355*0Sstevel@tonic-gate 			last_memseg_by_pfnum = seg;
6356*0Sstevel@tonic-gate 			pp = seg->pages + (pfnum - seg->pages_base);
6357*0Sstevel@tonic-gate 			return ((page_t *)pp);
6358*0Sstevel@tonic-gate 		}
6359*0Sstevel@tonic-gate 	}
6360*0Sstevel@tonic-gate 	last_memseg_by_pfnum = NULL;
6361*0Sstevel@tonic-gate 	MEMSEG_STAT_INCR(nnotfound);
6362*0Sstevel@tonic-gate 	return ((page_t *)NULL);
6363*0Sstevel@tonic-gate 
6364*0Sstevel@tonic-gate }
6365*0Sstevel@tonic-gate 
6366*0Sstevel@tonic-gate struct memseg *
6367*0Sstevel@tonic-gate page_numtomemseg_nolock(pfn_t pfnum)
6368*0Sstevel@tonic-gate {
6369*0Sstevel@tonic-gate 	struct memseg *seg;
6370*0Sstevel@tonic-gate 	page_t *pp;
6371*0Sstevel@tonic-gate 
6372*0Sstevel@tonic-gate 	/* Try hash */
6373*0Sstevel@tonic-gate 	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
6374*0Sstevel@tonic-gate 	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
6375*0Sstevel@tonic-gate 		pp = seg->pages + (pfnum - seg->pages_base);
6376*0Sstevel@tonic-gate 		if (pp->p_pagenum == pfnum)
6377*0Sstevel@tonic-gate 			return (seg);
6378*0Sstevel@tonic-gate 	}
6379*0Sstevel@tonic-gate 
6380*0Sstevel@tonic-gate 	/* Else Brute force */
6381*0Sstevel@tonic-gate 	for (seg = memsegs; seg != NULL; seg = seg->next) {
6382*0Sstevel@tonic-gate 		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
6383*0Sstevel@tonic-gate 			return (seg);
6384*0Sstevel@tonic-gate 		}
6385*0Sstevel@tonic-gate 	}
6386*0Sstevel@tonic-gate 	return ((struct memseg *)NULL);
6387*0Sstevel@tonic-gate }
6388*0Sstevel@tonic-gate 
6389*0Sstevel@tonic-gate /*
6390*0Sstevel@tonic-gate  * Given a page and a count, return the page struct that is
6391*0Sstevel@tonic-gate  * n structs away from the current one in the global page
6392*0Sstevel@tonic-gate  * list.
6393*0Sstevel@tonic-gate  *
6394*0Sstevel@tonic-gate  * This function wraps to the first page upon
6395*0Sstevel@tonic-gate  * reaching the end of the memseg list.
6396*0Sstevel@tonic-gate  */
6397*0Sstevel@tonic-gate page_t *
6398*0Sstevel@tonic-gate page_nextn(page_t *pp, ulong_t n)
6399*0Sstevel@tonic-gate {
6400*0Sstevel@tonic-gate 	static struct memseg *last_page_next_memseg = NULL;
6401*0Sstevel@tonic-gate 	struct memseg *seg;
6402*0Sstevel@tonic-gate 	page_t *ppn;
6403*0Sstevel@tonic-gate 
6404*0Sstevel@tonic-gate 	if (((seg = last_page_next_memseg) == NULL) ||
6405*0Sstevel@tonic-gate 	    (seg->pages_base == seg->pages_end) ||
6406*0Sstevel@tonic-gate 	    !(pp >= seg->pages && pp < seg->epages)) {
6407*0Sstevel@tonic-gate 
6408*0Sstevel@tonic-gate 		for (seg = memsegs; seg; seg = seg->next) {
6409*0Sstevel@tonic-gate 			if (pp >= seg->pages && pp < seg->epages)
6410*0Sstevel@tonic-gate 				break;
6411*0Sstevel@tonic-gate 		}
6412*0Sstevel@tonic-gate 
6413*0Sstevel@tonic-gate 		if (seg == NULL) {
6414*0Sstevel@tonic-gate 			/* Memory delete got in, return something valid. */
6415*0Sstevel@tonic-gate 			/* TODO: fix me. */
6416*0Sstevel@tonic-gate 			seg = memsegs;
6417*0Sstevel@tonic-gate 			pp = seg->pages;
6418*0Sstevel@tonic-gate 		}
6419*0Sstevel@tonic-gate 	}
6420*0Sstevel@tonic-gate 
6421*0Sstevel@tonic-gate 	/* check for wraparound - possible if n is large */
6422*0Sstevel@tonic-gate 	while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
6423*0Sstevel@tonic-gate 		n -= seg->epages - pp;
6424*0Sstevel@tonic-gate 		seg = seg->next;
6425*0Sstevel@tonic-gate 		if (seg == NULL)
6426*0Sstevel@tonic-gate 			seg = memsegs;
6427*0Sstevel@tonic-gate 		pp = seg->pages;
6428*0Sstevel@tonic-gate 	}
6429*0Sstevel@tonic-gate 	last_page_next_memseg = seg;
6430*0Sstevel@tonic-gate 	return (ppn);
6431*0Sstevel@tonic-gate }
6432*0Sstevel@tonic-gate 
6433*0Sstevel@tonic-gate /*
6434*0Sstevel@tonic-gate  * Initialize for a loop using page_next_scan_large().
6435*0Sstevel@tonic-gate  */
6436*0Sstevel@tonic-gate page_t *
6437*0Sstevel@tonic-gate page_next_scan_init(void **cookie)
6438*0Sstevel@tonic-gate {
6439*0Sstevel@tonic-gate 	ASSERT(cookie != NULL);
6440*0Sstevel@tonic-gate 	*cookie = (void *)memsegs;
6441*0Sstevel@tonic-gate 	return ((page_t *)memsegs->pages);
6442*0Sstevel@tonic-gate }
6443*0Sstevel@tonic-gate 
6444*0Sstevel@tonic-gate /*
6445*0Sstevel@tonic-gate  * Return the next page in a scan of page_t's, assuming we want
6446*0Sstevel@tonic-gate  * to skip over sub-pages within larger page sizes.
6447*0Sstevel@tonic-gate  *
6448*0Sstevel@tonic-gate  * The cookie is used to keep track of the current memseg.
6449*0Sstevel@tonic-gate  */
6450*0Sstevel@tonic-gate page_t *
6451*0Sstevel@tonic-gate page_next_scan_large(
6452*0Sstevel@tonic-gate 	page_t		*pp,
6453*0Sstevel@tonic-gate 	ulong_t		*n,
6454*0Sstevel@tonic-gate 	void		**cookie)
6455*0Sstevel@tonic-gate {
6456*0Sstevel@tonic-gate 	struct memseg	*seg = (struct memseg *)*cookie;
6457*0Sstevel@tonic-gate 	page_t		*new_pp;
6458*0Sstevel@tonic-gate 	ulong_t		cnt;
6459*0Sstevel@tonic-gate 	pfn_t		pfn;
6460*0Sstevel@tonic-gate 
6461*0Sstevel@tonic-gate 
6462*0Sstevel@tonic-gate 	/*
6463*0Sstevel@tonic-gate 	 * get the count of page_t's to skip based on the page size
6464*0Sstevel@tonic-gate 	 */
6465*0Sstevel@tonic-gate 	ASSERT(pp != NULL);
6466*0Sstevel@tonic-gate 	if (pp->p_szc == 0) {
6467*0Sstevel@tonic-gate 		cnt = 1;
6468*0Sstevel@tonic-gate 	} else {
6469*0Sstevel@tonic-gate 		pfn = page_pptonum(pp);
6470*0Sstevel@tonic-gate 		cnt = page_get_pagecnt(pp->p_szc);
6471*0Sstevel@tonic-gate 		cnt -= pfn & (cnt - 1);
6472*0Sstevel@tonic-gate 	}
6473*0Sstevel@tonic-gate 	*n += cnt;
6474*0Sstevel@tonic-gate 	new_pp = pp + cnt;
6475*0Sstevel@tonic-gate 
6476*0Sstevel@tonic-gate 	/*
6477*0Sstevel@tonic-gate 	 * Catch if we went past the end of the current memory segment. If so,
6478*0Sstevel@tonic-gate 	 * just move to the next segment with pages.
6479*0Sstevel@tonic-gate 	 */
6480*0Sstevel@tonic-gate 	if (new_pp >= seg->epages) {
6481*0Sstevel@tonic-gate 		do {
6482*0Sstevel@tonic-gate 			seg = seg->next;
6483*0Sstevel@tonic-gate 			if (seg == NULL)
6484*0Sstevel@tonic-gate 				seg = memsegs;
6485*0Sstevel@tonic-gate 		} while (seg->pages == seg->epages);
6486*0Sstevel@tonic-gate 		new_pp = seg->pages;
6487*0Sstevel@tonic-gate 		*cookie = (void *)seg;
6488*0Sstevel@tonic-gate 	}
6489*0Sstevel@tonic-gate 
6490*0Sstevel@tonic-gate 	return (new_pp);
6491*0Sstevel@tonic-gate }
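
/*
 * Illustrative scan loop (editorial sketch): the cookie tracks the
 * current memseg and *n accumulates how many page_t's have been
 * skipped, so a caller can bound the walk with the global page count
 * (total_pages is assumed here as the bound).  Like page_next(), the
 * scan wraps to the first memseg:
 *
 *	void *cookie;
 *	ulong_t n = 0;
 *	page_t *pp = page_next_scan_init(&cookie);
 *
 *	while (n < total_pages) {
 *		... examine pp (a base page or a large-page root) ...
 *		pp = page_next_scan_large(pp, &n, &cookie);
 *	}
 */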
6492*0Sstevel@tonic-gate 
6493*0Sstevel@tonic-gate 
6494*0Sstevel@tonic-gate /*
6495*0Sstevel@tonic-gate  * Returns next page in list. Note: this function wraps
6496*0Sstevel@tonic-gate  * to the first page in the list upon reaching the end
6497*0Sstevel@tonic-gate  * of the list. Callers should be aware of this fact.
6498*0Sstevel@tonic-gate  */
6499*0Sstevel@tonic-gate 
6500*0Sstevel@tonic-gate /* We should change this to be a #define */
6501*0Sstevel@tonic-gate 
6502*0Sstevel@tonic-gate page_t *
6503*0Sstevel@tonic-gate page_next(page_t *pp)
6504*0Sstevel@tonic-gate {
6505*0Sstevel@tonic-gate 	return (page_nextn(pp, 1));
6506*0Sstevel@tonic-gate }
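
/*
 * Illustrative full walk (editorial sketch): since page_next() wraps
 * to the first page, terminate by counting pages rather than by
 * pointer comparison alone (total_pages is assumed here as the global
 * page count):
 *
 *	page_t *pp = page_first();
 *	pgcnt_t i;
 *
 *	for (i = 0; i < total_pages; i++, pp = page_next(pp))
 *		... examine pp ...
 */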
6507*0Sstevel@tonic-gate 
6508*0Sstevel@tonic-gate /*
6509*0Sstevel@tonic-gate  * Special for routines processing an array of page_t.
6510*0Sstevel@tonic-gate  */
6511*0Sstevel@tonic-gate page_t *
6512*0Sstevel@tonic-gate page_nextn_raw(page_t *pp, ulong_t n)
6513*0Sstevel@tonic-gate {
6514*0Sstevel@tonic-gate 	return (pp + n);
6515*0Sstevel@tonic-gate }
6516*0Sstevel@tonic-gate 
6517*0Sstevel@tonic-gate page_t *
6518*0Sstevel@tonic-gate page_first()
6519*0Sstevel@tonic-gate {
6520*0Sstevel@tonic-gate 	return ((page_t *)memsegs->pages);
6521*0Sstevel@tonic-gate }
6522*0Sstevel@tonic-gate 
6523*0Sstevel@tonic-gate 
6524*0Sstevel@tonic-gate /*
6525*0Sstevel@tonic-gate  * This routine is called at boot with the initial memory configuration
6526*0Sstevel@tonic-gate  * and when memory is added or removed.
6527*0Sstevel@tonic-gate  */
6528*0Sstevel@tonic-gate void
6529*0Sstevel@tonic-gate build_pfn_hash()
6530*0Sstevel@tonic-gate {
6531*0Sstevel@tonic-gate 	pfn_t cur;
6532*0Sstevel@tonic-gate 	pgcnt_t index;
6533*0Sstevel@tonic-gate 	struct memseg *pseg;
6534*0Sstevel@tonic-gate 	int	i;
6535*0Sstevel@tonic-gate 
6536*0Sstevel@tonic-gate 	/*
6537*0Sstevel@tonic-gate 	 * Clear memseg_hash array.
6538*0Sstevel@tonic-gate 	 * Since memory add/delete is designed to operate concurrently
6539*0Sstevel@tonic-gate 	 * with normal operation, the hash rebuild must be able to run
6540*0Sstevel@tonic-gate 	 * concurrently with page_numtopp_nolock(). To support this
6541*0Sstevel@tonic-gate 	 * functionality, assignments to memseg_hash array members must
6542*0Sstevel@tonic-gate 	 * be done atomically.
6543*0Sstevel@tonic-gate 	 *
6544*0Sstevel@tonic-gate 	 * NOTE: bzero() does not currently guarantee this for kernel
6545*0Sstevel@tonic-gate 	 * threads, and cannot be used here.
6546*0Sstevel@tonic-gate 	 */
6547*0Sstevel@tonic-gate 	for (i = 0; i < N_MEM_SLOTS; i++)
6548*0Sstevel@tonic-gate 		memseg_hash[i] = NULL;
6549*0Sstevel@tonic-gate 
6550*0Sstevel@tonic-gate 	hat_kpm_mseghash_clear(N_MEM_SLOTS);
6551*0Sstevel@tonic-gate 
6552*0Sstevel@tonic-gate 	/*
6553*0Sstevel@tonic-gate 	 * Physmax is the last valid pfn.
6554*0Sstevel@tonic-gate 	 */
6555*0Sstevel@tonic-gate 	mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6556*0Sstevel@tonic-gate 	for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6557*0Sstevel@tonic-gate 		index = MEMSEG_PFN_HASH(pseg->pages_base);
6558*0Sstevel@tonic-gate 		cur = pseg->pages_base;
6559*0Sstevel@tonic-gate 		do {
6560*0Sstevel@tonic-gate 			if (index >= N_MEM_SLOTS)
6561*0Sstevel@tonic-gate 				index = MEMSEG_PFN_HASH(cur);
6562*0Sstevel@tonic-gate 
6563*0Sstevel@tonic-gate 			if (memseg_hash[index] == NULL ||
6564*0Sstevel@tonic-gate 			    memseg_hash[index]->pages_base > pseg->pages_base) {
6565*0Sstevel@tonic-gate 				memseg_hash[index] = pseg;
6566*0Sstevel@tonic-gate 				hat_kpm_mseghash_update(index, pseg);
6567*0Sstevel@tonic-gate 			}
6568*0Sstevel@tonic-gate 			cur += mhash_per_slot;
6569*0Sstevel@tonic-gate 			index++;
6570*0Sstevel@tonic-gate 		} while (cur < pseg->pages_end);
6571*0Sstevel@tonic-gate 	}
6572*0Sstevel@tonic-gate }
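
/*
 * Worked example (editorial; assumes MEMSEG_PFN_HASH() maps a pfn to
 * pfn / mhash_per_slot modulo the slot count, as the rebuild loop
 * above implies): suppose physmax + 1 is 2^23 pages, N_MEM_SLOTS is
 * 512 and MEM_HASH_SHIFT is 9.  Then mhash_per_slot is 2^14 = 0x4000,
 * and pfn 0x123456 hashes to slot 0x123456 / 0x4000 = 0x48.  Each
 * slot caches the memseg with the lowest pages_base overlapping the
 * slot's pfn range; page_numtopp_nolock() falls back to walking the
 * full memsegs list when the cached memseg does not contain the pfn.
 */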
6573*0Sstevel@tonic-gate 
6574*0Sstevel@tonic-gate /*
6575*0Sstevel@tonic-gate  * Return the pagenum for the pp
6576*0Sstevel@tonic-gate  */
6577*0Sstevel@tonic-gate pfn_t
6578*0Sstevel@tonic-gate page_pptonum(page_t *pp)
6579*0Sstevel@tonic-gate {
6580*0Sstevel@tonic-gate 	return (pp->p_pagenum);
6581*0Sstevel@tonic-gate }
6582*0Sstevel@tonic-gate 
6583*0Sstevel@tonic-gate /*
6584*0Sstevel@tonic-gate  * interface to the referenced, modified, etc. bits
6585*0Sstevel@tonic-gate  * in the PSM part of the page struct
6586*0Sstevel@tonic-gate  * when no locking is desired.
6587*0Sstevel@tonic-gate  */
6588*0Sstevel@tonic-gate void
6589*0Sstevel@tonic-gate page_set_props(page_t *pp, uint_t flags)
6590*0Sstevel@tonic-gate {
6591*0Sstevel@tonic-gate 	ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6592*0Sstevel@tonic-gate 	pp->p_nrm |= (uchar_t)flags;
6593*0Sstevel@tonic-gate }
6594*0Sstevel@tonic-gate 
6595*0Sstevel@tonic-gate void
6596*0Sstevel@tonic-gate page_clr_all_props(page_t *pp)
6597*0Sstevel@tonic-gate {
6598*0Sstevel@tonic-gate 	pp->p_nrm = 0;
6599*0Sstevel@tonic-gate }
6600*0Sstevel@tonic-gate 
6601*0Sstevel@tonic-gate /*
6602*0Sstevel@tonic-gate  * The following function is called from free_vp_pages()
6603*0Sstevel@tonic-gate  * for an inexact estimate of a newly freed page...
6604*0Sstevel@tonic-gate  */
6605*0Sstevel@tonic-gate ulong_t
6606*0Sstevel@tonic-gate page_share_cnt(page_t *pp)
6607*0Sstevel@tonic-gate {
6608*0Sstevel@tonic-gate 	return (hat_page_getshare(pp));
6609*0Sstevel@tonic-gate }
6610*0Sstevel@tonic-gate 
6611*0Sstevel@tonic-gate /*
6612*0Sstevel@tonic-gate  * The following functions are used in handling memory
6613*0Sstevel@tonic-gate  * errors.
6614*0Sstevel@tonic-gate  */
6615*0Sstevel@tonic-gate 
6616*0Sstevel@tonic-gate int
6617*0Sstevel@tonic-gate page_istoxic(page_t *pp)
6618*0Sstevel@tonic-gate {
6619*0Sstevel@tonic-gate 	return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC);
6620*0Sstevel@tonic-gate }
6621*0Sstevel@tonic-gate 
6622*0Sstevel@tonic-gate int
6623*0Sstevel@tonic-gate page_isfailing(page_t *pp)
6624*0Sstevel@tonic-gate {
6625*0Sstevel@tonic-gate 	return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING);
6626*0Sstevel@tonic-gate }
6627*0Sstevel@tonic-gate 
6628*0Sstevel@tonic-gate int
6629*0Sstevel@tonic-gate page_isretired(page_t *pp)
6630*0Sstevel@tonic-gate {
6631*0Sstevel@tonic-gate 	return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED);
6632*0Sstevel@tonic-gate }
6633*0Sstevel@tonic-gate 
6634*0Sstevel@tonic-gate int
6635*0Sstevel@tonic-gate page_deteriorating(page_t *pp)
6636*0Sstevel@tonic-gate {
6637*0Sstevel@tonic-gate 	return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0);
6638*0Sstevel@tonic-gate }
6639*0Sstevel@tonic-gate 
6640*0Sstevel@tonic-gate void
6641*0Sstevel@tonic-gate page_settoxic(page_t *pp, uchar_t flag)
6642*0Sstevel@tonic-gate {
6643*0Sstevel@tonic-gate 	uchar_t new_flag = 0;
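	/*
	 * Atomically OR flag into p_toxic: compute the desired value from
	 * a snapshot of p_toxic, attempt the cas8() (which fails if another
	 * thread changed p_toxic in between), then re-read and retry until
	 * all of the requested bits are observed to be set.
	 */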
6644*0Sstevel@tonic-gate 	while ((new_flag & flag) != flag) {
6645*0Sstevel@tonic-gate 		uchar_t old_flag = pp->p_toxic;
6646*0Sstevel@tonic-gate 		new_flag = old_flag | flag;
6647*0Sstevel@tonic-gate 		(void) cas8(&pp->p_toxic, old_flag, new_flag);
6648*0Sstevel@tonic-gate 		new_flag = ((volatile page_t *)pp)->p_toxic;
6649*0Sstevel@tonic-gate 	}
6650*0Sstevel@tonic-gate }
6651*0Sstevel@tonic-gate 
6652*0Sstevel@tonic-gate void
6653*0Sstevel@tonic-gate page_clrtoxic(page_t *pp)
6654*0Sstevel@tonic-gate {
6655*0Sstevel@tonic-gate 	/*
6656*0Sstevel@tonic-gate 	 * We don't need to worry about atomicity on the
6657*0Sstevel@tonic-gate 	 * p_toxic flag here as this is only called from
6658*0Sstevel@tonic-gate 	 * page_free() while holding an exclusive lock on
6659*0Sstevel@tonic-gate 	 * the page
6660*0Sstevel@tonic-gate 	 */
6661*0Sstevel@tonic-gate 	pp->p_toxic = PAGE_IS_OK;
6662*0Sstevel@tonic-gate }
6663*0Sstevel@tonic-gate 
6664*0Sstevel@tonic-gate void
6665*0Sstevel@tonic-gate page_clrtoxic_flag(page_t *pp, uchar_t flag)
6666*0Sstevel@tonic-gate {
6667*0Sstevel@tonic-gate 	uchar_t new_flag = ((volatile page_t *)pp)->p_toxic;
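	/*
	 * Atomically clear flag from p_toxic, retrying the cas8() until
	 * the requested bits are observed to be clear.
	 */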
6668*0Sstevel@tonic-gate 	while ((new_flag & flag) == flag) {
6669*0Sstevel@tonic-gate 		uchar_t old_flag = new_flag;
6670*0Sstevel@tonic-gate 		new_flag = old_flag & ~flag;
6671*0Sstevel@tonic-gate 		(void) cas8(&pp->p_toxic, old_flag, new_flag);
6672*0Sstevel@tonic-gate 		new_flag = ((volatile page_t *)pp)->p_toxic;
6673*0Sstevel@tonic-gate 	}
6674*0Sstevel@tonic-gate }
6675*0Sstevel@tonic-gate 
6676*0Sstevel@tonic-gate int
6677*0Sstevel@tonic-gate page_isfaulty(page_t *pp)
6678*0Sstevel@tonic-gate {
6679*0Sstevel@tonic-gate 	return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY);
6680*0Sstevel@tonic-gate }
6681*0Sstevel@tonic-gate 
6682*0Sstevel@tonic-gate /*
6683*0Sstevel@tonic-gate  * The following four functions are called from /proc code
6684*0Sstevel@tonic-gate  * for the /proc/<pid>/xmap interface.
6685*0Sstevel@tonic-gate  */
6686*0Sstevel@tonic-gate int
6687*0Sstevel@tonic-gate page_isshared(page_t *pp)
6688*0Sstevel@tonic-gate {
6689*0Sstevel@tonic-gate 	return (hat_page_getshare(pp) > 1);
6690*0Sstevel@tonic-gate }
6691*0Sstevel@tonic-gate 
6692*0Sstevel@tonic-gate int
6693*0Sstevel@tonic-gate page_isfree(page_t *pp)
6694*0Sstevel@tonic-gate {
6695*0Sstevel@tonic-gate 	return (PP_ISFREE(pp));
6696*0Sstevel@tonic-gate }
6697*0Sstevel@tonic-gate 
6698*0Sstevel@tonic-gate int
6699*0Sstevel@tonic-gate page_isref(page_t *pp)
6700*0Sstevel@tonic-gate {
6701*0Sstevel@tonic-gate 	return (hat_page_getattr(pp, P_REF));
6702*0Sstevel@tonic-gate }
6703*0Sstevel@tonic-gate 
6704*0Sstevel@tonic-gate int
6705*0Sstevel@tonic-gate page_ismod(page_t *pp)
6706*0Sstevel@tonic-gate {
6707*0Sstevel@tonic-gate 	return (hat_page_getattr(pp, P_MOD));
6708*0Sstevel@tonic-gate }