/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;
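
/*
 * Editor's illustrative sketch (not part of the original source): the
 * access pattern implied by the comment above.  Any code that reads or
 * writes p_lckcnt or p_cowcnt is expected to do so under page_llock.
 * Guarded out of the build; the helper name is hypothetical.
 */
#ifdef	PAGE_LOCK_EXAMPLES
static void
page_lckcnt_incr_sketch(page_t *pp)
{
	mutex_enter(&page_llock);	/* serializes p_lckcnt/p_cowcnt */
	pp->p_lckcnt++;
	mutex_exit(&page_llock);
}
#endif	/* PAGE_LOCK_EXAMPLES */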

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t  page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PSE_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
 *	PSE_SHIFT, PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */

#define	PSE_TABLE_SIZE	128		/* number of mutexes to have */

#define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
#define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_SE_MUTEX(pp) \
	    &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
		(PSE_TABLE_SIZE - 1))].pad_mutex

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode	kvp;

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)
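
/*
 * Editor's illustrative sketch (not part of the original source):
 * decode the p_selock encoding defined above.  Negative values are
 * writer-held (SE_DELETED is one particular negative value), positive
 * multiples of SE_READER count readers, zero is unlocked, and
 * SE_EWANTED may be set alongside any of these.  Guarded out of the
 * build; the helper name is hypothetical.
 */
#ifdef	PAGE_LOCK_EXAMPLES
static const char *
page_selock_state_sketch(selock_t s)
{
	if ((s & ~SE_EWANTED) == SE_DELETED)
		return ("deleted");
	if (s < 0)
		return ("exclusive (writer)");
	if ((s & ~SE_EWANTED) >= SE_READER)
		return ("shared (readers)");
	return ("unlocked");
}
#endif	/* PAGE_LOCK_EXAMPLES */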

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */


/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.
 * This bit, along with the se and es parameters, is used to decide
 * whether the requested lock should be granted:
 *
 * Lock wanted SE_EXCL_WANTED p_selock/SE_EWANTED  Action
 * ----------  -------------- -------------------  ---------
 * SE_EXCL        no           dont-care/1         deny lock
 * SE_EXCL     any(see note)   unlocked/any        grant lock, clear SE_EWANTED
 * SE_EXCL        yes          any lock/any        deny, set SE_EWANTED
 * SE_EXCL        no           any lock/any        deny
 * SE_SHARED   not applicable    shared/0          grant
 * SE_SHARED   not applicable  unlocked/0          grant
 * SE_SHARED   not applicable    shared/1          deny
 * SE_SHARED   not applicable  unlocked/1          deny
 * SE_SHARED   not applicable      excl/any        deny
 *
 * Note: the code grants an exclusive lock to the caller and clears
 * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 * bit's value.  This was deemed acceptable as we are not concerned about
 * exclusive-lock starvation. If this ever becomes an issue, a priority or
 * fifo mechanism should also be implemented.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/*
	 * Current uses of 'es':
	 * es == 1 page_lookup_create will attempt page relocation
	 * es == SE_EXCL_WANTED caller wants SE_EWANTED set (e.g. the delete
	 * memory thread); this prevents reader-starvation of waiting
	 * writer thread(s).
	 */


	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, so force the upgrade now.
			 * Again, if the page is not free we will fail to
			 * acquire p_selock and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es == SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			/* readers are not allowed when excl wanted */
			if (!(pp->p_selock & SE_EWANTED)) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation), we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/* readers are not allowed when excl wanted */
				if (!(old & SE_EWANTED)) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es == SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
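
/*
 * Editor's illustrative sketch (not part of the original source): a
 * starvation-resistant caller in the style described above (e.g. the
 * delete-memory thread).  give_up() is a hypothetical bail-out
 * predicate, and the backoff policy is the caller's business.
 * Guarded out of the build.
 */
#ifdef	PAGE_LOCK_EXAMPLES
static int
page_excl_wanted_caller_sketch(page_t *pp)
{
	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
		if (give_up()) {
			/*
			 * Not retrying, so we must clear any SE_EWANTED
			 * bit we may have left set on the page.
			 */
			page_lock_clr_exclwanted(pp);
			return (0);
		}
		/* back off before retrying (policy up to the caller) */
	}
	/* SE_EXCL is held; the grant cleared SE_EWANTED for us */
	return (1);
}
#endif	/* PAGE_LOCK_EXAMPLES */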

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED) {
		/* fail if a thread wants exclusive access */
		mutex_exit(pse);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);
	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", pp);
	}
	mutex_exit(pse);
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page. Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
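
/*
 * Editor's illustrative sketch (not part of the original source): the
 * common upgrade-or-relock pattern.  page_tryupgrade() only succeeds
 * for the sole reader, so on failure a caller typically drops the
 * shared lock, reacquires SE_EXCL (possibly blocking), and then
 * revalidates the page identity it cached beforehand.  Guarded out of
 * the build; the helper name is hypothetical.
 */
#ifdef	PAGE_LOCK_EXAMPLES
static int
page_upgrade_or_relock_sketch(page_t *pp, vnode_t *vp, u_offset_t off)
{
	if (page_tryupgrade(pp))
		return (1);

	page_unlock(pp);
	if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM))
		return (0);

	/* identity may have changed while the page was unlocked */
	if (pp->p_vnode != vp || pp->p_offset != off) {
		page_unlock(pp);
		return (0);
	}
	return (1);
}
#endif	/* PAGE_LOCK_EXAMPLES */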

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_signal(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 on success, 0 on failure.
 */
int
page_iolock_assert(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}
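
/*
 * Editor's illustrative sketch (not part of the original source):
 * typical i/o lock bracketing around a page i/o, with the assert
 * helper used as a debugging check.  start_page_io() is a hypothetical
 * placeholder.  Guarded out of the build.
 */
#ifdef	PAGE_LOCK_EXAMPLES
static void
page_io_bracket_sketch(page_t *pp)
{
	page_io_lock(pp);		/* blocks while PAGE_IO_INUSE */
	ASSERT(page_iolock_assert(pp));
	start_page_io(pp);		/* hypothetical i/o */
	page_io_unlock(pp);		/* wakes one waiter */
}
#endif	/* PAGE_LOCK_EXAMPLES */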

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the
 * location of the root, this routine relies on the fact that
 * hat_page_demote() changes the root last.
 *
 * If NULL is returned, pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned, pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(pp->p_vnode != &kvp);

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp,
	 * rootpp will always be the same, i.e., we have the right root
	 * regardless of rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root; return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}
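
/*
 * Editor's illustrative sketch (not part of the original source): the
 * expected calling pattern for page_szc_lock().  While the returned
 * mutex is held, hat_page_demote() cannot demote pp, so p_szc is
 * stable.  Guarded out of the build; do_work_on_large_page() is a
 * hypothetical placeholder.
 */
#ifdef	PAGE_LOCK_EXAMPLES
static void
page_szc_lock_usage_sketch(page_t *pp)
{
	kmutex_t *mtx = page_szc_lock(pp);	/* NULL iff p_szc == 0 */

	do_work_on_large_page(pp);		/* p_szc can't decrease here */

	if (mtx != NULL)
		mutex_exit(mtx);
}
#endif	/* PAGE_LOCK_EXAMPLES */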

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}