10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52759Selowe  * Common Development and Distribution License (the "License").
62759Selowe  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
223446Smrj  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate  * VM - page locking primitives
300Sstevel@tonic-gate  */
310Sstevel@tonic-gate #include <sys/param.h>
320Sstevel@tonic-gate #include <sys/t_lock.h>
330Sstevel@tonic-gate #include <sys/vtrace.h>
340Sstevel@tonic-gate #include <sys/debug.h>
350Sstevel@tonic-gate #include <sys/cmn_err.h>
360Sstevel@tonic-gate #include <sys/vnode.h>
370Sstevel@tonic-gate #include <sys/bitmap.h>
380Sstevel@tonic-gate #include <sys/lockstat.h>
39*4878Sblakej #include <sys/sysmacros.h>
400Sstevel@tonic-gate #include <sys/condvar_impl.h>
410Sstevel@tonic-gate #include <vm/page.h>
420Sstevel@tonic-gate #include <vm/seg_enum.h>
430Sstevel@tonic-gate #include <vm/vm_dep.h>
440Sstevel@tonic-gate 
/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 * (Both lists are serialized by this single lock.)
 */
kmutex_t  page_freelock;
610Sstevel@tonic-gate 
620Sstevel@tonic-gate /*
630Sstevel@tonic-gate  * The hash table, page_hash[], the p_selock fields, and the
640Sstevel@tonic-gate  * list of pages associated with vnodes are protected by arrays of mutexes.
650Sstevel@tonic-gate  *
660Sstevel@tonic-gate  * Unless the hashes are changed radically, the table sizes must be
670Sstevel@tonic-gate  * a power of two.  Also, we typically need more mutexes for the
680Sstevel@tonic-gate  * vnodes since these locks are occasionally held for long periods.
690Sstevel@tonic-gate  * And since there seem to be two special vnodes (kvp and swapvp),
700Sstevel@tonic-gate  * we make room for private mutexes for them.
710Sstevel@tonic-gate  *
720Sstevel@tonic-gate  * The pse_mutex[] array holds the mutexes to protect the p_selock
730Sstevel@tonic-gate  * fields of all page_t structures.
740Sstevel@tonic-gate  *
750Sstevel@tonic-gate  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
760Sstevel@tonic-gate  * when given a pointer to a page_t.
770Sstevel@tonic-gate  *
78*4878Sblakej  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
790Sstevel@tonic-gate  * should go to the trouble of setting it up at run time and base it
800Sstevel@tonic-gate  * on memory size rather than the number of compile time CPUs.
810Sstevel@tonic-gate  *
82*4878Sblakej  * XX64	We should be using physmem size to calculate PIO_SHIFT.
830Sstevel@tonic-gate  *
840Sstevel@tonic-gate  *	These might break in 64 bit world.
850Sstevel@tonic-gate  */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

/* Page hash chain locks and page io locks; sizes fixed at compile time. */
pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

/*
 * Map a page_t address to its io mutex.  Shifting by PIO_SHIFT
 * (~log2(sizeof(page_t))) makes adjacent page structures hash to
 * different slots.
 */
#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
940Sstevel@tonic-gate 
/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup (see size_pse_array()
 * below for how the size is chosen).
 */
extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
extern int pse_shift;			/* log2(pse_table_size) */

/*
 * Hash a page_t address to one of the pse_mutex[] entries.  The
 * shift/xor fold spreads contiguously-allocated page structures
 * across the table; pse_table_size must be a power of two for the
 * final mask to work.
 */
#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex
105*4878Sblakej 
/*
 * Table of padded mutexes hashed to by PAGE_SZC_MUTEX.
 * NOTE(review): presumably these serialize updates to a page's size
 * code (p_szc) -- confirm against the callers of PAGE_SZC_MUTEX.
 * PSZC_MTX_TABLE_SIZE must remain a power of two for the mask below.
 */
#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

/* Fold three shifted copies of the page_t address into a table index. */
#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
1160Sstevel@tonic-gate 
1170Sstevel@tonic-gate /*
1180Sstevel@tonic-gate  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
1190Sstevel@tonic-gate  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
1200Sstevel@tonic-gate  * and p_vpnext).
1210Sstevel@tonic-gate  *
1220Sstevel@tonic-gate  * The page_vnode_mutex(vp) function returns the address of the appropriate
1230Sstevel@tonic-gate  * mutex from this array given a pointer to a vnode.  It is complicated
1240Sstevel@tonic-gate  * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
1260Sstevel@tonic-gate  *
1270Sstevel@tonic-gate  * The VP_HASH_FUNC returns the index into the vph_mutex array given
1280Sstevel@tonic-gate  * an address of a vnode.
1290Sstevel@tonic-gate  */
1300Sstevel@tonic-gate 
1310Sstevel@tonic-gate /*
1320Sstevel@tonic-gate  * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
1330Sstevel@tonic-gate  *	Need to review again.
1340Sstevel@tonic-gate  */
/* Table size must be a power of two (see mask in VP_HASH_FUNC). */
#if defined(_LP64)
#define	VPH_TABLE_SIZE  (1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

/*
 * Hash a vnode address into vph_mutex[].  Summing several shifted
 * copies of the address spreads vnodes allocated at regular strides
 * across the table.
 */
#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))
1470Sstevel@tonic-gate 
extern	struct vnode	kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.  (Hence the "+ 2" below.)
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
1570Sstevel@tonic-gate 
1580Sstevel@tonic-gate /*
1590Sstevel@tonic-gate  * Initialize the locks used by the Virtual Memory Management system.
1600Sstevel@tonic-gate  */
void
page_lock_init()
{
	/*
	 * Nothing to do: all of the lock tables above are statically
	 * allocated, and kernel mutexes need no explicit run-time
	 * initialization here.  Kept as a hook for the VM startup path.
	 */
}
1650Sstevel@tonic-gate 
1660Sstevel@tonic-gate /*
167*4878Sblakej  * Return a value for pse_shift based on npg (the number of physical pages)
168*4878Sblakej  * and ncpu (the maximum number of CPUs).  This is called by platform startup
169*4878Sblakej  * code.
170*4878Sblakej  *
171*4878Sblakej  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
172*4878Sblakej  * locks grew approximately as the square of the number of threads executing.
173*4878Sblakej  * So the primary scaling factor used is NCPU^2.  The size of the machine in
174*4878Sblakej  * megabytes is used as an upper bound, particularly for sun4v machines which
175*4878Sblakej  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
176*4878Sblakej  * (128) is used as a minimum.  Since the size of the table has to be a power
177*4878Sblakej  * of two, the calculated size is rounded up to the next power of two.
178*4878Sblakej  */
179*4878Sblakej /*ARGSUSED*/
180*4878Sblakej int
181*4878Sblakej size_pse_array(pgcnt_t npg, int ncpu)
182*4878Sblakej {
183*4878Sblakej 	size_t size;
184*4878Sblakej 	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
185*4878Sblakej 
186*4878Sblakej 	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
187*4878Sblakej 	size += (1 << (highbit(size) - 1)) - 1;
188*4878Sblakej 	return (highbit(size) - 1);
189*4878Sblakej }
190*4878Sblakej 
191*4878Sblakej /*
1920Sstevel@tonic-gate  * At present we only use page ownership to aid debugging, so it's
1930Sstevel@tonic-gate  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
1940Sstevel@tonic-gate  * can map to the same owner because we just 'or' in 0x80000000 and
1950Sstevel@tonic-gate  * then clear the second highest bit, so that (for example) 0x2faced00
1960Sstevel@tonic-gate  * and 0xafaced00 both map to 0xafaced00.
1970Sstevel@tonic-gate  * In the 64-bit world, p_selock may not be large enough to hold a full
1980Sstevel@tonic-gate  * thread pointer.  If we ever need precise ownership (e.g. if we implement
1990Sstevel@tonic-gate  * priority inheritance for page locks) then p_selock should become a
2000Sstevel@tonic-gate  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
2010Sstevel@tonic-gate  */
/* p_selock value for an exclusive hold: owner thread id with sign bit set */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
/* p_selock is incremented by SE_READER for each shared hold */
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)
2120Sstevel@tonic-gate 
/*
 * Debug-only event counters, bumped via VM_STAT_ADD() throughout this
 * file.  They are advisory (updated without locking) and compiled out
 * unless VM_STATS is defined.
 */
#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */
2370Sstevel@tonic-gate 
2380Sstevel@tonic-gate /*
2390Sstevel@tonic-gate  * Acquire the "shared/exclusive" lock on a page.
2400Sstevel@tonic-gate  *
2410Sstevel@tonic-gate  * Returns 1 on success and locks the page appropriately.
2420Sstevel@tonic-gate  *	   0 on failure and does not lock the page.
2430Sstevel@tonic-gate  *
2440Sstevel@tonic-gate  * If `lock' is non-NULL, it will be dropped and reacquired in the
2450Sstevel@tonic-gate  * failure case.  This routine can block, and if it does
2460Sstevel@tonic-gate  * it will always return a failure since the page identity [vp, off]
2470Sstevel@tonic-gate  * or state may have changed.
2480Sstevel@tonic-gate  */
2490Sstevel@tonic-gate 
/*
 * Convenience wrapper: acquire the page lock with default flags
 * (es = 0, i.e. no SE_EWANTED priority and no access to retired
 * pages).  See page_lock_es() for the full locking contract.
 */
int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
2550Sstevel@tonic-gate 
2560Sstevel@tonic-gate /*
2570Sstevel@tonic-gate  * With the addition of reader-writer lock semantics to page_lock_es,
2580Sstevel@tonic-gate  * callers wanting an exclusive (writer) lock may prevent shared-lock
2590Sstevel@tonic-gate  * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
2600Sstevel@tonic-gate  * In this case, when an exclusive lock cannot be acquired, p_selock's
261917Selowe  * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
262917Selowe  * if the page is slated for retirement.
263917Selowe  *
264917Selowe  * The se and es parameters determine if the lock should be granted
265917Selowe  * based on the following decision table:
266917Selowe  *
267917Selowe  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
268917Selowe  * ----------- -------------- -------------------  ---------
269917Selowe  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
270917Selowe  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
271917Selowe  * SE_EXCL        none         any lock/any        deny
2722759Selowe  * SE_SHARED      n/a [2]        shared/0          grant
2732759Selowe  * SE_SHARED      n/a [2]      unlocked/0          grant
274917Selowe  * SE_SHARED      n/a            shared/1          deny
275917Selowe  * SE_SHARED      n/a          unlocked/1          deny
276917Selowe  * SE_SHARED      n/a              excl/any        deny
2770Sstevel@tonic-gate  *
278917Selowe  * Notes:
279917Selowe  * [1] The code grants an exclusive lock to the caller and clears the bit
280917Selowe  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
281917Selowe  *   bit's value.  This was deemed acceptable as we are not concerned about
282917Selowe  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
283917Selowe  *   fifo mechanism should also be implemented. Meantime, the thread that
284917Selowe  *   set SE_EWANTED should be prepared to catch this condition and reset it
285917Selowe  *
286917Selowe  * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
2880Sstevel@tonic-gate  *
289917Selowe  * Notes on values of "es":
290917Selowe  *
291917Selowe  *   es & 1: page_lookup_create will attempt page relocation
292917Selowe  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
293917Selowe  *       memory thread); this prevents reader-starvation of waiting
294917Selowe  *       writer thread(s) by giving priority to writers over readers.
295917Selowe  *   es & SE_RETIRED: caller wants to lock pages even if they are
296917Selowe  *       retired.  Default is to deny the lock if the page is retired.
297917Selowe  *
298917Selowe  * And yes, we know, the semantics of this function are too complicated.
299917Selowe  * It's on the list to be cleaned up.
3000Sstevel@tonic-gate  */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	/* If the caller passed a mutex, it must already be holding it. */
	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/* SE_EXCL_WANTED is only legal for exclusive (SE_EXCL) requests. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages may only be locked by callers passing SE_RETIRED. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * es == 1 means page_lookup_create may attempt relocation; take
	 * the lock exclusively up front if nobody else holds it.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			/*
			 * Readers are denied while a writer is waiting
			 * (SE_EWANTED set); see the decision table above.
			 */
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		/* Deleted pages can never be locked again; fail fast. */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
4760Sstevel@tonic-gate 
4770Sstevel@tonic-gate /*
4780Sstevel@tonic-gate  * Clear the SE_EWANTED bit from p_selock.  This function allows
4790Sstevel@tonic-gate  * callers of page_lock_es and page_try_reclaim_lock to clear
4800Sstevel@tonic-gate  * their setting of this bit if they decide they no longer wish
4810Sstevel@tonic-gate  * to gain exclusive access to the page.  Currently only
4820Sstevel@tonic-gate  * delete_memory_thread uses this when the delete memory
4830Sstevel@tonic-gate  * operation is cancelled.
4840Sstevel@tonic-gate  */
4850Sstevel@tonic-gate void
4860Sstevel@tonic-gate page_lock_clr_exclwanted(page_t *pp)
4870Sstevel@tonic-gate {
4880Sstevel@tonic-gate 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
4890Sstevel@tonic-gate 
4900Sstevel@tonic-gate 	mutex_enter(pse);
4910Sstevel@tonic-gate 	pp->p_selock &= ~SE_EWANTED;
4920Sstevel@tonic-gate 	if (CV_HAS_WAITERS(&pp->p_cv))
4930Sstevel@tonic-gate 		cv_broadcast(&pp->p_cv);
4940Sstevel@tonic-gate 	mutex_exit(pse);
4950Sstevel@tonic-gate }
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate /*
4980Sstevel@tonic-gate  * Read the comments inside of page_lock_es() carefully.
4990Sstevel@tonic-gate  *
5000Sstevel@tonic-gate  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
5010Sstevel@tonic-gate  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
5020Sstevel@tonic-gate  * This is used by threads subject to reader-starvation (eg. memory delete).
5030Sstevel@tonic-gate  *
5040Sstevel@tonic-gate  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
5050Sstevel@tonic-gate  * it is expected that it will retry at a later time.  Threads that will
5060Sstevel@tonic-gate  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
5070Sstevel@tonic-gate  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
5080Sstevel@tonic-gate  * the bit is cleared.)
5090Sstevel@tonic-gate  */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	/* Snapshot p_selock; it cannot change while we hold pse. */
	old = pp->p_selock;

	/* SE_EXCL_WANTED is only legal for exclusive (SE_EXCL) requests. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages may only be locked by callers passing SE_RETIRED. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	/*
	 * es == 1 means page_lookup_create may attempt relocation; take
	 * the lock exclusively up front if nobody else holds it.
	 */
	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			/* Writer holds the page, or SE_EWANTED is set. */
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	/* Nonblocking by contract: fail rather than wait on p_cv. */
	mutex_exit(pse);
	return (0);
}
5760Sstevel@tonic-gate 
5770Sstevel@tonic-gate /*
5780Sstevel@tonic-gate  * Acquire a page's "shared/exclusive" lock, but never block.
5790Sstevel@tonic-gate  * Returns 1 on success, 0 on failure.
5800Sstevel@tonic-gate  */
5810Sstevel@tonic-gate int
5820Sstevel@tonic-gate page_trylock(page_t *pp, se_t se)
5830Sstevel@tonic-gate {
5840Sstevel@tonic-gate 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
5850Sstevel@tonic-gate 
5860Sstevel@tonic-gate 	mutex_enter(pse);
587917Selowe 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
588973Selowe 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
589917Selowe 		/*
590917Selowe 		 * Fail if a thread wants exclusive access and page is
591917Selowe 		 * retired, if the page is slated for retirement, or a
592917Selowe 		 * share lock is requested.
593917Selowe 		 */
5940Sstevel@tonic-gate 		mutex_exit(pse);
595917Selowe 		VM_STAT_ADD(page_trylock_failed);
5960Sstevel@tonic-gate 		return (0);
5970Sstevel@tonic-gate 	}
5980Sstevel@tonic-gate 
5990Sstevel@tonic-gate 	if (se == SE_EXCL) {
6000Sstevel@tonic-gate 		if (pp->p_selock == 0) {
6010Sstevel@tonic-gate 			THREAD_KPRI_REQUEST();
6020Sstevel@tonic-gate 			pp->p_selock = SE_WRITER;
6030Sstevel@tonic-gate 			mutex_exit(pse);
6040Sstevel@tonic-gate 			return (1);
6050Sstevel@tonic-gate 		}
6060Sstevel@tonic-gate 	} else {
6070Sstevel@tonic-gate 		if (pp->p_selock >= 0) {
6080Sstevel@tonic-gate 			pp->p_selock += SE_READER;
6090Sstevel@tonic-gate 			mutex_exit(pse);
6100Sstevel@tonic-gate 			return (1);
6110Sstevel@tonic-gate 		}
6120Sstevel@tonic-gate 	}
6130Sstevel@tonic-gate 	mutex_exit(pse);
6140Sstevel@tonic-gate 	return (0);
6150Sstevel@tonic-gate }
6160Sstevel@tonic-gate 
6170Sstevel@tonic-gate /*
618917Selowe  * Variant of page_unlock() specifically for the page freelist
619917Selowe  * code. The mere existence of this code is a vile hack that
620917Selowe  * has resulted due to the backwards locking order of the page
621917Selowe  * freelist manager; please don't call it.
622917Selowe  */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		/* Last reader: drop to unlocked, preserving SE_EWANTED. */
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		/* Deleted pages must never reach an unlock path. */
		panic("page_unlock_nocapture: page %p is deleted", pp);
	} else if (old < 0) {
		/* Writer unlock: clear everything except SE_EWANTED. */
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		/* More than one reader remains; just drop the count. */
		pp->p_selock = old - SE_READER;
	} else {
		/* old == 0 (or SE_EWANTED only): nothing was held. */
		panic("page_unlock_nocapture: page %p is not locked", pp);
	}

	mutex_exit(pse);
}
651917Selowe 
652917Selowe /*
6530Sstevel@tonic-gate  * Release the page's "shared/exclusive" lock and wake up anyone
6540Sstevel@tonic-gate  * who might be waiting for it.
6550Sstevel@tonic-gate  */
6560Sstevel@tonic-gate void
6570Sstevel@tonic-gate page_unlock(page_t *pp)
6580Sstevel@tonic-gate {
6590Sstevel@tonic-gate 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
6600Sstevel@tonic-gate 	selock_t old;
6610Sstevel@tonic-gate 
6620Sstevel@tonic-gate 	mutex_enter(pse);
663917Selowe 
6640Sstevel@tonic-gate 	old = pp->p_selock;
6650Sstevel@tonic-gate 	if ((old & ~SE_EWANTED) == SE_READER) {
6660Sstevel@tonic-gate 		pp->p_selock = old & ~SE_READER;
6670Sstevel@tonic-gate 		if (CV_HAS_WAITERS(&pp->p_cv))
6680Sstevel@tonic-gate 			cv_broadcast(&pp->p_cv);
6690Sstevel@tonic-gate 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
6700Sstevel@tonic-gate 		panic("page_unlock: page %p is deleted", pp);
6710Sstevel@tonic-gate 	} else if (old < 0) {
6720Sstevel@tonic-gate 		THREAD_KPRI_RELEASE();
6730Sstevel@tonic-gate 		pp->p_selock &= SE_EWANTED;
6740Sstevel@tonic-gate 		if (CV_HAS_WAITERS(&pp->p_cv))
6750Sstevel@tonic-gate 			cv_broadcast(&pp->p_cv);
6760Sstevel@tonic-gate 	} else if ((old & ~SE_EWANTED) > SE_READER) {
6770Sstevel@tonic-gate 		pp->p_selock = old - SE_READER;
6780Sstevel@tonic-gate 	} else {
6790Sstevel@tonic-gate 		panic("page_unlock: page %p is not locked", pp);
6800Sstevel@tonic-gate 	}
681917Selowe 
6823253Smec 	if (pp->p_selock == 0) {
683917Selowe 		/*
6843253Smec 		 * If the T_CAPTURING bit is set, that means that we should
6853253Smec 		 * not try and capture the page again as we could recurse
6863253Smec 		 * which could lead to a stack overflow panic or spending a
6873253Smec 		 * relatively long time in the kernel making no progress.
688917Selowe 		 */
6893253Smec 		if ((pp->p_toxic & PR_CAPTURE) &&
6903253Smec 		    !(curthread->t_flag & T_CAPTURING) &&
6913253Smec 		    !PP_RETIRED(pp)) {
692917Selowe 			THREAD_KPRI_REQUEST();
693917Selowe 			pp->p_selock = SE_WRITER;
694917Selowe 			mutex_exit(pse);
6953253Smec 			page_unlock_capture(pp);
696917Selowe 		} else {
697917Selowe 			mutex_exit(pse);
698917Selowe 		}
699917Selowe 	} else {
700917Selowe 		mutex_exit(pse);
701917Selowe 	}
7020Sstevel@tonic-gate }
7030Sstevel@tonic-gate 
7040Sstevel@tonic-gate /*
7050Sstevel@tonic-gate  * Try to upgrade the lock on the page from a "shared" to an
7060Sstevel@tonic-gate  * "exclusive" lock.  Since this upgrade operation is done while
7070Sstevel@tonic-gate  * holding the mutex protecting this page, no one else can acquire this page's
7080Sstevel@tonic-gate  * lock and change the page. Thus, it is safe to drop the "shared"
7090Sstevel@tonic-gate  * lock and attempt to acquire the "exclusive" lock.
7100Sstevel@tonic-gate  *
7110Sstevel@tonic-gate  * Returns 1 on success, 0 on failure.
7120Sstevel@tonic-gate  */
7130Sstevel@tonic-gate int
7140Sstevel@tonic-gate page_tryupgrade(page_t *pp)
7150Sstevel@tonic-gate {
7160Sstevel@tonic-gate 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
7170Sstevel@tonic-gate 
7180Sstevel@tonic-gate 	mutex_enter(pse);
7190Sstevel@tonic-gate 	if (!(pp->p_selock & SE_EWANTED)) {
7200Sstevel@tonic-gate 		/* no threads want exclusive access, try upgrade */
7210Sstevel@tonic-gate 		if (pp->p_selock == SE_READER) {
7220Sstevel@tonic-gate 			THREAD_KPRI_REQUEST();
7230Sstevel@tonic-gate 			/* convert to exclusive lock */
7240Sstevel@tonic-gate 			pp->p_selock = SE_WRITER;
7250Sstevel@tonic-gate 			mutex_exit(pse);
7260Sstevel@tonic-gate 			return (1);
7270Sstevel@tonic-gate 		}
7280Sstevel@tonic-gate 	}
7290Sstevel@tonic-gate 	mutex_exit(pse);
7300Sstevel@tonic-gate 	return (0);
7310Sstevel@tonic-gate }
7320Sstevel@tonic-gate 
/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 * The caller must hold the page exclusively; any threads blocked
 * on the page's condition variable are awakened so readers can
 * proceed.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	/* Preserve the exclusive-waiter flag across the downgrade. */
	excl_waiting =  pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}
7540Sstevel@tonic-gate 
/*
 * Put the page's lock into the SE_DELETED state and wake any
 * waiters so they can observe it and fail their lock attempts.
 * The caller must hold the page exclusively, and the page must
 * already have had its identity removed (no vnode, no offset)
 * without being on the free list.
 */
void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}
7720Sstevel@tonic-gate 
7733253Smec int
7743253Smec page_deleted(page_t *pp)
7753253Smec {
7763253Smec 	return (pp->p_selock == SE_DELETED);
7773253Smec }
7783253Smec 
/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	/* Start with the i/o lock not held and a fresh condition variable. */
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}
7880Sstevel@tonic-gate 
7890Sstevel@tonic-gate /*
7900Sstevel@tonic-gate  * Acquire the i/o lock on a page.
7910Sstevel@tonic-gate  */
7920Sstevel@tonic-gate void
7930Sstevel@tonic-gate page_io_lock(page_t *pp)
7940Sstevel@tonic-gate {
7950Sstevel@tonic-gate 	kmutex_t *pio;
7960Sstevel@tonic-gate 
7970Sstevel@tonic-gate 	pio = PAGE_IO_MUTEX(pp);
7980Sstevel@tonic-gate 	mutex_enter(pio);
7990Sstevel@tonic-gate 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
8000Sstevel@tonic-gate 		cv_wait(&(pp->p_io_cv), pio);
8010Sstevel@tonic-gate 	}
8020Sstevel@tonic-gate 	pp->p_iolock_state |= PAGE_IO_INUSE;
8030Sstevel@tonic-gate 	mutex_exit(pio);
8040Sstevel@tonic-gate }
8050Sstevel@tonic-gate 
8060Sstevel@tonic-gate /*
8070Sstevel@tonic-gate  * Release the i/o lock on a page.
8080Sstevel@tonic-gate  */
8090Sstevel@tonic-gate void
8100Sstevel@tonic-gate page_io_unlock(page_t *pp)
8110Sstevel@tonic-gate {
8120Sstevel@tonic-gate 	kmutex_t *pio;
8130Sstevel@tonic-gate 
8140Sstevel@tonic-gate 	pio = PAGE_IO_MUTEX(pp);
8150Sstevel@tonic-gate 	mutex_enter(pio);
8162999Sstans 	cv_broadcast(&pp->p_io_cv);
8170Sstevel@tonic-gate 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
8180Sstevel@tonic-gate 	mutex_exit(pio);
8190Sstevel@tonic-gate }
8200Sstevel@tonic-gate 
/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	/*
	 * Racy unlocked peek: if the lock already looks busy, fail
	 * cheaply without touching the mutex.  The authoritative
	 * check is repeated below under the mutex.
	 */
	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}
8450Sstevel@tonic-gate 
8460Sstevel@tonic-gate /*
8472999Sstans  * Wait until the i/o lock is not held.
8482999Sstans  */
8492999Sstans void
8502999Sstans page_io_wait(page_t *pp)
8512999Sstans {
8522999Sstans 	kmutex_t *pio;
8532999Sstans 
8542999Sstans 	pio = PAGE_IO_MUTEX(pp);
8552999Sstans 	mutex_enter(pio);
8562999Sstans 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
8572999Sstans 		cv_wait(&(pp->p_io_cv), pio);
8582999Sstans 	}
8592999Sstans 	mutex_exit(pio);
8602999Sstans }
8612999Sstans 
/*
 * Returns non-zero if the i/o lock is currently held, 0 otherwise.
 * (Note: the non-zero value is the raw PAGE_IO_INUSE bit, not
 * necessarily 1.)
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}
8702999Sstans 
/*
 * Assert that the i/o lock on a page is held.
 * Returns non-zero when held, 0 when not; callers typically wrap
 * this in ASSERT().
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}
8800Sstevel@tonic-gate 
8810Sstevel@tonic-gate /*
8820Sstevel@tonic-gate  * Wrapper exported to kernel routines that are built
8830Sstevel@tonic-gate  * platform-independent (the macro is platform-dependent;
8840Sstevel@tonic-gate  * the size of vph_mutex[] is based on NCPU).
8850Sstevel@tonic-gate  *
8860Sstevel@tonic-gate  * Note that you can do stress testing on this by setting the
8870Sstevel@tonic-gate  * variable page_vnode_mutex_stress to something other than
8880Sstevel@tonic-gate  * zero in a DEBUG kernel in a debugger after loading the kernel.
8890Sstevel@tonic-gate  * Setting it after the kernel is running may not work correctly.
8900Sstevel@tonic-gate  */
8910Sstevel@tonic-gate #ifdef DEBUG
8920Sstevel@tonic-gate static int page_vnode_mutex_stress = 0;
8930Sstevel@tonic-gate #endif
8940Sstevel@tonic-gate 
8950Sstevel@tonic-gate kmutex_t *
8960Sstevel@tonic-gate page_vnode_mutex(vnode_t *vp)
8970Sstevel@tonic-gate {
8980Sstevel@tonic-gate 	if (vp == &kvp)
8990Sstevel@tonic-gate 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
9003290Sjohansen 
9013290Sjohansen 	if (vp == &zvp)
9023290Sjohansen 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
9030Sstevel@tonic-gate #ifdef DEBUG
9040Sstevel@tonic-gate 	if (page_vnode_mutex_stress != 0)
9050Sstevel@tonic-gate 		return (&vph_mutex[0]);
9060Sstevel@tonic-gate #endif
9070Sstevel@tonic-gate 
9080Sstevel@tonic-gate 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
9090Sstevel@tonic-gate }
9100Sstevel@tonic-gate 
/*
 * Exported wrapper returning the mutex that guards pp's p_selock
 * field (the PAGE_SE_MUTEX macro is platform-dependent).
 */
kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}
9160Sstevel@tonic-gate 
#ifdef VM_STATS
/* Counters for the four exit/retry paths of page_szc_lock(). */
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root this routine relies on the fact that hat_page_demote() changes
 * root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * since p_szc can only decrease if pp == rootpp
	 * rootpp will be always the same i.e we have the right root
	 * regardless of rootpp->p_szc.
	 * If location of pp's root didn't change after we took
	 * the lock we have the right root. return mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish.
	 * (rszc > pszc: enter/exit the larger root's mutex purely to
	 * block until the in-flight demotion drops it, then retry.)
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}
10070Sstevel@tonic-gate 
/*
 * Returns non-zero if the current thread holds the page_szc_lock()
 * mutex for pp's large-page group; intended for use in ASSERT()s.
 */
int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}
10163446Smrj 
/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

/*
 * Take memsegslock as writer (non-zero argument) or reader.
 */
void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*
 * Drop memsegslock; rw_exit() releases whichever mode was held,
 * so the writer argument is unused.
 */
/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

/*
 * Returns non-zero if memsegslock is held by any thread, in either mode.
 */
int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

/* Acquire the memlist lock for reading. */
void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

/* Release the memlist lock taken by memlist_read_lock(). */
void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

/* Acquire the memlist lock for writing. */
void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

/* Release the memlist lock taken by memlist_write_lock(). */
void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
1069