/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t	page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t	page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
extern int pse_shift;			/* log2(pse_table_size) */

#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex
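
/*
 * Illustrative example (hypothetical numbers, not taken from any
 * platform's startup code): with pse_shift == 13 and pse_table_size ==
 * 8192, a page_t at address 0xfffff01234567880 would select
 *
 *	(((0xfffff01234567880 >> 13) ^ 0xfffff01234567880) >> 7) & 8191
 *
 * The xor folds two strata of address bits into the index, and the
 * final ">> 7" discards the low-order bits that carry little
 * information, since adjacent page_t structures are roughly 2^7 bytes
 * apart (see the PIO_SHIFT comment above).
 */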

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains
 * (i.e., the lists of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#if defined(_LP64)
#define	VPH_TABLE_SIZE	(1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel
 * vnodes.  The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp
 * is VPH_TABLE_SIZE + 1.
 */
kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}
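
/*
 * Worked example of the sizing arithmetic (hypothetical machine,
 * assuming 4K pages): with 16 GB of memory, npg is 4M pages and
 * pp_per_mb is 256, so npg / pp_per_mb == 16384; with ncpu == 64,
 * 2 * ncpu * ncpu == 8192.  The smaller value (8192) wins and is above
 * the minimum of 128.  The round-up to a power of two then leaves it
 * unchanged: 8192 + 8191 == 16383 and highbit(16383) - 1 == 13, so the
 * function returns a pse_shift of 13, i.e. an 8192-entry pse_mutex[]
 * table.
 */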

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 * 0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */
int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
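
/*
 * A minimal sketch of the caller-side retry pattern (hypothetical
 * caller, not from this file): because page_lock() drops `lock' and
 * blocks on failure, the caller must revalidate the page identity
 * before retrying, e.g.
 *
 *	mutex_enter(phm);			(hash mutex for [vp, off])
 *	pp = ...lookup of [vp, off]...;
 *	while (pp != NULL &&
 *	    !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *		pp = ...lookup of [vp, off] again...;
 *	}
 *	mutex_exit(phm);
 *
 * The hash mutex and the lookup are stand-ins for whatever structure
 * the caller uses; the point is only the revalidate-and-retry loop.
 */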

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted	es flags	p_selock/SE_EWANTED	Action
 * ----------	--------------	-------------------	---------
 * SE_EXCL	any [1][2]	unlocked/any		grant lock, clear SE_EWANTED
 * SE_EXCL	SE_EWANTED	any lock/any		deny, set SE_EWANTED
 * SE_EXCL	none		any lock/any		deny
 * SE_SHARED	n/a [2]		shared/0		grant
 * SE_SHARED	n/a [2]		unlocked/0		grant
 * SE_SHARED	n/a		shared/1		deny
 * SE_SHARED	n/a		unlocked/1		deny
 * SE_SHARED	n/a		excl/any		deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *     SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *     bit's value.  This was deemed acceptable as we are not concerned about
 *     exclusive-lock starvation.  If this ever becomes an issue, a priority
 *     or fifo mechanism should also be implemented.  Meantime, the thread
 *     that set SE_EWANTED should be prepared to catch this condition and
 *     reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *     disposition of se, unless the es parameter has the SE_RETIRED flag
 *     set.
 *
 * Notes on values of "es":
 *
 *	es & 1: page_lookup_create will attempt page relocation
 *	es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 *	memory thread); this prevents reader-starvation of waiting
 *	writer thread(s) by giving priority to writers over readers.
 *	es & SE_RETIRED: caller wants to lock pages even if they are
 *	retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation), we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 * if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
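
/*
 * Sketch of the SE_EXCL_WANTED protocol (hypothetical caller, modeled
 * on the delete-memory usage described above): a writer that gives up
 * must clear the bit itself.
 *
 *	if (!page_lock_es(pp, SE_EXCL, NULL, P_NO_RECLAIM,
 *	    SE_EXCL_WANTED)) {
 *		...either retry later (SE_EWANTED keeps readers out)...
 *		...or abandon the attempt:
 *		page_lock_clr_exclwanted(pp);
 *	}
 *
 * On success, page_lock_es() itself clears SE_EWANTED.
 */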

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (eg. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if a pending writer wants exclusive access
		 * (SE_EWANTED), if the page is retired, or if a share
		 * lock is requested on a page slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this function is a vile hack forced
 * on us by the backwards locking order of the page freelist manager;
 * please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked", pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try to capture the page again, as we could recurse,
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this
 * page's lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns non-zero if the page's i/o lock is held, 0 if it is not.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns non-zero if it is held, 0 if it is not.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}
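
/*
 * Typical usage sketch (hypothetical caller): the i/o lock serializes
 * i/o on a page independently of p_selock.
 *
 *	page_io_lock(pp);	(blocks while another i/o is in progress)
 *	...start the i/o and wait for it to complete...
 *	page_io_unlock(pp);	(wakes threads in page_io_lock/page_io_wait)
 */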

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef	DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef	DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}
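
/*
 * Minimal usage sketch (hypothetical caller): the vnode page-chain
 * mutex must be held while examining a vnode's page list, which is
 * anchored at v_pages and linked via p_vpnext/p_vpprev.
 *
 *	vphm = page_vnode_mutex(vp);
 *	mutex_enter(vphm);
 *	pp = vp->v_pages;
 *	...examine the chain...
 *	mutex_exit(vphm);
 */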

#ifdef VM_STATS
uint_t	pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before the hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned, pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned, pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */
	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will
	 * always be the same, i.e. we have the right root regardless
	 * of rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root.  Return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * Root location changed because the page got demoted.
	 * Locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote is not done yet.
	 * Wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}
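
/*
 * Usage sketch (hypothetical caller, per the comment above): block
 * hat_page_demote() from demoting a large page while its constituent
 * pages are examined.
 *
 *	mtx = page_szc_lock(pp);	(NULL means p_szc is 0)
 *	...p_szc cannot change while `mtx' is held...
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 */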

/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
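
/*
 * Usage sketch (hypothetical caller; the struct memlist field names are
 * assumptions, not defined in this file): readers of the phys_install
 * and phys_avail memlists bracket the walk with the read lock.
 *
 *	memlist_read_lock();
 *	for (ml = phys_install; ml != NULL; ml = ml->next)
 *		...examine ml->address and ml->size...
 *	memlist_read_unlock();
 */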