10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 52205Sdv142724 * Common Development and Distribution License (the "License"). 62205Sdv142724 * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 22*3914Spm145316 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 240Sstevel@tonic-gate */ 250Sstevel@tonic-gate 260Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 270Sstevel@tonic-gate 280Sstevel@tonic-gate /* 290Sstevel@tonic-gate * Big Theory Statement for mutual exclusion locking primitives. 300Sstevel@tonic-gate * 310Sstevel@tonic-gate * A mutex serializes multiple threads so that only one thread 320Sstevel@tonic-gate * (the "owner" of the mutex) is active at a time. See mutex(9F) 330Sstevel@tonic-gate * for a full description of the interfaces and programming model. 340Sstevel@tonic-gate * The rest of this comment describes the implementation. 
 *
 * Mutexes come in two flavors: adaptive and spin.  mutex_init(9F)
 * determines the type based solely on the iblock cookie (PIL) argument.
 * PIL > LOCK_LEVEL implies a spin lock; everything else is adaptive.
 *
 * Spin mutexes block interrupts and spin until the lock becomes available.
 * A thread may not sleep, or call any function that might sleep, while
 * holding a spin mutex.  With few exceptions, spin mutexes should only
 * be used to synchronize with interrupt handlers.
 *
 * Adaptive mutexes (the default type) spin if the owner is running on
 * another CPU and block otherwise.  This policy is based on the assumption
 * that mutex hold times are typically short enough that the time spent
 * spinning is less than the time it takes to block.  If you need mutual
 * exclusion semantics with long hold times, consider an rwlock(9F) as
 * RW_WRITER.  Better still, reconsider the algorithm: if it requires
 * mutual exclusion for long periods of time, it's probably not scalable.
 *
 * Adaptive mutexes are overwhelmingly more common than spin mutexes,
 * so mutex_enter() assumes that the lock is adaptive.  We get away
 * with this by structuring mutexes so that an attempt to acquire a
 * spin mutex as adaptive always fails.  When mutex_enter() fails
 * it punts to mutex_vector_enter(), which does all the hard stuff.
 *
 * mutex_vector_enter() first checks the type.  If it's a spin mutex,
 * we just call lock_set_spl() and return.
 * If it's an adaptive mutex,
 * we check to see what the owner is doing.  If the owner is running,
 * we spin until the lock becomes available; if not, we mark the lock
 * as having waiters and block.
 *
 * Blocking on a mutex is a surprisingly delicate dance because, for speed,
 * mutex_exit() doesn't use an atomic instruction.  Thus we have to work
 * a little harder in the (rarely-executed) blocking path to make sure
 * we don't block on a mutex that's just been released -- otherwise we
 * might never be woken up.
 *
 * The logic for synchronizing mutex_vector_enter() with mutex_exit()
 * in the face of preemption and relaxed memory ordering is as follows:
 *
 * (1) Preemption in the middle of mutex_exit() must cause mutex_exit()
 *     to restart.  Each platform must enforce this by checking the
 *     interrupted PC in the interrupt handler (or on return from trap --
 *     whichever is more convenient for the platform).  If the PC
 *     lies within the critical region of mutex_exit(), the interrupt
 *     handler must reset the PC back to the beginning of mutex_exit().
 *     The critical region consists of all instructions up to, but not
 *     including, the store that clears the lock (which, of course,
 *     must never be executed twice.)
 *
 *     This ensures that the owner will always check for waiters after
 *     resuming from a previous preemption.
860Sstevel@tonic-gate * 870Sstevel@tonic-gate * (2) A thread resuming in mutex_exit() does (at least) the following: 880Sstevel@tonic-gate * 890Sstevel@tonic-gate * when resuming: set CPU_THREAD = owner 900Sstevel@tonic-gate * membar #StoreLoad 910Sstevel@tonic-gate * 920Sstevel@tonic-gate * in mutex_exit: check waiters bit; do wakeup if set 930Sstevel@tonic-gate * membar #LoadStore|#StoreStore 940Sstevel@tonic-gate * clear owner 950Sstevel@tonic-gate * (at this point, other threads may or may not grab 960Sstevel@tonic-gate * the lock, and we may or may not reacquire it) 970Sstevel@tonic-gate * 980Sstevel@tonic-gate * when blocking: membar #StoreStore (due to disp_lock_enter()) 990Sstevel@tonic-gate * set CPU_THREAD = (possibly) someone else 1000Sstevel@tonic-gate * 1010Sstevel@tonic-gate * (3) A thread blocking in mutex_vector_enter() does the following: 1020Sstevel@tonic-gate * 1030Sstevel@tonic-gate * set waiters bit 1040Sstevel@tonic-gate * membar #StoreLoad (via membar_enter()) 1050Sstevel@tonic-gate * check CPU_THREAD for each CPU; abort if owner running 1060Sstevel@tonic-gate * membar #LoadLoad (via membar_consumer()) 1070Sstevel@tonic-gate * check owner and waiters bit; abort if either changed 1080Sstevel@tonic-gate * block 1090Sstevel@tonic-gate * 1100Sstevel@tonic-gate * Thus the global memory orderings for (2) and (3) are as follows: 1110Sstevel@tonic-gate * 1120Sstevel@tonic-gate * (2M) mutex_exit() memory order: 1130Sstevel@tonic-gate * 1140Sstevel@tonic-gate * STORE CPU_THREAD = owner 1150Sstevel@tonic-gate * LOAD waiters bit 1160Sstevel@tonic-gate * STORE owner = NULL 1170Sstevel@tonic-gate * STORE CPU_THREAD = (possibly) someone else 1180Sstevel@tonic-gate * 1190Sstevel@tonic-gate * (3M) mutex_vector_enter() memory order: 1200Sstevel@tonic-gate * 1210Sstevel@tonic-gate * STORE waiters bit = 1 1220Sstevel@tonic-gate * LOAD CPU_THREAD for each CPU 1230Sstevel@tonic-gate * LOAD owner and waiters bit 1240Sstevel@tonic-gate * 1250Sstevel@tonic-gate * It 
has been verified by exhaustive simulation that all possible global 1260Sstevel@tonic-gate * memory orderings of (2M) interleaved with (3M) result in correct 1270Sstevel@tonic-gate * behavior. Moreover, these ordering constraints are minimal: changing 1280Sstevel@tonic-gate * the ordering of anything in (2M) or (3M) breaks the algorithm, creating 1290Sstevel@tonic-gate * windows for missed wakeups. Note: the possibility that other threads 1300Sstevel@tonic-gate * may grab the lock after the owner drops it can be factored out of the 1310Sstevel@tonic-gate * memory ordering analysis because mutex_vector_enter() won't block 1320Sstevel@tonic-gate * if the lock isn't still owned by the same thread. 1330Sstevel@tonic-gate * 1340Sstevel@tonic-gate * The only requirements of code outside the mutex implementation are 1350Sstevel@tonic-gate * (1) mutex_exit() preemption fixup in interrupt handlers or trap return, 1360Sstevel@tonic-gate * and (2) a membar #StoreLoad after setting CPU_THREAD in resume(). 1370Sstevel@tonic-gate * Note: idle threads cannot grab adaptive locks (since they cannot block), 1380Sstevel@tonic-gate * so the membar may be safely omitted when resuming an idle thread. 1390Sstevel@tonic-gate * 1400Sstevel@tonic-gate * When a mutex has waiters, mutex_vector_exit() has several options: 1410Sstevel@tonic-gate * 1420Sstevel@tonic-gate * (1) Choose a waiter and make that thread the owner before waking it; 1430Sstevel@tonic-gate * this is known as "direct handoff" of ownership. 1440Sstevel@tonic-gate * 1450Sstevel@tonic-gate * (2) Drop the lock and wake one waiter. 1460Sstevel@tonic-gate * 1470Sstevel@tonic-gate * (3) Drop the lock, clear the waiters bit, and wake all waiters. 1480Sstevel@tonic-gate * 1490Sstevel@tonic-gate * In many ways (1) is the cleanest solution, but if a lock is moderately 1500Sstevel@tonic-gate * contended it defeats the adaptive spin logic. 
If we make some other 1510Sstevel@tonic-gate * thread the owner, but he's not ONPROC yet, then all other threads on 1520Sstevel@tonic-gate * other cpus that try to get the lock will conclude that the owner is 1530Sstevel@tonic-gate * blocked, so they'll block too. And so on -- it escalates quickly, 1540Sstevel@tonic-gate * with every thread taking the blocking path rather than the spin path. 1550Sstevel@tonic-gate * Thus, direct handoff is *not* a good idea for adaptive mutexes. 1560Sstevel@tonic-gate * 1570Sstevel@tonic-gate * Option (2) is the next most natural-seeming option, but it has several 1580Sstevel@tonic-gate * annoying properties. If there's more than one waiter, we must preserve 1590Sstevel@tonic-gate * the waiters bit on an unheld lock. On cas-capable platforms, where 1600Sstevel@tonic-gate * the waiters bit is part of the lock word, this means that both 0x0 1610Sstevel@tonic-gate * and 0x1 represent unheld locks, so we have to cas against *both*. 1620Sstevel@tonic-gate * Priority inheritance also gets more complicated, because a lock can 1630Sstevel@tonic-gate * have waiters but no owner to whom priority can be willed. So while 1640Sstevel@tonic-gate * it is possible to make option (2) work, it's surprisingly vile. 1650Sstevel@tonic-gate * 1660Sstevel@tonic-gate * Option (3), the least-intuitive at first glance, is what we actually do. 1670Sstevel@tonic-gate * It has the advantage that because you always wake all waiters, you 1680Sstevel@tonic-gate * never have to preserve the waiters bit. Waking all waiters seems like 1690Sstevel@tonic-gate * begging for a thundering herd problem, but consider: under option (2), 1700Sstevel@tonic-gate * every thread that grabs and drops the lock will wake one waiter -- so 1710Sstevel@tonic-gate * if the lock is fairly active, all waiters will be awakened very quickly 1720Sstevel@tonic-gate * anyway. Moreover, this is how adaptive locks are *supposed* to work. 
 * The blocking case is rare; the more common case (by 3-4 orders of
 * magnitude) is that one or more threads spin waiting to get the lock.
 * Only direct handoff can prevent the thundering herd problem, but as
 * mentioned earlier, that would tend to defeat the adaptive spin logic.
 * In practice, option (3) works well because the blocking case is rare.
 */

/*
 * delayed lock retry with exponential delay for spin locks
 *
 * It is noted above that for both the spin locks and the adaptive locks,
 * spinning is the dominant mode of operation.  So long as there is only
 * one thread waiting on a lock, the naive spin loop works very well in
 * cache based architectures.  The lock data structure is pulled into the
 * cache of the processor with the waiting/spinning thread and no further
 * memory traffic is generated until the lock is released.  Unfortunately,
 * once two or more threads are waiting on a lock, the naive spin has
 * the property of generating maximum memory traffic from each spinning
 * thread as the spinning threads contend for the lock data structure.
 *
 * By executing a delay loop before retrying a lock, a waiting thread
 * can reduce its memory traffic by a large factor, depending on the
 * size of the delay loop.
 * A large delay loop greatly reduces the memory
 * traffic, but has the drawback of having a period of time when
 * no thread is attempting to gain the lock even though several threads
 * might be waiting.  A small delay loop has the drawback of not
 * much reduction in memory traffic, but reduces the potential idle time.
 * The theory of the exponential delay code is to start with a short
 * delay loop and double the waiting time on each iteration, up to
 * a preselected maximum.  The BACKOFF_BASE provides the equivalent
 * of 2 to 3 memory references delay for US-III+ and US-IV architectures.
 * The BACKOFF_CAP is the equivalent of 50 to 100 memory references of
 * time (less than 12 microseconds for a 1000 MHz system).
 *
 * To determine appropriate BACKOFF_BASE and BACKOFF_CAP values,
 * studies on US-III+ and US-IV systems using 1 to 66 threads were
 * done.  A range of possible values was studied.
 * Performance differences below 10 threads were not large.  For
 * systems with more threads, substantial increases in total lock
 * throughput were observed with the given values.  For cases where
 * more than 20 threads were waiting on the same lock, lock throughput
 * increased by a factor of 5 or more using the backoff algorithm.
 *
 * Some platforms may provide their own platform specific delay code,
 * using plat_lock_delay(backoff).  If it is available, plat_lock_delay
 * is executed instead of the default delay code.
2190Sstevel@tonic-gate */ 2200Sstevel@tonic-gate 221*3914Spm145316 #pragma weak plat_lock_delay 222*3914Spm145316 2230Sstevel@tonic-gate #include <sys/param.h> 2240Sstevel@tonic-gate #include <sys/time.h> 2250Sstevel@tonic-gate #include <sys/cpuvar.h> 2260Sstevel@tonic-gate #include <sys/thread.h> 2270Sstevel@tonic-gate #include <sys/debug.h> 2280Sstevel@tonic-gate #include <sys/cmn_err.h> 2290Sstevel@tonic-gate #include <sys/sobject.h> 2300Sstevel@tonic-gate #include <sys/turnstile.h> 2310Sstevel@tonic-gate #include <sys/systm.h> 2320Sstevel@tonic-gate #include <sys/mutex_impl.h> 2330Sstevel@tonic-gate #include <sys/spl.h> 2340Sstevel@tonic-gate #include <sys/lockstat.h> 2350Sstevel@tonic-gate #include <sys/atomic.h> 2360Sstevel@tonic-gate #include <sys/cpu.h> 2370Sstevel@tonic-gate #include <sys/stack.h> 2380Sstevel@tonic-gate 2390Sstevel@tonic-gate #define BACKOFF_BASE 50 2400Sstevel@tonic-gate #define BACKOFF_CAP 1600 2410Sstevel@tonic-gate 2420Sstevel@tonic-gate /* 2430Sstevel@tonic-gate * The sobj_ops vector exports a set of functions needed when a thread 2440Sstevel@tonic-gate * is asleep on a synchronization object of this type. 2450Sstevel@tonic-gate */ 2460Sstevel@tonic-gate static sobj_ops_t mutex_sobj_ops = { 2470Sstevel@tonic-gate SOBJ_MUTEX, mutex_owner, turnstile_stay_asleep, turnstile_change_pri 2480Sstevel@tonic-gate }; 2490Sstevel@tonic-gate 2500Sstevel@tonic-gate /* 2510Sstevel@tonic-gate * If the system panics on a mutex, save the address of the offending 2520Sstevel@tonic-gate * mutex in panic_mutex_addr, and save the contents in panic_mutex. 
2530Sstevel@tonic-gate */ 2540Sstevel@tonic-gate static mutex_impl_t panic_mutex; 2550Sstevel@tonic-gate static mutex_impl_t *panic_mutex_addr; 2560Sstevel@tonic-gate 2570Sstevel@tonic-gate static void 2580Sstevel@tonic-gate mutex_panic(char *msg, mutex_impl_t *lp) 2590Sstevel@tonic-gate { 2600Sstevel@tonic-gate if (panicstr) 2610Sstevel@tonic-gate return; 2620Sstevel@tonic-gate 2630Sstevel@tonic-gate if (casptr(&panic_mutex_addr, NULL, lp) == NULL) 2640Sstevel@tonic-gate panic_mutex = *lp; 2650Sstevel@tonic-gate 2660Sstevel@tonic-gate panic("%s, lp=%p owner=%p thread=%p", 2670Sstevel@tonic-gate msg, lp, MUTEX_OWNER(&panic_mutex), curthread); 2680Sstevel@tonic-gate } 2690Sstevel@tonic-gate 2700Sstevel@tonic-gate /* 2710Sstevel@tonic-gate * mutex_vector_enter() is called from the assembly mutex_enter() routine 2720Sstevel@tonic-gate * if the lock is held or is not of type MUTEX_ADAPTIVE. 2730Sstevel@tonic-gate */ 2740Sstevel@tonic-gate void 2750Sstevel@tonic-gate mutex_vector_enter(mutex_impl_t *lp) 2760Sstevel@tonic-gate { 2770Sstevel@tonic-gate kthread_id_t owner; 2780Sstevel@tonic-gate hrtime_t sleep_time = 0; /* how long we slept */ 2790Sstevel@tonic-gate uint_t spin_count = 0; /* how many times we spun */ 2800Sstevel@tonic-gate cpu_t *cpup, *last_cpu; 2810Sstevel@tonic-gate extern cpu_t *cpu_list; 2820Sstevel@tonic-gate turnstile_t *ts; 2830Sstevel@tonic-gate volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp; 2840Sstevel@tonic-gate int backoff; /* current backoff */ 2850Sstevel@tonic-gate int backctr; /* ctr for backoff */ 2862205Sdv142724 int sleep_count = 0; 2870Sstevel@tonic-gate 2880Sstevel@tonic-gate ASSERT_STACK_ALIGNED(); 2890Sstevel@tonic-gate 2900Sstevel@tonic-gate if (MUTEX_TYPE_SPIN(lp)) { 2910Sstevel@tonic-gate lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl, 2920Sstevel@tonic-gate &lp->m_spin.m_oldspl); 2930Sstevel@tonic-gate return; 2940Sstevel@tonic-gate } 2950Sstevel@tonic-gate 2960Sstevel@tonic-gate if (!MUTEX_TYPE_ADAPTIVE(lp)) 
{ 2970Sstevel@tonic-gate mutex_panic("mutex_enter: bad mutex", lp); 2980Sstevel@tonic-gate return; 2990Sstevel@tonic-gate } 3000Sstevel@tonic-gate 3010Sstevel@tonic-gate /* 3020Sstevel@tonic-gate * Adaptive mutexes must not be acquired from above LOCK_LEVEL. 3030Sstevel@tonic-gate * We can migrate after loading CPU but before checking CPU_ON_INTR, 3040Sstevel@tonic-gate * so we must verify by disabling preemption and loading CPU again. 3050Sstevel@tonic-gate */ 3060Sstevel@tonic-gate cpup = CPU; 3070Sstevel@tonic-gate if (CPU_ON_INTR(cpup) && !panicstr) { 3080Sstevel@tonic-gate kpreempt_disable(); 3090Sstevel@tonic-gate if (CPU_ON_INTR(CPU)) 3100Sstevel@tonic-gate mutex_panic("mutex_enter: adaptive at high PIL", lp); 3110Sstevel@tonic-gate kpreempt_enable(); 3120Sstevel@tonic-gate } 3130Sstevel@tonic-gate 3140Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1); 3150Sstevel@tonic-gate 316*3914Spm145316 if (&plat_lock_delay) { 317*3914Spm145316 backoff = 0; 318*3914Spm145316 } else { 319*3914Spm145316 backoff = BACKOFF_BASE; 320*3914Spm145316 } 3210Sstevel@tonic-gate 3220Sstevel@tonic-gate for (;;) { 3230Sstevel@tonic-gate spin: 3240Sstevel@tonic-gate spin_count++; 3250Sstevel@tonic-gate /* 3260Sstevel@tonic-gate * Add an exponential backoff delay before trying again 3270Sstevel@tonic-gate * to touch the mutex data structure. 3280Sstevel@tonic-gate * the spin_count test and call to nulldev are to prevent 3290Sstevel@tonic-gate * the compiler optimizer from eliminating the delay loop. 
3300Sstevel@tonic-gate */ 331*3914Spm145316 if (&plat_lock_delay) { 332*3914Spm145316 plat_lock_delay(&backoff); 333*3914Spm145316 } else { 334*3914Spm145316 for (backctr = backoff; backctr; backctr--) { 335*3914Spm145316 if (!spin_count) (void) nulldev(); 336*3914Spm145316 }; /* delay */ 337*3914Spm145316 backoff = backoff << 1; /* double it */ 338*3914Spm145316 if (backoff > BACKOFF_CAP) { 339*3914Spm145316 backoff = BACKOFF_CAP; 340*3914Spm145316 } 341*3914Spm145316 342*3914Spm145316 SMT_PAUSE(); 3430Sstevel@tonic-gate } 3440Sstevel@tonic-gate 3450Sstevel@tonic-gate if (panicstr) 3460Sstevel@tonic-gate return; 3470Sstevel@tonic-gate 3480Sstevel@tonic-gate if ((owner = MUTEX_OWNER(vlp)) == NULL) { 3490Sstevel@tonic-gate if (mutex_adaptive_tryenter(lp)) 3500Sstevel@tonic-gate break; 3510Sstevel@tonic-gate continue; 3520Sstevel@tonic-gate } 3530Sstevel@tonic-gate 3540Sstevel@tonic-gate if (owner == curthread) 3550Sstevel@tonic-gate mutex_panic("recursive mutex_enter", lp); 3560Sstevel@tonic-gate 3570Sstevel@tonic-gate /* 3580Sstevel@tonic-gate * If lock is held but owner is not yet set, spin. 3590Sstevel@tonic-gate * (Only relevant for platforms that don't have cas.) 3600Sstevel@tonic-gate */ 3610Sstevel@tonic-gate if (owner == MUTEX_NO_OWNER) 3620Sstevel@tonic-gate continue; 3630Sstevel@tonic-gate 3640Sstevel@tonic-gate /* 3650Sstevel@tonic-gate * When searching the other CPUs, start with the one where 3660Sstevel@tonic-gate * we last saw the owner thread. If owner is running, spin. 3670Sstevel@tonic-gate * 3680Sstevel@tonic-gate * We must disable preemption at this point to guarantee 3690Sstevel@tonic-gate * that the list doesn't change while we traverse it 3700Sstevel@tonic-gate * without the cpu_lock mutex. While preemption is 3710Sstevel@tonic-gate * disabled, we must revalidate our cached cpu pointer. 
3720Sstevel@tonic-gate */ 3730Sstevel@tonic-gate kpreempt_disable(); 3740Sstevel@tonic-gate if (cpup->cpu_next == NULL) 3750Sstevel@tonic-gate cpup = cpu_list; 3760Sstevel@tonic-gate last_cpu = cpup; /* mark end of search */ 3770Sstevel@tonic-gate do { 3780Sstevel@tonic-gate if (cpup->cpu_thread == owner) { 3790Sstevel@tonic-gate kpreempt_enable(); 3800Sstevel@tonic-gate goto spin; 3810Sstevel@tonic-gate } 3820Sstevel@tonic-gate } while ((cpup = cpup->cpu_next) != last_cpu); 3830Sstevel@tonic-gate kpreempt_enable(); 3840Sstevel@tonic-gate 3850Sstevel@tonic-gate /* 3860Sstevel@tonic-gate * The owner appears not to be running, so block. 3870Sstevel@tonic-gate * See the Big Theory Statement for memory ordering issues. 3880Sstevel@tonic-gate */ 3890Sstevel@tonic-gate ts = turnstile_lookup(lp); 3900Sstevel@tonic-gate MUTEX_SET_WAITERS(lp); 3910Sstevel@tonic-gate membar_enter(); 3920Sstevel@tonic-gate 3930Sstevel@tonic-gate /* 3940Sstevel@tonic-gate * Recheck whether owner is running after waiters bit hits 3950Sstevel@tonic-gate * global visibility (above). If owner is running, spin. 3960Sstevel@tonic-gate * 3970Sstevel@tonic-gate * Since we are at ipl DISP_LEVEL, kernel preemption is 3980Sstevel@tonic-gate * disabled, however we still need to revalidate our cached 3990Sstevel@tonic-gate * cpu pointer to make sure the cpu hasn't been deleted. 4000Sstevel@tonic-gate */ 4010Sstevel@tonic-gate if (cpup->cpu_next == NULL) 4020Sstevel@tonic-gate last_cpu = cpup = cpu_list; 4030Sstevel@tonic-gate do { 4040Sstevel@tonic-gate if (cpup->cpu_thread == owner) { 4050Sstevel@tonic-gate turnstile_exit(lp); 4060Sstevel@tonic-gate goto spin; 4070Sstevel@tonic-gate } 4080Sstevel@tonic-gate } while ((cpup = cpup->cpu_next) != last_cpu); 4090Sstevel@tonic-gate membar_consumer(); 4100Sstevel@tonic-gate 4110Sstevel@tonic-gate /* 4120Sstevel@tonic-gate * If owner and waiters bit are unchanged, block. 
4130Sstevel@tonic-gate */ 4140Sstevel@tonic-gate if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) { 4150Sstevel@tonic-gate sleep_time -= gethrtime(); 4160Sstevel@tonic-gate (void) turnstile_block(ts, TS_WRITER_Q, lp, 4170Sstevel@tonic-gate &mutex_sobj_ops, NULL, NULL); 4180Sstevel@tonic-gate sleep_time += gethrtime(); 4192205Sdv142724 sleep_count++; 4200Sstevel@tonic-gate } else { 4210Sstevel@tonic-gate turnstile_exit(lp); 4220Sstevel@tonic-gate } 4230Sstevel@tonic-gate } 4240Sstevel@tonic-gate 4250Sstevel@tonic-gate ASSERT(MUTEX_OWNER(lp) == curthread); 4260Sstevel@tonic-gate 4272205Sdv142724 if (sleep_time != 0) { 4282205Sdv142724 /* 4292205Sdv142724 * Note, sleep time is the sum of all the sleeping we 4302205Sdv142724 * did. 4312205Sdv142724 */ 4320Sstevel@tonic-gate LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time); 4330Sstevel@tonic-gate } 4340Sstevel@tonic-gate 4352205Sdv142724 /* 4362205Sdv142724 * We do not count a sleep as a spin. 4372205Sdv142724 */ 4382205Sdv142724 if (spin_count > sleep_count) 4392205Sdv142724 LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp, 4402205Sdv142724 spin_count - sleep_count); 4412205Sdv142724 4420Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp); 4430Sstevel@tonic-gate } 4440Sstevel@tonic-gate 4450Sstevel@tonic-gate /* 4460Sstevel@tonic-gate * mutex_vector_tryenter() is called from the assembly mutex_tryenter() 4470Sstevel@tonic-gate * routine if the lock is held or is not of type MUTEX_ADAPTIVE. 
4480Sstevel@tonic-gate */ 4490Sstevel@tonic-gate int 4500Sstevel@tonic-gate mutex_vector_tryenter(mutex_impl_t *lp) 4510Sstevel@tonic-gate { 4520Sstevel@tonic-gate int s; 4530Sstevel@tonic-gate 4540Sstevel@tonic-gate if (MUTEX_TYPE_ADAPTIVE(lp)) 4550Sstevel@tonic-gate return (0); /* we already tried in assembly */ 4560Sstevel@tonic-gate 4570Sstevel@tonic-gate if (!MUTEX_TYPE_SPIN(lp)) { 4580Sstevel@tonic-gate mutex_panic("mutex_tryenter: bad mutex", lp); 4590Sstevel@tonic-gate return (0); 4600Sstevel@tonic-gate } 4610Sstevel@tonic-gate 4620Sstevel@tonic-gate s = splr(lp->m_spin.m_minspl); 4630Sstevel@tonic-gate if (lock_try(&lp->m_spin.m_spinlock)) { 4640Sstevel@tonic-gate lp->m_spin.m_oldspl = (ushort_t)s; 4650Sstevel@tonic-gate return (1); 4660Sstevel@tonic-gate } 4670Sstevel@tonic-gate splx(s); 4680Sstevel@tonic-gate return (0); 4690Sstevel@tonic-gate } 4700Sstevel@tonic-gate 4710Sstevel@tonic-gate /* 4720Sstevel@tonic-gate * mutex_vector_exit() is called from mutex_exit() if the lock is not 4730Sstevel@tonic-gate * adaptive, has waiters, or is not owned by the current thread (panic). 
4740Sstevel@tonic-gate */ 4750Sstevel@tonic-gate void 4760Sstevel@tonic-gate mutex_vector_exit(mutex_impl_t *lp) 4770Sstevel@tonic-gate { 4780Sstevel@tonic-gate turnstile_t *ts; 4790Sstevel@tonic-gate 4800Sstevel@tonic-gate if (MUTEX_TYPE_SPIN(lp)) { 4810Sstevel@tonic-gate lock_clear_splx(&lp->m_spin.m_spinlock, lp->m_spin.m_oldspl); 4820Sstevel@tonic-gate return; 4830Sstevel@tonic-gate } 4840Sstevel@tonic-gate 4850Sstevel@tonic-gate if (MUTEX_OWNER(lp) != curthread) { 4860Sstevel@tonic-gate mutex_panic("mutex_exit: not owner", lp); 4870Sstevel@tonic-gate return; 4880Sstevel@tonic-gate } 4890Sstevel@tonic-gate 4900Sstevel@tonic-gate ts = turnstile_lookup(lp); 4910Sstevel@tonic-gate MUTEX_CLEAR_LOCK_AND_WAITERS(lp); 4920Sstevel@tonic-gate if (ts == NULL) 4930Sstevel@tonic-gate turnstile_exit(lp); 4940Sstevel@tonic-gate else 4950Sstevel@tonic-gate turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL); 4960Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_EXIT_RELEASE, lp); 4970Sstevel@tonic-gate } 4980Sstevel@tonic-gate 4990Sstevel@tonic-gate int 5000Sstevel@tonic-gate mutex_owned(kmutex_t *mp) 5010Sstevel@tonic-gate { 5020Sstevel@tonic-gate mutex_impl_t *lp = (mutex_impl_t *)mp; 5030Sstevel@tonic-gate 5040Sstevel@tonic-gate if (panicstr) 5050Sstevel@tonic-gate return (1); 5060Sstevel@tonic-gate 5070Sstevel@tonic-gate if (MUTEX_TYPE_ADAPTIVE(lp)) 5080Sstevel@tonic-gate return (MUTEX_OWNER(lp) == curthread); 5090Sstevel@tonic-gate return (LOCK_HELD(&lp->m_spin.m_spinlock)); 5100Sstevel@tonic-gate } 5110Sstevel@tonic-gate 5120Sstevel@tonic-gate kthread_t * 5130Sstevel@tonic-gate mutex_owner(kmutex_t *mp) 5140Sstevel@tonic-gate { 5150Sstevel@tonic-gate mutex_impl_t *lp = (mutex_impl_t *)mp; 5160Sstevel@tonic-gate kthread_id_t t; 5170Sstevel@tonic-gate 5180Sstevel@tonic-gate if (MUTEX_TYPE_ADAPTIVE(lp) && (t = MUTEX_OWNER(lp)) != MUTEX_NO_OWNER) 5190Sstevel@tonic-gate return (t); 5200Sstevel@tonic-gate return (NULL); 5210Sstevel@tonic-gate } 5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate /* 5240Sstevel@tonic-gate * The iblock cookie 'ibc' is the spl level associated with the lock; 5250Sstevel@tonic-gate * this alone determines whether the lock will be ADAPTIVE or SPIN. 5260Sstevel@tonic-gate * 5270Sstevel@tonic-gate * Adaptive mutexes created in zeroed memory do not need to call 5280Sstevel@tonic-gate * mutex_init() as their allocation in this fashion guarantees 5290Sstevel@tonic-gate * their initialization. 5300Sstevel@tonic-gate * eg adaptive mutexes created as static within the BSS or allocated 5310Sstevel@tonic-gate * by kmem_zalloc(). 5320Sstevel@tonic-gate */ 5330Sstevel@tonic-gate /* ARGSUSED */ 5340Sstevel@tonic-gate void 5350Sstevel@tonic-gate mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc) 5360Sstevel@tonic-gate { 5370Sstevel@tonic-gate mutex_impl_t *lp = (mutex_impl_t *)mp; 5380Sstevel@tonic-gate 5390Sstevel@tonic-gate ASSERT(ibc < (void *)KERNELBASE); /* see 1215173 */ 5400Sstevel@tonic-gate 5410Sstevel@tonic-gate if ((intptr_t)ibc > ipltospl(LOCK_LEVEL) && ibc < (void *)KERNELBASE) { 5420Sstevel@tonic-gate ASSERT(type != MUTEX_ADAPTIVE && type != MUTEX_DEFAULT); 5430Sstevel@tonic-gate MUTEX_SET_TYPE(lp, MUTEX_SPIN); 5440Sstevel@tonic-gate LOCK_INIT_CLEAR(&lp->m_spin.m_spinlock); 5450Sstevel@tonic-gate LOCK_INIT_HELD(&lp->m_spin.m_dummylock); 5460Sstevel@tonic-gate lp->m_spin.m_minspl = (int)(intptr_t)ibc; 5470Sstevel@tonic-gate } else { 5480Sstevel@tonic-gate ASSERT(type != MUTEX_SPIN); 5490Sstevel@tonic-gate MUTEX_SET_TYPE(lp, MUTEX_ADAPTIVE); 5500Sstevel@tonic-gate MUTEX_CLEAR_LOCK_AND_WAITERS(lp); 5510Sstevel@tonic-gate } 5520Sstevel@tonic-gate } 5530Sstevel@tonic-gate 5540Sstevel@tonic-gate void 5550Sstevel@tonic-gate mutex_destroy(kmutex_t *mp) 5560Sstevel@tonic-gate { 5570Sstevel@tonic-gate mutex_impl_t *lp = (mutex_impl_t *)mp; 5580Sstevel@tonic-gate 5590Sstevel@tonic-gate if (lp->m_owner == 0 && !MUTEX_HAS_WAITERS(lp)) { 5600Sstevel@tonic-gate MUTEX_DESTROY(lp); 5610Sstevel@tonic-gate 
} else if (MUTEX_TYPE_SPIN(lp)) { 5620Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp); 5630Sstevel@tonic-gate MUTEX_DESTROY(lp); 5640Sstevel@tonic-gate } else if (MUTEX_TYPE_ADAPTIVE(lp)) { 5650Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp); 5660Sstevel@tonic-gate if (MUTEX_OWNER(lp) != curthread) 5670Sstevel@tonic-gate mutex_panic("mutex_destroy: not owner", lp); 5680Sstevel@tonic-gate if (MUTEX_HAS_WAITERS(lp)) { 5690Sstevel@tonic-gate turnstile_t *ts = turnstile_lookup(lp); 5700Sstevel@tonic-gate turnstile_exit(lp); 5710Sstevel@tonic-gate if (ts != NULL) 5720Sstevel@tonic-gate mutex_panic("mutex_destroy: has waiters", lp); 5730Sstevel@tonic-gate } 5740Sstevel@tonic-gate MUTEX_DESTROY(lp); 5750Sstevel@tonic-gate } else { 5760Sstevel@tonic-gate mutex_panic("mutex_destroy: bad mutex", lp); 5770Sstevel@tonic-gate } 5780Sstevel@tonic-gate } 5790Sstevel@tonic-gate 5800Sstevel@tonic-gate /* 5810Sstevel@tonic-gate * Simple C support for the cases where spin locks miss on the first try. 
5820Sstevel@tonic-gate */ 5830Sstevel@tonic-gate void 5840Sstevel@tonic-gate lock_set_spin(lock_t *lp) 5850Sstevel@tonic-gate { 5860Sstevel@tonic-gate int spin_count = 1; 5870Sstevel@tonic-gate int backoff; /* current backoff */ 5880Sstevel@tonic-gate int backctr; /* ctr for backoff */ 5890Sstevel@tonic-gate 5900Sstevel@tonic-gate if (panicstr) 5910Sstevel@tonic-gate return; 5920Sstevel@tonic-gate 5930Sstevel@tonic-gate if (ncpus == 1) 5940Sstevel@tonic-gate panic("lock_set: %p lock held and only one CPU", lp); 5950Sstevel@tonic-gate 596*3914Spm145316 if (&plat_lock_delay) { 597*3914Spm145316 backoff = 0; 598*3914Spm145316 } else { 599*3914Spm145316 backoff = BACKOFF_BASE; 600*3914Spm145316 } 601*3914Spm145316 6020Sstevel@tonic-gate while (LOCK_HELD(lp) || !lock_spin_try(lp)) { 6030Sstevel@tonic-gate if (panicstr) 6040Sstevel@tonic-gate return; 6050Sstevel@tonic-gate spin_count++; 6060Sstevel@tonic-gate /* 6070Sstevel@tonic-gate * Add an exponential backoff delay before trying again 6080Sstevel@tonic-gate * to touch the mutex data structure. 6090Sstevel@tonic-gate * the spin_count test and call to nulldev are to prevent 6100Sstevel@tonic-gate * the compiler optimizer from eliminating the delay loop. 
6110Sstevel@tonic-gate */ 612*3914Spm145316 if (&plat_lock_delay) { 613*3914Spm145316 plat_lock_delay(&backoff); 614*3914Spm145316 } else { 615*3914Spm145316 /* delay */ 616*3914Spm145316 for (backctr = backoff; backctr; backctr--) { 617*3914Spm145316 if (!spin_count) (void) nulldev(); 618*3914Spm145316 } 6190Sstevel@tonic-gate 620*3914Spm145316 backoff = backoff << 1; /* double it */ 621*3914Spm145316 if (backoff > BACKOFF_CAP) { 622*3914Spm145316 backoff = BACKOFF_CAP; 623*3914Spm145316 } 624*3914Spm145316 SMT_PAUSE(); 6250Sstevel@tonic-gate } 6260Sstevel@tonic-gate } 6270Sstevel@tonic-gate 6280Sstevel@tonic-gate if (spin_count) { 6290Sstevel@tonic-gate LOCKSTAT_RECORD(LS_LOCK_SET_SPIN, lp, spin_count); 6300Sstevel@tonic-gate } 6310Sstevel@tonic-gate 6320Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE, lp); 6330Sstevel@tonic-gate } 6340Sstevel@tonic-gate 6350Sstevel@tonic-gate void 6360Sstevel@tonic-gate lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil) 6370Sstevel@tonic-gate { 6380Sstevel@tonic-gate int spin_count = 1; 6390Sstevel@tonic-gate int backoff; /* current backoff */ 6400Sstevel@tonic-gate int backctr; /* ctr for backoff */ 6410Sstevel@tonic-gate 6420Sstevel@tonic-gate if (panicstr) 6430Sstevel@tonic-gate return; 6440Sstevel@tonic-gate 6450Sstevel@tonic-gate if (ncpus == 1) 6460Sstevel@tonic-gate panic("lock_set_spl: %p lock held and only one CPU", lp); 6470Sstevel@tonic-gate 6480Sstevel@tonic-gate ASSERT(new_pil > LOCK_LEVEL); 6490Sstevel@tonic-gate 650*3914Spm145316 if (&plat_lock_delay) { 651*3914Spm145316 backoff = 0; 652*3914Spm145316 } else { 653*3914Spm145316 backoff = BACKOFF_BASE; 654*3914Spm145316 } 6550Sstevel@tonic-gate do { 6560Sstevel@tonic-gate splx(old_pil); 6570Sstevel@tonic-gate while (LOCK_HELD(lp)) { 6580Sstevel@tonic-gate if (panicstr) { 6590Sstevel@tonic-gate *old_pil_addr = (ushort_t)splr(new_pil); 6600Sstevel@tonic-gate return; 6610Sstevel@tonic-gate } 6620Sstevel@tonic-gate spin_count++; 
6630Sstevel@tonic-gate /* 6640Sstevel@tonic-gate * Add an exponential backoff delay before trying again 6650Sstevel@tonic-gate * to touch the mutex data structure. 6660Sstevel@tonic-gate * spin_count test and call to nulldev are to prevent 6670Sstevel@tonic-gate * compiler optimizer from eliminating the delay loop. 6680Sstevel@tonic-gate */ 669*3914Spm145316 if (&plat_lock_delay) { 670*3914Spm145316 plat_lock_delay(&backoff); 671*3914Spm145316 } else { 672*3914Spm145316 for (backctr = backoff; backctr; backctr--) { 673*3914Spm145316 if (!spin_count) (void) nulldev(); 674*3914Spm145316 } 675*3914Spm145316 backoff = backoff << 1; /* double it */ 676*3914Spm145316 if (backoff > BACKOFF_CAP) { 677*3914Spm145316 backoff = BACKOFF_CAP; 678*3914Spm145316 } 679*3914Spm145316 680*3914Spm145316 SMT_PAUSE(); 6810Sstevel@tonic-gate } 6820Sstevel@tonic-gate } 6830Sstevel@tonic-gate old_pil = splr(new_pil); 6840Sstevel@tonic-gate } while (!lock_spin_try(lp)); 6850Sstevel@tonic-gate 6860Sstevel@tonic-gate *old_pil_addr = (ushort_t)old_pil; 6870Sstevel@tonic-gate 6880Sstevel@tonic-gate if (spin_count) { 6890Sstevel@tonic-gate LOCKSTAT_RECORD(LS_LOCK_SET_SPL_SPIN, lp, spin_count); 6900Sstevel@tonic-gate } 6910Sstevel@tonic-gate 6920Sstevel@tonic-gate LOCKSTAT_RECORD(LS_LOCK_SET_SPL_ACQUIRE, lp, spin_count); 6930Sstevel@tonic-gate } 694