10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
52205Sdv142724 * Common Development and Distribution License (the "License").
62205Sdv142724 * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate /*
22*9160SSherry.Moore@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
230Sstevel@tonic-gate * Use is subject to license terms.
240Sstevel@tonic-gate */
250Sstevel@tonic-gate
260Sstevel@tonic-gate /*
270Sstevel@tonic-gate * Big Theory Statement for mutual exclusion locking primitives.
280Sstevel@tonic-gate *
290Sstevel@tonic-gate * A mutex serializes multiple threads so that only one thread
300Sstevel@tonic-gate * (the "owner" of the mutex) is active at a time. See mutex(9F)
310Sstevel@tonic-gate * for a full description of the interfaces and programming model.
320Sstevel@tonic-gate * The rest of this comment describes the implementation.
330Sstevel@tonic-gate *
340Sstevel@tonic-gate * Mutexes come in two flavors: adaptive and spin. mutex_init(9F)
350Sstevel@tonic-gate * determines the type based solely on the iblock cookie (PIL) argument.
360Sstevel@tonic-gate * PIL > LOCK_LEVEL implies a spin lock; everything else is adaptive.
370Sstevel@tonic-gate *
380Sstevel@tonic-gate * Spin mutexes block interrupts and spin until the lock becomes available.
390Sstevel@tonic-gate * A thread may not sleep, or call any function that might sleep, while
400Sstevel@tonic-gate * holding a spin mutex. With few exceptions, spin mutexes should only
410Sstevel@tonic-gate * be used to synchronize with interrupt handlers.
420Sstevel@tonic-gate *
430Sstevel@tonic-gate * Adaptive mutexes (the default type) spin if the owner is running on
440Sstevel@tonic-gate * another CPU and block otherwise. This policy is based on the assumption
450Sstevel@tonic-gate * that mutex hold times are typically short enough that the time spent
460Sstevel@tonic-gate * spinning is less than the time it takes to block. If you need mutual
470Sstevel@tonic-gate * exclusion semantics with long hold times, consider an rwlock(9F) as
480Sstevel@tonic-gate * RW_WRITER. Better still, reconsider the algorithm: if it requires
490Sstevel@tonic-gate * mutual exclusion for long periods of time, it's probably not scalable.
500Sstevel@tonic-gate *
510Sstevel@tonic-gate * Adaptive mutexes are overwhelmingly more common than spin mutexes,
520Sstevel@tonic-gate * so mutex_enter() assumes that the lock is adaptive. We get away
530Sstevel@tonic-gate * with this by structuring mutexes so that an attempt to acquire a
540Sstevel@tonic-gate * spin mutex as adaptive always fails. When mutex_enter() fails
550Sstevel@tonic-gate * it punts to mutex_vector_enter(), which does all the hard stuff.
560Sstevel@tonic-gate *
570Sstevel@tonic-gate * mutex_vector_enter() first checks the type. If it's a spin mutex,
580Sstevel@tonic-gate * we just call lock_set_spl() and return. If it's an adaptive mutex,
590Sstevel@tonic-gate * we check to see what the owner is doing. If the owner is running,
600Sstevel@tonic-gate * we spin until the lock becomes available; if not, we mark the lock
610Sstevel@tonic-gate * as having waiters and block.
620Sstevel@tonic-gate *
630Sstevel@tonic-gate * Blocking on a mutex is a surprisingly delicate dance because, for speed,
640Sstevel@tonic-gate * mutex_exit() doesn't use an atomic instruction. Thus we have to work
650Sstevel@tonic-gate * a little harder in the (rarely-executed) blocking path to make sure
660Sstevel@tonic-gate * we don't block on a mutex that's just been released -- otherwise we
670Sstevel@tonic-gate * might never be woken up.
680Sstevel@tonic-gate *
690Sstevel@tonic-gate * The logic for synchronizing mutex_vector_enter() with mutex_exit()
700Sstevel@tonic-gate * in the face of preemption and relaxed memory ordering is as follows:
710Sstevel@tonic-gate *
720Sstevel@tonic-gate * (1) Preemption in the middle of mutex_exit() must cause mutex_exit()
730Sstevel@tonic-gate * to restart. Each platform must enforce this by checking the
740Sstevel@tonic-gate * interrupted PC in the interrupt handler (or on return from trap --
750Sstevel@tonic-gate * whichever is more convenient for the platform). If the PC
760Sstevel@tonic-gate * lies within the critical region of mutex_exit(), the interrupt
770Sstevel@tonic-gate * handler must reset the PC back to the beginning of mutex_exit().
780Sstevel@tonic-gate * The critical region consists of all instructions up to, but not
790Sstevel@tonic-gate * including, the store that clears the lock (which, of course,
800Sstevel@tonic-gate * must never be executed twice.)
810Sstevel@tonic-gate *
820Sstevel@tonic-gate * This ensures that the owner will always check for waiters after
830Sstevel@tonic-gate * resuming from a previous preemption.
840Sstevel@tonic-gate *
850Sstevel@tonic-gate * (2) A thread resuming in mutex_exit() does (at least) the following:
860Sstevel@tonic-gate *
870Sstevel@tonic-gate * when resuming: set CPU_THREAD = owner
880Sstevel@tonic-gate * membar #StoreLoad
890Sstevel@tonic-gate *
900Sstevel@tonic-gate * in mutex_exit: check waiters bit; do wakeup if set
910Sstevel@tonic-gate * membar #LoadStore|#StoreStore
920Sstevel@tonic-gate * clear owner
930Sstevel@tonic-gate * (at this point, other threads may or may not grab
940Sstevel@tonic-gate * the lock, and we may or may not reacquire it)
950Sstevel@tonic-gate *
960Sstevel@tonic-gate * when blocking: membar #StoreStore (due to disp_lock_enter())
970Sstevel@tonic-gate * set CPU_THREAD = (possibly) someone else
980Sstevel@tonic-gate *
990Sstevel@tonic-gate * (3) A thread blocking in mutex_vector_enter() does the following:
1000Sstevel@tonic-gate *
1010Sstevel@tonic-gate * set waiters bit
1020Sstevel@tonic-gate * membar #StoreLoad (via membar_enter())
1035834Spt157919 * check CPU_THREAD for owner's t_cpu
1045834Spt157919 * continue if owner running
1050Sstevel@tonic-gate * membar #LoadLoad (via membar_consumer())
1060Sstevel@tonic-gate * check owner and waiters bit; abort if either changed
1070Sstevel@tonic-gate * block
1080Sstevel@tonic-gate *
1090Sstevel@tonic-gate * Thus the global memory orderings for (2) and (3) are as follows:
1100Sstevel@tonic-gate *
1110Sstevel@tonic-gate * (2M) mutex_exit() memory order:
1120Sstevel@tonic-gate *
1130Sstevel@tonic-gate * STORE CPU_THREAD = owner
1140Sstevel@tonic-gate * LOAD waiters bit
1150Sstevel@tonic-gate * STORE owner = NULL
1160Sstevel@tonic-gate * STORE CPU_THREAD = (possibly) someone else
1170Sstevel@tonic-gate *
1180Sstevel@tonic-gate * (3M) mutex_vector_enter() memory order:
1190Sstevel@tonic-gate *
1200Sstevel@tonic-gate * STORE waiters bit = 1
1210Sstevel@tonic-gate * LOAD CPU_THREAD for each CPU
1220Sstevel@tonic-gate * LOAD owner and waiters bit
1230Sstevel@tonic-gate *
1240Sstevel@tonic-gate * It has been verified by exhaustive simulation that all possible global
1250Sstevel@tonic-gate * memory orderings of (2M) interleaved with (3M) result in correct
1260Sstevel@tonic-gate * behavior. Moreover, these ordering constraints are minimal: changing
1270Sstevel@tonic-gate * the ordering of anything in (2M) or (3M) breaks the algorithm, creating
1280Sstevel@tonic-gate * windows for missed wakeups. Note: the possibility that other threads
1290Sstevel@tonic-gate * may grab the lock after the owner drops it can be factored out of the
1300Sstevel@tonic-gate * memory ordering analysis because mutex_vector_enter() won't block
1310Sstevel@tonic-gate * if the lock isn't still owned by the same thread.
1320Sstevel@tonic-gate *
1330Sstevel@tonic-gate * The only requirements of code outside the mutex implementation are
1340Sstevel@tonic-gate * (1) mutex_exit() preemption fixup in interrupt handlers or trap return,
1355834Spt157919 * (2) a membar #StoreLoad after setting CPU_THREAD in resume(),
1365834Spt157919 * (3) mutex_owner_running() preemption fixup in interrupt handlers
1375834Spt157919 * or trap returns.
1380Sstevel@tonic-gate * Note: idle threads cannot grab adaptive locks (since they cannot block),
1390Sstevel@tonic-gate * so the membar may be safely omitted when resuming an idle thread.
1400Sstevel@tonic-gate *
1410Sstevel@tonic-gate * When a mutex has waiters, mutex_vector_exit() has several options:
1420Sstevel@tonic-gate *
1430Sstevel@tonic-gate * (1) Choose a waiter and make that thread the owner before waking it;
1440Sstevel@tonic-gate * this is known as "direct handoff" of ownership.
1450Sstevel@tonic-gate *
1460Sstevel@tonic-gate * (2) Drop the lock and wake one waiter.
1470Sstevel@tonic-gate *
1480Sstevel@tonic-gate * (3) Drop the lock, clear the waiters bit, and wake all waiters.
1490Sstevel@tonic-gate *
1500Sstevel@tonic-gate * In many ways (1) is the cleanest solution, but if a lock is moderately
1510Sstevel@tonic-gate * contended it defeats the adaptive spin logic. If we make some other
1520Sstevel@tonic-gate * thread the owner, but he's not ONPROC yet, then all other threads on
1530Sstevel@tonic-gate * other cpus that try to get the lock will conclude that the owner is
1540Sstevel@tonic-gate * blocked, so they'll block too. And so on -- it escalates quickly,
1550Sstevel@tonic-gate * with every thread taking the blocking path rather than the spin path.
1560Sstevel@tonic-gate * Thus, direct handoff is *not* a good idea for adaptive mutexes.
1570Sstevel@tonic-gate *
1580Sstevel@tonic-gate * Option (2) is the next most natural-seeming option, but it has several
1590Sstevel@tonic-gate * annoying properties. If there's more than one waiter, we must preserve
1600Sstevel@tonic-gate * the waiters bit on an unheld lock. On cas-capable platforms, where
1610Sstevel@tonic-gate * the waiters bit is part of the lock word, this means that both 0x0
1620Sstevel@tonic-gate * and 0x1 represent unheld locks, so we have to cas against *both*.
1630Sstevel@tonic-gate * Priority inheritance also gets more complicated, because a lock can
1640Sstevel@tonic-gate * have waiters but no owner to whom priority can be willed. So while
1650Sstevel@tonic-gate * it is possible to make option (2) work, it's surprisingly vile.
1660Sstevel@tonic-gate *
1670Sstevel@tonic-gate * Option (3), the least-intuitive at first glance, is what we actually do.
1680Sstevel@tonic-gate * It has the advantage that because you always wake all waiters, you
1690Sstevel@tonic-gate * never have to preserve the waiters bit. Waking all waiters seems like
1700Sstevel@tonic-gate * begging for a thundering herd problem, but consider: under option (2),
1710Sstevel@tonic-gate * every thread that grabs and drops the lock will wake one waiter -- so
1720Sstevel@tonic-gate * if the lock is fairly active, all waiters will be awakened very quickly
1730Sstevel@tonic-gate * anyway. Moreover, this is how adaptive locks are *supposed* to work.
1740Sstevel@tonic-gate * The blocking case is rare; the more common case (by 3-4 orders of
1750Sstevel@tonic-gate * magnitude) is that one or more threads spin waiting to get the lock.
1760Sstevel@tonic-gate * Only direct handoff can prevent the thundering herd problem, but as
1770Sstevel@tonic-gate * mentioned earlier, that would tend to defeat the adaptive spin logic.
1780Sstevel@tonic-gate * In practice, option (3) works well because the blocking case is rare.
1790Sstevel@tonic-gate */
1800Sstevel@tonic-gate
1810Sstevel@tonic-gate /*
1820Sstevel@tonic-gate * delayed lock retry with exponential delay for spin locks
1830Sstevel@tonic-gate *
1840Sstevel@tonic-gate * It is noted above that for both the spin locks and the adaptive locks,
1850Sstevel@tonic-gate * spinning is the dominant mode of operation. So long as there is only
1860Sstevel@tonic-gate * one thread waiting on a lock, the naive spin loop works very well in
1870Sstevel@tonic-gate * cache based architectures. The lock data structure is pulled into the
1880Sstevel@tonic-gate * cache of the processor with the waiting/spinning thread and no further
1890Sstevel@tonic-gate * memory traffic is generated until the lock is released. Unfortunately,
1900Sstevel@tonic-gate * once two or more threads are waiting on a lock, the naive spin has
1910Sstevel@tonic-gate * the property of generating maximum memory traffic from each spinning
1920Sstevel@tonic-gate * thread as the spinning threads contend for the lock data structure.
1930Sstevel@tonic-gate *
1940Sstevel@tonic-gate * By executing a delay loop before retrying a lock, a waiting thread
1950Sstevel@tonic-gate * can reduce its memory traffic by a large factor, depending on the
1960Sstevel@tonic-gate * size of the delay loop. A large delay loop greatly reduces the memory
1970Sstevel@tonic-gate * traffic, but has the drawback of having a period of time when
1980Sstevel@tonic-gate * no thread is attempting to gain the lock even though several threads
1990Sstevel@tonic-gate * might be waiting. A small delay loop has the drawback of not
2000Sstevel@tonic-gate * much reduction in memory traffic, but reduces the potential idle time.
2010Sstevel@tonic-gate * The theory of the exponential delay code is to start with a short
2020Sstevel@tonic-gate * delay loop and double the waiting time on each iteration, up to
2035834Spt157919 * a preselected maximum.
2040Sstevel@tonic-gate */
2050Sstevel@tonic-gate
2060Sstevel@tonic-gate #include <sys/param.h>
2070Sstevel@tonic-gate #include <sys/time.h>
2080Sstevel@tonic-gate #include <sys/cpuvar.h>
2090Sstevel@tonic-gate #include <sys/thread.h>
2100Sstevel@tonic-gate #include <sys/debug.h>
2110Sstevel@tonic-gate #include <sys/cmn_err.h>
2120Sstevel@tonic-gate #include <sys/sobject.h>
2130Sstevel@tonic-gate #include <sys/turnstile.h>
2140Sstevel@tonic-gate #include <sys/systm.h>
2150Sstevel@tonic-gate #include <sys/mutex_impl.h>
2160Sstevel@tonic-gate #include <sys/spl.h>
2170Sstevel@tonic-gate #include <sys/lockstat.h>
2180Sstevel@tonic-gate #include <sys/atomic.h>
2190Sstevel@tonic-gate #include <sys/cpu.h>
2200Sstevel@tonic-gate #include <sys/stack.h>
2215084Sjohnlev #include <sys/archsystm.h>
2225834Spt157919 #include <sys/machsystm.h>
2235834Spt157919 #include <sys/x_call.h>
2240Sstevel@tonic-gate
2250Sstevel@tonic-gate /*
2260Sstevel@tonic-gate * The sobj_ops vector exports a set of functions needed when a thread
2270Sstevel@tonic-gate * is asleep on a synchronization object of this type.
2280Sstevel@tonic-gate */
2290Sstevel@tonic-gate static sobj_ops_t mutex_sobj_ops = {
2300Sstevel@tonic-gate SOBJ_MUTEX, mutex_owner, turnstile_stay_asleep, turnstile_change_pri
2310Sstevel@tonic-gate };
2320Sstevel@tonic-gate
2330Sstevel@tonic-gate /*
2340Sstevel@tonic-gate * If the system panics on a mutex, save the address of the offending
2350Sstevel@tonic-gate * mutex in panic_mutex_addr, and save the contents in panic_mutex.
2360Sstevel@tonic-gate */
2370Sstevel@tonic-gate static mutex_impl_t panic_mutex;
2380Sstevel@tonic-gate static mutex_impl_t *panic_mutex_addr;
2390Sstevel@tonic-gate
2400Sstevel@tonic-gate static void
mutex_panic(char * msg,mutex_impl_t * lp)2410Sstevel@tonic-gate mutex_panic(char *msg, mutex_impl_t *lp)
2420Sstevel@tonic-gate {
2430Sstevel@tonic-gate if (panicstr)
2440Sstevel@tonic-gate return;
2450Sstevel@tonic-gate
2460Sstevel@tonic-gate if (casptr(&panic_mutex_addr, NULL, lp) == NULL)
2470Sstevel@tonic-gate panic_mutex = *lp;
2480Sstevel@tonic-gate
2490Sstevel@tonic-gate panic("%s, lp=%p owner=%p thread=%p",
2507632SNick.Todd@Sun.COM msg, (void *)lp, (void *)MUTEX_OWNER(&panic_mutex),
2517632SNick.Todd@Sun.COM (void *)curthread);
2520Sstevel@tonic-gate }
2530Sstevel@tonic-gate
2545834Spt157919 /* "tunables" for per-platform backoff constants. */
2555834Spt157919 uint_t mutex_backoff_cap = 0;
2565834Spt157919 ushort_t mutex_backoff_base = MUTEX_BACKOFF_BASE;
2575834Spt157919 ushort_t mutex_cap_factor = MUTEX_CAP_FACTOR;
2585834Spt157919 uchar_t mutex_backoff_shift = MUTEX_BACKOFF_SHIFT;
2595834Spt157919
/*
 * Callable wrapper around MUTEX_SYNC(), which is defined outside this
 * part of the file.  Takes no arguments and returns nothing.
 */
void
mutex_sync(void)
{
	MUTEX_SYNC();
}
2655834Spt157919
2665834Spt157919 /* calculate the backoff interval */
2676138Ssvemuri uint_t
default_lock_backoff(uint_t backoff)2685834Spt157919 default_lock_backoff(uint_t backoff)
2695834Spt157919 {
2705834Spt157919 uint_t cap; /* backoff cap calculated */
2715834Spt157919
2725834Spt157919 if (backoff == 0) {
2735834Spt157919 backoff = mutex_backoff_base;
2745834Spt157919 /* first call just sets the base */
2755834Spt157919 return (backoff);
2765834Spt157919 }
2775834Spt157919
2785834Spt157919 /* set cap */
2795834Spt157919 if (mutex_backoff_cap == 0) {
2805834Spt157919 /*
2815834Spt157919 * For a contended lock, in the worst case a load + cas may
2825834Spt157919 * be queued at the controller for each contending CPU.
2835834Spt157919 * Therefore, to avoid queueing, the accesses for all CPUS must
2845834Spt157919 * be spread out in time over an interval of (ncpu *
2855834Spt157919 * cap-factor). Maximum backoff is set to this value, and
2865834Spt157919 * actual backoff is a random number from 0 to the current max.
2875834Spt157919 */
2885834Spt157919 cap = ncpus_online * mutex_cap_factor;
2895834Spt157919 } else {
2905834Spt157919 cap = mutex_backoff_cap;
2915834Spt157919 }
2925834Spt157919
2935834Spt157919 /* calculate new backoff value */
2945834Spt157919 backoff <<= mutex_backoff_shift; /* increase backoff */
2955834Spt157919 if (backoff > cap) {
2965834Spt157919 if (cap < mutex_backoff_base)
2975834Spt157919 backoff = mutex_backoff_base;
2985834Spt157919 else
2995834Spt157919 backoff = cap;
3005834Spt157919 }
3015834Spt157919
3025834Spt157919 return (backoff);
3035834Spt157919 }
3045834Spt157919
3055834Spt157919 /*
3065834Spt157919 * default delay function for mutexes.
3075834Spt157919 */
3086138Ssvemuri void
default_lock_delay(uint_t backoff)3095834Spt157919 default_lock_delay(uint_t backoff)
3105834Spt157919 {
3115834Spt157919 ulong_t rnd; /* random factor */
3125834Spt157919 uint_t cur_backoff; /* calculated backoff */
3135834Spt157919 uint_t backctr;
3145834Spt157919
3155834Spt157919 /*
3165834Spt157919 * Modify backoff by a random amount to avoid lockstep, and to
3175834Spt157919 * make it probable that some thread gets a small backoff, and
3185834Spt157919 * re-checks quickly
3195834Spt157919 */
3205834Spt157919 rnd = (((long)curthread >> PTR24_LSB) ^ (long)MUTEX_GETTICK());
3215834Spt157919 cur_backoff = (uint_t)(rnd % (backoff - mutex_backoff_base + 1)) +
3225834Spt157919 mutex_backoff_base;
3235834Spt157919
3245834Spt157919 /*
3255834Spt157919 * Delay before trying
3265834Spt157919 * to touch the mutex data structure.
3275834Spt157919 */
3285834Spt157919 for (backctr = cur_backoff; backctr; backctr--) {
3295834Spt157919 MUTEX_DELAY();
3305834Spt157919 };
3315834Spt157919 }
3325834Spt157919
3335834Spt157919 uint_t (*mutex_lock_backoff)(uint_t) = default_lock_backoff;
3345834Spt157919 void (*mutex_lock_delay)(uint_t) = default_lock_delay;
3355834Spt157919 void (*mutex_delay)(void) = mutex_delay_default;
3365834Spt157919
3370Sstevel@tonic-gate /*
3380Sstevel@tonic-gate * mutex_vector_enter() is called from the assembly mutex_enter() routine
3390Sstevel@tonic-gate * if the lock is held or is not of type MUTEX_ADAPTIVE.
3400Sstevel@tonic-gate */
3410Sstevel@tonic-gate void
mutex_vector_enter(mutex_impl_t * lp)3420Sstevel@tonic-gate mutex_vector_enter(mutex_impl_t *lp)
3430Sstevel@tonic-gate {
3440Sstevel@tonic-gate kthread_id_t owner;
3455834Spt157919 kthread_id_t lastowner = MUTEX_NO_OWNER; /* track owner changes */
3460Sstevel@tonic-gate hrtime_t sleep_time = 0; /* how long we slept */
3476103Sck142721 hrtime_t spin_time = 0; /* how long we spun */
3485834Spt157919 cpu_t *cpup;
3490Sstevel@tonic-gate turnstile_t *ts;
3500Sstevel@tonic-gate volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
3515834Spt157919 uint_t backoff = 0; /* current backoff */
3525834Spt157919 int changecnt = 0; /* count of owner changes */
3530Sstevel@tonic-gate
3540Sstevel@tonic-gate ASSERT_STACK_ALIGNED();
3550Sstevel@tonic-gate
3560Sstevel@tonic-gate if (MUTEX_TYPE_SPIN(lp)) {
3570Sstevel@tonic-gate lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
3580Sstevel@tonic-gate &lp->m_spin.m_oldspl);
3590Sstevel@tonic-gate return;
3600Sstevel@tonic-gate }
3610Sstevel@tonic-gate
3620Sstevel@tonic-gate if (!MUTEX_TYPE_ADAPTIVE(lp)) {
3630Sstevel@tonic-gate mutex_panic("mutex_enter: bad mutex", lp);
3640Sstevel@tonic-gate return;
3650Sstevel@tonic-gate }
3660Sstevel@tonic-gate
3670Sstevel@tonic-gate /*
3680Sstevel@tonic-gate * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
3690Sstevel@tonic-gate * We can migrate after loading CPU but before checking CPU_ON_INTR,
3700Sstevel@tonic-gate * so we must verify by disabling preemption and loading CPU again.
3710Sstevel@tonic-gate */
3720Sstevel@tonic-gate cpup = CPU;
3730Sstevel@tonic-gate if (CPU_ON_INTR(cpup) && !panicstr) {
3740Sstevel@tonic-gate kpreempt_disable();
3750Sstevel@tonic-gate if (CPU_ON_INTR(CPU))
3760Sstevel@tonic-gate mutex_panic("mutex_enter: adaptive at high PIL", lp);
3770Sstevel@tonic-gate kpreempt_enable();
3780Sstevel@tonic-gate }
3790Sstevel@tonic-gate
3800Sstevel@tonic-gate CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);
3810Sstevel@tonic-gate
3826103Sck142721 spin_time = LOCKSTAT_START_TIME(LS_MUTEX_ENTER_SPIN);
3836103Sck142721
3845834Spt157919 backoff = mutex_lock_backoff(0); /* set base backoff */
3850Sstevel@tonic-gate for (;;) {
3865834Spt157919 mutex_lock_delay(backoff); /* backoff delay */
3870Sstevel@tonic-gate
3880Sstevel@tonic-gate if (panicstr)
3890Sstevel@tonic-gate return;
3900Sstevel@tonic-gate
3910Sstevel@tonic-gate if ((owner = MUTEX_OWNER(vlp)) == NULL) {
3925834Spt157919 if (mutex_adaptive_tryenter(lp)) {
3930Sstevel@tonic-gate break;
3945834Spt157919 }
3955834Spt157919 /* increase backoff only on failed attempt. */
3965834Spt157919 backoff = mutex_lock_backoff(backoff);
3975834Spt157919 changecnt++;
3980Sstevel@tonic-gate continue;
3995834Spt157919 } else if (lastowner != owner) {
4005834Spt157919 lastowner = owner;
4015834Spt157919 backoff = mutex_lock_backoff(backoff);
4025834Spt157919 changecnt++;
4035834Spt157919 }
4045834Spt157919
4055834Spt157919 if (changecnt >= ncpus_online) {
4065834Spt157919 backoff = mutex_lock_backoff(0);
4075834Spt157919 changecnt = 0;
4080Sstevel@tonic-gate }
4090Sstevel@tonic-gate
4100Sstevel@tonic-gate if (owner == curthread)
4110Sstevel@tonic-gate mutex_panic("recursive mutex_enter", lp);
4120Sstevel@tonic-gate
4130Sstevel@tonic-gate /*
4140Sstevel@tonic-gate * If lock is held but owner is not yet set, spin.
4150Sstevel@tonic-gate * (Only relevant for platforms that don't have cas.)
4160Sstevel@tonic-gate */
4170Sstevel@tonic-gate if (owner == MUTEX_NO_OWNER)
4180Sstevel@tonic-gate continue;
4190Sstevel@tonic-gate
4205834Spt157919 if (mutex_owner_running(lp) != NULL) {
4215834Spt157919 continue;
4225834Spt157919 }
4230Sstevel@tonic-gate
4240Sstevel@tonic-gate /*
4250Sstevel@tonic-gate * The owner appears not to be running, so block.
4260Sstevel@tonic-gate * See the Big Theory Statement for memory ordering issues.
4270Sstevel@tonic-gate */
4280Sstevel@tonic-gate ts = turnstile_lookup(lp);
4290Sstevel@tonic-gate MUTEX_SET_WAITERS(lp);
4300Sstevel@tonic-gate membar_enter();
4310Sstevel@tonic-gate
4320Sstevel@tonic-gate /*
4330Sstevel@tonic-gate * Recheck whether owner is running after waiters bit hits
4340Sstevel@tonic-gate * global visibility (above). If owner is running, spin.
4350Sstevel@tonic-gate */
4365834Spt157919 if (mutex_owner_running(lp) != NULL) {
4375834Spt157919 turnstile_exit(lp);
4385834Spt157919 continue;
4395834Spt157919 }
4400Sstevel@tonic-gate membar_consumer();
4410Sstevel@tonic-gate
4420Sstevel@tonic-gate /*
4430Sstevel@tonic-gate * If owner and waiters bit are unchanged, block.
4440Sstevel@tonic-gate */
4450Sstevel@tonic-gate if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
4460Sstevel@tonic-gate sleep_time -= gethrtime();
4470Sstevel@tonic-gate (void) turnstile_block(ts, TS_WRITER_Q, lp,
4480Sstevel@tonic-gate &mutex_sobj_ops, NULL, NULL);
4490Sstevel@tonic-gate sleep_time += gethrtime();
4505834Spt157919 /* reset backoff after turnstile */
4515834Spt157919 backoff = mutex_lock_backoff(0);
4520Sstevel@tonic-gate } else {
4530Sstevel@tonic-gate turnstile_exit(lp);
4540Sstevel@tonic-gate }
4550Sstevel@tonic-gate }
4560Sstevel@tonic-gate
4570Sstevel@tonic-gate ASSERT(MUTEX_OWNER(lp) == curthread);
4580Sstevel@tonic-gate
4592205Sdv142724 if (sleep_time != 0) {
4602205Sdv142724 /*
4612205Sdv142724 * Note, sleep time is the sum of all the sleeping we
4622205Sdv142724 * did.
4632205Sdv142724 */
4640Sstevel@tonic-gate LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
4650Sstevel@tonic-gate }
4660Sstevel@tonic-gate
4676103Sck142721 /* record spin time, don't count sleep time */
4686103Sck142721 if (spin_time != 0) {
4696103Sck142721 LOCKSTAT_RECORD_TIME(LS_MUTEX_ENTER_SPIN, lp,
4706103Sck142721 spin_time + sleep_time);
4715834Spt157919 }
4722205Sdv142724
4730Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
4740Sstevel@tonic-gate }
4750Sstevel@tonic-gate
4760Sstevel@tonic-gate /*
4770Sstevel@tonic-gate * mutex_vector_tryenter() is called from the assembly mutex_tryenter()
4780Sstevel@tonic-gate * routine if the lock is held or is not of type MUTEX_ADAPTIVE.
4790Sstevel@tonic-gate */
4800Sstevel@tonic-gate int
mutex_vector_tryenter(mutex_impl_t * lp)4810Sstevel@tonic-gate mutex_vector_tryenter(mutex_impl_t *lp)
4820Sstevel@tonic-gate {
4830Sstevel@tonic-gate int s;
4840Sstevel@tonic-gate
4850Sstevel@tonic-gate if (MUTEX_TYPE_ADAPTIVE(lp))
4860Sstevel@tonic-gate return (0); /* we already tried in assembly */
4870Sstevel@tonic-gate
4880Sstevel@tonic-gate if (!MUTEX_TYPE_SPIN(lp)) {
4890Sstevel@tonic-gate mutex_panic("mutex_tryenter: bad mutex", lp);
4900Sstevel@tonic-gate return (0);
4910Sstevel@tonic-gate }
4920Sstevel@tonic-gate
4930Sstevel@tonic-gate s = splr(lp->m_spin.m_minspl);
4940Sstevel@tonic-gate if (lock_try(&lp->m_spin.m_spinlock)) {
4950Sstevel@tonic-gate lp->m_spin.m_oldspl = (ushort_t)s;
4960Sstevel@tonic-gate return (1);
4970Sstevel@tonic-gate }
4980Sstevel@tonic-gate splx(s);
4990Sstevel@tonic-gate return (0);
5000Sstevel@tonic-gate }
5010Sstevel@tonic-gate
5020Sstevel@tonic-gate /*
5030Sstevel@tonic-gate * mutex_vector_exit() is called from mutex_exit() if the lock is not
5040Sstevel@tonic-gate * adaptive, has waiters, or is not owned by the current thread (panic).
5050Sstevel@tonic-gate */
5060Sstevel@tonic-gate void
mutex_vector_exit(mutex_impl_t * lp)5070Sstevel@tonic-gate mutex_vector_exit(mutex_impl_t *lp)
5080Sstevel@tonic-gate {
5090Sstevel@tonic-gate turnstile_t *ts;
5100Sstevel@tonic-gate
5110Sstevel@tonic-gate if (MUTEX_TYPE_SPIN(lp)) {
5120Sstevel@tonic-gate lock_clear_splx(&lp->m_spin.m_spinlock, lp->m_spin.m_oldspl);
5130Sstevel@tonic-gate return;
5140Sstevel@tonic-gate }
5150Sstevel@tonic-gate
5160Sstevel@tonic-gate if (MUTEX_OWNER(lp) != curthread) {
5170Sstevel@tonic-gate mutex_panic("mutex_exit: not owner", lp);
5180Sstevel@tonic-gate return;
5190Sstevel@tonic-gate }
5200Sstevel@tonic-gate
5210Sstevel@tonic-gate ts = turnstile_lookup(lp);
5220Sstevel@tonic-gate MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
5230Sstevel@tonic-gate if (ts == NULL)
5240Sstevel@tonic-gate turnstile_exit(lp);
5250Sstevel@tonic-gate else
5260Sstevel@tonic-gate turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
5270Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_EXIT_RELEASE, lp);
5280Sstevel@tonic-gate }
5290Sstevel@tonic-gate
5300Sstevel@tonic-gate int
mutex_owned(const kmutex_t * mp)5316712Stomee mutex_owned(const kmutex_t *mp)
5320Sstevel@tonic-gate {
5336712Stomee const mutex_impl_t *lp = (const mutex_impl_t *)mp;
5340Sstevel@tonic-gate
5357656SSherry.Moore@Sun.COM if (panicstr || quiesce_active)
5360Sstevel@tonic-gate return (1);
5370Sstevel@tonic-gate
5380Sstevel@tonic-gate if (MUTEX_TYPE_ADAPTIVE(lp))
5390Sstevel@tonic-gate return (MUTEX_OWNER(lp) == curthread);
5400Sstevel@tonic-gate return (LOCK_HELD(&lp->m_spin.m_spinlock));
5410Sstevel@tonic-gate }
5420Sstevel@tonic-gate
5430Sstevel@tonic-gate kthread_t *
mutex_owner(const kmutex_t * mp)5446712Stomee mutex_owner(const kmutex_t *mp)
5450Sstevel@tonic-gate {
5466712Stomee const mutex_impl_t *lp = (const mutex_impl_t *)mp;
5470Sstevel@tonic-gate kthread_id_t t;
5480Sstevel@tonic-gate
5490Sstevel@tonic-gate if (MUTEX_TYPE_ADAPTIVE(lp) && (t = MUTEX_OWNER(lp)) != MUTEX_NO_OWNER)
5500Sstevel@tonic-gate return (t);
5510Sstevel@tonic-gate return (NULL);
5520Sstevel@tonic-gate }
5530Sstevel@tonic-gate
5540Sstevel@tonic-gate /*
5550Sstevel@tonic-gate * The iblock cookie 'ibc' is the spl level associated with the lock;
5560Sstevel@tonic-gate * this alone determines whether the lock will be ADAPTIVE or SPIN.
5570Sstevel@tonic-gate *
5580Sstevel@tonic-gate * Adaptive mutexes created in zeroed memory do not need to call
5590Sstevel@tonic-gate * mutex_init() as their allocation in this fashion guarantees
5600Sstevel@tonic-gate * their initialization.
5610Sstevel@tonic-gate * eg adaptive mutexes created as static within the BSS or allocated
5620Sstevel@tonic-gate * by kmem_zalloc().
5630Sstevel@tonic-gate */
5640Sstevel@tonic-gate /* ARGSUSED */
5650Sstevel@tonic-gate void
mutex_init(kmutex_t * mp,char * name,kmutex_type_t type,void * ibc)5660Sstevel@tonic-gate mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc)
5670Sstevel@tonic-gate {
5680Sstevel@tonic-gate mutex_impl_t *lp = (mutex_impl_t *)mp;
5690Sstevel@tonic-gate
5700Sstevel@tonic-gate ASSERT(ibc < (void *)KERNELBASE); /* see 1215173 */
5710Sstevel@tonic-gate
5720Sstevel@tonic-gate if ((intptr_t)ibc > ipltospl(LOCK_LEVEL) && ibc < (void *)KERNELBASE) {
5730Sstevel@tonic-gate ASSERT(type != MUTEX_ADAPTIVE && type != MUTEX_DEFAULT);
5740Sstevel@tonic-gate MUTEX_SET_TYPE(lp, MUTEX_SPIN);
5750Sstevel@tonic-gate LOCK_INIT_CLEAR(&lp->m_spin.m_spinlock);
5760Sstevel@tonic-gate LOCK_INIT_HELD(&lp->m_spin.m_dummylock);
5770Sstevel@tonic-gate lp->m_spin.m_minspl = (int)(intptr_t)ibc;
5780Sstevel@tonic-gate } else {
5796617Sck142721 #ifdef MUTEX_ALIGN
5806617Sck142721 static int misalign_cnt = 0;
5816617Sck142721
5826617Sck142721 if (((uintptr_t)lp & (uintptr_t)(MUTEX_ALIGN - 1)) &&
5836617Sck142721 (misalign_cnt < MUTEX_ALIGN_WARNINGS)) {
5846617Sck142721 /*
5856617Sck142721 * The mutex is not aligned and may cross a cache line.
5866617Sck142721 * This is not supported and may cause a panic.
5876617Sck142721 * Show a warning that the mutex is not aligned
5886617Sck142721 * and attempt to identify the origin.
5896617Sck142721 * Unaligned mutexes are not (supposed to be)
5906617Sck142721 * possible on SPARC.
5916617Sck142721 */
5926617Sck142721 char *funcname;
5936617Sck142721 ulong_t offset = 0;
5946617Sck142721
5956617Sck142721 funcname = modgetsymname((uintptr_t)caller(), &offset);
5966617Sck142721 cmn_err(CE_WARN, "mutex_init: %p is not %d byte "
5976617Sck142721 "aligned; caller %s+%lx in module %s. "
5986617Sck142721 "This is unsupported and may cause a panic. "
5996617Sck142721 "Please report this to the kernel module supplier.",
6006626Sck142721 (void *)lp, MUTEX_ALIGN,
6016617Sck142721 funcname ? funcname : "unknown", offset,
6026617Sck142721 mod_containing_pc(caller()));
6036617Sck142721 misalign_cnt++;
6046617Sck142721 if (misalign_cnt >= MUTEX_ALIGN_WARNINGS) {
6056617Sck142721 cmn_err(CE_WARN, "mutex_init: further unaligned"
6066617Sck142721 " mutex warnings will be suppressed.");
6076617Sck142721 }
6086617Sck142721 }
6096617Sck142721 #endif /* MUTEX_ALIGN */
6100Sstevel@tonic-gate ASSERT(type != MUTEX_SPIN);
6116617Sck142721
6120Sstevel@tonic-gate MUTEX_SET_TYPE(lp, MUTEX_ADAPTIVE);
6130Sstevel@tonic-gate MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
6140Sstevel@tonic-gate }
6150Sstevel@tonic-gate }
6160Sstevel@tonic-gate
6170Sstevel@tonic-gate void
mutex_destroy(kmutex_t * mp)6180Sstevel@tonic-gate mutex_destroy(kmutex_t *mp)
6190Sstevel@tonic-gate {
6200Sstevel@tonic-gate mutex_impl_t *lp = (mutex_impl_t *)mp;
6210Sstevel@tonic-gate
6220Sstevel@tonic-gate if (lp->m_owner == 0 && !MUTEX_HAS_WAITERS(lp)) {
6230Sstevel@tonic-gate MUTEX_DESTROY(lp);
6240Sstevel@tonic-gate } else if (MUTEX_TYPE_SPIN(lp)) {
6250Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
6260Sstevel@tonic-gate MUTEX_DESTROY(lp);
6270Sstevel@tonic-gate } else if (MUTEX_TYPE_ADAPTIVE(lp)) {
6280Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
6290Sstevel@tonic-gate if (MUTEX_OWNER(lp) != curthread)
6300Sstevel@tonic-gate mutex_panic("mutex_destroy: not owner", lp);
6310Sstevel@tonic-gate if (MUTEX_HAS_WAITERS(lp)) {
6320Sstevel@tonic-gate turnstile_t *ts = turnstile_lookup(lp);
6330Sstevel@tonic-gate turnstile_exit(lp);
6340Sstevel@tonic-gate if (ts != NULL)
6350Sstevel@tonic-gate mutex_panic("mutex_destroy: has waiters", lp);
6360Sstevel@tonic-gate }
6370Sstevel@tonic-gate MUTEX_DESTROY(lp);
6380Sstevel@tonic-gate } else {
6390Sstevel@tonic-gate mutex_panic("mutex_destroy: bad mutex", lp);
6400Sstevel@tonic-gate }
6410Sstevel@tonic-gate }
6420Sstevel@tonic-gate
6430Sstevel@tonic-gate /*
6440Sstevel@tonic-gate * Simple C support for the cases where spin locks miss on the first try.
6450Sstevel@tonic-gate */
6460Sstevel@tonic-gate void
lock_set_spin(lock_t * lp)6470Sstevel@tonic-gate lock_set_spin(lock_t *lp)
6480Sstevel@tonic-gate {
6495834Spt157919 int loop_count = 0;
6505834Spt157919 uint_t backoff = 0; /* current backoff */
6516103Sck142721 hrtime_t spin_time = 0; /* how long we spun */
6520Sstevel@tonic-gate
6530Sstevel@tonic-gate if (panicstr)
6540Sstevel@tonic-gate return;
6550Sstevel@tonic-gate
6560Sstevel@tonic-gate if (ncpus == 1)
6577632SNick.Todd@Sun.COM panic("lock_set: %p lock held and only one CPU", (void *)lp);
6580Sstevel@tonic-gate
6596103Sck142721 spin_time = LOCKSTAT_START_TIME(LS_LOCK_SET_SPIN);
6606103Sck142721
6610Sstevel@tonic-gate while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
6620Sstevel@tonic-gate if (panicstr)
6630Sstevel@tonic-gate return;
6645834Spt157919 loop_count++;
6655834Spt157919
6665834Spt157919 if (ncpus_online == loop_count) {
6675834Spt157919 backoff = mutex_lock_backoff(0);
6685834Spt157919 loop_count = 0;
6693914Spm145316 } else {
6705834Spt157919 backoff = mutex_lock_backoff(backoff);
6710Sstevel@tonic-gate }
6725834Spt157919 mutex_lock_delay(backoff);
6730Sstevel@tonic-gate }
6740Sstevel@tonic-gate
6756103Sck142721 LOCKSTAT_RECORD_TIME(LS_LOCK_SET_SPIN, lp, spin_time);
6760Sstevel@tonic-gate
6770Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE, lp);
6780Sstevel@tonic-gate }
6790Sstevel@tonic-gate
6800Sstevel@tonic-gate void
lock_set_spl_spin(lock_t * lp,int new_pil,ushort_t * old_pil_addr,int old_pil)6810Sstevel@tonic-gate lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil)
6820Sstevel@tonic-gate {
6835834Spt157919 int loop_count = 0;
6845834Spt157919 uint_t backoff = 0; /* current backoff */
6856103Sck142721 hrtime_t spin_time = 0; /* how long we spun */
6860Sstevel@tonic-gate
6870Sstevel@tonic-gate if (panicstr)
6880Sstevel@tonic-gate return;
6890Sstevel@tonic-gate
6900Sstevel@tonic-gate if (ncpus == 1)
6917632SNick.Todd@Sun.COM panic("lock_set_spl: %p lock held and only one CPU",
6927632SNick.Todd@Sun.COM (void *)lp);
6930Sstevel@tonic-gate
6940Sstevel@tonic-gate ASSERT(new_pil > LOCK_LEVEL);
6950Sstevel@tonic-gate
6966103Sck142721 spin_time = LOCKSTAT_START_TIME(LS_LOCK_SET_SPL_SPIN);
6976103Sck142721
6980Sstevel@tonic-gate do {
6990Sstevel@tonic-gate splx(old_pil);
7000Sstevel@tonic-gate while (LOCK_HELD(lp)) {
7015834Spt157919 loop_count++;
7025834Spt157919
7030Sstevel@tonic-gate if (panicstr) {
7040Sstevel@tonic-gate *old_pil_addr = (ushort_t)splr(new_pil);
7050Sstevel@tonic-gate return;
7060Sstevel@tonic-gate }
7075834Spt157919 if (ncpus_online == loop_count) {
7085834Spt157919 backoff = mutex_lock_backoff(0);
7095834Spt157919 loop_count = 0;
7103914Spm145316 } else {
7115834Spt157919 backoff = mutex_lock_backoff(backoff);
7120Sstevel@tonic-gate }
7135834Spt157919 mutex_lock_delay(backoff);
7140Sstevel@tonic-gate }
7150Sstevel@tonic-gate old_pil = splr(new_pil);
7160Sstevel@tonic-gate } while (!lock_spin_try(lp));
7170Sstevel@tonic-gate
7180Sstevel@tonic-gate *old_pil_addr = (ushort_t)old_pil;
7190Sstevel@tonic-gate
7206103Sck142721 LOCKSTAT_RECORD_TIME(LS_LOCK_SET_SPL_SPIN, lp, spin_time);
7210Sstevel@tonic-gate
7226103Sck142721 LOCKSTAT_RECORD0(LS_LOCK_SET_SPL_ACQUIRE, lp);
7230Sstevel@tonic-gate }
724