xref: /onnv-gate/usr/src/uts/common/os/mutex.c (revision 3914:9f2fcd00d060)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52205Sdv142724  * Common Development and Distribution License (the "License").
62205Sdv142724  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*3914Spm145316  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate  * Big Theory Statement for mutual exclusion locking primitives.
300Sstevel@tonic-gate  *
310Sstevel@tonic-gate  * A mutex serializes multiple threads so that only one thread
320Sstevel@tonic-gate  * (the "owner" of the mutex) is active at a time.  See mutex(9F)
330Sstevel@tonic-gate  * for a full description of the interfaces and programming model.
340Sstevel@tonic-gate  * The rest of this comment describes the implementation.
350Sstevel@tonic-gate  *
360Sstevel@tonic-gate  * Mutexes come in two flavors: adaptive and spin.  mutex_init(9F)
370Sstevel@tonic-gate  * determines the type based solely on the iblock cookie (PIL) argument.
380Sstevel@tonic-gate  * PIL > LOCK_LEVEL implies a spin lock; everything else is adaptive.
390Sstevel@tonic-gate  *
400Sstevel@tonic-gate  * Spin mutexes block interrupts and spin until the lock becomes available.
410Sstevel@tonic-gate  * A thread may not sleep, or call any function that might sleep, while
420Sstevel@tonic-gate  * holding a spin mutex.  With few exceptions, spin mutexes should only
430Sstevel@tonic-gate  * be used to synchronize with interrupt handlers.
440Sstevel@tonic-gate  *
450Sstevel@tonic-gate  * Adaptive mutexes (the default type) spin if the owner is running on
460Sstevel@tonic-gate  * another CPU and block otherwise.  This policy is based on the assumption
470Sstevel@tonic-gate  * that mutex hold times are typically short enough that the time spent
480Sstevel@tonic-gate  * spinning is less than the time it takes to block.  If you need mutual
490Sstevel@tonic-gate  * exclusion semantics with long hold times, consider an rwlock(9F) as
500Sstevel@tonic-gate  * RW_WRITER.  Better still, reconsider the algorithm: if it requires
510Sstevel@tonic-gate  * mutual exclusion for long periods of time, it's probably not scalable.
520Sstevel@tonic-gate  *
530Sstevel@tonic-gate  * Adaptive mutexes are overwhelmingly more common than spin mutexes,
540Sstevel@tonic-gate  * so mutex_enter() assumes that the lock is adaptive.  We get away
550Sstevel@tonic-gate  * with this by structuring mutexes so that an attempt to acquire a
560Sstevel@tonic-gate  * spin mutex as adaptive always fails.  When mutex_enter() fails
570Sstevel@tonic-gate  * it punts to mutex_vector_enter(), which does all the hard stuff.
580Sstevel@tonic-gate  *
590Sstevel@tonic-gate  * mutex_vector_enter() first checks the type.  If it's a spin mutex,
600Sstevel@tonic-gate  * we just call lock_set_spl() and return.  If it's an adaptive mutex,
610Sstevel@tonic-gate  * we check to see what the owner is doing.  If the owner is running,
620Sstevel@tonic-gate  * we spin until the lock becomes available; if not, we mark the lock
630Sstevel@tonic-gate  * as having waiters and block.
640Sstevel@tonic-gate  *
650Sstevel@tonic-gate  * Blocking on a mutex is a surprisingly delicate dance because, for speed,
660Sstevel@tonic-gate  * mutex_exit() doesn't use an atomic instruction.  Thus we have to work
670Sstevel@tonic-gate  * a little harder in the (rarely-executed) blocking path to make sure
680Sstevel@tonic-gate  * we don't block on a mutex that's just been released -- otherwise we
690Sstevel@tonic-gate  * might never be woken up.
700Sstevel@tonic-gate  *
710Sstevel@tonic-gate  * The logic for synchronizing mutex_vector_enter() with mutex_exit()
720Sstevel@tonic-gate  * in the face of preemption and relaxed memory ordering is as follows:
730Sstevel@tonic-gate  *
740Sstevel@tonic-gate  * (1) Preemption in the middle of mutex_exit() must cause mutex_exit()
750Sstevel@tonic-gate  *     to restart.  Each platform must enforce this by checking the
760Sstevel@tonic-gate  *     interrupted PC in the interrupt handler (or on return from trap --
770Sstevel@tonic-gate  *     whichever is more convenient for the platform).  If the PC
780Sstevel@tonic-gate  *     lies within the critical region of mutex_exit(), the interrupt
790Sstevel@tonic-gate  *     handler must reset the PC back to the beginning of mutex_exit().
800Sstevel@tonic-gate  *     The critical region consists of all instructions up to, but not
810Sstevel@tonic-gate  *     including, the store that clears the lock (which, of course,
820Sstevel@tonic-gate  *     must never be executed twice.)
830Sstevel@tonic-gate  *
840Sstevel@tonic-gate  *     This ensures that the owner will always check for waiters after
850Sstevel@tonic-gate  *     resuming from a previous preemption.
860Sstevel@tonic-gate  *
870Sstevel@tonic-gate  * (2) A thread resuming in mutex_exit() does (at least) the following:
880Sstevel@tonic-gate  *
890Sstevel@tonic-gate  *	when resuming:	set CPU_THREAD = owner
900Sstevel@tonic-gate  *			membar #StoreLoad
910Sstevel@tonic-gate  *
920Sstevel@tonic-gate  *	in mutex_exit:	check waiters bit; do wakeup if set
930Sstevel@tonic-gate  *			membar #LoadStore|#StoreStore
940Sstevel@tonic-gate  *			clear owner
950Sstevel@tonic-gate  *			(at this point, other threads may or may not grab
960Sstevel@tonic-gate  *			the lock, and we may or may not reacquire it)
970Sstevel@tonic-gate  *
980Sstevel@tonic-gate  *	when blocking:	membar #StoreStore (due to disp_lock_enter())
990Sstevel@tonic-gate  *			set CPU_THREAD = (possibly) someone else
1000Sstevel@tonic-gate  *
1010Sstevel@tonic-gate  * (3) A thread blocking in mutex_vector_enter() does the following:
1020Sstevel@tonic-gate  *
1030Sstevel@tonic-gate  *			set waiters bit
1040Sstevel@tonic-gate  *			membar #StoreLoad (via membar_enter())
1050Sstevel@tonic-gate  *			check CPU_THREAD for each CPU; abort if owner running
1060Sstevel@tonic-gate  *			membar #LoadLoad (via membar_consumer())
1070Sstevel@tonic-gate  *			check owner and waiters bit; abort if either changed
1080Sstevel@tonic-gate  *			block
1090Sstevel@tonic-gate  *
1100Sstevel@tonic-gate  * Thus the global memory orderings for (2) and (3) are as follows:
1110Sstevel@tonic-gate  *
1120Sstevel@tonic-gate  * (2M) mutex_exit() memory order:
1130Sstevel@tonic-gate  *
1140Sstevel@tonic-gate  *			STORE	CPU_THREAD = owner
1150Sstevel@tonic-gate  *			LOAD	waiters bit
1160Sstevel@tonic-gate  *			STORE	owner = NULL
1170Sstevel@tonic-gate  *			STORE	CPU_THREAD = (possibly) someone else
1180Sstevel@tonic-gate  *
1190Sstevel@tonic-gate  * (3M) mutex_vector_enter() memory order:
1200Sstevel@tonic-gate  *
1210Sstevel@tonic-gate  *			STORE	waiters bit = 1
1220Sstevel@tonic-gate  *			LOAD	CPU_THREAD for each CPU
1230Sstevel@tonic-gate  *			LOAD	owner and waiters bit
1240Sstevel@tonic-gate  *
1250Sstevel@tonic-gate  * It has been verified by exhaustive simulation that all possible global
1260Sstevel@tonic-gate  * memory orderings of (2M) interleaved with (3M) result in correct
1270Sstevel@tonic-gate  * behavior.  Moreover, these ordering constraints are minimal: changing
1280Sstevel@tonic-gate  * the ordering of anything in (2M) or (3M) breaks the algorithm, creating
1290Sstevel@tonic-gate  * windows for missed wakeups.  Note: the possibility that other threads
1300Sstevel@tonic-gate  * may grab the lock after the owner drops it can be factored out of the
1310Sstevel@tonic-gate  * memory ordering analysis because mutex_vector_enter() won't block
1320Sstevel@tonic-gate  * if the lock isn't still owned by the same thread.
1330Sstevel@tonic-gate  *
1340Sstevel@tonic-gate  * The only requirements of code outside the mutex implementation are
1350Sstevel@tonic-gate  * (1) mutex_exit() preemption fixup in interrupt handlers or trap return,
1360Sstevel@tonic-gate  * and (2) a membar #StoreLoad after setting CPU_THREAD in resume().
1370Sstevel@tonic-gate  * Note: idle threads cannot grab adaptive locks (since they cannot block),
1380Sstevel@tonic-gate  * so the membar may be safely omitted when resuming an idle thread.
1390Sstevel@tonic-gate  *
1400Sstevel@tonic-gate  * When a mutex has waiters, mutex_vector_exit() has several options:
1410Sstevel@tonic-gate  *
1420Sstevel@tonic-gate  * (1) Choose a waiter and make that thread the owner before waking it;
1430Sstevel@tonic-gate  *     this is known as "direct handoff" of ownership.
1440Sstevel@tonic-gate  *
1450Sstevel@tonic-gate  * (2) Drop the lock and wake one waiter.
1460Sstevel@tonic-gate  *
1470Sstevel@tonic-gate  * (3) Drop the lock, clear the waiters bit, and wake all waiters.
1480Sstevel@tonic-gate  *
1490Sstevel@tonic-gate  * In many ways (1) is the cleanest solution, but if a lock is moderately
1500Sstevel@tonic-gate  * contended it defeats the adaptive spin logic.  If we make some other
1510Sstevel@tonic-gate  * thread the owner, but he's not ONPROC yet, then all other threads on
1520Sstevel@tonic-gate  * other cpus that try to get the lock will conclude that the owner is
1530Sstevel@tonic-gate  * blocked, so they'll block too.  And so on -- it escalates quickly,
1540Sstevel@tonic-gate  * with every thread taking the blocking path rather than the spin path.
1550Sstevel@tonic-gate  * Thus, direct handoff is *not* a good idea for adaptive mutexes.
1560Sstevel@tonic-gate  *
1570Sstevel@tonic-gate  * Option (2) is the next most natural-seeming option, but it has several
1580Sstevel@tonic-gate  * annoying properties.  If there's more than one waiter, we must preserve
1590Sstevel@tonic-gate  * the waiters bit on an unheld lock.  On cas-capable platforms, where
1600Sstevel@tonic-gate  * the waiters bit is part of the lock word, this means that both 0x0
1610Sstevel@tonic-gate  * and 0x1 represent unheld locks, so we have to cas against *both*.
1620Sstevel@tonic-gate  * Priority inheritance also gets more complicated, because a lock can
1630Sstevel@tonic-gate  * have waiters but no owner to whom priority can be willed.  So while
1640Sstevel@tonic-gate  * it is possible to make option (2) work, it's surprisingly vile.
1650Sstevel@tonic-gate  *
1660Sstevel@tonic-gate  * Option (3), the least-intuitive at first glance, is what we actually do.
1670Sstevel@tonic-gate  * It has the advantage that because you always wake all waiters, you
1680Sstevel@tonic-gate  * never have to preserve the waiters bit.  Waking all waiters seems like
1690Sstevel@tonic-gate  * begging for a thundering herd problem, but consider: under option (2),
1700Sstevel@tonic-gate  * every thread that grabs and drops the lock will wake one waiter -- so
1710Sstevel@tonic-gate  * if the lock is fairly active, all waiters will be awakened very quickly
1720Sstevel@tonic-gate  * anyway.  Moreover, this is how adaptive locks are *supposed* to work.
1730Sstevel@tonic-gate  * The blocking case is rare; the more common case (by 3-4 orders of
1740Sstevel@tonic-gate  * magnitude) is that one or more threads spin waiting to get the lock.
1750Sstevel@tonic-gate  * Only direct handoff can prevent the thundering herd problem, but as
1760Sstevel@tonic-gate  * mentioned earlier, that would tend to defeat the adaptive spin logic.
1770Sstevel@tonic-gate  * In practice, option (3) works well because the blocking case is rare.
1780Sstevel@tonic-gate  */
1790Sstevel@tonic-gate 
1800Sstevel@tonic-gate /*
1810Sstevel@tonic-gate  * delayed lock retry with exponential delay for spin locks
1820Sstevel@tonic-gate  *
1830Sstevel@tonic-gate  * It is noted above that for both the spin locks and the adaptive locks,
1840Sstevel@tonic-gate  * spinning is the dominant mode of operation.  So long as there is only
1850Sstevel@tonic-gate  * one thread waiting on a lock, the naive spin loop works very well in
1860Sstevel@tonic-gate  * cache based architectures.  The lock data structure is pulled into the
1870Sstevel@tonic-gate  * cache of the processor with the waiting/spinning thread and no further
1880Sstevel@tonic-gate  * memory traffic is generated until the lock is released.  Unfortunately,
1890Sstevel@tonic-gate  * once two or more threads are waiting on a lock, the naive spin has
1900Sstevel@tonic-gate  * the property of generating maximum memory traffic from each spinning
1910Sstevel@tonic-gate  * thread as the spinning threads contend for the lock data structure.
1920Sstevel@tonic-gate  *
1930Sstevel@tonic-gate  * By executing a delay loop before retrying a lock, a waiting thread
1940Sstevel@tonic-gate  * can reduce its memory traffic by a large factor, depending on the
1950Sstevel@tonic-gate  * size of the delay loop.  A large delay loop greatly reduces the memory
1960Sstevel@tonic-gate  * traffic, but has the drawback of having a period of time when
1970Sstevel@tonic-gate  * no thread is attempting to gain the lock even though several threads
1980Sstevel@tonic-gate  * might be waiting.  A small delay loop has the drawback of not
1990Sstevel@tonic-gate  * much reduction in memory traffic, but reduces the potential idle time.
2000Sstevel@tonic-gate  * The theory of the exponential delay code is to start with a short
2010Sstevel@tonic-gate  * delay loop and double the waiting time on each iteration, up to
2020Sstevel@tonic-gate  * a preselected maximum.  The BACKOFF_BASE provides the equivalent
2030Sstevel@tonic-gate  * of 2 to 3 memory references delay for US-III+ and US-IV architectures.
2040Sstevel@tonic-gate  * The BACKOFF_CAP is the equivalent of 50 to 100 memory references of
2050Sstevel@tonic-gate  * time (less than 12 microseconds for a 1000 MHz system).
2060Sstevel@tonic-gate  *
2070Sstevel@tonic-gate  * To determine appropriate BACKOFF_BASE and BACKOFF_CAP values,
2080Sstevel@tonic-gate  * studies on US-III+ and US-IV systems using 1 to 66 threads were
2090Sstevel@tonic-gate  * done.  A range of possible values were studied.
2100Sstevel@tonic-gate  * Performance differences below 10 threads were not large.  For
2110Sstevel@tonic-gate  * systems with more threads, substantial increases in total lock
2120Sstevel@tonic-gate  * throughput were observed with the given values.  For cases where
2130Sstevel@tonic-gate  * more than 20 threads were waiting on the same lock, lock throughput
2140Sstevel@tonic-gate  * increased by a factor of 5 or more using the backoff algorithm.
215*3914Spm145316  *
216*3914Spm145316  * Some platforms may provide their own platform specific delay code,
217*3914Spm145316  * using plat_lock_delay(backoff).  If it is available, plat_lock_delay
218*3914Spm145316  * is executed instead of the default delay code.
2190Sstevel@tonic-gate  */
2200Sstevel@tonic-gate 
221*3914Spm145316 #pragma weak plat_lock_delay
222*3914Spm145316 
2230Sstevel@tonic-gate #include <sys/param.h>
2240Sstevel@tonic-gate #include <sys/time.h>
2250Sstevel@tonic-gate #include <sys/cpuvar.h>
2260Sstevel@tonic-gate #include <sys/thread.h>
2270Sstevel@tonic-gate #include <sys/debug.h>
2280Sstevel@tonic-gate #include <sys/cmn_err.h>
2290Sstevel@tonic-gate #include <sys/sobject.h>
2300Sstevel@tonic-gate #include <sys/turnstile.h>
2310Sstevel@tonic-gate #include <sys/systm.h>
2320Sstevel@tonic-gate #include <sys/mutex_impl.h>
2330Sstevel@tonic-gate #include <sys/spl.h>
2340Sstevel@tonic-gate #include <sys/lockstat.h>
2350Sstevel@tonic-gate #include <sys/atomic.h>
2360Sstevel@tonic-gate #include <sys/cpu.h>
2370Sstevel@tonic-gate #include <sys/stack.h>
2380Sstevel@tonic-gate 
2390Sstevel@tonic-gate #define	BACKOFF_BASE	50	/* initial spin-retry delay (~2-3 memory refs) */
2400Sstevel@tonic-gate #define	BACKOFF_CAP 	1600	/* max spin-retry delay (~50-100 memory refs) */
2410Sstevel@tonic-gate 
2420Sstevel@tonic-gate /*
2430Sstevel@tonic-gate  * The sobj_ops vector exports a set of functions needed when a thread
2440Sstevel@tonic-gate  * is asleep on a synchronization object of this type.
2450Sstevel@tonic-gate  */
2460Sstevel@tonic-gate static sobj_ops_t mutex_sobj_ops = {
	/* sobj type, owner(), unsleep(), change_pri() -- used by turnstile code */
2470Sstevel@tonic-gate 	SOBJ_MUTEX, mutex_owner, turnstile_stay_asleep, turnstile_change_pri
2480Sstevel@tonic-gate };
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate /*
2510Sstevel@tonic-gate  * If the system panics on a mutex, save the address of the offending
2520Sstevel@tonic-gate  * mutex in panic_mutex_addr, and save the contents in panic_mutex.
2530Sstevel@tonic-gate  */
2540Sstevel@tonic-gate static mutex_impl_t panic_mutex;	/* snapshot of the offending mutex */
2550Sstevel@tonic-gate static mutex_impl_t *panic_mutex_addr;	/* address of the offending mutex */
2560Sstevel@tonic-gate 
2570Sstevel@tonic-gate static void
	/*
	 * Panic the system on a bad mutex operation, preserving a snapshot
	 * of the offending mutex for post-mortem debugging.  'msg' becomes
	 * the leading text of the panic string.  If the system has already
	 * panicked we return so as not to obscure the original panic.
	 */
2580Sstevel@tonic-gate mutex_panic(char *msg, mutex_impl_t *lp)
2590Sstevel@tonic-gate {
2600Sstevel@tonic-gate 	if (panicstr)
2610Sstevel@tonic-gate 		return;
2620Sstevel@tonic-gate 
	/* Only the first mutex_panic() caller gets to record the snapshot. */
2630Sstevel@tonic-gate 	if (casptr(&panic_mutex_addr, NULL, lp) == NULL)
2640Sstevel@tonic-gate 		panic_mutex = *lp;
2650Sstevel@tonic-gate 
2660Sstevel@tonic-gate 	panic("%s, lp=%p owner=%p thread=%p",
2670Sstevel@tonic-gate 	    msg, lp, MUTEX_OWNER(&panic_mutex), curthread);
2680Sstevel@tonic-gate }
2690Sstevel@tonic-gate 
2700Sstevel@tonic-gate /*
2710Sstevel@tonic-gate  * mutex_vector_enter() is called from the assembly mutex_enter() routine
2720Sstevel@tonic-gate  * if the lock is held or is not of type MUTEX_ADAPTIVE.
2730Sstevel@tonic-gate  */
2740Sstevel@tonic-gate void
2750Sstevel@tonic-gate mutex_vector_enter(mutex_impl_t *lp)
2760Sstevel@tonic-gate {
2770Sstevel@tonic-gate 	kthread_id_t	owner;
2780Sstevel@tonic-gate 	hrtime_t	sleep_time = 0;	/* how long we slept */
2790Sstevel@tonic-gate 	uint_t		spin_count = 0;	/* how many times we spun */
2800Sstevel@tonic-gate 	cpu_t 		*cpup, *last_cpu;
2810Sstevel@tonic-gate 	extern cpu_t	*cpu_list;
2820Sstevel@tonic-gate 	turnstile_t	*ts;
2830Sstevel@tonic-gate 	volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
2840Sstevel@tonic-gate 	int		backoff;	/* current backoff */
2850Sstevel@tonic-gate 	int		backctr;	/* ctr for backoff */
2862205Sdv142724 	int		sleep_count = 0;
2870Sstevel@tonic-gate 
2880Sstevel@tonic-gate 	ASSERT_STACK_ALIGNED();
2890Sstevel@tonic-gate 
	/* Spin mutex: raise spl and spin in lock_set_spl(); never blocks. */
2900Sstevel@tonic-gate 	if (MUTEX_TYPE_SPIN(lp)) {
2910Sstevel@tonic-gate 		lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
2920Sstevel@tonic-gate 		    &lp->m_spin.m_oldspl);
2930Sstevel@tonic-gate 		return;
2940Sstevel@tonic-gate 	}
2950Sstevel@tonic-gate 
	/* Neither spin nor adaptive: corrupt or uninitialized mutex. */
2960Sstevel@tonic-gate 	if (!MUTEX_TYPE_ADAPTIVE(lp)) {
2970Sstevel@tonic-gate 		mutex_panic("mutex_enter: bad mutex", lp);
2980Sstevel@tonic-gate 		return;
2990Sstevel@tonic-gate 	}
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate 	/*
3020Sstevel@tonic-gate 	 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
3030Sstevel@tonic-gate 	 * We can migrate after loading CPU but before checking CPU_ON_INTR,
3040Sstevel@tonic-gate 	 * so we must verify by disabling preemption and loading CPU again.
3050Sstevel@tonic-gate 	 */
3060Sstevel@tonic-gate 	cpup = CPU;
3070Sstevel@tonic-gate 	if (CPU_ON_INTR(cpup) && !panicstr) {
3080Sstevel@tonic-gate 		kpreempt_disable();
3090Sstevel@tonic-gate 		if (CPU_ON_INTR(CPU))
3100Sstevel@tonic-gate 			mutex_panic("mutex_enter: adaptive at high PIL", lp);
3110Sstevel@tonic-gate 		kpreempt_enable();
3120Sstevel@tonic-gate 	}
3130Sstevel@tonic-gate 
3140Sstevel@tonic-gate 	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);
3150Sstevel@tonic-gate 
	/*
	 * plat_lock_delay is a weak symbol, so its address is non-NULL only
	 * on platforms that define their own delay routine (see #pragma weak
	 * above).  In that case backoff starts at 0 and the platform code
	 * manages its own progression; otherwise we run the generic
	 * exponential backoff starting at BACKOFF_BASE.
	 */
316*3914Spm145316 	if (&plat_lock_delay) {
317*3914Spm145316 		backoff = 0;
318*3914Spm145316 	} else {
319*3914Spm145316 		backoff = BACKOFF_BASE;
320*3914Spm145316 	}
3210Sstevel@tonic-gate 
3220Sstevel@tonic-gate 	for (;;) {
3230Sstevel@tonic-gate spin:
3240Sstevel@tonic-gate 		spin_count++;
3250Sstevel@tonic-gate 		/*
3260Sstevel@tonic-gate 		 * Add an exponential backoff delay before trying again
3270Sstevel@tonic-gate 		 * to touch the mutex data structure.
3280Sstevel@tonic-gate 		 * the spin_count test and call to nulldev are to prevent
3290Sstevel@tonic-gate 		 * the compiler optimizer from eliminating the delay loop.
3300Sstevel@tonic-gate 		 */
331*3914Spm145316 		if (&plat_lock_delay) {
332*3914Spm145316 			plat_lock_delay(&backoff);
333*3914Spm145316 		} else {
			/* spin_count >= 1 here, so nulldev() is never called */
334*3914Spm145316 			for (backctr = backoff; backctr; backctr--) {
335*3914Spm145316 				if (!spin_count) (void) nulldev();
336*3914Spm145316 			};    /* delay */
337*3914Spm145316 			backoff = backoff << 1;			/* double it */
338*3914Spm145316 			if (backoff > BACKOFF_CAP) {
339*3914Spm145316 				backoff = BACKOFF_CAP;
340*3914Spm145316 			}
341*3914Spm145316 
342*3914Spm145316 			SMT_PAUSE();
3430Sstevel@tonic-gate 		}
3440Sstevel@tonic-gate 
3450Sstevel@tonic-gate 		if (panicstr)
3460Sstevel@tonic-gate 			return;
3470Sstevel@tonic-gate 
		/* Lock looks free: try to grab it atomically. */
3480Sstevel@tonic-gate 		if ((owner = MUTEX_OWNER(vlp)) == NULL) {
3490Sstevel@tonic-gate 			if (mutex_adaptive_tryenter(lp))
3500Sstevel@tonic-gate 				break;
3510Sstevel@tonic-gate 			continue;
3520Sstevel@tonic-gate 		}
3530Sstevel@tonic-gate 
3540Sstevel@tonic-gate 		if (owner == curthread)
3550Sstevel@tonic-gate 			mutex_panic("recursive mutex_enter", lp);
3560Sstevel@tonic-gate 
3570Sstevel@tonic-gate 		/*
3580Sstevel@tonic-gate 		 * If lock is held but owner is not yet set, spin.
3590Sstevel@tonic-gate 		 * (Only relevant for platforms that don't have cas.)
3600Sstevel@tonic-gate 		 */
3610Sstevel@tonic-gate 		if (owner == MUTEX_NO_OWNER)
3620Sstevel@tonic-gate 			continue;
3630Sstevel@tonic-gate 
3640Sstevel@tonic-gate 		/*
3650Sstevel@tonic-gate 		 * When searching the other CPUs, start with the one where
3660Sstevel@tonic-gate 		 * we last saw the owner thread.  If owner is running, spin.
3670Sstevel@tonic-gate 		 *
3680Sstevel@tonic-gate 		 * We must disable preemption at this point to guarantee
3690Sstevel@tonic-gate 		 * that the list doesn't change while we traverse it
3700Sstevel@tonic-gate 		 * without the cpu_lock mutex.  While preemption is
3710Sstevel@tonic-gate 		 * disabled, we must revalidate our cached cpu pointer.
3720Sstevel@tonic-gate 		 */
3730Sstevel@tonic-gate 		kpreempt_disable();
3740Sstevel@tonic-gate 		if (cpup->cpu_next == NULL)
3750Sstevel@tonic-gate 			cpup = cpu_list;
3760Sstevel@tonic-gate 		last_cpu = cpup;	/* mark end of search */
3770Sstevel@tonic-gate 		do {
3780Sstevel@tonic-gate 			if (cpup->cpu_thread == owner) {
3790Sstevel@tonic-gate 				kpreempt_enable();
3800Sstevel@tonic-gate 				goto spin;
3810Sstevel@tonic-gate 			}
3820Sstevel@tonic-gate 		} while ((cpup = cpup->cpu_next) != last_cpu);
3830Sstevel@tonic-gate 		kpreempt_enable();
3840Sstevel@tonic-gate 
3850Sstevel@tonic-gate 		/*
3860Sstevel@tonic-gate 		 * The owner appears not to be running, so block.
3870Sstevel@tonic-gate 		 * See the Big Theory Statement for memory ordering issues.
3880Sstevel@tonic-gate 		 */
3890Sstevel@tonic-gate 		ts = turnstile_lookup(lp);
3900Sstevel@tonic-gate 		MUTEX_SET_WAITERS(lp);
3910Sstevel@tonic-gate 		membar_enter();
3920Sstevel@tonic-gate 
3930Sstevel@tonic-gate 		/*
3940Sstevel@tonic-gate 		 * Recheck whether owner is running after waiters bit hits
3950Sstevel@tonic-gate 		 * global visibility (above).  If owner is running, spin.
3960Sstevel@tonic-gate 		 *
3970Sstevel@tonic-gate 		 * Since we are at ipl DISP_LEVEL, kernel preemption is
3980Sstevel@tonic-gate 		 * disabled, however we still need to revalidate our cached
3990Sstevel@tonic-gate 		 * cpu pointer to make sure the cpu hasn't been deleted.
4000Sstevel@tonic-gate 		 */
4010Sstevel@tonic-gate 		if (cpup->cpu_next == NULL)
4020Sstevel@tonic-gate 			last_cpu = cpup = cpu_list;
4030Sstevel@tonic-gate 		do {
4040Sstevel@tonic-gate 			if (cpup->cpu_thread == owner) {
4050Sstevel@tonic-gate 				turnstile_exit(lp);
4060Sstevel@tonic-gate 				goto spin;
4070Sstevel@tonic-gate 			}
4080Sstevel@tonic-gate 		} while ((cpup = cpup->cpu_next) != last_cpu);
4090Sstevel@tonic-gate 		membar_consumer();
4100Sstevel@tonic-gate 
4110Sstevel@tonic-gate 		/*
4120Sstevel@tonic-gate 		 * If owner and waiters bit are unchanged, block.
4130Sstevel@tonic-gate 		 */
4140Sstevel@tonic-gate 		if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
4150Sstevel@tonic-gate 			sleep_time -= gethrtime();
4160Sstevel@tonic-gate 			(void) turnstile_block(ts, TS_WRITER_Q, lp,
4170Sstevel@tonic-gate 			    &mutex_sobj_ops, NULL, NULL);
4180Sstevel@tonic-gate 			sleep_time += gethrtime();
4192205Sdv142724 			sleep_count++;
4200Sstevel@tonic-gate 		} else {
4210Sstevel@tonic-gate 			turnstile_exit(lp);
4220Sstevel@tonic-gate 		}
4230Sstevel@tonic-gate 	}
4240Sstevel@tonic-gate 
4250Sstevel@tonic-gate 	ASSERT(MUTEX_OWNER(lp) == curthread);
4260Sstevel@tonic-gate 
4272205Sdv142724 	if (sleep_time != 0) {
4282205Sdv142724 		/*
4292205Sdv142724 		 * Note, sleep time is the sum of all the sleeping we
4302205Sdv142724 		 * did.
4312205Sdv142724 		 */
4320Sstevel@tonic-gate 		LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
4330Sstevel@tonic-gate 	}
4340Sstevel@tonic-gate 
4352205Sdv142724 	/*
4362205Sdv142724 	 * We do not count a sleep as a spin.
4372205Sdv142724 	 */
4382205Sdv142724 	if (spin_count > sleep_count)
4392205Sdv142724 		LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
4402205Sdv142724 		    spin_count - sleep_count);
4412205Sdv142724 
4420Sstevel@tonic-gate 	LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
4430Sstevel@tonic-gate }
4440Sstevel@tonic-gate 
4450Sstevel@tonic-gate /*
4460Sstevel@tonic-gate  * mutex_vector_tryenter() is called from the assembly mutex_tryenter()
4470Sstevel@tonic-gate  * routine if the lock is held or is not of type MUTEX_ADAPTIVE.
4480Sstevel@tonic-gate  */
4490Sstevel@tonic-gate int
4500Sstevel@tonic-gate mutex_vector_tryenter(mutex_impl_t *lp)
4510Sstevel@tonic-gate {
4520Sstevel@tonic-gate 	int s;
4530Sstevel@tonic-gate 
4540Sstevel@tonic-gate 	if (MUTEX_TYPE_ADAPTIVE(lp))
4550Sstevel@tonic-gate 		return (0);		/* we already tried in assembly */
4560Sstevel@tonic-gate 
4570Sstevel@tonic-gate 	if (!MUTEX_TYPE_SPIN(lp)) {
4580Sstevel@tonic-gate 		mutex_panic("mutex_tryenter: bad mutex", lp);
4590Sstevel@tonic-gate 		return (0);
4600Sstevel@tonic-gate 	}
4610Sstevel@tonic-gate 
4620Sstevel@tonic-gate 	s = splr(lp->m_spin.m_minspl);
4630Sstevel@tonic-gate 	if (lock_try(&lp->m_spin.m_spinlock)) {
4640Sstevel@tonic-gate 		lp->m_spin.m_oldspl = (ushort_t)s;
4650Sstevel@tonic-gate 		return (1);
4660Sstevel@tonic-gate 	}
4670Sstevel@tonic-gate 	splx(s);
4680Sstevel@tonic-gate 	return (0);
4690Sstevel@tonic-gate }
4700Sstevel@tonic-gate 
4710Sstevel@tonic-gate /*
4720Sstevel@tonic-gate  * mutex_vector_exit() is called from mutex_exit() if the lock is not
4730Sstevel@tonic-gate  * adaptive, has waiters, or is not owned by the current thread (panic).
4740Sstevel@tonic-gate  */
4750Sstevel@tonic-gate void
4760Sstevel@tonic-gate mutex_vector_exit(mutex_impl_t *lp)
4770Sstevel@tonic-gate {
4780Sstevel@tonic-gate 	turnstile_t *ts;
4790Sstevel@tonic-gate 
	/* Spin mutex: drop the lock and restore the spl saved at entry. */
4800Sstevel@tonic-gate 	if (MUTEX_TYPE_SPIN(lp)) {
4810Sstevel@tonic-gate 		lock_clear_splx(&lp->m_spin.m_spinlock, lp->m_spin.m_oldspl);
4820Sstevel@tonic-gate 		return;
4830Sstevel@tonic-gate 	}
4840Sstevel@tonic-gate 
4850Sstevel@tonic-gate 	if (MUTEX_OWNER(lp) != curthread) {
4860Sstevel@tonic-gate 		mutex_panic("mutex_exit: not owner", lp);
4870Sstevel@tonic-gate 		return;
4880Sstevel@tonic-gate 	}
4890Sstevel@tonic-gate 
	/*
	 * Adaptive with waiters: look up the turnstile, then clear both the
	 * lock and waiters bits, and wake *all* waiters (option (3) of the
	 * Big Theory Statement above), so the waiters bit never needs to be
	 * preserved across the release.
	 */
4900Sstevel@tonic-gate 	ts = turnstile_lookup(lp);
4910Sstevel@tonic-gate 	MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
4920Sstevel@tonic-gate 	if (ts == NULL)
4930Sstevel@tonic-gate 		turnstile_exit(lp);
4940Sstevel@tonic-gate 	else
4950Sstevel@tonic-gate 		turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
4960Sstevel@tonic-gate 	LOCKSTAT_RECORD0(LS_MUTEX_EXIT_RELEASE, lp);
4970Sstevel@tonic-gate }
4980Sstevel@tonic-gate 
4990Sstevel@tonic-gate int
5000Sstevel@tonic-gate mutex_owned(kmutex_t *mp)
5010Sstevel@tonic-gate {
5020Sstevel@tonic-gate 	mutex_impl_t *lp = (mutex_impl_t *)mp;
5030Sstevel@tonic-gate 
5040Sstevel@tonic-gate 	if (panicstr)
5050Sstevel@tonic-gate 		return (1);
5060Sstevel@tonic-gate 
5070Sstevel@tonic-gate 	if (MUTEX_TYPE_ADAPTIVE(lp))
5080Sstevel@tonic-gate 		return (MUTEX_OWNER(lp) == curthread);
5090Sstevel@tonic-gate 	return (LOCK_HELD(&lp->m_spin.m_spinlock));
5100Sstevel@tonic-gate }
5110Sstevel@tonic-gate 
5120Sstevel@tonic-gate kthread_t *
5130Sstevel@tonic-gate mutex_owner(kmutex_t *mp)
5140Sstevel@tonic-gate {
5150Sstevel@tonic-gate 	mutex_impl_t *lp = (mutex_impl_t *)mp;
5160Sstevel@tonic-gate 	kthread_id_t t;
5170Sstevel@tonic-gate 
5180Sstevel@tonic-gate 	if (MUTEX_TYPE_ADAPTIVE(lp) && (t = MUTEX_OWNER(lp)) != MUTEX_NO_OWNER)
5190Sstevel@tonic-gate 		return (t);
5200Sstevel@tonic-gate 	return (NULL);
5210Sstevel@tonic-gate }
5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate /*
5240Sstevel@tonic-gate  * The iblock cookie 'ibc' is the spl level associated with the lock;
5250Sstevel@tonic-gate  * this alone determines whether the lock will be ADAPTIVE or SPIN.
5260Sstevel@tonic-gate  *
5270Sstevel@tonic-gate  * Adaptive mutexes created in zeroed memory do not need to call
5280Sstevel@tonic-gate  * mutex_init() as their allocation in this fashion guarantees
5290Sstevel@tonic-gate  * their initialization.
5300Sstevel@tonic-gate  *   eg adaptive mutexes created as static within the BSS or allocated
5310Sstevel@tonic-gate  *      by kmem_zalloc().
5320Sstevel@tonic-gate  */
5330Sstevel@tonic-gate /* ARGSUSED */
5340Sstevel@tonic-gate void
5350Sstevel@tonic-gate mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc)
5360Sstevel@tonic-gate {
5370Sstevel@tonic-gate 	mutex_impl_t *lp = (mutex_impl_t *)mp;
5380Sstevel@tonic-gate 
	/* An ibc above KERNELBASE is almost certainly a stray pointer. */
5390Sstevel@tonic-gate 	ASSERT(ibc < (void *)KERNELBASE);	/* see 1215173 */
5400Sstevel@tonic-gate 
	/*
	 * The KERNELBASE recheck here is deliberate, not redundant with the
	 * ASSERT above: on non-DEBUG kernels the ASSERT compiles away, and
	 * a bogus ibc must still not silently produce a spin mutex.
	 */
5410Sstevel@tonic-gate 	if ((intptr_t)ibc > ipltospl(LOCK_LEVEL) && ibc < (void *)KERNELBASE) {
5420Sstevel@tonic-gate 		ASSERT(type != MUTEX_ADAPTIVE && type != MUTEX_DEFAULT);
5430Sstevel@tonic-gate 		MUTEX_SET_TYPE(lp, MUTEX_SPIN);
5440Sstevel@tonic-gate 		LOCK_INIT_CLEAR(&lp->m_spin.m_spinlock);
		/* m_dummylock held forever: makes adaptive tryenter always fail */
5450Sstevel@tonic-gate 		LOCK_INIT_HELD(&lp->m_spin.m_dummylock);
5460Sstevel@tonic-gate 		lp->m_spin.m_minspl = (int)(intptr_t)ibc;
5470Sstevel@tonic-gate 	} else {
5480Sstevel@tonic-gate 		ASSERT(type != MUTEX_SPIN);
5490Sstevel@tonic-gate 		MUTEX_SET_TYPE(lp, MUTEX_ADAPTIVE);
5500Sstevel@tonic-gate 		MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
5510Sstevel@tonic-gate 	}
5520Sstevel@tonic-gate }
5530Sstevel@tonic-gate 
5540Sstevel@tonic-gate void
	/*
	 * Destroy a mutex.  An unowned, waiterless mutex may be destroyed by
	 * anyone; an adaptive mutex being destroyed while held must be held
	 * by the caller, and must not have waiters.  Misuse panics.
	 */
5550Sstevel@tonic-gate mutex_destroy(kmutex_t *mp)
5560Sstevel@tonic-gate {
5570Sstevel@tonic-gate 	mutex_impl_t *lp = (mutex_impl_t *)mp;
5580Sstevel@tonic-gate 
	/* Common case: unheld and no waiters -- just clobber it. */
5590Sstevel@tonic-gate 	if (lp->m_owner == 0 && !MUTEX_HAS_WAITERS(lp)) {
5600Sstevel@tonic-gate 		MUTEX_DESTROY(lp);
5610Sstevel@tonic-gate 	} else if (MUTEX_TYPE_SPIN(lp)) {
5620Sstevel@tonic-gate 		LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
5630Sstevel@tonic-gate 		MUTEX_DESTROY(lp);
5640Sstevel@tonic-gate 	} else if (MUTEX_TYPE_ADAPTIVE(lp)) {
5650Sstevel@tonic-gate 		LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
5660Sstevel@tonic-gate 		if (MUTEX_OWNER(lp) != curthread)
5670Sstevel@tonic-gate 			mutex_panic("mutex_destroy: not owner", lp);
5680Sstevel@tonic-gate 		if (MUTEX_HAS_WAITERS(lp)) {
			/* A non-NULL turnstile means threads really are queued. */
5690Sstevel@tonic-gate 			turnstile_t *ts = turnstile_lookup(lp);
5700Sstevel@tonic-gate 			turnstile_exit(lp);
5710Sstevel@tonic-gate 			if (ts != NULL)
5720Sstevel@tonic-gate 				mutex_panic("mutex_destroy: has waiters", lp);
5730Sstevel@tonic-gate 		}
5740Sstevel@tonic-gate 		MUTEX_DESTROY(lp);
5750Sstevel@tonic-gate 	} else {
5760Sstevel@tonic-gate 		mutex_panic("mutex_destroy: bad mutex", lp);
5770Sstevel@tonic-gate 	}
5780Sstevel@tonic-gate }
5790Sstevel@tonic-gate 
5800Sstevel@tonic-gate /*
5810Sstevel@tonic-gate  * Simple C support for the cases where spin locks miss on the first try.
5820Sstevel@tonic-gate  */
/*
 * Slow path for lock_set(): the caller's first try failed, so spin
 * (with exponential backoff) until the lock is acquired.  Records
 * spin and acquire events for lockstat.
 */
void
lock_set_spin(lock_t *lp)
{
	int spin_count = 1;	/* counts the caller's failed try too */
	int backoff;	/* current backoff */
	int backctr;	/* ctr for backoff */

	/* During panic, locking is abandoned entirely. */
	if (panicstr)
		return;

	/* On a uniprocessor the holder can never run to release it. */
	if (ncpus == 1)
		panic("lock_set: %p lock held and only one CPU", lp);

	/*
	 * Taking the address of plat_lock_delay tests whether the
	 * platform supplies its own delay routine (weak-symbol idiom —
	 * TODO confirm it is declared weak).  If so, all pacing is
	 * delegated to it and backoff starts at 0; otherwise the
	 * generic doubling backoff below starts at BACKOFF_BASE.
	 */
	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
		if (panicstr)
			return;
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * The spin_count test and call to nulldev are to prevent
		 * the compiler optimizer from eliminating the delay loop:
		 * spin_count is never 0 here, so nulldev() never runs,
		 * but the compiler cannot prove that.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			/* delay */
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count) (void) nulldev();
			}

			backoff = backoff << 1;		/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}
			SMT_PAUSE();
		}
	}

	/* spin_count starts at 1, so this always records. */
	if (spin_count) {
		LOCKSTAT_RECORD(LS_LOCK_SET_SPIN, lp, spin_count);
	}

	LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE, lp);
}
6340Sstevel@tonic-gate 
/*
 * Slow path for lock_set_spl(): spin for the lock, dropping the
 * priority level to old_pil while waiting so blocked interrupts can
 * run, and re-raising it to new_pil before each acquisition attempt.
 * On return *old_pil_addr holds the pil to restore at lock_clear_splx().
 */
void
lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil)
{
	int spin_count = 1;	/* counts the caller's failed try too */
	int backoff;	/* current backoff */
	int backctr;	/* ctr for backoff */

	/* During panic, locking is abandoned entirely. */
	if (panicstr)
		return;

	/* On a uniprocessor the holder can never run to release it. */
	if (ncpus == 1)
		panic("lock_set_spl: %p lock held and only one CPU", lp);

	ASSERT(new_pil > LOCK_LEVEL);

	/*
	 * Taking the address of plat_lock_delay tests whether the
	 * platform supplies its own delay routine (weak-symbol idiom —
	 * TODO confirm it is declared weak); if so it owns all pacing,
	 * otherwise the generic doubling backoff below is used.
	 */
	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}
	do {
		/* Drop pil while spinning so pending interrupts can fire. */
		splx(old_pil);
		while (LOCK_HELD(lp)) {
			if (panicstr) {
				/* Still honor the contract on *old_pil_addr. */
				*old_pil_addr = (ushort_t)splr(new_pil);
				return;
			}
			spin_count++;
			/*
			 * Add an exponential backoff delay before trying again
			 * to touch the mutex data structure.
			 * The spin_count test and call to nulldev prevent the
			 * compiler optimizer from eliminating the delay loop:
			 * spin_count is never 0 here, so nulldev() never runs.
			 */
			if (&plat_lock_delay) {
				plat_lock_delay(&backoff);
			} else {
				for (backctr = backoff; backctr; backctr--) {
					if (!spin_count) (void) nulldev();
				}
				backoff = backoff << 1;		/* double it */
				if (backoff > BACKOFF_CAP) {
					backoff = BACKOFF_CAP;
				}

				SMT_PAUSE();
			}
		}
		/* Raise pil before the try so we hold the lock at new_pil. */
		old_pil = splr(new_pil);
	} while (!lock_spin_try(lp));

	*old_pil_addr = (ushort_t)old_pil;

	/* spin_count starts at 1, so this always records. */
	if (spin_count) {
		LOCKSTAT_RECORD(LS_LOCK_SET_SPL_SPIN, lp, spin_count);
	}

	LOCKSTAT_RECORD(LS_LOCK_SET_SPL_ACQUIRE, lp, spin_count);
}
694