/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Big Theory Statement for mutual exclusion locking primitives.
 *
 * A mutex serializes multiple threads so that only one thread
 * (the "owner" of the mutex) is active at a time.  See mutex(9F)
 * for a full description of the interfaces and programming model.
 * The rest of this comment describes the implementation.
 *
 * Mutexes come in two flavors: adaptive and spin.  mutex_init(9F)
 * determines the type based solely on the iblock cookie (PIL) argument.
 * PIL > LOCK_LEVEL implies a spin lock; everything else is adaptive.
 *
 * Spin mutexes block interrupts and spin until the lock becomes available.
 * A thread may not sleep, or call any function that might sleep, while
 * holding a spin mutex.  With few exceptions, spin mutexes should only
 * be used to synchronize with interrupt handlers.
 *
 * Adaptive mutexes (the default type) spin if the owner is running on
 * another CPU and block otherwise.  This policy is based on the assumption
 * that mutex hold times are typically short enough that the time spent
 * spinning is less than the time it takes to block.  If you need mutual
 * exclusion semantics with long hold times, consider an rwlock(9F) as
 * RW_WRITER.  Better still, reconsider the algorithm: if it requires
 * mutual exclusion for long periods of time, it's probably not scalable.
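 *
 * For example (an illustrative sketch; see mutex_init(9F) for the
 * authoritative usage -- 'xsp', 'xs_lock', 'xs_hi_lock' and 'xs_hi_ibc'
 * are made-up names), a driver lock initialized with a NULL interrupt
 * cookie, or with the cookie of a low-level interrupt, becomes adaptive,
 * while one initialized with the iblock cookie of a high-level
 * (PIL > LOCK_LEVEL) interrupt becomes a spin lock:
 *
 *      mutex_init(&xsp->xs_lock, NULL, MUTEX_DRIVER, NULL);
 *      mutex_init(&xsp->xs_hi_lock, NULL, MUTEX_DRIVER, xs_hi_ibc);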
 *
 * Adaptive mutexes are overwhelmingly more common than spin mutexes,
 * so mutex_enter() assumes that the lock is adaptive.  We get away
 * with this by structuring mutexes so that an attempt to acquire a
 * spin mutex as adaptive always fails.  When mutex_enter() fails
 * it punts to mutex_vector_enter(), which does all the hard stuff.
 *
 * mutex_vector_enter() first checks the type.  If it's a spin mutex,
 * we just call lock_set_spl() and return.  If it's an adaptive mutex,
 * we check to see what the owner is doing.  If the owner is running,
 * we spin until the lock becomes available; if not, we mark the lock
 * as having waiters and block.
 *
 * Blocking on a mutex is a surprisingly delicate dance because, for speed,
 * mutex_exit() doesn't use an atomic instruction.  Thus we have to work
 * a little harder in the (rarely-executed) blocking path to make sure
 * we don't block on a mutex that's just been released -- otherwise we
 * might never be woken up.
 *
 * The logic for synchronizing mutex_vector_enter() with mutex_exit()
 * in the face of preemption and relaxed memory ordering is as follows:
 *
 * (1) Preemption in the middle of mutex_exit() must cause mutex_exit()
 *     to restart.  Each platform must enforce this by checking the
 *     interrupted PC in the interrupt handler (or on return from trap --
 *     whichever is more convenient for the platform).  If the PC
 *     lies within the critical region of mutex_exit(), the interrupt
 *     handler must reset the PC back to the beginning of mutex_exit().
 *     The critical region consists of all instructions up to, but not
 *     including, the store that clears the lock (which, of course,
 *     must never be executed twice.)
 *
 *     This ensures that the owner will always check for waiters after
 *     resuming from a previous preemption.
 *
 * (2) A thread resuming in mutex_exit() does (at least) the following:
 *
 *      when resuming:  set CPU_THREAD = owner
 *                      membar #StoreLoad
 *
 *      in mutex_exit:  check waiters bit; do wakeup if set
 *                      membar #LoadStore|#StoreStore
 *                      clear owner
 *                      (at this point, other threads may or may not grab
 *                      the lock, and we may or may not reacquire it)
 *
 *      when blocking:  membar #StoreStore (due to disp_lock_enter())
 *                      set CPU_THREAD = (possibly) someone else
 *
 * (3) A thread blocking in mutex_vector_enter() does the following:
 *
 *      set waiters bit
 *      membar #StoreLoad (via membar_enter())
 *      check CPU_THREAD for each CPU; abort if owner running
 *      membar #LoadLoad (via membar_consumer())
 *      check owner and waiters bit; abort if either changed
 *      block
 *
 * Thus the global memory orderings for (2) and (3) are as follows:
 *
 * (2M) mutex_exit() memory order:
 *
 *                      STORE   CPU_THREAD = owner
 *                      LOAD    waiters bit
 *                      STORE   owner = NULL
 *                      STORE   CPU_THREAD = (possibly) someone else
 *
 * (3M) mutex_vector_enter() memory order:
 *
 *                      STORE   waiters bit = 1
 *                      LOAD    CPU_THREAD for each CPU
 *                      LOAD    owner and waiters bit
 *
 * It has been verified by exhaustive simulation that all possible global
 * memory orderings of (2M) interleaved with (3M) result in correct
 * behavior.  Moreover, these ordering constraints are minimal: changing
 * the ordering of anything in (2M) or (3M) breaks the algorithm, creating
 * windows for missed wakeups.  Note: the possibility that other threads
 * may grab the lock after the owner drops it can be factored out of the
 * memory ordering analysis because mutex_vector_enter() won't block
 * if the lock isn't still owned by the same thread.
 *
 * The only requirements of code outside the mutex implementation are
 * (1) mutex_exit() preemption fixup in interrupt handlers or trap return,
 * and (2) a membar #StoreLoad after setting CPU_THREAD in resume().
 * Note: idle threads cannot grab adaptive locks (since they cannot block),
 * so the membar may be safely omitted when resuming an idle thread.
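 *
 * To see why the #StoreLoad barriers in (2) and (3) matter, consider
 * the interleaving they forbid.  If the blocking thread's store of the
 * waiters bit could drift past its subsequent loads, it might read a
 * stale CPU_THREAD for every CPU (owner apparently not running), a
 * stale owner (apparently unchanged), and its own forwarded waiters
 * bit, and so decide to block.  Meanwhile the owner, resuming into
 * mutex_exit(), might load the waiters bit before that store became
 * visible, see it clear, skip the wakeup, and drop the lock -- a missed
 * wakeup.  With both barriers this becomes a Dekker-style handshake:
 * at least one side must observe the other's store, so either the
 * blocker sees the owner running and spins, or the owner sees the
 * waiters bit and does the wakeup.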
 *
 * When a mutex has waiters, mutex_vector_exit() has several options:
 *
 * (1) Choose a waiter and make that thread the owner before waking it;
 *     this is known as "direct handoff" of ownership.
 *
 * (2) Drop the lock and wake one waiter.
 *
 * (3) Drop the lock, clear the waiters bit, and wake all waiters.
 *
 * In many ways (1) is the cleanest solution, but if a lock is moderately
 * contended it defeats the adaptive spin logic.  If we make some other
 * thread the owner, but he's not ONPROC yet, then all other threads on
 * other cpus that try to get the lock will conclude that the owner is
 * blocked, so they'll block too.  And so on -- it escalates quickly,
 * with every thread taking the blocking path rather than the spin path.
 * Thus, direct handoff is *not* a good idea for adaptive mutexes.
 *
 * Option (2) is the next most natural-seeming option, but it has several
 * annoying properties.  If there's more than one waiter, we must preserve
 * the waiters bit on an unheld lock.  On cas-capable platforms, where
 * the waiters bit is part of the lock word, this means that both 0x0
 * and 0x1 represent unheld locks, so we have to cas against *both*.
 * Priority inheritance also gets more complicated, because a lock can
 * have waiters but no owner to whom priority can be willed.  So while
 * it is possible to make option (2) work, it's surprisingly vile.
 *
 * Option (3), the least-intuitive at first glance, is what we actually do.
 * It has the advantage that because you always wake all waiters, you
 * never have to preserve the waiters bit.  Waking all waiters seems like
 * begging for a thundering herd problem, but consider: under option (2),
 * every thread that grabs and drops the lock will wake one waiter -- so
 * if the lock is fairly active, all waiters will be awakened very quickly
 * anyway.  Moreover, this is how adaptive locks are *supposed* to work.
 * The blocking case is rare; the more common case (by 3-4 orders of
 * magnitude) is that one or more threads spin waiting to get the lock.
 * Only direct handoff can prevent the thundering herd problem, but as
 * mentioned earlier, that would tend to defeat the adaptive spin logic.
 * In practice, option (3) works well because the blocking case is rare.
 */

/*
 * delayed lock retry with exponential delay for spin locks
 *
 * It is noted above that for both the spin locks and the adaptive locks,
 * spinning is the dominant mode of operation.  So long as there is only
 * one thread waiting on a lock, the naive spin loop works very well in
 * cache-based architectures.  The lock data structure is pulled into the
 * cache of the processor with the waiting/spinning thread and no further
 * memory traffic is generated until the lock is released.  Unfortunately,
 * once two or more threads are waiting on a lock, the naive spin has
 * the property of generating maximum memory traffic from each spinning
 * thread as the spinning threads contend for the lock data structure.
 *
 * By executing a delay loop before retrying a lock, a waiting thread
 * can reduce its memory traffic by a large factor, depending on the
 * size of the delay loop.  A large delay loop greatly reduces the memory
 * traffic, but has the drawback of having a period of time when
 * no thread is attempting to gain the lock even though several threads
 * might be waiting.  A small delay loop has the drawback of not
 * much reduction in memory traffic, but reduces the potential idle time.
 * The theory of the exponential delay code is to start with a short
 * delay loop and double the waiting time on each iteration, up to
 * a preselected maximum.  BACKOFF_BASE provides the equivalent
 * of 2 to 3 memory references of delay for US-III+ and US-IV architectures.
 * BACKOFF_CAP is the equivalent of 50 to 100 memory references of
 * time (less than 12 microseconds for a 1000 MHz system).
 *
 * To determine appropriate BACKOFF_BASE and BACKOFF_CAP values,
 * studies on US-III+ and US-IV systems using 1 to 66 threads were
 * done.  A range of possible values was studied.
 * Performance differences below 10 threads were not large.  For
 * systems with more threads, substantial increases in total lock
 * throughput were observed with the given values.  For cases where
 * more than 20 threads were waiting on the same lock, lock throughput
 * increased by a factor of 5 or more using the backoff algorithm.
 */
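
/*
 * Illustrative sketch of the doubling described above: an isolated,
 * stand-alone model that is not used by the code below (the helper name
 * is made up).  With the values defined below (BACKOFF_BASE = 50,
 * BACKOFF_CAP = 1600), successive retries run the delay loop
 * 50, 100, 200, 400, 800, 1600, 1600, ... times; the cap is reached
 * after five doublings.
 */
static int
backoff_after_n_misses(int n)
{
        int backoff = 50;                       /* BACKOFF_BASE */

        while (n-- > 0) {
                backoff = backoff << 1;         /* double it */
                if (backoff > 1600)             /* BACKOFF_CAP */
                        backoff = 1600;
        }
        return (backoff);
}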

#include <sys/param.h>
#include <sys/time.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sobject.h>
#include <sys/turnstile.h>
#include <sys/systm.h>
#include <sys/mutex_impl.h>
#include <sys/spl.h>
#include <sys/lockstat.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/stack.h>

#define	BACKOFF_BASE	50
#define	BACKOFF_CAP	1600

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of this type.
 */
static sobj_ops_t mutex_sobj_ops = {
        SOBJ_MUTEX, mutex_owner, turnstile_stay_asleep, turnstile_change_pri
};

/*
 * If the system panics on a mutex, save the address of the offending
 * mutex in panic_mutex_addr, and save the contents in panic_mutex.
 */
static mutex_impl_t panic_mutex;
static mutex_impl_t *panic_mutex_addr;

static void
mutex_panic(char *msg, mutex_impl_t *lp)
{
        if (panicstr)
                return;

        if (casptr(&panic_mutex_addr, NULL, lp) == NULL)
                panic_mutex = *lp;

        panic("%s, lp=%p owner=%p thread=%p",
            msg, lp, MUTEX_OWNER(&panic_mutex), curthread);
}

/*
 * mutex_vector_enter() is called from the assembly mutex_enter() routine
 * if the lock is held or is not of type MUTEX_ADAPTIVE.
 */
void
mutex_vector_enter(mutex_impl_t *lp)
{
        kthread_id_t    owner;
        hrtime_t        sleep_time = 0; /* how long we slept */
        uint_t          spin_count = 0; /* how many times we spun */
        cpu_t           *cpup, *last_cpu;
        extern cpu_t    *cpu_list;
        turnstile_t     *ts;
        volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
        int             backoff;        /* current backoff */
        int             backctr;        /* ctr for backoff */

        ASSERT_STACK_ALIGNED();

        if (MUTEX_TYPE_SPIN(lp)) {
                lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
                    &lp->m_spin.m_oldspl);
                return;
        }

        if (!MUTEX_TYPE_ADAPTIVE(lp)) {
                mutex_panic("mutex_enter: bad mutex", lp);
                return;
        }

        /*
         * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
         * We can migrate after loading CPU but before checking CPU_ON_INTR,
         * so we must verify by disabling preemption and loading CPU again.
         */
        cpup = CPU;
        if (CPU_ON_INTR(cpup) && !panicstr) {
                kpreempt_disable();
                if (CPU_ON_INTR(CPU))
                        mutex_panic("mutex_enter: adaptive at high PIL", lp);
                kpreempt_enable();
        }

        CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);

        backoff = BACKOFF_BASE;
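
        /*
         * Each pass through the loop below delays with exponential
         * backoff, then either acquires the lock if it has become free,
         * keeps spinning if the owner is running on some CPU, or sets
         * the waiters bit and blocks on the turnstile.
         */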
        for (;;) {
spin:
                spin_count++;
                /*
                 * Add an exponential backoff delay before trying again
                 * to touch the mutex data structure.
                 * The spin_count test and call to nulldev are to prevent
                 * the compiler optimizer from eliminating the delay loop.
                 */
                for (backctr = backoff; backctr; backctr--) {   /* delay */
                        if (!spin_count) (void) nulldev();
                }
                backoff = backoff << 1;                 /* double it */
                if (backoff > BACKOFF_CAP) {
                        backoff = BACKOFF_CAP;
                }

                SMT_PAUSE();

                if (panicstr)
                        return;

                if ((owner = MUTEX_OWNER(vlp)) == NULL) {
                        if (mutex_adaptive_tryenter(lp))
                                break;
                        continue;
                }

                if (owner == curthread)
                        mutex_panic("recursive mutex_enter", lp);

                /*
                 * If lock is held but owner is not yet set, spin.
                 * (Only relevant for platforms that don't have cas.)
                 */
                if (owner == MUTEX_NO_OWNER)
                        continue;

                /*
                 * When searching the other CPUs, start with the one where
                 * we last saw the owner thread.  If owner is running, spin.
                 *
                 * We must disable preemption at this point to guarantee
                 * that the list doesn't change while we traverse it
                 * without the cpu_lock mutex.  While preemption is
                 * disabled, we must revalidate our cached cpu pointer.
                 */
                kpreempt_disable();
                if (cpup->cpu_next == NULL)
                        cpup = cpu_list;
                last_cpu = cpup;        /* mark end of search */
                do {
                        if (cpup->cpu_thread == owner) {
                                kpreempt_enable();
                                goto spin;
                        }
                } while ((cpup = cpup->cpu_next) != last_cpu);
                kpreempt_enable();

                /*
                 * The owner appears not to be running, so block.
                 * See the Big Theory Statement for memory ordering issues.
                 */
                ts = turnstile_lookup(lp);
                MUTEX_SET_WAITERS(lp);
                membar_enter();
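                /*
                 * The membar_enter() above supplies the membar #StoreLoad
                 * from step (3) of the Big Theory Statement: the waiters
                 * bit must be globally visible before the CPU_THREAD loads
                 * below.
                 */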

                /*
                 * Recheck whether owner is running after waiters bit hits
                 * global visibility (above).  If owner is running, spin.
                 *
                 * Since we are at ipl DISP_LEVEL, kernel preemption is
                 * disabled; however, we still need to revalidate our cached
                 * cpu pointer to make sure the cpu hasn't been deleted.
                 */
                if (cpup->cpu_next == NULL)
                        last_cpu = cpup = cpu_list;
                do {
                        if (cpup->cpu_thread == owner) {
                                turnstile_exit(lp);
                                goto spin;
                        }
                } while ((cpup = cpup->cpu_next) != last_cpu);
                membar_consumer();

                /*
                 * If owner and waiters bit are unchanged, block.
                 */
                if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
                        sleep_time -= gethrtime();
                        (void) turnstile_block(ts, TS_WRITER_Q, lp,
                            &mutex_sobj_ops, NULL, NULL);
                        sleep_time += gethrtime();
                } else {
                        turnstile_exit(lp);
                }
        }

        ASSERT(MUTEX_OWNER(lp) == curthread);

        if (sleep_time == 0) {
                LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp, spin_count);
        } else {
                LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
        }

        LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
}

/*
 * mutex_vector_tryenter() is called from the assembly mutex_tryenter()
 * routine if the lock is held or is not of type MUTEX_ADAPTIVE.
 */
int
mutex_vector_tryenter(mutex_impl_t *lp)
{
        int s;

        if (MUTEX_TYPE_ADAPTIVE(lp))
                return (0);             /* we already tried in assembly */

        if (!MUTEX_TYPE_SPIN(lp)) {
                mutex_panic("mutex_tryenter: bad mutex", lp);
                return (0);
        }

        s = splr(lp->m_spin.m_minspl);
        if (lock_try(&lp->m_spin.m_spinlock)) {
                lp->m_spin.m_oldspl = (ushort_t)s;
                return (1);
        }
        splx(s);
        return (0);
}
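
/*
 * Illustrative usage sketch (hypothetical caller; 'xsp' and 'xs_lock'
 * are made-up names): the common mutex_tryenter(9F) pattern whose miss
 * path ends up in mutex_vector_tryenter() above.
 *
 *      if (mutex_tryenter(&xsp->xs_lock)) {
 *              (short critical section that must not block on the lock)
 *              mutex_exit(&xsp->xs_lock);
 *      } else {
 *              (lock busy; back off or defer the work)
 *      }
 */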

/*
 * mutex_vector_exit() is called from mutex_exit() if the lock is not
 * adaptive, has waiters, or is not owned by the current thread (panic).
 */
void
mutex_vector_exit(mutex_impl_t *lp)
{
        turnstile_t *ts;

        if (MUTEX_TYPE_SPIN(lp)) {
                lock_clear_splx(&lp->m_spin.m_spinlock, lp->m_spin.m_oldspl);
                return;
        }

        if (MUTEX_OWNER(lp) != curthread) {
                mutex_panic("mutex_exit: not owner", lp);
                return;
        }

        ts = turnstile_lookup(lp);
        MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
        if (ts == NULL)
                turnstile_exit(lp);
        else
                turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
        LOCKSTAT_RECORD0(LS_MUTEX_EXIT_RELEASE, lp);
}

int
mutex_owned(kmutex_t *mp)
{
        mutex_impl_t *lp = (mutex_impl_t *)mp;

        if (panicstr)
                return (1);

        if (MUTEX_TYPE_ADAPTIVE(lp))
                return (MUTEX_OWNER(lp) == curthread);
        return (LOCK_HELD(&lp->m_spin.m_spinlock));
}

kthread_t *
mutex_owner(kmutex_t *mp)
{
        mutex_impl_t *lp = (mutex_impl_t *)mp;
        kthread_id_t t;

        if (MUTEX_TYPE_ADAPTIVE(lp) && (t = MUTEX_OWNER(lp)) != MUTEX_NO_OWNER)
                return (t);
        return (NULL);
}
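
/*
 * Illustrative usage note (hypothetical caller; 'xsp' and 'xs_lock' are
 * made-up names): mutex_owned() is intended for ASSERTions about lock
 * state, e.g.
 *
 *      ASSERT(mutex_owned(&xsp->xs_lock));
 *
 * Note that it returns 1 unconditionally once the system has panicked,
 * and that for a spin mutex it reports only that the lock is held, not
 * that the caller holds it, so it is not a reliable run-time predicate.
 */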

/*
 * The iblock cookie 'ibc' is the spl level associated with the lock;
 * this alone determines whether the lock will be ADAPTIVE or SPIN.
 *
 * Adaptive mutexes created in zeroed memory do not need mutex_init();
 * allocation in this fashion guarantees their initialization, e.g.
 * adaptive mutexes created as statics within the BSS or allocated
 * by kmem_zalloc().
 */
/* ARGSUSED */
void
mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc)
{
        mutex_impl_t *lp = (mutex_impl_t *)mp;

        ASSERT(ibc < (void *)KERNELBASE);       /* see 1215173 */

        if ((intptr_t)ibc > ipltospl(LOCK_LEVEL) && ibc < (void *)KERNELBASE) {
                ASSERT(type != MUTEX_ADAPTIVE && type != MUTEX_DEFAULT);
                MUTEX_SET_TYPE(lp, MUTEX_SPIN);
                LOCK_INIT_CLEAR(&lp->m_spin.m_spinlock);
                LOCK_INIT_HELD(&lp->m_spin.m_dummylock);
                lp->m_spin.m_minspl = (int)(intptr_t)ibc;
        } else {
                ASSERT(type != MUTEX_SPIN);
                MUTEX_SET_TYPE(lp, MUTEX_ADAPTIVE);
                MUTEX_CLEAR_LOCK_AND_WAITERS(lp);
        }
}

void
mutex_destroy(kmutex_t *mp)
{
        mutex_impl_t *lp = (mutex_impl_t *)mp;

        if (lp->m_owner == 0 && !MUTEX_HAS_WAITERS(lp)) {
                MUTEX_DESTROY(lp);
        } else if (MUTEX_TYPE_SPIN(lp)) {
                LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
                MUTEX_DESTROY(lp);
        } else if (MUTEX_TYPE_ADAPTIVE(lp)) {
                LOCKSTAT_RECORD0(LS_MUTEX_DESTROY_RELEASE, lp);
                if (MUTEX_OWNER(lp) != curthread)
                        mutex_panic("mutex_destroy: not owner", lp);
                if (MUTEX_HAS_WAITERS(lp)) {
                        turnstile_t *ts = turnstile_lookup(lp);
                        turnstile_exit(lp);
                        if (ts != NULL)
                                mutex_panic("mutex_destroy: has waiters", lp);
                }
                MUTEX_DESTROY(lp);
        } else {
                mutex_panic("mutex_destroy: bad mutex", lp);
        }
}

/*
 * Simple C support for the cases where spin locks miss on the first try.
 */
void
lock_set_spin(lock_t *lp)
{
        int spin_count = 1;
        int backoff;    /* current backoff */
        int backctr;    /* ctr for backoff */

        if (panicstr)
                return;

        if (ncpus == 1)
                panic("lock_set: %p lock held and only one CPU", lp);

        backoff = BACKOFF_BASE;
        while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
                if (panicstr)
                        return;
                spin_count++;
                /*
                 * Add an exponential backoff delay before trying again
                 * to touch the lock data structure.
                 * The spin_count test and call to nulldev are to prevent
                 * the compiler optimizer from eliminating the delay loop.
                 */
                for (backctr = backoff; backctr; backctr--) {   /* delay */
                        if (!spin_count) (void) nulldev();
                }

                backoff = backoff << 1;                 /* double it */
                if (backoff > BACKOFF_CAP) {
                        backoff = BACKOFF_CAP;
                }
                SMT_PAUSE();
        }

        if (spin_count) {
                LOCKSTAT_RECORD(LS_LOCK_SET_SPIN, lp, spin_count);
        }

        LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE, lp);
}

void
lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil)
{
        int spin_count = 1;
        int backoff;    /* current backoff */
        int backctr;    /* ctr for backoff */

        if (panicstr)
                return;

        if (ncpus == 1)
                panic("lock_set_spl: %p lock held and only one CPU", lp);

        ASSERT(new_pil > LOCK_LEVEL);

        backoff = BACKOFF_BASE;
        do {
                splx(old_pil);
                while (LOCK_HELD(lp)) {
                        if (panicstr) {
                                *old_pil_addr = (ushort_t)splr(new_pil);
                                return;
                        }
                        spin_count++;
                        /*
                         * Add an exponential backoff delay before trying
                         * again to touch the lock data structure.
                         * The spin_count test and call to nulldev are to
                         * prevent the compiler optimizer from eliminating
                         * the delay loop.
                         */
                        for (backctr = backoff; backctr; backctr--) {
                                if (!spin_count) (void) nulldev();
                        }
                        backoff = backoff << 1;         /* double it */
                        if (backoff > BACKOFF_CAP) {
                                backoff = BACKOFF_CAP;
                        }

                        SMT_PAUSE();
                }
                old_pil = splr(new_pil);
        } while (!lock_spin_try(lp));

        *old_pil_addr = (ushort_t)old_pil;

        if (spin_count) {
                LOCKSTAT_RECORD(LS_LOCK_SET_SPL_SPIN, lp, spin_count);
        }

        LOCKSTAT_RECORD(LS_LOCK_SET_SPL_ACQUIRE, lp, spin_count);
}