/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/turnstile.h>
#include <sys/rwlock.h>
#include <sys/rwlock_impl.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>

/*
 * Big Theory Statement for readers/writer locking primitives.
 *
 * An rwlock provides exclusive access to a single thread ("writer") or
 * concurrent access to multiple threads ("readers").  See rwlock(9F)
 * for a full description of the interfaces and programming model.
 * The rest of this comment describes the implementation.
 *
 * An rwlock is a single word with the following structure:
 *
 *	---------------------------------------------------------------------
 *	| OWNER (writer) or HOLD COUNT (readers)   | WRLOCK | WRWANT | WAIT |
 *	---------------------------------------------------------------------
 *			63 / 31 .. 3			2	1	0
 *
 * The waiters bit (0) indicates whether any threads are blocked waiting
 * for the lock.  The write-wanted bit (1) indicates whether any threads
 * are blocked waiting for write access.  The write-locked bit (2) indicates
 * whether the lock is held by a writer, which determines whether the upper
 * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner
 * (thread pointer) or the hold count (number of readers).
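 *
 * For example, under this encoding a lock held by three readers with
 * a writer blocked on it would contain (3 * RW_READ_LOCK) |
 * RW_WRITE_WANTED | RW_HAS_WAITERS, i.e. the value 0x1b.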
 *
 * In the absence of any contention, a writer gets the lock by setting
 * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock
 * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK).
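 *
 * In code, the uncontended writer path amounts to a single
 * casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)); rw_tryenter()
 * below performs exactly this sequence for both cases.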
 *
 * A writer will fail to acquire the lock if any other thread owns it.
 * A reader will fail if the lock is either owned or wanted by a writer.
 * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the
 * lock becomes available.
 *
 * When a thread blocks it acquires the rwlock's hashed turnstile lock and
 * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case)
 * atomically *only if the lock still appears busy*.  A thread must never
 * accidentally block for an available lock since there would be no owner
 * to awaken it.  casip() provides the required atomicity.  Once casip()
 * succeeds, the decision to block becomes final and irreversible.  The
 * thread will not become runnable again until it has been granted ownership
 * of the lock via direct handoff from a former owner as described below.
 *
 * In the absence of any waiters, rw_exit() just clears the lock (if it
 * is write-locked) or decrements the hold count (if it is read-locked).
 * Note that even if waiters are present, decrementing the hold count
 * to a non-zero value requires no special action since the lock is still
 * held by at least one other thread.
 *
 * On the "final exit" (transition to unheld state) of a lock with waiters,
 * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly
 * to the next writer or set of readers.  There are several advantages to this
 * approach: (1) it closes all windows for priority inversion (when a new
 * writer has grabbed the lock but has not yet inherited from blocked readers);
 * (2) it prevents starvation of equal-priority threads by granting the lock
 * in FIFO order; (3) it eliminates the need for a write-wanted count -- a
 * single bit suffices because the lock remains held until all waiting
 * writers are gone; (4) when we awaken N readers we can perform a single
 * "atomic_add(&x, N)" to set the total hold count rather than having all N
 * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup.
 *
 * The most interesting policy decision in rw_exit_wakeup() is which thread
 * to wake.  Starvation is always possible with priority-based scheduling,
 * but any sane wakeup policy should at least satisfy these requirements:
 *
 * (1)	The highest-priority thread in the system should not starve.
 * (2)	The highest-priority writer should not starve.
 * (3)	No writer should starve due to lower-priority threads.
 * (4)	No reader should starve due to lower-priority writers.
 * (5)	If all threads have equal priority, none of them should starve.
 *
 * We used to employ a writers-always-win policy, which doesn't even
 * satisfy (1): a steady stream of low-priority writers can starve out
 * a real-time reader!  This is clearly a broken policy -- it violates
 * (1), (4), and (5) -- but it's how rwlocks always used to behave.
 *
 * A round-robin policy (exiting readers grant the lock to blocked writers
 * and vice versa) satisfies all but (3): a single high-priority writer
 * and many low-priority readers can starve out medium-priority writers.
 *
 * A strict priority policy (grant the lock to the highest priority blocked
 * thread) satisfies everything but (2): a steady stream of high-priority
 * readers can permanently starve the highest-priority writer.
 *
 * The reason we care about (2) is that it's important to process writers
 * reasonably quickly -- even if they're low priority -- because their very
 * presence causes all readers to take the slow (blocking) path through this
 * code.  There is also a general sense that writers deserve some degree of
 * deference because they're updating the data upon which all readers act.
 * Presumably this data should not be allowed to become arbitrarily stale
 * due to writer starvation.  Finally, it seems reasonable to level the
 * playing field a bit to compensate for the fact that it's so much harder
 * for a writer to get in when there are already many readers present.
 *
 * A hybrid of round-robin and strict priority can be made to satisfy
 * all five criteria.  In this "writer priority policy" exiting readers
 * always grant the lock to waiting writers, but exiting writers only
 * grant the lock to readers of the same or higher priority than the
 * highest-priority blocked writer.  Thus requirement (2) is satisfied,
 * necessarily, by a willful act of priority inversion: an exiting reader
 * will grant the lock to a blocked writer even if there are blocked
 * readers of higher priority.  The situation is mitigated by the fact
 * that writers always inherit priority from blocked readers, and the
 * writer will awaken those readers as soon as it exits the lock.
 *
 * rw_downgrade() follows the same wakeup policy as an exiting writer.
 *
 * rw_tryupgrade() has the same failure mode as rw_tryenter() for a
 * write lock.  Both honor the WRITE_WANTED bit by specification.
 *
 * The following rules apply to manipulation of rwlock internal state:
 *
 * (1)	The rwlock is only modified via the atomic primitives casip()
 *	and atomic_add_ip().
 *
 * (2)	The waiters bit and write-wanted bit are only modified under
 *	turnstile_lookup().  This ensures that the turnstile is consistent
 *	with the rwlock.
 *
 * (3)	Waiters receive the lock by direct handoff from the previous
 *	owner.  Therefore, waiters *always* wake up holding the lock.
 */

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of a given type.
 */
static sobj_ops_t rw_sobj_ops = {
	SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri
};

/*
 * If the system panics on an rwlock, save the address of the offending
 * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock.
 */
static rwlock_impl_t panic_rwlock;
static rwlock_impl_t *panic_rwlock_addr;

static void
rw_panic(char *msg, rwlock_impl_t *lp)
{
	if (panicstr)
		return;

	if (casptr(&panic_rwlock_addr, NULL, lp) == NULL)
		panic_rwlock = *lp;

	panic("%s, lp=%p wwwh=%lx thread=%p",
	    msg, (void *)lp, panic_rwlock.rw_wwwh, (void *)curthread);
}

/* ARGSUSED */
void
rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg)
{
	((rwlock_impl_t *)rwlp)->rw_wwwh = 0;
}

void
rw_destroy(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	if (lp->rw_wwwh != 0) {
		if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK)
			rw_panic("rw_destroy: lock already destroyed", lp);
		else
			rw_panic("rw_destroy: lock still active", lp);
	}

	lp->rw_wwwh = RW_DOUBLE_LOCK;
}

/*
 * Verify that an rwlock is held correctly.
 */
static int
rw_locked(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old = lp->rw_wwwh;

	if (rw == RW_READER)
		return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));

	if (rw == RW_WRITER)
		return ((old & RW_OWNER) == (uintptr_t)curthread);

	return (0);
}

uint_t (*rw_lock_backoff)(uint_t) = NULL;
void (*rw_lock_delay)(uint_t) = NULL;

/*
 * Full-service implementation of rw_enter() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * The only semantic difference between calling rw_enter() and calling
 * rw_enter_sleep() directly is that we assume the caller has already done
 * a THREAD_KPRI_REQUEST() in the RW_READER case.
 */
void
rw_enter_sleep(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old, new, lock_value, lock_busy, lock_wait;
	hrtime_t sleep_time;
	turnstile_t *ts;
	uint_t backoff = 0;
	int loop_count = 0;

	if (rw == RW_READER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_CLAIMED;
		lock_wait = RW_HAS_WAITERS;
	} else {
		lock_value = RW_WRITE_LOCK(curthread);
		lock_busy = (uintptr_t)RW_LOCKED;
		lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
	}

	for (;;) {
		if (((old = lp->rw_wwwh) & lock_busy) == 0) {
			if (casip(&lp->rw_wwwh, old, old + lock_value) != old) {
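				/*
				 * The cas failed; another thread raced us.
				 * If the platform has installed backoff
				 * hooks, use them to ease contention on the
				 * lock's cache line before retrying.
				 */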
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		if (panicstr)
			return;

		if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) {
			rw_panic("rw_enter: bad rwlock", lp);
			return;
		}

		if ((old & RW_OWNER) == (uintptr_t)curthread) {
			rw_panic("recursive rw_enter", lp);
			return;
		}

		ts = turnstile_lookup(lp);

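		/*
		 * Set the waiters bit (and the write-wanted bit for a
		 * writer) only while the lock still appears busy; per the
		 * Big Theory Statement above, we must never record
		 * ourselves as a waiter on an available lock.
		 */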
		do {
			if (((old = lp->rw_wwwh) & lock_busy) == 0)
				break;
			new = old | lock_wait;
		} while (old != new && casip(&lp->rw_wwwh, old, new) != old);

		if ((old & lock_busy) == 0) {
			/*
			 * The lock appears free now; try the dance again
			 */
			turnstile_exit(lp);
			continue;
		}

		/*
		 * We really are going to block.  Bump the stats, and drop
		 * kpri if we're a reader.
		 */
		ASSERT(lp->rw_wwwh & lock_wait);
		ASSERT(lp->rw_wwwh & RW_LOCKED);

		sleep_time = -gethrtime();
		if (rw == RW_READER) {
			THREAD_KPRI_RELEASE();
			CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1);
			(void) turnstile_block(ts, TS_READER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		} else {
			CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1);
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		}
		sleep_time += gethrtime();

		LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw,
		    (old & RW_WRITE_LOCKED) ? 1 : 0,
		    old >> RW_HOLD_COUNT_SHIFT);

		/*
		 * We wake up holding the lock (and having kpri if we're
		 * a reader) via direct handoff from the previous owner.
		 */
		break;
	}

	ASSERT(rw_locked(lp, rw));

	membar_enter();

	LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw);
}

/*
 * Return the number of readers to wake, or zero if we should wake a writer.
 * Called only by exiting/downgrading writers (readers don't wake readers).
 */
static int
rw_readers_to_wake(turnstile_t *ts)
{
	kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;
	kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first;
	pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1;
	int count = 0;

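	/*
	 * Wake every blocked reader whose priority is at least that of
	 * the highest-priority blocked writer.  Bumping t_kpri_req here
	 * restores the kpri each reader dropped before blocking in
	 * rw_enter_sleep(), since the readers wake up holding the lock.
	 */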
	while (next_reader != NULL) {
		if (DISP_PRIO(next_reader) < wpri)
			break;
		next_reader->t_kpri_req++;
		next_reader = next_reader->t_link;
		count++;
	}
	return (count);
}

/*
 * Full-service implementation of rw_exit() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * There is no semantic difference between calling rw_exit() and calling
 * rw_exit_wakeup() directly.
 */
void
rw_exit_wakeup(rwlock_impl_t *lp)
{
	turnstile_t *ts;
	uintptr_t old, new, lock_value;
	kthread_t *next_writer;
	int nreaders;
	uint_t backoff = 0;
	int loop_count = 0;

	membar_exit();

	old = lp->rw_wwwh;
	if (old & RW_WRITE_LOCKED) {
		if ((old & RW_OWNER) != (uintptr_t)curthread) {
			rw_panic("rw_exit: not owner", lp);
			lp->rw_wwwh = 0;
			return;
		}
		lock_value = RW_WRITE_LOCK(curthread);
	} else {
		if ((old & RW_LOCKED) == 0) {
			rw_panic("rw_exit: lock not held", lp);
			return;
		}
		lock_value = RW_READ_LOCK;
	}

	for (;;) {
		/*
		 * If this is *not* the final exit of a lock with waiters,
		 * just drop the lock -- there's nothing tricky going on.
		 */
		old = lp->rw_wwwh;
		new = old - lock_value;
		if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) {
			if (casip(&lp->rw_wwwh, old, new) != old) {
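				/*
				 * As in rw_enter_sleep(), back off before
				 * retrying if the platform has installed
				 * backoff hooks.
				 */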
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		/*
		 * Perform the final exit of a lock that has waiters.
		 */
		ts = turnstile_lookup(lp);

		next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;

		if ((old & RW_WRITE_LOCKED) &&
		    (nreaders = rw_readers_to_wake(ts)) > 0) {
			/*
			 * Don't drop the lock -- just set the hold count
			 * such that we grant the lock to all readers at once.
			 */
			new = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters > nreaders)
				new |= RW_HAS_WAITERS;
			if (next_writer)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
		} else {
			/*
			 * Don't drop the lock -- just transfer ownership
			 * directly to next_writer.  Note that there must
			 * be at least one waiting writer, because we get
			 * here only if (A) the lock is read-locked or
			 * (B) there are no waiting readers.  In case (A),
			 * since the lock is read-locked there would be no
			 * reason for other readers to have blocked unless
			 * the RW_WRITE_WANTED bit was set.  In case (B),
			 * since there are waiters but no waiting readers,
			 * they must all be waiting writers.
			 */
			ASSERT(lp->rw_wwwh & RW_WRITE_WANTED);
			new = RW_WRITE_LOCK(next_writer);
			if (ts->ts_waiters > 1)
				new |= RW_HAS_WAITERS;
			if (next_writer->t_link)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer);
		}
		break;
	}

	if (lock_value == RW_READ_LOCK) {
		THREAD_KPRI_RELEASE();
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER);
	} else {
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER);
	}
}

int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old;

	if (rw == RW_READER) {
		uint_t backoff = 0;
		int loop_count = 0;
		THREAD_KPRI_REQUEST();
		for (;;) {
			if ((old = lp->rw_wwwh) & RW_WRITE_CLAIMED) {
				THREAD_KPRI_RELEASE();
				return (0);
			}
			if (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) == old)
				break;
			if (rw_lock_delay != NULL) {
				backoff = rw_lock_backoff(backoff);
				rw_lock_delay(backoff);
				if (++loop_count == ncpus_online) {
					backoff = 0;
					loop_count = 0;
				}
			}
		}
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	} else {
		if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0)
			return (0);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	}
	ASSERT(rw_locked(lp, rw));
	membar_enter();
	return (1);
}

void
rw_downgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	THREAD_KPRI_REQUEST();
	membar_exit();

	if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) {
		rw_panic("rw_downgrade: not owner", lp);
		return;
	}

	if (atomic_add_ip_nv(&lp->rw_wwwh,
	    RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) {
		turnstile_t *ts = turnstile_lookup(lp);
		int nreaders = rw_readers_to_wake(ts);
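		/*
		 * Grant the lock to the eligible readers in one atomic
		 * add; if they account for all remaining waiters, the
		 * waiters bit is cleared in the same operation.
		 */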
		if (nreaders > 0) {
			uintptr_t delta = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters == nreaders)
				delta -= RW_HAS_WAITERS;
			atomic_add_ip(&lp->rw_wwwh, delta);
		}
		turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
	}
	ASSERT(rw_locked(lp, RW_READER));
	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp);
}

int
rw_tryupgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old, new;

	ASSERT(rw_locked(lp, RW_READER));

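	/*
	 * The upgrade can succeed only if we are the sole reader and no
	 * writer has expressed interest: any additional hold, or a set
	 * RW_WRITE_WANTED bit, makes the test below fail.
	 */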
	do {
		if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK)
			return (0);
		new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK;
	} while (casip(&lp->rw_wwwh, old, new) != old);

	membar_enter();
	THREAD_KPRI_RELEASE();
	LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp);
	ASSERT(rw_locked(lp, RW_WRITER));
	return (1);
}

int
rw_read_held(krwlock_t *rwlp)
{
	uintptr_t tmp;

	return (_RW_READ_HELD(rwlp, tmp));
}

int
rw_write_held(krwlock_t *rwlp)
{
	return (_RW_WRITE_HELD(rwlp));
}

int
rw_lock_held(krwlock_t *rwlp)
{
	return (_RW_LOCK_HELD(rwlp));
}

/*
 * Like rw_read_held(), but ASSERTs that the lock is currently held
 */
int
rw_read_locked(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	ASSERT(old & RW_LOCKED);
	return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));
}

/*
 * Returns non-zero if the lock is either held or desired by a writer
 */
int
rw_iswriter(krwlock_t *rwlp)
{
	return (_RW_ISWRITER(rwlp));
}

kthread_t *
rw_owner(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL);
}