10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*6138Ssvemuri * Common Development and Distribution License (the "License"). 6*6138Ssvemuri * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 22*6138Ssvemuri * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 
240Sstevel@tonic-gate */ 250Sstevel@tonic-gate 260Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 270Sstevel@tonic-gate 280Sstevel@tonic-gate #include <sys/param.h> 290Sstevel@tonic-gate #include <sys/thread.h> 300Sstevel@tonic-gate #include <sys/cmn_err.h> 310Sstevel@tonic-gate #include <sys/debug.h> 320Sstevel@tonic-gate #include <sys/cpuvar.h> 330Sstevel@tonic-gate #include <sys/sobject.h> 340Sstevel@tonic-gate #include <sys/turnstile.h> 350Sstevel@tonic-gate #include <sys/rwlock.h> 360Sstevel@tonic-gate #include <sys/rwlock_impl.h> 370Sstevel@tonic-gate #include <sys/atomic.h> 380Sstevel@tonic-gate #include <sys/lockstat.h> 390Sstevel@tonic-gate 400Sstevel@tonic-gate /* 410Sstevel@tonic-gate * Big Theory Statement for readers/writer locking primitives. 420Sstevel@tonic-gate * 430Sstevel@tonic-gate * An rwlock provides exclusive access to a single thread ("writer") or 440Sstevel@tonic-gate * concurrent access to multiple threads ("readers"). See rwlock(9F) 450Sstevel@tonic-gate * for a full description of the interfaces and programming model. 460Sstevel@tonic-gate * The rest of this comment describes the implementation. 470Sstevel@tonic-gate * 480Sstevel@tonic-gate * An rwlock is a single word with the following structure: 490Sstevel@tonic-gate * 500Sstevel@tonic-gate * --------------------------------------------------------------------- 510Sstevel@tonic-gate * | OWNER (writer) or HOLD COUNT (readers) | WRLOCK | WRWANT | WAIT | 520Sstevel@tonic-gate * --------------------------------------------------------------------- 530Sstevel@tonic-gate * 63 / 31 .. 3 2 1 0 540Sstevel@tonic-gate * 550Sstevel@tonic-gate * The waiters bit (0) indicates whether any threads are blocked waiting 560Sstevel@tonic-gate * for the lock. The write-wanted bit (1) indicates whether any threads 570Sstevel@tonic-gate * are blocked waiting for write access. 
 * The write-locked bit (2) indicates
 * whether the lock is held by a writer, which determines whether the upper
 * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner
 * (thread pointer) or the hold count (number of readers).
 *
 * In the absence of any contention, a writer gets the lock by setting
 * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock
 * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK).
 *
 * A writer will fail to acquire the lock if any other thread owns it.
 * A reader will fail if the lock is either owned or wanted by a writer.
 * rw_tryenter() returns 0 in these cases; rw_enter() blocks until the
 * lock becomes available.
 *
 * When a thread blocks it acquires the rwlock's hashed turnstile lock and
 * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case)
 * atomically *only if the lock still appears busy*.  A thread must never
 * accidentally block for an available lock since there would be no owner
 * to awaken it.  casip() provides the required atomicity.  Once casip()
 * succeeds, the decision to block becomes final and irreversible.  The
 * thread will not become runnable again until it has been granted ownership
 * of the lock via direct handoff from a former owner as described below.
 *
 * In the absence of any waiters, rw_exit() just clears the lock (if it
 * is write-locked) or decrements the hold count (if it is read-locked).
 * Note that even if waiters are present, decrementing the hold count
 * to a non-zero value requires no special action since the lock is still
 * held by at least one other thread.
 *
 * On the "final exit" (transition to unheld state) of a lock with waiters,
 * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly
 * to the next writer or set of readers.  There are several advantages to this
 * approach: (1) it closes all windows for priority inversion (when a new
 * writer has grabbed the lock but has not yet inherited from blocked readers);
 * (2) it prevents starvation of equal-priority threads by granting the lock
 * in FIFO order; (3) it eliminates the need for a write-wanted count -- a
 * single bit suffices because the lock remains held until all waiting
 * writers are gone; (4) when we awaken N readers we can perform a single
 * "atomic_add(&x, N)" to set the total hold count rather than having all N
 * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup.
 *
 * The most interesting policy decision in rw_exit_wakeup() is which thread
 * to wake.  Starvation is always possible with priority-based scheduling,
 * but any sane wakeup policy should at least satisfy these requirements:
 *
 *	(1) The highest-priority thread in the system should not starve.
 *	(2) The highest-priority writer should not starve.
 *	(3) No writer should starve due to lower-priority threads.
 *	(4) No reader should starve due to lower-priority writers.
 *	(5) If all threads have equal priority, none of them should starve.
 *
 * We used to employ a writers-always-win policy, which doesn't even
 * satisfy (1): a steady stream of low-priority writers can starve out
 * a real-time reader!  This is clearly a broken policy -- it violates
 * (1), (4), and (5) -- but it's how rwlocks always used to behave.
 *
 * A round-robin policy (exiting readers grant the lock to blocked writers
 * and vice versa) satisfies all but (3): a single high-priority writer
 * and many low-priority readers can starve out medium-priority writers.
 *
 * A strict priority policy (grant the lock to the highest priority blocked
 * thread) satisfies everything but (2): a steady stream of high-priority
 * readers can permanently starve the highest-priority writer.
 *
 * The reason we care about (2) is that it's important to process writers
 * reasonably quickly -- even if they're low priority -- because their very
 * presence causes all readers to take the slow (blocking) path through this
 * code.  There is also a general sense that writers deserve some degree of
 * deference because they're updating the data upon which all readers act.
 * Presumably this data should not be allowed to become arbitrarily stale
 * due to writer starvation.  Finally, it seems reasonable to level the
 * playing field a bit to compensate for the fact that it's so much harder
 * for a writer to get in when there are already many readers present.
 *
 * A hybrid of round-robin and strict priority can be made to satisfy
 * all five criteria.  In this "writer priority policy" exiting readers
 * always grant the lock to waiting writers, but exiting writers only
 * grant the lock to readers of the same or higher priority than the
 * highest-priority blocked writer.  Thus requirement (2) is satisfied,
 * necessarily, by a willful act of priority inversion: an exiting reader
 * will grant the lock to a blocked writer even if there are blocked
 * readers of higher priority.  The situation is mitigated by the fact
 * that writers always inherit priority from blocked readers, and the
 * writer will awaken those readers as soon as it exits the lock.
 *
 * rw_downgrade() follows the same wakeup policy as an exiting writer.
 *
 * rw_tryupgrade() has the same failure mode as rw_tryenter() for a
 * write lock.  Both honor the WRITE_WANTED bit by specification.
 *
 * The following rules apply to manipulation of rwlock internal state:
 *
 *	(1) The rwlock is only modified via the atomic primitives casip()
 *	    and atomic_add_ip().
 *
 *	(2) The waiters bit and write-wanted bit are only modified under
 *	    turnstile_lookup().
This ensures that the turnstile is consistent 1540Sstevel@tonic-gate * with the rwlock. 1550Sstevel@tonic-gate * 1560Sstevel@tonic-gate * (3) Waiters receive the lock by direct handoff from the previous 1570Sstevel@tonic-gate * owner. Therefore, waiters *always* wake up holding the lock. 1580Sstevel@tonic-gate */ 1590Sstevel@tonic-gate 1600Sstevel@tonic-gate /* 1610Sstevel@tonic-gate * The sobj_ops vector exports a set of functions needed when a thread 1620Sstevel@tonic-gate * is asleep on a synchronization object of a given type. 1630Sstevel@tonic-gate */ 1640Sstevel@tonic-gate static sobj_ops_t rw_sobj_ops = { 1650Sstevel@tonic-gate SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri 1660Sstevel@tonic-gate }; 1670Sstevel@tonic-gate 1680Sstevel@tonic-gate /* 1690Sstevel@tonic-gate * If the system panics on an rwlock, save the address of the offending 1700Sstevel@tonic-gate * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock. 1710Sstevel@tonic-gate */ 1720Sstevel@tonic-gate static rwlock_impl_t panic_rwlock; 1730Sstevel@tonic-gate static rwlock_impl_t *panic_rwlock_addr; 1740Sstevel@tonic-gate 1750Sstevel@tonic-gate static void 1760Sstevel@tonic-gate rw_panic(char *msg, rwlock_impl_t *lp) 1770Sstevel@tonic-gate { 1780Sstevel@tonic-gate if (panicstr) 1790Sstevel@tonic-gate return; 1800Sstevel@tonic-gate 1810Sstevel@tonic-gate if (casptr(&panic_rwlock_addr, NULL, lp) == NULL) 1820Sstevel@tonic-gate panic_rwlock = *lp; 1830Sstevel@tonic-gate 1840Sstevel@tonic-gate panic("%s, lp=%p wwwh=%lx thread=%p", 1850Sstevel@tonic-gate msg, lp, panic_rwlock.rw_wwwh, curthread); 1860Sstevel@tonic-gate } 1870Sstevel@tonic-gate 1880Sstevel@tonic-gate /* ARGSUSED */ 1890Sstevel@tonic-gate void 1900Sstevel@tonic-gate rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg) 1910Sstevel@tonic-gate { 1920Sstevel@tonic-gate ((rwlock_impl_t *)rwlp)->rw_wwwh = 0; 1930Sstevel@tonic-gate } 1940Sstevel@tonic-gate 1950Sstevel@tonic-gate void 
1960Sstevel@tonic-gate rw_destroy(krwlock_t *rwlp) 1970Sstevel@tonic-gate { 1980Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 1990Sstevel@tonic-gate 2000Sstevel@tonic-gate if (lp->rw_wwwh != 0) { 2010Sstevel@tonic-gate if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) 2020Sstevel@tonic-gate rw_panic("rw_destroy: lock already destroyed", lp); 2030Sstevel@tonic-gate else 2040Sstevel@tonic-gate rw_panic("rw_destroy: lock still active", lp); 2050Sstevel@tonic-gate } 2060Sstevel@tonic-gate 2070Sstevel@tonic-gate lp->rw_wwwh = RW_DOUBLE_LOCK; 2080Sstevel@tonic-gate } 2090Sstevel@tonic-gate 2100Sstevel@tonic-gate /* 2110Sstevel@tonic-gate * Verify that an rwlock is held correctly. 2120Sstevel@tonic-gate */ 2130Sstevel@tonic-gate static int 2140Sstevel@tonic-gate rw_locked(rwlock_impl_t *lp, krw_t rw) 2150Sstevel@tonic-gate { 2160Sstevel@tonic-gate uintptr_t old = lp->rw_wwwh; 2170Sstevel@tonic-gate 2180Sstevel@tonic-gate if (rw == RW_READER) 2190Sstevel@tonic-gate return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED)); 2200Sstevel@tonic-gate 2210Sstevel@tonic-gate if (rw == RW_WRITER) 2220Sstevel@tonic-gate return ((old & RW_OWNER) == (uintptr_t)curthread); 2230Sstevel@tonic-gate 2240Sstevel@tonic-gate return (0); 2250Sstevel@tonic-gate } 2260Sstevel@tonic-gate 227*6138Ssvemuri uint_t (*rw_lock_backoff)(uint_t) = NULL; 228*6138Ssvemuri void (*rw_lock_delay)(uint_t) = NULL; 229*6138Ssvemuri 2300Sstevel@tonic-gate /* 2310Sstevel@tonic-gate * Full-service implementation of rw_enter() to handle all the hard cases. 2320Sstevel@tonic-gate * Called from the assembly version if anything complicated is going on. 2330Sstevel@tonic-gate * The only semantic difference between calling rw_enter() and calling 2340Sstevel@tonic-gate * rw_enter_sleep() directly is that we assume the caller has already done 2350Sstevel@tonic-gate * a THREAD_KPRI_REQUEST() in the RW_READER case. 
2360Sstevel@tonic-gate */ 2370Sstevel@tonic-gate void 2380Sstevel@tonic-gate rw_enter_sleep(rwlock_impl_t *lp, krw_t rw) 2390Sstevel@tonic-gate { 2400Sstevel@tonic-gate uintptr_t old, new, lock_value, lock_busy, lock_wait; 2410Sstevel@tonic-gate hrtime_t sleep_time; 2420Sstevel@tonic-gate turnstile_t *ts; 243*6138Ssvemuri uint_t backoff = 0; 244*6138Ssvemuri int loop_count = 0; 2450Sstevel@tonic-gate 2460Sstevel@tonic-gate if (rw == RW_READER) { 2470Sstevel@tonic-gate lock_value = RW_READ_LOCK; 2480Sstevel@tonic-gate lock_busy = RW_WRITE_CLAIMED; 2490Sstevel@tonic-gate lock_wait = RW_HAS_WAITERS; 2500Sstevel@tonic-gate } else { 2510Sstevel@tonic-gate lock_value = RW_WRITE_LOCK(curthread); 2520Sstevel@tonic-gate lock_busy = (uintptr_t)RW_LOCKED; 2530Sstevel@tonic-gate lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED; 2540Sstevel@tonic-gate } 2550Sstevel@tonic-gate 2560Sstevel@tonic-gate for (;;) { 2570Sstevel@tonic-gate if (((old = lp->rw_wwwh) & lock_busy) == 0) { 258*6138Ssvemuri if (casip(&lp->rw_wwwh, old, old + lock_value) != old) { 259*6138Ssvemuri if (rw_lock_delay != NULL) { 260*6138Ssvemuri backoff = rw_lock_backoff(backoff); 261*6138Ssvemuri rw_lock_delay(backoff); 262*6138Ssvemuri if (++loop_count == ncpus_online) { 263*6138Ssvemuri backoff = 0; 264*6138Ssvemuri loop_count = 0; 265*6138Ssvemuri } 266*6138Ssvemuri } 2670Sstevel@tonic-gate continue; 268*6138Ssvemuri } 2690Sstevel@tonic-gate break; 2700Sstevel@tonic-gate } 2710Sstevel@tonic-gate 2720Sstevel@tonic-gate if (panicstr) 2730Sstevel@tonic-gate return; 2740Sstevel@tonic-gate 2750Sstevel@tonic-gate if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) { 2760Sstevel@tonic-gate rw_panic("rw_enter: bad rwlock", lp); 2770Sstevel@tonic-gate return; 2780Sstevel@tonic-gate } 2790Sstevel@tonic-gate 2800Sstevel@tonic-gate if ((old & RW_OWNER) == (uintptr_t)curthread) { 2810Sstevel@tonic-gate rw_panic("recursive rw_enter", lp); 2820Sstevel@tonic-gate return; 2830Sstevel@tonic-gate } 2840Sstevel@tonic-gate 
2850Sstevel@tonic-gate ts = turnstile_lookup(lp); 2860Sstevel@tonic-gate 2870Sstevel@tonic-gate do { 2880Sstevel@tonic-gate if (((old = lp->rw_wwwh) & lock_busy) == 0) 2890Sstevel@tonic-gate break; 2900Sstevel@tonic-gate new = old | lock_wait; 2910Sstevel@tonic-gate } while (old != new && casip(&lp->rw_wwwh, old, new) != old); 2920Sstevel@tonic-gate 2930Sstevel@tonic-gate if ((old & lock_busy) == 0) { 2940Sstevel@tonic-gate /* 2950Sstevel@tonic-gate * The lock appears free now; try the dance again 2960Sstevel@tonic-gate */ 2970Sstevel@tonic-gate turnstile_exit(lp); 2980Sstevel@tonic-gate continue; 2990Sstevel@tonic-gate } 3000Sstevel@tonic-gate 3010Sstevel@tonic-gate /* 3020Sstevel@tonic-gate * We really are going to block. Bump the stats, and drop 3030Sstevel@tonic-gate * kpri if we're a reader. 3040Sstevel@tonic-gate */ 3050Sstevel@tonic-gate ASSERT(lp->rw_wwwh & lock_wait); 3060Sstevel@tonic-gate ASSERT(lp->rw_wwwh & RW_LOCKED); 3070Sstevel@tonic-gate 3080Sstevel@tonic-gate sleep_time = -gethrtime(); 3090Sstevel@tonic-gate if (rw == RW_READER) { 3100Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 3110Sstevel@tonic-gate CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1); 3120Sstevel@tonic-gate (void) turnstile_block(ts, TS_READER_Q, lp, 3130Sstevel@tonic-gate &rw_sobj_ops, NULL, NULL); 3140Sstevel@tonic-gate } else { 3150Sstevel@tonic-gate CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1); 3160Sstevel@tonic-gate (void) turnstile_block(ts, TS_WRITER_Q, lp, 3170Sstevel@tonic-gate &rw_sobj_ops, NULL, NULL); 3180Sstevel@tonic-gate } 3190Sstevel@tonic-gate sleep_time += gethrtime(); 3200Sstevel@tonic-gate 3210Sstevel@tonic-gate LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw, 3220Sstevel@tonic-gate (old & RW_WRITE_LOCKED) ? 1 : 0, 3230Sstevel@tonic-gate old >> RW_HOLD_COUNT_SHIFT); 3240Sstevel@tonic-gate 3250Sstevel@tonic-gate /* 3260Sstevel@tonic-gate * We wake up holding the lock (and having kpri if we're 3270Sstevel@tonic-gate * a reader) via direct handoff from the previous owner. 
3280Sstevel@tonic-gate */ 3290Sstevel@tonic-gate break; 3300Sstevel@tonic-gate } 3310Sstevel@tonic-gate 3320Sstevel@tonic-gate ASSERT(rw_locked(lp, rw)); 3330Sstevel@tonic-gate 3340Sstevel@tonic-gate membar_enter(); 3350Sstevel@tonic-gate 3360Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw); 3370Sstevel@tonic-gate } 3380Sstevel@tonic-gate 3390Sstevel@tonic-gate /* 3400Sstevel@tonic-gate * Return the number of readers to wake, or zero if we should wake a writer. 3410Sstevel@tonic-gate * Called only by exiting/downgrading writers (readers don't wake readers). 3420Sstevel@tonic-gate */ 3430Sstevel@tonic-gate static int 3440Sstevel@tonic-gate rw_readers_to_wake(turnstile_t *ts) 3450Sstevel@tonic-gate { 3460Sstevel@tonic-gate kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first; 3470Sstevel@tonic-gate kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first; 3480Sstevel@tonic-gate pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1; 3490Sstevel@tonic-gate int count = 0; 3500Sstevel@tonic-gate 3510Sstevel@tonic-gate while (next_reader != NULL) { 3520Sstevel@tonic-gate if (DISP_PRIO(next_reader) < wpri) 3530Sstevel@tonic-gate break; 3540Sstevel@tonic-gate next_reader->t_kpri_req++; 3550Sstevel@tonic-gate next_reader = next_reader->t_link; 3560Sstevel@tonic-gate count++; 3570Sstevel@tonic-gate } 3580Sstevel@tonic-gate return (count); 3590Sstevel@tonic-gate } 3600Sstevel@tonic-gate 3610Sstevel@tonic-gate /* 3620Sstevel@tonic-gate * Full-service implementation of rw_exit() to handle all the hard cases. 3630Sstevel@tonic-gate * Called from the assembly version if anything complicated is going on. 3640Sstevel@tonic-gate * There is no semantic difference between calling rw_exit() and calling 3650Sstevel@tonic-gate * rw_exit_wakeup() directly. 
3660Sstevel@tonic-gate */ 3670Sstevel@tonic-gate void 3680Sstevel@tonic-gate rw_exit_wakeup(rwlock_impl_t *lp) 3690Sstevel@tonic-gate { 3700Sstevel@tonic-gate turnstile_t *ts; 3710Sstevel@tonic-gate uintptr_t old, new, lock_value; 3720Sstevel@tonic-gate kthread_t *next_writer; 3730Sstevel@tonic-gate int nreaders; 374*6138Ssvemuri uint_t backoff = 0; 375*6138Ssvemuri int loop_count = 0; 3760Sstevel@tonic-gate 3770Sstevel@tonic-gate membar_exit(); 3780Sstevel@tonic-gate 3790Sstevel@tonic-gate old = lp->rw_wwwh; 3800Sstevel@tonic-gate if (old & RW_WRITE_LOCKED) { 3810Sstevel@tonic-gate if ((old & RW_OWNER) != (uintptr_t)curthread) { 3820Sstevel@tonic-gate rw_panic("rw_exit: not owner", lp); 3830Sstevel@tonic-gate lp->rw_wwwh = 0; 3840Sstevel@tonic-gate return; 3850Sstevel@tonic-gate } 3860Sstevel@tonic-gate lock_value = RW_WRITE_LOCK(curthread); 3870Sstevel@tonic-gate } else { 3880Sstevel@tonic-gate if ((old & RW_LOCKED) == 0) { 3890Sstevel@tonic-gate rw_panic("rw_exit: lock not held", lp); 3900Sstevel@tonic-gate return; 3910Sstevel@tonic-gate } 3920Sstevel@tonic-gate lock_value = RW_READ_LOCK; 3930Sstevel@tonic-gate } 3940Sstevel@tonic-gate 3950Sstevel@tonic-gate for (;;) { 3960Sstevel@tonic-gate /* 3970Sstevel@tonic-gate * If this is *not* the final exit of a lock with waiters, 3980Sstevel@tonic-gate * just drop the lock -- there's nothing tricky going on. 
3990Sstevel@tonic-gate */ 4000Sstevel@tonic-gate old = lp->rw_wwwh; 4010Sstevel@tonic-gate new = old - lock_value; 4020Sstevel@tonic-gate if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) { 403*6138Ssvemuri if (casip(&lp->rw_wwwh, old, new) != old) { 404*6138Ssvemuri if (rw_lock_delay != NULL) { 405*6138Ssvemuri backoff = rw_lock_backoff(backoff); 406*6138Ssvemuri rw_lock_delay(backoff); 407*6138Ssvemuri if (++loop_count == ncpus_online) { 408*6138Ssvemuri backoff = 0; 409*6138Ssvemuri loop_count = 0; 410*6138Ssvemuri } 411*6138Ssvemuri } 4120Sstevel@tonic-gate continue; 413*6138Ssvemuri } 4140Sstevel@tonic-gate break; 4150Sstevel@tonic-gate } 4160Sstevel@tonic-gate 4170Sstevel@tonic-gate /* 4180Sstevel@tonic-gate * Perform the final exit of a lock that has waiters. 4190Sstevel@tonic-gate */ 4200Sstevel@tonic-gate ts = turnstile_lookup(lp); 4210Sstevel@tonic-gate 4220Sstevel@tonic-gate next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first; 4230Sstevel@tonic-gate 4240Sstevel@tonic-gate if ((old & RW_WRITE_LOCKED) && 4250Sstevel@tonic-gate (nreaders = rw_readers_to_wake(ts)) > 0) { 4260Sstevel@tonic-gate /* 4270Sstevel@tonic-gate * Don't drop the lock -- just set the hold count 4280Sstevel@tonic-gate * such that we grant the lock to all readers at once. 4290Sstevel@tonic-gate */ 4300Sstevel@tonic-gate new = nreaders * RW_READ_LOCK; 4310Sstevel@tonic-gate if (ts->ts_waiters > nreaders) 4320Sstevel@tonic-gate new |= RW_HAS_WAITERS; 4330Sstevel@tonic-gate if (next_writer) 4340Sstevel@tonic-gate new |= RW_WRITE_WANTED; 4350Sstevel@tonic-gate lp->rw_wwwh = new; 4360Sstevel@tonic-gate membar_enter(); 4370Sstevel@tonic-gate turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL); 4380Sstevel@tonic-gate } else { 4390Sstevel@tonic-gate /* 4400Sstevel@tonic-gate * Don't drop the lock -- just transfer ownership 4410Sstevel@tonic-gate * directly to next_writer. 
Note that there must 4420Sstevel@tonic-gate * be at least one waiting writer, because we get 4430Sstevel@tonic-gate * here only if (A) the lock is read-locked or 4440Sstevel@tonic-gate * (B) there are no waiting readers. In case (A), 4450Sstevel@tonic-gate * since the lock is read-locked there would be no 4460Sstevel@tonic-gate * reason for other readers to have blocked unless 4470Sstevel@tonic-gate * the RW_WRITE_WANTED bit was set. In case (B), 4480Sstevel@tonic-gate * since there are waiters but no waiting readers, 4490Sstevel@tonic-gate * they must all be waiting writers. 4500Sstevel@tonic-gate */ 4510Sstevel@tonic-gate ASSERT(lp->rw_wwwh & RW_WRITE_WANTED); 4520Sstevel@tonic-gate new = RW_WRITE_LOCK(next_writer); 4530Sstevel@tonic-gate if (ts->ts_waiters > 1) 4540Sstevel@tonic-gate new |= RW_HAS_WAITERS; 4550Sstevel@tonic-gate if (next_writer->t_link) 4560Sstevel@tonic-gate new |= RW_WRITE_WANTED; 4570Sstevel@tonic-gate lp->rw_wwwh = new; 4580Sstevel@tonic-gate membar_enter(); 4590Sstevel@tonic-gate turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer); 4600Sstevel@tonic-gate } 4610Sstevel@tonic-gate break; 4620Sstevel@tonic-gate } 4630Sstevel@tonic-gate 4640Sstevel@tonic-gate if (lock_value == RW_READ_LOCK) { 4650Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 4660Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER); 4670Sstevel@tonic-gate } else { 4680Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER); 4690Sstevel@tonic-gate } 4700Sstevel@tonic-gate } 4710Sstevel@tonic-gate 4720Sstevel@tonic-gate int 4730Sstevel@tonic-gate rw_tryenter(krwlock_t *rwlp, krw_t rw) 4740Sstevel@tonic-gate { 4750Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 4760Sstevel@tonic-gate uintptr_t old; 4770Sstevel@tonic-gate 4780Sstevel@tonic-gate if (rw == RW_READER) { 479*6138Ssvemuri uint_t backoff = 0; 480*6138Ssvemuri int loop_count = 0; 4810Sstevel@tonic-gate THREAD_KPRI_REQUEST(); 482*6138Ssvemuri for (;;) { 4830Sstevel@tonic-gate if ((old 
= lp->rw_wwwh) & RW_WRITE_CLAIMED) { 4840Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 4850Sstevel@tonic-gate return (0); 4860Sstevel@tonic-gate } 487*6138Ssvemuri if (casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) == old) 488*6138Ssvemuri break; 489*6138Ssvemuri if (rw_lock_delay != NULL) { 490*6138Ssvemuri backoff = rw_lock_backoff(backoff); 491*6138Ssvemuri rw_lock_delay(backoff); 492*6138Ssvemuri if (++loop_count == ncpus_online) { 493*6138Ssvemuri backoff = 0; 494*6138Ssvemuri loop_count = 0; 495*6138Ssvemuri } 496*6138Ssvemuri } 497*6138Ssvemuri } 4980Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw); 4990Sstevel@tonic-gate } else { 5000Sstevel@tonic-gate if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0) 5010Sstevel@tonic-gate return (0); 5020Sstevel@tonic-gate LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw); 5030Sstevel@tonic-gate } 5040Sstevel@tonic-gate ASSERT(rw_locked(lp, rw)); 5050Sstevel@tonic-gate membar_enter(); 5060Sstevel@tonic-gate return (1); 5070Sstevel@tonic-gate } 5080Sstevel@tonic-gate 5090Sstevel@tonic-gate void 5100Sstevel@tonic-gate rw_downgrade(krwlock_t *rwlp) 5110Sstevel@tonic-gate { 5120Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 5130Sstevel@tonic-gate 5140Sstevel@tonic-gate THREAD_KPRI_REQUEST(); 5150Sstevel@tonic-gate membar_exit(); 5160Sstevel@tonic-gate 5170Sstevel@tonic-gate if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) { 5180Sstevel@tonic-gate rw_panic("rw_downgrade: not owner", lp); 5190Sstevel@tonic-gate return; 5200Sstevel@tonic-gate } 5210Sstevel@tonic-gate 5220Sstevel@tonic-gate if (atomic_add_ip_nv(&lp->rw_wwwh, 5230Sstevel@tonic-gate RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) { 5240Sstevel@tonic-gate turnstile_t *ts = turnstile_lookup(lp); 5250Sstevel@tonic-gate int nreaders = rw_readers_to_wake(ts); 5260Sstevel@tonic-gate if (nreaders > 0) { 5270Sstevel@tonic-gate uintptr_t delta = nreaders * RW_READ_LOCK; 5280Sstevel@tonic-gate if (ts->ts_waiters == nreaders) 
5290Sstevel@tonic-gate delta -= RW_HAS_WAITERS; 5300Sstevel@tonic-gate atomic_add_ip(&lp->rw_wwwh, delta); 5310Sstevel@tonic-gate } 5320Sstevel@tonic-gate turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL); 5330Sstevel@tonic-gate } 5340Sstevel@tonic-gate ASSERT(rw_locked(lp, RW_READER)); 5350Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp); 5360Sstevel@tonic-gate } 5370Sstevel@tonic-gate 5380Sstevel@tonic-gate int 5390Sstevel@tonic-gate rw_tryupgrade(krwlock_t *rwlp) 5400Sstevel@tonic-gate { 5410Sstevel@tonic-gate rwlock_impl_t *lp = (rwlock_impl_t *)rwlp; 5420Sstevel@tonic-gate uintptr_t old, new; 5430Sstevel@tonic-gate 5440Sstevel@tonic-gate ASSERT(rw_locked(lp, RW_READER)); 5450Sstevel@tonic-gate 5460Sstevel@tonic-gate do { 5470Sstevel@tonic-gate if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK) 5480Sstevel@tonic-gate return (0); 5490Sstevel@tonic-gate new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK; 5500Sstevel@tonic-gate } while (casip(&lp->rw_wwwh, old, new) != old); 5510Sstevel@tonic-gate 5520Sstevel@tonic-gate membar_enter(); 5530Sstevel@tonic-gate THREAD_KPRI_RELEASE(); 5540Sstevel@tonic-gate LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp); 5550Sstevel@tonic-gate ASSERT(rw_locked(lp, RW_WRITER)); 5560Sstevel@tonic-gate return (1); 5570Sstevel@tonic-gate } 5580Sstevel@tonic-gate 5590Sstevel@tonic-gate int 5600Sstevel@tonic-gate rw_read_held(krwlock_t *rwlp) 5610Sstevel@tonic-gate { 5620Sstevel@tonic-gate uintptr_t tmp; 5630Sstevel@tonic-gate 5640Sstevel@tonic-gate return (_RW_READ_HELD(rwlp, tmp)); 5650Sstevel@tonic-gate } 5660Sstevel@tonic-gate 5670Sstevel@tonic-gate int 5680Sstevel@tonic-gate rw_write_held(krwlock_t *rwlp) 5690Sstevel@tonic-gate { 5700Sstevel@tonic-gate return (_RW_WRITE_HELD(rwlp)); 5710Sstevel@tonic-gate } 5720Sstevel@tonic-gate 5730Sstevel@tonic-gate int 5740Sstevel@tonic-gate rw_lock_held(krwlock_t *rwlp) 5750Sstevel@tonic-gate { 5760Sstevel@tonic-gate return (_RW_LOCK_HELD(rwlp)); 5770Sstevel@tonic-gate 
} 5780Sstevel@tonic-gate 5790Sstevel@tonic-gate /* 5800Sstevel@tonic-gate * Like rw_read_held(), but ASSERTs that the lock is currently held 5810Sstevel@tonic-gate */ 5820Sstevel@tonic-gate int 5830Sstevel@tonic-gate rw_read_locked(krwlock_t *rwlp) 5840Sstevel@tonic-gate { 5850Sstevel@tonic-gate uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh; 5860Sstevel@tonic-gate 5870Sstevel@tonic-gate ASSERT(old & RW_LOCKED); 5880Sstevel@tonic-gate return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED)); 5890Sstevel@tonic-gate } 5900Sstevel@tonic-gate 5910Sstevel@tonic-gate /* 5920Sstevel@tonic-gate * Returns non-zero if the lock is either held or desired by a writer 5930Sstevel@tonic-gate */ 5940Sstevel@tonic-gate int 5950Sstevel@tonic-gate rw_iswriter(krwlock_t *rwlp) 5960Sstevel@tonic-gate { 5970Sstevel@tonic-gate return (_RW_ISWRITER(rwlp)); 5980Sstevel@tonic-gate } 5990Sstevel@tonic-gate 6000Sstevel@tonic-gate kthread_t * 6010Sstevel@tonic-gate rw_owner(krwlock_t *rwlp) 6020Sstevel@tonic-gate { 6030Sstevel@tonic-gate uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh; 6040Sstevel@tonic-gate 6050Sstevel@tonic-gate return ((old & RW_WRITE_LOCKED) ? (kthread_t *)(old & RW_OWNER) : NULL); 6060Sstevel@tonic-gate } 607