xref: /netbsd-src/sys/kern/kern_rwlock.c (revision 4e1f96faf8dda38f639610fb3e7474dee1c26ab2)
1 /*	$NetBSD: kern_rwlock.c,v 1.32 2009/05/16 08:36:32 yamt Exp $	*/
2 
3 /*-
4  * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe and Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Kernel reader/writer lock implementation, modeled after those
34  * found in Solaris, a description of which can be found in:
35  *
36  *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
37  *	    Richard McDougall.
38  */
39 
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.32 2009/05/16 08:36:32 yamt Exp $");
42 
43 #define	__RWLOCK_PRIVATE
44 
45 #include <sys/param.h>
46 #include <sys/proc.h>
47 #include <sys/rwlock.h>
48 #include <sys/sched.h>
49 #include <sys/sleepq.h>
50 #include <sys/systm.h>
51 #include <sys/lockdebug.h>
52 #include <sys/cpu.h>
53 #include <sys/atomic.h>
54 #include <sys/lock.h>
55 
56 #include <dev/lockstat.h>
57 
58 /*
59  * LOCKDEBUG
60  */
61 
#if defined(LOCKDEBUG)

/*
 * LOCKDEBUG hooks.  Each of the first three macros forwards to the
 * matching LOCKDEBUG_* primitive, passing the caller's return address
 * and whether the operation is a reader (op == RW_READER) one.
 *
 * None of the macros ends in a semicolon: the call site supplies it,
 * so no stray empty statement is left behind after expansion.
 */
#define	RW_WANTLOCK(rw, op, t)						\
	LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw),			\
	    (uintptr_t)__builtin_return_address(0), (op) == RW_READER, t)
#define	RW_LOCKED(rw, op)						\
	LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL,			\
	    (uintptr_t)__builtin_return_address(0), (op) == RW_READER)
#define	RW_UNLOCKED(rw, op)						\
	LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw),			\
	    (uintptr_t)__builtin_return_address(0), (op) == RW_READER)

/*
 * Debug-only assertion: on failure, report through rw_abort() with the
 * name of the calling function and the stringified condition.
 */
#define	RW_DASSERT(rw, cond)						\
do {									\
	if (!(cond))							\
		rw_abort(rw, __func__, "assertion failed: " #cond);	\
} while (/* CONSTCOND */ 0)

#else	/* LOCKDEBUG */

/* Without LOCKDEBUG all of the hooks compile away. */
#define	RW_WANTLOCK(rw, op, t)	/* nothing */
#define	RW_LOCKED(rw, op)	/* nothing */
#define	RW_UNLOCKED(rw, op)	/* nothing */
#define	RW_DASSERT(rw, cond)	/* nothing */

#endif	/* LOCKDEBUG */
87 
88 /*
89  * DIAGNOSTIC
90  */
91 
#if defined(DIAGNOSTIC)

/*
 * RW_ASSERT:
 *
 *	DIAGNOSTIC-only assertion.  If `cond' is false, call rw_abort()
 *	with the name of the enclosing function and a message built
 *	from the stringified condition.
 */
#define	RW_ASSERT(rw, cond)						\
do {									\
	if (!(cond))							\
		rw_abort(rw, __func__, "assertion failed: " #cond);	\
} while (/* CONSTCOND */ 0)

#else

/* Without DIAGNOSTIC the assertion (and its condition) vanish. */
#define	RW_ASSERT(rw, cond)	/* nothing */

#endif	/* DIAGNOSTIC */
105 
/*
 * Helpers for the RW_DEBUG bit, which is stored in rw_owner.
 *
 *	RW_SETDEBUG	set RW_DEBUG in the lock word if `on' is true.
 *	RW_DEBUG_P	true if RW_DEBUG is set in the lock word.
 *	RW_INHERITDEBUG	copy RW_DEBUG from the value `old' into the
 *			variable `new' (LOCKDEBUG kernels only).
 *
 * Every expansion is fully parenthesized so that it stays a single
 * expression wherever it is used.
 */
#define	RW_SETDEBUG(rw, on)		((rw)->rw_owner |= ((on) ? RW_DEBUG : 0))
#define	RW_DEBUG_P(rw)			(((rw)->rw_owner & RW_DEBUG) != 0)
#if defined(LOCKDEBUG)
#define	RW_INHERITDEBUG(new, old)	((new) |= ((old) & RW_DEBUG))
#else /* defined(LOCKDEBUG) */
#define	RW_INHERITDEBUG(new, old)	/* nothing */
#endif /* defined(LOCKDEBUG) */
113 
114 static void	rw_abort(krwlock_t *, const char *, const char *);
115 static void	rw_dump(volatile void *);
116 static lwp_t	*rw_owner(wchan_t);
117 
118 static inline uintptr_t
119 rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
120 {
121 
122 	RW_INHERITDEBUG(n, o);
123 	return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
124 	    (void *)o, (void *)n);
125 }
126 
127 static inline void
128 rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
129 {
130 
131 	RW_INHERITDEBUG(n, o);
132 	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
133 	    (void *)n);
134 	RW_DASSERT(rw, n == o);
135 }
136 
/*
 * For platforms that do not provide stubs, or for the LOCKDEBUG case.
 *
 * LOCKDEBUG kernels drop __HAVE_RW_STUBS.  Whenever that symbol is not
 * defined, the public names rw_enter(), rw_exit() and rw_tryenter()
 * are strong aliases for the rw_vector_*() routines in this file.
 */
#ifdef LOCKDEBUG
#undef	__HAVE_RW_STUBS
#endif

#ifndef __HAVE_RW_STUBS
__strong_alias(rw_enter,rw_vector_enter);
__strong_alias(rw_exit,rw_vector_exit);
__strong_alias(rw_tryenter,rw_vector_tryenter);
#endif
149 
/*
 * Lock class descriptor for rwlocks.  It is handed to LOCKDEBUG_ALLOC()
 * by rw_init() and to LOCKDEBUG_ABORT() by rw_abort().
 */
lockops_t rwlock_lockops = {
	"Reader / writer lock",		/* human-readable class name */
	LOCKOPS_SLEEP,			/* presumably: a sleeping lock type */
	rw_dump				/* routine that prints the lock state */
};
155 
/*
 * Synchronisation object descriptor passed to turnstile_block() when a
 * thread goes to sleep on a rwlock.  The last entry, rw_owner(),
 * reports the write owner of the lock.
 */
syncobj_t rw_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	turnstile_unsleep,
	turnstile_changepri,
	sleepq_lendpri,
	rw_owner,
};
163 
/*
 * Reference-counted rwlock objects, allocated from rw_obj_cache.
 *
 * ro_lock must remain the first member: rw_obj_alloc(), rw_obj_hold()
 * and rw_obj_free() convert between krwlock_t * and struct krwobj *
 * with a plain cast.
 */
#define	RW_OBJ_MAGIC	0x85d3c85d
struct krwobj {
	krwlock_t	ro_lock;	/* the lock handed out to callers */
	u_int		ro_magic;	/* RW_OBJ_MAGIC, set by rw_obj_ctor() */
	u_int		ro_refcnt;	/* number of references held */
};
171 
172 static int	rw_obj_ctor(void *, void *, int);
173 
174 static pool_cache_t	rw_obj_cache;
175 
176 /*
177  * rw_dump:
178  *
179  *	Dump the contents of a rwlock structure.
180  */
181 static void
182 rw_dump(volatile void *cookie)
183 {
184 	volatile krwlock_t *rw = cookie;
185 
186 	printf_nolog("owner/count  : %#018lx flags    : %#018x\n",
187 	    (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
188 }
189 
190 /*
191  * rw_abort:
192  *
193  *	Dump information about an error and panic the system.  This
194  *	generates a lot of machine code in the DIAGNOSTIC case, so
195  *	we ask the compiler to not inline it.
196  */
197 static void __noinline
198 rw_abort(krwlock_t *rw, const char *func, const char *msg)
199 {
200 
201 	if (panicstr != NULL)
202 		return;
203 
204 	LOCKDEBUG_ABORT(rw, &rwlock_lockops, func, msg);
205 }
206 
207 /*
208  * rw_init:
209  *
210  *	Initialize a rwlock for use.
211  */
212 void
213 rw_init(krwlock_t *rw)
214 {
215 	bool dodebug;
216 
217 	memset(rw, 0, sizeof(*rw));
218 
219 	dodebug = LOCKDEBUG_ALLOC(rw, &rwlock_lockops,
220 	    (uintptr_t)__builtin_return_address(0));
221 	RW_SETDEBUG(rw, dodebug);
222 }
223 
/*
 * rw_destroy:
 *
 *	Tear down a rwlock.  The lock must be completely idle: under
 *	DIAGNOSTIC it is an error for any bit other than RW_DEBUG to be
 *	set in the lock word.  The lock is then unregistered from
 *	LOCKDEBUG.
 */
void
rw_destroy(krwlock_t *rw)
{

	RW_ASSERT(rw, (rw->rw_owner & ~RW_DEBUG) == 0);
	LOCKDEBUG_FREE(RW_DEBUG_P(rw), rw);
}
236 
237 /*
238  * rw_onproc:
239  *
240  *	Return true if an rwlock owner is running on a CPU in the system.
241  *	If the target is waiting on the kernel big lock, then we must
242  *	release it.  This is necessary to avoid deadlock.
243  *
244  *	Note that we can't use the rwlock owner field as an LWP pointer.  We
245  *	don't have full control over the timing of our execution, and so the
246  *	pointer could be completely invalid by the time we dereference it.
247  */
248 static int
249 rw_onproc(uintptr_t owner, struct cpu_info **cip)
250 {
251 #ifdef MULTIPROCESSOR
252 	CPU_INFO_ITERATOR cii;
253 	struct cpu_info *ci;
254 	lwp_t *l;
255 
256 	if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED)
257 		return 0;
258 	l = (lwp_t *)(owner & RW_THREAD);
259 
260 	/* See if the target is running on a CPU somewhere. */
261 	if ((ci = *cip) != NULL && ci->ci_curlwp == l)
262 		goto run;
263 	for (CPU_INFO_FOREACH(cii, ci))
264 		if (ci->ci_curlwp == l)
265 			goto run;
266 
267 	/* No: it may be safe to block now. */
268 	*cip = NULL;
269 	return 0;
270 
271  run:
272  	/* Target is running; do we need to block? */
273  	*cip = ci;
274 	return ci->ci_biglock_wanted != l;
275 #else
276 	return 0;
277 #endif	/* MULTIPROCESSOR */
278 }
279 
280 /*
281  * rw_vector_enter:
282  *
283  *	Acquire a rwlock.
284  */
285 void
286 rw_vector_enter(krwlock_t *rw, const krw_t op)
287 {
288 	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
289 	struct cpu_info *ci;
290 	turnstile_t *ts;
291 	int queue;
292 	lwp_t *l;
293 	LOCKSTAT_TIMER(slptime);
294 	LOCKSTAT_TIMER(slpcnt);
295 	LOCKSTAT_TIMER(spintime);
296 	LOCKSTAT_COUNTER(spincnt);
297 	LOCKSTAT_FLAG(lsflag);
298 
299 	l = curlwp;
300 	curthread = (uintptr_t)l;
301 
302 	RW_ASSERT(rw, !cpu_intr_p());
303 	RW_ASSERT(rw, curthread != 0);
304 	RW_WANTLOCK(rw, op, false);
305 
306 	if (panicstr == NULL) {
307 		LOCKDEBUG_BARRIER(&kernel_lock, 1);
308 	}
309 
310 	/*
311 	 * We play a slight trick here.  If we're a reader, we want
312 	 * increment the read count.  If we're a writer, we want to
313 	 * set the owner field and whe WRITE_LOCKED bit.
314 	 *
315 	 * In the latter case, we expect those bits to be zero,
316 	 * therefore we can use an add operation to set them, which
317 	 * means an add operation for both cases.
318 	 */
319 	if (__predict_true(op == RW_READER)) {
320 		incr = RW_READ_INCR;
321 		set_wait = RW_HAS_WAITERS;
322 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
323 		queue = TS_READER_Q;
324 	} else {
325 		RW_DASSERT(rw, op == RW_WRITER);
326 		incr = curthread | RW_WRITE_LOCKED;
327 		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
328 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
329 		queue = TS_WRITER_Q;
330 	}
331 
332 	LOCKSTAT_ENTER(lsflag);
333 
334 	for (ci = NULL, owner = rw->rw_owner;;) {
335 		/*
336 		 * Read the lock owner field.  If the need-to-wait
337 		 * indicator is clear, then try to acquire the lock.
338 		 */
339 		if ((owner & need_wait) == 0) {
340 			next = rw_cas(rw, owner, (owner + incr) &
341 			    ~RW_WRITE_WANTED);
342 			if (__predict_true(next == owner)) {
343 				/* Got it! */
344 				membar_enter();
345 				break;
346 			}
347 
348 			/*
349 			 * Didn't get it -- spin around again (we'll
350 			 * probably sleep on the next iteration).
351 			 */
352 			owner = next;
353 			continue;
354 		}
355 
356 		if (__predict_false(panicstr != NULL))
357 			return;
358 		if (__predict_false(RW_OWNER(rw) == curthread))
359 			rw_abort(rw, __func__, "locking against myself");
360 
361 		/*
362 		 * If the lock owner is running on another CPU, and
363 		 * there are no existing waiters, then spin.
364 		 */
365 		if (rw_onproc(owner, &ci)) {
366 			LOCKSTAT_START_TIMER(lsflag, spintime);
367 			u_int count = SPINLOCK_BACKOFF_MIN;
368 			do {
369 				SPINLOCK_BACKOFF(count);
370 				owner = rw->rw_owner;
371 			} while (rw_onproc(owner, &ci));
372 			LOCKSTAT_STOP_TIMER(lsflag, spintime);
373 			LOCKSTAT_COUNT(spincnt, 1);
374 			if ((owner & need_wait) == 0)
375 				continue;
376 		}
377 
378 		/*
379 		 * Grab the turnstile chain lock.  Once we have that, we
380 		 * can adjust the waiter bits and sleep queue.
381 		 */
382 		ts = turnstile_lookup(rw);
383 
384 		/*
385 		 * Mark the rwlock as having waiters.  If the set fails,
386 		 * then we may not need to sleep and should spin again.
387 		 * Reload rw_owner because turnstile_lookup() may have
388 		 * spun on the turnstile chain lock.
389 		 */
390 		owner = rw->rw_owner;
391 		if ((owner & need_wait) == 0 || rw_onproc(owner, &ci)) {
392 			turnstile_exit(rw);
393 			continue;
394 		}
395 		next = rw_cas(rw, owner, owner | set_wait);
396 		if (__predict_false(next != owner)) {
397 			turnstile_exit(rw);
398 			owner = next;
399 			continue;
400 		}
401 
402 		LOCKSTAT_START_TIMER(lsflag, slptime);
403 		turnstile_block(ts, queue, rw, &rw_syncobj);
404 		LOCKSTAT_STOP_TIMER(lsflag, slptime);
405 		LOCKSTAT_COUNT(slpcnt, 1);
406 
407 		/*
408 		 * No need for a memory barrier because of context switch.
409 		 * If not handed the lock, then spin again.
410 		 */
411 		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
412 			break;
413 	}
414 
415 	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK |
416 	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime);
417 	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime);
418 	LOCKSTAT_EXIT(lsflag);
419 
420 	RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
421 	    (op == RW_READER && RW_COUNT(rw) != 0));
422 	RW_LOCKED(rw, op);
423 }
424 
/*
 * rw_vector_exit:
 *
 *	Release a rwlock held by the caller, either a write hold or one
 *	read hold; which one is determined from the RW_WRITE_LOCKED bit
 *	of the lock word.
 *
 *	If the release would not leave the lock unowned with waiters
 *	present, it is done with a single compare-and-swap.  Otherwise
 *	the lock is handed directly to sleeping threads under the
 *	turnstile chain lock.
 *
 *	Returns without doing anything while the system is panicking.
 */
void
rw_vector_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, new, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	curthread = (uintptr_t)curlwp;
	RW_ASSERT(rw, curthread != 0);

	if (__predict_false(panicstr != NULL))
		return;

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/*
	 * Compute what we expect the new value of the lock to be. Only
	 * proceed to do direct handoff if there are waiters, and if the
	 * lock would become unowned.
	 */
	membar_exit();
	for (;;) {
		new = (owner - decr);
		/* Unowned but with waiters: leave the loop and hand off. */
		if ((new & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
			break;
		next = rw_cas(rw, owner, new);
		if (__predict_true(next == owner))
			return;
		/* Lost a race; retry against the value actually observed. */
		owner = next;
	}

	/*
	 * Grab the turnstile chain lock.  This gets the interlock
	 * on the sleep queue.  Once we have that, we can adjust the
	 * waiter bits.
	 */
	ts = turnstile_lookup(rw);
	owner = rw->rw_owner;
	RW_DASSERT(rw, ts != NULL);
	RW_DASSERT(rw, (owner & RW_HAS_WAITERS) != 0);

	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
	rcnt = TS_WAITERS(ts, TS_READER_Q);

	/*
	 * Give the lock away.
	 *
	 * If we are releasing a write lock, then prefer to wake all
	 * outstanding readers.  Otherwise, wake one writer if there
	 * are outstanding readers, or all writers if there are no
	 * pending readers.  If waking one specific writer, the writer
	 * is handed the lock here.  If waking multiple writers, we
	 * set WRITE_WANTED to block out new readers, and let them
	 * do the work of acquiring the lock in rw_vector_enter().
	 */
	if (rcnt == 0 || decr == RW_READ_INCR) {
		RW_DASSERT(rw, wcnt != 0);
		RW_DASSERT(rw, (owner & RW_WRITE_WANTED) != 0);

		if (rcnt != 0) {
			/* Give the lock to the longest waiting writer. */
			l = TS_FIRST(ts, TS_WRITER_Q);
			new = (uintptr_t)l | RW_WRITE_LOCKED | RW_HAS_WAITERS;
			if (wcnt > 1)
				new |= RW_WRITE_WANTED;
			rw_swap(rw, owner, new);
			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
		} else {
			/* Wake all writers and let them fight it out. */
			rw_swap(rw, owner, RW_WRITE_WANTED);
			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
		}
	} else {
		RW_DASSERT(rw, rcnt != 0);

		/*
		 * Give the lock to all blocked readers.  If there
		 * is a writer waiting, new readers that arrive
		 * after the release will be blocked out.
		 */
		new = rcnt << RW_READ_COUNT_SHIFT;
		if (wcnt != 0)
			new |= RW_HAS_WAITERS | RW_WRITE_WANTED;

		/* Wake up all sleeping readers. */
		rw_swap(rw, owner, new);
		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
	}
}
535 
536 /*
537  * rw_vector_tryenter:
538  *
539  *	Try to acquire a rwlock.
540  */
541 int
542 rw_vector_tryenter(krwlock_t *rw, const krw_t op)
543 {
544 	uintptr_t curthread, owner, incr, need_wait, next;
545 
546 	curthread = (uintptr_t)curlwp;
547 
548 	RW_ASSERT(rw, curthread != 0);
549 
550 	if (op == RW_READER) {
551 		incr = RW_READ_INCR;
552 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
553 	} else {
554 		RW_DASSERT(rw, op == RW_WRITER);
555 		incr = curthread | RW_WRITE_LOCKED;
556 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
557 	}
558 
559 	for (owner = rw->rw_owner;; owner = next) {
560 		owner = rw->rw_owner;
561 		if (__predict_false((owner & need_wait) != 0))
562 			return 0;
563 		next = rw_cas(rw, owner, owner + incr);
564 		if (__predict_true(next == owner)) {
565 			/* Got it! */
566 			membar_enter();
567 			break;
568 		}
569 	}
570 
571 	RW_WANTLOCK(rw, op, true);
572 	RW_LOCKED(rw, op);
573 	RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
574 	    (op == RW_READER && RW_COUNT(rw) != 0));
575 
576 	return 1;
577 }
578 
/*
 * rw_downgrade:
 *
 *	Downgrade a write lock to a read lock.  The caller must hold
 *	the lock for writing and ends up holding one read hold.
 *
 *	With no waiters this is a single compare-and-swap.  Otherwise
 *	the change is made under the turnstile chain lock: any sleeping
 *	readers are given read holds and woken, and the waiter bits are
 *	kept if writers remain queued.
 */
void
rw_downgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, new, next;
	turnstile_t *ts;
	int rcnt, wcnt;

	curthread = (uintptr_t)curlwp;
	RW_ASSERT(rw, curthread != 0);
	RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
	RW_UNLOCKED(rw, RW_WRITER);

	membar_producer();
	owner = rw->rw_owner;
	if ((owner & RW_HAS_WAITERS) == 0) {
		/*
		 * There are no waiters, so we can do this the easy way.
		 * Try swapping us down to one read hold.  If it fails, the
		 * lock condition has changed and we most likely now have
		 * waiters.
		 */
		next = rw_cas(rw, owner, RW_READ_INCR);
		if (__predict_true(next == owner)) {
			RW_LOCKED(rw, RW_READER);
			RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
			RW_DASSERT(rw, RW_COUNT(rw) != 0);
			return;
		}
		owner = next;
	}

	/*
	 * Grab the turnstile chain lock.  This gets the interlock
	 * on the sleep queue.  Once we have that, we can adjust the
	 * waiter bits.
	 *
	 * Each pass retries with the lock word value returned by the
	 * previous failed compare-and-swap.
	 */
	for (;; owner = next) {
		ts = turnstile_lookup(rw);
		RW_DASSERT(rw, ts != NULL);

		rcnt = TS_WAITERS(ts, TS_READER_Q);
		wcnt = TS_WAITERS(ts, TS_WRITER_Q);

		/*
		 * If there are no readers, just preserve the waiters
		 * bits, swap us down to one read hold and return.
		 */
		if (rcnt == 0) {
			RW_DASSERT(rw, wcnt != 0);
			RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
			RW_DASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);

			new = RW_READ_INCR | RW_HAS_WAITERS | RW_WRITE_WANTED;
			next = rw_cas(rw, owner, new);
			/* Nobody to wake: drop the chain lock either way. */
			turnstile_exit(rw);
			if (__predict_true(next == owner))
				break;
		} else {
			/*
			 * Give the lock to all blocked readers.  We may
			 * retain one read hold if downgrading.  If there
			 * is a writer waiting, new readers will be blocked
			 * out.
			 */
			new = (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
			if (wcnt != 0)
				new |= RW_HAS_WAITERS | RW_WRITE_WANTED;

			next = rw_cas(rw, owner, new);
			if (__predict_true(next == owner)) {
				/* Wake up all sleeping readers. */
				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
				break;
			}
			/* Swap failed: release the chain lock and retry. */
			turnstile_exit(rw);
		}
	}

	RW_WANTLOCK(rw, RW_READER, false);
	RW_LOCKED(rw, RW_READER);
	RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
	RW_DASSERT(rw, RW_COUNT(rw) != 0);
}
668 
669 /*
670  * rw_tryupgrade:
671  *
672  *	Try to upgrade a read lock to a write lock.  We must be the
673  *	only reader.
674  */
675 int
676 rw_tryupgrade(krwlock_t *rw)
677 {
678 	uintptr_t owner, curthread, new, next;
679 
680 	curthread = (uintptr_t)curlwp;
681 	RW_ASSERT(rw, curthread != 0);
682 	RW_ASSERT(rw, rw_read_held(rw));
683 
684 	for (owner = rw->rw_owner;; owner = next) {
685 		RW_ASSERT(rw, (owner & RW_WRITE_LOCKED) == 0);
686 		if (__predict_false((owner & RW_THREAD) != RW_READ_INCR)) {
687 			RW_ASSERT(rw, (owner & RW_THREAD) != 0);
688 			return 0;
689 		}
690 		new = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
691 		next = rw_cas(rw, owner, new);
692 		if (__predict_true(next == owner)) {
693 			membar_producer();
694 			break;
695 		}
696 	}
697 
698 	RW_UNLOCKED(rw, RW_READER);
699 	RW_WANTLOCK(rw, RW_WRITER, true);
700 	RW_LOCKED(rw, RW_WRITER);
701 	RW_DASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
702 	RW_DASSERT(rw, RW_OWNER(rw) == curthread);
703 
704 	return 1;
705 }
706 
707 /*
708  * rw_read_held:
709  *
710  *	Returns true if the rwlock is held for reading.  Must only be
711  *	used for diagnostic assertions, and never be used to make
712  * 	decisions about how to use a rwlock.
713  */
714 int
715 rw_read_held(krwlock_t *rw)
716 {
717 	uintptr_t owner;
718 
719 	if (panicstr != NULL)
720 		return 1;
721 	if (rw == NULL)
722 		return 0;
723 	owner = rw->rw_owner;
724 	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
725 }
726 
727 /*
728  * rw_write_held:
729  *
730  *	Returns true if the rwlock is held for writing.  Must only be
731  *	used for diagnostic assertions, and never be used to make
732  *	decisions about how to use a rwlock.
733  */
734 int
735 rw_write_held(krwlock_t *rw)
736 {
737 
738 	if (panicstr != NULL)
739 		return 1;
740 	if (rw == NULL)
741 		return 0;
742 	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
743 	    (RW_WRITE_LOCKED | (uintptr_t)curlwp);
744 }
745 
746 /*
747  * rw_lock_held:
748  *
749  *	Returns true if the rwlock is held for reading or writing.  Must
750  *	only be used for diagnostic assertions, and never be used to make
751  *	decisions about how to use a rwlock.
752  */
753 int
754 rw_lock_held(krwlock_t *rw)
755 {
756 
757 	if (panicstr != NULL)
758 		return 1;
759 	if (rw == NULL)
760 		return 0;
761 	return (rw->rw_owner & RW_THREAD) != 0;
762 }
763 
764 /*
765  * rw_owner:
766  *
767  *	Return the current owner of an RW lock, but only if it is write
768  *	held.  Used for priority inheritance.
769  */
770 static lwp_t *
771 rw_owner(wchan_t obj)
772 {
773 	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
774 	uintptr_t owner = rw->rw_owner;
775 
776 	if ((owner & RW_WRITE_LOCKED) == 0)
777 		return NULL;
778 
779 	return (void *)(owner & RW_THREAD);
780 }
781 
/*
 * rw_obj_init:
 *
 *	Initialize the rw object store: create the pool cache, named
 *	"rwlock", from which rw_obj_alloc() takes struct krwobj items.
 *	rw_obj_ctor() is supplied as the cache's constructor.
 *
 *	NOTE(review): coherency_unit is passed as the second argument,
 *	presumably the item alignment -- confirm against pool_cache(9).
 */
void
rw_obj_init(void)
{

	rw_obj_cache = pool_cache_init(sizeof(struct krwobj),
	    coherency_unit, 0, 0, "rwlock", NULL, IPL_NONE, rw_obj_ctor,
	    NULL, NULL);
}
795 
796 /*
797  * rw_obj_ctor:
798  *
799  *	Initialize a new lock for the cache.
800  */
801 static int
802 rw_obj_ctor(void *arg, void *obj, int flags)
803 {
804 	struct krwobj * ro = obj;
805 
806 	ro->ro_magic = RW_OBJ_MAGIC;
807 
808 	return 0;
809 }
810 
811 /*
812  * rw_obj_alloc:
813  *
814  *	Allocate a single lock object.
815  */
816 krwlock_t *
817 rw_obj_alloc(void)
818 {
819 	struct krwobj *ro;
820 
821 	ro = pool_cache_get(rw_obj_cache, PR_WAITOK);
822 	rw_init(&ro->ro_lock);
823 	ro->ro_refcnt = 1;
824 
825 	return (krwlock_t *)ro;
826 }
827 
/*
 * rw_obj_hold:
 *
 *	Add a single reference to a lock object.  A reference to the object
 *	must already be held, and must be held across this call.
 *
 *	`lock' must have come from rw_obj_alloc(): it is cast back to
 *	the enclosing struct krwobj, and the magic number and a
 *	non-zero reference count are asserted.
 */
void
rw_obj_hold(krwlock_t *lock)
{
	struct krwobj *ro = (struct krwobj *)lock;

	KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
	KASSERT(ro->ro_refcnt > 0);

	/* The count is shared between holders, so bump it atomically. */
	atomic_inc_uint(&ro->ro_refcnt);
}
844 
845 /*
846  * rw_obj_free:
847  *
848  *	Drop a reference from a lock object.  If the last reference is being
849  *	dropped, free the object and return true.  Otherwise, return false.
850  */
851 bool
852 rw_obj_free(krwlock_t *lock)
853 {
854 	struct krwobj *ro = (struct krwobj *)lock;
855 
856 	KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
857 	KASSERT(ro->ro_refcnt > 0);
858 
859 	if (atomic_dec_uint_nv(&ro->ro_refcnt) > 0) {
860 		return false;
861 	}
862 	rw_destroy(&ro->ro_lock);
863 	pool_cache_put(rw_obj_cache, ro);
864 	return true;
865 }
866