/*	$OpenBSD: kern_rwlock.c,v 1.50 2023/07/14 07:07:08 claudio Exp $	*/

/*
 * Copyright (c) 2002, 2003 Artur Grabowski <art@openbsd.org>
 * Copyright (c) 2011 Thordur Bjornsson <thib@secnorth.net>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/limits.h>
#include <sys/atomic.h>
#include <sys/witness.h>

void	rw_do_exit(struct rwlock *, unsigned long);

/* XXX - temporary measure until proc0 is properly aligned */
#define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)

/*
 * Other OSes implement more sophisticated mechanisms to decide how long the
 * process attempting to acquire the lock should keep spinning. We start with
 * the simplest approach: we make at most RW_SPINS attempts before giving up
 * and putting the process on the sleep queue.
 */
#define RW_SPINS	1000

#ifdef MULTIPROCESSOR
#define rw_cas(p, o, n)	(atomic_cas_ulong(p, o, n) != o)
#else
static inline int
rw_cas(volatile unsigned long *p, unsigned long o, unsigned long n)
{
	if (*p != o)
		return (1);
	*p = n;

	return (0);
}
#endif
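
/*
 * Note that in both variants rw_cas() returns zero when the swap succeeded
 * and nonzero when the lock word did not match the expected value, unlike
 * atomic_cas_ulong(), which returns the previous value itself.
 */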

/*
 * Magic wand for lock operations. Every operation checks whether certain
 * flags are set and, if they are not, increments the lock by some value
 * (which may need some computing in a few cases). If the operation fails,
 * we need to set certain flags while waiting for the lock.
 *
 * RW_WRITE	The lock must be completely empty. We increment it with
 *		RWLOCK_WRLOCK and the proc pointer of the holder.
 *		Sets RWLOCK_WAIT|RWLOCK_WRWANT while waiting.
 * RW_READ	RWLOCK_WRLOCK|RWLOCK_WRWANT may not be set. We increment
 *		with RWLOCK_READ_INCR. RWLOCK_WAIT while waiting.
 */
static const struct rwlock_op {
	unsigned long inc;
	unsigned long check;
	unsigned long wait_set;
	long proc_mult;
	int wait_prio;
} rw_ops[] = {
	{	/* RW_WRITE */
		RWLOCK_WRLOCK,
		ULONG_MAX,
		RWLOCK_WAIT | RWLOCK_WRWANT,
		1,
		PLOCK - 4
	},
	{	/* RW_READ */
		RWLOCK_READ_INCR,
		RWLOCK_WRLOCK | RWLOCK_WRWANT,
		RWLOCK_WAIT,
		0,
		PLOCK
	},
	{	/* Sparse Entry. */
		0,
	},
	{	/* RW_DOWNGRADE */
		RWLOCK_READ_INCR - RWLOCK_WRLOCK,
		0,
		0,
		-1,
		PLOCK
	},
};
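
/*
 * Worked example (illustrative only): for RW_WRITE, rw_enter() below
 * computes
 *	inc   = RWLOCK_WRLOCK + RW_PROC(curproc) * 1
 *	check = ULONG_MAX
 * so it may proceed only while rwl_owner is 0, and on success the owner
 * word becomes RW_PROC(curproc) | RWLOCK_WRLOCK.  For RW_DOWNGRADE the
 * proc_mult of -1 strips the owner pointer again, leaving a read count
 * of one.
 */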

void
rw_enter_read(struct rwlock *rwl)
{
	unsigned long owner = rwl->rwl_owner;

	if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
	    rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
		rw_enter(rwl, RW_READ);
	else {
		membar_enter_after_atomic();
		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL);
		WITNESS_LOCK(&rwl->rwl_lock_obj, 0);
	}
}

void
rw_enter_write(struct rwlock *rwl)
{
	struct proc *p = curproc;

	if (__predict_false(rw_cas(&rwl->rwl_owner, 0,
	    RW_PROC(p) | RWLOCK_WRLOCK)))
		rw_enter(rwl, RW_WRITE);
	else {
		membar_enter_after_atomic();
		WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
		    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
		WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
	}
}

void
rw_exit_read(struct rwlock *rwl)
{
	unsigned long owner;

	rw_assert_rdlock(rwl);
	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);

	membar_exit_before_atomic();
	owner = rwl->rwl_owner;
	if (__predict_false((owner & RWLOCK_WAIT) ||
	    rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
		rw_do_exit(rwl, 0);
}

void
rw_exit_write(struct rwlock *rwl)
{
	unsigned long owner;

	rw_assert_wrlock(rwl);
	WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);

	membar_exit_before_atomic();
	owner = rwl->rwl_owner;
	if (__predict_false((owner & RWLOCK_WAIT) ||
	    rw_cas(&rwl->rwl_owner, owner, 0)))
		rw_do_exit(rwl, RWLOCK_WRLOCK);
}
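
/*
 * Typical usage of the fast-path entry points above (a minimal sketch;
 * "sc" and "sc_lock" are hypothetical):
 *
 *	rw_enter_read(&sc->sc_lock);
 *	... read shared state ...
 *	rw_exit_read(&sc->sc_lock);
 *
 *	rw_enter_write(&sc->sc_lock);
 *	... modify shared state ...
 *	rw_exit_write(&sc->sc_lock);
 */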

#ifdef DIAGNOSTIC
/*
 * Put the diagnostic functions here to keep the main code free
 * from ifdef clutter.
 */
static void
rw_enter_diag(struct rwlock *rwl, int flags)
{
	switch (flags & RW_OPMASK) {
	case RW_WRITE:
	case RW_READ:
		if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
			panic("rw_enter: %s locking against myself",
			    rwl->rwl_name);
		break;
	case RW_DOWNGRADE:
		/*
		 * If we're downgrading, we must hold the write lock.
		 */
		if ((rwl->rwl_owner & RWLOCK_WRLOCK) == 0)
			panic("rw_enter: %s downgrade of non-write lock",
			    rwl->rwl_name);
		if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
			panic("rw_enter: %s downgrade, not holder",
			    rwl->rwl_name);
		break;

	default:
		panic("rw_enter: unknown op 0x%x", flags);
	}
}

#else
#define rw_enter_diag(r, f)
#endif

static void
_rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags,
    const struct lock_type *type)
{
	rwl->rwl_owner = 0;
	rwl->rwl_name = name;

#ifdef WITNESS
	rwl->rwl_lock_obj.lo_flags = lo_flags;
	rwl->rwl_lock_obj.lo_name = name;
	rwl->rwl_lock_obj.lo_type = type;
	WITNESS_INIT(&rwl->rwl_lock_obj, type);
#else
	(void)type;
	(void)lo_flags;
#endif
}

void
_rw_init_flags(struct rwlock *rwl, const char *name, int flags,
    const struct lock_type *type)
{
	_rw_init_flags_witness(rwl, name, RWLOCK_LO_FLAGS(flags), type);
}

int
rw_enter(struct rwlock *rwl, int flags)
{
	const struct rwlock_op *op;
	unsigned long inc, o;
#ifdef MULTIPROCESSOR
	/*
	 * If the process holds the kernel lock, we want to give up the CPU
	 * as soon as possible so other processes waiting for the kernel lock
	 * can make progress.  Hence no spinning if we hold the kernel lock.
	 */
	unsigned int spin = (_kernel_lock_held()) ? 0 : RW_SPINS;
#endif
	int error, prio;
#ifdef WITNESS
	int lop_flags;

	lop_flags = LOP_NEWORDER;
	if (flags & RW_WRITE)
		lop_flags |= LOP_EXCLUSIVE;
	if (flags & RW_DUPOK)
		lop_flags |= LOP_DUPOK;
	if ((flags & RW_NOSLEEP) == 0 && (flags & RW_DOWNGRADE) == 0)
		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
#endif

	op = &rw_ops[(flags & RW_OPMASK) - 1];

	inc = op->inc + RW_PROC(curproc) * op->proc_mult;
retry:
	while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) {
		unsigned long set = o | op->wait_set;
		int do_sleep;

		/* Avoid deadlocks after panic or in DDB */
		if (panicstr || db_active)
			return (0);

#ifdef MULTIPROCESSOR
		/*
		 * It makes sense to spin for a while in case the lock
		 * is held by a writer.
		 */
		if ((o & RWLOCK_WRLOCK) && (spin != 0)) {
			spin--;
			CPU_BUSY_CYCLE();
			continue;
		}
#endif

		rw_enter_diag(rwl, flags);

		if (flags & RW_NOSLEEP)
			return (EBUSY);

		prio = op->wait_prio;
		if (flags & RW_INTR)
			prio |= PCATCH;
		sleep_setup(rwl, prio, rwl->rwl_name);

		do_sleep = !rw_cas(&rwl->rwl_owner, o, set);

		error = sleep_finish(0, do_sleep);
		if ((flags & RW_INTR) && (error != 0))
			return (error);
		if (flags & RW_SLEEPFAIL)
			return (EAGAIN);
	}

	if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
		goto retry;
	membar_enter_after_atomic();

	/*
	 * If the old lock value had RWLOCK_WAIT and RWLOCK_WRLOCK set, we
	 * downgraded a write lock that had possible read waiters; wake them
	 * up so they can retry the lock.
	 */
	if (__predict_false((o & (RWLOCK_WRLOCK|RWLOCK_WAIT)) ==
	    (RWLOCK_WRLOCK|RWLOCK_WAIT)))
		wakeup(rwl);

	if (flags & RW_DOWNGRADE)
		WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags);
	else
		WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);

	return (0);
}
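
/*
 * Example of the slow-path flags (illustrative; "sc_lock" is hypothetical):
 *
 *	error = rw_enter(&sc->sc_lock, RW_WRITE | RW_INTR);
 *	if (error != 0)
 *		return (error);		interrupted by a signal
 *
 * With RW_NOSLEEP the call never sleeps and instead returns EBUSY if the
 * lock cannot be taken immediately.
 */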

void
rw_exit(struct rwlock *rwl)
{
	unsigned long wrlock;

	/* Avoid deadlocks after panic or in DDB */
	if (panicstr || db_active)
		return;

	wrlock = rwl->rwl_owner & RWLOCK_WRLOCK;
	if (wrlock)
		rw_assert_wrlock(rwl);
	else
		rw_assert_rdlock(rwl);
	WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);

	membar_exit_before_atomic();
	rw_do_exit(rwl, wrlock);
}

/* A membar_exit_before_atomic() must precede any call of this function. */
void
rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
{
	unsigned long owner, set;

	do {
		owner = rwl->rwl_owner;
		if (wrlock)
			set = 0;
		else
			set = (owner - RWLOCK_READ_INCR) &
				~(RWLOCK_WAIT|RWLOCK_WRWANT);
		/*
		 * Potential MP race here.  If the owner had WRWANT set, we
		 * cleared it and a reader can sneak in before a writer.
		 */
	} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));

	if (owner & RWLOCK_WAIT)
		wakeup(rwl);
}

int
rw_status(struct rwlock *rwl)
{
	unsigned long owner = rwl->rwl_owner;

	if (owner & RWLOCK_WRLOCK) {
		if (RW_PROC(curproc) == RW_PROC(owner))
			return RW_WRITE;
		else
			return RW_WRITE_OTHER;
	}
	if (owner)
		return RW_READ;
	return (0);
}

#ifdef DIAGNOSTIC
void
rw_assert_wrlock(struct rwlock *rwl)
{
	if (panicstr || db_active)
		return;

#ifdef WITNESS
	witness_assert(&rwl->rwl_lock_obj, LA_XLOCKED);
#else
	if (!(rwl->rwl_owner & RWLOCK_WRLOCK))
		panic("%s: lock not held", rwl->rwl_name);

	if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
		panic("%s: lock not held by this process", rwl->rwl_name);
#endif
}

void
rw_assert_rdlock(struct rwlock *rwl)
{
	if (panicstr || db_active)
		return;

#ifdef WITNESS
	witness_assert(&rwl->rwl_lock_obj, LA_SLOCKED);
#else
	if (!RW_PROC(rwl->rwl_owner) || (rwl->rwl_owner & RWLOCK_WRLOCK))
		panic("%s: lock not shared", rwl->rwl_name);
#endif
}

void
rw_assert_anylock(struct rwlock *rwl)
{
	if (panicstr || db_active)
		return;

#ifdef WITNESS
	witness_assert(&rwl->rwl_lock_obj, LA_LOCKED);
#else
	switch (rw_status(rwl)) {
	case RW_WRITE_OTHER:
		panic("%s: lock held by different process", rwl->rwl_name);
	case 0:
		panic("%s: lock not held", rwl->rwl_name);
	}
#endif
}

void
rw_assert_unlocked(struct rwlock *rwl)
{
	if (panicstr || db_active)
		return;

#ifdef WITNESS
	witness_assert(&rwl->rwl_lock_obj, LA_UNLOCKED);
#else
	if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
		panic("%s: lock held", rwl->rwl_name);
#endif
}
#endif
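
/*
 * Example (illustrative; "sc_lock" is hypothetical): a function that must
 * only be called with the write lock held can enforce that with
 *
 *	rw_assert_wrlock(&sc->sc_lock);
 *
 * which panics when the requirement is violated (checked through witness
 * when it is enabled).
 */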

/* Recursive rwlocks. */
void
_rrw_init_flags(struct rrwlock *rrwl, const char *name, int flags,
    const struct lock_type *type)
{
	memset(rrwl, 0, sizeof(struct rrwlock));
	_rw_init_flags_witness(&rrwl->rrwl_lock, name, RRWLOCK_LO_FLAGS(flags),
	    type);
}

int
rrw_enter(struct rrwlock *rrwl, int flags)
{
	int	rv;

	if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
		if (flags & RW_RECURSEFAIL)
			return (EDEADLK);
		else {
			rrwl->rrwl_wcnt++;
			WITNESS_LOCK(&rrwl->rrwl_lock.rwl_lock_obj,
			    LOP_EXCLUSIVE);
			return (0);
		}
	}

	rv = rw_enter(&rrwl->rrwl_lock, flags);
	if (rv == 0)
		rrwl->rrwl_wcnt = 1;

	return (rv);
}

void
rrw_exit(struct rrwlock *rrwl)
{
	if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
		KASSERT(rrwl->rrwl_wcnt > 0);
		rrwl->rrwl_wcnt--;
		if (rrwl->rrwl_wcnt != 0) {
			WITNESS_UNLOCK(&rrwl->rrwl_lock.rwl_lock_obj,
			    LOP_EXCLUSIVE);
			return;
		}
	}

	rw_exit(&rrwl->rrwl_lock);
}

int
rrw_status(struct rrwlock *rrwl)
{
	return (rw_status(&rrwl->rrwl_lock));
}
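
/*
 * Example (illustrative; "sc_rrwlock" is hypothetical): the write holder
 * may re-enter its own lock, and the lock is only released by the final,
 * matching rrw_exit().
 *
 *	rrw_enter(&sc->sc_rrwlock, RW_WRITE);
 *	rrw_enter(&sc->sc_rrwlock, RW_WRITE);	recursion, rrwl_wcnt == 2
 *	rrw_exit(&sc->sc_rrwlock);
 *	rrw_exit(&sc->sc_rrwlock);		lock actually released here
 *
 * Passing RW_RECURSEFAIL instead makes the recursive call fail with EDEADLK.
 */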

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define	RWLOCK_OBJ_MAGIC	0x5aa3c85d
struct rwlock_obj {
	struct rwlock	ro_lock;
	u_int		ro_magic;
	u_int		ro_refcnt;
};

struct pool rwlock_obj_pool;

/*
 * rw_obj_init:
 *
 *	Initialize the rwlock object store.
 */
void
rw_obj_init(void)
{
	pool_init(&rwlock_obj_pool, sizeof(struct rwlock_obj), 0, IPL_MPFLOOR,
	    PR_WAITOK, "rwobjpl", NULL);
}

/*
 * rw_obj_alloc:
 *
 *	Allocate a single lock object.
 */
void
_rw_obj_alloc_flags(struct rwlock **lock, const char *name, int flags,
    struct lock_type *type)
{
	struct rwlock_obj *mo;

	mo = pool_get(&rwlock_obj_pool, PR_WAITOK);
	mo->ro_magic = RWLOCK_OBJ_MAGIC;
	_rw_init_flags(&mo->ro_lock, name, flags, type);
	mo->ro_refcnt = 1;

	*lock = &mo->ro_lock;
}

/*
 * rw_obj_hold:
 *
 *	Add a single reference to a lock object.  A reference to the object
 *	must already be held, and must be held across this call.
 */
void
rw_obj_hold(struct rwlock *lock)
{
	struct rwlock_obj *mo = (struct rwlock_obj *)lock;

	KASSERTMSG(mo->ro_magic == RWLOCK_OBJ_MAGIC,
	    "%s: lock %p: mo->ro_magic (%#x) != RWLOCK_OBJ_MAGIC (%#x)",
	     __func__, mo, mo->ro_magic, RWLOCK_OBJ_MAGIC);
	KASSERTMSG(mo->ro_refcnt > 0,
	    "%s: lock %p: mo->ro_refcnt (%#x) == 0",
	     __func__, mo, mo->ro_refcnt);

	atomic_inc_int(&mo->ro_refcnt);
}

/*
 * rw_obj_free:
 *
 *	Drop a reference from a lock object.  If the last reference is being
 *	dropped, free the object and return true.  Otherwise, return false.
 */
int
rw_obj_free(struct rwlock *lock)
{
	struct rwlock_obj *mo = (struct rwlock_obj *)lock;

	KASSERTMSG(mo->ro_magic == RWLOCK_OBJ_MAGIC,
	    "%s: lock %p: mo->ro_magic (%#x) != RWLOCK_OBJ_MAGIC (%#x)",
	     __func__, mo, mo->ro_magic, RWLOCK_OBJ_MAGIC);
	KASSERTMSG(mo->ro_refcnt > 0,
	    "%s: lock %p: mo->ro_refcnt (%#x) == 0",
	     __func__, mo, mo->ro_refcnt);

	if (atomic_dec_int_nv(&mo->ro_refcnt) > 0) {
		return false;
	}
#if notyet
	WITNESS_DESTROY(&mo->ro_lock);
#endif
	pool_put(&rwlock_obj_pool, mo);
	return true;
}
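
/*
 * Typical reference-counted lifecycle (a sketch; assumes the rw_obj_alloc()
 * wrapper declared in <sys/rwlock.h> and a hypothetical "sc" structure
 * holding the pointer):
 *
 *	rw_obj_alloc(&sc->sc_lock, "examplelk");	ro_refcnt == 1
 *	rw_obj_hold(sc->sc_lock);			ro_refcnt == 2
 *	...
 *	rw_obj_free(sc->sc_lock);			ro_refcnt == 1, kept
 *	rw_obj_free(sc->sc_lock);			ro_refcnt == 0, freed
 */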