xref: /dflybsd-src/sys/kern/kern_mutex.c (revision 617f6bd6553bb7523bb0198e7aaa5d9c36fb6e05)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Implement fast persistent locks based on atomic_cmpset_int() with
36  * semantics similar to lockmgr locks but faster and taking up much less
37  * space.  Taken from HAMMER's lock implementation.
38  *
39  * These are meant to complement our LWKT tokens.  Tokens are only held
40  * while the thread is running.  Mutexes can be held across blocking
41  * conditions.
42  *
43  * - Exclusive priority over shared to prevent SMP starvation.
44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
45  * - locks can be asynchronous.
46  * - synchronous fast path if no blocking occurs (async callback is not
47  *   made in this case).
48  *
49  * Generally speaking any caller-supplied link state must be properly
50  * initialized before use.
51  *
52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
53  * functions here.
54  */
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/sysctl.h>
60 #include <sys/thread.h>
61 
62 #include <machine/cpufunc.h>
63 
64 #include <sys/thread2.h>
65 #include <sys/mutex2.h>
66 
67 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
68 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock);
69 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
70 
71 /*
72  * Exclusive-lock a mutex, block until acquired unless link is async.
73  * Recursion is allowed.
74  *
75  * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
76  * if async.  If immediately successful an async exclusive lock will return 0
77  * and not issue the async callback or link the link structure.  The caller
78  * must handle this case (typically this is an optimal code path).
79  *
80  * A tsleep() error can only be returned if PCATCH is specified in the flags.
81  */
82 static __inline int
83 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
84 {
85 	thread_t td;
86 	u_int	lock;
87 	u_int	nlock;
88 	int	error;
89 	int	isasync;
90 
91 	for (;;) {
92 		lock = mtx->mtx_lock;
93 		cpu_ccfence();
94 
95 		if (lock == 0) {
96 			nlock = MTX_EXCLUSIVE | 1;
97 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
98 				mtx->mtx_owner = curthread;
99 				cpu_sfence();
100 				link->state = MTX_LINK_ACQUIRED;
101 				error = 0;
102 				break;
103 			}
104 			continue;
105 		}
106 		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
107 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
108 			nlock = lock + 1;
109 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
110 				cpu_sfence();
111 				link->state = MTX_LINK_ACQUIRED;
112 				error = 0;
113 				break;
114 			}
115 			continue;
116 		}
117 
118 		/*
119 		 * We need MTX_LINKSPIN to manipulate exlink or
120 		 * shlink.
121 		 *
122 		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
123 		 * pending exclusive requests.  It cannot be set as a separate
124 		 * operation prior to acquiring MTX_LINKSPIN.
125 		 *
126 		 * To avoid unnecessary cpu cache traffic we poll
127 		 * for collisions.  It is also possible that EXWANTED
128 		 * state failing the above test was spurious, so all the
129 		 * tests must be repeated if we cannot obtain LINKSPIN
130 		 * with the prior state tests intact (i.e. don't reload
131 		 * the (lock) variable here, for heaven's sake!).
132 		 */
133 		if (lock & MTX_LINKSPIN) {
134 			cpu_pause();
135 			continue;
136 		}
137 		td = curthread;
138 		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
139 		++td->td_critcount;
140 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
141 			--td->td_critcount;
142 			continue;
143 		}
144 
145 		/*
146 		 * Check for early abort.
147 		 */
148 		if (link->state == MTX_LINK_ABORTED) {
149 			if (mtx->mtx_exlink == NULL) {
150 				atomic_clear_int(&mtx->mtx_lock,
151 						 MTX_LINKSPIN |
152 						 MTX_EXWANTED);
153 			} else {
154 				atomic_clear_int(&mtx->mtx_lock,
155 						 MTX_LINKSPIN);
156 			}
157 			--td->td_critcount;
158 			link->state = MTX_LINK_IDLE;
159 			error = ENOLCK;
160 			break;
161 		}
162 
163 		/*
164 		 * Add our link to the exlink list and release LINKSPIN.
165 		 */
166 		link->owner = td;
167 		link->state = MTX_LINK_LINKED_EX;
168 		if (mtx->mtx_exlink) {
169 			link->next = mtx->mtx_exlink;
170 			link->prev = link->next->prev;
171 			link->next->prev = link;
172 			link->prev->next = link;
173 		} else {
174 			link->next = link;
175 			link->prev = link;
176 			mtx->mtx_exlink = link;
177 		}
178 		isasync = (link->callback != NULL);
179 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
180 		--td->td_critcount;
181 
182 		/*
183 		 * If asynchronous lock request return without
184 		 * blocking, leave link structure linked.
185 		 */
186 		if (isasync) {
187 			error = EINPROGRESS;
188 			break;
189 		}
190 
191 		/*
192 		 * Wait for lock
193 		 */
194 		error = mtx_wait_link(mtx, link, flags, to);
195 		break;
196 	}
197 	return (error);
198 }
199 
200 int
201 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
202 {
203 	return(__mtx_lock_ex(mtx, link, flags, to));
204 }
205 
206 int
207 _mtx_lock_ex(mtx_t *mtx, int flags, int to)
208 {
209 	mtx_link_t link;
210 
211 	mtx_link_init(&link);
212 	return(__mtx_lock_ex(mtx, &link, flags, to));
213 }
214 
215 int
216 _mtx_lock_ex_quick(mtx_t *mtx)
217 {
218 	mtx_link_t link;
219 
220 	mtx_link_init(&link);
221 	return(__mtx_lock_ex(mtx, &link, 0, 0));
222 }
223 
224 /*
225  * Share-lock a mutex, block until acquired.  Recursion is allowed.
226  *
227  * Returns 0 on success, or the tsleep() return code on failure.
228  * An error can only be returned if PCATCH is specified in the flags.
229  *
230  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
231  *	 do not have to chain the wakeup().
232  */
233 static __inline int
234 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
235 {
236 	thread_t td;
237 	u_int	lock;
238 	u_int	nlock;
239 	int	error;
240 	int	isasync;
241 
242 	for (;;) {
243 		lock = mtx->mtx_lock;
244 		cpu_ccfence();
245 
246 		if (lock == 0) {
247 			nlock = 1;
248 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
249 				error = 0;
250 				cpu_sfence();
251 				link->state = MTX_LINK_ACQUIRED;
252 				break;
253 			}
254 			continue;
255 		}
256 		if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
257 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
258 			nlock = lock + 1;
259 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
260 				error = 0;
261 				cpu_sfence();
262 				link->state = MTX_LINK_ACQUIRED;
263 				break;
264 			}
265 			continue;
266 		}
267 
268 		/*
269 		 * We need MTX_LINKSPIN to manipulate exlink or
270 		 * shlink.
271 		 *
272 		 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
273 		 * pending shared requests.  It cannot be set as a separate
274 		 * operation prior to acquiring MTX_LINKSPIN.
275 		 *
276 		 * To avoid unnecessary cpu cache traffic we poll
277 		 * for collisions.  It is also possible that EXWANTED
278 		 * state failing the above test was spurious, so all the
279 		 * tests must be repeated if we cannot obtain LINKSPIN
280 		 * with the prior state tests intact (i.e. don't reload
281 		 * the (lock) variable here, for heaven's sake!).
282 		 */
283 		if (lock & MTX_LINKSPIN) {
284 			cpu_pause();
285 			continue;
286 		}
287 		td = curthread;
288 		nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
289 		++td->td_critcount;
290 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
291 			--td->td_critcount;
292 			continue;
293 		}
294 
295 		/*
296 		 * Check for early abort.  Other shared lock requestors
297 		 * could have sneaked in before we set LINKSPIN so make
298 		 * sure we undo the state properly.
299 		 */
300 		if (link->state == MTX_LINK_ABORTED) {
301 			if (mtx->mtx_shlink) {
302 				atomic_clear_int(&mtx->mtx_lock,
303 						 MTX_LINKSPIN);
304 			} else {
305 				atomic_clear_int(&mtx->mtx_lock,
306 						 MTX_LINKSPIN |
307 						 MTX_SHWANTED);
308 			}
309 			--td->td_critcount;
310 			link->state = MTX_LINK_IDLE;
311 			error = ENOLCK;
312 			break;
313 		}
314 
315 		/*
316 		 * Add our link to the shlink list and release LINKSPIN.
317 		 */
318 		link->owner = td;
319 		link->state = MTX_LINK_LINKED_SH;
320 		if (mtx->mtx_shlink) {
321 			link->next = mtx->mtx_shlink;
322 			link->prev = link->next->prev;
323 			link->next->prev = link;
324 			link->prev->next = link;
325 		} else {
326 			link->next = link;
327 			link->prev = link;
328 			mtx->mtx_shlink = link;
329 		}
330 		isasync = (link->callback != NULL);
331 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
332 		--td->td_critcount;
333 
334 		/*
335 		 * If asynchronous lock request return without
336 		 * blocking, leave link structure linked.
337 		 */
338 		if (isasync) {
339 			error = EINPROGRESS;
340 			break;
341 		}
342 
343 		/*
344 		 * Wait for lock
345 		 */
346 		error = mtx_wait_link(mtx, link, flags, to);
347 		break;
348 	}
349 	return (error);
350 }
351 
352 int
353 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
354 {
355 	return(__mtx_lock_sh(mtx, link, flags, to));
356 }
357 
358 int
359 _mtx_lock_sh(mtx_t *mtx, int flags, int to)
360 {
361 	mtx_link_t link;
362 
363 	mtx_link_init(&link);
364 	return(__mtx_lock_sh(mtx, &link, flags, to));
365 }
366 
367 int
368 _mtx_lock_sh_quick(mtx_t *mtx)
369 {
370 	mtx_link_t link;
371 
372 	mtx_link_init(&link);
373 	return(__mtx_lock_sh(mtx, &link, 0, 0));
374 }
375 
376 /*
377  * Get an exclusive spinlock the hard way.
378  */
379 void
380 _mtx_spinlock(mtx_t *mtx)
381 {
382 	u_int	lock;
383 	u_int	nlock;
384 	int	bb = 1;
385 	int	bo;
386 
387 	for (;;) {
388 		lock = mtx->mtx_lock;
389 		if (lock == 0) {
390 			nlock = MTX_EXCLUSIVE | 1;
391 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
392 				mtx->mtx_owner = curthread;
393 				break;
394 			}
395 		} else if ((lock & MTX_EXCLUSIVE) &&
396 			   mtx->mtx_owner == curthread) {
397 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
398 			nlock = lock + 1;
399 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
400 				break;
401 		} else {
402 			/* MWAIT here */
403 			if (bb < 1000)
404 				++bb;
405 			cpu_pause();
406 			for (bo = 0; bo < bb; ++bo)
407 				;
408 		}
409 		cpu_pause();
410 	}
411 }
412 
413 /*
414  * Attempt to acquire a spinlock, if we fail we must undo the
415  * gd->gd_spinlocks/gd->gd_curthead->td_critcount predisposition.
416  *
417  * Returns 0 on success, EAGAIN on failure.
418  */
419 int
420 _mtx_spinlock_try(mtx_t *mtx)
421 {
422 	globaldata_t gd = mycpu;
423 	u_int	lock;
424 	u_int	nlock;
425 	int	res = 0;
426 
427 	for (;;) {
428 		lock = mtx->mtx_lock;
429 		if (lock == 0) {
430 			nlock = MTX_EXCLUSIVE | 1;
431 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
432 				mtx->mtx_owner = gd->gd_curthread;
433 				break;
434 			}
435 		} else if ((lock & MTX_EXCLUSIVE) &&
436 			   mtx->mtx_owner == gd->gd_curthread) {
437 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
438 			nlock = lock + 1;
439 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
440 				break;
441 		} else {
442 			--gd->gd_spinlocks;
443 			cpu_ccfence();
444 			--gd->gd_curthread->td_critcount;
445 			res = EAGAIN;
446 			break;
447 		}
448 		cpu_pause();
449 	}
450 	return res;
451 }
452 
453 #if 0
454 
455 void
456 _mtx_spinlock_sh(mtx_t *mtx)
457 {
458 	u_int	lock;
459 	u_int	nlock;
460 	int	bb = 1;
461 	int	bo;
462 
463 	for (;;) {
464 		lock = mtx->mtx_lock;
465 		if ((lock & MTX_EXCLUSIVE) == 0) {
466 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
467 			nlock = lock + 1;
468 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
469 				break;
470 		} else {
471 			/* MWAIT here */
472 			if (bb < 1000)
473 				++bb;
474 			cpu_pause();
475 			for (bo = 0; bo < bb; ++bo)
476 				;
477 		}
478 		cpu_pause();
479 	}
480 }
481 
482 #endif
483 
484 int
485 _mtx_lock_ex_try(mtx_t *mtx)
486 {
487 	u_int	lock;
488 	u_int	nlock;
489 	int	error;
490 
491 	for (;;) {
492 		lock = mtx->mtx_lock;
493 		if (lock == 0) {
494 			nlock = MTX_EXCLUSIVE | 1;
495 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
496 				mtx->mtx_owner = curthread;
497 				error = 0;
498 				break;
499 			}
500 		} else if ((lock & MTX_EXCLUSIVE) &&
501 			   mtx->mtx_owner == curthread) {
502 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
503 			nlock = lock + 1;
504 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
505 				error = 0;
506 				break;
507 			}
508 		} else {
509 			error = EAGAIN;
510 			break;
511 		}
512 		cpu_pause();
513 	}
514 	return (error);
515 }
516 
517 int
518 _mtx_lock_sh_try(mtx_t *mtx)
519 {
520 	u_int	lock;
521 	u_int	nlock;
522 	int	error = 0;
523 
524 	for (;;) {
525 		lock = mtx->mtx_lock;
526 		if ((lock & MTX_EXCLUSIVE) == 0) {
527 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
528 			nlock = lock + 1;
529 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
530 				break;
531 		} else {
532 			error = EAGAIN;
533 			break;
534 		}
535 		cpu_pause();
536 	}
537 	return (error);
538 }
539 
540 /*
541  * If the lock is held exclusively it must be owned by the caller.  If the
542  * lock is already a shared lock this operation is a NOP.  A panic will
543  * occur if the lock is not held either shared or exclusive.
544  *
545  * The exclusive count is converted to a shared count.
546  */
547 void
548 _mtx_downgrade(mtx_t *mtx)
549 {
550 	u_int	lock;
551 	u_int	nlock;
552 
553 	for (;;) {
554 		lock = mtx->mtx_lock;
555 		cpu_ccfence();
556 
557 		/*
558 		 * NOP if already shared.
559 		 */
560 		if ((lock & MTX_EXCLUSIVE) == 0) {
561 			KKASSERT((lock & MTX_MASK) > 0);
562 			break;
563 		}
564 
565 		/*
566 		 * Transfer count to shared.  Any additional pending shared
567 		 * waiters must be woken up.
568 		 */
569 		if (lock & MTX_SHWANTED) {
570 			if (mtx_chain_link_sh(mtx, lock))
571 				break;
572 			/* retry */
573 		} else {
574 			nlock = lock & ~MTX_EXCLUSIVE;
575 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
576 				break;
577 			/* retry */
578 		}
579 		cpu_pause();
580 	}
581 }
582 
583 /*
584  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
585  * the shared lock has a count other then 1.  Optimize the most likely case
586  * but note that a single cmpset can fail due to WANTED races.
587  *
588  * If the lock is held exclusively it must be owned by the caller and
589  * this function will simply return without doing anything.   A panic will
590  * occur if the lock is held exclusively by someone other then the caller.
591  *
592  * Returns 0 on success, EDEADLK on failure.
593  */
594 int
595 _mtx_upgrade_try(mtx_t *mtx)
596 {
597 	u_int	lock;
598 	u_int	nlock;
599 	int	error = 0;
600 
601 	for (;;) {
602 		lock = mtx->mtx_lock;
603 		cpu_ccfence();
604 
605 		if ((lock & ~MTX_EXWANTED) == 1) {
606 			nlock = lock | MTX_EXCLUSIVE;
607 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
608 				mtx->mtx_owner = curthread;
609 				break;
610 			}
611 		} else if (lock & MTX_EXCLUSIVE) {
612 			KKASSERT(mtx->mtx_owner == curthread);
613 			break;
614 		} else {
615 			error = EDEADLK;
616 			break;
617 		}
618 		cpu_pause();
619 	}
620 	return (error);
621 }
622 
623 /*
624  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
625  *
626  * On the last release we handle any pending chains.
627  */
628 void
629 _mtx_unlock(mtx_t *mtx)
630 {
631 	u_int	lock;
632 	u_int	nlock;
633 
634 	for (;;) {
635 		lock = mtx->mtx_lock;
636 		cpu_ccfence();
637 
638 		switch(lock) {
639 		case MTX_EXCLUSIVE | 1:
640 			/*
641 			 * Last release, exclusive lock.
642 			 * No exclusive or shared requests pending.
643 			 */
644 			mtx->mtx_owner = NULL;
645 			nlock = 0;
646 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
647 				goto done;
648 			break;
649 		case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
650 		case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
651 			/*
652 			 * Last release, exclusive lock.
653 			 * Exclusive requests pending.
654 			 * Exclusive requests have priority over shared reqs.
655 			 */
656 			if (mtx_chain_link_ex(mtx, lock))
657 				goto done;
658 			break;
659 		case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
660 			/*
661 			 * Last release, exclusive lock.
662 			 *
663 			 * Shared requests are pending.  Transfer our count (1)
664 			 * to the first shared request, wakeup all shared reqs.
665 			 */
666 			if (mtx_chain_link_sh(mtx, lock))
667 				goto done;
668 			break;
669 		case 1:
670 			/*
671 			 * Last release, shared lock.
672 			 * No exclusive or shared requests pending.
673 			 */
674 			nlock = 0;
675 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
676 				goto done;
677 			break;
678 		case MTX_EXWANTED | 1:
679 		case MTX_EXWANTED | MTX_SHWANTED | 1:
680 			/*
681 			 * Last release, shared lock.
682 			 *
683 			 * Exclusive requests are pending.  Upgrade this
684 			 * final shared lock to exclusive and transfer our
685 			 * count (1) to the next exclusive request.
686 			 *
687 			 * Exclusive requests have priority over shared reqs.
688 			 */
689 			if (mtx_chain_link_ex(mtx, lock))
690 				goto done;
691 			break;
692 		case MTX_SHWANTED | 1:
693 			/*
694 			 * Last release, shared lock.
695 			 * Shared requests pending.
696 			 */
697 			if (mtx_chain_link_sh(mtx, lock))
698 				goto done;
699 			break;
700 		default:
701 			/*
702 			 * We have to loop if this is the last release but
703 			 * someone is fiddling with LINKSPIN.
704 			 */
705 			if ((lock & MTX_MASK) == 1) {
706 				KKASSERT(lock & MTX_LINKSPIN);
707 				break;
708 			}
709 
710 			/*
711 			 * Not the last release (shared or exclusive)
712 			 */
713 			nlock = lock - 1;
714 			KKASSERT((nlock & MTX_MASK) != MTX_MASK);
715 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
716 				goto done;
717 			break;
718 		}
719 		/* loop try again */
720 		cpu_pause();
721 	}
722 done:
723 	;
724 }
725 
726 /*
727  * Chain pending links.  Called on the last release of an exclusive or
728  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
729  * is passed in with the count left at 1, which we can inherit, and other
730  * bits which we must adjust in a single atomic operation.
731  *
732  * Return non-zero on success, 0 if caller needs to retry.
733  *
734  * NOTE: It's ok if MTX_EXWANTED is in an indeterminant state while we are
735  *	 acquiring LINKSPIN as all other cases will also need to acquire
736  *	 LINKSPIN when handling the EXWANTED case.
737  */
738 static int
739 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
740 {
741 	thread_t td = curthread;
742 	mtx_link_t *link;
743 	u_int	nlock;
744 
745 	olock &= ~MTX_LINKSPIN;
746 	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;	/* upgrade if necc */
747 	++td->td_critcount;
748 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
749 		link = mtx->mtx_exlink;
750 		KKASSERT(link != NULL);
751 		if (link->next == link) {
752 			mtx->mtx_exlink = NULL;
753 			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
754 		} else {
755 			mtx->mtx_exlink = link->next;
756 			link->next->prev = link->prev;
757 			link->prev->next = link->next;
758 			nlock = MTX_LINKSPIN;			/* to clear */
759 		}
760 		KKASSERT(link->state == MTX_LINK_LINKED_EX);
761 		mtx->mtx_owner = link->owner;
762 		cpu_sfence();
763 
764 		/*
765 		 * WARNING! The callback can only be safely
766 		 *	    made with LINKSPIN still held
767 		 *	    and in a critical section.
768 		 *
769 		 * WARNING! The link can go away after the
770 		 *	    state is set, or after the
771 		 *	    callback.
772 		 */
773 		if (link->callback) {
774 			link->state = MTX_LINK_CALLEDBACK;
775 			link->callback(link, link->arg, 0);
776 		} else {
777 			link->state = MTX_LINK_ACQUIRED;
778 			wakeup(link);
779 		}
780 		atomic_clear_int(&mtx->mtx_lock, nlock);
781 		--td->td_critcount;
782 		return 1;
783 	}
784 	/* retry */
785 	--td->td_critcount;
786 	return 0;
787 }
788 
789 /*
790  * Flush waiting shared locks.  The lock's prior state is passed in and must
791  * be adjusted atomically only if it matches and LINKSPIN is not set.
792  *
793  * IMPORTANT! The caller has left one active count on the lock for us to
794  *	      consume.  We will apply this to the first link, but must add
795  *	      additional counts for any other links.
796  */
797 static int
798 mtx_chain_link_sh(mtx_t *mtx, u_int olock)
799 {
800 	thread_t td = curthread;
801 	mtx_link_t *link;
802 	u_int	addcount;
803 	u_int	nlock;
804 
805 	olock &= ~MTX_LINKSPIN;
806 	nlock = olock | MTX_LINKSPIN;
807 	nlock &= ~MTX_EXCLUSIVE;
808 	++td->td_critcount;
809 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
810 		/*
811 		 * It should not be possible for SHWANTED to be set without
812 		 * any links pending.
813 		 */
814 		KKASSERT(mtx->mtx_shlink != NULL);
815 
816 		/*
817 		 * We have to process the count for all shared locks before
818 		 * we process any of the links.  Count the additional shared
819 		 * locks beyond the first link (which is already accounted
820 		 * for) and associate the full count with the lock
821 		 * immediately.
822 		 */
823 		addcount = 0;
824 		for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
825 		     link = link->next) {
826 			++addcount;
827 		}
828 		if (addcount > 0)
829 			atomic_add_int(&mtx->mtx_lock, addcount);
830 
831 		/*
832 		 * We can wakeup all waiting shared locks.
833 		 */
834 		while ((link = mtx->mtx_shlink) != NULL) {
835 			KKASSERT(link->state == MTX_LINK_LINKED_SH);
836 			if (link->next == link) {
837 				mtx->mtx_shlink = NULL;
838 			} else {
839 				mtx->mtx_shlink = link->next;
840 				link->next->prev = link->prev;
841 				link->prev->next = link->next;
842 			}
843 			link->next = NULL;
844 			link->prev = NULL;
845 			cpu_sfence();
846 			if (link->callback) {
847 				link->state = MTX_LINK_CALLEDBACK;
848 				link->callback(link, link->arg, 0);
849 			} else {
850 				cpu_sfence();
851 				link->state = MTX_LINK_ACQUIRED;
852 				wakeup(link);
853 			}
854 		}
855 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
856 						 MTX_SHWANTED);
857 		--td->td_critcount;
858 		return 1;
859 	}
860 	/* retry */
861 	--td->td_critcount;
862 	return 0;
863 }
864 
865 /*
866  * Delete a link structure after tsleep has failed.  This code is not
867  * in the critical path as most exclusive waits are chained.
868  */
869 static
870 void
871 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
872 {
873 	thread_t td = curthread;
874 	u_int	lock;
875 	u_int	nlock;
876 
877 	/*
878 	 * Acquire MTX_LINKSPIN.
879 	 *
880 	 * Do not use cmpxchg to wait for LINKSPIN to clear as this might
881 	 * result in too much cpu cache traffic.
882 	 */
883 	++td->td_critcount;
884 	for (;;) {
885 		lock = mtx->mtx_lock;
886 		if (lock & MTX_LINKSPIN) {
887 			cpu_pause();
888 			continue;
889 		}
890 		nlock = lock | MTX_LINKSPIN;
891 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
892 			break;
893 		cpu_pause();
894 	}
895 
896 	/*
897 	 * Delete the link and release LINKSPIN.
898 	 */
899 	nlock = MTX_LINKSPIN;	/* to clear */
900 
901 	switch(link->state) {
902 	case MTX_LINK_LINKED_EX:
903 		if (link->next == link) {
904 			mtx->mtx_exlink = NULL;
905 			nlock |= MTX_EXWANTED;	/* to clear */
906 		} else {
907 			mtx->mtx_exlink = link->next;
908 			link->next->prev = link->prev;
909 			link->prev->next = link->next;
910 		}
911 		break;
912 	case MTX_LINK_LINKED_SH:
913 		if (link->next == link) {
914 			mtx->mtx_shlink = NULL;
915 			nlock |= MTX_SHWANTED;	/* to clear */
916 		} else {
917 			mtx->mtx_shlink = link->next;
918 			link->next->prev = link->prev;
919 			link->prev->next = link->next;
920 		}
921 		break;
922 	default:
923 		/* no change */
924 		break;
925 	}
926 	atomic_clear_int(&mtx->mtx_lock, nlock);
927 	--td->td_critcount;
928 }
929 
930 /*
931  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
932  * occurred.
933  */
934 int
935 mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
936 {
937 	int error;
938 
939 	/*
940 	 * Sleep.  Handle false wakeups, interruptions, etc.
941 	 * The link may also have been aborted.  The LINKED
942 	 * bit was set by this cpu so we can test it without
943 	 * fences.
944 	 */
945 	error = 0;
946 	while (link->state & MTX_LINK_LINKED) {
947 		tsleep_interlock(link, 0);
948 		cpu_lfence();
949 		if (link->state & MTX_LINK_LINKED) {
950 			if (link->state & MTX_LINK_LINKED_SH)
951 				mycpu->gd_cnt.v_lock_name[0] = 'S';
952 			else
953 				mycpu->gd_cnt.v_lock_name[0] = 'X';
954 			strncpy(mycpu->gd_cnt.v_lock_name + 1,
955 				mtx->mtx_ident,
956 				sizeof(mycpu->gd_cnt.v_lock_name) - 2);
957 			++mycpu->gd_cnt.v_lock_colls;
958 
959 			error = tsleep(link, flags | PINTERLOCKED,
960 				       mtx->mtx_ident, to);
961 			if (error)
962 				break;
963 		}
964 	}
965 
966 	/*
967 	 * We need at least a lfence (load fence) to ensure our cpu does not
968 	 * reorder loads (of data outside the lock structure) prior to the
969 	 * remote cpu's release, since the above test may have run without
970 	 * any atomic interactions.
971 	 *
972 	 * If we do not do this then state updated by the other cpu before
973 	 * releasing its lock may not be read cleanly by our cpu when this
974 	 * function returns.  Even though the other cpu ordered its stores,
975 	 * our loads can still be out of order.
976 	 */
977 	cpu_mfence();
978 
979 	/*
980 	 * We are done, make sure the link structure is unlinked.
981 	 * It may still be on the list due to e.g. EINTR or
982 	 * EWOULDBLOCK.
983 	 *
984 	 * It is possible for the tsleep to race an ABORT and cause
985 	 * error to be 0.
986 	 *
987 	 * The tsleep() can be woken up for numerous reasons and error
988 	 * might be zero in situations where we intend to return an error.
989 	 *
990 	 * (This is the synchronous case so state cannot be CALLEDBACK)
991 	 */
992 	switch(link->state) {
993 	case MTX_LINK_ACQUIRED:
994 	case MTX_LINK_CALLEDBACK:
995 		error = 0;
996 		break;
997 	case MTX_LINK_ABORTED:
998 		error = ENOLCK;
999 		break;
1000 	case MTX_LINK_LINKED_EX:
1001 	case MTX_LINK_LINKED_SH:
1002 		mtx_delete_link(mtx, link);
1003 		/* fall through */
1004 	default:
1005 		if (error == 0)
1006 			error = EWOULDBLOCK;
1007 		break;
1008 	}
1009 
1010 	/*
1011 	 * Clear state on status returned.
1012 	 */
1013 	link->state = MTX_LINK_IDLE;
1014 
1015 	return error;
1016 }
1017 
1018 /*
1019  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
1020  * return ENOLCK.  This may be called at any time after the mtx_link
1021  * is initialized or the status from a previous lock has been
1022  * returned.  If called prior to the next (non-try) lock attempt, the
1023  * next lock attempt using this link structure will abort instantly.
1024  *
1025  * Caller must still wait for the operation to complete, either from a
1026  * blocking call that is still in progress or by calling mtx_wait_link().
1027  *
1028  * If an asynchronous lock request is possibly in-progress, the caller
1029  * should call mtx_wait_link() synchronously.  Note that the asynchronous
1030  * lock callback will NOT be called if a successful abort occurred. XXX
1031  */
1032 void
1033 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1034 {
1035 	thread_t td = curthread;
1036 	u_int	lock;
1037 	u_int	nlock;
1038 
1039 	/*
1040 	 * Acquire MTX_LINKSPIN
1041 	 */
1042 	++td->td_critcount;
1043 	for (;;) {
1044 		lock = mtx->mtx_lock;
1045 		if (lock & MTX_LINKSPIN) {
1046 			cpu_pause();
1047 			continue;
1048 		}
1049 		nlock = lock | MTX_LINKSPIN;
1050 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1051 			break;
1052 		cpu_pause();
1053 	}
1054 
1055 	/*
1056 	 * Do the abort.
1057 	 *
1058 	 * WARNING! Link structure can disappear once link->state is set.
1059 	 */
1060 	nlock = MTX_LINKSPIN;	/* to clear */
1061 
1062 	switch(link->state) {
1063 	case MTX_LINK_IDLE:
1064 		/*
1065 		 * Link not started yet
1066 		 */
1067 		link->state = MTX_LINK_ABORTED;
1068 		break;
1069 	case MTX_LINK_LINKED_EX:
1070 		/*
1071 		 * de-link, mark aborted, and potentially wakeup the thread
1072 		 * or issue the callback.
1073 		 */
1074 		if (link->next == link) {
1075 			if (mtx->mtx_exlink == link) {
1076 				mtx->mtx_exlink = NULL;
1077 				nlock |= MTX_EXWANTED;	/* to clear */
1078 			}
1079 		} else {
1080 			if (mtx->mtx_exlink == link)
1081 				mtx->mtx_exlink = link->next;
1082 			link->next->prev = link->prev;
1083 			link->prev->next = link->next;
1084 		}
1085 
1086 		/*
1087 		 * When aborting the async callback is still made.  We must
1088 		 * not set the link status to ABORTED in the callback case
1089 		 * since there is nothing else to clear its status if the
1090 		 * link is reused.
1091 		 */
1092 		if (link->callback) {
1093 			link->state = MTX_LINK_CALLEDBACK;
1094 			link->callback(link, link->arg, ENOLCK);
1095 		} else {
1096 			link->state = MTX_LINK_ABORTED;
1097 			wakeup(link);
1098 		}
1099 		break;
1100 	case MTX_LINK_LINKED_SH:
1101 		/*
1102 		 * de-link, mark aborted, and potentially wakeup the thread
1103 		 * or issue the callback.
1104 		 */
1105 		if (link->next == link) {
1106 			if (mtx->mtx_shlink == link) {
1107 				mtx->mtx_shlink = NULL;
1108 				nlock |= MTX_SHWANTED;	/* to clear */
1109 			}
1110 		} else {
1111 			if (mtx->mtx_shlink == link)
1112 				mtx->mtx_shlink = link->next;
1113 			link->next->prev = link->prev;
1114 			link->prev->next = link->next;
1115 		}
1116 
1117 		/*
1118 		 * When aborting the async callback is still made.  We must
1119 		 * not set the link status to ABORTED in the callback case
1120 		 * since there is nothing else to clear its status if the
1121 		 * link is reused.
1122 		 */
1123 		if (link->callback) {
1124 			link->state = MTX_LINK_CALLEDBACK;
1125 			link->callback(link, link->arg, ENOLCK);
1126 		} else {
1127 			link->state = MTX_LINK_ABORTED;
1128 			wakeup(link);
1129 		}
1130 		break;
1131 	case MTX_LINK_ACQUIRED:
1132 	case MTX_LINK_CALLEDBACK:
1133 		/*
1134 		 * Too late, the lock was acquired.  Let it complete.
1135 		 */
1136 		break;
1137 	default:
1138 		/*
1139 		 * link already aborted, do nothing.
1140 		 */
1141 		break;
1142 	}
1143 	atomic_clear_int(&mtx->mtx_lock, nlock);
1144 	--td->td_critcount;
1145 }
1146