xref: /dflybsd-src/sys/kern/kern_mutex.c (revision 7822b354ecf7d21d16f320f21735ed8096ad78a1)
1 /*
2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Implement fast persistent locks based on atomic_cmpset_int() with
36  * semantics similar to lockmgr locks but faster and taking up much less
37  * space.  Taken from HAMMER's lock implementation.
38  *
39  * These are meant to complement our LWKT tokens.  Tokens are only held
40  * while the thread is running.  Mutexes can be held across blocking
41  * conditions.
42  *
43  * - Exclusive priority over shared to prevent SMP starvation.
44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
45  * - locks can be asynchronous.
46  * - synchronous fast path if no blocking occurs (async callback is not
47  *   made in this case).
48  *
49  * Generally speaking any caller-supplied link state must be properly
50  * initialized before use.
51  *
52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
53  * functions here.
54  */
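/*
 * Example usage (a minimal sketch, not part of this file's code paths).
 * The inline wrappers named below (mtx_init(), mtx_lock_ex_quick(),
 * mtx_lock_sh_quick(), mtx_unlock()) are assumed to come from
 * sys/mutex2.h; exact wrapper signatures may differ between revisions.
 *
 *	struct example_softc {
 *		mtx_t	sc_lock;
 *		int	sc_count;
 *	};
 *
 *	mtx_init(&sc->sc_lock);			// one-time setup
 *
 *	mtx_lock_ex_quick(&sc->sc_lock, "explk"); // exclusive, may block
 *	sc->sc_count++;				// may be held across blocking
 *	mtx_unlock(&sc->sc_lock);
 *
 *	mtx_lock_sh_quick(&sc->sc_lock, "explk"); // shared, for readers
 *	// read-only access to sc->sc_count; exclusive waiters have priority
 *	mtx_unlock(&sc->sc_lock);
 */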
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/sysctl.h>
60 #include <sys/thread.h>
61 
62 #include <machine/cpufunc.h>
63 
64 #include <sys/thread2.h>
65 #include <sys/mutex2.h>
66 
67 static __int64_t mtx_contention_count;
68 static __int64_t mtx_collision_count;
69 static __int64_t mtx_wakeup_count;
70 
71 SYSCTL_QUAD(_kern, OID_AUTO, mtx_contention_count, CTLFLAG_RW,
72 	    &mtx_contention_count, 0, "");
73 SYSCTL_QUAD(_kern, OID_AUTO, mtx_collision_count, CTLFLAG_RW,
74 	    &mtx_collision_count, 0, "");
75 SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW,
76 	    &mtx_wakeup_count, 0, "");
77 
78 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
79 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount);
80 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
81 
82 /*
83  * Exclusive-lock a mutex, block until acquired unless link is async.
84  * Recursion is allowed.
85  *
86  * Returns 0 on success, the tsleep() return code on failure, or EINPROGRESS
87  * if async.  If immediately successful, an async exclusive lock returns 0
88  * without issuing the async callback or linking the link structure.  The
89  * caller must handle this case (typically this is the optimal code path).
90  *
91  * A tsleep() error can only be returned if PCATCH is specified in the flags.
92  */
93 static __inline int
94 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link,
95 	      const char *ident, int flags, int to)
96 {
97 	thread_t td;
98 	u_int	lock;
99 	u_int	nlock;
100 	int	error;
101 	int	isasync;
102 
103 	for (;;) {
104 		lock = mtx->mtx_lock;
105 		cpu_ccfence();
106 
107 		if (lock == 0) {
108 			nlock = MTX_EXCLUSIVE | 1;
109 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
110 				mtx->mtx_owner = curthread;
111 				link->state = MTX_LINK_ACQUIRED;
112 				error = 0;
113 				break;
114 			}
115 			continue;
116 		}
117 		if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
118 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
119 			nlock = lock + 1;
120 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
121 				link->state = MTX_LINK_ACQUIRED;
122 				error = 0;
123 				break;
124 			}
125 			continue;
126 		}
127 
128 		/*
129 		 * We need MTX_LINKSPIN to manipulate exlink or
130 		 * shlink.
131 		 *
132 		 * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
133 		 * pending exclusive requests.  It cannot be set as a separate
134 		 * operation prior to acquiring MTX_LINKSPIN.
135 		 *
136 		 * To avoid unnecessary cpu cache traffic we poll
137 		 * for collisions.  It is also possible that EXWANTED
138 		 * state failing the above test was spurious, so all the
139 		 * tests must be repeated if we cannot obtain LINKSPIN
140 		 * with the prior state tests intact (i.e. don't reload
141 		 * the (lock) variable here, for heaven's sake!).
142 		 */
143 		if (lock & MTX_LINKSPIN) {
144 			cpu_pause();
145 			++mtx_collision_count;
146 			continue;
147 		}
148 		td = curthread;
149 		nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
150 		++td->td_critcount;
151 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
152 			--td->td_critcount;
153 			continue;
154 		}
155 
156 		/*
157 		 * Check for early abort.
158 		 */
159 		if (link->state == MTX_LINK_ABORTED) {
160 			if (mtx->mtx_exlink == NULL) {
161 				atomic_clear_int(&mtx->mtx_lock,
162 						 MTX_LINKSPIN |
163 						 MTX_EXWANTED);
164 			} else {
165 				atomic_clear_int(&mtx->mtx_lock,
166 						 MTX_LINKSPIN);
167 			}
168 			--td->td_critcount;
169 			link->state = MTX_LINK_IDLE;
170 			error = ENOLCK;
171 			break;
172 		}
173 
174 		/*
175 		 * Add our link to the exlink list and release LINKSPIN.
176 		 */
177 		link->owner = td;
178 		link->state = MTX_LINK_LINKED_EX;
179 		if (mtx->mtx_exlink) {
180 			link->next = mtx->mtx_exlink;
181 			link->prev = link->next->prev;
182 			link->next->prev = link;
183 			link->prev->next = link;
184 		} else {
185 			link->next = link;
186 			link->prev = link;
187 			mtx->mtx_exlink = link;
188 		}
189 		isasync = (link->callback != NULL);
190 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
191 		--td->td_critcount;
192 
193 		/*
194 		 * If this is an asynchronous lock request, return without
195 		 * blocking and leave the link structure linked.
196 		 */
197 		if (isasync) {
198 			error = EINPROGRESS;
199 			break;
200 		}
201 
202 		/*
203 		 * Wait for lock
204 		 */
205 		error = mtx_wait_link(mtx, link, ident, flags, to);
206 		break;
207 	}
208 	return (error);
209 }
210 
211 int
212 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link,
213 		  const char *ident, int flags, int to)
214 {
215 	return(__mtx_lock_ex(mtx, link, ident, flags, to));
216 }
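/*
 * Example (sketch): an asynchronous exclusive lock request using a caller
 * supplied link.  The mtx_lock_ex_link() wrapper name and the direct
 * assignment of link.callback/link.arg are assumptions based on
 * sys/mutex2.h and the fields used by this file; a given revision may
 * provide a dedicated initializer instead.
 *
 *	static void
 *	my_lock_cb(mtx_link_t *link, void *arg, int error)
 *	{
 *		// error is 0 on acquisition, ENOLCK if the request aborted
 *	}
 *
 *	mtx_link_init(&link);
 *	link.callback = my_lock_cb;
 *	link.arg = sc;
 *	error = mtx_lock_ex_link(&mtx, &link, "asynclk", 0, 0);
 *	if (error == 0) {
 *		// fast path: acquired immediately, the callback is NOT
 *		// issued and the link was never placed on the queue
 *	} else if (error == EINPROGRESS) {
 *		// queued: my_lock_cb() runs when the lock is granted
 *	}
 */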
217 
218 int
219 _mtx_lock_ex(mtx_t *mtx, const char *ident, int flags, int to)
220 {
221 	mtx_link_t link;
222 
223 	mtx_link_init(&link);
224 	return(__mtx_lock_ex(mtx, &link, ident, flags, to));
225 }
226 
227 int
228 _mtx_lock_ex_quick(mtx_t *mtx, const char *ident)
229 {
230 	mtx_link_t link;
231 
232 	mtx_link_init(&link);
233 	return(__mtx_lock_ex(mtx, &link, ident, 0, 0));
234 }
235 
236 /*
237  * Share-lock a mutex, block until acquired unless link is async.
238  * Recursion is allowed.  Returns 0 on success, EINPROGRESS if async, or
239  * the tsleep() return code on failure; a tsleep() error can only be
240  * returned if PCATCH is specified in the flags.
241  *
242  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
243  *	 do not have to chain the wakeup().
244  */
245 static __inline int
246 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link,
247 	      const char *ident, int flags, int to)
248 {
249 	thread_t td;
250 	u_int	lock;
251 	u_int	nlock;
252 	int	error;
253 	int	isasync;
254 
255 	for (;;) {
256 		lock = mtx->mtx_lock;
257 		cpu_ccfence();
258 
259 		if (lock == 0) {
260 			nlock = 1;
261 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
262 				error = 0;
263 				link->state = MTX_LINK_ACQUIRED;
264 				break;
265 			}
266 			continue;
267 		}
268 		if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
269 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
270 			nlock = lock + 1;
271 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
272 				error = 0;
273 				link->state = MTX_LINK_ACQUIRED;
274 				break;
275 			}
276 			continue;
277 		}
278 
279 		/*
280 		 * We need MTX_LINKSPIN to manipulate exlink or
281 		 * shlink.
282 		 *
283 		 * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
284 		 * pending shared requests.  It cannot be set as a separate
285 		 * operation prior to acquiring MTX_LINKSPIN.
286 		 *
287 		 * To avoid unnecessary cpu cache traffic we poll
288 		 * for collisions.  It is also possible that EXWANTED
289 		 * state failing the above test was spurious, so all the
290 		 * tests must be repeated if we cannot obtain LINKSPIN
291 		 * with the prior state tests intact (i.e. don't reload
292 		 * the (lock) variable here, for heaven's sake!).
293 		 */
294 		if (lock & MTX_LINKSPIN) {
295 			cpu_pause();
296 			++mtx_collision_count;
297 			continue;
298 		}
299 		td = curthread;
300 		nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
301 		++td->td_critcount;
302 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
303 			--td->td_critcount;
304 			continue;
305 		}
306 
307 		/*
308 		 * Check for early abort.
309 		 */
310 		if (link->state == MTX_LINK_ABORTED) {
311 			if (mtx->mtx_exlink == NULL) {
312 				atomic_clear_int(&mtx->mtx_lock,
313 						 MTX_LINKSPIN |
314 						 MTX_SHWANTED);
315 			} else {
316 				atomic_clear_int(&mtx->mtx_lock,
317 						 MTX_LINKSPIN);
318 			}
319 			--td->td_critcount;
320 			link->state = MTX_LINK_IDLE;
321 			error = ENOLCK;
322 			break;
323 		}
324 
325 		/*
326 		 * Add our link to the shlink list and release LINKSPIN.
327 		 */
328 		link->owner = td;
329 		link->state = MTX_LINK_LINKED_SH;
330 		if (mtx->mtx_shlink) {
331 			link->next = mtx->mtx_shlink;
332 			link->prev = link->next->prev;
333 			link->next->prev = link;
334 			link->prev->next = link;
335 		} else {
336 			link->next = link;
337 			link->prev = link;
338 			mtx->mtx_shlink = link;
339 		}
340 		isasync = (link->callback != NULL);
341 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
342 		--td->td_critcount;
343 
344 		/*
345 		 * If this is an asynchronous lock request, return without
346 		 * blocking and leave the link structure linked.
347 		 */
348 		if (isasync) {
349 			error = EINPROGRESS;
350 			break;
351 		}
352 
353 		/*
354 		 * Wait for lock
355 		 */
356 		error = mtx_wait_link(mtx, link, ident, flags, to);
357 		break;
358 	}
359 	return (error);
360 }
361 
362 int
363 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link,
364 		  const char *ident, int flags, int to)
365 {
366 	return(__mtx_lock_sh(mtx, link, ident, flags, to));
367 }
368 
369 int
370 _mtx_lock_sh(mtx_t *mtx, const char *ident, int flags, int to)
371 {
372 	mtx_link_t link;
373 
374 	mtx_link_init(&link);
375 	return(__mtx_lock_sh(mtx, &link, ident, flags, to));
376 }
377 
378 int
379 _mtx_lock_sh_quick(mtx_t *mtx, const char *ident)
380 {
381 	mtx_link_t link;
382 
383 	mtx_link_init(&link);
384 	return(__mtx_lock_sh(mtx, &link, ident, 0, 0));
385 }
386 
387 /*
388  * Get an exclusive spinlock the hard way.
389  */
390 void
391 _mtx_spinlock(mtx_t *mtx)
392 {
393 	u_int	lock;
394 	u_int	nlock;
395 	int	bb = 1;
396 	int	bo;
397 
398 	for (;;) {
399 		lock = mtx->mtx_lock;
400 		if (lock == 0) {
401 			nlock = MTX_EXCLUSIVE | 1;
402 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
403 				mtx->mtx_owner = curthread;
404 				break;
405 			}
406 		} else if ((lock & MTX_EXCLUSIVE) &&
407 			   mtx->mtx_owner == curthread) {
408 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
409 			nlock = lock + 1;
410 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
411 				break;
412 		} else {
413 			/* MWAIT here */
414 			if (bb < 1000)
415 				++bb;
416 			cpu_pause();
417 			for (bo = 0; bo < bb; ++bo)
418 				;
419 			++mtx_contention_count;
420 		}
421 		cpu_pause();
422 		++mtx_collision_count;
423 	}
424 }
425 
426 /*
427  * Attempt to acquire a spinlock; if we fail we must undo the
428  * gd->gd_spinlocks/gd->gd_curthread->td_critcount predisposition.
429  *
430  * Returns 0 on success, EAGAIN on failure.
431  */
432 int
433 _mtx_spinlock_try(mtx_t *mtx)
434 {
435 	globaldata_t gd = mycpu;
436 	u_int	lock;
437 	u_int	nlock;
438 	int	res = 0;
439 
440 	for (;;) {
441 		lock = mtx->mtx_lock;
442 		if (lock == 0) {
443 			nlock = MTX_EXCLUSIVE | 1;
444 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
445 				mtx->mtx_owner = gd->gd_curthread;
446 				break;
447 			}
448 		} else if ((lock & MTX_EXCLUSIVE) &&
449 			   mtx->mtx_owner == gd->gd_curthread) {
450 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
451 			nlock = lock + 1;
452 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
453 				break;
454 		} else {
455 			--gd->gd_spinlocks;
456 			cpu_ccfence();
457 			--gd->gd_curthread->td_critcount;
458 			res = EAGAIN;
459 			break;
460 		}
461 		cpu_pause();
462 		++mtx_collision_count;
463 	}
464 	return res;
465 }
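/*
 * Example (sketch): spin-variant usage.  The wrapper names mtx_spinlock(),
 * mtx_spinlock_try() and mtx_spinunlock() are assumed from sys/mutex2.h;
 * those wrappers are expected to perform the gd_spinlocks/td_critcount
 * bookkeeping that _mtx_spinlock_try() undoes on failure.
 *
 *	mtx_spinlock(&mtx);		// spins, never blocks
 *	// short critical section, no blocking operations allowed
 *	mtx_spinunlock(&mtx);
 *
 *	if (mtx_spinlock_try(&mtx) == 0) {
 *		// acquired; 0 is assumed to mean success, as in
 *		// _mtx_spinlock_try() above
 *		mtx_spinunlock(&mtx);
 *	}
 */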
466 
467 #if 0
468 
469 void
470 _mtx_spinlock_sh(mtx_t *mtx)
471 {
472 	u_int	lock;
473 	u_int	nlock;
474 	int	bb = 1;
475 	int	bo;
476 
477 	for (;;) {
478 		lock = mtx->mtx_lock;
479 		if ((lock & MTX_EXCLUSIVE) == 0) {
480 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
481 			nlock = lock + 1;
482 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
483 				break;
484 		} else {
485 			/* MWAIT here */
486 			if (bb < 1000)
487 				++bb;
488 			cpu_pause();
489 			for (bo = 0; bo < bb; ++bo)
490 				;
491 			++mtx_contention_count;
492 		}
493 		cpu_pause();
494 		++mtx_collision_count;
495 	}
496 }
497 
498 #endif
499 
500 int
501 _mtx_lock_ex_try(mtx_t *mtx)
502 {
503 	u_int	lock;
504 	u_int	nlock;
505 	int	error;
506 
507 	for (;;) {
508 		lock = mtx->mtx_lock;
509 		if (lock == 0) {
510 			nlock = MTX_EXCLUSIVE | 1;
511 			if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
512 				mtx->mtx_owner = curthread;
513 				error = 0;
514 				break;
515 			}
516 		} else if ((lock & MTX_EXCLUSIVE) &&
517 			   mtx->mtx_owner == curthread) {
518 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
519 			nlock = lock + 1;
520 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
521 				error = 0;
522 				break;
523 			}
524 		} else {
525 			error = EAGAIN;
526 			break;
527 		}
528 		cpu_pause();
529 		++mtx_collision_count;
530 	}
531 	return (error);
532 }
533 
534 int
535 _mtx_lock_sh_try(mtx_t *mtx)
536 {
537 	u_int	lock;
538 	u_int	nlock;
539 	int	error = 0;
540 
541 	for (;;) {
542 		lock = mtx->mtx_lock;
543 		if ((lock & MTX_EXCLUSIVE) == 0) {
544 			KKASSERT((lock & MTX_MASK) != MTX_MASK);
545 			nlock = lock + 1;
546 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
547 				break;
548 		} else {
549 			error = EAGAIN;
550 			break;
551 		}
552 		cpu_pause();
553 		++mtx_collision_count;
554 	}
555 	return (error);
556 }
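/*
 * Example (sketch): non-blocking attempts.  The mtx_lock_ex_try() and
 * mtx_lock_sh_try() wrapper names are assumed from sys/mutex2.h; both
 * underlying functions return 0 on success and EAGAIN if the lock cannot
 * be taken immediately.
 *
 *	if (mtx_lock_ex_try(&mtx) == 0) {
 *		// modify protected state
 *		mtx_unlock(&mtx);
 *	} else {
 *		// EAGAIN: defer the work or fall back to a blocking lock
 *	}
 */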
557 
558 /*
559  * If the lock is held exclusively it must be owned by the caller.  If the
560  * lock is already a shared lock this operation is a NOP.  A panic will
561  * occur if the lock is not held either shared or exclusive.
562  *
563  * The exclusive count is converted to a shared count.
564  */
565 void
566 _mtx_downgrade(mtx_t *mtx)
567 {
568 	u_int	lock;
569 	u_int	nlock;
570 
571 	for (;;) {
572 		lock = mtx->mtx_lock;
573 		cpu_ccfence();
574 
575 		/*
576 		 * NOP if already shared.
577 		 */
578 		if ((lock & MTX_EXCLUSIVE) == 0) {
579 			KKASSERT((lock & MTX_MASK) > 0);
580 			break;
581 		}
582 
583 		/*
584 		 * Transfer count to shared.  Any additional pending shared
585 		 * waiters must be woken up.
586 		 */
587 		if (lock & MTX_SHWANTED) {
588 			if (mtx_chain_link_sh(mtx, lock, 1))
589 				break;
590 			/* retry */
591 		} else {
592 			nlock = lock & ~MTX_EXCLUSIVE;
593 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
594 				break;
595 			/* retry */
596 		}
597 		cpu_pause();
598 		++mtx_collision_count;
599 	}
600 }
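/*
 * Example (sketch): a typical downgrade pattern, assuming a mtx_downgrade()
 * wrapper in sys/mutex2.h.  The exclusive hold is converted in place, so
 * waiting readers can proceed without the lock ever being fully released.
 *
 *	mtx_lock_ex_quick(&mtx, "dgradelk");
 *	// modify the structure exclusively
 *	mtx_downgrade(&mtx);		// now held shared
 *	// continue reading alongside other shared holders
 *	mtx_unlock(&mtx);
 */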
601 
602 /*
603  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
604  * the shared lock has a count other than 1.  Optimize the most likely case
605  * but note that a single cmpset can fail due to WANTED races.
606  *
607  * If the lock is held exclusively it must be owned by the caller and
608  * this function will simply return without doing anything.  A panic will
609  * occur if the lock is held exclusively by someone other than the caller.
610  *
611  * Returns 0 on success, EDEADLK on failure.
612  */
613 int
614 _mtx_upgrade_try(mtx_t *mtx)
615 {
616 	u_int	lock;
617 	u_int	nlock;
618 	int	error = 0;
619 
620 	for (;;) {
621 		lock = mtx->mtx_lock;
622 
623 		if ((lock & ~MTX_EXWANTED) == 1) {
624 			nlock = lock | MTX_EXCLUSIVE;
625 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
626 				mtx->mtx_owner = curthread;
627 				break;
628 			}
629 		} else if (lock & MTX_EXCLUSIVE) {
630 			KKASSERT(mtx->mtx_owner == curthread);
631 			break;
632 		} else {
633 			error = EDEADLK;
634 			break;
635 		}
636 		cpu_pause();
637 		++mtx_collision_count;
638 	}
639 	return (error);
640 }
641 
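/*
 * Example (sketch): upgrading a shared hold, assuming a mtx_upgrade_try()
 * wrapper in sys/mutex2.h.  On EDEADLK the usual fallback is to drop the
 * shared lock, reacquire exclusively, and revalidate anything that may
 * have changed in the unlocked window.
 *
 *	mtx_lock_sh_quick(&mtx, "upglk");
 *	if (mtx_upgrade_try(&mtx) == EDEADLK) {
 *		mtx_unlock(&mtx);
 *		mtx_lock_ex_quick(&mtx, "upglk");
 *		// revalidate: other threads may have run in between
 *	}
 *	// modify protected state exclusively
 *	mtx_unlock(&mtx);
 */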
642 /*
643  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
644  *
645  * On the last release we handle any pending chains.
646  */
647 void
648 _mtx_unlock(mtx_t *mtx)
649 {
650 	u_int	lock;
651 	u_int	nlock;
652 
653 	for (;;) {
654 		lock = mtx->mtx_lock;
655 		cpu_ccfence();
656 
657 		switch(lock) {
658 		case MTX_EXCLUSIVE | 1:
659 			/*
660 			 * Last release, exclusive lock.
661 			 * No exclusive or shared requests pending.
662 			 */
663 			mtx->mtx_owner = NULL;
664 			nlock = 0;
665 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
666 				goto done;
667 			break;
668 		case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
669 		case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
670 			/*
671 			 * Last release, exclusive lock.
672 			 * Exclusive requests pending.
673 			 * Exclusive requests have priority over shared reqs.
674 			 */
675 			if (mtx_chain_link_ex(mtx, lock))
676 				goto done;
677 			break;
678 		case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
679 			/*
680 			 * Last release, exclusive lock.
681 			 *
682 			 * Shared requests are pending.  Transfer our count (1)
683 			 * to the first shared request, wakeup all shared reqs.
684 			 */
685 			if (mtx_chain_link_sh(mtx, lock, 0))
686 				goto done;
687 			break;
688 		case 1:
689 			/*
690 			 * Last release, shared lock.
691 			 * No exclusive or shared requests pending.
692 			 */
693 			nlock = 0;
694 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
695 				goto done;
696 			break;
697 		case MTX_EXWANTED | 1:
698 		case MTX_EXWANTED | MTX_SHWANTED | 1:
699 			/*
700 			 * Last release, shared lock.
701 			 *
702 			 * Exclusive requests are pending.  Transfer our
703 			 * count (1) to the next exclusive request.
704 			 *
705 			 * Exclusive requests have priority over shared reqs.
706 			 */
707 			if (mtx_chain_link_ex(mtx, lock))
708 				goto done;
709 			break;
710 		case MTX_SHWANTED | 1:
711 			/*
712 			 * Last release, shared lock.
713 			 * Shared requests pending.
714 			 */
715 			if (mtx_chain_link_sh(mtx, lock, 0))
716 				goto done;
717 			break;
718 		default:
719 			/*
720 			 * We have to loop if this is the last release but
721 			 * someone is fiddling with LINKSPIN.
722 			 */
723 			if ((lock & MTX_MASK) == 1) {
724 				KKASSERT(lock & MTX_LINKSPIN);
725 				break;
726 			}
727 
728 			/*
729 			 * Not the last release (shared or exclusive)
730 			 */
731 			nlock = lock - 1;
732 			KKASSERT((nlock & MTX_MASK) != MTX_MASK);
733 			if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
734 				goto done;
735 			break;
736 		}
737 		/* loop try again */
738 		cpu_pause();
739 		++mtx_collision_count;
740 	}
741 done:
742 	;
743 }
744 
745 /*
746  * Chain pending links.  Called on the last release of an exclusive or
747  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
748  * is passed in with the count left at 1, which we can inherit, and other
749  * bits which we must adjust in a single atomic operation.
750  *
751  * Return non-zero on success, 0 if caller needs to retry.
752  *
753  * NOTE: It's ok if MTX_EXWANTED is in an indeterminate state while we are
754  *	 acquiring LINKSPIN as all other cases will also need to acquire
755  *	 LINKSPIN when handling the EXWANTED case.
756  */
757 static int
758 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
759 {
760 	thread_t td = curthread;
761 	mtx_link_t *link;
762 	u_int	nlock;
763 
764 	olock &= ~MTX_LINKSPIN;
765 	nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;
766 	++td->td_critcount;
767 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
768 		link = mtx->mtx_exlink;
769 		KKASSERT(link != NULL);
770 		if (link->next == link) {
771 			mtx->mtx_exlink = NULL;
772 			nlock = MTX_LINKSPIN | MTX_EXWANTED;	/* to clear */
773 		} else {
774 			mtx->mtx_exlink = link->next;
775 			link->next->prev = link->prev;
776 			link->prev->next = link->next;
777 			nlock = MTX_LINKSPIN;			/* to clear */
778 		}
779 		KKASSERT(link->state == MTX_LINK_LINKED_EX);
780 		mtx->mtx_owner = link->owner;
781 		cpu_sfence();
782 
783 		/*
784 		 * WARNING! The callback can only be safely
785 		 *	    made with LINKSPIN still held
786 		 *	    and in a critical section.
787 		 *
788 		 * WARNING! The link can go away after the
789 		 *	    state is set, or after the
790 		 *	    callback.
791 		 */
792 		if (link->callback) {
793 			link->state = MTX_LINK_CALLEDBACK;
794 			link->callback(link, link->arg, 0);
795 		} else {
796 			link->state = MTX_LINK_ACQUIRED;
797 			wakeup(link);
798 		}
799 		atomic_clear_int(&mtx->mtx_lock, nlock);
800 		--td->td_critcount;
801 		++mtx_wakeup_count;
802 		return 1;
803 	}
804 	/* retry */
805 	--td->td_critcount;
806 	return 0;
807 }
808 
809 /*
810  * Flush waiting shared locks.  The lock's prior state is passed in and must
811  * be adjusted atomically only if it matches.
812  *
813  * If addcount is 0, the count for the first shared lock in the chain is
814  * assumed to have already been accounted for.
815  *
816  * If addcount is 1, the count for the first shared lock in the chain has
817  * not yet been accounted for.
818  */
819 static int
820 mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount)
821 {
822 	thread_t td = curthread;
823 	mtx_link_t *link;
824 	u_int	nlock;
825 
826 	olock &= ~MTX_LINKSPIN;
827 	nlock = olock | MTX_LINKSPIN;
828 	nlock &= ~MTX_EXCLUSIVE;
829 	++td->td_critcount;
830 	if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
831 		KKASSERT(mtx->mtx_shlink != NULL);
832 		for (;;) {
833 			link = mtx->mtx_shlink;
834 			atomic_add_int(&mtx->mtx_lock, addcount);
835 			KKASSERT(link->state == MTX_LINK_LINKED_SH);
836 			if (link->next == link) {
837 				mtx->mtx_shlink = NULL;
838 				cpu_sfence();
839 
840 				/*
841 				 * WARNING! The callback can only be safely
842 				 *	    made with LINKSPIN still held
843 				 *	    and in a critical section.
844 				 *
845 				 * WARNING! The link can go away after the
846 				 *	    state is set, or after the
847 				 *	    callback.
848 				 */
849 				if (link->callback) {
850 					link->state = MTX_LINK_CALLEDBACK;
851 					link->callback(link, link->arg, 0);
852 				} else {
853 					link->state = MTX_LINK_ACQUIRED;
854 					wakeup(link);
855 				}
856 				++mtx_wakeup_count;
857 				break;
858 			}
859 			mtx->mtx_shlink = link->next;
860 			link->next->prev = link->prev;
861 			link->prev->next = link->next;
862 			cpu_sfence();
863 			link->state = MTX_LINK_ACQUIRED;
864 			/* link can go away */
865 			wakeup(link);
866 			++mtx_wakeup_count;
867 			addcount = 1;
868 		}
869 		atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
870 						 MTX_SHWANTED);
871 		--td->td_critcount;
872 		return 1;
873 	}
874 	/* retry */
875 	--td->td_critcount;
876 	return 0;
877 }
878 
879 /*
880  * Delete a link structure after tsleep has failed.  This code is not
881  * in the critical path as most exclusive waits are chained.
882  */
883 static
884 void
885 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
886 {
887 	thread_t td = curthread;
888 	u_int	lock;
889 	u_int	nlock;
890 
891 	/*
892 	 * Acquire MTX_LINKSPIN.
893 	 *
894 	 * Do not use cmpxchg to wait for LINKSPIN to clear as this might
895 	 * result in too much cpu cache traffic.
896 	 */
897 	++td->td_critcount;
898 	for (;;) {
899 		lock = mtx->mtx_lock;
900 		if (lock & MTX_LINKSPIN) {
901 			cpu_pause();
902 			++mtx_collision_count;
903 			continue;
904 		}
905 		nlock = lock | MTX_LINKSPIN;
906 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
907 			break;
908 		cpu_pause();
909 		++mtx_collision_count;
910 	}
911 
912 	/*
913 	 * Delete the link and release LINKSPIN.
914 	 */
915 	nlock = MTX_LINKSPIN;	/* to clear */
916 
917 	switch(link->state) {
918 	case MTX_LINK_LINKED_EX:
919 		if (link->next == link) {
920 			mtx->mtx_exlink = NULL;
921 			nlock |= MTX_EXWANTED;	/* to clear */
922 		} else {
923 			mtx->mtx_exlink = link->next;
924 			link->next->prev = link->prev;
925 			link->prev->next = link->next;
926 		}
927 		break;
928 	case MTX_LINK_LINKED_SH:
929 		if (link->next == link) {
930 			mtx->mtx_shlink = NULL;
931 			nlock |= MTX_SHWANTED;	/* to clear */
932 		} else {
933 			mtx->mtx_shlink = link->next;
934 			link->next->prev = link->prev;
935 			link->prev->next = link->next;
936 		}
937 		break;
938 	default:
939 		/* no change */
940 		break;
941 	}
942 	atomic_clear_int(&mtx->mtx_lock, nlock);
943 	--td->td_critcount;
944 }
945 
946 /*
947  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
948  * occurred.
949  */
950 int
951 mtx_wait_link(mtx_t *mtx, mtx_link_t *link,
952 	      const char *ident, int flags, int to)
953 {
954 	int error;
955 
956 	/*
957 	 * Sleep.  Handle false wakeups, interruptions, etc.
958 	 * The link may also have been aborted.
959 	 */
960 	error = 0;
961 	while (link->state & MTX_LINK_LINKED) {
962 		tsleep_interlock(link, 0);
963 		cpu_lfence();
964 		if (link->state & MTX_LINK_LINKED) {
965 			++mtx_contention_count;
966 			if (link->state & MTX_LINK_LINKED_SH)
967 				mycpu->gd_cnt.v_lock_name[0] = 'S';
968 			else
969 				mycpu->gd_cnt.v_lock_name[0] = 'X';
970 			strncpy(mycpu->gd_cnt.v_lock_name + 1,
971 				ident,
972 				sizeof(mycpu->gd_cnt.v_lock_name) - 2);
973 			++mycpu->gd_cnt.v_lock_colls;
974 
975 			error = tsleep(link, flags | PINTERLOCKED,
976 				       ident, to);
977 			if (error)
978 				break;
979 		}
980 	}
981 
982 	/*
983 	 * We are done, make sure the link structure is unlinked.
984 	 * It may still be on the list due to e.g. EINTR or
985 	 * EWOULDBLOCK.
986 	 *
987 	 * It is possible for the tsleep to race an ABORT and cause
988 	 * error to be 0.
989 	 *
990 	 * The tsleep() can be woken up for numerous reasons and error
991 	 * might be zero in situations where we intend to return an error.
992 	 *
993 	 * (This is the synchronous case so state cannot be CALLEDBACK)
994 	 */
995 	switch(link->state) {
996 	case MTX_LINK_ACQUIRED:
997 	case MTX_LINK_CALLEDBACK:
998 		error = 0;
999 		break;
1000 	case MTX_LINK_ABORTED:
1001 		error = ENOLCK;
1002 		break;
1003 	case MTX_LINK_LINKED_EX:
1004 	case MTX_LINK_LINKED_SH:
1005 		mtx_delete_link(mtx, link);
1006 		/* fall through */
1007 	default:
1008 		if (error == 0)
1009 			error = EWOULDBLOCK;
1010 		break;
1011 	}
1012 
1013 	/*
1014 	 * Clear state on status returned.
1015 	 */
1016 	link->state = MTX_LINK_IDLE;
1017 
1018 	return error;
1019 }
1020 
1021 /*
1022  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
1023  * return ENOLCK.  This may be called at any time after the mtx_link
1024  * is initialized or the status from a previous lock has been
1025  * returned.  If called prior to the next (non-try) lock attempt, the
1026  * next lock attempt using this link structure will abort instantly.
1027  *
1028  * Caller must still wait for the operation to complete, either from a
1029  * blocking call that is still in progress or by calling mtx_wait_link().
1030  *
1031  * If an asynchronous lock request is possibly in-progress, the caller
1032  * should call mtx_wait_link() synchronously.  Note that a linked async
1033  * request's callback is still issued (w/ENOLCK) when an abort succeeds.
1034  */
1035 void
1036 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1037 {
1038 	thread_t td = curthread;
1039 	u_int	lock;
1040 	u_int	nlock;
1041 
1042 	/*
1043 	 * Acquire MTX_LINKSPIN
1044 	 */
1045 	++td->td_critcount;
1046 	for (;;) {
1047 		lock = mtx->mtx_lock;
1048 		if (lock & MTX_LINKSPIN) {
1049 			cpu_pause();
1050 			++mtx_collision_count;
1051 			continue;
1052 		}
1053 		nlock = lock | MTX_LINKSPIN;
1054 		if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1055 			break;
1056 		cpu_pause();
1057 		++mtx_collision_count;
1058 	}
1059 
1060 	/*
1061 	 * Do the abort.
1062 	 *
1063 	 * WARNING! Link structure can disappear once link->state is set.
1064 	 */
1065 	nlock = MTX_LINKSPIN;	/* to clear */
1066 
1067 	switch(link->state) {
1068 	case MTX_LINK_IDLE:
1069 		/*
1070 		 * Link not started yet
1071 		 */
1072 		link->state = MTX_LINK_ABORTED;
1073 		break;
1074 	case MTX_LINK_LINKED_EX:
1075 		/*
1076 		 * de-link, mark aborted, and potentially wakeup the thread
1077 		 * or issue the callback.
1078 		 */
1079 		if (link->next == link) {
1080 			if (mtx->mtx_exlink == link) {
1081 				mtx->mtx_exlink = NULL;
1082 				nlock |= MTX_EXWANTED;	/* to clear */
1083 			}
1084 		} else {
1085 			if (mtx->mtx_exlink == link)
1086 				mtx->mtx_exlink = link->next;
1087 			link->next->prev = link->prev;
1088 			link->prev->next = link->next;
1089 		}
1090 
1091 		/*
1092 		 * When aborting, the async callback is still made.  We must
1093 		 * not set the link status to ABORTED in the callback case
1094 		 * since there is nothing else to clear its status if the
1095 		 * link is reused.
1096 		 */
1097 		if (link->callback) {
1098 			link->state = MTX_LINK_CALLEDBACK;
1099 			link->callback(link, link->arg, ENOLCK);
1100 		} else {
1101 			link->state = MTX_LINK_ABORTED;
1102 			wakeup(link);
1103 		}
1104 		++mtx_wakeup_count;
1105 		break;
1106 	case MTX_LINK_LINKED_SH:
1107 		/*
1108 		 * de-link, mark aborted, and potentially wakeup the thread
1109 		 * or issue the callback.
1110 		 */
1111 		if (link->next == link) {
1112 			if (mtx->mtx_shlink == link) {
1113 				mtx->mtx_shlink = NULL;
1114 				nlock |= MTX_SHWANTED;	/* to clear */
1115 			}
1116 		} else {
1117 			if (mtx->mtx_shlink == link)
1118 				mtx->mtx_shlink = link->next;
1119 			link->next->prev = link->prev;
1120 			link->prev->next = link->next;
1121 		}
1122 
1123 		/*
1124 		 * When aborting, the async callback is still made.  We must
1125 		 * not set the link status to ABORTED in the callback case
1126 		 * since there is nothing else to clear its status if the
1127 		 * link is reused.
1128 		 */
1129 		if (link->callback) {
1130 			link->state = MTX_LINK_CALLEDBACK;
1131 			link->callback(link, link->arg, ENOLCK);
1132 		} else {
1133 			link->state = MTX_LINK_ABORTED;
1134 			wakeup(link);
1135 		}
1136 		++mtx_wakeup_count;
1137 		break;
1138 	case MTX_LINK_ACQUIRED:
1139 	case MTX_LINK_CALLEDBACK:
1140 		/*
1141 		 * Too late, the lock was acquired.  Let it complete.
1142 		 */
1143 		break;
1144 	default:
1145 		/*
1146 		 * link already aborted, do nothing.
1147 		 */
1148 		break;
1149 	}
1150 	atomic_clear_int(&mtx->mtx_lock, nlock);
1151 	--td->td_critcount;
1152 }
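/*
 * Example (sketch): aborting a blocked lock request from another context.
 * Thread A issues a link-based exclusive lock (no callback) and blocks;
 * thread B aborts it, causing thread A's call to return ENOLCK.  The
 * mtx_lock_ex_link() wrapper name is an assumption from sys/mutex2.h and
 * the softc fields below are purely illustrative.
 *
 *	// thread A
 *	mtx_link_init(&sc->sc_link);
 *	error = mtx_lock_ex_link(&sc->sc_mtx, &sc->sc_link, "abortlk", 0, 0);
 *	if (error == ENOLCK) {
 *		// the request was aborted, the lock is not held
 *	}
 *
 *	// thread B (e.g. during teardown)
 *	mtx_abort_link(&sc->sc_mtx, &sc->sc_link);
 */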
1153